# Indic Parler-TTS demo — HuggingFace Space running on ZeroGPU.
import spaces  # must be imported before torch/CUDA users so ZeroGPU can patch CUDA init

import tempfile

import gradio as gr
import soundfile as sf
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
# Load the model and both tokenizers once at startup. The model stays on the
# CPU between requests; generate_audio moves it to the GPU only while needed.
print("Loading model and tokenizers...")
model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to("cpu")
# Prompt text and style description use different tokenizers: the prompt uses
# the TTS model's own tokenizer, the description uses its text encoder's.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)
print("Model and tokenizers loaded.")
# Supported languages with a default style sentence for each (currently only
# the dict keys are consumed by the UI; the sentences document the intent).
languages = {
    "Urdu": "A female speaker delivers a clear and expressive speech in Urdu.",
    "Punjabi": "A female speaker delivers a clear and expressive speech in Punjabi.",
    "Sindhi": "A female speaker delivers a clear and expressive speech in Sindhi.",
}

# Emotion choices offered in the UI; lower-cased into the style description.
emotions = [
    "Neutral", "Happy", "Sad", "Anger", "Command", "Narration", "Conversation",
    "Disgust", "Fear", "News", "Proper Noun", "Surprise",
]

# Initial dropdown selections.
default_language = "Urdu"
default_gender = "Female"
default_emotion = "Neutral"
def generate_description(language, gender, emotion, noise, reverb, expressivity, pitch, rate, quality):
    """Compose the natural-language style prompt fed to Parler-TTS.

    Every dropdown value is lower-cased into the sentence except the language
    name, which keeps its capitalization (e.g. "Urdu", not "urdu").

    Returns the description string shown in the caption box and consumed by
    generate_audio.
    """
    return (
        f"A {gender.lower()} speaker delivers a {emotion.lower()} and {expressivity.lower()} speech "
        f"with a {pitch.lower()} pitch and a {rate.lower()} speaking rate. "
        f"The audio has {noise.lower()} background noise, {reverb.lower()} reverberation, "
        f"and {quality.lower()} voice quality. The text is in {language}."
    )
# ZeroGPU: the decorator allocates a GPU only for the duration of this call
# (the module is imported on a CPU-only process; see the `spaces` import).
@spaces.GPU
def generate_audio(text, description):
    """Synthesize `text` as speech conditioned on the style `description`.

    Returns the path of a temporary .wav file for the gr.Audio output.
    """
    global model  # preloaded at startup; resident on CPU between calls
    model.to("cuda")
    try:
        # Description and prompt use their own tokenizers (see startup code).
        input_ids = description_tokenizer(description, return_tensors="pt").input_ids.to("cuda")
        prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to("cuda")
        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
        audio_arr = generation.cpu().numpy().squeeze()
    finally:
        # Always release GPU memory, even if generation raises.
        model.to("cpu")
    # delete=False: Gradio reads the file after this function returns.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        sf.write(f.name, audio_arr, model.config.sampling_rate)
        return f.name
def app():
    """Build the Gradio Blocks UI for the Indic Parler-TTS demo.

    Two-step workflow: the user first generates a style caption from the
    dropdown selections, then synthesizes speech from the text plus caption
    (the caption box is editable, so users can tweak it before synthesis).

    Returns the unlaunched gr.Blocks instance.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Indic Parler-TTS for Urdu, Punjabi, and Sindhi")
        gr.Markdown("Select language, speaker gender, emotion, and customize speech characteristics.")

        # Primary voice selection.
        with gr.Row():
            lang_dropdown = gr.Dropdown(
                choices=list(languages.keys()),
                value=default_language,
                label="Select Language",
            )
            gender_dropdown = gr.Dropdown(
                choices=["Male", "Female"],
                value=default_gender,
                label="Speaker Gender",
            )
            emotion_dropdown = gr.Dropdown(
                choices=emotions,
                value=default_emotion,
                label="Select Emotion",
            )

        # Fine-grained style controls folded into the generated description.
        with gr.Row():
            noise_dropdown = gr.Dropdown(
                choices=["Clear", "Slightly Noisy"],
                value="Clear",
                label="Background Noise",
            )
            reverb_dropdown = gr.Dropdown(
                choices=["Close-Sounding", "Distant-Sounding"],
                value="Close-Sounding",
                label="Reverberation",
            )
            expressivity_dropdown = gr.Dropdown(
                choices=["Expressive", "Slightly Expressive", "Monotone"],
                value="Expressive",
                label="Expressivity",
            )
            pitch_dropdown = gr.Dropdown(
                choices=["High", "Low", "Balanced"],
                value="Balanced",
                label="Pitch",
            )
            rate_dropdown = gr.Dropdown(
                choices=["Slow", "Moderate", "Fast"],
                value="Moderate",
                label="Speaking Rate",
            )
            quality_dropdown = gr.Dropdown(
                choices=["Basic", "Refined"],
                value="Refined",
                label="Voice Quality",
            )

        with gr.Row():
            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="Type your text here...",
                lines=5,
            )

        with gr.Row():
            generate_caption_button = gr.Button("Generate Caption/Description")
            caption_output = gr.Textbox(
                label="Generated Caption/Description",
                placeholder="The generated caption will appear here...",
                lines=5,
            )

        with gr.Row():
            generate_audio_button = gr.Button("Generate Speech")
            audio_output = gr.Audio(label="Generated Audio")

        # Step 1: dropdown selections -> style description text.
        generate_caption_button.click(
            fn=generate_description,
            inputs=[
                lang_dropdown, gender_dropdown, emotion_dropdown,
                noise_dropdown, reverb_dropdown, expressivity_dropdown,
                pitch_dropdown, rate_dropdown, quality_dropdown,
            ],
            outputs=caption_output,
        )
        # Step 2: text + (possibly edited) description -> synthesized audio.
        generate_audio_button.click(
            fn=generate_audio,
            inputs=[text_input, caption_output],
            outputs=audio_output,
        )
    return demo
# Launch only when executed as a script (HF Spaces runs app.py as __main__);
# the guard lets the module be imported without starting a server.
if __name__ == "__main__":
    app().launch()