# Indic Parler-TTS demo — HuggingFace Space running on ZeroGPU.
import spaces  # must be imported before torch/CUDA users so ZeroGPU can patch CUDA init

import tempfile

import gradio as gr
import soundfile as sf
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
# Load the model and both tokenizers once at startup. The model stays on the
# CPU between requests; generate_audio moves it to the GPU only while needed.
print("Loading model and tokenizers...")
model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to("cpu")
# Prompt text and style description use different tokenizers: the prompt uses
# the TTS model's own tokenizer, the description uses its text encoder's.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)
print("Model and tokenizers loaded.")
# Supported languages with a default style sentence for each (currently only
# the dict keys are consumed by the UI; the sentences document the intent).
languages = {
    "Urdu": "A female speaker delivers a clear and expressive speech in Urdu.",
    "Punjabi": "A female speaker delivers a clear and expressive speech in Punjabi.",
    "Sindhi": "A female speaker delivers a clear and expressive speech in Sindhi.",
}

# Emotion choices offered in the UI; lower-cased into the style description.
emotions = [
    "Neutral", "Happy", "Sad", "Anger", "Command", "Narration", "Conversation",
    "Disgust", "Fear", "News", "Proper Noun", "Surprise",
]

# Initial dropdown selections.
default_language = "Urdu"
default_gender = "Female"
default_emotion = "Neutral"
def generate_description(language, gender, emotion, noise, reverb, expressivity, pitch, rate, quality):
    """Compose the natural-language style prompt fed to Parler-TTS.

    Every dropdown value is lower-cased into the sentence except the language
    name, which keeps its capitalization (e.g. "Urdu", not "urdu").

    Returns the description string shown in the caption box and consumed by
    generate_audio.
    """
    return (
        f"A {gender.lower()} speaker delivers a {emotion.lower()} and {expressivity.lower()} speech "
        f"with a {pitch.lower()} pitch and a {rate.lower()} speaking rate. "
        f"The audio has {noise.lower()} background noise, {reverb.lower()} reverberation, "
        f"and {quality.lower()} voice quality. The text is in {language}."
    )
# ZeroGPU: the decorator allocates a GPU only for the duration of this call
# (the module is imported on a CPU-only process; see the `spaces` import).
@spaces.GPU
def generate_audio(text, description):
    """Synthesize `text` as speech conditioned on the style `description`.

    Returns the path of a temporary .wav file for the gr.Audio output.
    """
    global model  # preloaded at startup; resident on CPU between calls
    model.to("cuda")
    try:
        # Description and prompt use their own tokenizers (see startup code).
        input_ids = description_tokenizer(description, return_tensors="pt").input_ids.to("cuda")
        prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to("cuda")
        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
        audio_arr = generation.cpu().numpy().squeeze()
    finally:
        # Always release GPU memory, even if generation raises.
        model.to("cpu")
    # delete=False: Gradio reads the file after this function returns.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        sf.write(f.name, audio_arr, model.config.sampling_rate)
        return f.name
def app():
    """Build the Gradio Blocks UI for the Indic Parler-TTS demo.

    Two-step workflow: the user first generates a style caption from the
    dropdown selections, then synthesizes speech from the text plus caption
    (the caption box is editable, so users can tweak it before synthesis).

    Returns the unlaunched gr.Blocks instance.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Indic Parler-TTS for Urdu, Punjabi, and Sindhi")
        gr.Markdown("Select language, speaker gender, emotion, and customize speech characteristics.")

        # Primary voice selection.
        with gr.Row():
            lang_dropdown = gr.Dropdown(
                choices=list(languages.keys()),
                value=default_language,
                label="Select Language",
            )
            gender_dropdown = gr.Dropdown(
                choices=["Male", "Female"],
                value=default_gender,
                label="Speaker Gender",
            )
            emotion_dropdown = gr.Dropdown(
                choices=emotions,
                value=default_emotion,
                label="Select Emotion",
            )

        # Fine-grained style controls folded into the generated description.
        with gr.Row():
            noise_dropdown = gr.Dropdown(
                choices=["Clear", "Slightly Noisy"],
                value="Clear",
                label="Background Noise",
            )
            reverb_dropdown = gr.Dropdown(
                choices=["Close-Sounding", "Distant-Sounding"],
                value="Close-Sounding",
                label="Reverberation",
            )
            expressivity_dropdown = gr.Dropdown(
                choices=["Expressive", "Slightly Expressive", "Monotone"],
                value="Expressive",
                label="Expressivity",
            )
            pitch_dropdown = gr.Dropdown(
                choices=["High", "Low", "Balanced"],
                value="Balanced",
                label="Pitch",
            )
            rate_dropdown = gr.Dropdown(
                choices=["Slow", "Moderate", "Fast"],
                value="Moderate",
                label="Speaking Rate",
            )
            quality_dropdown = gr.Dropdown(
                choices=["Basic", "Refined"],
                value="Refined",
                label="Voice Quality",
            )

        with gr.Row():
            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="Type your text here...",
                lines=5,
            )

        with gr.Row():
            generate_caption_button = gr.Button("Generate Caption/Description")
            caption_output = gr.Textbox(
                label="Generated Caption/Description",
                placeholder="The generated caption will appear here...",
                lines=5,
            )

        with gr.Row():
            generate_audio_button = gr.Button("Generate Speech")
            audio_output = gr.Audio(label="Generated Audio")

        # Step 1: dropdown selections -> style description text.
        generate_caption_button.click(
            fn=generate_description,
            inputs=[
                lang_dropdown, gender_dropdown, emotion_dropdown,
                noise_dropdown, reverb_dropdown, expressivity_dropdown,
                pitch_dropdown, rate_dropdown, quality_dropdown,
            ],
            outputs=caption_output,
        )
        # Step 2: text + (possibly edited) description -> synthesized audio.
        generate_audio_button.click(
            fn=generate_audio,
            inputs=[text_input, caption_output],
            outputs=audio_output,
        )
    return demo
# Launch only when executed as a script (HF Spaces runs app.py as __main__);
# the guard lets the module be imported without starting a server.
if __name__ == "__main__":
    app().launch()