# Source: Hugging Face Space file app.py, author PuristanLabs1.
# Last commit: "Update app.py" (6904c9a, verified); file size 5.88 kB.
import spaces # Import spaces first to avoid CUDA initialization issues
import gradio as gr
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
import tempfile
# Load the TTS model and both tokenizers once at import time. The model is
# parked on the CPU here; generate_audio moves it to the GPU per request.
print("Loading model and tokenizers...")
_model_id = "ai4bharat/indic-parler-tts"
model = ParlerTTSForConditionalGeneration.from_pretrained(_model_id)
model = model.to("cpu")
# Tokenizer for the text that will be spoken (the prompt).
tokenizer = AutoTokenizer.from_pretrained(_model_id)
# Tokenizer for the voice description; its name is taken from the model's
# text-encoder config.
description_tokenizer = AutoTokenizer.from_pretrained(
    model.config.text_encoder._name_or_path
)
print("Model and tokenizers loaded.")
# Supported languages, each mapped to a default voice description, followed by
# the selectable emotions and the initial selections used by the UI.
languages = {
    name: f"A female speaker delivers a clear and expressive speech in {name}."
    for name in ("Urdu", "Punjabi", "Sindhi")
}
emotions = [
    "Neutral",
    "Happy",
    "Sad",
    "Anger",
    "Command",
    "Narration",
    "Conversation",
    "Disgust",
    "Fear",
    "News",
    "Proper Noun",
    "Surprise",
]
default_language, default_gender, default_emotion = "Urdu", "Female", "Neutral"
# Generate description function
def generate_description(
    language: str,
    gender: str,
    emotion: str,
    noise: str,
    reverb: str,
    expressivity: str,
    pitch: str,
    rate: str,
    quality: str,
) -> str:
    """Build the natural-language voice description from the UI selections.

    Every argument except ``language`` is lower-cased before being placed in
    the sentence; ``language`` is inserted exactly as given.

    Args:
        language: Language name, used verbatim in the final sentence.
        gender: Speaker gender.
        emotion: Emotion of the delivery.
        noise: Background-noise level.
        reverb: Reverberation character.
        expressivity: How expressive the delivery is.
        pitch: Pitch of the voice.
        rate: Speaking rate.
        quality: Voice quality.

    Returns:
        The description text assembled from the arguments.
    """
    return (
        f"A {gender.lower()} speaker delivers a {emotion.lower()} and {expressivity.lower()} speech "
        f"with a {pitch.lower()} pitch and a {rate.lower()} speaking rate. "
        f"The audio has {noise.lower()} background noise, {reverb.lower()} reverberation, "
        f"and {quality.lower()} voice quality. The text is in {language}."
    )
# Generate audio function with GPU allocation
@spaces.GPU  # Allocate GPU for the duration of this function
def generate_audio(text, description):
    """Synthesize ``text`` as speech in the voice given by ``description``.

    Uses the module-level ``model``, ``tokenizer`` and
    ``description_tokenizer``. The model is moved to the GPU for the call and
    always moved back to the CPU afterwards, even when generation fails.

    Args:
        text: The text to be spoken.
        description: Natural-language description of the voice, e.g. the
            string returned by ``generate_description``.

    Returns:
        Path of a temporary ``.wav`` file holding the generated audio. The
        file is created with ``delete=False``, so it is not removed here.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # Reject empty input before touching the GPU or the model.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    try:
        model.to("cuda")
        # Keep the full tokenizer outputs so the attention masks can be passed
        # to generate() alongside the ids.
        description_inputs = description_tokenizer(
            description, return_tensors="pt"
        ).to("cuda")
        prompt_inputs = tokenizer(text, return_tensors="pt").to("cuda")
        generation = model.generate(
            input_ids=description_inputs.input_ids,
            attention_mask=description_inputs.attention_mask,
            prompt_input_ids=prompt_inputs.input_ids,
            prompt_attention_mask=prompt_inputs.attention_mask,
        )
        audio_arr = generation.cpu().numpy().squeeze()
    finally:
        # Move the model back to the CPU whether or not generation succeeded,
        # so a failed request does not leave it on the GPU.
        model.to("cpu")

    # Reserve a file name and close the handle first; writing by name while
    # the handle is still open is not portable.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_path = tmp.name
    sf.write(audio_path, audio_arr, model.config.sampling_rate)
    return audio_path
# Gradio Interface
def app():
    """Build the Gradio Blocks interface and return it without launching.

    The layout is five rows: language/gender/emotion selectors, the speech
    characteristic selectors, the text input, the caption button with its
    output box, and the speech button with its audio player. The caption
    button calls ``generate_description``; the speech button calls
    ``generate_audio`` with the entered text and the current caption text.

    Returns:
        The constructed ``gr.Blocks`` object.
    """
    # NOTE(review): the nesting below is reconstructed from the order of the
    # `with gr.Row()` statements because the source had lost its indentation;
    # confirm the row grouping against the deployed app.
    with gr.Blocks() as demo:
        gr.Markdown("# Indic Parler-TTS for Urdu, Punjabi, and Sindhi")
        gr.Markdown("Select language, speaker gender, emotion, and customize speech characteristics.")

        # Row 1: who is speaking and in which language/emotion.
        with gr.Row():
            lang_dropdown = gr.Dropdown(
                choices=list(languages.keys()),
                value=default_language,
                label="Select Language",
            )
            gender_dropdown = gr.Dropdown(
                choices=["Male", "Female"],
                value=default_gender,
                label="Speaker Gender",
            )
            emotion_dropdown = gr.Dropdown(
                choices=emotions,
                value=default_emotion,
                label="Select Emotion",
            )

        # Row 2: acoustic and delivery characteristics.
        with gr.Row():
            noise_dropdown = gr.Dropdown(
                choices=["Clear", "Slightly Noisy"],
                value="Clear",
                label="Background Noise",
            )
            reverb_dropdown = gr.Dropdown(
                choices=["Close-Sounding", "Distant-Sounding"],
                value="Close-Sounding",
                label="Reverberation",
            )
            expressivity_dropdown = gr.Dropdown(
                choices=["Expressive", "Slightly Expressive", "Monotone"],
                value="Expressive",
                label="Expressivity",
            )
            pitch_dropdown = gr.Dropdown(
                choices=["High", "Low", "Balanced"],
                value="Balanced",
                label="Pitch",
            )
            rate_dropdown = gr.Dropdown(
                choices=["Slow", "Moderate", "Fast"],
                value="Moderate",
                label="Speaking Rate",
            )
            quality_dropdown = gr.Dropdown(
                choices=["Basic", "Refined"],
                value="Refined",
                label="Voice Quality",
            )

        # Row 3: the text to be spoken.
        with gr.Row():
            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="Type your text here...",
                lines=5,
            )

        # Row 4: caption generation; the caption box is also the description
        # input for speech generation.
        with gr.Row():
            generate_caption_button = gr.Button("Generate Caption/Description")
            caption_output = gr.Textbox(
                label="Generated Caption/Description",
                placeholder="The generated caption will appear here...",
                lines=5,
            )

        # Row 5: speech generation and playback.
        with gr.Row():
            generate_audio_button = gr.Button("Generate Speech")
            audio_output = gr.Audio(label="Generated Audio")

        # Link actions to buttons. The input order must match the parameter
        # order of generate_description.
        generate_caption_button.click(
            fn=generate_description,
            inputs=[
                lang_dropdown, gender_dropdown, emotion_dropdown,
                noise_dropdown, reverb_dropdown, expressivity_dropdown,
                pitch_dropdown, rate_dropdown, quality_dropdown,
            ],
            outputs=caption_output,
        )
        generate_audio_button.click(
            fn=generate_audio,
            inputs=[text_input, caption_output],
            outputs=audio_output,
        )
    return demo
# Build the interface, then launch it.
demo = app()
demo.launch()