"""Gradio web app for Indian-language text-to-speech synthesis (AI4Bharat model)."""
# Third-party
import gradio as gr

# Local application modules
from models.tts import TTSModel
from utils.audio_utils import save_audio, get_cached_audio, get_audio_filename
from utils.input_validation import validate_input
from config.language_mapping import (
    LANGUAGE_VOICE_MAPPING,
    construct_description,
    EMOTION_DESC,
    SPEED_DESC,
    PITCH_DESC,
    BACKGROUND_NOISE_DESC,
    REVERBERATION_DESC,
    QUALITY_DESC,
    get_speakers_for_language,
)
def generate_speech(
    text,
    language,
    speaker,
    emotion="Neutral",
    speed="Normal",
    pitch="Medium",
    background_noise="Minimal",
    reverberation="Close",
    quality="High",
):
    """Synthesize speech for *text* and return the path of the saved audio file.

    Results are cached keyed on the full parameter set, so a repeated request
    with identical settings returns the previously rendered audio.

    Args:
        text: Text to convert to speech.
        language: Target language; presumably a key of LANGUAGE_VOICE_MAPPING
            (validated by ``validate_input`` — confirm against that helper).
        speaker: Voice name for the chosen language.
        emotion, speed, pitch, background_noise, reverberation, quality:
            Style controls; each corresponds to a ``*_DESC`` mapping in
            ``config.language_mapping``.

    Returns:
        Filepath of the generated (or cached) audio.

    Raises:
        gr.Error: wrapping any validation or synthesis failure so Gradio
            displays it in the UI.
    """
    try:
        # Reject bad text/language combinations before any expensive work.
        validate_input(text, language)

        # Serve a previously rendered file if this exact request was seen.
        cached_audio = get_cached_audio(
            text, language, speaker, emotion, speed,
            pitch, background_noise, reverberation, quality
        )
        if cached_audio:
            return cached_audio

        # Build the natural-language style description the TTS model expects.
        description = construct_description(
            speaker,
            language,
            emotion,
            speed,
            pitch,
            background_noise,
            reverberation,
            quality
        )

        # Generate the audio and persist it under a cache-stable filename.
        tts_model = TTSModel()
        audio_array = tts_model.generate_audio(text, description)
        filename = get_audio_filename(
            text, language, speaker, emotion, speed,
            pitch, background_noise, reverberation, quality
        )
        return save_audio(audio_array, filename)
    except Exception as e:
        # Surface the failure in the Gradio UI; chain the cause so the
        # original traceback is preserved for server-side debugging.
        raise gr.Error(str(e)) from e
# Create Gradio interface
with gr.Blocks(title="Indic Text-to-Speech") as demo:
    gr.Markdown("# Indian Local Text-to-Speech Synthesizer")
    gr.Markdown("Generate natural speech in multiple Indian languages using AI4Bharat's model")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to speak",
                placeholder="Enter the text you want to convert to speech...",
                lines=3
            )
            with gr.Row():
                language_input = gr.Dropdown(
                    choices=sorted(LANGUAGE_VOICE_MAPPING.keys()),
                    label="Language",
                    value="English"
                )
                speaker_input = gr.Dropdown(
                    choices=LANGUAGE_VOICE_MAPPING["English"],  # Default choices
                    label="Speaker",
                    value=LANGUAGE_VOICE_MAPPING["English"][0]  # Default value
                )
            with gr.Row():
                emotion_input = gr.Dropdown(
                    choices=list(EMOTION_DESC.keys()),
                    label="Expressivity",
                    value="Neutral"
                )
                speed_input = gr.Dropdown(
                    choices=list(SPEED_DESC.keys()),
                    label="Speaking Speed",
                    value="Normal"
                )
            with gr.Row():
                pitch_input = gr.Dropdown(
                    choices=list(PITCH_DESC.keys()),
                    label="Pitch",
                    value="Medium"
                )
                background_input = gr.Dropdown(
                    choices=list(BACKGROUND_NOISE_DESC.keys()),
                    label="Background Noise",
                    value="Minimal"
                )
            with gr.Row():
                reverb_input = gr.Dropdown(
                    choices=list(REVERBERATION_DESC.keys()),
                    label="Reverberation",
                    value="Close"
                )
                quality_input = gr.Dropdown(
                    choices=list(QUALITY_DESC.keys()),
                    label="Audio Quality",
                    value="High"
                )
            generate_btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            # generate_speech returns a file path (from save_audio / the
            # cache), so the component must use type="filepath", not "numpy".
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )

    # Update speaker choices when language changes
    def update_speakers(language):
        """Return a Dropdown update listing the speakers for *language*."""
        speakers = get_speakers_for_language(language)
        return gr.Dropdown(choices=speakers, value=speakers[0])

    language_input.change(
        fn=update_speakers,
        inputs=[language_input],
        outputs=[speaker_input]
    )

    # Connect the components
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            text_input,
            language_input,
            speaker_input,
            emotion_input,
            speed_input,
            pitch_input,
            background_input,
            reverb_input,
            quality_input
        ],
        outputs=audio_output
    )

    examples = [
        ["Hello, how are you?", "English", "Thoma", "Happy", "Normal", "Medium", "Minimal", "Close", "High"],
        ["नमस्ते, आप कैसे हैं?", "Hindi", "Rohit", "Neutral", "Normal", "Medium", "None", "Very Close", "Studio"],
        ["ನಮಸ್ಕಾರ, ಹೇಗಿದ್ದೀರಾ?", "Kannada", "Suresh", "Highly Expressive", "Fast", "High", "Minimal", "Moderate", "High"],
        ["How are you doing today?", "English", "Mary", "Monotone", "Slow", "Low", "Moderate", "Distant", "Good"],
    ]

    # NOTE: the previous manual pre-generation loop was removed — it ran the
    # TTS model at import time (so any model failure crashed app startup) and
    # duplicated the work gr.Examples already does with cache_examples=True.
    gr.Examples(
        examples=examples,
        inputs=[
            text_input,
            language_input,
            speaker_input,
            emotion_input,
            speed_input,
            pitch_input,
            background_input,
            reverb_input,
            quality_input
        ],
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=True
    )

if __name__ == "__main__":
    demo.launch()