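"""Gradio app for Indic text-to-speech.

Synthesizes speech in multiple Indian languages with AI4Bharat's TTS model,
exposing controls for speaker, expressivity, speed, pitch, background noise,
reverberation, and audio quality. Generated clips are cached on disk so that
repeated requests with identical settings are served instantly.
"""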
import gradio as gr
from models.tts import TTSModel
from utils.audio_utils import save_audio, get_cached_audio, get_audio_filename
from utils.input_validation import validate_input
from config.language_mapping import (
    LANGUAGE_VOICE_MAPPING,
    construct_description,
    EMOTION_DESC,
    SPEED_DESC,
    PITCH_DESC,
    BACKGROUND_NOISE_DESC,
    REVERBERATION_DESC,
    QUALITY_DESC,
    get_speakers_for_language,
)
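
# Assumed contracts for the helpers above, inferred from how this file calls
# them (the actual modules may differ):
#   - validate_input(text, language): raises on empty or unsupported input.
#   - get_cached_audio(...): returns the path of a previously generated clip,
#     or a falsy value on a cache miss.
#   - get_audio_filename(...): derives a deterministic filename from the same
#     parameters, so identical requests map to the same cached file.
#   - save_audio(audio_array, filename): writes the waveform to disk and
#     returns the resulting filepath.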
def generate_speech(
    text,
    language,
    speaker,
    emotion="Neutral",
    speed="Normal",
    pitch="Medium",
    background_noise="Minimal",
    reverberation="Close",
    quality="High"
):
    """Synthesize `text` in the given language and voice; return the audio filepath."""
    try:
        # Validate inputs
        validate_input(text, language)

        # Serve a cached file if this exact request was generated before
        cached_audio = get_cached_audio(
            text, language, speaker, emotion, speed,
            pitch, background_noise, reverberation, quality
        )
        if cached_audio:
            return cached_audio

        # Build the natural-language voice description for the model
        description = construct_description(
            speaker,
            language,
            emotion,
            speed,
            pitch,
            background_noise,
            reverberation,
            quality
        )

        # Generate audio
        tts_model = TTSModel()
        audio_array = tts_model.generate_audio(text, description)

        # Save the generated audio under a deterministic, cache-friendly name
        filename = get_audio_filename(
            text, language, speaker, emotion, speed,
            pitch, background_noise, reverberation, quality
        )
        filepath = save_audio(audio_array, filename)
        return filepath
    except Exception as e:
        # Surface failures to the UI as a Gradio error message
        raise gr.Error(str(e)) from e
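
# For reference, a minimal sketch of what models/tts.py is assumed to contain,
# inferred from the calls above. The description-conditioned generate_audio()
# suggests a Parler-TTS-style checkpoint such as ai4bharat/indic-parler-tts,
# but this is a guess; the sketch is kept in comments so it cannot shadow the
# real imported TTSModel:
#
#     import torch
#     from parler_tts import ParlerTTSForConditionalGeneration
#     from transformers import AutoTokenizer
#
#     class TTSModel:
#         def __init__(self, model_id="ai4bharat/indic-parler-tts"):
#             self.device = "cuda" if torch.cuda.is_available() else "cpu"
#             self.model = ParlerTTSForConditionalGeneration.from_pretrained(
#                 model_id
#             ).to(self.device)
#             self.tokenizer = AutoTokenizer.from_pretrained(model_id)
#
#         def generate_audio(self, text, description):
#             desc = self.tokenizer(description, return_tensors="pt").to(self.device)
#             prompt = self.tokenizer(text, return_tensors="pt").to(self.device)
#             audio = self.model.generate(
#                 input_ids=desc.input_ids,
#                 prompt_input_ids=prompt.input_ids,
#             )
#             return audio.cpu().numpy().squeeze()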
# Create Gradio interface
with gr.Blocks(title="Indic Text-to-Speech") as demo:
    gr.Markdown("# Indic Text-to-Speech Synthesizer")
    gr.Markdown("Generate natural speech in multiple Indian languages using AI4Bharat's model")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to speak",
                placeholder="Enter the text you want to convert to speech...",
                lines=3
            )
            with gr.Row():
                language_input = gr.Dropdown(
                    choices=sorted(LANGUAGE_VOICE_MAPPING.keys()),
                    label="Language",
                    value="English"
                )
                speaker_input = gr.Dropdown(
                    choices=LANGUAGE_VOICE_MAPPING["English"],  # default choices
                    label="Speaker",
                    value=LANGUAGE_VOICE_MAPPING["English"][0]  # default value
                )
            with gr.Row():
                emotion_input = gr.Dropdown(
                    choices=list(EMOTION_DESC.keys()),
                    label="Expressivity",
                    value="Neutral"
                )
                speed_input = gr.Dropdown(
                    choices=list(SPEED_DESC.keys()),
                    label="Speaking Speed",
                    value="Normal"
                )
            with gr.Row():
                pitch_input = gr.Dropdown(
                    choices=list(PITCH_DESC.keys()),
                    label="Pitch",
                    value="Medium"
                )
                background_input = gr.Dropdown(
                    choices=list(BACKGROUND_NOISE_DESC.keys()),
                    label="Background Noise",
                    value="Minimal"
                )
            with gr.Row():
                reverb_input = gr.Dropdown(
                    choices=list(REVERBERATION_DESC.keys()),
                    label="Reverberation",
                    value="Close"
                )
                quality_input = gr.Dropdown(
                    choices=list(QUALITY_DESC.keys()),
                    label="Audio Quality",
                    value="High"
                )

            generate_btn = gr.Button("Generate Speech", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"  # generate_speech returns a saved file path, not a numpy array
            )
    # Update speaker choices when language changes
    def update_speakers(language):
        speakers = get_speakers_for_language(language)
        return gr.Dropdown(choices=speakers, value=speakers[0])

    language_input.change(
        fn=update_speakers,
        inputs=[language_input],
        outputs=[speaker_input]
    )
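    # Note: returning a fresh gr.Dropdown(...) from the handler is the Gradio
    # 4.x way to update an existing component in place; Gradio 3.x would use
    # gr.Dropdown.update(...) instead.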
    # Connect the components
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            text_input,
            language_input,
            speaker_input,
            emotion_input,
            speed_input,
            pitch_input,
            background_input,
            reverb_input,
            quality_input
        ],
        outputs=audio_output
    )
    # Example inputs shown beneath the form
    examples = [
        ["Hello, how are you?", "English", "Thoma", "Happy", "Normal", "Medium", "Minimal", "Close", "High"],
        ["नमस्ते, आप कैसे हैं?", "Hindi", "Rohit", "Neutral", "Normal", "Medium", "None", "Very Close", "Studio"],
        ["ನಮಸ್ಕಾರ, ಹೇಗಿದ್ದೀರಾ?", "Kannada", "Suresh", "Highly Expressive", "Fast", "High", "Minimal", "Moderate", "High"],
        ["How are you doing today?", "English", "Mary", "Monotone", "Slow", "Low", "Moderate", "Distant", "Good"],
    ]

    # Warm the audio cache at startup so clicking an example returns instantly.
    # The return values are not needed: generate_speech writes each result to
    # the on-disk cache as a side effect.
    for example in examples:
        generate_speech(*example)
    # Add examples with cached outputs
    gr.Examples(
        examples=examples,
        inputs=[
            text_input,
            language_input,
            speaker_input,
            emotion_input,
            speed_input,
            pitch_input,
            background_input,
            reverb_input,
            quality_input
        ],
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=True,
        preprocess=False,   # don't preprocess inputs
        postprocess=False   # don't postprocess outputs
    )
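    # With cache_examples=True, Gradio also runs generate_speech over every
    # example at startup; because of the warm-up loop above, those runs hit
    # the file cache instead of re-synthesizing the audio.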
if __name__ == "__main__":
    demo.launch()
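
# Run locally with `python app.py`. On a Hugging Face Space, app.py is the
# conventional entry point, so this block executes automatically on startup.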