# Spaces: abhishekrajpurohit/generate_local_lan
# Status: Runtime error
# File size: 6,453 Bytes
# Revision: 195bb33

import logging

import gradio as gr

from models.tts import TTSModel
from utils.audio_utils import save_audio, get_cached_audio, get_audio_filename
from utils.input_validation import validate_input
from config.language_mapping import (
    LANGUAGE_VOICE_MAPPING,
    construct_description,
    EMOTION_DESC,
    SPEED_DESC,
    PITCH_DESC,
    BACKGROUND_NOISE_DESC,
    REVERBERATION_DESC,
    QUALITY_DESC,
    get_speakers_for_language
)

# Lazily created TTSModel instance shared by every call to generate_speech.
_tts_model = None


def _get_tts_model():
    """Return the shared TTSModel, constructing it on first use.

    NOTE(review): assumes a TTSModel instance can be reused across calls to
    generate_audio -- confirm against models.tts.
    """
    global _tts_model
    if _tts_model is None:
        _tts_model = TTSModel()
    return _tts_model


def generate_speech(
    text: str,
    language: str,
    speaker: str,
    emotion: str = "Neutral",
    speed: str = "Normal",
    pitch: str = "Medium",
    background_noise: str = "Minimal",
    reverberation: str = "Close",
    quality: str = "High",
):
    """Synthesize speech for *text* with the requested voice settings.

    The input is validated first. If ``get_cached_audio`` returns a truthy
    value for the same settings, that value is returned directly; otherwise a
    voice description is built, the TTS model generates the audio, and the
    result is saved under the name given by ``get_audio_filename``.

    Args:
        text: The text to speak.
        language: Language of the text; validated together with ``text``.
        speaker: Name of the speaker voice.
        emotion: Expressivity option forwarded to ``construct_description``.
        speed: Speaking-speed option forwarded to ``construct_description``.
        pitch: Pitch option forwarded to ``construct_description``.
        background_noise: Background-noise option forwarded to
            ``construct_description``.
        reverberation: Reverberation option forwarded to
            ``construct_description``.
        quality: Audio-quality option forwarded to ``construct_description``.

    Returns:
        The cached value from ``get_cached_audio`` when present, otherwise the
        value returned by ``save_audio`` (presumably the saved file's path).

    Raises:
        gr.Error: If validation, generation or saving fails. The original
            exception is attached as ``__cause__``.
    """
    # The same settings identify both the cache entry and the output file,
    # so keep them in one place to stop the two argument lists drifting apart.
    request_key = (
        text, language, speaker, emotion, speed,
        pitch, background_noise, reverberation, quality,
    )

    try:
        validate_input(text, language)

        cached_audio = get_cached_audio(*request_key)
        if cached_audio:
            return cached_audio

        description = construct_description(
            speaker,
            language,
            emotion,
            speed,
            pitch,
            background_noise,
            reverberation,
            quality,
        )

        # Reuse one model instance instead of constructing a new one per call.
        audio_array = _get_tts_model().generate_audio(text, description)

        filename = get_audio_filename(*request_key)
        return save_audio(audio_array, filename)

    except gr.Error:
        # Already a user-facing error; re-wrapping would only lose detail.
        raise
    except Exception as e:
        # gr.Error shows only the message, so keep the traceback in the logs.
        logging.getLogger(__name__).exception("Speech generation failed")
        # Some exceptions carry no message; fall back to the type name so the
        # UI never shows an empty error.
        raise gr.Error(str(e) or type(e).__name__) from e

# Create Gradio interface
with gr.Blocks(title="Indic Text-to-Speech") as demo:
    gr.Markdown("# Indian Local Text-to-Speech Synthesizer")
    gr.Markdown("Generate natural speech in multiple Indian languages using AI4Bharat's model")

    with gr.Row():
        # Left column: text and voice controls.
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to speak",
                placeholder="Enter the text you want to convert to speech...",
                lines=3
            )

            with gr.Row():
                language_input = gr.Dropdown(
                    choices=sorted(LANGUAGE_VOICE_MAPPING),
                    label="Language",
                    value="English"
                )
                # Starts with the English speakers to match the default
                # language; refreshed by update_speakers on language change.
                speaker_input = gr.Dropdown(
                    choices=LANGUAGE_VOICE_MAPPING["English"],
                    label="Speaker",
                    value=LANGUAGE_VOICE_MAPPING["English"][0]
                )

            with gr.Row():
                emotion_input = gr.Dropdown(
                    choices=list(EMOTION_DESC),
                    label="Expressivity",
                    value="Neutral"
                )
                speed_input = gr.Dropdown(
                    choices=list(SPEED_DESC),
                    label="Speaking Speed",
                    value="Normal"
                )

            with gr.Row():
                pitch_input = gr.Dropdown(
                    choices=list(PITCH_DESC),
                    label="Pitch",
                    value="Medium"
                )
                background_input = gr.Dropdown(
                    choices=list(BACKGROUND_NOISE_DESC),
                    label="Background Noise",
                    value="Minimal"
                )

            with gr.Row():
                reverb_input = gr.Dropdown(
                    choices=list(REVERBERATION_DESC),
                    label="Reverberation",
                    value="Close"
                )
                quality_input = gr.Dropdown(
                    choices=list(QUALITY_DESC),
                    label="Audio Quality",
                    value="High"
                )

            generate_btn = gr.Button("Generate Speech", variant="primary")

        # Right column: playback of the generated audio.
        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy"
            )

    def update_speakers(language):
        """Return a Dropdown update listing the speakers for *language*."""
        speakers = get_speakers_for_language(language)
        # A language without speakers would make speakers[0] raise
        # IndexError; clear the selection instead.
        default_speaker = speakers[0] if speakers else None
        return gr.Dropdown(choices=speakers, value=default_speaker)

    # Update speaker choices when language changes
    language_input.change(
        fn=update_speakers,
        inputs=[language_input],
        outputs=[speaker_input]
    )

    # Components in the positional order generate_speech expects; shared by
    # the button and the examples so the two cannot drift apart.
    speech_inputs = [
        text_input,
        language_input,
        speaker_input,
        emotion_input,
        speed_input,
        pitch_input,
        background_input,
        reverb_input,
        quality_input
    ]

    # Connect the components
    generate_btn.click(
        fn=generate_speech,
        inputs=speech_inputs,
        outputs=audio_output
    )

    examples = [
        ["Hello, how are you?", "English", "Thoma", "Happy", "Normal", "Medium", "Minimal", "Close", "High"],
        ["नमस्ते, आप कैसे हैं?", "Hindi", "Rohit", "Neutral", "Normal", "Medium", "None", "Very Close", "Studio"],
        ["ನಮಸ್ಕಾರ, ಹೇಗಿದ್ದೀರಾ?", "Kannada", "Suresh", "Highly Expressive", "Fast", "High", "Minimal", "Moderate", "High"],
        ["How are you doing today?", "English", "Mary", "Monotone", "Slow", "Low", "Moderate", "Distant", "Good"],
    ]

    # Pre-generate the example outputs at startup. This is best-effort: a
    # failure here must not stop the whole app from starting.
    example_outputs = []
    examples_ready = True
    for example in examples:
        try:
            output = generate_speech(*example)
        except Exception:
            logging.getLogger(__name__).exception(
                "Could not pre-generate example %r", example[0]
            )
            output = None
            examples_ready = False
        example_outputs.append(output)

    # Add examples; cache their outputs only when every one was generated, so
    # a generation problem degrades to uncached examples instead of a crash.
    # NOTE(review): generate_speech returns a raw value (presumably a file
    # path); confirm postprocess=False caches it in the form gr.Audio expects.
    gr.Examples(
        examples=examples,
        inputs=speech_inputs,
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=examples_ready,
        preprocess=False,  # Don't preprocess inputs
        postprocess=False  # Don't postprocess outputs
    )

if __name__ == "__main__":
    demo.launch()