import os import gradio as gr import outetts import spaces from outetts.version.v1.interface import _DEFAULT_SPEAKERS model_config = outetts.HFModelConfig_v1( model_path="OuteAI/OuteTTS-0.2-500M", language="en", ) interface = outetts.InterfaceHF(model_version="0.2", cfg=model_config) def get_available_speakers(language): """Get available speakers for the selected language.""" if language not in interface.languages: return [] speakers = list(_DEFAULT_SPEAKERS[language].keys()) speakers.insert(0, "None") return speakers def change_interface_language(language): """Change interface language and update available speakers.""" try: interface.change_language(language) speakers = get_available_speakers(language) return gr.update(choices=speakers, value="male_1"), gr.update(visible=True) except ValueError as e: return gr.update(choices=["None"], value="None"), gr.update(visible=False) @spaces.GPU() def generate_tts( text, temperature, repetition_penalty, language, speaker_selection, reference_audio, reference_text ): """Generate TTS with error handling and new features.""" try: # Validate inputs for custom speaker if reference_audio and reference_text: if not os.path.exists(reference_audio): raise ValueError("Reference audio file not found") if not reference_text.strip(): raise ValueError("Reference transcription text is required") speaker = interface.create_speaker(reference_audio, reference_text) # Use selected default speaker elif speaker_selection and speaker_selection != "None": speaker = interface.load_default_speaker(speaker_selection) # No speaker - random characteristics else: speaker = None # Generate audio output = interface.generate( text=text, speaker=speaker, temperature=temperature, repetition_penalty=repetition_penalty, max_length=4096 ) # Verify output if output.audio is None: raise ValueError("Model failed to generate audio. This may be due to input length constraints or early EOS token.") # Save and return output output_path = "output.wav" output.save(output_path) return output_path, None except Exception as e: return None, str(e) with gr.Blocks() as demo: gr.Markdown("# OuteTTS-0.2-500M Text-to-Speech Demo") error_box = gr.Textbox(label="Error Messages", visible=False) with gr.Row(): with gr.Column(): # Language selection language_dropdown = gr.Dropdown( choices=list(interface.languages), value="en", label="Interface Language" ) # Speaker selection speaker_dropdown = gr.Dropdown( choices=get_available_speakers("en"), value="male_1", label="Speaker Selection" ) text_input = gr.Textbox( label="Text to Synthesize", placeholder="Enter text here..." ) temperature = gr.Slider( 0.1, 1.0, value=0.1, label="Temperature (lower = more stable tone, higher = more expressive)" ) repetition_penalty = gr.Slider( 0.5, 2.0, value=1.1, label="Repetition Penalty" ) gr.Markdown(""" ### Voice Cloning Guidelines: - Use 10-15 seconds of clear, noise-free audio - Provide accurate transcription - Longer audio clips will reduce maximum output length - Custom speaker overrides speaker selection """) reference_audio = gr.Audio( label="Reference Audio (for voice cloning)", type="filepath" ) reference_text = gr.Textbox( label="Reference Transcription Text", placeholder="Enter exact transcription of reference audio" ) submit_button = gr.Button("Generate Speech") with gr.Column(): audio_output = gr.Audio( label="Generated Audio", type="filepath" ) language_dropdown.change( fn=change_interface_language, inputs=[language_dropdown], outputs=[speaker_dropdown, speaker_dropdown] ) submit_button.click( fn=generate_tts, inputs=[ text_input, temperature, repetition_penalty, language_dropdown, speaker_dropdown, reference_audio, reference_text ], outputs=[audio_output, error_box] ).then( fn=lambda x: gr.update(visible=bool(x)), inputs=[error_box], outputs=[error_box] ) demo.launch()