import asyncio
import base64
import os
import tempfile

import gradio as gr

from tts import voices, tts, get_task_result, Voice

# Seconds to wait between two polls of a pending TTS task.
POLL_INTERVAL_SECONDS = 1


def _resolve_voice(voice_name, custom_audio, custom_prompt_text):
    """Return the voice mapping to synthesize with.

    A custom voice is used only when BOTH an uploaded audio path and a
    non-blank transcription are supplied; otherwise the predefined voice
    registered under ``voice_name`` is returned.

    Raises:
        KeyError: if ``voice_name`` is not a key of ``voices``.
    """
    if custom_audio is not None and custom_prompt_text and custom_prompt_text.strip():
        # Same keys the predefined voices are accessed with in this file
        # ("name"); "promptText"/"promptAudio" are passed through to tts().
        return {
            "name": "Custom Voice",
            "promptText": custom_prompt_text,
            "promptAudio": custom_audio,
        }
    return voices[voice_name]


async def _run_tts_task(text, voice):
    """Submit a TTS task and poll until its status is no longer 'PENDING'.

    Returns the last result object obtained from ``get_task_result``.
    """
    task_id = await tts(text, voice)
    while True:
        result = await get_task_result(task_id)
        if result["status"] != "PENDING":
            return result
        await asyncio.sleep(POLL_INTERVAL_SECONDS)


def _save_audio(audio_data):
    """Decode base64 audio and write it to a fresh temporary .wav file.

    ``audio_data`` may carry a comma-terminated prefix (presumably a
    data-URL header such as ``data:audio/wav;base64,`` -- confirm against
    the tts service); everything after the first comma is decoded.

    Returns the path of the written file.
    """
    _prefix, separator, payload = audio_data.partition(",")
    if not separator:
        payload = audio_data

    # Decode before creating the file so a malformed payload does not leave
    # an empty file behind.
    audio_bytes = base64.b64decode(payload)

    # A unique file per request: a fixed name would be overwritten by the
    # next request. delete=False because the UI reads the file after this
    # function returns; the file is left in the system temp directory.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as audio_file:
        audio_file.write(audio_bytes)
    return audio_file.name


def generate_speech(text, voice_name, custom_audio=None, custom_prompt_text=None):
    """Generate speech from text using the selected voice or a custom voice.

    Args:
        text: Text to synthesize. ``None`` or blank text is rejected.
        voice_name: Key into ``voices`` for the predefined voice.
        custom_audio: Optional file path of an uploaded voice sample.
        custom_prompt_text: Transcription of ``custom_audio``; the custom
            voice is used only when both are provided.

    Returns:
        A ``(audio_path, status_message)`` tuple. ``audio_path`` is ``None``
        whenever generation did not succeed; this function reports failures
        through the status message rather than raising.
    """
    if not text or not text.strip():
        return None, "Please enter some text"

    try:
        voice = _resolve_voice(voice_name, custom_audio, custom_prompt_text)
    except KeyError:
        return None, f"Error: unknown voice {voice_name!r}"

    # UI boundary: any failure is turned into a status message for the user.
    try:
        # asyncio.run() creates its own event loop, so this must be called
        # from a thread that has no loop running.
        result = asyncio.run(_run_tts_task(text, voice))
        if result["status"] != "SUCCESS":
            return None, f"TTS generation failed: {result['message']}"
        output_file = _save_audio(result["audio_url"])
        return output_file, f"Successfully generated audio using {voice['name']}"
    except Exception as e:
        return None, f"Error: {e}"


# Map each voice key to its display name for the dropdown.
voice_options = {k: v["name"] for k, v in voices.items()}

# The first registered voice is the initial dropdown selection.
_voice_keys = list(voice_options)
_default_voice_key = _voice_keys[0]


def _selected_voice_label(voice_key):
    """Return the markdown text shown under the dropdown for ``voice_key``."""
    return f"Selected Voice: {voice_options[voice_key]}"


# Create the Gradio interface
with gr.Blocks(title="Cantonese Text-to-Speech") as demo:
    gr.Markdown("# Cantonese Text-to-Speech Demo")
    gr.Markdown("Enter text in Cantonese and select a voice to generate speech.")

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                placeholder="輸入廣東話文字...",
                label="Text to convert",
                lines=5,
            )

            with gr.Group():
                gr.Markdown("### Choose a voice option")
                voice_dropdown = gr.Dropdown(
                    choices=_voice_keys,
                    value=_default_voice_key,
                    label="Select Predefined Voice",
                    info="Choose a voice for synthesis",
                )
                # Display the actual voice name based on the selection
                voice_name_display = gr.Markdown(
                    value=_selected_voice_label(_default_voice_key)
                )

        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("### Or upload your own voice (optional)")
                custom_audio = gr.Audio(
                    label="Upload Voice Sample (WAV format)",
                    type="filepath",
                    format="wav",
                )
                custom_prompt_text = gr.Textbox(
                    placeholder="Enter the exact transcription of the uploaded audio...",
                    label="Transcription of Uploaded Audio (required if using custom voice)",
                    lines=2,
                )
                gr.Markdown("*Note: The custom voice sample should be clear with minimal background noise.*")

            generate_btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column(scale=3):
            audio_output = gr.Audio(label="Generated Speech", type="filepath")
            status_text = gr.Markdown("Ready to generate speech")

    # Update the voice name display when dropdown changes
    voice_dropdown.change(
        fn=_selected_voice_label,
        inputs=voice_dropdown,
        outputs=voice_name_display,
    )

    # Generate speech when button is clicked; one generation at a time.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, custom_audio, custom_prompt_text],
        outputs=[audio_output, status_text],
        concurrency_limit=1,
    )

if __name__ == "__main__":
    demo.launch()