"""Gradio web UI that exposes the ``voice_processing.tts`` pipeline."""

import asyncio
import base64
import os
from io import BytesIO

import gradio as gr
import numpy as np
from scipy.io import wavfile

from voice_processing import tts, get_model_names, voice_mapping

# Upper bound, in seconds, on how long a single TTS request may run.
TTS_TIMEOUT_SECONDS = 60


def _encode_wav_data_uri(sample_rate, samples):
    """Serialize audio samples to an in-memory WAV and return it as a data URI."""
    buffer = BytesIO()
    # wavfile.write accepts any writable file-like object, so no temp file
    # is needed to obtain the WAV bytes.
    wavfile.write(buffer, sample_rate, np.asarray(samples))
    encoded = base64.b64encode(buffer.getvalue()).decode('utf-8')
    return f"data:audio/wav;base64,{encoded}"


def _remove_quietly(path):
    """Delete ``path`` if it exists; log (rather than raise) on failure."""
    if not path or not os.path.exists(path):
        return
    try:
        os.remove(path)
    except OSError as exc:
        # Cleanup is best-effort: a leftover temp file must not turn a
        # successful synthesis into an error response.
        print(f"Could not remove temporary file {path}: {exc}")


async def convert_tts(model_name, tts_text, selected_voice, slang_rate,
                      use_uploaded_voice, voice_upload):
    """Run text-to-speech and return ``(info, audio)`` for the Gradio outputs.

    Args:
        model_name: Model identifier forwarded unchanged to ``tts``.
        tts_text: Text to synthesize.
        selected_voice: Key into ``voice_mapping``; resolved to the voice
            name that is handed to ``tts``.
        slang_rate: Value of the "Slang Rate" slider, forwarded to ``tts``.
        use_uploaded_voice: When truthy and ``voice_upload`` is provided, the
            uploaded file's bytes are forwarded to ``tts``.
        voice_upload: Uploaded file from ``gr.File`` (an object with a
            ``.name`` path attribute, or a plain path string), or ``None``.

    Returns:
        A 2-tuple. On success ``({"info": info}, data_uri)`` where
        ``data_uri`` is a base64 ``data:audio/wav`` URI. On failure
        ``({"error": message}, None)``; an error dict returned by ``tts``
        itself is passed through as the first element.
    """
    try:
        print(f"Starting TTS for text: {tts_text[:50]}...")

        edge_tts_voice = voice_mapping.get(selected_voice)
        if not edge_tts_voice:
            print(f"Invalid voice selected: {selected_voice}")
            return {"error": f"Invalid voice '{selected_voice}'."}, None

        voice_upload_file = None
        if use_uploaded_voice and voice_upload is not None:
            print("Processing uploaded voice file...")
            # gr.File may hand over either a file wrapper exposing ``.name``
            # or a bare path string, depending on the Gradio version/config.
            upload_path = getattr(voice_upload, "name", voice_upload)
            with open(upload_path, 'rb') as f:
                voice_upload_file = f.read()

        print("Calling TTS function...")
        info, edge_output_filename, tts_output_data = await asyncio.wait_for(
            tts(model_name, tts_text, edge_tts_voice, slang_rate,
                use_uploaded_voice, voice_upload_file),
            timeout=TTS_TIMEOUT_SECONDS,
        )
        print("TTS function call completed.")

        # Remove the intermediate EdgeTTS file before any early return so it
        # is not leaked when ``tts`` reports an error.
        _remove_quietly(edge_output_filename)

        if isinstance(info, dict) and "error" in info:
            print(f"Error returned from TTS function: {info['error']}")
            return info, None

        print("Processing TTS output...")
        tgt_sr, audio_output = tts_output_data
        # NOTE(review): the second output is a gr.Audio(type="numpy")
        # component; confirm it renders a data URI string, or whether
        # ``(tgt_sr, audio_output)`` should be returned to the UI instead.
        audio_data_uri = _encode_wav_data_uri(tgt_sr, audio_output)
        return {"info": info}, audio_data_uri

    except asyncio.TimeoutError:
        return {"error": "Operation timed out"}, None
    except asyncio.CancelledError:
        return {"error": "Operation was cancelled"}, None
    except Exception as e:
        # Top-level boundary for the UI: report the failure instead of raising.
        print(f"Error in convert_tts: {str(e)}")
        return {"error": str(e)}, None


def get_models():
    """Return the model names offered in the "Model" dropdown."""
    return get_model_names()


def get_voices():
    """Return the selectable voice labels (the keys of ``voice_mapping``)."""
    return list(voice_mapping.keys())


# ``concurrency_limit`` and ``max_batch_size`` are gr.Interface constructor
# arguments; ``queue()`` does not accept them, so they are set here.
iface = gr.Interface(
    fn=convert_tts,
    inputs=[
        gr.Dropdown(choices=get_models(), label="Model", interactive=True),
        gr.Textbox(label="Text", placeholder="Enter text here"),
        gr.Dropdown(choices=get_voices(), label="Voice", interactive=True),
        gr.Slider(minimum=0, maximum=1, step=0.01, label="Slang Rate"),
        gr.Checkbox(label="Use Uploaded Voice"),
        gr.File(label="Voice File"),
    ],
    outputs=[
        gr.JSON(label="Info"),
        gr.Audio(label="Generated Audio", type="numpy"),
    ],
    title="Text-to-Speech Conversion",
    concurrency_limit=6,
    max_batch_size=1,
).queue()

iface.launch()