"""Gradio front end for voice cloning with the Coqui XTTS v2 model."""

import contextlib
import os
import tempfile
from typing import Optional

import gradio as gr
import torch
from TTS.api import TTS

# Flag the Coqui terms of service as agreed via the environment
# (presumably lets the model load without an interactive prompt -- confirm
# against the TTS version in use).
os.environ["COQUI_TOS_AGREED"] = "1"

device = "cuda" if torch.cuda.is_available() else "cpu"

# Paths of every WAV file produced by ``clone`` during this process, so that
# exactly those files (and nothing else) can be removed at shutdown.
_generated_files = set()


# Initialize TTS model
def load_tts_model():
    """Load the XTTS v2 multilingual model and move it to ``device``."""
    return TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)


tts = load_tts_model()


def clone(
    text: str,
    audio_file: Optional[str],
    language: str,
    speaking_rate: float,
    pitch: float,
    volume: float,
    emotion: str,
    sample_rate: int,
    temperature: float,
    seed: Optional[float],
) -> str:
    """Synthesize ``text`` in the voice of ``audio_file`` and return a WAV path.

    Args:
        text: The text to speak.
        audio_file: Filesystem path of the reference voice recording.
        language: Language code passed to the model (e.g. ``"en"``).
        speaking_rate: Accepted from the UI; not applied by this function.
        pitch: Accepted from the UI; not applied by this function.
        volume: Accepted from the UI; not applied by this function.
        emotion: Accepted from the UI; not applied by this function.
        sample_rate: Accepted from the UI; not applied by this function.
        temperature: Accepted from the UI; not applied by this function.
        seed: Optional random seed. The UI supplies it as a float (or ``None``
            when left blank); it is truncated to an integer before use.

    Returns:
        Path of the generated ``.wav`` file.

    Raises:
        gr.Error: If ``text`` is empty or no reference audio was supplied.
    """
    if text is None or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not audio_file:
        raise gr.Error("Please provide a reference audio file.")

    if seed is not None:
        # gr.Number yields a float, but torch.manual_seed requires an integer.
        torch.manual_seed(int(seed))

    # Reserve a unique output path, then close the handle *before* the model
    # writes to it (an open handle blocks writers on some platforms).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio_path = temp_audio.name

    try:
        tts.tts_to_file(
            text=text,
            speaker_wav=audio_file,
            language=language,
            file_path=temp_audio_path,
        )
    except Exception:
        # Do not leave an empty placeholder file behind when synthesis fails.
        with contextlib.suppress(OSError):
            os.remove(temp_audio_path)
        raise

    _generated_files.add(temp_audio_path)
    return temp_audio_path


def _remove_generated_files():
    """Best-effort removal of the WAV files created by ``clone``."""
    while _generated_files:
        path = _generated_files.pop()
        # A file may already be gone or locked; that must not abort shutdown.
        with contextlib.suppress(OSError):
            os.remove(path)


# Define Gradio interface
iface = gr.Interface(
    fn=clone,
    inputs=[
        gr.Textbox(label="Text"),
        gr.Audio(label="Voice reference audio file", type="filepath"),
        gr.Dropdown(["en", "es", "fr", "de", "it"], label="Language", value="en"),
        gr.Slider(0.5, 2.0, value=1.0, label="Speaking Rate"),
        gr.Slider(-10, 10, value=0, label="Pitch Adjustment"),
        gr.Slider(0.1, 2.0, value=1.0, label="Volume"),
        gr.Dropdown(["neutral", "happy", "sad", "angry"], label="Emotion", value="neutral"),
        gr.Dropdown([22050, 24000, 44100, 48000], label="Sample Rate", value=24000),
        gr.Slider(0.1, 1.0, value=0.8, label="Temperature"),
        gr.Number(label="Seed (optional)"),
    ],
    outputs=gr.Audio(label="Generated Audio"),
    title="Advanced Voice Clone",
    description="Customize your voice cloning experience with various parameters.",
)

# Launch the interface; clean up temporary files once the Gradio server is
# closed (including when it is interrupted).
try:
    iface.launch()
finally:
    _remove_generated_files()