import gradio as gr
import torch
from TTS.api import TTS
import os
import tempfile

# Mark the Coqui model license terms as accepted before any model is loaded,
# so loading does not stop to ask for confirmation on stdin.
os.environ["COQUI_TOS_AGREED"] = "1"

# Run inference on the GPU when one is visible to PyTorch, otherwise on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Initialize TTS model
def load_tts_model():
    """Construct the XTTS v2 model and move it onto ``device``.

    Returns:
        Whatever ``.to(device)`` returns for the freshly constructed ``TTS``
        object; this is the handle used for synthesis.
    """
    model = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
    return model.to(device)


# One shared model instance, built once at import time and reused by ``clone``.
tts = load_tts_model()

def clone(text, audio_file, language, speaking_rate, pitch, volume,
          emotion, sample_rate, temperature, seed):
    """Synthesize ``text`` in the voice of a reference recording.

    Args:
        text: The text to speak.
        audio_file: Path to the reference voice recording; forwarded to the
            model as ``speaker_wav``.
        language: Language code forwarded to the model.
        speaking_rate: Accepted to match the UI inputs; not applied yet.
        pitch: Accepted to match the UI inputs; not applied yet.
        volume: Accepted to match the UI inputs; not applied yet.
        emotion: Accepted to match the UI inputs; not applied yet.
        sample_rate: Accepted to match the UI inputs; not applied yet.
        temperature: Accepted to match the UI inputs; not applied yet.
        seed: Optional number. When not ``None`` it seeds the PyTorch RNG
            before synthesis.

    Returns:
        Path to a newly created ``.wav`` file containing the generated audio.
        The file is not deleted by this function.

    Raises:
        gr.Error: If ``text`` is empty/blank or no reference audio was given.
    """
    # Fail early with a message the UI can show, instead of letting the model
    # raise an opaque error on missing input.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not audio_file:
        raise gr.Error("Please provide a voice reference audio file.")

    if seed is not None:
        # The UI delivers the seed as a generic number; seed with an int.
        torch.manual_seed(int(seed))

    # Reserve a unique output path. delete=False keeps the file after the
    # handle is closed so the model can write to it and the caller can read it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio_path = temp_audio.name

    try:
        tts.tts_to_file(
            text=text,
            speaker_wav=audio_file,
            language=language,
            file_path=temp_audio_path,
        )
    except Exception:
        # Synthesis failed: do not leave the reserved file behind.
        try:
            os.remove(temp_audio_path)
        except OSError:
            pass  # best-effort; the original error is the one worth reporting
        raise

    return temp_audio_path

# Define Gradio interface
# The input components are listed in the same order as the positional
# parameters of ``clone``: text, audio_file, language, speaking_rate, pitch,
# volume, emotion, sample_rate, temperature, seed.
# NOTE: ``clone`` currently uses only the text, reference audio, language and
# seed values; the remaining controls are passed in but not applied.
iface = gr.Interface(
    fn=clone,
    inputs=[
        gr.Textbox(label="Text"),
        # type="filepath" hands ``clone`` a path on disk rather than raw samples.
        gr.Audio(type="filepath", label="Voice reference audio file"),
        gr.Dropdown(
            choices=["en", "es", "fr", "de", "it"],
            value="en",
            label="Language",
        ),
        gr.Slider(minimum=0.5, maximum=2.0, value=1.0, label="Speaking Rate"),
        gr.Slider(minimum=-10, maximum=10, value=0, label="Pitch Adjustment"),
        gr.Slider(minimum=0.1, maximum=2.0, value=1.0, label="Volume"),
        gr.Dropdown(
            choices=["neutral", "happy", "sad", "angry"],
            value="neutral",
            label="Emotion",
        ),
        gr.Dropdown(
            choices=[22050, 24000, 44100, 48000],
            value=24000,
            label="Sample Rate",
        ),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.8, label="Temperature"),
        gr.Number(label="Seed (optional)"),
    ],
    outputs=gr.Audio(label="Generated Audio"),
    title="Advanced Voice Clone",
    description="Customize your voice cloning experience with various parameters.",
)

# Launch the interface
iface.launch()

# Clean up temporary files (this will run after the Gradio server is closed).
# The generated audio is created with tempfile.NamedTemporaryFile, so it lives
# in the system temp directory, not in the current working directory. Scan
# that directory and match the default temp-file prefix plus the ".wav" suffix.
# NOTE(review): this still matches by name only, so it can remove tmp*.wav
# files in the temp directory that this app did not create.
temp_dir = tempfile.gettempdir()
temp_prefix = tempfile.gettempprefix()
for file in os.listdir(temp_dir):
    if not (file.startswith(temp_prefix) and file.endswith('.wav')):
        continue
    try:
        os.remove(os.path.join(temp_dir, file))
    except OSError:
        # Best-effort cleanup: the entry may already be gone, be a directory,
        # or belong to another user. Skip it rather than abort shutdown.
        pass