import torch
import gradio as gr
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice, load_voices

# Tortoise model, tuned for fast inference (kv-cache, DeepSpeed, fp16).
tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)

# Gender selection drives which voice names are offered in the UI.
languages = ['Male', 'Female']
voices = {'Male': ['deniro', 'freeman'], 'Female': ['emma', 'angie']}

# Emotion name -> prompt prefix prepended to the text so Tortoise
# conditions its delivery on the requested emotion.
EMOTION_TONES = {
    "Angry": "[I am so angry]",
    "Sad": "[I am so sad]",
    "Happy": "[I am so happy]",
    "Scared": "[I am so scared]",
}


def inference(text, Gender, voice, Emotion, Preset):
    """Synthesize *text* with the chosen voice, emotion and quality preset.

    Yields a ``(sample_rate, numpy_audio)`` tuple for the streaming
    ``gr.Audio`` output. *Gender* is not read here — it only filters the
    voice dropdown in the UI.
    """
    # BUG FIX: prepend the emotion tone BEFORE building the text list.
    # The original did `texts = [text]` first and then modified `text`,
    # so the emotion prefix was never part of what got synthesized.
    tone = EMOTION_TONES.get(Emotion)
    if tone:
        text = tone + text
    texts = [text]

    voice_samples, conditioning_latents = load_voice(voice)

    audio_frames = []
    for chunk in texts:
        for audio_frame in tts.tts_with_preset(
            chunk,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=Preset,
            k=1,
        ):
            # Detach and move to CPU directly; the original's
            # tensor -> numpy -> tensor round-trip was redundant.
            audio_frames.append(audio_frame.detach().cpu())

    complete_audio = torch.cat(audio_frames, dim=0)
    # Tortoise generates 24 kHz audio.
    yield (24000, complete_audio.numpy())


def rs_change(rs):
    """Refresh the voice dropdown when the gender selection changes."""
    new_choices = voices[rs]
    return gr.update(
        choices=new_choices,
        value=new_choices[0] if new_choices else None,
    )


title = "Tortoise TTS"

with gr.Blocks() as app:
    text = gr.Textbox(lines=4, label="Text:")
    rs = gr.Dropdown(choices=languages, value='Male', label="Gender")
    rs_hw = gr.Dropdown(choices=voices['Male'], interactive=True, label="Voice")
    # Keep the voice list in sync with the selected gender.
    rs.change(fn=rs_change, inputs=[rs], outputs=[rs_hw])
    Emotion = gr.Radio(
        ["Angry", "Sad", "Happy", "Scared"], type="value", label="Emotion"
    )
    Preset = gr.Radio(
        ["ultra_fast", "fast", "standard", "high_quality"],
        type="value", value="ultra_fast", label="Preset",
    )
    output_audio = gr.Audio(label="Streaming audio:", streaming=True, autoplay=True)
    btn = gr.Button("Generate")
    btn.click(
        inference,
        inputs=[text, rs, rs_hw, Emotion, Preset],
        outputs=[output_audio],
    )

app.launch()