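"""Gradio demo for Tortoise TTS.

The user enters text, picks a gender, a voice, an optional emotion and a
quality preset; the app synthesizes 24 kHz speech and streams it back
through a Gradio Audio component.
"""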
import torch
import gradio as gr
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice
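
# Load the Tortoise model once at start-up. kv_cache, use_deepspeed and half
# are speed optimizations; DeepSpeed and fp16 assume a CUDA-capable GPU.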
tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)

# UI choices: the selected gender decides which Tortoise voices are offered.
genders = ['Male', 'Female']
voices = {'Male': ['deniro', 'freeman'], 'Female': ['emma', 'angie']}

def inference(text, Gender, voice, Emotion, Preset):
    # Prefix the text with an emotion prompt; Tortoise treats bracketed text
    # as a delivery cue and does not speak it.
    Angry_tone = "[I am so angry]"
    Sad_tone = "[I am so sad]"
    Happy_tone = "[I am so happy]"
    Scared_tone = "[I am so scared]"
    if Emotion == "Angry":
        text = Angry_tone + text
    elif Emotion == "Sad":
        text = Sad_tone + text
    elif Emotion == "Happy":
        text = Happy_tone + text
    elif Emotion == "Scared":
        text = Scared_tone + text
    # Build the utterance list after the emotion prompt has been applied,
    # otherwise the prefix would be silently dropped.
    texts = [text]

    # Load the reference clips and cached conditioning latents for the chosen voice.
    voice_samples, conditioning_latents = load_voice(voice)

    audio_frames = []
    for text in texts:
        for audio_frame in tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=Preset,
            k=1,
        ):
            # Collect each generated frame on the CPU, detached from the graph.
            audio_frames.append(audio_frame.cpu().detach())

    # Tortoise generates 24 kHz audio; yield it so the streaming Audio component can play it.
    complete_audio = torch.cat(audio_frames, dim=0)
    yield (24000, complete_audio.numpy())

def update_voice_choices(gender):
    # Refresh the Voice dropdown to match the newly selected gender.
    new_choices = voices[gender]
    return gr.update(choices=new_choices, value=new_choices[0] if new_choices else None)

title = "Tortoise TTS"

with gr.Blocks(title=title) as app:
    text = gr.Textbox(lines=4, label="Text:")
    gender = gr.Dropdown(choices=genders, value='Male', label="Gender")
    voice = gr.Dropdown(choices=voices['Male'], value='deniro', interactive=True, label="Voice")
    # Keep the Voice dropdown in sync with the selected gender.
    gender.change(fn=update_voice_choices, inputs=[gender], outputs=[voice])
    Emotion = gr.Radio(["Angry", "Sad", "Happy", "Scared"], type="value", label="Emotion")
    Preset = gr.Radio(["ultra_fast", "fast", "standard", "high_quality"], type="value", value="ultra_fast", label="Preset")
    output_audio = gr.Audio(label="Streaming audio:", streaming=True, autoplay=True)
    btn = gr.Button("Generate")
    btn.click(inference, inputs=[text, gender, voice, Emotion, Preset], outputs=[output_audio])
# Queuing is required for generator (streaming) outputs on older Gradio releases.
app.queue().launch()