import torch
import gradio as gr

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice
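
# Load the Tortoise model once at startup; key-value caching, DeepSpeed
# inference and fp16 (half) all reduce generation latency.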
tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)
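
# Built-in voice presets grouped by gender; the voice dropdown is repopulated
# whenever the gender selection changes (see rs_change below).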
genders = ['Male', 'Female']
voices = {'Male': ['deniro', 'freeman'], 'Female': ['emma', 'angie']}


def inference(text, gender, voice, emotion, preset):
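    # Tortoise supports prompt engineering: a bracketed emotion cue prepended
    # to the text steers the delivery, and the bracketed part is trimmed from
    # the generated audio.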
    emotion_tones = {
        "Angry": "[I am so angry]",
        "Sad": "[I am so sad]",
        "Happy": "[I am so happy]",
        "Scared": "[I am so scared]",
    }
    if emotion in emotion_tones:
        text = emotion_tones[emotion] + " " + text
    texts = [text]
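
    # Load the reference clips and conditioning latents for the chosen voice.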
    voice_samples, conditioning_latents = load_voice(voice)
    audio_frames = []
    for text in texts:
        for audio_frame in tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=preset,
            k=1,
        ):
            audio_frames.append(torch.from_numpy(audio_frame.cpu().detach().numpy()))
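            # Concatenate everything generated so far and yield it at Tortoise's
            # native 24 kHz sample rate, so the streaming Audio component
            # updates as frames arrive.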
            complete_audio = torch.cat(audio_frames, dim=0)
            yield (24000, complete_audio.numpy())


def rs_change(rs):
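    # Swap the voice dropdown's choices to match the selected gender.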
    new_choices = voices[rs]
    return gr.update(choices=new_choices, value=new_choices[0] if new_choices else None)


title = "Tortoise TTS"

with gr.Blocks(title=title) as app:
    text = gr.Textbox(lines=4, label="Text:")
    rs = gr.Dropdown(choices=genders, value='Male', label="Gender")
    rs_hw = gr.Dropdown(choices=voices['Male'], interactive=True, label="Voice")
    rs.change(fn=rs_change, inputs=[rs], outputs=[rs_hw])
    emotion = gr.Radio(["Angry", "Sad", "Happy", "Scared"], type="value", label="Emotion")
    preset = gr.Radio(["ultra_fast", "fast", "standard", "high_quality"], type="value", value="ultra_fast", label="Preset")
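    # streaming=True lets the Audio component start playing chunks as the
    # inference generator yields them.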
    output_audio = gr.Audio(label="Streaming audio:", streaming=True, autoplay=True)
    btn = gr.Button("Generate")
    btn.click(inference, inputs=[text, rs, rs_hw, emotion, preset], outputs=[output_audio])
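
# Note: on older Gradio releases, generator (streaming) outputs may require
# enabling the queue with app.queue() before launching.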
app.launch()