import torch
import gradio as gr
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice, load_voices

# Tortoise model, tuned for fast inference (kv-cache, DeepSpeed, fp16).
tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)

# Gender selection drives which voice names are offered in the UI.
languages = ['Male', 'Female']
voices = {'Male': ['deniro', 'freeman'], 'Female': ['emma', 'angie']}

# Emotion name -> prompt prefix prepended to the text so Tortoise
# conditions its delivery on the requested emotion.
EMOTION_TONES = {
    "Angry": "[I am so angry]",
    "Sad": "[I am so sad]",
    "Happy": "[I am so happy]",
    "Scared": "[I am so scared]",
}


def inference(text, Gender, voice, Emotion, Preset):
    """Synthesize *text* with the chosen voice, emotion and quality preset.

    Yields a ``(sample_rate, numpy_audio)`` tuple for the streaming
    ``gr.Audio`` output. *Gender* is not read here — it only filters the
    voice dropdown in the UI.
    """
    # BUG FIX: prepend the emotion tone BEFORE building the text list.
    # The original did `texts = [text]` first and then modified `text`,
    # so the emotion prefix was never part of what got synthesized.
    tone = EMOTION_TONES.get(Emotion)
    if tone:
        text = tone + text
    texts = [text]

    voice_samples, conditioning_latents = load_voice(voice)

    audio_frames = []
    for chunk in texts:
        for audio_frame in tts.tts_with_preset(
            chunk,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=Preset,
            k=1,
        ):
            # Detach and move to CPU directly; the original's
            # tensor -> numpy -> tensor round-trip was redundant.
            audio_frames.append(audio_frame.detach().cpu())

    complete_audio = torch.cat(audio_frames, dim=0)
    # Tortoise generates 24 kHz audio.
    yield (24000, complete_audio.numpy())


def rs_change(rs):
    """Refresh the voice dropdown when the gender selection changes."""
    new_choices = voices[rs]
    return gr.update(
        choices=new_choices,
        value=new_choices[0] if new_choices else None,
    )


title = "Tortoise TTS"

with gr.Blocks() as app:
    text = gr.Textbox(lines=4, label="Text:")
    rs = gr.Dropdown(choices=languages, value='Male', label="Gender")
    rs_hw = gr.Dropdown(choices=voices['Male'], interactive=True, label="Voice")
    # Keep the voice list in sync with the selected gender.
    rs.change(fn=rs_change, inputs=[rs], outputs=[rs_hw])
    Emotion = gr.Radio(
        ["Angry", "Sad", "Happy", "Scared"], type="value", label="Emotion"
    )
    Preset = gr.Radio(
        ["ultra_fast", "fast", "standard", "high_quality"],
        type="value", value="ultra_fast", label="Preset",
    )
    output_audio = gr.Audio(label="Streaming audio:", streaming=True, autoplay=True)
    btn = gr.Button("Generate")
    btn.click(
        inference,
        inputs=[text, rs, rs_hw, Emotion, Preset],
        outputs=[output_audio],
    )

app.launch()