File size: 2,146 Bytes
97e4faf
 
 
bd104fa
97e4faf
a638c29
97e4faf
a638c29
9decc3e
a638c29
 
bd104fa
97e4faf
ea84da0
 
 
 
 
d743ee9
ea84da0
d743ee9
ea84da0
d743ee9
ea84da0
d743ee9
ea84da0
 
a638c29
97e4faf
44f4964
97e4faf
 
 
 
 
 
44f4964
97e4faf
 
a638c29
44f4964
 
a638c29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import torch
import gradio as gr
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice, load_voices

# Instantiate the Tortoise TTS engine once at import time (heavy: loads model
# weights). kv_cache/half/deepspeed are the speed-oriented settings.
tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)

# NOTE(review): despite the name, `languages` holds gender labels for the first
# dropdown; `voices` maps each gender to its available Tortoise voice presets.
languages = ['Male', 'Female']
voices = {'Male': ['deniro', 'freeman'], 'Female': ['emma', 'angie']}

def inference(text, Gender, voice, Emotion, Preset):
    """Synthesize speech for *text* and yield it as one streaming chunk.

    Parameters
    ----------
    text : str
        The text to speak.
    Gender : str
        Selected gender label (currently unused here; it only drives the
        voice dropdown in the UI).
    voice : str
        Tortoise voice preset name passed to ``load_voice``.
    Emotion : str or None
        One of "Angry"/"Sad"/"Happy"/"Scared"; anything else adds no prompt.
    Preset : str
        Tortoise quality preset (e.g. "ultra_fast").

    Yields
    ------
    tuple[int, numpy.ndarray]
        ``(24000, waveform)`` for the streaming ``gr.Audio`` output.
    """
    # Tortoise conveys emotion via a bracketed prompt that is conditioned on
    # but not spoken aloud.
    emotion_prompts = {
        "Angry": "[I am so angry]",
        "Sad": "[I am so sad]",
        "Happy": "[I am so happy]",
        "Scared": "[I am so scared]",
    }
    # BUG FIX: apply the emotion prefix BEFORE building the text list.
    # Previously `texts = [text]` captured the unmodified string, so the
    # emotion prompt was computed and then silently discarded.
    texts = [emotion_prompts.get(Emotion, "") + text]

    voice_samples, conditioning_latents = load_voice(voice)

    audio_frames = []
    for t in texts:
        for audio_frame in tts.tts_with_preset(
            t,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=Preset,
            k=1
        ):
            # Detach from the autograd graph and move to CPU; the previous
            # tensor -> numpy -> tensor round-trip was redundant.
            audio_frames.append(audio_frame.detach().cpu())

    complete_audio = torch.cat(audio_frames, dim=0)
    # 24 kHz is Tortoise's native output sample rate.
    yield (24000, complete_audio.numpy())

def rs_change(rs):
    """Refresh the voice dropdown to show the voices for gender *rs*.

    Returns a ``gr.update`` that swaps in the new choices and selects the
    first one (or ``None`` if the list is empty).
    """
    options = voices[rs]
    default = options[0] if options else None
    return gr.update(choices=options, value=default)

# App title (currently not wired into the Blocks layout).
title = "Tortoise TTS"

# Build the UI: text + voice controls feed `inference`, whose generator
# output streams into an autoplaying audio widget.
with gr.Blocks() as app:
    text_input = gr.Textbox(lines=4, label="Text:")
    gender_dropdown = gr.Dropdown(choices=languages, value='Male', label="Gender")
    voice_dropdown = gr.Dropdown(choices=voices['Male'], interactive=True, label="Voice")
    # Keep the voice list in sync with the selected gender.
    gender_dropdown.change(fn=rs_change, inputs=[gender_dropdown], outputs=[voice_dropdown])
    emotion_radio = gr.Radio(["Angry", "Sad", "Happy", "Scared"], type="value", label="Emotion")
    preset_radio = gr.Radio(["ultra_fast", "fast", "standard", "high_quality"], type="value", value="ultra_fast", label="Preset")
    streamed_audio = gr.Audio(label="Streaming audio:", streaming=True, autoplay=True)
    generate_btn = gr.Button("Generate")
    generate_btn.click(
        inference,
        inputs=[text_input, gender_dropdown, voice_dropdown, emotion_radio, preset_radio],
        outputs=[streamed_audio],
    )

app.launch()