Commit a638c29
Parent(s): ebf7396

Update app.py

app.py CHANGED
@@ -1,26 +1,14 @@
-import os
 import torch
 import gradio as gr
-import torchaudio
-import time
-from datetime import datetime
 from tortoise.api import TextToSpeech
 from tortoise.utils.audio import load_voice, load_voices
 
-
-    "angie",
-    "deniro",
-    "freeman",
-    "emma",
-]
+tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)
 
-def inference(
-    text,
-    voice,
-    Emotion,
-    Preset,
-):
-
+languages = ['Male', 'Female']
+voices = {'Male': ['deniro', 'freeman'], 'Female': ['emma', 'angie']}
+
+def inference(text, Gender, voice, Emotion, Preset):
     texts = [text]
 
     Angry_tone = "[I am so angry]"
@@ -37,12 +25,7 @@ def inference(
     if Emotion == "Scared":
         text = Scared_tone + text
 
-
-
-    if len(voices) == 1:
-        voice_samples, conditioning_latents = load_voice(voice)
-    else:
-        voice_samples, conditioning_latents = load_voices(voices)
+    voice_samples, conditioning_latents = load_voice(voice)
 
     audio_frames = []
 
@@ -54,55 +37,26 @@
         preset=Preset,
         k=1
     ):
-        audio_frames.append(torch.from_numpy(audio_frame.cpu().detach().numpy()))
+        audio_frames.append(torch.from_numpy(audio_frame.cpu().detach().numpy()))
 
     complete_audio = torch.cat(audio_frames, dim=0)
-
-
-
-
-
-
-
-
-
-)
-
-
-
-)
-
-
-
-
-
-
-Preset = gr.Radio(
-    ["ultra_fast", "fast", "standard", "high_quality"],
-    type="value",
-    value="ultra_fast",
-)
-
-output_audio = gr.Audio(label="streaming audio:", streaming=True, autoplay=True)
-interface = gr.Interface(
-    fn=inference,
-    inputs=[
-        text,
-        voice,
-        Emotion,
-        Preset,
-    ],
-    title=title,
-    outputs=[output_audio],
-)
-interface.queue().launch()
-
-if __name__ == "__main__":
-    tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)
-
-    with open("Tortoise_TTS_Runs_Scripts.log", "a") as f:
-        f.write(
-            f"\n\n-------------------------Tortoise TTS Scripts Logs, {datetime.now()}-------------------------\n"
-        )
-
-    main()
+    yield (24000, complete_audio.numpy())
+
+def rs_change(rs):
+    new_choices = voices[rs]
+    return gr.update(choices=new_choices, value=new_choices[0] if new_choices else None)
+
+title = "Tortoise TTS"
+
+with gr.Blocks() as app:
+    text = gr.Textbox(lines=4, label="Text:")
+    rs = gr.Dropdown(choices=languages, value='Male', label="Gender")
+    rs_hw = gr.Dropdown(choices=voices['Male'], interactive=True, label="Voice")
+    rs.change(fn=rs_change, inputs=[rs], outputs=[rs_hw])
+    Emotion = gr.Radio(["Angry", "Sad", "Happy", "Scared"], type="value", label="Emotion")
+    Preset = gr.Radio(["ultra_fast", "fast", "standard", "high_quality"], type="value", value="ultra_fast", label="Preset")
+    output_audio = gr.Audio(label="Streaming audio:", streaming=True, autoplay=True)
+    btn = gr.Button("Generate")
+    btn.click(inference, inputs=[text, rs, rs_hw, Emotion, Preset], outputs=[output_audio])
+
+app.launch()
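The Gender-to-Voice linkage added in this commit works by returning gr.update(...) from a change callback: Gradio applies the update to the output component, swapping its choices in place. A minimal self-contained sketch of that pattern, reusing the commit's own voices mapping (the demo variable name is illustrative):

import gradio as gr

# Same gender-to-voice mapping as in the commit.
voices = {'Male': ['deniro', 'freeman'], 'Female': ['emma', 'angie']}

def rs_change(rs):
    # Return an update object; Gradio swaps the target dropdown's choices
    # and resets its value to the first entry of the new list.
    new_choices = voices[rs]
    return gr.update(choices=new_choices, value=new_choices[0] if new_choices else None)

with gr.Blocks() as demo:
    rs = gr.Dropdown(choices=list(voices), value='Male', label="Gender")
    rs_hw = gr.Dropdown(choices=voices['Male'], interactive=True, label="Voice")
    rs.change(fn=rs_change, inputs=[rs], outputs=[rs_hw])  # re-run on every Gender change

demo.launch()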
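Because output_audio is declared with streaming=True and inference is a generator, Gradio treats each yielded (sample_rate, numpy_array) tuple as an audio chunk and can begin playback before generation finishes. A self-contained sketch of that contract, with a synthetic sine-wave source standing in for Tortoise (tone_stream and its parameters are illustrative):

import numpy as np
import gradio as gr

def tone_stream(freq):
    # Yield one-second chunks; with streaming=True the Audio component
    # starts playing on the first (sample_rate, array) tuple it receives.
    sr = 24000
    t = np.arange(sr) / sr
    for _ in range(5):
        yield (sr, np.sin(2 * np.pi * float(freq) * t).astype(np.float32))

with gr.Blocks() as demo:
    freq = gr.Slider(110, 880, value=440, label="Frequency (Hz)")
    out = gr.Audio(streaming=True, autoplay=True)
    gr.Button("Play").click(tone_stream, inputs=[freq], outputs=[out])

demo.launch()

Note that in the committed inference the yield comes only after torch.cat has concatenated the collected frames, so each yielded chunk is a complete clip rather than an individual frame; yielding inside the generation loop would start playback sooner.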