Spaces:
Runtime error
Runtime error
Ahsen Khaliq
committed on
Commit
β’
633eaa6
1
Parent(s):
2c6c0c3
Update app.py
Browse files
app.py
CHANGED
@@ -5,13 +5,12 @@ import scipy.io.wavfile
|
|
5 |
from espnet2.bin.tts_inference import Text2Speech
|
6 |
from espnet2.utils.types import str_or_none
|
7 |
|
8 |
-
|
9 |
-
|
10 |
-
vocoder_tag = "none"
|
11 |
|
12 |
-
|
13 |
-
model_tag=str_or_none(
|
14 |
-
vocoder_tag=str_or_none(
|
15 |
device="cpu",
|
16 |
# Only for Tacotron 2 & Transformer
|
17 |
threshold=0.5,
|
@@ -29,11 +28,61 @@ text2speech = Text2Speech.from_pretrained(
|
|
29 |
)
|
30 |
|
31 |
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
with torch.no_grad():
|
34 |
-
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
return "out.wav"
|
38 |
title = "ESPnet2-TTS"
|
39 |
description = "Gradio demo for ESPnet2-TTS: Extending the Edge of TTS Research. To use it, simply add your audio, or click one of the examples to load them. Read more at the links below."
|
@@ -43,7 +92,7 @@ examples=[['This paper describes ESPnet2-TTS, an end-to-end text-to-speech (E2E-
|
|
43 |
|
44 |
gr.Interface(
|
45 |
inference,
|
46 |
-
gr.inputs.Textbox(label="input text",lines=10),
|
47 |
gr.outputs.Audio(type="file", label="Output"),
|
48 |
title=title,
|
49 |
description=description,
|
|
|
5 |
from espnet2.bin.tts_inference import Text2Speech
|
6 |
from espnet2.utils.types import str_or_none
|
7 |
|
8 |
+
tagen = 'kan-bayashi/ljspeech_vits'
|
9 |
+
vocoder_tagen = "none"
|
|
|
10 |
|
11 |
+
text2speechen = Text2Speech.from_pretrained(
|
12 |
+
model_tag=str_or_none(tagen),
|
13 |
+
vocoder_tag=str_or_none(vocoder_tagen),
|
14 |
device="cpu",
|
15 |
# Only for Tacotron 2 & Transformer
|
16 |
threshold=0.5,
|
|
|
28 |
)
|
29 |
|
30 |
|
31 |
+
tagjp = 'kan-bayashi/jsut_full_band_vits_prosody'
|
32 |
+
vocoder_tagjp = 'none'
|
33 |
+
|
34 |
+
text2speechjp = Text2Speech.from_pretrained(
|
35 |
+
model_tag=str_or_none(tagjp),
|
36 |
+
vocoder_tag=str_or_none(vocoder_tagjp),
|
37 |
+
device="cpu",
|
38 |
+
# Only for Tacotron 2 & Transformer
|
39 |
+
threshold=0.5,
|
40 |
+
# Only for Tacotron 2
|
41 |
+
minlenratio=0.0,
|
42 |
+
maxlenratio=10.0,
|
43 |
+
use_att_constraint=False,
|
44 |
+
backward_window=1,
|
45 |
+
forward_window=3,
|
46 |
+
# Only for FastSpeech & FastSpeech2 & VITS
|
47 |
+
speed_control_alpha=1.0,
|
48 |
+
# Only for VITS
|
49 |
+
noise_scale=0.333,
|
50 |
+
noise_scale_dur=0.333,
|
51 |
+
)
|
52 |
+
|
53 |
+
tagch = 'kan-bayashi/csmsc_full_band_vits'
|
54 |
+
vocoder_tagch = "none"
|
55 |
+
|
56 |
+
text2speechch = Text2Speech.from_pretrained(
|
57 |
+
model_tag=str_or_none(tagch),
|
58 |
+
vocoder_tag=str_or_none(vocoder_tagch),
|
59 |
+
device="cpu",
|
60 |
+
# Only for Tacotron 2 & Transformer
|
61 |
+
threshold=0.5,
|
62 |
+
# Only for Tacotron 2
|
63 |
+
minlenratio=0.0,
|
64 |
+
maxlenratio=10.0,
|
65 |
+
use_att_constraint=False,
|
66 |
+
backward_window=1,
|
67 |
+
forward_window=3,
|
68 |
+
# Only for FastSpeech & FastSpeech2 & VITS
|
69 |
+
speed_control_alpha=1.0,
|
70 |
+
# Only for VITS
|
71 |
+
noise_scale=0.333,
|
72 |
+
noise_scale_dur=0.333,
|
73 |
+
)
|
74 |
+
|
75 |
+
def inference(text,lang):
|
76 |
with torch.no_grad():
|
77 |
+
if lang == "english"
|
78 |
+
wav = text2speechen(text)["wav"]
|
79 |
+
scipy.io.wavfile.write("out.wav",text2speechen.fs , wav.view(-1).cpu().numpy())
|
80 |
+
if lang == "chinese"
|
81 |
+
wav = text2speechch(text)["wav"]
|
82 |
+
scipy.io.wavfile.write("out.wav",text2speechench.fs , wav.view(-1).cpu().numpy())
|
83 |
+
if lang == "japanese"
|
84 |
+
wav = text2speechjp(text)["wav"]
|
85 |
+
scipy.io.wavfile.write("out.wav",text2speechjp.fs , wav.view(-1).cpu().numpy())
|
86 |
return "out.wav"
|
87 |
title = "ESPnet2-TTS"
|
88 |
description = "Gradio demo for ESPnet2-TTS: Extending the Edge of TTS Research. To use it, simply add your audio, or click one of the examples to load them. Read more at the links below."
|
|
|
92 |
|
93 |
gr.Interface(
|
94 |
inference,
|
95 |
+
[gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english", "chinese", "japanese"], type="value", default="english", label="language")]],
|
96 |
gr.outputs.Audio(type="file", label="Output"),
|
97 |
title=title,
|
98 |
description=description,
|