Spaces:
Runtime error
Runtime error
Ahsen Khaliq
committed on
Commit
β’
633eaa6
1
Parent(s):
2c6c0c3
Update app.py
Browse files
app.py
CHANGED
@@ -5,13 +5,12 @@ import scipy.io.wavfile
|
|
5 |
from espnet2.bin.tts_inference import Text2Speech
|
6 |
from espnet2.utils.types import str_or_none
|
7 |
|
8 |
-
|
9 |
-
|
10 |
-
vocoder_tag = "none"
|
11 |
|
12 |
-
|
13 |
-
model_tag=str_or_none(
|
14 |
-
vocoder_tag=str_or_none(
|
15 |
device="cpu",
|
16 |
# Only for Tacotron 2 & Transformer
|
17 |
threshold=0.5,
|
@@ -29,11 +28,61 @@ text2speech = Text2Speech.from_pretrained(
|
|
29 |
)
|
30 |
|
31 |
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
with torch.no_grad():
|
34 |
-
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
return "out.wav"
|
38 |
title = "ESPnet2-TTS"
|
39 |
description = "Gradio demo for ESPnet2-TTS: Extending the Edge of TTS Research. To use it, simply add your audio, or click one of the examples to load them. Read more at the links below."
|
@@ -43,7 +92,7 @@ examples=[['This paper describes ESPnet2-TTS, an end-to-end text-to-speech (E2E-
|
|
43 |
|
44 |
gr.Interface(
|
45 |
inference,
|
46 |
-
gr.inputs.Textbox(label="input text",lines=10),
|
47 |
gr.outputs.Audio(type="file", label="Output"),
|
48 |
title=title,
|
49 |
description=description,
|
|
|
5 |
from espnet2.bin.tts_inference import Text2Speech
|
6 |
from espnet2.utils.types import str_or_none
|
7 |
|
8 |
+
tagen = 'kan-bayashi/ljspeech_vits'
|
9 |
+
vocoder_tagen = "none"
|
|
|
10 |
|
11 |
+
text2speechen = Text2Speech.from_pretrained(
|
12 |
+
model_tag=str_or_none(tagen),
|
13 |
+
vocoder_tag=str_or_none(vocoder_tagen),
|
14 |
device="cpu",
|
15 |
# Only for Tacotron 2 & Transformer
|
16 |
threshold=0.5,
|
|
|
28 |
)
|
29 |
|
30 |
|
31 |
+
tagjp = 'kan-bayashi/jsut_full_band_vits_prosody'
|
32 |
+
vocoder_tagjp = 'none'
|
33 |
+
|
34 |
+
text2speechjp = Text2Speech.from_pretrained(
|
35 |
+
model_tag=str_or_none(tagjp),
|
36 |
+
vocoder_tag=str_or_none(vocoder_tagjp),
|
37 |
+
device="cpu",
|
38 |
+
# Only for Tacotron 2 & Transformer
|
39 |
+
threshold=0.5,
|
40 |
+
# Only for Tacotron 2
|
41 |
+
minlenratio=0.0,
|
42 |
+
maxlenratio=10.0,
|
43 |
+
use_att_constraint=False,
|
44 |
+
backward_window=1,
|
45 |
+
forward_window=3,
|
46 |
+
# Only for FastSpeech & FastSpeech2 & VITS
|
47 |
+
speed_control_alpha=1.0,
|
48 |
+
# Only for VITS
|
49 |
+
noise_scale=0.333,
|
50 |
+
noise_scale_dur=0.333,
|
51 |
+
)
|
52 |
+
|
53 |
+
tagch = 'kan-bayashi/csmsc_full_band_vits'
|
54 |
+
vocoder_tagch = "none"
|
55 |
+
|
56 |
+
text2speechch = Text2Speech.from_pretrained(
|
57 |
+
model_tag=str_or_none(tagch),
|
58 |
+
vocoder_tag=str_or_none(vocoder_tagch),
|
59 |
+
device="cpu",
|
60 |
+
# Only for Tacotron 2 & Transformer
|
61 |
+
threshold=0.5,
|
62 |
+
# Only for Tacotron 2
|
63 |
+
minlenratio=0.0,
|
64 |
+
maxlenratio=10.0,
|
65 |
+
use_att_constraint=False,
|
66 |
+
backward_window=1,
|
67 |
+
forward_window=3,
|
68 |
+
# Only for FastSpeech & FastSpeech2 & VITS
|
69 |
+
speed_control_alpha=1.0,
|
70 |
+
# Only for VITS
|
71 |
+
noise_scale=0.333,
|
72 |
+
noise_scale_dur=0.333,
|
73 |
+
)
|
74 |
+
|
75 |
+
def inference(text,lang):
|
76 |
with torch.no_grad():
|
77 |
+
if lang == "english"
|
78 |
+
wav = text2speechen(text)["wav"]
|
79 |
+
scipy.io.wavfile.write("out.wav",text2speechen.fs , wav.view(-1).cpu().numpy())
|
80 |
+
if lang == "chinese"
|
81 |
+
wav = text2speechch(text)["wav"]
|
82 |
+
scipy.io.wavfile.write("out.wav",text2speechench.fs , wav.view(-1).cpu().numpy())
|
83 |
+
if lang == "japanese"
|
84 |
+
wav = text2speechjp(text)["wav"]
|
85 |
+
scipy.io.wavfile.write("out.wav",text2speechjp.fs , wav.view(-1).cpu().numpy())
|
86 |
return "out.wav"
|
87 |
title = "ESPnet2-TTS"
|
88 |
description = "Gradio demo for ESPnet2-TTS: Extending the Edge of TTS Research. To use it, simply add your audio, or click one of the examples to load them. Read more at the links below."
|
|
|
92 |
|
93 |
gr.Interface(
|
94 |
inference,
|
95 |
+
[gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english", "chinese", "japanese"], type="value", default="english", label="language")]],
|
96 |
gr.outputs.Audio(type="file", label="Output"),
|
97 |
title=title,
|
98 |
description=description,
|