KevinGeng committed
Commit e224a36
1 Parent(s): 7f97911

tab-lization and fix bugs

Files changed (1)
  app.py  +3 -21
app.py CHANGED
@@ -16,29 +16,12 @@ import pdb
 # local import
 import sys
 from espnet2.bin.tts_inference import Text2Speech
-from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC# pdb.set_trace()
+from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 sys.path.append("src")
 
 import gradio as gr
-
-# ASR part
-
-audio_files = [
-    str(x)
-    for x in sorted(
-        Path(
-            "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
-        ).glob("**/*wav")
-    )
-]
-# audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
-# transcriber = pipeline(
-#     "automatic-speech-recognition",
-#     model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
-# )
-
 from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 
 processor = AutoProcessor.from_pretrained("KevinGeng/whipser_medium_en_PAL300_step25")
@@ -51,8 +34,7 @@ transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/whipser_
 # @title English multi-speaker pretrained model { run: "auto" }
 lang = "English"
 vits_tag = "kan-bayashi/libritts_xvector_vits"
-ft2_tag = "kan-bayashi/libritts_xvector_vits" #@param ["kan-bayashi/vctk_gst_tacotron2", "kan-bayashi/vctk_gst_transformer", "kan-bayashi/vctk_xvector_tacotron2", "kan-bayashi/vctk_xvector_transformer", "kan-bayashi/vctk_xvector_conformer_fastspeech2", "kan-bayashi/vctk_gst+xvector_tacotron2", "kan-bayashi/vctk_gst+xvector_transformer", "kan-bayashi/vctk_gst+xvector_conformer_fastspeech2", "kan-bayashi/vctk_multi_spk_vits", "kan-bayashi/vctk_full_band_multi_spk_vits", "kan-bayashi/libritts_xvector_transformer"
-# ft2_tag = "kan-bayashi/libritts_xvector_conformer_fastspeech2"
+ft2_tag = "kan-bayashi/libritts_xvector_conformer_fastspeech2"
 transformer_tag = "kan-bayashi/libritts_xvector_transformer"
 
 # !!! vits needs no vocoder !!!
@@ -378,7 +360,7 @@ with gr.Blocks(
     analytics_enabled=False,
     css=".gradio-container {background-color: #78BD91}",
 ) as demo:
-    # Open Version
+    # Public Version
     with gr.Tab("Open Version"):
         with gr.Column(elem_id="Column"):
            input_format = gr.Radio(
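
For context on the "tab-lization" in the last hunk: the existing gr.Blocks layout is wrapped in gr.Tab containers, of which only the "Open Version" tab is visible in the diff. Below is a minimal, self-contained sketch of that pattern; the Radio choices, the second tab, the button, and the echo_stub callback are illustrative assumptions and are not taken from app.py.

import gradio as gr

def echo_stub(text):
    # Placeholder callback; in app.py this is roughly where the Whisper ASR
    # pipeline and the ESPnet TTS models are wired in.
    return text

with gr.Blocks(
    analytics_enabled=False,
    css=".gradio-container {background-color: #78BD91}",
) as demo:
    # Public Version
    with gr.Tab("Open Version"):
        with gr.Column(elem_id="Column"):
            # The gr.Radio call is truncated in the diff, so these choices are assumed.
            input_format = gr.Radio(
                choices=["Microphone", "Audio File"], label="Input Format"
            )
            text_input = gr.Textbox(label="Input")
            text_output = gr.Textbox(label="Output")
            gr.Button("Run").click(echo_stub, inputs=text_input, outputs=text_output)
    # A second tab is assumed here purely to illustrate the tab layout.
    with gr.Tab("Experimental Version"):
        gr.Markdown("Placeholder for additional controls.")

if __name__ == "__main__":
    demo.launch()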