KevinGeng committed on
Commit
7f97911
β€’
1 Parent(s): 671e149

support local vocoder

Browse files
.gitattributes CHANGED
@@ -34,4 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  xvector filter=lfs diff=lfs merge=lfs -text
36
  TTS_models filter=lfs diff=lfs merge=lfs -text
37
- vocoders filter=lfs diff=lfs merge=lfs -text
 
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  xvector filter=lfs diff=lfs merge=lfs -text
36
  TTS_models filter=lfs diff=lfs merge=lfs -text
37
+ parallel_wavegan filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -47,26 +47,42 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained("KevinGeng/whipser_medium_en_P
47
 
48
  transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/whipser_medium_en_PAL300_step25")
49
 
 
50
  # @title English multi-speaker pretrained model { run: "auto" }
51
  lang = "English"
52
- # tag = "kan-bayashi/libritts_xvector_vits"
53
  ft2_tag = "kan-bayashi/libritts_xvector_vits" #@param ["kan-bayashi/vctk_gst_tacotron2", "kan-bayashi/vctk_gst_transformer", "kan-bayashi/vctk_xvector_tacotron2", "kan-bayashi/vctk_xvector_transformer", "kan-bayashi/vctk_xvector_conformer_fastspeech2", "kan-bayashi/vctk_gst+xvector_tacotron2", "kan-bayashi/vctk_gst+xvector_transformer", "kan-bayashi/vctk_gst+xvector_conformer_fastspeech2", "kan-bayashi/vctk_multi_spk_vits", "kan-bayashi/vctk_full_band_multi_spk_vits", "kan-bayashi/libritts_xvector_transformer"
54
- transformer_tag = "kan-bayashi/libritts_xvector_transformer"
55
  # ft2_tag = "kan-bayashi/libritts_xvector_conformer_fastspeech2"
56
- # vits needs no vocoder
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  # Vocoders
59
  vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
60
  hifigan_vocoder_tag = "parallel_wavegan/parallel_wavegan/libritts_hifigan.v1" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
61
 
 
 
 
 
 
62
  from espnet2.bin.tts_inference import Text2Speech
63
  from espnet2.utils.types import str_or_none
64
 
65
  # local import
66
  text2speech = Text2Speech.from_pretrained(
67
- train_config = "TTS_models/libritts_xvector_vits/config.yaml",
68
- model_file="TTS_models/libritts_xvector_vits/train.total_count.ave_10best.pth",
69
- vocoder_tag=str_or_none(vocoder_tag),
70
  device="cuda",
71
  use_att_constraint=False,
72
  backward_window=1,
@@ -77,7 +93,7 @@ text2speech = Text2Speech.from_pretrained(
77
  # Fastspeech2
78
  ft2_text2speech = Text2Speech.from_pretrained(
79
  model_tag=ft2_tag,
80
- vocoder_tag=str_or_none(vocoder_tag),
81
  device="cuda",
82
  use_att_constraint=False,
83
  backward_window=1,
@@ -88,7 +104,7 @@ ft2_text2speech = Text2Speech.from_pretrained(
88
  # Fastspeech2 + hifigan
89
  ft2_text2speech_hifi = Text2Speech.from_pretrained(
90
  model_tag=ft2_tag,
91
- vocoder_tag=str_or_none(hifigan_vocoder_tag),
92
  device="cuda",
93
  use_att_constraint=False,
94
  backward_window=1,
@@ -99,16 +115,13 @@ ft2_text2speech_hifi = Text2Speech.from_pretrained(
99
  # transformer tag
100
  transformer_text2speech = Text2Speech.from_pretrained(
101
  model_tag=transformer_tag,
102
- vocoder_tag=str_or_none(vocoder_tag),
103
  device="cuda",
104
  use_att_constraint=False,
105
  backward_window=1,
106
  forward_window=3,
107
  speed_control_alpha=1.0,
108
  )
109
- pdb.set_trace()
110
- # from google.cloud import texttospeech
111
- # Google_TTS_client = texttospeech.TextToSpeechClient()
112
 
113
  import glob
114
  import os
@@ -139,8 +152,10 @@ male_spks = {
139
  "Male3": "672_122797"
140
  }
141
 
142
- female_spks = {"Female1": "5683_32865", "Female2": "121_121726", "Female3": "8463_287645"}
143
- # "F3": "121_121726"
 
 
144
  spks = dict(male_spks, **female_spks)
145
  spk_names = sorted(spks.keys())
146
 
 
47
 
48
  transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/whipser_medium_en_PAL300_step25")
49
 
50
+ # Text2Mel models
51
  # @title English multi-speaker pretrained model { run: "auto" }
52
  lang = "English"
53
+ vits_tag = "kan-bayashi/libritts_xvector_vits"
54
  ft2_tag = "kan-bayashi/libritts_xvector_vits" #@param ["kan-bayashi/vctk_gst_tacotron2", "kan-bayashi/vctk_gst_transformer", "kan-bayashi/vctk_xvector_tacotron2", "kan-bayashi/vctk_xvector_transformer", "kan-bayashi/vctk_xvector_conformer_fastspeech2", "kan-bayashi/vctk_gst+xvector_tacotron2", "kan-bayashi/vctk_gst+xvector_transformer", "kan-bayashi/vctk_gst+xvector_conformer_fastspeech2", "kan-bayashi/vctk_multi_spk_vits", "kan-bayashi/vctk_full_band_multi_spk_vits", "kan-bayashi/libritts_xvector_transformer"
 
55
  # ft2_tag = "kan-bayashi/libritts_xvector_conformer_fastspeech2"
56
+ transformer_tag = "kan-bayashi/libritts_xvector_transformer"
57
+
58
+ # !!! vits needs no vocoder !!!
59
+ # Local Text2Mel models
60
+
61
+ vits_config_local = "TTS_models/libritts_xvector_vits/config.yaml"
62
+ vits_model_local = "TTS_models/libritts_xvector_vits/train.total_count.ave_10best.pth"
63
+
64
+ # TODO
65
+ ft2_config_local = ""
66
+ ft2_model_local= ""
67
+ transformer_config_local = ""
68
+ transformer_config_local = ""
69
 
70
  # Vocoders
71
  vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
72
  hifigan_vocoder_tag = "parallel_wavegan/parallel_wavegan/libritts_hifigan.v1" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
73
 
74
+ # Local Vocoders
75
+ ## Make sure to use parallel_wavegan as a prefix (PWG feature)
76
+ vocoder_tag_local = "parallel_wavegan/vctk_parallel_wavegan.v1.long"
77
+ hifigan_vocoder_tag_local = "parallel_wavegan/libritts_hifigan.v1"
78
+
79
  from espnet2.bin.tts_inference import Text2Speech
80
  from espnet2.utils.types import str_or_none
81
 
82
  # local import
83
  text2speech = Text2Speech.from_pretrained(
84
+ train_config = vits_config_local,
85
+ model_file=vits_model_local,
 
86
  device="cuda",
87
  use_att_constraint=False,
88
  backward_window=1,
 
93
  # Fastspeech2
94
  ft2_text2speech = Text2Speech.from_pretrained(
95
  model_tag=ft2_tag,
96
+ vocoder_tag=str_or_none(vocoder_tag_local),
97
  device="cuda",
98
  use_att_constraint=False,
99
  backward_window=1,
 
104
  # Fastspeech2 + hifigan
105
  ft2_text2speech_hifi = Text2Speech.from_pretrained(
106
  model_tag=ft2_tag,
107
+ vocoder_tag=str_or_none(hifigan_vocoder_tag_local),
108
  device="cuda",
109
  use_att_constraint=False,
110
  backward_window=1,
 
115
  # transformer tag
116
  transformer_text2speech = Text2Speech.from_pretrained(
117
  model_tag=transformer_tag,
118
+ vocoder_tag=str_or_none(vocoder_tag_local),
119
  device="cuda",
120
  use_att_constraint=False,
121
  backward_window=1,
122
  forward_window=3,
123
  speed_control_alpha=1.0,
124
  )
 
 
 
125
 
126
  import glob
127
  import os
 
152
  "Male3": "672_122797"
153
  }
154
 
155
+ female_spks = {"Female1": "5683_32865",
156
+ "Female2": "121_121726",
157
+ "Female3": "8463_287645"}
158
+
159
  spks = dict(male_spks, **female_spks)
160
  spk_names = sorted(spks.keys())
161
 
{vocoders β†’ parallel_wavegan}/libritts_hifigan.v1/checkpoint-2500000steps.pkl RENAMED
File without changes
{vocoders β†’ parallel_wavegan}/libritts_hifigan.v1/config.yml RENAMED
File without changes
{vocoders β†’ parallel_wavegan}/libritts_hifigan.v1/stats.h5 RENAMED
File without changes
{vocoders β†’ parallel_wavegan}/vctk_parallel_wavegan.v1.long/checkpoint-1000000steps.pkl RENAMED
File without changes
{vocoders β†’ parallel_wavegan}/vctk_parallel_wavegan.v1.long/config.yml RENAMED
File without changes
{vocoders β†’ parallel_wavegan}/vctk_parallel_wavegan.v1.long/stats.h5 RENAMED
File without changes
vocoders/vctk_parallel_wavegan.v1.long/._checkpoint-1000000steps.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ccb8cfc739515054284e6ac7c75afdab0c771eba7d132c4e19efff528147a1a1
3
- size 223
 
 
 
 
vocoders/vctk_parallel_wavegan.v1.long/._config.yml DELETED
Binary file (223 Bytes)
 
vocoders/vctk_parallel_wavegan.v1.long/._stats.h5 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f688626155bff6853a6045408be9bb248828abc482e2218ee0d93183cede5062
3
- size 223
 
 
 
 
vocoders/vctk_parallel_wavegan.v1.long/._train_nodev_all_vctk_parallel_wavegan.v1.long DELETED
Binary file (187 Bytes)