KevinGeng committed
Commit: d0c3405
Parent: 9baab8e

add fastspeech2 TTS, fastspeech2 TTS + HifiGAN, transformer TTS models

Files changed (1): app.py (+219, -17)
app.py CHANGED
@@ -12,6 +12,7 @@ import torchaudio
 from transformers import pipeline
 from pathlib import Path
 
+import pdb
 # local import
 import sys
 from espnet2.bin.tts_inference import Text2Speech
@@ -48,9 +49,15 @@ transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/whipser_
 
 # @title English multi-speaker pretrained model { run: "auto" }
 lang = "English"
-tag = "kan-bayashi/libritts_xvector_vits"
+# tag = "kan-bayashi/libritts_xvector_vits"
+ft2_tag = "kan-bayashi/libritts_xvector_vits" #@param ["kan-bayashi/vctk_gst_tacotron2", "kan-bayashi/vctk_gst_transformer", "kan-bayashi/vctk_xvector_tacotron2", "kan-bayashi/vctk_xvector_transformer", "kan-bayashi/vctk_xvector_conformer_fastspeech2", "kan-bayashi/vctk_gst+xvector_tacotron2", "kan-bayashi/vctk_gst+xvector_transformer", "kan-bayashi/vctk_gst+xvector_conformer_fastspeech2", "kan-bayashi/vctk_multi_spk_vits", "kan-bayashi/vctk_full_band_multi_spk_vits", "kan-bayashi/libritts_xvector_transformer"
+transformer_tag = "kan-bayashi/libritts_xvector_transformer"
+# ft2_tag = "kan-bayashi/libritts_xvector_conformer_fastspeech2"
 # vits needs no vocoder
+
+# Vocoders
 vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
+hifigan_vocoder_tag = "parallel_wavegan/parallel_wavegan/libritts_hifigan.v1" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
 
 from espnet2.bin.tts_inference import Text2Speech
 from espnet2.utils.types import str_or_none
@@ -67,6 +74,42 @@ text2speech = Text2Speech.from_pretrained(
     speed_control_alpha=1.0,
 )
 
+# Fastspeech2
+ft2_text2speech = Text2Speech.from_pretrained(
+    model_tag=ft2_tag,
+    vocoder_tag=str_or_none(vocoder_tag),
+    device="cuda",
+    use_att_constraint=False,
+    backward_window=1,
+    forward_window=3,
+    speed_control_alpha=1.0,
+)
+
+# Fastspeech2 + hifigan
+ft2_text2speech_hifi = Text2Speech.from_pretrained(
+    model_tag=ft2_tag,
+    vocoder_tag=str_or_none(hifigan_vocoder_tag),
+    device="cuda",
+    use_att_constraint=False,
+    backward_window=1,
+    forward_window=3,
+    speed_control_alpha=1.0,
+)
+
+# transformer tag
+transformer_text2speech = Text2Speech.from_pretrained(
+    model_tag=transformer_tag,
+    vocoder_tag=str_or_none(vocoder_tag),
+    device="cuda",
+    use_att_constraint=False,
+    backward_window=1,
+    forward_window=3,
+    speed_control_alpha=1.0,
+)
+
+# from google.cloud import texttospeech
+# Google_TTS_client = texttospeech.TextToSpeechClient()
+
 import glob
 import os
 import numpy as np
@@ -95,14 +138,12 @@ male_spks = {
     "Male2": "1320_122612",
     "Male3": "672_122797"
 }
-# "M3": "1188_133604",
-# "M4": "61_70970",
+
 female_spks = {"Female1": "5683_32865", "Female2": "121_121726", "Female3": "8463_287645"}
 # "F3": "121_121726"
 spks = dict(male_spks, **female_spks)
 spk_names = sorted(spks.keys())
 
-
 def ASRTTS(audio_file, spk_name, ref_text=""):
     spk = spks[spk_name]
     spembs = xvectors[spk]
@@ -156,6 +197,105 @@ def ASRTTS_clean(audio_file, spk_name):
     return save_id
 
 
+def ft2_ASRTTS_clean(audio_file, spk_name):
+    spk = spks[spk_name]
+    spembs = xvectors[spk]
+
+    reg_text = transcriber(audio_file)["text"]
+
+    speech, sr = torchaudio.load(
+        audio_file, channels_first=True
+    ) # Mono channel
+    wav_tensor_spembs = ft2_text2speech(
+        text=reg_text, speech=speech, spembs=spembs
+    )["wav"]
+    wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
+    sample_rate = 22050
+    save_id = (
+        "./wav/" + Path(audio_file).stem + "_fs2_" + spk_name + "_spkembs.wav"
+    )
+    torchaudio.save(
+        save_id,
+        src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
+        sample_rate=22050,
+    )
+    return save_id
+
+def ft2_ASRTTS_clean_hifi(audio_file, spk_name):
+    spk = spks[spk_name]
+    spembs = xvectors[spk]
+
+    reg_text = transcriber(audio_file)["text"]
+
+    speech, sr = torchaudio.load(
+        audio_file, channels_first=True
+    ) # Mono channel
+    wav_tensor_spembs = ft2_text2speech_hifi(
+        text=reg_text, speech=speech, spembs=spembs
+    )["wav"]
+    wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
+    sample_rate = 22050
+    save_id = (
+        "./wav/" + Path(audio_file).stem + "_fs2_hifi_" + spk_name + "_spkembs.wav"
+    )
+    torchaudio.save(
+        save_id,
+        src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
+        sample_rate=22050,
+    )
+    return save_id
+
+def transformer_ASRTTS_clean(audio_file, spk_name):
+    spk = spks[spk_name]
+    spembs = xvectors[spk]
+
+    reg_text = transcriber(audio_file)["text"]
+
+    speech, sr = torchaudio.load(
+        audio_file, channels_first=True
+    ) # Mono channel
+    wav_tensor_spembs = transformer_text2speech(
+        text=reg_text, speech=speech, spembs=spembs
+    )["wav"]
+    wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
+    sample_rate = 22050
+    save_id = (
+        "./wav/" + Path(audio_file).stem + "_transformer_" + spk_name + "_spkembs.wav"
+    )
+    torchaudio.save(
+        save_id,
+        src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
+        sample_rate=22050,
+    )
+    return save_id
+
+# def google_ASRTTS_clean(audio_file, spk_name):
+#     spk = spks[spk_name]
+#     spembs = xvectors[spk]
+
+#     reg_text = transcriber(audio_file)["text"]
+#     # pdb.set_trace()
+#     synthesis_input = texttospeech.SynthesisInput(text=reg_text)
+#     voice = texttospeech.VoiceSelectionParams(
+#         language_code="en-US", ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
+#     )
+#     audio_config = texttospeech.AudioConfig(
+#         audio_encoding=texttospeech.AudioEncoding.MP3
+#     )
+#     response = Google_TTS_client.synthesize_speech(
+#         input=synthesis_input, voice=voice, audio_config=audio_config
+#     )
+
+#     save_id = (
+#         "./wav/" + Path(audio_file).stem + "_google_" + spk_name + "_spkembs.wav"
+
+#     )
+#     with open(save_id, "wb") as out_file:
+#         out_file.write(response.audio_content)
+
+#     return save_id
+
+
 reference_textbox = gr.Textbox(
     value="",
     placeholder="Input reference here",
@@ -198,10 +338,14 @@ def show_icon(choice):
         spk_icon = gr.Image.update(value="speaker_icons/male1.png", visible=True)
     elif choice == "Male2":
        spk_icon = gr.Image.update(value="speaker_icons/male2.png", visible=True)
+    elif choice == "Male3":
+        spk_icon = gr.Image.update(value="speaker_icons/male3.png", visible=True)
     elif choice == "Female1":
         spk_icon = gr.Image.update(value="speaker_icons/female1.png", visible=True)
     elif choice == "Female2":
         spk_icon = gr.Image.update(value="speaker_icons/female2.png", visible=True)
+    elif choice == "Female3":
+        spk_icon = gr.Image.update(value="speaker_icons/female3.png", visible=True)
     return spk_icon
 
 def get_download_file(audio_file=None):
@@ -246,18 +390,76 @@ with gr.Blocks(
     speaker_option.change(
         fn=show_icon, inputs=speaker_option, outputs=spk_icon
     )
-
-    b2 = gr.Button("Convert")
+    with gr.Column():
+        with gr.Row():
+            b2 = gr.Button("Convert")
+
+            output_audio = gr.Audio(
+                source="upload", file="filepath", label="Converted Audio", interactive=False
+            )
+
+            b2.click(
+                ASRTTS_clean,
+                inputs=[input_audio, speaker_option],
+                outputs=output_audio,
+                api_name="convert"
+            )
+        with gr.Row():
+            # Fastspeech2 + PWG [under construction]
+            b_ft2 = gr.Button("Convert_fastspeech2")
+
+            output_audio_ft2 = gr.Audio(
+                source="upload", file="filepath", label="Converted Audio", interactive=False
+            )
+
+            b_ft2.click(
+                ft2_ASRTTS_clean,
+                inputs=[input_audio, speaker_option],
+                outputs=output_audio_ft2,
+                api_name="convert_ft2"
+            )
+        with gr.Row():
+            # Fastspeech2 + hifigan [under construction]
+            b_ft2_hifi = gr.Button("Convert_fastspeech2+HifiGAN")
+
+            output_audio_ft2_hifi = gr.Audio(
+                source="upload", file="filepath", label="Converted Audio", interactive=False
+            )
+
+            b_ft2_hifi.click(
+                ft2_ASRTTS_clean_hifi,
+                inputs=[input_audio, speaker_option],
+                outputs=output_audio_ft2_hifi,
+                api_name="convert_ft2_hifi"
+            )
+        with gr.Row():
+            # transformer [TODO]
+            b_transformer = gr.Button("Convert_transformer")
+
+            output_audio_transformer = gr.Audio(
+                source="upload", file="filepath", label="Converted Audio", interactive=False
+            )
+
+            b_transformer.click(
+                transformer_ASRTTS_clean,
+                inputs=[input_audio, speaker_option],
+                outputs=output_audio_transformer,
+                api_name="convert_trans"
+            )
+
+        # google tts [TODO]
+        # b_google = gr.Button("Convert_googleTTS")
 
-    output_audio = gr.Audio(
-        source="upload", file="filepath", label="Converted Audio", interactive=False
-    )
+        # output_audio_google = gr.Audio(
+        #     source="upload", file="filepath", label="Converted Audio", interactive=False
+        # )
 
-    b2.click(
-        ASRTTS_clean,
-        inputs=[input_audio, speaker_option],
-        outputs=output_audio,
-        api_name="convert"
-    )
-
-demo.launch(share=False)
+        # b_google.click(
+        #     google_ASRTTS_clean,
+        #     inputs=[input_audio, speaker_option],
+        #     outputs=output_audio_google,
+        #     api_name="convert"
+        # )
+
+
+demo.launch(share=True)
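
Note: the `api_name` arguments added in this commit ("convert", "convert_ft2", "convert_ft2_hifi", "convert_trans") expose each button as a named endpoint of the running app. A minimal smoke-test sketch, not part of this commit: the Space id, sample file, and speaker choice below are placeholders, and it assumes gradio_client is installed and compatible with the deployed gradio version.

# Sketch: call the named endpoints registered above via gradio_client.
# "<hf-user>/<space-name>" and "sample.wav" are hypothetical placeholders.
from gradio_client import Client

client = Client("<hf-user>/<space-name>")  # replace with the actual Space id

# Each endpoint mirrors ASRTTS_clean-style handlers:
# inputs (input_audio, speaker_option) -> path to the converted wav.
# Newer gradio_client versions may require wrapping the path in handle_file().
for endpoint in ["/convert", "/convert_ft2", "/convert_ft2_hifi", "/convert_trans"]:
    out_path = client.predict("sample.wav", "Male1", api_name=endpoint)
    print(endpoint, "->", out_path)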