Spaces:
Sleeping
Sleeping
add fastspeech2 TTS, fastspeech2 TTS + HifiGAN, transformer TTS models
Browse files
app.py
CHANGED
@@ -12,6 +12,7 @@ import torchaudio
|
|
12 |
from transformers import pipeline
|
13 |
from pathlib import Path
|
14 |
|
|
|
15 |
# local import
|
16 |
import sys
|
17 |
from espnet2.bin.tts_inference import Text2Speech
|
@@ -48,9 +49,15 @@ transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/whipser_
|
|
48 |
|
49 |
# @title English multi-speaker pretrained model { run: "auto" }
|
50 |
lang = "English"
|
51 |
-
tag = "kan-bayashi/libritts_xvector_vits"
|
|
|
|
|
|
|
52 |
# vits needs no vocoder
|
|
|
|
|
53 |
vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
|
|
|
54 |
|
55 |
from espnet2.bin.tts_inference import Text2Speech
|
56 |
from espnet2.utils.types import str_or_none
|
@@ -67,6 +74,42 @@ text2speech = Text2Speech.from_pretrained(
|
|
67 |
speed_control_alpha=1.0,
|
68 |
)
|
69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
import glob
|
71 |
import os
|
72 |
import numpy as np
|
@@ -95,14 +138,12 @@ male_spks = {
|
|
95 |
"Male2": "1320_122612",
|
96 |
"Male3": "672_122797"
|
97 |
}
|
98 |
-
|
99 |
-
# "M4": "61_70970",
|
100 |
female_spks = {"Female1": "5683_32865", "Female2": "121_121726", "Female3": "8463_287645"}
|
101 |
# "F3": "121_121726"
|
102 |
spks = dict(male_spks, **female_spks)
|
103 |
spk_names = sorted(spks.keys())
|
104 |
|
105 |
-
|
106 |
def ASRTTS(audio_file, spk_name, ref_text=""):
|
107 |
spk = spks[spk_name]
|
108 |
spembs = xvectors[spk]
|
@@ -156,6 +197,105 @@ def ASRTTS_clean(audio_file, spk_name):
|
|
156 |
return save_id
|
157 |
|
158 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
reference_textbox = gr.Textbox(
|
160 |
value="",
|
161 |
placeholder="Input reference here",
|
@@ -198,10 +338,14 @@ def show_icon(choice):
|
|
198 |
spk_icon = gr.Image.update(value="speaker_icons/male1.png", visible=True)
|
199 |
elif choice == "Male2":
|
200 |
spk_icon = gr.Image.update(value="speaker_icons/male2.png", visible=True)
|
|
|
|
|
201 |
elif choice == "Female1":
|
202 |
spk_icon = gr.Image.update(value="speaker_icons/female1.png", visible=True)
|
203 |
elif choice == "Female2":
|
204 |
spk_icon = gr.Image.update(value="speaker_icons/female2.png", visible=True)
|
|
|
|
|
205 |
return spk_icon
|
206 |
|
207 |
def get_download_file(audio_file=None):
|
@@ -246,18 +390,76 @@ with gr.Blocks(
|
|
246 |
speaker_option.change(
|
247 |
fn=show_icon, inputs=speaker_option, outputs=spk_icon
|
248 |
)
|
249 |
-
|
250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
251 |
|
252 |
-
|
253 |
-
|
254 |
-
)
|
255 |
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
)
|
262 |
-
|
263 |
-
|
|
|
|
12 |
from transformers import pipeline
|
13 |
from pathlib import Path
|
14 |
|
15 |
+
import pdb
|
16 |
# local import
|
17 |
import sys
|
18 |
from espnet2.bin.tts_inference import Text2Speech
|
|
|
49 |
|
50 |
# @title English multi-speaker pretrained model { run: "auto" }
|
51 |
lang = "English"
|
52 |
+
# tag = "kan-bayashi/libritts_xvector_vits"
|
53 |
+
# Model tags for the additional synthesizers.
# NOTE(review): despite its name, ft2_tag currently selects the VITS x-vector
# model; the FastSpeech2 tag is kept commented out below. Confirm which one is
# intended before relying on the "fastspeech2" outputs.
ft2_tag = "kan-bayashi/libritts_xvector_vits"  # @param ["kan-bayashi/vctk_gst_tacotron2", "kan-bayashi/vctk_gst_transformer", "kan-bayashi/vctk_xvector_tacotron2", "kan-bayashi/vctk_xvector_transformer", "kan-bayashi/vctk_xvector_conformer_fastspeech2", "kan-bayashi/vctk_gst+xvector_tacotron2", "kan-bayashi/vctk_gst+xvector_transformer", "kan-bayashi/vctk_gst+xvector_conformer_fastspeech2", "kan-bayashi/vctk_multi_spk_vits", "kan-bayashi/vctk_full_band_multi_spk_vits", "kan-bayashi/libritts_xvector_transformer"] {type:"string"}
transformer_tag = "kan-bayashi/libritts_xvector_transformer"
# ft2_tag = "kan-bayashi/libritts_xvector_conformer_fastspeech2"
# vits needs no vocoder

# Vocoders
vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long"  # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
# The tag must carry the "parallel_wavegan/" prefix exactly once, matching the
# entries in the option list.
hifigan_vocoder_tag = "parallel_wavegan/libritts_hifigan.v1"  # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
|
61 |
|
62 |
from espnet2.bin.tts_inference import Text2Speech
|
63 |
from espnet2.utils.types import str_or_none
|
|
|
74 |
speed_control_alpha=1.0,
|
75 |
)
|
76 |
|
77 |
+
# Decoding options that are identical for every additional synthesizer below.
_shared_tts_options = dict(
    device="cuda",
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    speed_control_alpha=1.0,
)

# Fastspeech2: model selected by ft2_tag, paired with the default vocoder.
ft2_text2speech = Text2Speech.from_pretrained(
    model_tag=ft2_tag,
    vocoder_tag=str_or_none(vocoder_tag),
    **_shared_tts_options,
)

# Fastspeech2 + hifigan: same model tag, HiFi-GAN vocoder tag.
ft2_text2speech_hifi = Text2Speech.from_pretrained(
    model_tag=ft2_tag,
    vocoder_tag=str_or_none(hifigan_vocoder_tag),
    **_shared_tts_options,
)

# Transformer: model selected by transformer_tag, paired with the default vocoder.
transformer_text2speech = Text2Speech.from_pretrained(
    model_tag=transformer_tag,
    vocoder_tag=str_or_none(vocoder_tag),
    **_shared_tts_options,
)
|
109 |
+
|
110 |
+
# from google.cloud import texttospeech
|
111 |
+
# Google_TTS_client = texttospeech.TextToSpeechClient()
|
112 |
+
|
113 |
import glob
|
114 |
import os
|
115 |
import numpy as np
|
|
|
138 |
"Male2": "1320_122612",
|
139 |
"Male3": "672_122797"
|
140 |
}
|
141 |
+
|
|
|
142 |
female_spks = {"Female1": "5683_32865", "Female2": "121_121726", "Female3": "8463_287645"}
|
143 |
# "F3": "121_121726"
|
144 |
spks = dict(male_spks, **female_spks)
|
145 |
spk_names = sorted(spks.keys())
|
146 |
|
|
|
147 |
def ASRTTS(audio_file, spk_name, ref_text=""):
|
148 |
spk = spks[spk_name]
|
149 |
spembs = xvectors[spk]
|
|
|
197 |
return save_id
|
198 |
|
199 |
|
200 |
+
def ft2_ASRTTS_clean(audio_file, spk_name):
    """Transcribe ``audio_file`` and re-synthesize it with ``ft2_text2speech``.

    The recording is transcribed by the module-level ``transcriber``; the
    resulting text is synthesized by ``ft2_text2speech`` using the embedding
    stored in ``xvectors`` for the chosen speaker, and the waveform is written
    under ``./wav/``.

    Args:
        audio_file: Path of the input recording.
        spk_name: Key into ``spks`` selecting the target speaker.

    Returns:
        Path of the saved wav file.

    Raises:
        KeyError: If ``spk_name`` is not a key of ``spks``.
    """
    spembs = xvectors[spks[spk_name]]

    reg_text = transcriber(audio_file)["text"]

    # The file's own sample rate is not needed; only the waveform is passed on.
    # NOTE(review): the original comment says "Mono channel" - confirm inputs are mono.
    speech, _ = torchaudio.load(audio_file, channels_first=True)
    wav_tensor_spembs = ft2_text2speech(
        text=reg_text, speech=speech, spembs=spembs
    )["wav"]

    # NOTE(review): the output rate is hard-coded; confirm it matches the
    # rate of the loaded model.
    sample_rate = 22050
    save_id = "./wav/" + Path(audio_file).stem + "_fs2_" + spk_name + "_spkembs.wav"
    torchaudio.save(
        save_id,
        # Add a leading dimension and move to CPU before writing.
        src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
        sample_rate=sample_rate,
    )
    return save_id
|
223 |
+
|
224 |
+
def ft2_ASRTTS_clean_hifi(audio_file, spk_name):
    """Transcribe ``audio_file`` and re-synthesize it with ``ft2_text2speech_hifi``.

    The recording is transcribed by the module-level ``transcriber``; the
    resulting text is synthesized by ``ft2_text2speech_hifi`` using the
    embedding stored in ``xvectors`` for the chosen speaker, and the waveform
    is written under ``./wav/``.

    Args:
        audio_file: Path of the input recording.
        spk_name: Key into ``spks`` selecting the target speaker.

    Returns:
        Path of the saved wav file.

    Raises:
        KeyError: If ``spk_name`` is not a key of ``spks``.
    """
    spembs = xvectors[spks[spk_name]]

    reg_text = transcriber(audio_file)["text"]

    # The file's own sample rate is not needed; only the waveform is passed on.
    # NOTE(review): the original comment says "Mono channel" - confirm inputs are mono.
    speech, _ = torchaudio.load(audio_file, channels_first=True)
    wav_tensor_spembs = ft2_text2speech_hifi(
        text=reg_text, speech=speech, spembs=spembs
    )["wav"]

    # NOTE(review): the output rate is hard-coded; confirm it matches the
    # rate of the loaded model.
    sample_rate = 22050
    save_id = (
        "./wav/" + Path(audio_file).stem + "_fs2_hifi_" + spk_name + "_spkembs.wav"
    )
    torchaudio.save(
        save_id,
        # Add a leading dimension and move to CPU before writing.
        src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
        sample_rate=sample_rate,
    )
    return save_id
|
247 |
+
|
248 |
+
def transformer_ASRTTS_clean(audio_file, spk_name):
    """Transcribe ``audio_file`` and re-synthesize it with ``transformer_text2speech``.

    The recording is transcribed by the module-level ``transcriber``; the
    resulting text is synthesized by ``transformer_text2speech`` using the
    embedding stored in ``xvectors`` for the chosen speaker, and the waveform
    is written under ``./wav/``.

    Args:
        audio_file: Path of the input recording.
        spk_name: Key into ``spks`` selecting the target speaker.

    Returns:
        Path of the saved wav file.

    Raises:
        KeyError: If ``spk_name`` is not a key of ``spks``.
    """
    spembs = xvectors[spks[spk_name]]

    reg_text = transcriber(audio_file)["text"]

    # The file's own sample rate is not needed; only the waveform is passed on.
    # NOTE(review): the original comment says "Mono channel" - confirm inputs are mono.
    speech, _ = torchaudio.load(audio_file, channels_first=True)
    wav_tensor_spembs = transformer_text2speech(
        text=reg_text, speech=speech, spembs=spembs
    )["wav"]

    # NOTE(review): the output rate is hard-coded; confirm it matches the
    # rate of the loaded model.
    sample_rate = 22050
    save_id = (
        "./wav/" + Path(audio_file).stem + "_transformer_" + spk_name + "_spkembs.wav"
    )
    torchaudio.save(
        save_id,
        # Add a leading dimension and move to CPU before writing.
        src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
        sample_rate=sample_rate,
    )
    return save_id
|
271 |
+
|
272 |
+
# def google_ASRTTS_clean(audio_file, spk_name):
|
273 |
+
# spk = spks[spk_name]
|
274 |
+
# spembs = xvectors[spk]
|
275 |
+
|
276 |
+
# reg_text = transcriber(audio_file)["text"]
|
277 |
+
# # pdb.set_trace()
|
278 |
+
# synthesis_input = texttospeech.SynthesisInput(text=reg_text)
|
279 |
+
# voice = texttospeech.VoiceSelectionParams(
|
280 |
+
# language_code="en-US", ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
|
281 |
+
# )
|
282 |
+
# audio_config = texttospeech.AudioConfig(
|
283 |
+
# audio_encoding=texttospeech.AudioEncoding.MP3
|
284 |
+
# )
|
285 |
+
# response = Google_TTS_client.synthesize_speech(
|
286 |
+
# input=synthesis_input, voice=voice, audio_config=audio_config
|
287 |
+
# )
|
288 |
+
|
289 |
+
# save_id = (
|
290 |
+
# "./wav/" + Path(audio_file).stem + "_google_" + spk_name + "_spkembs.wav"
|
291 |
+
|
292 |
+
# )
|
293 |
+
# with open(save_id, "wb") as out_file:
|
294 |
+
# out_file.write(response.audio_content)
|
295 |
+
|
296 |
+
# return save_id
|
297 |
+
|
298 |
+
|
299 |
reference_textbox = gr.Textbox(
|
300 |
value="",
|
301 |
placeholder="Input reference here",
|
|
|
338 |
spk_icon = gr.Image.update(value="speaker_icons/male1.png", visible=True)
|
339 |
elif choice == "Male2":
|
340 |
spk_icon = gr.Image.update(value="speaker_icons/male2.png", visible=True)
|
341 |
+
elif choice == "Male3":
|
342 |
+
spk_icon = gr.Image.update(value="speaker_icons/male3.png", visible=True)
|
343 |
elif choice == "Female1":
|
344 |
spk_icon = gr.Image.update(value="speaker_icons/female1.png", visible=True)
|
345 |
elif choice == "Female2":
|
346 |
spk_icon = gr.Image.update(value="speaker_icons/female2.png", visible=True)
|
347 |
+
elif choice == "Female3":
|
348 |
+
spk_icon = gr.Image.update(value="speaker_icons/female3.png", visible=True)
|
349 |
return spk_icon
|
350 |
|
351 |
def get_download_file(audio_file=None):
|
|
|
390 |
speaker_option.change(
|
391 |
fn=show_icon, inputs=speaker_option, outputs=spk_icon
|
392 |
)
|
393 |
+
with gr.Column():
|
394 |
+
with gr.Row():
|
395 |
+
b2 = gr.Button("Convert")
|
396 |
+
|
397 |
+
output_audio = gr.Audio(
|
398 |
+
source="upload", file="filepath", label="Converted Audio", interactive=False
|
399 |
+
)
|
400 |
+
|
401 |
+
b2.click(
|
402 |
+
ASRTTS_clean,
|
403 |
+
inputs=[input_audio, speaker_option],
|
404 |
+
outputs=output_audio,
|
405 |
+
api_name="convert"
|
406 |
+
)
|
407 |
+
with gr.Row():
|
408 |
+
# Fastspeech2 + PWG [under construction]
|
409 |
+
b_ft2 = gr.Button("Convert_fastspeech2")
|
410 |
+
|
411 |
+
output_audio_ft2= gr.Audio(
|
412 |
+
source="upload", file="filepath", label="Converted Audio", interactive=False
|
413 |
+
)
|
414 |
+
|
415 |
+
b_ft2.click(
|
416 |
+
ft2_ASRTTS_clean,
|
417 |
+
inputs=[input_audio, speaker_option],
|
418 |
+
outputs=output_audio_ft2,
|
419 |
+
api_name="convert_ft2"
|
420 |
+
)
|
421 |
+
with gr.Row():
|
422 |
+
# Fastspeech2 + hifigan [under construction]
|
423 |
+
b_ft2_hifi = gr.Button("Convert_fastspeech2+HifiGAN")
|
424 |
+
|
425 |
+
output_audio_ft2_hifi= gr.Audio(
|
426 |
+
source="upload", file="filepath", label="Converted Audio", interactive=False
|
427 |
+
)
|
428 |
+
|
429 |
+
b_ft2_hifi.click(
|
430 |
+
ft2_ASRTTS_clean_hifi,
|
431 |
+
inputs=[input_audio, speaker_option],
|
432 |
+
outputs=output_audio_ft2_hifi,
|
433 |
+
api_name="convert_ft2_hifi"
|
434 |
+
)
|
435 |
+
with gr.Row():
|
436 |
+
# transformer [TODO]
|
437 |
+
b_transformer = gr.Button("Convert_transformer")
|
438 |
+
|
439 |
+
output_audio_transformer= gr.Audio(
|
440 |
+
source="upload", file="filepath", label="Converted Audio", interactive=False
|
441 |
+
)
|
442 |
+
|
443 |
+
b_transformer.click(
|
444 |
+
transformer_ASRTTS_clean,
|
445 |
+
inputs=[input_audio, speaker_option],
|
446 |
+
outputs=output_audio_transformer,
|
447 |
+
api_name="convert_trans"
|
448 |
+
)
|
449 |
+
|
450 |
+
# google tts [TODO]
|
451 |
+
# b_google = gr.Button("Convert_googleTTS")
|
452 |
|
453 |
+
# output_audio_google= gr.Audio(
|
454 |
+
# source="upload", file="filepath", label="Converted Audio", interactive=False
|
455 |
+
# )
|
456 |
|
457 |
+
# b_google.click(
|
458 |
+
# google_ASRTTS_clean,
|
459 |
+
# inputs=[input_audio, speaker_option],
|
460 |
+
# outputs=output_audio_google,
|
461 |
+
# api_name="convert"
|
462 |
+
# )
|
463 |
+
|
464 |
+
|
465 |
+
demo.launch(share=True)
|