Spaces:
Sleeping
Sleeping
support local vocoder
Browse files
- .gitattributes +1 -1
- app.py +29 -14
- {vocoders → parallel_wavegan}/libritts_hifigan.v1/checkpoint-2500000steps.pkl +0 -0
- {vocoders → parallel_wavegan}/libritts_hifigan.v1/config.yml +0 -0
- {vocoders → parallel_wavegan}/libritts_hifigan.v1/stats.h5 +0 -0
- {vocoders → parallel_wavegan}/vctk_parallel_wavegan.v1.long/checkpoint-1000000steps.pkl +0 -0
- {vocoders → parallel_wavegan}/vctk_parallel_wavegan.v1.long/config.yml +0 -0
- {vocoders → parallel_wavegan}/vctk_parallel_wavegan.v1.long/stats.h5 +0 -0
- vocoders/vctk_parallel_wavegan.v1.long/._checkpoint-1000000steps.pkl +0 -3
- vocoders/vctk_parallel_wavegan.v1.long/._config.yml +0 -0
- vocoders/vctk_parallel_wavegan.v1.long/._stats.h5 +0 -3
- vocoders/vctk_parallel_wavegan.v1.long/._train_nodev_all_vctk_parallel_wavegan.v1.long +0 -0
.gitattributes
CHANGED
@@ -34,4 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
xvector filter=lfs diff=lfs merge=lfs -text
|
36 |
TTS_models filter=lfs diff=lfs merge=lfs -text
|
37 |
-
|
|
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
xvector filter=lfs diff=lfs merge=lfs -text
|
36 |
TTS_models filter=lfs diff=lfs merge=lfs -text
|
37 |
+
parallel_wavegan filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -47,26 +47,42 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained("KevinGeng/whipser_medium_en_P
|
|
47 |
|
48 |
transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/whipser_medium_en_PAL300_step25")
|
49 |
|
|
|
50 |
# @title English multi-speaker pretrained model { run: "auto" }
|
51 |
lang = "English"
|
52 |
-
|
53 |
ft2_tag = "kan-bayashi/libritts_xvector_vits" #@param ["kan-bayashi/vctk_gst_tacotron2", "kan-bayashi/vctk_gst_transformer", "kan-bayashi/vctk_xvector_tacotron2", "kan-bayashi/vctk_xvector_transformer", "kan-bayashi/vctk_xvector_conformer_fastspeech2", "kan-bayashi/vctk_gst+xvector_tacotron2", "kan-bayashi/vctk_gst+xvector_transformer", "kan-bayashi/vctk_gst+xvector_conformer_fastspeech2", "kan-bayashi/vctk_multi_spk_vits", "kan-bayashi/vctk_full_band_multi_spk_vits", "kan-bayashi/libritts_xvector_transformer"
|
54 |
-
transformer_tag = "kan-bayashi/libritts_xvector_transformer"
|
55 |
# ft2_tag = "kan-bayashi/libritts_xvector_conformer_fastspeech2"
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
# Vocoders
|
59 |
vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
|
60 |
hifigan_vocoder_tag = "parallel_wavegan/parallel_wavegan/libritts_hifigan.v1" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
|
61 |
|
|
|
|
|
|
|
|
|
|
|
62 |
from espnet2.bin.tts_inference import Text2Speech
|
63 |
from espnet2.utils.types import str_or_none
|
64 |
|
65 |
# local import
|
66 |
text2speech = Text2Speech.from_pretrained(
|
67 |
-
train_config =
|
68 |
-
model_file=
|
69 |
-
vocoder_tag=str_or_none(vocoder_tag),
|
70 |
device="cuda",
|
71 |
use_att_constraint=False,
|
72 |
backward_window=1,
|
@@ -77,7 +93,7 @@ text2speech = Text2Speech.from_pretrained(
|
|
77 |
# Fastspeech2
|
78 |
ft2_text2speech = Text2Speech.from_pretrained(
|
79 |
model_tag=ft2_tag,
|
80 |
-
vocoder_tag=str_or_none(
|
81 |
device="cuda",
|
82 |
use_att_constraint=False,
|
83 |
backward_window=1,
|
@@ -88,7 +104,7 @@ ft2_text2speech = Text2Speech.from_pretrained(
|
|
88 |
# Fastspeech2 + hifigan
|
89 |
ft2_text2speech_hifi = Text2Speech.from_pretrained(
|
90 |
model_tag=ft2_tag,
|
91 |
-
vocoder_tag=str_or_none(
|
92 |
device="cuda",
|
93 |
use_att_constraint=False,
|
94 |
backward_window=1,
|
@@ -99,16 +115,13 @@ ft2_text2speech_hifi = Text2Speech.from_pretrained(
|
|
99 |
# transformer tag
|
100 |
transformer_text2speech = Text2Speech.from_pretrained(
|
101 |
model_tag=transformer_tag,
|
102 |
-
vocoder_tag=str_or_none(
|
103 |
device="cuda",
|
104 |
use_att_constraint=False,
|
105 |
backward_window=1,
|
106 |
forward_window=3,
|
107 |
speed_control_alpha=1.0,
|
108 |
)
|
109 |
-
pdb.set_trace()
|
110 |
-
# from google.cloud import texttospeech
|
111 |
-
# Google_TTS_client = texttospeech.TextToSpeechClient()
|
112 |
|
113 |
import glob
|
114 |
import os
|
@@ -139,8 +152,10 @@ male_spks = {
|
|
139 |
"Male3": "672_122797"
|
140 |
}
|
141 |
|
142 |
-
female_spks = {"Female1": "5683_32865",
|
143 |
-
|
|
|
|
|
144 |
spks = dict(male_spks, **female_spks)
|
145 |
spk_names = sorted(spks.keys())
|
146 |
|
|
|
47 |
|
48 |
transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/whipser_medium_en_PAL300_step25")
|
49 |
|
50 |
+
# Text2Mel models
|
51 |
# @title English multi-speaker pretrained model { run: "auto" }
|
52 |
lang = "English"
|
53 |
+
vits_tag = "kan-bayashi/libritts_xvector_vits"
|
54 |
ft2_tag = "kan-bayashi/libritts_xvector_vits" #@param ["kan-bayashi/vctk_gst_tacotron2", "kan-bayashi/vctk_gst_transformer", "kan-bayashi/vctk_xvector_tacotron2", "kan-bayashi/vctk_xvector_transformer", "kan-bayashi/vctk_xvector_conformer_fastspeech2", "kan-bayashi/vctk_gst+xvector_tacotron2", "kan-bayashi/vctk_gst+xvector_transformer", "kan-bayashi/vctk_gst+xvector_conformer_fastspeech2", "kan-bayashi/vctk_multi_spk_vits", "kan-bayashi/vctk_full_band_multi_spk_vits", "kan-bayashi/libritts_xvector_transformer"
|
|
|
55 |
# ft2_tag = "kan-bayashi/libritts_xvector_conformer_fastspeech2"
|
56 |
+
transformer_tag = "kan-bayashi/libritts_xvector_transformer"
|
57 |
+
|
58 |
+
# !!! vits needs no vocoder !!!
|
59 |
+
# Local Text2Mel models
|
60 |
+
|
61 |
+
vits_config_local = "TTS_models/libritts_xvector_vits/config.yaml"
|
62 |
+
vits_model_local = "TTS_models/libritts_xvector_vits/train.total_count.ave_10best.pth"
|
63 |
+
|
64 |
+
# TODO
|
65 |
+
ft2_config_local = ""
|
66 |
+
ft2_model_local= ""
|
67 |
+
transformer_config_local = ""
|
68 |
+
transformer_config_local = ""
|
69 |
|
70 |
# Vocoders
|
71 |
vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
|
72 |
hifigan_vocoder_tag = "parallel_wavegan/parallel_wavegan/libritts_hifigan.v1" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
|
73 |
|
74 |
+
# Local Vocoders
|
75 |
+
## Make sure the use parallel_wavegan as prefix (PWG feature)
|
76 |
+
vocoder_tag_local = "parallel_wavegan/vctk_parallel_wavegan.v1.long"
|
77 |
+
hifigan_vocoder_tag_local = "parallel_wavegan/libritts_hifigan.v1"
|
78 |
+
|
79 |
from espnet2.bin.tts_inference import Text2Speech
|
80 |
from espnet2.utils.types import str_or_none
|
81 |
|
82 |
# local import
|
83 |
text2speech = Text2Speech.from_pretrained(
|
84 |
+
train_config = vits_config_local,
|
85 |
+
model_file=vits_model_local,
|
|
|
86 |
device="cuda",
|
87 |
use_att_constraint=False,
|
88 |
backward_window=1,
|
|
|
93 |
# Fastspeech2
|
94 |
ft2_text2speech = Text2Speech.from_pretrained(
|
95 |
model_tag=ft2_tag,
|
96 |
+
vocoder_tag=str_or_none(vocoder_tag_local),
|
97 |
device="cuda",
|
98 |
use_att_constraint=False,
|
99 |
backward_window=1,
|
|
|
104 |
# Fastspeech2 + hifigan
|
105 |
ft2_text2speech_hifi = Text2Speech.from_pretrained(
|
106 |
model_tag=ft2_tag,
|
107 |
+
vocoder_tag=str_or_none(hifigan_vocoder_tag_local),
|
108 |
device="cuda",
|
109 |
use_att_constraint=False,
|
110 |
backward_window=1,
|
|
|
115 |
# transformer tag
|
116 |
transformer_text2speech = Text2Speech.from_pretrained(
|
117 |
model_tag=transformer_tag,
|
118 |
+
vocoder_tag=str_or_none(vocoder_tag_local),
|
119 |
device="cuda",
|
120 |
use_att_constraint=False,
|
121 |
backward_window=1,
|
122 |
forward_window=3,
|
123 |
speed_control_alpha=1.0,
|
124 |
)
|
|
|
|
|
|
|
125 |
|
126 |
import glob
|
127 |
import os
|
|
|
152 |
"Male3": "672_122797"
|
153 |
}
|
154 |
|
155 |
+
female_spks = {"Female1": "5683_32865",
|
156 |
+
"Female2": "121_121726",
|
157 |
+
"Female3": "8463_287645"}
|
158 |
+
|
159 |
spks = dict(male_spks, **female_spks)
|
160 |
spk_names = sorted(spks.keys())
|
161 |
|
{vocoders → parallel_wavegan}/libritts_hifigan.v1/checkpoint-2500000steps.pkl
RENAMED
File without changes
|
{vocoders → parallel_wavegan}/libritts_hifigan.v1/config.yml
RENAMED
File without changes
|
{vocoders → parallel_wavegan}/libritts_hifigan.v1/stats.h5
RENAMED
File without changes
|
{vocoders → parallel_wavegan}/vctk_parallel_wavegan.v1.long/checkpoint-1000000steps.pkl
RENAMED
File without changes
|
{vocoders → parallel_wavegan}/vctk_parallel_wavegan.v1.long/config.yml
RENAMED
File without changes
|
{vocoders → parallel_wavegan}/vctk_parallel_wavegan.v1.long/stats.h5
RENAMED
File without changes
|
vocoders/vctk_parallel_wavegan.v1.long/._checkpoint-1000000steps.pkl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:ccb8cfc739515054284e6ac7c75afdab0c771eba7d132c4e19efff528147a1a1
|
3 |
-
size 223
|
|
|
|
|
|
|
|
vocoders/vctk_parallel_wavegan.v1.long/._config.yml
DELETED
Binary file (223 Bytes)
|
|
vocoders/vctk_parallel_wavegan.v1.long/._stats.h5
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:f688626155bff6853a6045408be9bb248828abc482e2218ee0d93183cede5062
|
3 |
-
size 223
|
|
|
|
|
|
|
|
vocoders/vctk_parallel_wavegan.v1.long/._train_nodev_all_vctk_parallel_wavegan.v1.long
DELETED
Binary file (187 Bytes)
|
|