Spaces:
Runtime error
Runtime error
RayeRen
commited on
Commit
•
d1b91e7
0
Parent(s):
init
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- .gitignore +148 -0
- README.md +9 -0
- checkpoints/fs2_exp/config.yaml +219 -0
- checkpoints/fs2_exp/model_ckpt_steps_98000.ckpt +3 -0
- checkpoints/hifi_lj/config.yaml +207 -0
- checkpoints/hifi_lj/model_ckpt_steps_2076000.ckpt +3 -0
- checkpoints/ps_normal_exp/config.yaml +258 -0
- checkpoints/ps_normal_exp/model_ckpt_steps_278000.ckpt +3 -0
- checkpoints/ps_small_exp/config.yaml +258 -0
- checkpoints/ps_small_exp/model_ckpt_steps_410000.ckpt +3 -0
- data/binary/ljspeech/phone_set.json +1 -0
- data/binary/ljspeech/spk_map.json +1 -0
- data/binary/ljspeech/word_set.json +0 -0
- data/binary/ljspeech_cwt/phone_set.json +1 -0
- data/binary/ljspeech_cwt/spk_map.json +1 -0
- data/binary/ljspeech_cwt/word_set.json +0 -0
- data_gen/tts/base_binarizer.py +225 -0
- data_gen/tts/base_preprocess.py +251 -0
- data_gen/tts/binarizer_zh.py +25 -0
- data_gen/tts/runs/adapt_mfa_align.py +18 -0
- data_gen/tts/runs/align_and_binarize.py +12 -0
- data_gen/tts/runs/binarize.py +17 -0
- data_gen/tts/runs/preprocess.py +17 -0
- data_gen/tts/runs/train_mfa_align.py +46 -0
- data_gen/tts/txt_processors/__init__.py +1 -0
- data_gen/tts/txt_processors/base_text_processor.py +48 -0
- data_gen/tts/txt_processors/en.py +78 -0
- data_gen/tts/wav_processors/__init__.py +2 -0
- data_gen/tts/wav_processors/base_processor.py +25 -0
- data_gen/tts/wav_processors/common_processors.py +86 -0
- docs/fastspeech2.md +53 -0
- docs/framework.md +106 -0
- docs/portaspeech.md +61 -0
- docs/prepare_data.md +25 -0
- docs/prepare_vocoder.md +49 -0
- egs/datasets/audio/lj/base_mel2wav.yaml +4 -0
- egs/datasets/audio/lj/base_text2mel.yaml +16 -0
- egs/datasets/audio/lj/fs.yaml +3 -0
- egs/datasets/audio/lj/fs2_orig.yaml +4 -0
- egs/datasets/audio/lj/hifigan.yaml +3 -0
- egs/datasets/audio/lj/preprocess.py +9 -0
- egs/datasets/audio/lj/ps_flow.yaml +3 -0
- egs/datasets/audio/lj/ps_flow_nips2021.yaml +11 -0
- egs/datasets/audio/lj/ps_flow_small.yaml +3 -0
- egs/datasets/audio/lj/ps_flow_small_nips2021.yaml +11 -0
- egs/egs_bases/config_base.yaml +41 -0
- egs/egs_bases/tts/base.yaml +56 -0
- egs/egs_bases/tts/dataset_params.yaml +52 -0
- egs/egs_bases/tts/fs.yaml +75 -0
.gitattributes
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### Project ignore
|
2 |
+
|
3 |
+
infer_out
|
4 |
+
flagged
|
5 |
+
rsync
|
6 |
+
.idea
|
7 |
+
.DS_Store
|
8 |
+
bak
|
9 |
+
tmp
|
10 |
+
*.tar.gz
|
11 |
+
mos
|
12 |
+
nbs
|
13 |
+
/configs_usr/*
|
14 |
+
!/configs_usr/.gitkeep
|
15 |
+
/egs_usr/*
|
16 |
+
!/egs_usr/.gitkeep
|
17 |
+
/rnnoise
|
18 |
+
#/usr/*
|
19 |
+
#!/usr/.gitkeep
|
20 |
+
scripts_usr
|
21 |
+
|
22 |
+
# Created by .ignore support plugin (hsz.mobi)
|
23 |
+
### Python template
|
24 |
+
# Byte-compiled / optimized / DLL files
|
25 |
+
__pycache__/
|
26 |
+
*.py[cod]
|
27 |
+
*$py.class
|
28 |
+
|
29 |
+
# C extensions
|
30 |
+
*.so
|
31 |
+
|
32 |
+
# Distribution / packaging
|
33 |
+
.Python
|
34 |
+
build/
|
35 |
+
develop-eggs/
|
36 |
+
dist/
|
37 |
+
downloads/
|
38 |
+
eggs/
|
39 |
+
.eggs/
|
40 |
+
lib/
|
41 |
+
lib64/
|
42 |
+
parts/
|
43 |
+
sdist/
|
44 |
+
var/
|
45 |
+
wheels/
|
46 |
+
pip-wheel-metadata/
|
47 |
+
share/python-wheels/
|
48 |
+
*.egg-info/
|
49 |
+
.installed.cfg
|
50 |
+
*.egg
|
51 |
+
MANIFEST
|
52 |
+
|
53 |
+
# PyInstaller
|
54 |
+
# Usually these files are written by a python script from a template
|
55 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
56 |
+
*.manifest
|
57 |
+
*.spec
|
58 |
+
|
59 |
+
# Installer logs
|
60 |
+
pip-log.txt
|
61 |
+
pip-delete-this-directory.txt
|
62 |
+
|
63 |
+
# Unit test / coverage reports
|
64 |
+
htmlcov/
|
65 |
+
.tox/
|
66 |
+
.nox/
|
67 |
+
.coverage
|
68 |
+
.coverage.*
|
69 |
+
.cache
|
70 |
+
nosetests.xml
|
71 |
+
coverage.xml
|
72 |
+
*.cover
|
73 |
+
.hypothesis/
|
74 |
+
.pytest_cache/
|
75 |
+
|
76 |
+
# Translations
|
77 |
+
*.mo
|
78 |
+
*.pot
|
79 |
+
|
80 |
+
# Django stuff:
|
81 |
+
*.log
|
82 |
+
local_settings.py
|
83 |
+
db.sqlite3
|
84 |
+
db.sqlite3-journal
|
85 |
+
|
86 |
+
# Flask stuff:
|
87 |
+
instance/
|
88 |
+
.webassets-cache
|
89 |
+
|
90 |
+
# Scrapy stuff:
|
91 |
+
.scrapy
|
92 |
+
|
93 |
+
# Sphinx documentation
|
94 |
+
docs/_build/
|
95 |
+
|
96 |
+
# PyBuilder
|
97 |
+
target/
|
98 |
+
|
99 |
+
# Jupyter Notebook
|
100 |
+
.ipynb_checkpoints
|
101 |
+
|
102 |
+
# IPython
|
103 |
+
profile_default/
|
104 |
+
ipython_config.py
|
105 |
+
|
106 |
+
# pyenv
|
107 |
+
.python-version
|
108 |
+
|
109 |
+
# pipenv
|
110 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
111 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
112 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
113 |
+
# install all needed dependencies.
|
114 |
+
#Pipfile.lock
|
115 |
+
|
116 |
+
# celery beat schedule file
|
117 |
+
celerybeat-schedule
|
118 |
+
|
119 |
+
# SageMath parsed files
|
120 |
+
*.sage.py
|
121 |
+
|
122 |
+
# Environments
|
123 |
+
.env
|
124 |
+
.venv
|
125 |
+
env/
|
126 |
+
venv/
|
127 |
+
ENV/
|
128 |
+
env.bak/
|
129 |
+
venv.bak/
|
130 |
+
|
131 |
+
# Spyder project settings
|
132 |
+
.spyderproject
|
133 |
+
.spyproject
|
134 |
+
|
135 |
+
# Rope project settings
|
136 |
+
.ropeproject
|
137 |
+
|
138 |
+
# mkdocs documentation
|
139 |
+
/site
|
140 |
+
|
141 |
+
# mypy
|
142 |
+
.mypy_cache/
|
143 |
+
.dmypy.json
|
144 |
+
dmypy.json
|
145 |
+
|
146 |
+
# Pyre type checker
|
147 |
+
.pyre/
|
148 |
+
将删除 datasets/remi/test/
|
README.md
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: FastSpeech2
|
3 |
+
emoji: 🤗
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: orange
|
6 |
+
sdk: gradio
|
7 |
+
app_file: "inference/tts/gradio/infer.py"
|
8 |
+
pinned: false
|
9 |
+
---
|
checkpoints/fs2_exp/config.yaml
ADDED
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accumulate_grad_batches: 1
|
2 |
+
amp: false
|
3 |
+
audio_num_mel_bins: 80
|
4 |
+
audio_sample_rate: 22050
|
5 |
+
base_config:
|
6 |
+
- egs/egs_bases/tts/fs2_orig.yaml
|
7 |
+
- ./base_text2mel.yaml
|
8 |
+
binarization_args:
|
9 |
+
min_sil_duration: 0.1
|
10 |
+
shuffle: false
|
11 |
+
test_range:
|
12 |
+
- 0
|
13 |
+
- 523
|
14 |
+
train_range:
|
15 |
+
- 871
|
16 |
+
- -1
|
17 |
+
trim_eos_bos: false
|
18 |
+
valid_range:
|
19 |
+
- 523
|
20 |
+
- 871
|
21 |
+
with_align: true
|
22 |
+
with_f0: true
|
23 |
+
with_f0cwt: true
|
24 |
+
with_linear: false
|
25 |
+
with_spk_embed: false
|
26 |
+
with_wav: false
|
27 |
+
binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
|
28 |
+
binary_data_dir: data/binary/ljspeech_cwt
|
29 |
+
check_val_every_n_epoch: 10
|
30 |
+
clip_grad_norm: 1
|
31 |
+
clip_grad_value: 0
|
32 |
+
conv_use_pos: false
|
33 |
+
cwt_std_scale: 1.0
|
34 |
+
debug: false
|
35 |
+
dec_dilations:
|
36 |
+
- 1
|
37 |
+
- 1
|
38 |
+
- 1
|
39 |
+
- 1
|
40 |
+
dec_ffn_kernel_size: 9
|
41 |
+
dec_inp_add_noise: false
|
42 |
+
dec_kernel_size: 5
|
43 |
+
dec_layers: 4
|
44 |
+
dec_post_net_kernel: 3
|
45 |
+
decoder_rnn_dim: 0
|
46 |
+
decoder_type: fft
|
47 |
+
dropout: 0.0
|
48 |
+
ds_workers: 2
|
49 |
+
dur_predictor_kernel: 3
|
50 |
+
dur_predictor_layers: 2
|
51 |
+
enc_dec_norm: ln
|
52 |
+
enc_dilations:
|
53 |
+
- 1
|
54 |
+
- 1
|
55 |
+
- 1
|
56 |
+
- 1
|
57 |
+
enc_ffn_kernel_size: 9
|
58 |
+
enc_kernel_size: 5
|
59 |
+
enc_layers: 4
|
60 |
+
enc_post_net_kernel: 3
|
61 |
+
enc_pre_ln: true
|
62 |
+
enc_prenet: true
|
63 |
+
encoder_K: 8
|
64 |
+
encoder_type: fft
|
65 |
+
endless_ds: true
|
66 |
+
eval_max_batches: -1
|
67 |
+
f0_max: 800
|
68 |
+
f0_min: 80
|
69 |
+
ffn_act: gelu
|
70 |
+
ffn_hidden_size: 1024
|
71 |
+
fft_size: 1024
|
72 |
+
fmax: 7600
|
73 |
+
fmin: 80
|
74 |
+
frames_multiple: 1
|
75 |
+
gen_dir_name: ''
|
76 |
+
griffin_lim_iters: 30
|
77 |
+
hidden_size: 256
|
78 |
+
hop_size: 256
|
79 |
+
infer: false
|
80 |
+
lambda_commit: 0.25
|
81 |
+
lambda_energy: 0.1
|
82 |
+
lambda_f0: 1.0
|
83 |
+
lambda_ph_dur: 0.1
|
84 |
+
lambda_sent_dur: 1.0
|
85 |
+
lambda_uv: 1.0
|
86 |
+
lambda_word_dur: 1.0
|
87 |
+
layers_in_block: 2
|
88 |
+
load_ckpt: ''
|
89 |
+
loud_norm: false
|
90 |
+
lr: 0.0005
|
91 |
+
max_epochs: 1000
|
92 |
+
max_frames: 1548
|
93 |
+
max_input_tokens: 1550
|
94 |
+
max_sentences: 128
|
95 |
+
max_tokens: 40000
|
96 |
+
max_updates: 160000
|
97 |
+
max_valid_sentences: 1
|
98 |
+
max_valid_tokens: 60000
|
99 |
+
mel_losses: l1:0.5|ssim:0.5
|
100 |
+
mel_vmax: 1.5
|
101 |
+
mel_vmin: -6
|
102 |
+
min_frames: 0
|
103 |
+
num_ckpt_keep: 3
|
104 |
+
num_heads: 2
|
105 |
+
num_sanity_val_steps: 5
|
106 |
+
num_spk: 1
|
107 |
+
num_valid_plots: 10
|
108 |
+
optimizer_adam_beta1: 0.9
|
109 |
+
optimizer_adam_beta2: 0.98
|
110 |
+
out_wav_norm: false
|
111 |
+
pitch_extractor: parselmouth
|
112 |
+
pitch_key: pitch
|
113 |
+
pitch_type: cwt
|
114 |
+
predictor_dropout: 0.5
|
115 |
+
predictor_grad: 0.1
|
116 |
+
predictor_hidden: -1
|
117 |
+
predictor_kernel: 5
|
118 |
+
predictor_layers: 2
|
119 |
+
preprocess_args:
|
120 |
+
add_eos_bos: true
|
121 |
+
mfa_group_shuffle: false
|
122 |
+
mfa_offset: 0.02
|
123 |
+
nsample_per_mfa_group: 1000
|
124 |
+
reset_phone_dict: true
|
125 |
+
reset_word_dict: true
|
126 |
+
save_sil_mask: true
|
127 |
+
txt_processor: en
|
128 |
+
use_mfa: true
|
129 |
+
vad_max_silence_length: 12
|
130 |
+
wav_processors: []
|
131 |
+
with_phsep: true
|
132 |
+
preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
|
133 |
+
print_nan_grads: false
|
134 |
+
processed_data_dir: data/processed/ljspeech
|
135 |
+
profile_infer: false
|
136 |
+
raw_data_dir: data/raw/LJSpeech-1.1
|
137 |
+
ref_norm_layer: bn
|
138 |
+
rename_tmux: true
|
139 |
+
resume_from_checkpoint: 0
|
140 |
+
save_best: false
|
141 |
+
save_codes:
|
142 |
+
- tasks
|
143 |
+
- modules
|
144 |
+
- egs
|
145 |
+
save_f0: false
|
146 |
+
save_gt: true
|
147 |
+
scheduler: warmup
|
148 |
+
seed: 1234
|
149 |
+
sort_by_len: true
|
150 |
+
task_cls: tasks.tts.fs2_orig.FastSpeech2OrigTask
|
151 |
+
tb_log_interval: 100
|
152 |
+
test_ids:
|
153 |
+
- 0
|
154 |
+
- 1
|
155 |
+
- 2
|
156 |
+
- 3
|
157 |
+
- 4
|
158 |
+
- 5
|
159 |
+
- 6
|
160 |
+
- 7
|
161 |
+
- 8
|
162 |
+
- 9
|
163 |
+
- 10
|
164 |
+
- 11
|
165 |
+
- 12
|
166 |
+
- 13
|
167 |
+
- 14
|
168 |
+
- 15
|
169 |
+
- 16
|
170 |
+
- 17
|
171 |
+
- 18
|
172 |
+
- 19
|
173 |
+
- 68
|
174 |
+
- 70
|
175 |
+
- 74
|
176 |
+
- 87
|
177 |
+
- 110
|
178 |
+
- 172
|
179 |
+
- 190
|
180 |
+
- 215
|
181 |
+
- 231
|
182 |
+
- 294
|
183 |
+
- 316
|
184 |
+
- 324
|
185 |
+
- 402
|
186 |
+
- 422
|
187 |
+
- 485
|
188 |
+
- 500
|
189 |
+
- 505
|
190 |
+
- 508
|
191 |
+
- 509
|
192 |
+
- 519
|
193 |
+
test_input_yaml: ''
|
194 |
+
test_num: 100
|
195 |
+
test_set_name: test
|
196 |
+
train_set_name: train
|
197 |
+
train_sets: ''
|
198 |
+
use_energy_embed: true
|
199 |
+
use_gt_dur: false
|
200 |
+
use_gt_energy: false
|
201 |
+
use_gt_f0: false
|
202 |
+
use_pitch_embed: true
|
203 |
+
use_pos_embed: true
|
204 |
+
use_spk_embed: false
|
205 |
+
use_spk_id: false
|
206 |
+
use_uv: true
|
207 |
+
use_word_input: false
|
208 |
+
val_check_interval: 2000
|
209 |
+
valid_infer_interval: 10000
|
210 |
+
valid_monitor_key: val_loss
|
211 |
+
valid_monitor_mode: min
|
212 |
+
valid_set_name: valid
|
213 |
+
vocoder: HifiGAN
|
214 |
+
vocoder_ckpt: checkpoints/hifi_lj
|
215 |
+
warmup_updates: 4000
|
216 |
+
weight_decay: 0
|
217 |
+
win_size: 1024
|
218 |
+
word_dict_size: 10000
|
219 |
+
work_dir: checkpoints/fs2_exp
|
checkpoints/fs2_exp/model_ckpt_steps_98000.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9d4f450bb3115e04b4ea93eed8c9318f08d01582bed1dd86886b32d50601dc58
|
3 |
+
size 108423039
|
checkpoints/hifi_lj/config.yaml
ADDED
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accumulate_grad_batches: 1
|
2 |
+
adam_b1: 0.8
|
3 |
+
adam_b2: 0.99
|
4 |
+
amp: false
|
5 |
+
audio_num_mel_bins: 80
|
6 |
+
audio_sample_rate: 22050
|
7 |
+
base_config:
|
8 |
+
- configs/tts/hifigan.yaml
|
9 |
+
- configs/tts/lj/base_mel2wav.yaml
|
10 |
+
binarization_args:
|
11 |
+
shuffle: false
|
12 |
+
trim_eos_bos: false
|
13 |
+
trim_sil: false
|
14 |
+
with_align: false
|
15 |
+
with_f0: true
|
16 |
+
with_f0cwt: false
|
17 |
+
with_linear: false
|
18 |
+
with_spk_embed: false
|
19 |
+
with_txt: true
|
20 |
+
with_wav: true
|
21 |
+
binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
|
22 |
+
binary_data_dir: data/binary/ljspeech_wav
|
23 |
+
check_val_every_n_epoch: 10
|
24 |
+
clip_grad_norm: 1
|
25 |
+
clip_grad_value: 0
|
26 |
+
debug: false
|
27 |
+
dec_ffn_kernel_size: 9
|
28 |
+
dec_layers: 4
|
29 |
+
dict_dir: ''
|
30 |
+
disc_start_steps: 40000
|
31 |
+
discriminator_grad_norm: 1
|
32 |
+
discriminator_optimizer_params:
|
33 |
+
eps: 1.0e-06
|
34 |
+
lr: 0.0002
|
35 |
+
weight_decay: 0.0
|
36 |
+
discriminator_params:
|
37 |
+
bias: true
|
38 |
+
conv_channels: 64
|
39 |
+
in_channels: 1
|
40 |
+
kernel_size: 3
|
41 |
+
layers: 10
|
42 |
+
nonlinear_activation: LeakyReLU
|
43 |
+
nonlinear_activation_params:
|
44 |
+
negative_slope: 0.2
|
45 |
+
out_channels: 1
|
46 |
+
use_weight_norm: true
|
47 |
+
discriminator_scheduler_params:
|
48 |
+
gamma: 0.999
|
49 |
+
step_size: 600
|
50 |
+
dropout: 0.1
|
51 |
+
ds_workers: 1
|
52 |
+
enc_ffn_kernel_size: 9
|
53 |
+
enc_layers: 4
|
54 |
+
endless_ds: true
|
55 |
+
ffn_act: gelu
|
56 |
+
ffn_padding: SAME
|
57 |
+
fft_size: 1024
|
58 |
+
fm_loss: false
|
59 |
+
fmax: 7600
|
60 |
+
fmin: 80
|
61 |
+
frames_multiple: 1
|
62 |
+
gen_dir_name: ''
|
63 |
+
generator_grad_norm: 10
|
64 |
+
generator_optimizer_params:
|
65 |
+
eps: 1.0e-06
|
66 |
+
lr: 0.0002
|
67 |
+
weight_decay: 0.0
|
68 |
+
generator_params:
|
69 |
+
aux_channels: 80
|
70 |
+
aux_context_window: 0
|
71 |
+
dropout: 0.0
|
72 |
+
gate_channels: 128
|
73 |
+
in_channels: 1
|
74 |
+
kernel_size: 3
|
75 |
+
layers: 30
|
76 |
+
out_channels: 1
|
77 |
+
residual_channels: 64
|
78 |
+
skip_channels: 64
|
79 |
+
stacks: 3
|
80 |
+
upsample_net: ConvInUpsampleNetwork
|
81 |
+
upsample_params:
|
82 |
+
upsample_scales:
|
83 |
+
- 4
|
84 |
+
- 4
|
85 |
+
- 4
|
86 |
+
- 4
|
87 |
+
use_nsf: false
|
88 |
+
use_pitch_embed: false
|
89 |
+
use_weight_norm: true
|
90 |
+
generator_scheduler_params:
|
91 |
+
gamma: 0.999
|
92 |
+
step_size: 600
|
93 |
+
griffin_lim_iters: 60
|
94 |
+
hidden_size: 256
|
95 |
+
hop_size: 256
|
96 |
+
infer: false
|
97 |
+
lambda_adv: 4.0
|
98 |
+
lambda_mel: 45.0
|
99 |
+
load_ckpt: ''
|
100 |
+
loud_norm: false
|
101 |
+
lr: 2.0
|
102 |
+
max_epochs: 1000
|
103 |
+
max_eval_sentences: 1
|
104 |
+
max_eval_tokens: 60000
|
105 |
+
max_frames: 1548
|
106 |
+
max_input_tokens: 1550
|
107 |
+
max_samples: 8192
|
108 |
+
max_sentences: 24
|
109 |
+
max_tokens: 30000
|
110 |
+
max_updates: 3000000
|
111 |
+
mel_vmax: 1.5
|
112 |
+
mel_vmin: -6
|
113 |
+
min_level_db: -100
|
114 |
+
num_ckpt_keep: 3
|
115 |
+
num_heads: 2
|
116 |
+
num_mels: 80
|
117 |
+
num_sanity_val_steps: 5
|
118 |
+
num_spk: 1
|
119 |
+
optimizer_adam_beta1: 0.9
|
120 |
+
optimizer_adam_beta2: 0.98
|
121 |
+
out_wav_norm: false
|
122 |
+
pitch_extractor: parselmouth
|
123 |
+
pre_align_args:
|
124 |
+
allow_no_txt: false
|
125 |
+
denoise: false
|
126 |
+
forced_align: mfa
|
127 |
+
sox_resample: false
|
128 |
+
trim_sil: false
|
129 |
+
txt_processor: en
|
130 |
+
use_tone: true
|
131 |
+
pre_align_cls: ''
|
132 |
+
print_nan_grads: false
|
133 |
+
processed_data_dir: data/processed/ljspeech
|
134 |
+
profile_infer: false
|
135 |
+
raw_data_dir: data/raw/LJSpeech-1.1
|
136 |
+
ref_level_db: 20
|
137 |
+
rerun_gen: true
|
138 |
+
resblock: '1'
|
139 |
+
resblock_dilation_sizes:
|
140 |
+
- - 1
|
141 |
+
- 3
|
142 |
+
- 5
|
143 |
+
- - 1
|
144 |
+
- 3
|
145 |
+
- 5
|
146 |
+
- - 1
|
147 |
+
- 3
|
148 |
+
- 5
|
149 |
+
resblock_kernel_sizes:
|
150 |
+
- 3
|
151 |
+
- 7
|
152 |
+
- 11
|
153 |
+
reset_phone_dict: true
|
154 |
+
resume_from_checkpoint: 0
|
155 |
+
sampling_rate: 22050
|
156 |
+
save_best: true
|
157 |
+
save_codes: []
|
158 |
+
save_f0: false
|
159 |
+
save_gt: true
|
160 |
+
seed: 1234
|
161 |
+
sort_by_len: true
|
162 |
+
stft_loss_params:
|
163 |
+
fft_sizes:
|
164 |
+
- 1024
|
165 |
+
- 2048
|
166 |
+
- 512
|
167 |
+
hop_sizes:
|
168 |
+
- 120
|
169 |
+
- 240
|
170 |
+
- 50
|
171 |
+
win_lengths:
|
172 |
+
- 600
|
173 |
+
- 1200
|
174 |
+
- 240
|
175 |
+
window: hann_window
|
176 |
+
stop_token_weight: 5.0
|
177 |
+
task_cls: tasks.vocoder.hifigan.HifiGanTask
|
178 |
+
tb_log_interval: 100
|
179 |
+
test_input_dir: ''
|
180 |
+
test_num: 100
|
181 |
+
test_set_name: test
|
182 |
+
train_set_name: train
|
183 |
+
upsample_initial_channel: 512
|
184 |
+
upsample_kernel_sizes:
|
185 |
+
- 16
|
186 |
+
- 16
|
187 |
+
- 4
|
188 |
+
- 4
|
189 |
+
upsample_rates:
|
190 |
+
- 8
|
191 |
+
- 8
|
192 |
+
- 2
|
193 |
+
- 2
|
194 |
+
use_mel_loss: false
|
195 |
+
use_pitch_embed: false
|
196 |
+
val_check_interval: 2000
|
197 |
+
valid_monitor_key: val_loss
|
198 |
+
valid_monitor_mode: min
|
199 |
+
valid_set_name: valid
|
200 |
+
vocoder: pwg
|
201 |
+
vocoder_ckpt: ''
|
202 |
+
warmup_updates: 8000
|
203 |
+
weight_decay: 0
|
204 |
+
win_length: null
|
205 |
+
win_size: 1024
|
206 |
+
window: hann
|
207 |
+
work_dir: checkpoints/0414_hifi_lj_1
|
checkpoints/hifi_lj/model_ckpt_steps_2076000.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8bbc40f0471a92394f6bf057820cf66a1f50d29db22c997341448bd496a0792d
|
3 |
+
size 55786088
|
checkpoints/ps_normal_exp/config.yaml
ADDED
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accumulate_grad_batches: 1
|
2 |
+
add_word_pos: true
|
3 |
+
amp: false
|
4 |
+
audio_num_mel_bins: 80
|
5 |
+
audio_sample_rate: 22050
|
6 |
+
base_config:
|
7 |
+
- ./ps_flow.yaml
|
8 |
+
binarization_args:
|
9 |
+
min_sil_duration: 0.1
|
10 |
+
shuffle: false
|
11 |
+
test_range:
|
12 |
+
- 0
|
13 |
+
- 523
|
14 |
+
train_range:
|
15 |
+
- 871
|
16 |
+
- -1
|
17 |
+
trim_eos_bos: false
|
18 |
+
valid_range:
|
19 |
+
- 523
|
20 |
+
- 871
|
21 |
+
with_align: true
|
22 |
+
with_f0: true
|
23 |
+
with_f0cwt: false
|
24 |
+
with_linear: false
|
25 |
+
with_spk_embed: false
|
26 |
+
with_wav: false
|
27 |
+
binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
|
28 |
+
binary_data_dir: data/binary/ljspeech
|
29 |
+
check_val_every_n_epoch: 10
|
30 |
+
clip_grad_norm: 1
|
31 |
+
clip_grad_value: 0
|
32 |
+
conv_use_pos: false
|
33 |
+
debug: false
|
34 |
+
dec_dilations:
|
35 |
+
- 1
|
36 |
+
- 1
|
37 |
+
- 1
|
38 |
+
- 1
|
39 |
+
dec_ffn_kernel_size: 9
|
40 |
+
dec_inp_add_noise: false
|
41 |
+
dec_kernel_size: 5
|
42 |
+
dec_layers: 4
|
43 |
+
dec_post_net_kernel: 3
|
44 |
+
decoder_rnn_dim: 0
|
45 |
+
decoder_type: conv
|
46 |
+
detach_postflow_input: true
|
47 |
+
dropout: 0.0
|
48 |
+
ds_workers: 2
|
49 |
+
dur_level: word
|
50 |
+
dur_predictor_kernel: 5
|
51 |
+
dur_predictor_layers: 3
|
52 |
+
enc_dec_norm: ln
|
53 |
+
enc_dilations:
|
54 |
+
- 1
|
55 |
+
- 1
|
56 |
+
- 1
|
57 |
+
- 1
|
58 |
+
enc_ffn_kernel_size: 5
|
59 |
+
enc_kernel_size: 5
|
60 |
+
enc_layers: 4
|
61 |
+
enc_post_net_kernel: 3
|
62 |
+
enc_pre_ln: false
|
63 |
+
enc_prenet: true
|
64 |
+
encoder_K: 8
|
65 |
+
encoder_type: rel_fft
|
66 |
+
endless_ds: true
|
67 |
+
eval_max_batches: -1
|
68 |
+
f0_max: 800
|
69 |
+
f0_min: 80
|
70 |
+
ffn_act: gelu
|
71 |
+
ffn_hidden_size: 768
|
72 |
+
fft_size: 1024
|
73 |
+
fmax: 7600
|
74 |
+
fmin: 80
|
75 |
+
frames_multiple: 4
|
76 |
+
fvae_dec_n_layers: 4
|
77 |
+
fvae_decoder_type: wn
|
78 |
+
fvae_enc_dec_hidden: 192
|
79 |
+
fvae_enc_n_layers: 8
|
80 |
+
fvae_encoder_type: wn
|
81 |
+
fvae_kernel_size: 5
|
82 |
+
fvae_noise_scale: 1.0
|
83 |
+
fvae_strides: 4
|
84 |
+
gen_dir_name: ''
|
85 |
+
glow_kernel_size: 3
|
86 |
+
griffin_lim_iters: 30
|
87 |
+
hidden_size: 192
|
88 |
+
hop_size: 256
|
89 |
+
infer: false
|
90 |
+
infer_post_glow: true
|
91 |
+
kl_min: 0.0
|
92 |
+
kl_start_steps: 10000
|
93 |
+
lambda_commit: 0.25
|
94 |
+
lambda_energy: 0.1
|
95 |
+
lambda_f0: 1.0
|
96 |
+
lambda_kl: 1.0
|
97 |
+
lambda_ph_dur: 0.1
|
98 |
+
lambda_sent_dur: 0.0
|
99 |
+
lambda_uv: 1.0
|
100 |
+
lambda_word_dur: 1.0
|
101 |
+
latent_size: 16
|
102 |
+
layers_in_block: 2
|
103 |
+
load_ckpt: ''
|
104 |
+
loud_norm: false
|
105 |
+
lr: 0.0002
|
106 |
+
max_epochs: 1000
|
107 |
+
max_frames: 1548
|
108 |
+
max_input_tokens: 1550
|
109 |
+
max_sentences: 64
|
110 |
+
max_tokens: 40000
|
111 |
+
max_updates: 480000
|
112 |
+
max_valid_sentences: 1
|
113 |
+
max_valid_tokens: 60000
|
114 |
+
mel_losses: l1:0.5|ssim:0.5
|
115 |
+
mel_vmax: 1.5
|
116 |
+
mel_vmin: -6
|
117 |
+
min_frames: 0
|
118 |
+
noise_scale: 0.8
|
119 |
+
num_ckpt_keep: 3
|
120 |
+
num_heads: 2
|
121 |
+
num_sanity_val_steps: 5
|
122 |
+
num_spk: 1
|
123 |
+
num_valid_plots: 10
|
124 |
+
optimizer_adam_beta1: 0.9
|
125 |
+
optimizer_adam_beta2: 0.98
|
126 |
+
out_wav_norm: false
|
127 |
+
pitch_extractor: parselmouth
|
128 |
+
pitch_key: pitch
|
129 |
+
pitch_type: frame
|
130 |
+
post_decoder: false
|
131 |
+
post_decoder_detach_ling: false
|
132 |
+
post_flow_lr: 0.001
|
133 |
+
post_glow_hidden: 192
|
134 |
+
post_glow_kernel_size: 3
|
135 |
+
post_glow_n_block_layers: 3
|
136 |
+
post_glow_n_blocks: 12
|
137 |
+
post_glow_training_start: 160000
|
138 |
+
post_share_cond_layers: false
|
139 |
+
posterior_start_steps: 0
|
140 |
+
predictor_dropout: 0.2
|
141 |
+
predictor_grad: 0.1
|
142 |
+
predictor_hidden: -1
|
143 |
+
predictor_kernel: 5
|
144 |
+
predictor_layers: 2
|
145 |
+
preprocess_args:
|
146 |
+
add_eos_bos: true
|
147 |
+
mfa_group_shuffle: false
|
148 |
+
mfa_offset: 0.02
|
149 |
+
nsample_per_mfa_group: 1000
|
150 |
+
reset_phone_dict: true
|
151 |
+
reset_word_dict: true
|
152 |
+
save_sil_mask: true
|
153 |
+
txt_processor: en
|
154 |
+
use_mfa: true
|
155 |
+
vad_max_silence_length: 12
|
156 |
+
wav_processors: []
|
157 |
+
with_phsep: true
|
158 |
+
preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
|
159 |
+
print_nan_grads: false
|
160 |
+
prior_glow_hidden: 64
|
161 |
+
prior_glow_n_blocks: 4
|
162 |
+
processed_data_dir: data/processed/ljspeech
|
163 |
+
profile_infer: false
|
164 |
+
raw_data_dir: data/raw/LJSpeech-1.1
|
165 |
+
ref_norm_layer: bn
|
166 |
+
rename_tmux: true
|
167 |
+
resume_from_checkpoint: 0
|
168 |
+
save_best: false
|
169 |
+
save_codes:
|
170 |
+
- tasks
|
171 |
+
- modules
|
172 |
+
- egs
|
173 |
+
save_f0: false
|
174 |
+
save_gt: true
|
175 |
+
scheduler: warmup
|
176 |
+
seed: 1234
|
177 |
+
share_wn_layers: 4
|
178 |
+
sigmoid_scale: false
|
179 |
+
sort_by_len: true
|
180 |
+
task_cls: tasks.tts.ps_flow.PortaSpeechFlowTask
|
181 |
+
tb_log_interval: 100
|
182 |
+
test_ids:
|
183 |
+
- 0
|
184 |
+
- 1
|
185 |
+
- 2
|
186 |
+
- 3
|
187 |
+
- 4
|
188 |
+
- 5
|
189 |
+
- 6
|
190 |
+
- 7
|
191 |
+
- 8
|
192 |
+
- 9
|
193 |
+
- 10
|
194 |
+
- 11
|
195 |
+
- 12
|
196 |
+
- 13
|
197 |
+
- 14
|
198 |
+
- 15
|
199 |
+
- 16
|
200 |
+
- 17
|
201 |
+
- 18
|
202 |
+
- 19
|
203 |
+
- 68
|
204 |
+
- 70
|
205 |
+
- 74
|
206 |
+
- 87
|
207 |
+
- 110
|
208 |
+
- 172
|
209 |
+
- 190
|
210 |
+
- 215
|
211 |
+
- 231
|
212 |
+
- 294
|
213 |
+
- 316
|
214 |
+
- 324
|
215 |
+
- 402
|
216 |
+
- 422
|
217 |
+
- 485
|
218 |
+
- 500
|
219 |
+
- 505
|
220 |
+
- 508
|
221 |
+
- 509
|
222 |
+
- 519
|
223 |
+
test_input_yaml: ''
|
224 |
+
test_num: 100
|
225 |
+
test_set_name: test
|
226 |
+
text_encoder_postnet: false
|
227 |
+
train_set_name: train
|
228 |
+
train_sets: ''
|
229 |
+
two_stage: true
|
230 |
+
use_cond_proj: false
|
231 |
+
use_fvae: true
|
232 |
+
use_gt_dur: false
|
233 |
+
use_gt_f0: false
|
234 |
+
use_latent_cond: false
|
235 |
+
use_pitch_embed: false
|
236 |
+
use_pos_embed: true
|
237 |
+
use_post_flow: true
|
238 |
+
use_prior_flow: true
|
239 |
+
use_spk_embed: false
|
240 |
+
use_spk_id: false
|
241 |
+
use_txt_cond: true
|
242 |
+
use_uv: true
|
243 |
+
use_word_encoder: false
|
244 |
+
use_word_input: false
|
245 |
+
val_check_interval: 2000
|
246 |
+
valid_infer_interval: 10000
|
247 |
+
valid_monitor_key: val_loss
|
248 |
+
valid_monitor_mode: min
|
249 |
+
valid_set_name: valid
|
250 |
+
vocoder: HifiGAN
|
251 |
+
vocoder_ckpt: checkpoints/hifi_lj
|
252 |
+
warmup_updates: 8000
|
253 |
+
weight_decay: 0
|
254 |
+
win_size: 1024
|
255 |
+
word_dict_size: 10000
|
256 |
+
word_enc_layers: 4
|
257 |
+
word_encoder_type: rel_fft
|
258 |
+
work_dir: checkpoints/ps_normal_exp
|
checkpoints/ps_normal_exp/model_ckpt_steps_278000.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:13a51035b84c2a385d05ce695f6dca0b5095e7bd7ea3b1d34a22aed4d9c9b5fc
|
3 |
+
size 104081102
|
checkpoints/ps_small_exp/config.yaml
ADDED
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accumulate_grad_batches: 1
|
2 |
+
add_word_pos: true
|
3 |
+
amp: false
|
4 |
+
audio_num_mel_bins: 80
|
5 |
+
audio_sample_rate: 22050
|
6 |
+
base_config:
|
7 |
+
- ./ps_flow_small.yaml
|
8 |
+
binarization_args:
|
9 |
+
min_sil_duration: 0.1
|
10 |
+
shuffle: false
|
11 |
+
test_range:
|
12 |
+
- 0
|
13 |
+
- 523
|
14 |
+
train_range:
|
15 |
+
- 871
|
16 |
+
- -1
|
17 |
+
trim_eos_bos: false
|
18 |
+
valid_range:
|
19 |
+
- 523
|
20 |
+
- 871
|
21 |
+
with_align: true
|
22 |
+
with_f0: true
|
23 |
+
with_f0cwt: false
|
24 |
+
with_linear: false
|
25 |
+
with_spk_embed: false
|
26 |
+
with_wav: false
|
27 |
+
binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
|
28 |
+
binary_data_dir: data/binary/ljspeech
|
29 |
+
check_val_every_n_epoch: 10
|
30 |
+
clip_grad_norm: 1
|
31 |
+
clip_grad_value: 0
|
32 |
+
conv_use_pos: false
|
33 |
+
debug: false
|
34 |
+
dec_dilations:
|
35 |
+
- 1
|
36 |
+
- 1
|
37 |
+
- 1
|
38 |
+
- 1
|
39 |
+
dec_ffn_kernel_size: 9
|
40 |
+
dec_inp_add_noise: false
|
41 |
+
dec_kernel_size: 5
|
42 |
+
dec_layers: 4
|
43 |
+
dec_post_net_kernel: 3
|
44 |
+
decoder_rnn_dim: 0
|
45 |
+
decoder_type: conv
|
46 |
+
detach_postflow_input: true
|
47 |
+
dropout: 0.0
|
48 |
+
ds_workers: 2
|
49 |
+
dur_level: word
|
50 |
+
dur_predictor_kernel: 5
|
51 |
+
dur_predictor_layers: 3
|
52 |
+
enc_dec_norm: ln
|
53 |
+
enc_dilations:
|
54 |
+
- 1
|
55 |
+
- 1
|
56 |
+
- 1
|
57 |
+
- 1
|
58 |
+
enc_ffn_kernel_size: 3
|
59 |
+
enc_kernel_size: 5
|
60 |
+
enc_layers: 3
|
61 |
+
enc_post_net_kernel: 3
|
62 |
+
enc_pre_ln: false
|
63 |
+
enc_prenet: true
|
64 |
+
encoder_K: 8
|
65 |
+
encoder_type: rel_fft
|
66 |
+
endless_ds: true
|
67 |
+
eval_max_batches: -1
|
68 |
+
f0_max: 800
|
69 |
+
f0_min: 80
|
70 |
+
ffn_act: gelu
|
71 |
+
ffn_hidden_size: 512
|
72 |
+
fft_size: 1024
|
73 |
+
fmax: 7600
|
74 |
+
fmin: 80
|
75 |
+
frames_multiple: 4
|
76 |
+
fvae_dec_n_layers: 3
|
77 |
+
fvae_decoder_type: wn
|
78 |
+
fvae_enc_dec_hidden: 128
|
79 |
+
fvae_enc_n_layers: 8
|
80 |
+
fvae_encoder_type: wn
|
81 |
+
fvae_kernel_size: 3
|
82 |
+
fvae_noise_scale: 1.0
|
83 |
+
fvae_strides: 4
|
84 |
+
gen_dir_name: ''
|
85 |
+
glow_kernel_size: 3
|
86 |
+
griffin_lim_iters: 30
|
87 |
+
hidden_size: 128
|
88 |
+
hop_size: 256
|
89 |
+
infer: false
|
90 |
+
infer_post_glow: true
|
91 |
+
kl_min: 0.0
|
92 |
+
kl_start_steps: 10000
|
93 |
+
lambda_commit: 0.25
|
94 |
+
lambda_energy: 0.1
|
95 |
+
lambda_f0: 1.0
|
96 |
+
lambda_kl: 1.0
|
97 |
+
lambda_ph_dur: 0.1
|
98 |
+
lambda_sent_dur: 0.0
|
99 |
+
lambda_uv: 1.0
|
100 |
+
lambda_word_dur: 1.0
|
101 |
+
latent_size: 16
|
102 |
+
layers_in_block: 2
|
103 |
+
load_ckpt: ''
|
104 |
+
loud_norm: false
|
105 |
+
lr: 0.0002
|
106 |
+
max_epochs: 1000
|
107 |
+
max_frames: 1548
|
108 |
+
max_input_tokens: 1550
|
109 |
+
max_sentences: 128
|
110 |
+
max_tokens: 40000
|
111 |
+
max_updates: 480000
|
112 |
+
max_valid_sentences: 1
|
113 |
+
max_valid_tokens: 60000
|
114 |
+
mel_losses: l1:0.5|ssim:0.5
|
115 |
+
mel_vmax: 1.5
|
116 |
+
mel_vmin: -6
|
117 |
+
min_frames: 0
|
118 |
+
noise_scale: 0.6
|
119 |
+
num_ckpt_keep: 3
|
120 |
+
num_heads: 2
|
121 |
+
num_sanity_val_steps: 5
|
122 |
+
num_spk: 1
|
123 |
+
num_valid_plots: 10
|
124 |
+
optimizer_adam_beta1: 0.9
|
125 |
+
optimizer_adam_beta2: 0.98
|
126 |
+
out_wav_norm: false
|
127 |
+
pitch_extractor: parselmouth
|
128 |
+
pitch_key: pitch
|
129 |
+
pitch_type: frame
|
130 |
+
post_decoder: false
|
131 |
+
post_decoder_detach_ling: false
|
132 |
+
post_flow_lr: 0.001
|
133 |
+
post_glow_hidden: 128
|
134 |
+
post_glow_kernel_size: 3
|
135 |
+
post_glow_n_block_layers: 3
|
136 |
+
post_glow_n_blocks: 8
|
137 |
+
post_glow_training_start: 160000
|
138 |
+
post_share_cond_layers: false
|
139 |
+
posterior_start_steps: 0
|
140 |
+
predictor_dropout: 0.2
|
141 |
+
predictor_grad: 0.1
|
142 |
+
predictor_hidden: -1
|
143 |
+
predictor_kernel: 5
|
144 |
+
predictor_layers: 2
|
145 |
+
preprocess_args:
|
146 |
+
add_eos_bos: true
|
147 |
+
mfa_group_shuffle: false
|
148 |
+
mfa_offset: 0.02
|
149 |
+
nsample_per_mfa_group: 1000
|
150 |
+
reset_phone_dict: true
|
151 |
+
reset_word_dict: true
|
152 |
+
save_sil_mask: true
|
153 |
+
txt_processor: en
|
154 |
+
use_mfa: true
|
155 |
+
vad_max_silence_length: 12
|
156 |
+
wav_processors: []
|
157 |
+
with_phsep: true
|
158 |
+
preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
|
159 |
+
print_nan_grads: false
|
160 |
+
prior_glow_hidden: 32
|
161 |
+
prior_glow_n_blocks: 3
|
162 |
+
processed_data_dir: data/processed/ljspeech
|
163 |
+
profile_infer: false
|
164 |
+
raw_data_dir: data/raw/LJSpeech-1.1
|
165 |
+
ref_norm_layer: bn
|
166 |
+
rename_tmux: true
|
167 |
+
resume_from_checkpoint: 0
|
168 |
+
save_best: false
|
169 |
+
save_codes:
|
170 |
+
- tasks
|
171 |
+
- modules
|
172 |
+
- egs
|
173 |
+
save_f0: false
|
174 |
+
save_gt: true
|
175 |
+
scheduler: warmup
|
176 |
+
seed: 1234
|
177 |
+
share_wn_layers: 4
|
178 |
+
sigmoid_scale: false
|
179 |
+
sort_by_len: true
|
180 |
+
task_cls: tasks.tts.ps_flow.PortaSpeechFlowTask
|
181 |
+
tb_log_interval: 100
|
182 |
+
test_ids:
|
183 |
+
- 0
|
184 |
+
- 1
|
185 |
+
- 2
|
186 |
+
- 3
|
187 |
+
- 4
|
188 |
+
- 5
|
189 |
+
- 6
|
190 |
+
- 7
|
191 |
+
- 8
|
192 |
+
- 9
|
193 |
+
- 10
|
194 |
+
- 11
|
195 |
+
- 12
|
196 |
+
- 13
|
197 |
+
- 14
|
198 |
+
- 15
|
199 |
+
- 16
|
200 |
+
- 17
|
201 |
+
- 18
|
202 |
+
- 19
|
203 |
+
- 68
|
204 |
+
- 70
|
205 |
+
- 74
|
206 |
+
- 87
|
207 |
+
- 110
|
208 |
+
- 172
|
209 |
+
- 190
|
210 |
+
- 215
|
211 |
+
- 231
|
212 |
+
- 294
|
213 |
+
- 316
|
214 |
+
- 324
|
215 |
+
- 402
|
216 |
+
- 422
|
217 |
+
- 485
|
218 |
+
- 500
|
219 |
+
- 505
|
220 |
+
- 508
|
221 |
+
- 509
|
222 |
+
- 519
|
223 |
+
test_input_yaml: ''
|
224 |
+
test_num: 100
|
225 |
+
test_set_name: test
|
226 |
+
text_encoder_postnet: false
|
227 |
+
train_set_name: train
|
228 |
+
train_sets: ''
|
229 |
+
two_stage: true
|
230 |
+
use_cond_proj: false
|
231 |
+
use_fvae: true
|
232 |
+
use_gt_dur: false
|
233 |
+
use_gt_f0: false
|
234 |
+
use_latent_cond: false
|
235 |
+
use_pitch_embed: false
|
236 |
+
use_pos_embed: true
|
237 |
+
use_post_flow: true
|
238 |
+
use_prior_flow: true
|
239 |
+
use_spk_embed: false
|
240 |
+
use_spk_id: false
|
241 |
+
use_txt_cond: true
|
242 |
+
use_uv: true
|
243 |
+
use_word_encoder: false
|
244 |
+
use_word_input: false
|
245 |
+
val_check_interval: 2000
|
246 |
+
valid_infer_interval: 10000
|
247 |
+
valid_monitor_key: val_loss
|
248 |
+
valid_monitor_mode: min
|
249 |
+
valid_set_name: valid
|
250 |
+
vocoder: HifiGAN
|
251 |
+
vocoder_ckpt: checkpoints/hifi_lj
|
252 |
+
warmup_updates: 8000
|
253 |
+
weight_decay: 0
|
254 |
+
win_size: 1024
|
255 |
+
word_dict_size: 10000
|
256 |
+
word_enc_layers: 3
|
257 |
+
word_encoder_type: rel_fft
|
258 |
+
work_dir: checkpoints/ps_small_exp
|
checkpoints/ps_small_exp/model_ckpt_steps_410000.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6905d8969febca192f2239a99e833d9084b2e07cb6894a63e286901ab1d16553
|
3 |
+
size 32754716
|
data/binary/ljspeech/phone_set.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]
|
data/binary/ljspeech/spk_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"<SINGLE_SPK>": 0}
|
data/binary/ljspeech/word_set.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/binary/ljspeech_cwt/phone_set.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]
|
data/binary/ljspeech_cwt/spk_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"<SINGLE_SPK>": 0}
|
data/binary/ljspeech_cwt/word_set.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data_gen/tts/base_binarizer.py
ADDED
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import random
|
4 |
+
import traceback
|
5 |
+
from functools import partial
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
from resemblyzer import VoiceEncoder
|
9 |
+
from tqdm import tqdm
|
10 |
+
|
11 |
+
import utils.commons.single_thread_env # NOQA
|
12 |
+
from utils.audio import librosa_wav2spec
|
13 |
+
from utils.audio.align import get_mel2ph, mel2token_to_dur
|
14 |
+
from utils.audio.cwt import get_lf0_cwt, get_cont_lf0
|
15 |
+
from utils.audio.pitch.utils import f0_to_coarse
|
16 |
+
from utils.audio.pitch_extractors import extract_pitch_simple
|
17 |
+
from utils.commons.hparams import hparams
|
18 |
+
from utils.commons.indexed_datasets import IndexedDatasetBuilder
|
19 |
+
from utils.commons.multiprocess_utils import multiprocess_run_tqdm
|
20 |
+
from utils.os_utils import remove_file, copy_file
|
21 |
+
|
22 |
+
# Tell NumPy to ignore divide-by-zero and invalid-value floating-point
# conditions instead of reporting them (e.g. the f0 normalisation in
# BaseBinarizer.process_pitch divides by a standard deviation that may be 0).
np.seterr(divide='ignore', invalid='ignore')
|
23 |
+
|
24 |
+
|
25 |
+
class BinarizationError(Exception):
    """Raised when a single item cannot be binarized and should be skipped."""
|
27 |
+
|
28 |
+
|
29 |
+
class BaseBinarizer:
    """Convert preprocessed items (``metadata.json``) into indexed binary datasets.

    The binarizer reads the item list written by the preprocessing step,
    splits it into train/valid/test ranges taken from
    ``hparams['binarization_args']`` and, for each split, extracts audio
    features per item and stores them through ``IndexedDatasetBuilder``.
    """

    def __init__(self, processed_data_dir=None):
        """
        :param processed_data_dir: directory holding ``metadata.json``; defaults to
            ``hparams['processed_data_dir']`` when ``None``.
        """
        if processed_data_dir is None:
            processed_data_dir = hparams['processed_data_dir']
        self.processed_data_dir = processed_data_dir
        self.binarization_args = hparams['binarization_args']
        self.items = {}  # item_name -> metadata dict
        self.item_names = []  # item names in (optionally shuffled) order

    def load_meta_data(self):
        """Load ``metadata.json`` into ``self.items`` / ``self.item_names``.

        When ``binarization_args['shuffle']`` is set, the names are shuffled with
        a fixed seed (1234) so the split is reproducible.
        """
        # Use a context manager so the metadata file handle is closed deterministically.
        with open(f"{self.processed_data_dir}/metadata.json") as f:
            items_list = json.load(f)
        for r in tqdm(items_list, desc='Loading meta data.'):
            item_name = r['item_name']
            self.items[item_name] = r
            self.item_names.append(item_name)
        if self.binarization_args['shuffle']:
            random.seed(1234)
            random.shuffle(self.item_names)

    @property
    def train_item_names(self):
        """Item names in ``binarization_args['train_range']``."""
        start, end = self._convert_range(self.binarization_args['train_range'])
        return self.item_names[start:end]

    @property
    def valid_item_names(self):
        """Item names in ``binarization_args['valid_range']``."""
        start, end = self._convert_range(self.binarization_args['valid_range'])
        return self.item_names[start:end]

    @property
    def test_item_names(self):
        """Item names in ``binarization_args['test_range']``."""
        start, end = self._convert_range(self.binarization_args['test_range'])
        return self.item_names[start:end]

    def _convert_range(self, range_):
        """Resolve a ``[start, end]`` pair, mapping ``end == -1`` to "until the last item".

        A new list is returned; the input (which normally lives inside the
        shared ``hparams`` config) is left untouched so the ``-1`` sentinel is
        not permanently overwritten with the current dataset size.

        :param range_: two-element sequence ``[start, end]``.
        :return: ``[start, end]`` with ``end`` resolved.
        """
        start, end = range_[0], range_[1]
        if end == -1:
            end = len(self.item_names)
        return [start, end]

    def meta_data(self, prefix):
        """Yield the metadata dict of every item in the requested split.

        :param prefix: ``'valid'`` or ``'test'``; any other value selects the train split.
        """
        if prefix == 'valid':
            item_names = self.valid_item_names
        elif prefix == 'test':
            item_names = self.test_item_names
        else:
            item_names = self.train_item_names
        for item_name in item_names:
            yield self.items[item_name]

    def process(self):
        """Binarize all splits into ``hparams['binary_data_dir']``.

        The phone/word/speaker dictionaries are copied from the processed data
        directory first (replacing any stale copies), then the valid, test and
        train splits are processed in that order.
        """
        self.load_meta_data()
        os.makedirs(hparams['binary_data_dir'], exist_ok=True)
        for fn in ['phone_set.json', 'word_set.json', 'spk_map.json']:
            remove_file(f"{hparams['binary_data_dir']}/{fn}")
            copy_file(f"{hparams['processed_data_dir']}/{fn}", f"{hparams['binary_data_dir']}/{fn}")
        self.process_data('valid')
        self.process_data('test')
        self.process_data('train')

    def process_data(self, prefix):
        """Process one split and write ``{prefix}`` dataset files plus length arrays.

        Writes the indexed dataset, ``{prefix}_lengths.npy`` (mel lengths) and,
        when available, ``{prefix}_ph_lengths.npy``; prints the total duration.

        :param prefix: split name, used both for selection and as file prefix.
        """
        data_dir = hparams['binary_data_dir']
        builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
        meta_data = list(self.meta_data(prefix))
        process_item = partial(self.process_item, binarization_args=self.binarization_args)
        ph_lengths = []
        mel_lengths = []
        total_sec = 0
        items = []
        args = [{'item': item} for item in meta_data]
        # process_item returns None for items that must be skipped.
        for _, item in multiprocess_run_tqdm(process_item, args, desc='Processing data'):
            if item is not None:
                items.append(item)
        if self.binarization_args['with_spk_embed']:
            # args is built from the filtered list, so item_id indexes into `items`.
            args = [{'wav': item['wav']} for item in items]
            for item_id, spk_embed in multiprocess_run_tqdm(
                    self.get_spk_embed, args,
                    init_ctx_func=lambda wid: {'voice_encoder': VoiceEncoder().cuda()}, num_workers=4,
                    desc='Extracting spk embed'):
                items[item_id]['spk_embed'] = spk_embed

        for item in items:
            # The waveform is only persisted when explicitly requested.
            if not self.binarization_args['with_wav'] and 'wav' in item:
                del item['wav']
            builder.add_item(item)
            mel_lengths.append(item['len'])
            assert item['len'] > 0, (item['item_name'], item['txt'], item['mel2ph'])
            if 'ph_len' in item:
                ph_lengths.append(item['ph_len'])
            total_sec += item['sec']
        builder.finalize()
        np.save(f'{data_dir}/{prefix}_lengths.npy', mel_lengths)
        if ph_lengths:
            np.save(f'{data_dir}/{prefix}_ph_lengths.npy', ph_lengths)
        print(f"| {prefix} total duration: {total_sec:.3f}s")

    @classmethod
    def process_item(cls, item, binarization_args):
        """Extract all features for one item, updating ``item`` in place.

        Audio extraction runs outside the ``try`` block, so its errors
        propagate. Alignment, optional BOS/EOS trimming and pitch extraction
        are best-effort: on any exception the item is reported and skipped.

        :param item: metadata dict for one utterance (mutated).
        :param binarization_args: ``hparams['binarization_args']``.
        :return: the updated ``item``, or ``None`` if the item was skipped.
        """
        item['ph_len'] = len(item['ph_token'])
        item_name = item['item_name']
        wav_fn = item['wav_fn']
        wav, mel = cls.process_audio(wav_fn, item, binarization_args)
        try:
            n_bos_frames, n_eos_frames = 0, 0
            if binarization_args['with_align']:
                tg_fn = f"{hparams['processed_data_dir']}/mfa_outputs/{item_name}.TextGrid"
                item['tg_fn'] = tg_fn
                cls.process_align(tg_fn, item)
                if binarization_args['trim_eos_bos']:
                    # The first/last durations belong to the BOS/EOS tokens;
                    # cut those frames (and the matching samples) off every sequence.
                    n_bos_frames = item['dur'][0]
                    n_eos_frames = item['dur'][-1]
                    T = len(mel)
                    item['mel'] = mel[n_bos_frames:T - n_eos_frames]
                    item['mel2ph'] = item['mel2ph'][n_bos_frames:T - n_eos_frames]
                    item['mel2word'] = item['mel2word'][n_bos_frames:T - n_eos_frames]
                    item['dur'] = item['dur'][1:-1]
                    item['dur_word'] = item['dur_word'][1:-1]
                    item['len'] = item['mel'].shape[0]
                    item['wav'] = wav[n_bos_frames * hparams['hop_size']:len(wav) - n_eos_frames * hparams['hop_size']]
            if binarization_args['with_f0']:
                cls.process_pitch(item, n_bos_frames, n_eos_frames)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        except Exception:
            traceback.print_exc()
            print(f"| Skip item. item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return item

    @classmethod
    def process_audio(cls, wav_fn, res, binarization_args):
        """Compute the mel-spectrogram for ``wav_fn`` and store audio fields in ``res``.

        Sets ``res['mel']``, ``res['wav']`` (cast to float16), ``res['sec']``,
        ``res['len']`` and, when ``binarization_args['with_linear']`` is set,
        ``res['linear']``.

        :return: ``(wav, mel)``.
        """
        wav2spec_dict = librosa_wav2spec(
            wav_fn,
            fft_size=hparams['fft_size'],
            hop_size=hparams['hop_size'],
            win_length=hparams['win_size'],
            num_mels=hparams['audio_num_mel_bins'],
            fmin=hparams['fmin'],
            fmax=hparams['fmax'],
            sample_rate=hparams['audio_sample_rate'],
            loud_norm=hparams['loud_norm'])
        mel = wav2spec_dict['mel']
        wav = wav2spec_dict['wav'].astype(np.float16)
        if binarization_args['with_linear']:
            res['linear'] = wav2spec_dict['linear']
        res.update({'mel': mel, 'wav': wav, 'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0]})
        return wav, mel

    @staticmethod
    def process_align(tg_fn, item):
        """Attach frame-level alignment (``mel2ph``, ``dur``, ``mel2word``, ``dur_word``) to ``item``.

        :param tg_fn: path of the TextGrid alignment file.
        :param item: item dict; must already contain ``ph``, ``mel``, ``ph_token``,
            ``ph2word`` and ``word_token``.
        :raises BinarizationError: if the TextGrid is missing or the alignment
            refers to more phones than ``ph_token`` contains.
        """
        ph = item['ph']
        mel = item['mel']
        ph_token = item['ph_token']
        if tg_fn is not None and os.path.exists(tg_fn):
            mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams['hop_size'], hparams['audio_sample_rate'],
                                     hparams['binarization_args']['min_sil_duration'])
        else:
            raise BinarizationError("Align not found")
        # Compute the maximum once via np.array so the check and the error
        # message agree even if mel2ph is not an ndarray.
        max_ph_idx = np.array(mel2ph).max() - 1
        if max_ph_idx >= len(ph_token):
            raise BinarizationError(
                f"Align does not match: mel2ph.max() - 1: {max_ph_idx}, len(phone_encoded): {len(ph_token)}")
        item['mel2ph'] = mel2ph
        item['dur'] = dur

        ph2word = item['ph2word']
        # mel2ph entries are used as 1-based phone indices here (hence `p - 1`).
        mel2word = [ph2word[p - 1] for p in item['mel2ph']]
        item['mel2word'] = mel2word  # [T_mel]
        dur_word = mel2token_to_dur(mel2word, len(item['word_token']))
        item['dur_word'] = dur_word.tolist()  # [T_word]

    @staticmethod
    def process_pitch(item, n_bos_frames, n_eos_frames):
        """Extract f0 (and optionally its CWT representation) from ``item['wav']``.

        Sets ``item['f0']`` and ``item['pitch']``; with
        ``binarization_args['with_f0cwt']`` also ``cwt_spec``, ``cwt_mean`` and
        ``cwt_std``. ``n_bos_frames`` / ``n_eos_frames`` are accepted for
        interface compatibility and are not used here.

        :raises BinarizationError: if the extracted f0 sums to zero.
        """
        mel = item['mel']
        f0 = extract_pitch_simple(item['wav'])
        if sum(f0) == 0:
            raise BinarizationError("Empty f0")
        assert len(mel) == len(f0), (len(mel), len(f0))
        pitch_coarse = f0_to_coarse(f0)
        item['f0'] = f0
        item['pitch'] = pitch_coarse
        if hparams['binarization_args']['with_f0cwt']:
            uv, cont_lf0_lpf = get_cont_lf0(f0)
            logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
            # Normalise before the wavelet transform; mean/std are stored alongside cwt_spec.
            cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
            cwt_spec, scales = get_lf0_cwt(cont_lf0_lpf_norm)
            item['cwt_spec'] = cwt_spec
            item['cwt_mean'] = logf0s_mean_org
            item['cwt_std'] = logf0s_std_org

    @staticmethod
    def get_spk_embed(wav, ctx):
        """Return the speaker embedding of ``wav`` using ``ctx['voice_encoder']``."""
        return ctx['voice_encoder'].embed_utterance(wav.astype(float))

    @property
    def num_workers(self):
        """Worker count: ``N_PROC`` env var, else ``hparams['N_PROC']``, else the CPU count."""
        return int(os.getenv('N_PROC', hparams.get('N_PROC', os.cpu_count())))
|
data_gen/tts/base_preprocess.py
ADDED
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import random
|
4 |
+
import re
|
5 |
+
import traceback
|
6 |
+
from collections import Counter
|
7 |
+
from functools import partial
|
8 |
+
|
9 |
+
import librosa
|
10 |
+
from tqdm import tqdm
|
11 |
+
from data_gen.tts.txt_processors.base_text_processor import get_txt_processor_cls
|
12 |
+
from data_gen.tts.wav_processors.base_processor import get_wav_processor_cls
|
13 |
+
from utils.commons.hparams import hparams
|
14 |
+
from utils.commons.multiprocess_utils import multiprocess_run_tqdm
|
15 |
+
from utils.os_utils import link_file, move_file, remove_file
|
16 |
+
from utils.text.text_encoder import is_sil_phoneme, build_token_encoder
|
17 |
+
|
18 |
+
|
19 |
+
class BasePreprocessor:
    """Base class for dataset preprocessing: text -> phones, wav processing, MFA inputs.

    Subclasses implement :meth:`meta_data`; :meth:`process` then writes the
    phone/word/speaker dictionaries, processed wavs, optional MFA inputs and
    ``metadata.json`` under ``hparams['processed_data_dir']``.
    """

    def __init__(self):
        self.preprocess_args = hparams['preprocess_args']
        txt_processor = self.preprocess_args['txt_processor']
        self.txt_processor = get_txt_processor_cls(txt_processor)
        self.raw_data_dir = hparams['raw_data_dir']
        self.processed_dir = hparams['processed_data_dir']
        self.spk_map_fn = f"{self.processed_dir}/spk_map.json"

    def meta_data(self):
        """Yield the raw items of the dataset (to be implemented by subclasses).

        :return: {'item_name': Str, 'wav_fn': Str, 'txt': Str, 'spk_name': Str, 'txt_loader': None or Func}
        """
        raise NotImplementedError

    def process(self):
        """Run the whole preprocessing pipeline and write ``{meta_csv_filename}.json``."""
        processed_dir = self.processed_dir
        # Start from clean temporary / output wav directories.
        wav_processed_tmp_dir = f'{processed_dir}/processed_tmp'
        remove_file(wav_processed_tmp_dir)
        os.makedirs(wav_processed_tmp_dir, exist_ok=True)
        wav_processed_dir = f'{processed_dir}/{self.wav_processed_dirname}'
        remove_file(wav_processed_dir)
        os.makedirs(wav_processed_dir, exist_ok=True)

        meta_data = list(tqdm(self.meta_data(), desc='Load meta data'))
        item_names = [d['item_name'] for d in meta_data]
        assert len(item_names) == len(set(item_names)), 'Key `item_name` should be Unique.'

        # preprocess data
        phone_list = []
        word_list = []
        spk_names = set()
        process_item = partial(self.preprocess_first_pass,
                               txt_processor=self.txt_processor,
                               wav_processed_dir=wav_processed_dir,
                               wav_processed_tmp=wav_processed_tmp_dir,
                               preprocess_args=self.preprocess_args)
        items = []
        args = [{
            'item_name': item_raw['item_name'],
            'txt_raw': item_raw['txt'],
            'wav_fn': item_raw['wav_fn'],
            'txt_loader': item_raw.get('txt_loader'),
            'others': item_raw.get('others', None)
        } for item_raw in meta_data]
        # Results are paired with the raw items by position; a None result means
        # the first pass failed for that item and it is dropped.
        for item_, (item_id, item) in zip(meta_data, multiprocess_run_tqdm(process_item, args, desc='Preprocess')):
            if item is not None:
                item_.update(item)
                item = item_
                if 'txt_loader' in item:
                    del item['txt_loader']
                item['id'] = item_id
                item['spk_name'] = item.get('spk_name', '<SINGLE_SPK>')
                item['others'] = item.get('others', None)
                phone_list += item['ph'].split(" ")
                word_list += item['word'].split(" ")
                spk_names.add(item['spk_name'])
                items.append(item)

        # add encoded tokens
        ph_encoder, word_encoder = self._phone_encoder(phone_list), self._word_encoder(word_list)
        spk_map = self.build_spk_map(spk_names)
        args = [{
            'ph': item['ph'], 'word': item['word'], 'spk_name': item['spk_name'],
            'word_encoder': word_encoder, 'ph_encoder': ph_encoder, 'spk_map': spk_map
        } for item in items]
        for idx, item_new_kv in multiprocess_run_tqdm(self.preprocess_second_pass, args, desc='Add encoded tokens'):
            items[idx].update(item_new_kv)

        # build mfa data
        if self.preprocess_args['use_mfa']:
            mfa_dict = set()
            mfa_input_dir = f'{processed_dir}/mfa_inputs'
            remove_file(mfa_input_dir)
            # group MFA inputs for better parallelism
            mfa_groups = [i // self.preprocess_args['nsample_per_mfa_group'] for i in range(len(items))]
            if self.preprocess_args['mfa_group_shuffle']:
                random.seed(hparams['seed'])
                random.shuffle(mfa_groups)
            args = [{
                'item': item, 'mfa_input_dir': mfa_input_dir,
                'mfa_group': mfa_group, 'wav_processed_tmp': wav_processed_tmp_dir,
                'preprocess_args': self.preprocess_args
            } for item, mfa_group in zip(items, mfa_groups)]
            for i, (ph_gb_word_nosil, new_wav_align_fn) in multiprocess_run_tqdm(
                    self.build_mfa_inputs, args, desc='Build MFA data'):
                items[i]['wav_align_fn'] = new_wav_align_fn
                # Each dictionary line maps a "word" (phones joined by '_') to its phones.
                for w in ph_gb_word_nosil.split(" "):
                    mfa_dict.add(f"{w} {w.replace('_', ' ')}")
            mfa_dict = sorted(mfa_dict)
            with open(f'{processed_dir}/mfa_dict.txt', 'w') as f:
                f.writelines([f'{l}\n' for l in mfa_dict])
        with open(f"{processed_dir}/{self.meta_csv_filename}.json", 'w') as f:
            # The regex collapses the line breaks that indent=1 puts before
            # digits / '+' / ']' so numeric lists stay on one line.
            f.write(re.sub(r'\n\s+([\d+\]])', r'\1', json.dumps(items, ensure_ascii=False, sort_keys=False, indent=1)))
        remove_file(wav_processed_tmp_dir)

    @classmethod
    def preprocess_first_pass(cls, item_name, txt_raw, txt_processor,
                              wav_fn, wav_processed_dir, wav_processed_tmp,
                              preprocess_args, txt_loader=None, others=None):
        """Convert one item's text to phones and place its wav in ``wav_processed_dir``.

        :return: dict with ``txt``, ``txt_raw``, ``ph``, ``word``, ``ph2word``,
            ``ph_gb_word``, ``wav_fn``, ``wav_align_fn`` and ``others``; ``None``
            if any ordinary exception occurred (the traceback is printed).
        """
        try:
            if txt_loader is not None:
                txt_raw = txt_loader(txt_raw)
            ph, txt, word, ph2word, ph_gb_word = cls.txt_to_ph(txt_processor, txt_raw, preprocess_args)
            wav_fn, wav_align_fn = cls.process_wav(
                item_name, wav_fn,
                hparams['processed_data_dir'],
                wav_processed_tmp, preprocess_args)

            # wav for binarization
            ext = os.path.splitext(wav_fn)[1]
            os.makedirs(wav_processed_dir, exist_ok=True)
            new_wav_fn = f"{wav_processed_dir}/{item_name}{ext}"
            # Files produced in the tmp dir are moved; untouched source files are linked.
            move_link_func = move_file if os.path.dirname(wav_fn) == wav_processed_tmp else link_file
            move_link_func(wav_fn, new_wav_fn)
            return {
                'txt': txt, 'txt_raw': txt_raw, 'ph': ph,
                'word': word, 'ph2word': ph2word, 'ph_gb_word': ph_gb_word,
                'wav_fn': new_wav_fn, 'wav_align_fn': wav_align_fn,
                'others': others
            }
        except Exception:
            # Best-effort per item, but do not swallow KeyboardInterrupt / SystemExit.
            traceback.print_exc()
            print(f"| Error is caught. item_name: {item_name}.")
            return None

    @staticmethod
    def txt_to_ph(txt_processor, txt_raw, preprocess_args):
        """Run the text processor and flatten its word/phone structure.

        ``txt_processor.process`` is expected to return ``(txt_struct, txt)`` where
        each ``txt_struct`` entry is indexable as ``(word, phones)``.

        :return: ``(ph, txt, word, ph2word, ph_gb_word)``: space-joined phones, the
            processed text, space-joined words, the 1-based word index of every
            phone, and space-joined per-word phone groups joined by ``'_'``.
        """
        txt_struct, txt = txt_processor.process(txt_raw, preprocess_args)
        ph = [p for w in txt_struct for p in w[1]]
        ph_gb_word = ["_".join(w[1]) for w in txt_struct]
        words = [w[0] for w in txt_struct]
        # word_id=0 is reserved for padding
        ph2word = [w_id + 1 for w_id, w in enumerate(txt_struct) for _ in range(len(w[1]))]
        return " ".join(ph), txt, " ".join(words), ph2word, " ".join(ph_gb_word)

    @staticmethod
    def process_wav(item_name, wav_fn, processed_dir, wav_processed_tmp, preprocess_args):
        """Apply the configured wav processors in order.

        :return: ``(wav_fn_for_binarization, wav_fn_for_alignment)``. With no
            processors both are the input ``wav_fn``. Otherwise the alignment
            path is whatever the last processor returning a 3-tuple supplied,
            or ``None`` if no processor supplied one.
        """
        processors = [get_wav_processor_cls(v) for v in preprocess_args['wav_processors']]
        processors = [k() for k in processors if k is not None]
        if len(processors) >= 1:
            sr_file = librosa.core.get_samplerate(wav_fn)
            output_fn_for_align = None
            ext = os.path.splitext(wav_fn)[1]
            input_fn = f"{wav_processed_tmp}/{item_name}{ext}"
            link_file(wav_fn, input_fn)
            for p in processors:
                outputs = p.process(input_fn, sr_file, wav_processed_tmp, processed_dir, item_name, preprocess_args)
                # Each processor's output file becomes the next processor's input.
                if len(outputs) == 3:
                    input_fn, _, output_fn_for_align = outputs
                else:
                    input_fn, _ = outputs
            return input_fn, output_fn_for_align
        else:
            return wav_fn, wav_fn

    def _phone_encoder(self, ph_set):
        """Build (or load) ``phone_set.json`` and return a token encoder for it.

        The set is rebuilt from ``ph_set`` when ``reset_phone_dict`` is set or
        the file does not exist yet.
        """
        ph_set_fn = f"{self.processed_dir}/phone_set.json"
        if self.preprocess_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
            ph_set = sorted(set(ph_set))
            # Close the file before build_token_encoder reads it back.
            with open(ph_set_fn, 'w') as f:
                json.dump(ph_set, f, ensure_ascii=False)
            print("| Build phone set: ", ph_set)
        else:
            with open(ph_set_fn, 'r') as f:
                ph_set = json.load(f)
            print("| Load phone set: ", ph_set)
        return build_token_encoder(ph_set_fn)

    def _word_encoder(self, word_set):
        """Build (or load) ``word_set.json`` and return a token encoder for it.

        When rebuilding, only the ``hparams['word_dict_size']`` most common
        words are kept, plus ``<BOS>`` and ``<EOS>``. As in
        :meth:`_phone_encoder`, the set is also rebuilt when the file is missing.
        """
        word_set_fn = f"{self.processed_dir}/word_set.json"
        if self.preprocess_args['reset_word_dict'] or not os.path.exists(word_set_fn):
            word_set = Counter(word_set)
            total_words = sum(word_set.values())
            word_set = word_set.most_common(hparams['word_dict_size'])
            num_unk_words = total_words - sum([x[1] for x in word_set])
            word_set = ['<BOS>', '<EOS>'] + [x[0] for x in word_set]
            word_set = sorted(set(word_set))
            # Close the file before build_token_encoder reads it back.
            with open(word_set_fn, 'w') as f:
                json.dump(word_set, f, ensure_ascii=False)
            print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
                  f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
        else:
            with open(word_set_fn, 'r') as f:
                word_set = json.load(f)
            print("| Load word set. Size: ", len(word_set), word_set[:10])
        return build_token_encoder(word_set_fn)

    @classmethod
    def preprocess_second_pass(cls, word, ph, spk_name, word_encoder, ph_encoder, spk_map):
        """Encode one item's words/phones and look up its speaker id.

        :return: ``{'word_token': ..., 'ph_token': ..., 'spk_id': ...}``.
        """
        word_token = word_encoder.encode(word)
        ph_token = ph_encoder.encode(ph)
        spk_id = spk_map[spk_name]
        return {'word_token': word_token, 'ph_token': ph_token, 'spk_id': spk_id}

    def build_spk_map(self, spk_names):
        """Assign ids to the sorted speaker names, save them to ``self.spk_map_fn`` and return the map."""
        spk_map = {x: i for i, x in enumerate(sorted(list(spk_names)))}
        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
        print(f"| Number of spks: {len(spk_map)}, spk_map: {spk_map}")
        with open(self.spk_map_fn, 'w') as f:
            json.dump(spk_map, f, ensure_ascii=False)
        return spk_map

    @classmethod
    def build_mfa_inputs(cls, item, mfa_input_dir, mfa_group, wav_processed_tmp, preprocess_args):
        """Place one item's wav and ``.lab`` transcript into its MFA group directory.

        The transcript is ``ph_gb_word`` with silence phonemes (per
        ``is_sil_phoneme``) removed.

        :return: ``(ph_gb_word_nosil, new_wav_align_fn)``.
        """
        item_name = item['item_name']
        wav_align_fn = item['wav_align_fn']
        ph_gb_word = item['ph_gb_word']
        ext = os.path.splitext(wav_align_fn)[1]
        mfa_input_group_dir = f'{mfa_input_dir}/{mfa_group}'
        os.makedirs(mfa_input_group_dir, exist_ok=True)
        new_wav_align_fn = f"{mfa_input_group_dir}/{item_name}{ext}"
        move_link_func = move_file if os.path.dirname(wav_align_fn) == wav_processed_tmp else link_file
        move_link_func(wav_align_fn, new_wav_align_fn)
        ph_gb_word_nosil = " ".join(["_".join([p for p in w.split("_") if not is_sil_phoneme(p)])
                                     for w in ph_gb_word.split(" ") if not is_sil_phoneme(w)])
        with open(f'{mfa_input_group_dir}/{item_name}.lab', 'w') as f_txt:
            f_txt.write(ph_gb_word_nosil)
        return ph_gb_word_nosil, new_wav_align_fn

    def load_spk_map(self, base_dir):
        """Load and return ``{base_dir}/spk_map.json``."""
        spk_map_fn = f"{base_dir}/spk_map.json"
        with open(spk_map_fn, 'r') as f:
            spk_map = json.load(f)
        return spk_map

    def load_dict(self, base_dir):
        """Return ``(ph_encoder, word_encoder)`` built from the dictionaries in ``base_dir``."""
        ph_encoder = build_token_encoder(f'{base_dir}/phone_set.json')
        word_encoder = build_token_encoder(f'{base_dir}/word_set.json')
        return ph_encoder, word_encoder

    @property
    def meta_csv_filename(self):
        """Base name (without ``.json``) of the metadata file written by :meth:`process`."""
        return 'metadata'

    @property
    def wav_processed_dirname(self):
        """Name of the sub-directory that receives the processed wavs."""
        return 'wav_processed'
|
data_gen/tts/binarizer_zh.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
from data_gen.tts.base_binarizer import BaseBinarizer
|
3 |
+
|
4 |
+
|
5 |
+
class ZhBinarizer(BaseBinarizer):
    """Binarizer that, after alignment, also derives a segment-averaged f0 track ``item['f0_ph']``."""

    @staticmethod
    def process_align(tg_fn, item):
        """Run ``BaseBinarizer.process_align`` and, if ``item`` has ``'f0'``, fill ``item['f0_ph']``.

        ``f0_ph`` has one entry per f0 frame, initialised to 0. Frames are
        grouped into segments; when a segment boundary is reached, the frames
        of the finished segment are set to the mean of its voiced (``> 0``) f0
        values.
        """
        BaseBinarizer.process_align(tg_fn, item)
        # char-level pitch
        # NOTE(review): this only runs if item['f0'] already exists when
        # process_align is called -- confirm the caller extracts pitch first.
        if 'f0' in item:
            ph_list = item['ph'].split(" ")
            item['f0_ph'] = np.array([0 for _ in item['f0']], dtype=float)
            char_start_idx = 0  # first frame of the segment being accumulated
            f0s_char = []  # voiced f0 values collected for the current segment
            for idx, (f0_, ph_idx) in enumerate(zip(item['f0'], item['mel2ph'])):
                # A phone counts as "pinyin" when its first character is alphabetic.
                is_pinyin = ph_list[ph_idx - 1][0].isalpha()
                # Boundary: non-alphabetic phone, or the phone index jumps by more
                # than one relative to the previous frame.
                # NOTE(review): at idx == 0, `idx - 1` is -1 and reads the LAST frame.
                if not is_pinyin or ph_idx - item['mel2ph'][idx - 1] > 1:
                    if len(f0s_char) > 0:
                        item['f0_ph'][char_start_idx:idx] = sum(f0s_char) / len(f0s_char)
                    f0s_char = []
                    char_start_idx = idx
                    if not is_pinyin:
                        # Skip the non-alphabetic frame itself.
                        char_start_idx += 1
                if f0_ > 0:
                    f0s_char.append(f0_)
            # NOTE(review): values collected after the last boundary are never
            # written to f0_ph, so the final segment stays 0 -- confirm intended.
|
data_gen/tts/runs/adapt_mfa_align.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import utils.commons.single_thread_env # NOQA
|
2 |
+
import os
|
3 |
+
import subprocess
|
4 |
+
from utils.commons.hparams import hparams, set_hparams
|
5 |
+
|
6 |
+
|
7 |
+
def adapt_mfa_align():
    """Run ``scripts/run_mfa_adapt.sh`` for the corpus named by ``hparams['processed_data_dir']``.

    The corpus name is the last path component of the processed data
    directory. It is passed to the script, together with the job count
    (``N_PROC`` env var, else the CPU count), through the ``CORPUS`` and
    ``NUM_JOB`` environment variables.

    :raises subprocess.CalledProcessError: if the script exits with a non-zero status.
    """
    # Strip trailing slashes so "data/processed/x/" yields "x" rather than "".
    corpus = hparams['processed_data_dir'].rstrip("/").split("/")[-1]
    print(f"| Run MFA for {corpus}.")
    # os.cpu_count() may return None; fall back to a single job.
    num_job = int(os.getenv('N_PROC', os.cpu_count() or 1))
    # Pass values via the environment and an argument list instead of an
    # interpolated shell string, so unusual corpus names cannot break the command.
    env = dict(os.environ, CORPUS=corpus, NUM_JOB=str(num_job))
    subprocess.check_call(['bash', 'scripts/run_mfa_adapt.sh'], env=env)
|
14 |
+
|
15 |
+
|
16 |
+
if __name__ == '__main__':
    # Script entry point. set_hparams() runs first (with printing disabled);
    # presumably it populates the `hparams` read by adapt_mfa_align().
    set_hparams(print_hparams=False)
    adapt_mfa_align()
|
data_gen/tts/runs/align_and_binarize.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import utils.commons.single_thread_env # NOQA
|
2 |
+
from utils.commons.hparams import set_hparams, hparams
|
3 |
+
from data_gen.tts.runs.binarize import binarize
|
4 |
+
from data_gen.tts.runs.preprocess import preprocess
|
5 |
+
from data_gen.tts.runs.train_mfa_align import train_mfa_align
|
6 |
+
|
7 |
+
if __name__ == '__main__':
    # Script entry point: run preprocess(), then train_mfa_align() only when
    # hparams['preprocess_args']['use_mfa'] is set, and finally binarize().
    set_hparams()
    preprocess()
    if hparams['preprocess_args']['use_mfa']:
        train_mfa_align()
    binarize()
|
data_gen/tts/runs/binarize.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import utils.commons.single_thread_env # NOQA
|
2 |
+
from utils.commons.hparams import hparams, set_hparams
|
3 |
+
import importlib
|
4 |
+
|
5 |
+
|
6 |
+
def binarize():
    """Instantiate the configured binarizer class and run its ``process()``.

    The class is given as a dotted path in ``hparams['binarizer_cls']``
    (default ``data_gen.tts.base_binarizer.BaseBinarizer``): everything before
    the last dot is imported as a module, the remainder is the class name.
    """
    dotted_path = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
    module_path, _, class_name = dotted_path.rpartition(".")
    module = importlib.import_module(module_path)
    binarizer_cls = getattr(module, class_name)
    print("| Binarizer: ", binarizer_cls)
    binarizer = binarizer_cls()
    binarizer.process()


if __name__ == '__main__':
    set_hparams()
    binarize()
|
data_gen/tts/runs/preprocess.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import utils.commons.single_thread_env # NOQA
|
2 |
+
from utils.commons.hparams import hparams, set_hparams
|
3 |
+
import importlib
|
4 |
+
|
5 |
+
|
6 |
+
def preprocess():
    """Instantiate the configured preprocessor class and run its ``process()``.

    ``hparams['preprocess_cls']`` must be a non-empty dotted path; the part
    before the last dot is imported as a module and the part after it is
    looked up on that module as the class.
    """
    dotted_path = hparams['preprocess_cls']
    assert dotted_path != ''

    module_path, _, class_name = dotted_path.rpartition(".")
    module = importlib.import_module(module_path)
    preprocessor = getattr(module, class_name)()
    preprocessor.process()


if __name__ == '__main__':
    set_hparams()
    preprocess()
|
data_gen/tts/runs/train_mfa_align.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import utils.commons.single_thread_env # NOQA
|
2 |
+
import glob
|
3 |
+
import subprocess
|
4 |
+
from textgrid import TextGrid
|
5 |
+
import os
|
6 |
+
from utils.commons.hparams import hparams, set_hparams
|
7 |
+
|
8 |
+
|
9 |
+
def train_mfa_align(mfa_outputs="mfa_outputs",
                    mfa_inputs="mfa_inputs",
                    model_name=None, pretrain_model_name=None,
                    mfa_cmd='train'):
    """Run the MFA train/align script, then shift the produced TextGrids.

    ``mfa_usr/run_mfa_train_align.sh`` is run through ``bash`` with its
    settings exported as environment variables (``CORPUS``, ``NUM_JOB`` and,
    for every argument that is not ``None``, ``MFA_OUTPUTS``, ``MFA_INPUTS``,
    ``MODEL_NAME``, ``PRETRAIN_MODEL_NAME``, ``MFA_CMD``).

    Afterwards, if ``hparams['preprocess_args']['mfa_offset']`` is positive,
    every interval boundary in
    ``<processed_data_dir>/<mfa_outputs>/*.TextGrid`` is moved later by that
    offset (clamped to the grid's ``maxTime``), the first interval of each
    tier is re-anchored at 0, and the file is rewritten in place.

    Raises:
        subprocess.CalledProcessError: if the script exits with a non-zero
            status.
    """
    CORPUS = hparams['processed_data_dir'].split("/")[-1]
    # os.cpu_count() may return None; fall back to a single job in that case.
    NUM_JOB = int(os.getenv('N_PROC', os.cpu_count() or 1))

    env_vars = {'CORPUS': CORPUS, 'NUM_JOB': str(NUM_JOB)}
    optional_vars = {
        'MFA_OUTPUTS': mfa_outputs,
        'MFA_INPUTS': mfa_inputs,
        'MODEL_NAME': model_name,
        'PRETRAIN_MODEL_NAME': pretrain_model_name,
        'MFA_CMD': mfa_cmd,
    }
    env_vars.update({key: str(value) for key, value in optional_vars.items()
                     if value is not None})

    env_str = ' '.join(f'{key}={value}' for key, value in env_vars.items())
    print(f"| Run MFA for {CORPUS}. Env vars: {env_str}")
    # Hand the settings to the script through its environment rather than a
    # shell-interpolated command string, so values with spaces or shell
    # metacharacters are passed through verbatim.
    subprocess.check_call(['bash', 'mfa_usr/run_mfa_train_align.sh'],
                          env={**os.environ, **env_vars})

    mfa_offset = hparams['preprocess_args']['mfa_offset']
    if mfa_offset > 0:
        for tg_fn in glob.glob(f'{hparams["processed_data_dir"]}/{mfa_outputs}/*.TextGrid'):
            tg = TextGrid.fromFile(tg_fn)
            max_time = tg.maxTime
            for tier in tg.tiers:
                for interval in tier.intervals:
                    interval.maxTime = min(interval.maxTime + mfa_offset, max_time)
                    interval.minTime = min(interval.minTime + mfa_offset, max_time)
                # Keep the tier starting at time 0 after the shift; an empty
                # tier has no first interval to re-anchor.
                if tier.intervals:
                    tier.intervals[0].minTime = 0
                tier.maxTime = min(tier.maxTime + mfa_offset, max_time)
            tg.write(tg_fn)
            # Re-read the file just written so a malformed result fails here.
            TextGrid.fromFile(tg_fn)


if __name__ == '__main__':
    set_hparams(print_hparams=False)
    train_mfa_align()
|
data_gen/tts/txt_processors/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from . import en
|
data_gen/tts/txt_processors/base_text_processor.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from utils.text.text_encoder import is_sil_phoneme
|
2 |
+
|
3 |
+
# Maps a processor name (e.g. a language code) to its registered class.
REGISTERED_TEXT_PROCESSORS = {}


def register_txt_processors(name):
    """Return a class decorator that registers the class under ``name``.

    The decorated class is stored in ``REGISTERED_TEXT_PROCESSORS`` and
    returned unchanged; a later registration with the same name replaces the
    earlier one.
    """

    def _register(processor_cls):
        REGISTERED_TEXT_PROCESSORS[name] = processor_cls
        return processor_cls

    return _register


def get_txt_processor_cls(name):
    """Return the class registered under ``name``, or ``None`` if there is none."""
    return REGISTERED_TEXT_PROCESSORS.get(name)
|
16 |
+
|
17 |
+
|
18 |
+
class BaseTxtProcessor:
    """Base class for text processors.

    The helpers operate on a ``txt_struct``: a list of two-element entries
    whose first element is tested with ``is_sil_phoneme``.
    """

    @staticmethod
    def sp_phonemes():
        """Return the list of special phoneme symbols (just ``'|'``)."""
        special = ['|']
        return special

    @classmethod
    def process(cls, txt, preprocess_args):
        """Process ``txt``; subclasses must override this."""
        raise NotImplementedError

    @classmethod
    def postprocess(cls, txt_struct, preprocess_args):
        """Trim edge silences, then optionally add separators and BOS/EOS.

        ``preprocess_args['with_phsep']`` enables :meth:`add_bdr`;
        ``preprocess_args['add_eos_bos']`` wraps the result in ``<BOS>`` /
        ``<EOS>`` entries.
        """
        # Drop silence entries from the head, then from the tail.
        while txt_struct and is_sil_phoneme(txt_struct[0][0]):
            txt_struct = txt_struct[1:]
        while txt_struct and is_sil_phoneme(txt_struct[-1][0]):
            txt_struct = txt_struct[:-1]

        if preprocess_args['with_phsep']:
            txt_struct = cls.add_bdr(txt_struct)

        if not preprocess_args['add_eos_bos']:
            return txt_struct
        bos_entry = ["<BOS>", ["<BOS>"]]
        eos_entry = ["<EOS>", ["<EOS>"]]
        return [bos_entry] + txt_struct + [eos_entry]

    @classmethod
    def add_bdr(cls, txt_struct):
        """Insert a ``['|', ['|']]`` entry between adjacent non-silence entries."""
        with_bdr = []
        last_idx = len(txt_struct) - 1
        for pos, entry in enumerate(txt_struct):
            with_bdr.append(entry)
            if pos == last_idx:
                continue
            # No separator next to a silence entry on either side.
            if is_sil_phoneme(entry[0]) or is_sil_phoneme(txt_struct[pos + 1][0]):
                continue
            with_bdr.append(['|', ['|']])
        return with_bdr
|
data_gen/tts/txt_processors/en.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import unicodedata
|
3 |
+
|
4 |
+
from g2p_en import G2p
|
5 |
+
from g2p_en.expand import normalize_numbers
|
6 |
+
from nltk import pos_tag
|
7 |
+
from nltk.tokenize import TweetTokenizer
|
8 |
+
|
9 |
+
from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors
|
10 |
+
from utils.text.text_encoder import PUNCS, is_sil_phoneme
|
11 |
+
|
12 |
+
|
13 |
+
class EnG2p(G2p):
    """``G2p`` variant that tokenizes with ``TweetTokenizer``."""
    word_tokenize = TweetTokenizer().tokenize

    def __call__(self, text):
        """Return the pronunciation of ``text`` as one flat list.

        Each word's pronunciation is followed by a ``" "`` separator, except
        the last one.
        """
        # preprocessing: (word, POS tag) pairs
        tagged_words = pos_tag(EnG2p.word_tokenize(text))

        prons = []
        for word, pos in tagged_words:
            if re.search("[a-z]", word) is None:
                # No lowercase letter at all: keep the token itself.
                pron = [word]
            elif word in self.homograph2features:
                # Homograph: pick the pronunciation by POS tag.
                pron1, pron2, pos1 = self.homograph2features[word]
                pron = pron1 if pos.startswith(pos1) else pron2
            elif word in self.cmu:
                # Lookup in the CMU dict.
                pron = self.cmu[word][0]
            else:
                # Predict for out-of-vocabulary words.
                pron = self.predict(word)

            prons.extend(pron)
            prons.append(" ")

        # Drop the separator appended after the final word.
        return prons[:-1]
|
42 |
+
|
43 |
+
|
44 |
+
@register_txt_processors('en')
class TxtProcessor(BaseTxtProcessor):
    """Text processor registered under the name ``'en'``."""
    g2p = EnG2p()

    @staticmethod
    def preprocess_text(text):
        """Normalize ``text`` to lowercase letters, spaces and ``PUNCS`` marks.

        Every punctuation mark in the result is surrounded by single spaces
        and whitespace runs are collapsed to one space.
        """
        text = normalize_numbers(text)
        text = ''.join(char for char in unicodedata.normalize('NFD', text)
                       if unicodedata.category(char) != 'Mn')  # Strip accents
        text = text.lower()
        text = re.sub("[\'\"()]+", "", text)
        text = re.sub("[-]+", " ", text)
        text = re.sub(f"[^ a-z{PUNCS}]", "", text)
        text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text)  # " ! " -> "!"
        # Collapse a run of punctuation to its last mark, e.g. "!!" -> "!".
        text = re.sub(f"([{PUNCS}])+", r"\1", text)
        text = text.replace("i.e.", "that is")
        text = text.replace("etc.", "etc")
        text = re.sub(f"([{PUNCS}])", r" \1 ", text)
        text = re.sub(r"\s+", r" ", text)
        return text

    @classmethod
    def process(cls, txt, preprocess_args):
        """Convert ``txt`` to a post-processed ``txt_struct``.

        Returns:
            A ``(txt_struct, txt)`` tuple: the ``[word, [phoneme, ...]]``
            entries after ``postprocess`` and the normalized text.
        """
        txt = cls.preprocess_text(txt).strip()
        phs = cls.g2p(txt)
        txt_struct = [[w, []] for w in txt.split(" ")]
        # NOTE(review): assumes g2p emits exactly one " " separator per space
        # in txt; more separators would index past txt_struct — confirm.
        i_word = 0
        for p in phs:
            if p == ' ':
                i_word += 1
            else:
                txt_struct[i_word][1].append(p)
        txt_struct = cls.postprocess(txt_struct, preprocess_args)
        return txt_struct, txt
|
data_gen/tts/wav_processors/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from . import base_processor
|
2 |
+
from . import common_processors
|
data_gen/tts/wav_processors/base_processor.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Maps a processor name to its registered wav-processor class.
REGISTERED_WAV_PROCESSORS = {}


def register_wav_processors(name):
    """Return a class decorator that registers the class under ``name``.

    The decorated class is stored in ``REGISTERED_WAV_PROCESSORS`` and
    returned unchanged.
    """

    def _register(processor_cls):
        REGISTERED_WAV_PROCESSORS[name] = processor_cls
        return processor_cls

    return _register


def get_wav_processor_cls(name):
    """Return the class registered under ``name``, or ``None`` if there is none."""
    return REGISTERED_WAV_PROCESSORS.get(name)
|
14 |
+
|
15 |
+
|
16 |
+
class BaseWavProcessor:
    """Base class for wav processors; subclasses supply ``name`` and ``process``."""

    @property
    def name(self):
        """Short tag that ``output_fn`` appends to output file names."""
        raise NotImplementedError

    def output_fn(self, input_fn):
        """Return ``<input path without extension>_<name>.wav``.

        The extension is removed with ``os.path.splitext`` so inputs whose
        extension is not exactly three characters (e.g. ``.flac``) are handled
        as well; for ``.wav``/``.mp3``-style names the result is unchanged
        from slicing off the last four characters.
        """
        import os  # local import: this module has no top-level imports

        stem, _ext = os.path.splitext(input_fn)
        return f'{stem}_{self.name}.wav'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Process ``input_fn``; subclasses must override this."""
        raise NotImplementedError
|
data_gen/tts/wav_processors/common_processors.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import subprocess
|
3 |
+
import librosa
|
4 |
+
import numpy as np
|
5 |
+
from data_gen.tts.wav_processors.base_processor import BaseWavProcessor, register_wav_processors
|
6 |
+
from utils.audio import trim_long_silences
|
7 |
+
from utils.audio.io import save_wav
|
8 |
+
from utils.audio.rnnoise import rnnoise
|
9 |
+
from utils.commons.hparams import hparams
|
10 |
+
|
11 |
+
|
12 |
+
@register_wav_processors(name='sox_to_wav')
class ConvertToWavProcessor(BaseWavProcessor):
    """Convert a non-``.wav`` input to wav with ``sox``."""

    @property
    def name(self):
        return 'ToWav'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Return ``(wav_path, sr)``; ``.wav`` inputs are returned untouched.

        Raises:
            subprocess.CalledProcessError: if ``sox`` exits with a non-zero
                status.
        """
        if input_fn.endswith('.wav'):
            return input_fn, sr
        output_fn = self.output_fn(input_fn)
        # Argument list instead of a shell string: file names containing
        # quotes, spaces or "$" are passed to sox verbatim.
        subprocess.check_call(['sox', '-v', '0.95', input_fn, '-t', 'wav', output_fn])
        return output_fn, sr
|
25 |
+
|
26 |
+
|
27 |
+
@register_wav_processors(name='sox_resample')
class ResampleProcessor(BaseWavProcessor):
    """Resample the input to ``sr`` when its sample rate differs."""

    @property
    def name(self):
        return 'Resample'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Return ``(path, sr)``; the input path is returned if already at ``sr``.

        Raises:
            subprocess.CalledProcessError: if ``sox`` exits with a non-zero
                status.
        """
        output_fn = self.output_fn(input_fn)
        sr_file = librosa.core.get_samplerate(input_fn)
        if sr == sr_file:
            return input_fn, sr

        # Argument list instead of a shell string: file names containing
        # quotes, spaces or "$" are passed to sox verbatim.
        subprocess.check_call(['sox', '-v', '0.95', input_fn, f'-r{sr}', output_fn])
        # NOTE(review): the file sox just wrote is overwritten below, because
        # the audio is reloaded from input_fn (not output_fn). The saved
        # result is therefore librosa's resample + trim, and the sox pass is
        # effectively discarded — confirm which of the two is intended.
        y, _ = librosa.core.load(input_fn, sr=sr)
        y, _ = librosa.effects.trim(y)
        save_wav(y, output_fn, sr)
        return output_fn, sr
|
44 |
+
|
45 |
+
|
46 |
+
@register_wav_processors(name='trim_sil')
class TrimSILProcessor(BaseWavProcessor):
    """Trim the audio with ``librosa.effects.trim``."""

    @property
    def name(self):
        return 'TrimSIL'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Write the trimmed audio and return ``(output_fn, sr)``."""
        output_fn = self.output_fn(input_fn)
        y, _ = librosa.core.load(input_fn, sr=sr)
        y, _ = librosa.effects.trim(y)
        save_wav(y, output_fn, sr)
        # Return the same (path, sample rate) pair as every other processor
        # in this module; this method previously returned the path alone.
        return output_fn, sr
|
58 |
+
|
59 |
+
|
60 |
+
@register_wav_processors(name='trim_all_sil')
class TrimAllSILProcessor(BaseWavProcessor):
    """Trim the audio with ``trim_long_silences`` and optionally save its mask."""

    @property
    def name(self):
        # NOTE(review): same tag as TrimSILProcessor, so both write to the
        # same output file name for a given input — confirm this is intended.
        return 'TrimSIL'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Write the trimmed audio and return ``(output_fn, sr)``.

        When ``preprocess_args['save_sil_mask']`` is truthy, the mask returned
        by ``trim_long_silences`` is saved to
        ``<processed_dir>/sil_mask/<item_name>.npy``.
        """
        trimmed_fn = self.output_fn(input_fn)
        max_silence = preprocess_args.get('vad_max_silence_length', 12)
        y, audio_mask, _ = trim_long_silences(input_fn, vad_max_silence_length=max_silence)
        save_wav(y, trimmed_fn, sr)
        if preprocess_args['save_sil_mask']:
            mask_dir = f'{processed_dir}/sil_mask'
            os.makedirs(mask_dir, exist_ok=True)
            np.save(f'{mask_dir}/{item_name}.npy', audio_mask)
        return trimmed_fn, sr
|
75 |
+
|
76 |
+
|
77 |
+
@register_wav_processors(name='denoise')
class DenoiseProcessor(BaseWavProcessor):
    """Run ``rnnoise`` on the input file."""

    @property
    def name(self):
        return 'Denoise'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Write the ``rnnoise`` output and return ``(output_fn, sr)``."""
        denoised_fn = self.output_fn(input_fn)
        rnnoise(input_fn, denoised_fn, out_sample_rate=sr)
        return denoised_fn, sr
|
docs/fastspeech2.md
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Run FastSpeech 2
|
2 |
+
|
3 |
+
## Quick Start
|
4 |
+
|
5 |
+
### Install Dependencies
|
6 |
+
|
7 |
+
Install dependencies following [readme.md](../readme.md)
|
8 |
+
|
9 |
+
### Set Config Path and Experiment Name
|
10 |
+
|
11 |
+
```bash
|
12 |
+
export CONFIG_NAME=egs/datasets/audio/lj/fs2_orig.yaml
|
13 |
+
export MY_EXP_NAME=fs2_exp
|
14 |
+
```
|
15 |
+
|
16 |
+
### Preprocess and Binarize Dataset
|
17 |
+
|
18 |
+
Prepare dataset following [prepare_data.md](./prepare_data.md)
|
19 |
+
|
20 |
+
### Prepare Vocoder
|
21 |
+
|
22 |
+
Prepare vocoder following [prepare_vocoder.md](./prepare_vocoder.md)
|
23 |
+
|
24 |
+
## Training
|
25 |
+
|
26 |
+
```bash
|
27 |
+
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --reset
|
28 |
+
```
|
29 |
+
|
30 |
+
You can check the training and validation curves by opening TensorBoard:
|
31 |
+
|
32 |
+
```bash
|
33 |
+
tensorboard --logdir checkpoints/$MY_EXP_NAME
|
34 |
+
```
|
35 |
+
|
36 |
+
## Inference (Testing)
|
37 |
+
|
38 |
+
```bash
|
39 |
+
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --infer
|
40 |
+
```
|
41 |
+
|
42 |
+
## Citation
|
43 |
+
|
44 |
+
If you find this useful for your research, please cite the following.
|
45 |
+
|
46 |
+
```
|
47 |
+
@inproceedings{ren2020fastspeech,
|
48 |
+
title={FastSpeech 2: Fast and High-Quality End-to-End Text to Speech},
|
49 |
+
author={Ren, Yi and Hu, Chenxu and Tan, Xu and Qin, Tao and Zhao, Sheng and Zhao, Zhou and Liu, Tie-Yan},
|
50 |
+
booktitle={International Conference on Learning Representations},
|
51 |
+
year={2020}
|
52 |
+
}
|
53 |
+
```
|
docs/framework.md
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Framework of NATSpeech
|
2 |
+
|
3 |
+
NATSpeech is a simple framework for Non-Autoregressive Text-to-Speech.
|
4 |
+
|
5 |
+
## Directory Structure
|
6 |
+
|
7 |
+
- `egs`: configuration files, which will be loaded by `utils/commons/hparams.py`
|
8 |
+
- `data_gen`: data binarization codes
|
9 |
+
- `modules`: modules and models
|
10 |
+
- `tasks`: the training and inference logics
|
11 |
+
- `utils`: commonly used utils
|
12 |
+
- `data`: data
|
13 |
+
- `raw`: raw data
|
14 |
+
- `processed`: data after preprocess
|
15 |
+
- `binary`: binary data
|
16 |
+
- `checkpoints`: model checkpoints, tensorboard logs and generated results for all experiments.
|
17 |
+
|
18 |
+
## How to Add New Tasks and Run?
|
19 |
+
|
20 |
+
We show the basic steps of adding a new task/model and running the code (LJSpeech dataset as an example).
|
21 |
+
|
22 |
+
### Add the model
|
23 |
+
|
24 |
+
Add your model to `modules`.
|
25 |
+
|
26 |
+
### Add the task
|
27 |
+
|
28 |
+
Task classes are used to manage the training and inference procedures.
|
29 |
+
|
30 |
+
A new task (e.g., `tasks.tts.fs.FastSpeechTask`) should inherit the base task (`tasks.tts.speech_base.TTSBaseTask`)
|
31 |
+
class.
|
32 |
+
|
33 |
+
You must implement these methods:
|
34 |
+
|
35 |
+
- `build_tts_model`, which builds the model for your task.
- `run_model`, indicating how to use the model in training
|
36 |
+
and inference.
|
37 |
+
|
38 |
+
You can override `test_step` and `save_valid_result` to change the validation/testing logics or add more plots to
|
39 |
+
tensorboard.
|
40 |
+
|
41 |
+
### Add a new config file
|
42 |
+
|
43 |
+
Add a new config file in `egs/datasets/audio/lj/YOUR_TASK.yaml`. For example:
|
44 |
+
|
45 |
+
```yaml
|
46 |
+
base_config: ./base_text2mel.yaml
|
47 |
+
task_cls: tasks.tts.fs.FastSpeechTask
|
48 |
+
|
49 |
+
# model configs
|
50 |
+
hidden_size: 256
|
51 |
+
dropout: 0.1
|
52 |
+
|
53 |
+
# some more configs .....
|
54 |
+
```
|
55 |
+
|
56 |
+
If you use a new dataset `YOUR_DATASET`, you should also add a `YOUR_DATASET_Processor`
|
57 |
+
in `egs/datasets/audio/YOUR_DATASET/preprocess.py`, inheriting `data_gen.tts.base_preprocess.BasePreprocessor`, which
|
58 |
+
loads some meta information of the dataset.
|
59 |
+
|
60 |
+
### Preprocess and Binarize Dataset
|
61 |
+
|
62 |
+
```bash
|
63 |
+
python data_gen/tts/runs/align_and_binarize.py --config egs/datasets/audio/lj/base_text2mel.yaml
|
64 |
+
```
|
65 |
+
|
66 |
+
### Training
|
67 |
+
|
68 |
+
```bash
|
69 |
+
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config YOUR_CONFIG --exp_name YOUR_EXP_NAME --reset
|
70 |
+
```
|
71 |
+
|
72 |
+
You can open Tensorboard via:
|
73 |
+
|
74 |
+
```bash
|
75 |
+
tensorboard --logdir checkpoints/EXP_NAME
|
76 |
+
```
|
77 |
+
|
78 |
+
### Inference (Testing)
|
79 |
+
|
80 |
+
```bash
|
81 |
+
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config egs/datasets/audio/lj/YOUR_TASK.yaml --exp_name YOUR_EXP_NAME --reset --infer
|
82 |
+
```
|
83 |
+
|
84 |
+
## Design Philosophy
|
85 |
+
|
86 |
+
### Random-Access Binarized Dataset
|
87 |
+
|
88 |
+
To address the IO problem when reading small files, we design an `IndexedDataset` class (_utils/commons/indexed_datasets.py_).
|
89 |
+
|
90 |
+
### Global Config
|
91 |
+
|
92 |
+
We introduce a global config `hparams`, which is loaded from a `.yaml` config file and can be used anywhere. However,
|
93 |
+
we do not recommend using it in some general-purpose modules.
|
94 |
+
|
95 |
+
### BaseTrainer Framework
|
96 |
+
|
97 |
+
Our [base trainer](utils/commons/trainer.py) and [base task](utils/commons/base_task.py) classes refer
|
98 |
+
to [PytorchLightning](https://github.com/PyTorchLightning/pytorch-lightning), which builds some commonly used
|
99 |
+
training/inference code structure. Our framework supports multi-process GPU training without changing the subclass
|
100 |
+
codes.
|
101 |
+
|
102 |
+
### Checkpoint Saving
|
103 |
+
|
104 |
+
All checkpoints and tensorboard logs are saved in `checkpoints/EXP_NAME`, where `EXP_NAME` is set in the running
|
105 |
+
command: `python tasks/run.py .... --exp_name EXP_NAME`. You can use `tensorboard --logdir checkpoints/EXP_NAME` to open
|
106 |
+
the tensorboard and check the training loss curves etc.
|
docs/portaspeech.md
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Run PortaSpeech
|
2 |
+
|
3 |
+
## Quick Start
|
4 |
+
|
5 |
+
### Install Dependencies
|
6 |
+
|
7 |
+
Install dependencies following [readme.md](../readme.md)
|
8 |
+
|
9 |
+
### Set Config Path and Experiment Name
|
10 |
+
|
11 |
+
#### PortaSpeech (normal)
|
12 |
+
```bash
|
13 |
+
export CONFIG_NAME=egs/datasets/audio/lj/ps_flow_nips2021.yaml
|
14 |
+
export MY_EXP_NAME=ps_normal_exp
|
15 |
+
```
|
16 |
+
|
17 |
+
#### PortaSpeech (small)
|
18 |
+
```bash
|
19 |
+
export CONFIG_NAME=egs/datasets/audio/lj/ps_flow_small_nips2021.yaml
|
20 |
+
export MY_EXP_NAME=ps_small_exp
|
21 |
+
```
|
22 |
+
|
23 |
+
### Preprocess and Binarize Dataset
|
24 |
+
|
25 |
+
Prepare dataset following [prepare_data.md](./prepare_data.md)
|
26 |
+
|
27 |
+
### Prepare Vocoder
|
28 |
+
|
29 |
+
Prepare vocoder following [prepare_vocoder.md](./prepare_vocoder.md)
|
30 |
+
|
31 |
+
## Training
|
32 |
+
|
33 |
+
```bash
|
34 |
+
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --reset
|
35 |
+
```
|
36 |
+
|
37 |
+
You can check the training and validation curves by opening TensorBoard:
|
38 |
+
|
39 |
+
```bash
|
40 |
+
tensorboard --logdir checkpoints/$MY_EXP_NAME
|
41 |
+
```
|
42 |
+
|
43 |
+
## Inference (Testing)
|
44 |
+
|
45 |
+
```bash
|
46 |
+
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --infer
|
47 |
+
```
|
48 |
+
|
49 |
+
## Citation
|
50 |
+
|
51 |
+
If you find this useful for your research, please cite the following.
|
52 |
+
|
53 |
+
```
|
54 |
+
@article{ren2021portaspeech,
|
55 |
+
title={PortaSpeech: Portable and High-Quality Generative Text-to-Speech},
|
56 |
+
author={Ren, Yi and Liu, Jinglin and Zhao, Zhou},
|
57 |
+
journal={Advances in Neural Information Processing Systems},
|
58 |
+
volume={34},
|
59 |
+
year={2021}
|
60 |
+
}
|
61 |
+
```
|
docs/prepare_data.md
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Prepare Dataset
|
2 |
+
|
3 |
+
## LJSpeech
|
4 |
+
|
5 |
+
### Download Dataset
|
6 |
+
```bash
|
7 |
+
mkdir -p data/raw/ljspeech
|
8 |
+
cd data/raw
|
9 |
+
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
|
10 |
+
bzip2 -d LJSpeech-1.1.tar.bz2
|
11 |
+
tar -xvf LJSpeech-1.1.tar
|
12 |
+
cd ../../
|
13 |
+
```
|
14 |
+
|
15 |
+
### Forced Align and Preprocess Dataset
|
16 |
+
```bash
|
17 |
+
# Preprocess step: text and unify the file structure.
|
18 |
+
python data_gen/tts/runs/preprocess.py --config $CONFIG_NAME
|
19 |
+
# Align step: MFA alignment.
|
20 |
+
python data_gen/tts/runs/train_mfa_align.py --config $CONFIG_NAME
|
21 |
+
# Binarization step: binarize data for fast IO. If you have already `preprocess`ed and `align`ed the dataset, this is the only line you need to rerun when switching to a different task.
|
22 |
+
python data_gen/tts/runs/binarize.py --config $CONFIG_NAME
|
23 |
+
```
|
24 |
+
|
25 |
+
## More datasets will be supported soon...
|
docs/prepare_vocoder.md
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Prepare Vocoder
|
2 |
+
|
3 |
+
We use [HiFi-GAN](https://github.com/jik876/hifi-gan) as the default vocoder.
|
4 |
+
|
5 |
+
## LJSpeech
|
6 |
+
|
7 |
+
### Use Pretrained Model
|
8 |
+
|
9 |
+
```bash
|
10 |
+
wget https://github.com/xx/xx/releases/download/pretrain-model/hifi_lj.zip
|
11 |
+
unzip hifi_lj.zip
|
12 |
+
mv hifi_lj checkpoints/hifi_lj
|
13 |
+
```
|
14 |
+
|
15 |
+
### Train Your Vocoder
|
16 |
+
|
17 |
+
#### Set Config Path and Experiment Name
|
18 |
+
|
19 |
+
```bash
|
20 |
+
export CONFIG_NAME=egs/datasets/audio/lj/hifigan.yaml
|
21 |
+
export MY_EXP_NAME=my_hifigan_exp
|
22 |
+
```
|
23 |
+
|
24 |
+
#### Prepare Dataset
|
25 |
+
|
26 |
+
Prepare dataset following [prepare_data.md](./prepare_data.md).
|
27 |
+
|
28 |
+
If you have run the `prepare_data` step of the acoustic
|
29 |
+
model (e.g., FastSpeech 2 and PortaSpeech), you only need to binarize the dataset for the vocoder training:
|
30 |
+
|
31 |
+
```bash
|
32 |
+
python data_gen/tts/runs/binarize.py --config $CONFIG_NAME
|
33 |
+
```
|
34 |
+
|
35 |
+
#### Training
|
36 |
+
|
37 |
+
```bash
|
38 |
+
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --reset
|
39 |
+
```
|
40 |
+
|
41 |
+
#### Inference (Testing)
|
42 |
+
|
43 |
+
```bash
|
44 |
+
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --infer
|
45 |
+
```
|
46 |
+
|
47 |
+
#### Use the trained vocoder
|
48 |
+
Modify the `vocoder_ckpt` in config files of acoustic models (e.g., `egs/datasets/audio/lj/base_text2mel.yaml`) to $MY_EXP_NAME (e.g., `vocoder_ckpt: checkpoints/my_hifigan_exp`)
|
49 |
+
|
egs/datasets/audio/lj/base_mel2wav.yaml
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
base_config: egs/egs_bases/tts/vocoder/base.yaml
|
2 |
+
raw_data_dir: 'data/raw/LJSpeech-1.1'
|
3 |
+
processed_data_dir: 'data/processed/ljspeech'
|
4 |
+
binary_data_dir: 'data/binary/ljspeech_wav'
|
egs/datasets/audio/lj/base_text2mel.yaml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
base_config: egs/egs_bases/tts/base.yaml
|
2 |
+
raw_data_dir: 'data/raw/LJSpeech-1.1'
|
3 |
+
processed_data_dir: 'data/processed/ljspeech'
|
4 |
+
binary_data_dir: 'data/binary/ljspeech'
|
5 |
+
preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
|
6 |
+
binarization_args:
|
7 |
+
train_range: [ 871, -1 ]
|
8 |
+
test_range: [ 0, 523 ]
|
9 |
+
valid_range: [ 523, 871 ]
|
10 |
+
test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
|
11 |
+
10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
|
12 |
+
68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
|
13 |
+
316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
|
14 |
+
f0_min: 80
|
15 |
+
f0_max: 600
|
16 |
+
vocoder_ckpt: checkpoints/hifi_lj
|
egs/datasets/audio/lj/fs.yaml
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
base_config:
|
2 |
+
- egs/egs_bases/tts/fs.yaml
|
3 |
+
- ./base_text2mel.yaml
|
egs/datasets/audio/lj/fs2_orig.yaml
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
base_config:
|
2 |
+
- egs/egs_bases/tts/fs2_orig.yaml
|
3 |
+
- ./base_text2mel.yaml
|
4 |
+
binary_data_dir: 'data/binary/ljspeech_cwt'
|
egs/datasets/audio/lj/hifigan.yaml
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
base_config:
|
2 |
+
- egs/egs_bases/tts/vocoder/hifigan.yaml
|
3 |
+
- ./base_mel2wav.yaml
|
egs/datasets/audio/lj/preprocess.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from data_gen.tts.base_preprocess import BasePreprocessor
|
2 |
+
|
3 |
+
|
4 |
+
class LJPreprocess(BasePreprocessor):
    """Preprocessor that reads its items from ``<raw_data_dir>/metadata.csv``."""

    def meta_data(self):
        """Yield one ``{'item_name', 'wav_fn', 'txt'}`` dict per metadata line.

        Each line is split on ``"|"`` into exactly three fields; the first is
        the item name and the third is used as the text. Blank lines are
        skipped.

        Raises:
            ValueError: if a non-blank line does not have exactly three
                ``"|"``-separated fields.
        """
        # Context manager so the file is closed once iteration ends; explicit
        # encoding so the result does not depend on the platform default.
        with open(f'{self.raw_data_dir}/metadata.csv', encoding='utf-8') as meta_file:
            for line in meta_file:
                line = line.strip()
                if not line:
                    continue
                item_name, _, txt = line.split("|")
                wav_fn = f"{self.raw_data_dir}/wavs/{item_name}.wav"
                yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt}
|
egs/datasets/audio/lj/ps_flow.yaml
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
base_config:
|
2 |
+
- egs/egs_bases/tts/ps_flow.yaml
|
3 |
+
- ./base_text2mel.yaml
|
egs/datasets/audio/lj/ps_flow_nips2021.yaml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
base_config:
|
2 |
+
- ./ps_flow.yaml
|
3 |
+
max_sentences: 64
|
4 |
+
dur_level: word
|
5 |
+
use_word_encoder: false
|
6 |
+
enc_prenet: true
|
7 |
+
enc_pre_ln: false
|
8 |
+
fvae_encoder_type: wn
|
9 |
+
fvae_decoder_type: wn
|
10 |
+
text_encoder_postnet: false
|
11 |
+
warmup_updates: 8000
|
egs/datasets/audio/lj/ps_flow_small.yaml
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
base_config:
|
2 |
+
- egs/egs_bases/tts/ps_flow_small.yaml
|
3 |
+
- ./base_text2mel.yaml
|
egs/datasets/audio/lj/ps_flow_small_nips2021.yaml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
base_config:
|
2 |
+
- ./ps_flow_small.yaml
|
3 |
+
max_sentences: 128
|
4 |
+
dur_level: word
|
5 |
+
use_word_encoder: false
|
6 |
+
enc_prenet: true
|
7 |
+
enc_pre_ln: false
|
8 |
+
fvae_encoder_type: wn
|
9 |
+
fvae_decoder_type: wn
|
10 |
+
text_encoder_postnet: false
|
11 |
+
warmup_updates: 8000
|
egs/egs_bases/config_base.yaml
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# task
|
2 |
+
binary_data_dir: ''
|
3 |
+
work_dir: '' # experiment directory.
|
4 |
+
infer: false # infer
|
5 |
+
amp: false
|
6 |
+
seed: 1234
|
7 |
+
debug: false
|
8 |
+
save_codes: ['tasks', 'modules', 'egs']
|
9 |
+
|
10 |
+
#############
|
11 |
+
# dataset
|
12 |
+
#############
|
13 |
+
ds_workers: 1
|
14 |
+
test_num: 100
|
15 |
+
endless_ds: true
|
16 |
+
sort_by_len: true
|
17 |
+
|
18 |
+
#########
|
19 |
+
# train and eval
|
20 |
+
#########
|
21 |
+
print_nan_grads: false
|
22 |
+
load_ckpt: ''
|
23 |
+
save_best: false
|
24 |
+
num_ckpt_keep: 3
|
25 |
+
clip_grad_norm: 0
|
26 |
+
accumulate_grad_batches: 1
|
27 |
+
tb_log_interval: 100
|
28 |
+
num_sanity_val_steps: 5 # steps of validation at the beginning
|
29 |
+
check_val_every_n_epoch: 10
|
30 |
+
val_check_interval: 2000
|
31 |
+
valid_monitor_key: 'val_loss'
|
32 |
+
valid_monitor_mode: 'min'
|
33 |
+
max_epochs: 1000
|
34 |
+
max_updates: 1000000
|
35 |
+
max_tokens: 40000
|
36 |
+
max_sentences: 100000
|
37 |
+
max_valid_tokens: -1
|
38 |
+
max_valid_sentences: -1
|
39 |
+
eval_max_batches: -1
|
40 |
+
resume_from_checkpoint: 0
|
41 |
+
rename_tmux: true
|
egs/egs_bases/tts/base.yaml
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# task
|
2 |
+
base_config:
|
3 |
+
- ../config_base.yaml
|
4 |
+
- ./dataset_params.yaml
|
5 |
+
|
6 |
+
#############
|
7 |
+
# dataset in training
|
8 |
+
#############
|
9 |
+
endless_ds: true
|
10 |
+
min_frames: 0
|
11 |
+
max_frames: 1548
|
12 |
+
frames_multiple: 1
|
13 |
+
max_input_tokens: 1550
|
14 |
+
ds_workers: 1
|
15 |
+
|
16 |
+
#########
|
17 |
+
# model
|
18 |
+
#########
|
19 |
+
use_spk_id: false
|
20 |
+
use_spk_embed: false
|
21 |
+
mel_losses: "ssim:0.5|l1:0.5"
|
22 |
+
|
23 |
+
###########
|
24 |
+
# optimization
|
25 |
+
###########
|
26 |
+
lr: 0.0005
|
27 |
+
scheduler: warmup # rsqrt|warmup|none
|
28 |
+
warmup_updates: 4000
|
29 |
+
optimizer_adam_beta1: 0.9
|
30 |
+
optimizer_adam_beta2: 0.98
|
31 |
+
weight_decay: 0
|
32 |
+
clip_grad_norm: 1
|
33 |
+
clip_grad_value: 0
|
34 |
+
|
35 |
+
|
36 |
+
###########
|
37 |
+
# train and eval
|
38 |
+
###########
|
39 |
+
use_word_input: false
|
40 |
+
max_valid_sentences: 1
|
41 |
+
max_valid_tokens: 60000
|
42 |
+
valid_infer_interval: 10000
|
43 |
+
train_set_name: 'train'
|
44 |
+
train_sets: ''
|
45 |
+
valid_set_name: 'valid'
|
46 |
+
test_set_name: 'test'
|
47 |
+
num_valid_plots: 10
|
48 |
+
test_ids: [ ]
|
49 |
+
test_input_yaml: ''
|
50 |
+
vocoder: HifiGAN
|
51 |
+
vocoder_ckpt: ''
|
52 |
+
profile_infer: false
|
53 |
+
out_wav_norm: false
|
54 |
+
save_gt: true
|
55 |
+
save_f0: false
|
56 |
+
gen_dir_name: ''
|
egs/egs_bases/tts/dataset_params.yaml
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio_num_mel_bins: 80
|
2 |
+
audio_sample_rate: 22050
|
3 |
+
hop_size: 256 # 256 samples ~= 11.6 ms at 22050Hz (for reference, 275 ~= 12.5 ms, i.e. 0.0125 * sample_rate)
|
4 |
+
win_size: 1024 # 1024 samples ~= 46.4 ms at 22050Hz (for reference, 1100 ~= 50 ms, i.e. 0.05 * sample_rate). If None, win_size = fft_size
|
5 |
+
fft_size: 1024 # If the window is shorter than this, it is zero-padded to match this size
|
6 |
+
fmin: 80 # Set this to 55 if your speaker is male; for a female speaker, 95 should help remove noise. (Tune depending on the dataset. Pitch info: male~[65, 260], female~[100, 525])
|
7 |
+
fmax: 7600 # To be increased/reduced depending on data.
|
8 |
+
f0_min: 80
|
9 |
+
f0_max: 800
|
10 |
+
griffin_lim_iters: 30
|
11 |
+
pitch_extractor: parselmouth
|
12 |
+
num_spk: 1
|
13 |
+
mel_vmin: -6
|
14 |
+
mel_vmax: 1.5
|
15 |
+
loud_norm: false
|
16 |
+
|
17 |
+
raw_data_dir: ''
|
18 |
+
processed_data_dir: ''
|
19 |
+
binary_data_dir: ''
|
20 |
+
preprocess_cls: ''
|
21 |
+
binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
|
22 |
+
preprocess_args:
|
23 |
+
nsample_per_mfa_group: 1000
|
24 |
+
# text process
|
25 |
+
txt_processor: en
|
26 |
+
use_mfa: true
|
27 |
+
with_phsep: true
|
28 |
+
reset_phone_dict: true
|
29 |
+
reset_word_dict: true
|
30 |
+
add_eos_bos: true
|
31 |
+
# mfa
|
32 |
+
mfa_group_shuffle: false
|
33 |
+
mfa_offset: 0.02
|
34 |
+
# wav processors
|
35 |
+
wav_processors: [ ]
|
36 |
+
save_sil_mask: true
|
37 |
+
vad_max_silence_length: 12
|
38 |
+
binarization_args:
|
39 |
+
shuffle: false
|
40 |
+
with_wav: false
|
41 |
+
with_align: true
|
42 |
+
with_spk_embed: false
|
43 |
+
with_f0: true
|
44 |
+
with_f0cwt: false
|
45 |
+
with_linear: false
|
46 |
+
trim_eos_bos: false
|
47 |
+
min_sil_duration: 0.1
|
48 |
+
train_range: [ 200, -1 ]
|
49 |
+
test_range: [ 0, 100 ]
|
50 |
+
valid_range: [ 100, 200 ]
|
51 |
+
word_dict_size: 10000
|
52 |
+
pitch_key: pitch
|
egs/egs_bases/tts/fs.yaml
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
base_config: ./base.yaml
|
2 |
+
task_cls: tasks.tts.fs.FastSpeechTask
|
3 |
+
|
4 |
+
# model
|
5 |
+
hidden_size: 256
|
6 |
+
dropout: 0.0
|
7 |
+
encoder_type: rel_fft # rel_fft|fft|tacotron|tacotron2|conformer
|
8 |
+
decoder_type: conv # fft|rnn|conv|conformer|wn
|
9 |
+
|
10 |
+
# rnn enc/dec
|
11 |
+
encoder_K: 8
|
12 |
+
decoder_rnn_dim: 0 # for rnn decoder, 0 -> hidden_size * 2
|
13 |
+
|
14 |
+
# fft enc/dec
|
15 |
+
enc_layers: 4
|
16 |
+
enc_ffn_kernel_size: 9
|
17 |
+
enc_prenet: true
|
18 |
+
enc_pre_ln: true
|
19 |
+
dec_layers: 4
|
20 |
+
dec_ffn_kernel_size: 9
|
21 |
+
num_heads: 2
|
22 |
+
ffn_act: gelu
|
23 |
+
ffn_hidden_size: 1024
|
24 |
+
use_pos_embed: true
|
25 |
+
|
26 |
+
# conv enc/dec
|
27 |
+
enc_dec_norm: ln
|
28 |
+
conv_use_pos: false
|
29 |
+
layers_in_block: 2
|
30 |
+
enc_dilations: [ 1, 1, 1, 1 ]
|
31 |
+
enc_kernel_size: 5
|
32 |
+
enc_post_net_kernel: 3
|
33 |
+
dec_dilations: [ 1, 1, 1, 1 ] # for conv decoder
|
34 |
+
dec_kernel_size: 5
|
35 |
+
dec_post_net_kernel: 3
|
36 |
+
|
37 |
+
# duration
|
38 |
+
predictor_hidden: -1
|
39 |
+
predictor_kernel: 5
|
40 |
+
predictor_layers: 2
|
41 |
+
dur_predictor_kernel: 3
|
42 |
+
dur_predictor_layers: 2
|
43 |
+
predictor_dropout: 0.5
|
44 |
+
|
45 |
+
# pitch and energy
|
46 |
+
use_pitch_embed: false
|
47 |
+
pitch_type: frame # frame|ph|cwt
|
48 |
+
use_uv: true
|
49 |
+
|
50 |
+
# reference encoder and speaker embedding
|
51 |
+
lambda_commit: 0.25
|
52 |
+
ref_norm_layer: bn
|
53 |
+
dec_inp_add_noise: false
|
54 |
+
|
55 |
+
# mel
|
56 |
+
mel_losses: l1:0.5|ssim:0.5 # options: l1|l2|gdl|ssim, or a weighted combination such as l1:0.5|ssim:0.5
|
57 |
+
|
58 |
+
# loss lambda
|
59 |
+
lambda_f0: 1.0
|
60 |
+
lambda_uv: 1.0
|
61 |
+
lambda_energy: 0.1
|
62 |
+
lambda_ph_dur: 0.1
|
63 |
+
lambda_sent_dur: 1.0
|
64 |
+
lambda_word_dur: 1.0
|
65 |
+
predictor_grad: 0.1
|
66 |
+
|
67 |
+
# train and eval
|
68 |
+
warmup_updates: 4000
|
69 |
+
max_tokens: 40000
|
70 |
+
max_sentences: 128
|
71 |
+
max_valid_sentences: 1
|
72 |
+
max_updates: 160000
|
73 |
+
use_gt_dur: false
|
74 |
+
use_gt_f0: false
|
75 |
+
ds_workers: 2
|