RayeRen committed
Commit d1b91e7 (0 parents)
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +1 -0
  2. .gitignore +148 -0
  3. README.md +9 -0
  4. checkpoints/fs2_exp/config.yaml +219 -0
  5. checkpoints/fs2_exp/model_ckpt_steps_98000.ckpt +3 -0
  6. checkpoints/hifi_lj/config.yaml +207 -0
  7. checkpoints/hifi_lj/model_ckpt_steps_2076000.ckpt +3 -0
  8. checkpoints/ps_normal_exp/config.yaml +258 -0
  9. checkpoints/ps_normal_exp/model_ckpt_steps_278000.ckpt +3 -0
  10. checkpoints/ps_small_exp/config.yaml +258 -0
  11. checkpoints/ps_small_exp/model_ckpt_steps_410000.ckpt +3 -0
  12. data/binary/ljspeech/phone_set.json +1 -0
  13. data/binary/ljspeech/spk_map.json +1 -0
  14. data/binary/ljspeech/word_set.json +0 -0
  15. data/binary/ljspeech_cwt/phone_set.json +1 -0
  16. data/binary/ljspeech_cwt/spk_map.json +1 -0
  17. data/binary/ljspeech_cwt/word_set.json +0 -0
  18. data_gen/tts/base_binarizer.py +225 -0
  19. data_gen/tts/base_preprocess.py +251 -0
  20. data_gen/tts/binarizer_zh.py +25 -0
  21. data_gen/tts/runs/adapt_mfa_align.py +18 -0
  22. data_gen/tts/runs/align_and_binarize.py +12 -0
  23. data_gen/tts/runs/binarize.py +17 -0
  24. data_gen/tts/runs/preprocess.py +17 -0
  25. data_gen/tts/runs/train_mfa_align.py +46 -0
  26. data_gen/tts/txt_processors/__init__.py +1 -0
  27. data_gen/tts/txt_processors/base_text_processor.py +48 -0
  28. data_gen/tts/txt_processors/en.py +78 -0
  29. data_gen/tts/wav_processors/__init__.py +2 -0
  30. data_gen/tts/wav_processors/base_processor.py +25 -0
  31. data_gen/tts/wav_processors/common_processors.py +86 -0
  32. docs/fastspeech2.md +53 -0
  33. docs/framework.md +106 -0
  34. docs/portaspeech.md +61 -0
  35. docs/prepare_data.md +25 -0
  36. docs/prepare_vocoder.md +49 -0
  37. egs/datasets/audio/lj/base_mel2wav.yaml +4 -0
  38. egs/datasets/audio/lj/base_text2mel.yaml +16 -0
  39. egs/datasets/audio/lj/fs.yaml +3 -0
  40. egs/datasets/audio/lj/fs2_orig.yaml +4 -0
  41. egs/datasets/audio/lj/hifigan.yaml +3 -0
  42. egs/datasets/audio/lj/preprocess.py +9 -0
  43. egs/datasets/audio/lj/ps_flow.yaml +3 -0
  44. egs/datasets/audio/lj/ps_flow_nips2021.yaml +11 -0
  45. egs/datasets/audio/lj/ps_flow_small.yaml +3 -0
  46. egs/datasets/audio/lj/ps_flow_small_nips2021.yaml +11 -0
  47. egs/egs_bases/config_base.yaml +41 -0
  48. egs/egs_bases/tts/base.yaml +56 -0
  49. egs/egs_bases/tts/dataset_params.yaml +52 -0
  50. egs/egs_bases/tts/fs.yaml +75 -0
.gitattributes ADDED
@@ -0,0 +1 @@
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,148 @@
+ ### Project ignore
+
+ infer_out
+ flagged
+ rsync
+ .idea
+ .DS_Store
+ bak
+ tmp
+ *.tar.gz
+ mos
+ nbs
+ /configs_usr/*
+ !/configs_usr/.gitkeep
+ /egs_usr/*
+ !/egs_usr/.gitkeep
+ /rnnoise
+ #/usr/*
+ #!/usr/.gitkeep
+ scripts_usr
+
+ # Created by .ignore support plugin (hsz.mobi)
+ ### Python template
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+ 将删除 datasets/remi/test/
README.md ADDED
@@ -0,0 +1,9 @@
+ ---
+ title: FastSpeech2
+ emoji: 🤗
+ colorFrom: yellow
+ colorTo: orange
+ sdk: gradio
+ app_file: "inference/tts/gradio/infer.py"
+ pinned: false
+ ---
checkpoints/fs2_exp/config.yaml ADDED
@@ -0,0 +1,219 @@
+ accumulate_grad_batches: 1
+ amp: false
+ audio_num_mel_bins: 80
+ audio_sample_rate: 22050
+ base_config:
+ - egs/egs_bases/tts/fs2_orig.yaml
+ - ./base_text2mel.yaml
+ binarization_args:
+   min_sil_duration: 0.1
+   shuffle: false
+   test_range:
+   - 0
+   - 523
+   train_range:
+   - 871
+   - -1
+   trim_eos_bos: false
+   valid_range:
+   - 523
+   - 871
+   with_align: true
+   with_f0: true
+   with_f0cwt: true
+   with_linear: false
+   with_spk_embed: false
+   with_wav: false
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+ binary_data_dir: data/binary/ljspeech_cwt
+ check_val_every_n_epoch: 10
+ clip_grad_norm: 1
+ clip_grad_value: 0
+ conv_use_pos: false
+ cwt_std_scale: 1.0
+ debug: false
+ dec_dilations:
+ - 1
+ - 1
+ - 1
+ - 1
+ dec_ffn_kernel_size: 9
+ dec_inp_add_noise: false
+ dec_kernel_size: 5
+ dec_layers: 4
+ dec_post_net_kernel: 3
+ decoder_rnn_dim: 0
+ decoder_type: fft
+ dropout: 0.0
+ ds_workers: 2
+ dur_predictor_kernel: 3
+ dur_predictor_layers: 2
+ enc_dec_norm: ln
+ enc_dilations:
+ - 1
+ - 1
+ - 1
+ - 1
+ enc_ffn_kernel_size: 9
+ enc_kernel_size: 5
+ enc_layers: 4
+ enc_post_net_kernel: 3
+ enc_pre_ln: true
+ enc_prenet: true
+ encoder_K: 8
+ encoder_type: fft
+ endless_ds: true
+ eval_max_batches: -1
+ f0_max: 800
+ f0_min: 80
+ ffn_act: gelu
+ ffn_hidden_size: 1024
+ fft_size: 1024
+ fmax: 7600
+ fmin: 80
+ frames_multiple: 1
+ gen_dir_name: ''
+ griffin_lim_iters: 30
+ hidden_size: 256
+ hop_size: 256
+ infer: false
+ lambda_commit: 0.25
+ lambda_energy: 0.1
+ lambda_f0: 1.0
+ lambda_ph_dur: 0.1
+ lambda_sent_dur: 1.0
+ lambda_uv: 1.0
+ lambda_word_dur: 1.0
+ layers_in_block: 2
+ load_ckpt: ''
+ loud_norm: false
+ lr: 0.0005
+ max_epochs: 1000
+ max_frames: 1548
+ max_input_tokens: 1550
+ max_sentences: 128
+ max_tokens: 40000
+ max_updates: 160000
+ max_valid_sentences: 1
+ max_valid_tokens: 60000
+ mel_losses: l1:0.5|ssim:0.5
+ mel_vmax: 1.5
+ mel_vmin: -6
+ min_frames: 0
+ num_ckpt_keep: 3
+ num_heads: 2
+ num_sanity_val_steps: 5
+ num_spk: 1
+ num_valid_plots: 10
+ optimizer_adam_beta1: 0.9
+ optimizer_adam_beta2: 0.98
+ out_wav_norm: false
+ pitch_extractor: parselmouth
+ pitch_key: pitch
+ pitch_type: cwt
+ predictor_dropout: 0.5
+ predictor_grad: 0.1
+ predictor_hidden: -1
+ predictor_kernel: 5
+ predictor_layers: 2
+ preprocess_args:
+   add_eos_bos: true
+   mfa_group_shuffle: false
+   mfa_offset: 0.02
+   nsample_per_mfa_group: 1000
+   reset_phone_dict: true
+   reset_word_dict: true
+   save_sil_mask: true
+   txt_processor: en
+   use_mfa: true
+   vad_max_silence_length: 12
+   wav_processors: []
+   with_phsep: true
+ preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
+ print_nan_grads: false
+ processed_data_dir: data/processed/ljspeech
+ profile_infer: false
+ raw_data_dir: data/raw/LJSpeech-1.1
+ ref_norm_layer: bn
+ rename_tmux: true
+ resume_from_checkpoint: 0
+ save_best: false
+ save_codes:
+ - tasks
+ - modules
+ - egs
+ save_f0: false
+ save_gt: true
+ scheduler: warmup
+ seed: 1234
+ sort_by_len: true
+ task_cls: tasks.tts.fs2_orig.FastSpeech2OrigTask
+ tb_log_interval: 100
+ test_ids:
+ - 0
+ - 1
+ - 2
+ - 3
+ - 4
+ - 5
+ - 6
+ - 7
+ - 8
+ - 9
+ - 10
+ - 11
+ - 12
+ - 13
+ - 14
+ - 15
+ - 16
+ - 17
+ - 18
+ - 19
+ - 68
+ - 70
+ - 74
+ - 87
+ - 110
+ - 172
+ - 190
+ - 215
+ - 231
+ - 294
+ - 316
+ - 324
+ - 402
+ - 422
+ - 485
+ - 500
+ - 505
+ - 508
+ - 509
+ - 519
+ test_input_yaml: ''
+ test_num: 100
+ test_set_name: test
+ train_set_name: train
+ train_sets: ''
+ use_energy_embed: true
+ use_gt_dur: false
+ use_gt_energy: false
+ use_gt_f0: false
+ use_pitch_embed: true
+ use_pos_embed: true
+ use_spk_embed: false
+ use_spk_id: false
+ use_uv: true
+ use_word_input: false
+ val_check_interval: 2000
+ valid_infer_interval: 10000
+ valid_monitor_key: val_loss
+ valid_monitor_mode: min
+ valid_set_name: valid
+ vocoder: HifiGAN
+ vocoder_ckpt: checkpoints/hifi_lj
+ warmup_updates: 4000
+ weight_decay: 0
+ win_size: 1024
+ word_dict_size: 10000
+ work_dir: checkpoints/fs2_exp
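Note: the file above is a flat hyperparameter dump saved next to the checkpoint. A minimal sketch of inspecting it outside the training framework, assuming only PyYAML (the repo itself loads configs through utils.commons.hparams.set_hparams, which, judging by the base_config key above, also merges parent configs):

import yaml

# Sketch: read the dumped config directly. This does NOT resolve the
# `base_config` inheritance chain; set_hparams() is assumed to handle that.
with open('checkpoints/fs2_exp/config.yaml') as f:
    hparams = yaml.safe_load(f)
print(hparams['task_cls'])      # tasks.tts.fs2_orig.FastSpeech2OrigTask
print(hparams['vocoder_ckpt'])  # checkpoints/hifi_lj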
checkpoints/fs2_exp/model_ckpt_steps_98000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d4f450bb3115e04b4ea93eed8c9318f08d01582bed1dd86886b32d50601dc58
+ size 108423039
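The three lines above are a Git LFS pointer, not the checkpoint itself (see the *.ckpt rule in .gitattributes); the real weights are fetched with `git lfs pull`. A minimal sketch of parsing such a pointer, assuming the exact version/oid/size layout shown here (the later .ckpt files in this commit follow the same format):

def parse_lfs_pointer(path):
    # Each pointer line is "<key> <value>", e.g. oid -> "sha256:9d4f...",
    # size -> byte count of the real file (here 108423039).
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(' ')
            fields[key] = value
    return fields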
checkpoints/hifi_lj/config.yaml ADDED
@@ -0,0 +1,207 @@
+ accumulate_grad_batches: 1
+ adam_b1: 0.8
+ adam_b2: 0.99
+ amp: false
+ audio_num_mel_bins: 80
+ audio_sample_rate: 22050
+ base_config:
+ - configs/tts/hifigan.yaml
+ - configs/tts/lj/base_mel2wav.yaml
+ binarization_args:
+   shuffle: false
+   trim_eos_bos: false
+   trim_sil: false
+   with_align: false
+   with_f0: true
+   with_f0cwt: false
+   with_linear: false
+   with_spk_embed: false
+   with_txt: true
+   with_wav: true
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+ binary_data_dir: data/binary/ljspeech_wav
+ check_val_every_n_epoch: 10
+ clip_grad_norm: 1
+ clip_grad_value: 0
+ debug: false
+ dec_ffn_kernel_size: 9
+ dec_layers: 4
+ dict_dir: ''
+ disc_start_steps: 40000
+ discriminator_grad_norm: 1
+ discriminator_optimizer_params:
+   eps: 1.0e-06
+   lr: 0.0002
+   weight_decay: 0.0
+ discriminator_params:
+   bias: true
+   conv_channels: 64
+   in_channels: 1
+   kernel_size: 3
+   layers: 10
+   nonlinear_activation: LeakyReLU
+   nonlinear_activation_params:
+     negative_slope: 0.2
+   out_channels: 1
+   use_weight_norm: true
+ discriminator_scheduler_params:
+   gamma: 0.999
+   step_size: 600
+ dropout: 0.1
+ ds_workers: 1
+ enc_ffn_kernel_size: 9
+ enc_layers: 4
+ endless_ds: true
+ ffn_act: gelu
+ ffn_padding: SAME
+ fft_size: 1024
+ fm_loss: false
+ fmax: 7600
+ fmin: 80
+ frames_multiple: 1
+ gen_dir_name: ''
+ generator_grad_norm: 10
+ generator_optimizer_params:
+   eps: 1.0e-06
+   lr: 0.0002
+   weight_decay: 0.0
+ generator_params:
+   aux_channels: 80
+   aux_context_window: 0
+   dropout: 0.0
+   gate_channels: 128
+   in_channels: 1
+   kernel_size: 3
+   layers: 30
+   out_channels: 1
+   residual_channels: 64
+   skip_channels: 64
+   stacks: 3
+   upsample_net: ConvInUpsampleNetwork
+   upsample_params:
+     upsample_scales:
+     - 4
+     - 4
+     - 4
+     - 4
+   use_nsf: false
+   use_pitch_embed: false
+   use_weight_norm: true
+ generator_scheduler_params:
+   gamma: 0.999
+   step_size: 600
+ griffin_lim_iters: 60
+ hidden_size: 256
+ hop_size: 256
+ infer: false
+ lambda_adv: 4.0
+ lambda_mel: 45.0
+ load_ckpt: ''
+ loud_norm: false
+ lr: 2.0
+ max_epochs: 1000
+ max_eval_sentences: 1
+ max_eval_tokens: 60000
+ max_frames: 1548
+ max_input_tokens: 1550
+ max_samples: 8192
+ max_sentences: 24
+ max_tokens: 30000
+ max_updates: 3000000
+ mel_vmax: 1.5
+ mel_vmin: -6
+ min_level_db: -100
+ num_ckpt_keep: 3
+ num_heads: 2
+ num_mels: 80
+ num_sanity_val_steps: 5
+ num_spk: 1
+ optimizer_adam_beta1: 0.9
+ optimizer_adam_beta2: 0.98
+ out_wav_norm: false
+ pitch_extractor: parselmouth
+ pre_align_args:
+   allow_no_txt: false
+   denoise: false
+   forced_align: mfa
+   sox_resample: false
+   trim_sil: false
+   txt_processor: en
+   use_tone: true
+ pre_align_cls: ''
+ print_nan_grads: false
+ processed_data_dir: data/processed/ljspeech
+ profile_infer: false
+ raw_data_dir: data/raw/LJSpeech-1.1
+ ref_level_db: 20
+ rerun_gen: true
+ resblock: '1'
+ resblock_dilation_sizes:
+ - - 1
+   - 3
+   - 5
+ - - 1
+   - 3
+   - 5
+ - - 1
+   - 3
+   - 5
+ resblock_kernel_sizes:
+ - 3
+ - 7
+ - 11
+ reset_phone_dict: true
+ resume_from_checkpoint: 0
+ sampling_rate: 22050
+ save_best: true
+ save_codes: []
+ save_f0: false
+ save_gt: true
+ seed: 1234
+ sort_by_len: true
+ stft_loss_params:
+   fft_sizes:
+   - 1024
+   - 2048
+   - 512
+   hop_sizes:
+   - 120
+   - 240
+   - 50
+   win_lengths:
+   - 600
+   - 1200
+   - 240
+   window: hann_window
+ stop_token_weight: 5.0
+ task_cls: tasks.vocoder.hifigan.HifiGanTask
+ tb_log_interval: 100
+ test_input_dir: ''
+ test_num: 100
+ test_set_name: test
+ train_set_name: train
+ upsample_initial_channel: 512
+ upsample_kernel_sizes:
+ - 16
+ - 16
+ - 4
+ - 4
+ upsample_rates:
+ - 8
+ - 8
+ - 2
+ - 2
+ use_mel_loss: false
+ use_pitch_embed: false
+ val_check_interval: 2000
+ valid_monitor_key: val_loss
+ valid_monitor_mode: min
+ valid_set_name: valid
+ vocoder: pwg
+ vocoder_ckpt: ''
+ warmup_updates: 8000
+ weight_decay: 0
+ win_length: null
+ win_size: 1024
+ window: hann
+ work_dir: checkpoints/0414_hifi_lj_1
checkpoints/hifi_lj/model_ckpt_steps_2076000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8bbc40f0471a92394f6bf057820cf66a1f50d29db22c997341448bd496a0792d
+ size 55786088
checkpoints/ps_normal_exp/config.yaml ADDED
@@ -0,0 +1,258 @@
+ accumulate_grad_batches: 1
+ add_word_pos: true
+ amp: false
+ audio_num_mel_bins: 80
+ audio_sample_rate: 22050
+ base_config:
+ - ./ps_flow.yaml
+ binarization_args:
+   min_sil_duration: 0.1
+   shuffle: false
+   test_range:
+   - 0
+   - 523
+   train_range:
+   - 871
+   - -1
+   trim_eos_bos: false
+   valid_range:
+   - 523
+   - 871
+   with_align: true
+   with_f0: true
+   with_f0cwt: false
+   with_linear: false
+   with_spk_embed: false
+   with_wav: false
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+ binary_data_dir: data/binary/ljspeech
+ check_val_every_n_epoch: 10
+ clip_grad_norm: 1
+ clip_grad_value: 0
+ conv_use_pos: false
+ debug: false
+ dec_dilations:
+ - 1
+ - 1
+ - 1
+ - 1
+ dec_ffn_kernel_size: 9
+ dec_inp_add_noise: false
+ dec_kernel_size: 5
+ dec_layers: 4
+ dec_post_net_kernel: 3
+ decoder_rnn_dim: 0
+ decoder_type: conv
+ detach_postflow_input: true
+ dropout: 0.0
+ ds_workers: 2
+ dur_level: word
+ dur_predictor_kernel: 5
+ dur_predictor_layers: 3
+ enc_dec_norm: ln
+ enc_dilations:
+ - 1
+ - 1
+ - 1
+ - 1
+ enc_ffn_kernel_size: 5
+ enc_kernel_size: 5
+ enc_layers: 4
+ enc_post_net_kernel: 3
+ enc_pre_ln: false
+ enc_prenet: true
+ encoder_K: 8
+ encoder_type: rel_fft
+ endless_ds: true
+ eval_max_batches: -1
+ f0_max: 800
+ f0_min: 80
+ ffn_act: gelu
+ ffn_hidden_size: 768
+ fft_size: 1024
+ fmax: 7600
+ fmin: 80
+ frames_multiple: 4
+ fvae_dec_n_layers: 4
+ fvae_decoder_type: wn
+ fvae_enc_dec_hidden: 192
+ fvae_enc_n_layers: 8
+ fvae_encoder_type: wn
+ fvae_kernel_size: 5
+ fvae_noise_scale: 1.0
+ fvae_strides: 4
+ gen_dir_name: ''
+ glow_kernel_size: 3
+ griffin_lim_iters: 30
+ hidden_size: 192
+ hop_size: 256
+ infer: false
+ infer_post_glow: true
+ kl_min: 0.0
+ kl_start_steps: 10000
+ lambda_commit: 0.25
+ lambda_energy: 0.1
+ lambda_f0: 1.0
+ lambda_kl: 1.0
+ lambda_ph_dur: 0.1
+ lambda_sent_dur: 0.0
+ lambda_uv: 1.0
+ lambda_word_dur: 1.0
+ latent_size: 16
+ layers_in_block: 2
+ load_ckpt: ''
+ loud_norm: false
+ lr: 0.0002
+ max_epochs: 1000
+ max_frames: 1548
+ max_input_tokens: 1550
+ max_sentences: 64
+ max_tokens: 40000
+ max_updates: 480000
+ max_valid_sentences: 1
+ max_valid_tokens: 60000
+ mel_losses: l1:0.5|ssim:0.5
+ mel_vmax: 1.5
+ mel_vmin: -6
+ min_frames: 0
+ noise_scale: 0.8
+ num_ckpt_keep: 3
+ num_heads: 2
+ num_sanity_val_steps: 5
+ num_spk: 1
+ num_valid_plots: 10
+ optimizer_adam_beta1: 0.9
+ optimizer_adam_beta2: 0.98
+ out_wav_norm: false
+ pitch_extractor: parselmouth
+ pitch_key: pitch
+ pitch_type: frame
+ post_decoder: false
+ post_decoder_detach_ling: false
+ post_flow_lr: 0.001
+ post_glow_hidden: 192
+ post_glow_kernel_size: 3
+ post_glow_n_block_layers: 3
+ post_glow_n_blocks: 12
+ post_glow_training_start: 160000
+ post_share_cond_layers: false
+ posterior_start_steps: 0
+ predictor_dropout: 0.2
+ predictor_grad: 0.1
+ predictor_hidden: -1
+ predictor_kernel: 5
+ predictor_layers: 2
+ preprocess_args:
+   add_eos_bos: true
+   mfa_group_shuffle: false
+   mfa_offset: 0.02
+   nsample_per_mfa_group: 1000
+   reset_phone_dict: true
+   reset_word_dict: true
+   save_sil_mask: true
+   txt_processor: en
+   use_mfa: true
+   vad_max_silence_length: 12
+   wav_processors: []
+   with_phsep: true
+ preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
+ print_nan_grads: false
+ prior_glow_hidden: 64
+ prior_glow_n_blocks: 4
+ processed_data_dir: data/processed/ljspeech
+ profile_infer: false
+ raw_data_dir: data/raw/LJSpeech-1.1
+ ref_norm_layer: bn
+ rename_tmux: true
+ resume_from_checkpoint: 0
+ save_best: false
+ save_codes:
+ - tasks
+ - modules
+ - egs
+ save_f0: false
+ save_gt: true
+ scheduler: warmup
+ seed: 1234
+ share_wn_layers: 4
+ sigmoid_scale: false
+ sort_by_len: true
+ task_cls: tasks.tts.ps_flow.PortaSpeechFlowTask
+ tb_log_interval: 100
+ test_ids:
+ - 0
+ - 1
+ - 2
+ - 3
+ - 4
+ - 5
+ - 6
+ - 7
+ - 8
+ - 9
+ - 10
+ - 11
+ - 12
+ - 13
+ - 14
+ - 15
+ - 16
+ - 17
+ - 18
+ - 19
+ - 68
+ - 70
+ - 74
+ - 87
+ - 110
+ - 172
+ - 190
+ - 215
+ - 231
+ - 294
+ - 316
+ - 324
+ - 402
+ - 422
+ - 485
+ - 500
+ - 505
+ - 508
+ - 509
+ - 519
+ test_input_yaml: ''
+ test_num: 100
+ test_set_name: test
+ text_encoder_postnet: false
+ train_set_name: train
+ train_sets: ''
+ two_stage: true
+ use_cond_proj: false
+ use_fvae: true
+ use_gt_dur: false
+ use_gt_f0: false
+ use_latent_cond: false
+ use_pitch_embed: false
+ use_pos_embed: true
+ use_post_flow: true
+ use_prior_flow: true
+ use_spk_embed: false
+ use_spk_id: false
+ use_txt_cond: true
+ use_uv: true
+ use_word_encoder: false
+ use_word_input: false
+ val_check_interval: 2000
+ valid_infer_interval: 10000
+ valid_monitor_key: val_loss
+ valid_monitor_mode: min
+ valid_set_name: valid
+ vocoder: HifiGAN
+ vocoder_ckpt: checkpoints/hifi_lj
+ warmup_updates: 8000
+ weight_decay: 0
+ win_size: 1024
+ word_dict_size: 10000
+ word_enc_layers: 4
+ word_encoder_type: rel_fft
+ work_dir: checkpoints/ps_normal_exp
checkpoints/ps_normal_exp/model_ckpt_steps_278000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:13a51035b84c2a385d05ce695f6dca0b5095e7bd7ea3b1d34a22aed4d9c9b5fc
+ size 104081102
checkpoints/ps_small_exp/config.yaml ADDED
@@ -0,0 +1,258 @@
+ accumulate_grad_batches: 1
+ add_word_pos: true
+ amp: false
+ audio_num_mel_bins: 80
+ audio_sample_rate: 22050
+ base_config:
+ - ./ps_flow_small.yaml
+ binarization_args:
+   min_sil_duration: 0.1
+   shuffle: false
+   test_range:
+   - 0
+   - 523
+   train_range:
+   - 871
+   - -1
+   trim_eos_bos: false
+   valid_range:
+   - 523
+   - 871
+   with_align: true
+   with_f0: true
+   with_f0cwt: false
+   with_linear: false
+   with_spk_embed: false
+   with_wav: false
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+ binary_data_dir: data/binary/ljspeech
+ check_val_every_n_epoch: 10
+ clip_grad_norm: 1
+ clip_grad_value: 0
+ conv_use_pos: false
+ debug: false
+ dec_dilations:
+ - 1
+ - 1
+ - 1
+ - 1
+ dec_ffn_kernel_size: 9
+ dec_inp_add_noise: false
+ dec_kernel_size: 5
+ dec_layers: 4
+ dec_post_net_kernel: 3
+ decoder_rnn_dim: 0
+ decoder_type: conv
+ detach_postflow_input: true
+ dropout: 0.0
+ ds_workers: 2
+ dur_level: word
+ dur_predictor_kernel: 5
+ dur_predictor_layers: 3
+ enc_dec_norm: ln
+ enc_dilations:
+ - 1
+ - 1
+ - 1
+ - 1
+ enc_ffn_kernel_size: 3
+ enc_kernel_size: 5
+ enc_layers: 3
+ enc_post_net_kernel: 3
+ enc_pre_ln: false
+ enc_prenet: true
+ encoder_K: 8
+ encoder_type: rel_fft
+ endless_ds: true
+ eval_max_batches: -1
+ f0_max: 800
+ f0_min: 80
+ ffn_act: gelu
+ ffn_hidden_size: 512
+ fft_size: 1024
+ fmax: 7600
+ fmin: 80
+ frames_multiple: 4
+ fvae_dec_n_layers: 3
+ fvae_decoder_type: wn
+ fvae_enc_dec_hidden: 128
+ fvae_enc_n_layers: 8
+ fvae_encoder_type: wn
+ fvae_kernel_size: 3
+ fvae_noise_scale: 1.0
+ fvae_strides: 4
+ gen_dir_name: ''
+ glow_kernel_size: 3
+ griffin_lim_iters: 30
+ hidden_size: 128
+ hop_size: 256
+ infer: false
+ infer_post_glow: true
+ kl_min: 0.0
+ kl_start_steps: 10000
+ lambda_commit: 0.25
+ lambda_energy: 0.1
+ lambda_f0: 1.0
+ lambda_kl: 1.0
+ lambda_ph_dur: 0.1
+ lambda_sent_dur: 0.0
+ lambda_uv: 1.0
+ lambda_word_dur: 1.0
+ latent_size: 16
+ layers_in_block: 2
+ load_ckpt: ''
+ loud_norm: false
+ lr: 0.0002
+ max_epochs: 1000
+ max_frames: 1548
+ max_input_tokens: 1550
+ max_sentences: 128
+ max_tokens: 40000
+ max_updates: 480000
+ max_valid_sentences: 1
+ max_valid_tokens: 60000
+ mel_losses: l1:0.5|ssim:0.5
+ mel_vmax: 1.5
+ mel_vmin: -6
+ min_frames: 0
+ noise_scale: 0.6
+ num_ckpt_keep: 3
+ num_heads: 2
+ num_sanity_val_steps: 5
+ num_spk: 1
+ num_valid_plots: 10
+ optimizer_adam_beta1: 0.9
+ optimizer_adam_beta2: 0.98
+ out_wav_norm: false
+ pitch_extractor: parselmouth
+ pitch_key: pitch
+ pitch_type: frame
+ post_decoder: false
+ post_decoder_detach_ling: false
+ post_flow_lr: 0.001
+ post_glow_hidden: 128
+ post_glow_kernel_size: 3
+ post_glow_n_block_layers: 3
+ post_glow_n_blocks: 8
+ post_glow_training_start: 160000
+ post_share_cond_layers: false
+ posterior_start_steps: 0
+ predictor_dropout: 0.2
+ predictor_grad: 0.1
+ predictor_hidden: -1
+ predictor_kernel: 5
+ predictor_layers: 2
+ preprocess_args:
+   add_eos_bos: true
+   mfa_group_shuffle: false
+   mfa_offset: 0.02
+   nsample_per_mfa_group: 1000
+   reset_phone_dict: true
+   reset_word_dict: true
+   save_sil_mask: true
+   txt_processor: en
+   use_mfa: true
+   vad_max_silence_length: 12
+   wav_processors: []
+   with_phsep: true
+ preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
+ print_nan_grads: false
+ prior_glow_hidden: 32
+ prior_glow_n_blocks: 3
+ processed_data_dir: data/processed/ljspeech
+ profile_infer: false
+ raw_data_dir: data/raw/LJSpeech-1.1
+ ref_norm_layer: bn
+ rename_tmux: true
+ resume_from_checkpoint: 0
+ save_best: false
+ save_codes:
+ - tasks
+ - modules
+ - egs
+ save_f0: false
+ save_gt: true
+ scheduler: warmup
+ seed: 1234
+ share_wn_layers: 4
+ sigmoid_scale: false
+ sort_by_len: true
+ task_cls: tasks.tts.ps_flow.PortaSpeechFlowTask
+ tb_log_interval: 100
+ test_ids:
+ - 0
+ - 1
+ - 2
+ - 3
+ - 4
+ - 5
+ - 6
+ - 7
+ - 8
+ - 9
+ - 10
+ - 11
+ - 12
+ - 13
+ - 14
+ - 15
+ - 16
+ - 17
+ - 18
+ - 19
+ - 68
+ - 70
+ - 74
+ - 87
+ - 110
+ - 172
+ - 190
+ - 215
+ - 231
+ - 294
+ - 316
+ - 324
+ - 402
+ - 422
+ - 485
+ - 500
+ - 505
+ - 508
+ - 509
+ - 519
+ test_input_yaml: ''
+ test_num: 100
+ test_set_name: test
+ text_encoder_postnet: false
+ train_set_name: train
+ train_sets: ''
+ two_stage: true
+ use_cond_proj: false
+ use_fvae: true
+ use_gt_dur: false
+ use_gt_f0: false
+ use_latent_cond: false
+ use_pitch_embed: false
+ use_pos_embed: true
+ use_post_flow: true
+ use_prior_flow: true
+ use_spk_embed: false
+ use_spk_id: false
+ use_txt_cond: true
+ use_uv: true
+ use_word_encoder: false
+ use_word_input: false
+ val_check_interval: 2000
+ valid_infer_interval: 10000
+ valid_monitor_key: val_loss
+ valid_monitor_mode: min
+ valid_set_name: valid
+ vocoder: HifiGAN
+ vocoder_ckpt: checkpoints/hifi_lj
+ warmup_updates: 8000
+ weight_decay: 0
+ win_size: 1024
+ word_dict_size: 10000
+ word_enc_layers: 3
+ word_encoder_type: rel_fft
+ work_dir: checkpoints/ps_small_exp
checkpoints/ps_small_exp/model_ckpt_steps_410000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6905d8969febca192f2239a99e833d9084b2e07cb6894a63e286901ab1d16553
+ size 32754716
data/binary/ljspeech/phone_set.json ADDED
@@ -0,0 +1 @@
+ ["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]
data/binary/ljspeech/spk_map.json ADDED
@@ -0,0 +1 @@
+ {"<SINGLE_SPK>": 0}
data/binary/ljspeech/word_set.json ADDED
The diff for this file is too large to render. See raw diff
 
data/binary/ljspeech_cwt/phone_set.json ADDED
@@ -0,0 +1 @@
+ ["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]
data/binary/ljspeech_cwt/spk_map.json ADDED
@@ -0,0 +1 @@
+ {"<SINGLE_SPK>": 0}
data/binary/ljspeech_cwt/word_set.json ADDED
The diff for this file is too large to render. See raw diff
 
data_gen/tts/base_binarizer.py ADDED
@@ -0,0 +1,225 @@
+ import json
+ import os
+ import random
+ import traceback
+ from functools import partial
+
+ import numpy as np
+ from resemblyzer import VoiceEncoder
+ from tqdm import tqdm
+
+ import utils.commons.single_thread_env  # NOQA
+ from utils.audio import librosa_wav2spec
+ from utils.audio.align import get_mel2ph, mel2token_to_dur
+ from utils.audio.cwt import get_lf0_cwt, get_cont_lf0
+ from utils.audio.pitch.utils import f0_to_coarse
+ from utils.audio.pitch_extractors import extract_pitch_simple
+ from utils.commons.hparams import hparams
+ from utils.commons.indexed_datasets import IndexedDatasetBuilder
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
+ from utils.os_utils import remove_file, copy_file
+
+ np.seterr(divide='ignore', invalid='ignore')
+
+
+ class BinarizationError(Exception):
+     pass
+
+
+ class BaseBinarizer:
+     def __init__(self, processed_data_dir=None):
+         if processed_data_dir is None:
+             processed_data_dir = hparams['processed_data_dir']
+         self.processed_data_dir = processed_data_dir
+         self.binarization_args = hparams['binarization_args']
+         self.items = {}
+         self.item_names = []
+
+     def load_meta_data(self):
+         processed_data_dir = self.processed_data_dir
+         items_list = json.load(open(f"{processed_data_dir}/metadata.json"))
+         for r in tqdm(items_list, desc='Loading meta data.'):
+             item_name = r['item_name']
+             self.items[item_name] = r
+             self.item_names.append(item_name)
+         if self.binarization_args['shuffle']:
+             random.seed(1234)
+             random.shuffle(self.item_names)
+
+     @property
+     def train_item_names(self):
+         range_ = self._convert_range(self.binarization_args['train_range'])
+         return self.item_names[range_[0]:range_[1]]
+
+     @property
+     def valid_item_names(self):
+         range_ = self._convert_range(self.binarization_args['valid_range'])
+         return self.item_names[range_[0]:range_[1]]
+
+     @property
+     def test_item_names(self):
+         range_ = self._convert_range(self.binarization_args['test_range'])
+         return self.item_names[range_[0]:range_[1]]
+
+     def _convert_range(self, range_):
+         if range_[1] == -1:
+             range_[1] = len(self.item_names)
+         return range_
+
+     def meta_data(self, prefix):
+         if prefix == 'valid':
+             item_names = self.valid_item_names
+         elif prefix == 'test':
+             item_names = self.test_item_names
+         else:
+             item_names = self.train_item_names
+         for item_name in item_names:
+             yield self.items[item_name]
+
+     def process(self):
+         self.load_meta_data()
+         os.makedirs(hparams['binary_data_dir'], exist_ok=True)
+         for fn in ['phone_set.json', 'word_set.json', 'spk_map.json']:
+             remove_file(f"{hparams['binary_data_dir']}/{fn}")
+             copy_file(f"{hparams['processed_data_dir']}/{fn}", f"{hparams['binary_data_dir']}/{fn}")
+         self.process_data('valid')
+         self.process_data('test')
+         self.process_data('train')
+
+     def process_data(self, prefix):
+         data_dir = hparams['binary_data_dir']
+         builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
+         meta_data = list(self.meta_data(prefix))
+         process_item = partial(self.process_item, binarization_args=self.binarization_args)
+         ph_lengths = []
+         mel_lengths = []
+         total_sec = 0
+         items = []
+         args = [{'item': item} for item in meta_data]
+         for item_id, item in multiprocess_run_tqdm(process_item, args, desc='Processing data'):
+             if item is not None:
+                 items.append(item)
+         if self.binarization_args['with_spk_embed']:
+             args = [{'wav': item['wav']} for item in items]
+             for item_id, spk_embed in multiprocess_run_tqdm(
+                     self.get_spk_embed, args,
+                     init_ctx_func=lambda wid: {'voice_encoder': VoiceEncoder().cuda()}, num_workers=4,
+                     desc='Extracting spk embed'):
+                 items[item_id]['spk_embed'] = spk_embed
+
+         for item in items:
+             if not self.binarization_args['with_wav'] and 'wav' in item:
+                 del item['wav']
+             builder.add_item(item)
+             mel_lengths.append(item['len'])
+             assert item['len'] > 0, (item['item_name'], item['txt'], item['mel2ph'])
+             if 'ph_len' in item:
+                 ph_lengths.append(item['ph_len'])
+             total_sec += item['sec']
+         builder.finalize()
+         np.save(f'{data_dir}/{prefix}_lengths.npy', mel_lengths)
+         if len(ph_lengths) > 0:
+             np.save(f'{data_dir}/{prefix}_ph_lengths.npy', ph_lengths)
+         print(f"| {prefix} total duration: {total_sec:.3f}s")
+
+     @classmethod
+     def process_item(cls, item, binarization_args):
+         item['ph_len'] = len(item['ph_token'])
+         item_name = item['item_name']
+         wav_fn = item['wav_fn']
+         wav, mel = cls.process_audio(wav_fn, item, binarization_args)
+         try:
+             n_bos_frames, n_eos_frames = 0, 0
+             if binarization_args['with_align']:
+                 tg_fn = f"{hparams['processed_data_dir']}/mfa_outputs/{item_name}.TextGrid"
+                 item['tg_fn'] = tg_fn
+                 cls.process_align(tg_fn, item)
+                 if binarization_args['trim_eos_bos']:
+                     n_bos_frames = item['dur'][0]
+                     n_eos_frames = item['dur'][-1]
+                     T = len(mel)
+                     item['mel'] = mel[n_bos_frames:T - n_eos_frames]
+                     item['mel2ph'] = item['mel2ph'][n_bos_frames:T - n_eos_frames]
+                     item['mel2word'] = item['mel2word'][n_bos_frames:T - n_eos_frames]
+                     item['dur'] = item['dur'][1:-1]
+                     item['dur_word'] = item['dur_word'][1:-1]
+                     item['len'] = item['mel'].shape[0]
+                     item['wav'] = wav[n_bos_frames * hparams['hop_size']:len(wav) - n_eos_frames * hparams['hop_size']]
+             if binarization_args['with_f0']:
+                 cls.process_pitch(item, n_bos_frames, n_eos_frames)
+         except BinarizationError as e:
+             print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
+             return None
+         except Exception as e:
+             traceback.print_exc()
+             print(f"| Skip item. item_name: {item_name}, wav_fn: {wav_fn}")
+             return None
+         return item
+
+     @classmethod
+     def process_audio(cls, wav_fn, res, binarization_args):
+         wav2spec_dict = librosa_wav2spec(
+             wav_fn,
+             fft_size=hparams['fft_size'],
+             hop_size=hparams['hop_size'],
+             win_length=hparams['win_size'],
+             num_mels=hparams['audio_num_mel_bins'],
+             fmin=hparams['fmin'],
+             fmax=hparams['fmax'],
+             sample_rate=hparams['audio_sample_rate'],
+             loud_norm=hparams['loud_norm'])
+         mel = wav2spec_dict['mel']
+         wav = wav2spec_dict['wav'].astype(np.float16)
+         if binarization_args['with_linear']:
+             res['linear'] = wav2spec_dict['linear']
+         res.update({'mel': mel, 'wav': wav, 'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0]})
+         return wav, mel
+
+     @staticmethod
+     def process_align(tg_fn, item):
+         ph = item['ph']
+         mel = item['mel']
+         ph_token = item['ph_token']
+         if tg_fn is not None and os.path.exists(tg_fn):
+             mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams['hop_size'], hparams['audio_sample_rate'],
+                                      hparams['binarization_args']['min_sil_duration'])
+         else:
+             raise BinarizationError(f"Align not found")
+         if np.array(mel2ph).max() - 1 >= len(ph_token):
+             raise BinarizationError(
+                 f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(ph_token)}")
+         item['mel2ph'] = mel2ph
+         item['dur'] = dur
+
+         ph2word = item['ph2word']
+         mel2word = [ph2word[p - 1] for p in item['mel2ph']]
+         item['mel2word'] = mel2word  # [T_mel]
+         dur_word = mel2token_to_dur(mel2word, len(item['word_token']))
+         item['dur_word'] = dur_word.tolist()  # [T_word]
+
+     @staticmethod
+     def process_pitch(item, n_bos_frames, n_eos_frames):
+         wav, mel = item['wav'], item['mel']
+         f0 = extract_pitch_simple(item['wav'])
+         if sum(f0) == 0:
+             raise BinarizationError("Empty f0")
+         assert len(mel) == len(f0), (len(mel), len(f0))
+         pitch_coarse = f0_to_coarse(f0)
+         item['f0'] = f0
+         item['pitch'] = pitch_coarse
+         if hparams['binarization_args']['with_f0cwt']:
+             uv, cont_lf0_lpf = get_cont_lf0(f0)
+             logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
+             cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
+             cwt_spec, scales = get_lf0_cwt(cont_lf0_lpf_norm)
+             item['cwt_spec'] = cwt_spec
+             item['cwt_mean'] = logf0s_mean_org
+             item['cwt_std'] = logf0s_std_org
+
+     @staticmethod
+     def get_spk_embed(wav, ctx):
+         return ctx['voice_encoder'].embed_utterance(wav.astype(float))
+
+     @property
+     def num_workers(self):
+         return int(os.getenv('N_PROC', hparams.get('N_PROC', os.cpu_count())))
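How the train/valid/test properties above interact with the configs in this commit: binarization_args supplies half-open index ranges over item_names, and _convert_range maps the -1 sentinel to "until the end". An illustration (not repo code) using the ranges from the LJSpeech configs above; the 13,100-item list is a hypothetical stand-in for LJSpeech-1.1:

item_names = [f'LJ{i:05d}' for i in range(13100)]  # hypothetical item list

def convert_range(range_, n):
    # mirrors BaseBinarizer._convert_range: -1 means "to the end of the list"
    return [range_[0], n if range_[1] == -1 else range_[1]]

test = item_names[slice(*convert_range([0, 523], len(item_names)))]
valid = item_names[slice(*convert_range([523, 871], len(item_names)))]
train = item_names[slice(*convert_range([871, -1], len(item_names)))]
print(len(test), len(valid), len(train))  # 523 348 12229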
data_gen/tts/base_preprocess.py ADDED
@@ -0,0 +1,251 @@
+ import json
+ import os
+ import random
+ import re
+ import traceback
+ from collections import Counter
+ from functools import partial
+
+ import librosa
+ from tqdm import tqdm
+ from data_gen.tts.txt_processors.base_text_processor import get_txt_processor_cls
+ from data_gen.tts.wav_processors.base_processor import get_wav_processor_cls
+ from utils.commons.hparams import hparams
+ from utils.commons.multiprocess_utils import multiprocess_run_tqdm
+ from utils.os_utils import link_file, move_file, remove_file
+ from utils.text.text_encoder import is_sil_phoneme, build_token_encoder
+
+
+ class BasePreprocessor:
+     def __init__(self):
+         self.preprocess_args = hparams['preprocess_args']
+         txt_processor = self.preprocess_args['txt_processor']
+         self.txt_processor = get_txt_processor_cls(txt_processor)
+         self.raw_data_dir = hparams['raw_data_dir']
+         self.processed_dir = hparams['processed_data_dir']
+         self.spk_map_fn = f"{self.processed_dir}/spk_map.json"
+
+     def meta_data(self):
+         """
+
+         :return: {'item_name': Str, 'wav_fn': Str, 'txt': Str, 'spk_name': Str, 'txt_loader': None or Func}
+         """
+         raise NotImplementedError
+
+     def process(self):
+         processed_dir = self.processed_dir
+         wav_processed_tmp_dir = f'{processed_dir}/processed_tmp'
+         remove_file(wav_processed_tmp_dir)
+         os.makedirs(wav_processed_tmp_dir, exist_ok=True)
+         wav_processed_dir = f'{processed_dir}/{self.wav_processed_dirname}'
+         remove_file(wav_processed_dir)
+         os.makedirs(wav_processed_dir, exist_ok=True)
+
+         meta_data = list(tqdm(self.meta_data(), desc='Load meta data'))
+         item_names = [d['item_name'] for d in meta_data]
+         assert len(item_names) == len(set(item_names)), 'Key `item_name` should be Unique.'
+
+         # preprocess data
+         phone_list = []
+         word_list = []
+         spk_names = set()
+         process_item = partial(self.preprocess_first_pass,
+                                txt_processor=self.txt_processor,
+                                wav_processed_dir=wav_processed_dir,
+                                wav_processed_tmp=wav_processed_tmp_dir,
+                                preprocess_args=self.preprocess_args)
+         items = []
+         args = [{
+             'item_name': item_raw['item_name'],
+             'txt_raw': item_raw['txt'],
+             'wav_fn': item_raw['wav_fn'],
+             'txt_loader': item_raw.get('txt_loader'),
+             'others': item_raw.get('others', None)
+         } for item_raw in meta_data]
+         for item_, (item_id, item) in zip(meta_data, multiprocess_run_tqdm(process_item, args, desc='Preprocess')):
+             if item is not None:
+                 item_.update(item)
+                 item = item_
+                 if 'txt_loader' in item:
+                     del item['txt_loader']
+                 item['id'] = item_id
+                 item['spk_name'] = item.get('spk_name', '<SINGLE_SPK>')
+                 item['others'] = item.get('others', None)
+                 phone_list += item['ph'].split(" ")
+                 word_list += item['word'].split(" ")
+                 spk_names.add(item['spk_name'])
+                 items.append(item)
+
+         # add encoded tokens
+         ph_encoder, word_encoder = self._phone_encoder(phone_list), self._word_encoder(word_list)
+         spk_map = self.build_spk_map(spk_names)
+         args = [{
+             'ph': item['ph'], 'word': item['word'], 'spk_name': item['spk_name'],
+             'word_encoder': word_encoder, 'ph_encoder': ph_encoder, 'spk_map': spk_map
+         } for item in items]
+         for idx, item_new_kv in multiprocess_run_tqdm(self.preprocess_second_pass, args, desc='Add encoded tokens'):
+             items[idx].update(item_new_kv)
+
+         # build mfa data
+         if self.preprocess_args['use_mfa']:
+             mfa_dict = set()
+             mfa_input_dir = f'{processed_dir}/mfa_inputs'
+             remove_file(mfa_input_dir)
+             # group MFA inputs for better parallelism
+             mfa_groups = [i // self.preprocess_args['nsample_per_mfa_group'] for i in range(len(items))]
+             if self.preprocess_args['mfa_group_shuffle']:
+                 random.seed(hparams['seed'])
+                 random.shuffle(mfa_groups)
+             args = [{
+                 'item': item, 'mfa_input_dir': mfa_input_dir,
+                 'mfa_group': mfa_group, 'wav_processed_tmp': wav_processed_tmp_dir,
+                 'preprocess_args': self.preprocess_args
+             } for item, mfa_group in zip(items, mfa_groups)]
+             for i, (ph_gb_word_nosil, new_wav_align_fn) in multiprocess_run_tqdm(
+                     self.build_mfa_inputs, args, desc='Build MFA data'):
+                 items[i]['wav_align_fn'] = new_wav_align_fn
+                 for w in ph_gb_word_nosil.split(" "):
+                     mfa_dict.add(f"{w} {w.replace('_', ' ')}")
+             mfa_dict = sorted(mfa_dict)
+             with open(f'{processed_dir}/mfa_dict.txt', 'w') as f:
+                 f.writelines([f'{l}\n' for l in mfa_dict])
+         with open(f"{processed_dir}/{self.meta_csv_filename}.json", 'w') as f:
+             f.write(re.sub(r'\n\s+([\d+\]])', r'\1', json.dumps(items, ensure_ascii=False, sort_keys=False, indent=1)))
+         remove_file(wav_processed_tmp_dir)
+
+     @classmethod
+     def preprocess_first_pass(cls, item_name, txt_raw, txt_processor,
+                               wav_fn, wav_processed_dir, wav_processed_tmp,
+                               preprocess_args, txt_loader=None, others=None):
+         try:
+             if txt_loader is not None:
+                 txt_raw = txt_loader(txt_raw)
+             ph, txt, word, ph2word, ph_gb_word = cls.txt_to_ph(txt_processor, txt_raw, preprocess_args)
+             wav_fn, wav_align_fn = cls.process_wav(
+                 item_name, wav_fn,
+                 hparams['processed_data_dir'],
+                 wav_processed_tmp, preprocess_args)
+
+             # wav for binarization
+             ext = os.path.splitext(wav_fn)[1]
+             os.makedirs(wav_processed_dir, exist_ok=True)
+             new_wav_fn = f"{wav_processed_dir}/{item_name}{ext}"
+             move_link_func = move_file if os.path.dirname(wav_fn) == wav_processed_tmp else link_file
+             move_link_func(wav_fn, new_wav_fn)
+             return {
+                 'txt': txt, 'txt_raw': txt_raw, 'ph': ph,
+                 'word': word, 'ph2word': ph2word, 'ph_gb_word': ph_gb_word,
+                 'wav_fn': new_wav_fn, 'wav_align_fn': wav_align_fn,
+                 'others': others
+             }
+         except:
+             traceback.print_exc()
+             print(f"| Error is caught. item_name: {item_name}.")
+             return None
+
+     @staticmethod
+     def txt_to_ph(txt_processor, txt_raw, preprocess_args):
+         txt_struct, txt = txt_processor.process(txt_raw, preprocess_args)
+         ph = [p for w in txt_struct for p in w[1]]
+         ph_gb_word = ["_".join(w[1]) for w in txt_struct]
+         words = [w[0] for w in txt_struct]
+         # word_id=0 is reserved for padding
+         ph2word = [w_id + 1 for w_id, w in enumerate(txt_struct) for _ in range(len(w[1]))]
+         return " ".join(ph), txt, " ".join(words), ph2word, " ".join(ph_gb_word)
+
+     @staticmethod
+     def process_wav(item_name, wav_fn, processed_dir, wav_processed_tmp, preprocess_args):
+         processors = [get_wav_processor_cls(v) for v in preprocess_args['wav_processors']]
+         processors = [k() for k in processors if k is not None]
+         if len(processors) >= 1:
+             sr_file = librosa.core.get_samplerate(wav_fn)
+             output_fn_for_align = None
+             ext = os.path.splitext(wav_fn)[1]
+             input_fn = f"{wav_processed_tmp}/{item_name}{ext}"
+             link_file(wav_fn, input_fn)
+             for p in processors:
+                 outputs = p.process(input_fn, sr_file, wav_processed_tmp, processed_dir, item_name, preprocess_args)
+                 if len(outputs) == 3:
+                     input_fn, sr, output_fn_for_align = outputs
+                 else:
+                     input_fn, sr = outputs
+             return input_fn, output_fn_for_align
+         else:
+             return wav_fn, wav_fn
+
+     def _phone_encoder(self, ph_set):
+         ph_set_fn = f"{self.processed_dir}/phone_set.json"
+         if self.preprocess_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
+             ph_set = sorted(set(ph_set))
+             json.dump(ph_set, open(ph_set_fn, 'w'), ensure_ascii=False)
+             print("| Build phone set: ", ph_set)
+         else:
+             ph_set = json.load(open(ph_set_fn, 'r'))
+             print("| Load phone set: ", ph_set)
+         return build_token_encoder(ph_set_fn)
+
+     def _word_encoder(self, word_set):
+         word_set_fn = f"{self.processed_dir}/word_set.json"
+         if self.preprocess_args['reset_word_dict']:
+             word_set = Counter(word_set)
+             total_words = sum(word_set.values())
+             word_set = word_set.most_common(hparams['word_dict_size'])
+             num_unk_words = total_words - sum([x[1] for x in word_set])
+             word_set = ['<BOS>', '<EOS>'] + [x[0] for x in word_set]
+             word_set = sorted(set(word_set))
+             json.dump(word_set, open(word_set_fn, 'w'), ensure_ascii=False)
+             print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
+                   f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
+         else:
+             word_set = json.load(open(word_set_fn, 'r'))
+             print("| Load word set. Size: ", len(word_set), word_set[:10])
+         return build_token_encoder(word_set_fn)
+
+     @classmethod
+     def preprocess_second_pass(cls, word, ph, spk_name, word_encoder, ph_encoder, spk_map):
+         word_token = word_encoder.encode(word)
+         ph_token = ph_encoder.encode(ph)
+         spk_id = spk_map[spk_name]
+         return {'word_token': word_token, 'ph_token': ph_token, 'spk_id': spk_id}
+
+     def build_spk_map(self, spk_names):
+         spk_map = {x: i for i, x in enumerate(sorted(list(spk_names)))}
+         assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
+         print(f"| Number of spks: {len(spk_map)}, spk_map: {spk_map}")
+         json.dump(spk_map, open(self.spk_map_fn, 'w'), ensure_ascii=False)
+         return spk_map
+
+     @classmethod
+     def build_mfa_inputs(cls, item, mfa_input_dir, mfa_group, wav_processed_tmp, preprocess_args):
+         item_name = item['item_name']
+         wav_align_fn = item['wav_align_fn']
+         ph_gb_word = item['ph_gb_word']
+         ext = os.path.splitext(wav_align_fn)[1]
+         mfa_input_group_dir = f'{mfa_input_dir}/{mfa_group}'
+         os.makedirs(mfa_input_group_dir, exist_ok=True)
+         new_wav_align_fn = f"{mfa_input_group_dir}/{item_name}{ext}"
+         move_link_func = move_file if os.path.dirname(wav_align_fn) == wav_processed_tmp else link_file
+         move_link_func(wav_align_fn, new_wav_align_fn)
+         ph_gb_word_nosil = " ".join(["_".join([p for p in w.split("_") if not is_sil_phoneme(p)])
+                                      for w in ph_gb_word.split(" ") if not is_sil_phoneme(w)])
+         with open(f'{mfa_input_group_dir}/{item_name}.lab', 'w') as f_txt:
+             f_txt.write(ph_gb_word_nosil)
+         return ph_gb_word_nosil, new_wav_align_fn
+
+     def load_spk_map(self, base_dir):
+         spk_map_fn = f"{base_dir}/spk_map.json"
+         spk_map = json.load(open(spk_map_fn, 'r'))
+         return spk_map
+
+     def load_dict(self, base_dir):
+         ph_encoder = build_token_encoder(f'{base_dir}/phone_set.json')
+         word_encoder = build_token_encoder(f'{base_dir}/word_set.json')
+         return ph_encoder, word_encoder
+
+     @property
+     def meta_csv_filename(self):
+         return 'metadata'
+
+     @property
+     def wav_processed_dirname(self):
+         return 'wav_processed'
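BasePreprocessor leaves meta_data() abstract, and the configs above point preprocess_cls at egs.datasets.audio.lj.preprocess.LJPreprocess (defined later in this commit, in egs/datasets/audio/lj/preprocess.py). A hedged sketch of what such an override could look like, assuming LJSpeech-1.1's pipe-separated metadata.csv layout (id|raw text|normalized text); the class name here is hypothetical:

from data_gen.tts.base_preprocess import BasePreprocessor

class LJPreprocessSketch(BasePreprocessor):
    def meta_data(self):
        # LJSpeech-1.1 metadata.csv lines: "LJ001-0001|raw text|normalized text"
        for l in open(f'{self.raw_data_dir}/metadata.csv').readlines():
            item_name, _, txt = l.strip().split("|")
            wav_fn = f"{self.raw_data_dir}/wavs/{item_name}.wav"
            yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt}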
data_gen/tts/binarizer_zh.py ADDED
@@ -0,0 +1,25 @@
+ import numpy as np
+ from data_gen.tts.base_binarizer import BaseBinarizer
+
+
+ class ZhBinarizer(BaseBinarizer):
+     @staticmethod
+     def process_align(tg_fn, item):
+         BaseBinarizer.process_align(tg_fn, item)
+         # char-level pitch
+         if 'f0' in item:
+             ph_list = item['ph'].split(" ")
+             item['f0_ph'] = np.array([0 for _ in item['f0']], dtype=float)
+             char_start_idx = 0
+             f0s_char = []
+             for idx, (f0_, ph_idx) in enumerate(zip(item['f0'], item['mel2ph'])):
+                 is_pinyin = ph_list[ph_idx - 1][0].isalpha()
+                 if not is_pinyin or ph_idx - item['mel2ph'][idx - 1] > 1:
+                     if len(f0s_char) > 0:
+                         item['f0_ph'][char_start_idx:idx] = sum(f0s_char) / len(f0s_char)
+                     f0s_char = []
+                     char_start_idx = idx
+                     if not is_pinyin:
+                         char_start_idx += 1
+                 if f0_ > 0:
+                     f0s_char.append(f0_)
data_gen/tts/runs/adapt_mfa_align.py ADDED
@@ -0,0 +1,18 @@
+ import utils.commons.single_thread_env  # NOQA
+ import os
+ import subprocess
+ from utils.commons.hparams import hparams, set_hparams
+
+
+ def adapt_mfa_align():
+     CORPUS = hparams['processed_data_dir'].split("/")[-1]
+     print(f"| Run MFA for {CORPUS}.")
+     NUM_JOB = int(os.getenv('N_PROC', os.cpu_count()))
+     subprocess.check_call(
+         f'CORPUS={CORPUS} NUM_JOB={NUM_JOB} bash scripts/run_mfa_adapt.sh',
+         shell=True)
+
+
+ if __name__ == '__main__':
+     set_hparams(print_hparams=False)
+     adapt_mfa_align()
data_gen/tts/runs/align_and_binarize.py ADDED
@@ -0,0 +1,12 @@
+ import utils.commons.single_thread_env  # NOQA
+ from utils.commons.hparams import set_hparams, hparams
+ from data_gen.tts.runs.binarize import binarize
+ from data_gen.tts.runs.preprocess import preprocess
+ from data_gen.tts.runs.train_mfa_align import train_mfa_align
+
+ if __name__ == '__main__':
+     set_hparams()
+     preprocess()
+     if hparams['preprocess_args']['use_mfa']:
+         train_mfa_align()
+     binarize()
data_gen/tts/runs/binarize.py ADDED
@@ -0,0 +1,17 @@
+ import utils.commons.single_thread_env  # NOQA
+ from utils.commons.hparams import hparams, set_hparams
+ import importlib
+
+
+ def binarize():
+     binarizer_cls = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
+     pkg = ".".join(binarizer_cls.split(".")[:-1])
+     cls_name = binarizer_cls.split(".")[-1]
+     binarizer_cls = getattr(importlib.import_module(pkg), cls_name)
+     print("| Binarizer: ", binarizer_cls)
+     binarizer_cls().process()
+
+
+ if __name__ == '__main__':
+     set_hparams()
+     binarize()
data_gen/tts/runs/preprocess.py ADDED
@@ -0,0 +1,17 @@
+ import utils.commons.single_thread_env  # NOQA
+ from utils.commons.hparams import hparams, set_hparams
+ import importlib
+
+
+ def preprocess():
+     assert hparams['preprocess_cls'] != ''
+
+     pkg = ".".join(hparams["preprocess_cls"].split(".")[:-1])
+     cls_name = hparams["preprocess_cls"].split(".")[-1]
+     process_cls = getattr(importlib.import_module(pkg), cls_name)
+     process_cls().process()
+
+
+ if __name__ == '__main__':
+     set_hparams()
+     preprocess()
data_gen/tts/runs/train_mfa_align.py ADDED
@@ -0,0 +1,46 @@
+ import utils.commons.single_thread_env  # NOQA
+ import glob
+ import subprocess
+ from textgrid import TextGrid
+ import os
+ from utils.commons.hparams import hparams, set_hparams
+
+
+ def train_mfa_align(mfa_outputs="mfa_outputs",
+                     mfa_inputs="mfa_inputs",
+                     model_name=None, pretrain_model_name=None,
+                     mfa_cmd='train'):
+     CORPUS = hparams['processed_data_dir'].split("/")[-1]
+     NUM_JOB = int(os.getenv('N_PROC', os.cpu_count()))
+     env_vars = [f'CORPUS={CORPUS}', f'NUM_JOB={NUM_JOB}']
+     if mfa_outputs is not None:
+         env_vars.append(f'MFA_OUTPUTS={mfa_outputs}')
+     if mfa_inputs is not None:
+         env_vars.append(f'MFA_INPUTS={mfa_inputs}')
+     if model_name is not None:
+         env_vars.append(f'MODEL_NAME={model_name}')
+     if pretrain_model_name is not None:
+         env_vars.append(f'PRETRAIN_MODEL_NAME={pretrain_model_name}')
+     if mfa_cmd is not None:
+         env_vars.append(f'MFA_CMD={mfa_cmd}')
+     env_str = ' '.join(env_vars)
+     print(f"| Run MFA for {CORPUS}. Env vars: {env_str}")
+     subprocess.check_call(f'{env_str} bash mfa_usr/run_mfa_train_align.sh', shell=True)
+     mfa_offset = hparams['preprocess_args']['mfa_offset']
+     if mfa_offset > 0:
+         for tg_fn in glob.glob(f'{hparams["processed_data_dir"]}/{mfa_outputs}/*.TextGrid'):
+             tg = TextGrid.fromFile(tg_fn)
+             max_time = tg.maxTime
+             for tier in tg.tiers:
+                 for interval in tier.intervals:
+                     interval.maxTime = min(interval.maxTime + mfa_offset, max_time)
+                     interval.minTime = min(interval.minTime + mfa_offset, max_time)
+                 tier.intervals[0].minTime = 0
+                 tier.maxTime = min(tier.maxTime + mfa_offset, max_time)
+             tg.write(tg_fn)
+             TextGrid.fromFile(tg_fn)
+
+
+ if __name__ == '__main__':
+     set_hparams(print_hparams=False)
+     train_mfa_align()
data_gen/tts/txt_processors/__init__.py ADDED
@@ -0,0 +1 @@
+ from . import en
data_gen/tts/txt_processors/base_text_processor.py ADDED
@@ -0,0 +1,48 @@
+ from utils.text.text_encoder import is_sil_phoneme
+
+ REGISTERED_TEXT_PROCESSORS = {}
+
+
+ def register_txt_processors(name):
+     def _f(cls):
+         REGISTERED_TEXT_PROCESSORS[name] = cls
+         return cls
+
+     return _f
+
+
+ def get_txt_processor_cls(name):
+     return REGISTERED_TEXT_PROCESSORS.get(name, None)
+
+
+ class BaseTxtProcessor:
+     @staticmethod
+     def sp_phonemes():
+         return ['|']
+
+     @classmethod
+     def process(cls, txt, preprocess_args):
+         raise NotImplementedError
+
+     @classmethod
+     def postprocess(cls, txt_struct, preprocess_args):
+         # remove silence phonemes at the head and tail
+         while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[0][0]):
+             txt_struct = txt_struct[1:]
+         while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[-1][0]):
+             txt_struct = txt_struct[:-1]
+         if preprocess_args['with_phsep']:
+             txt_struct = cls.add_bdr(txt_struct)
+         if preprocess_args['add_eos_bos']:
+             txt_struct = [["<BOS>", ["<BOS>"]]] + txt_struct + [["<EOS>", ["<EOS>"]]]
+         return txt_struct
+
+     @classmethod
+     def add_bdr(cls, txt_struct):
+         # insert a word-boundary token between adjacent non-silence words
+         txt_struct_ = []
+         for i, ts in enumerate(txt_struct):
+             txt_struct_.append(ts)
+             if i != len(txt_struct) - 1 and \
+                     not is_sil_phoneme(txt_struct[i][0]) and not is_sil_phoneme(txt_struct[i + 1][0]):
+                 txt_struct_.append(['|', ['|']])
+         return txt_struct_
data_gen/tts/txt_processors/en.py ADDED
@@ -0,0 +1,78 @@
+ import re
+ import unicodedata
+
+ from g2p_en import G2p
+ from g2p_en.expand import normalize_numbers
+ from nltk import pos_tag
+ from nltk.tokenize import TweetTokenizer
+
+ from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors
+ from utils.text.text_encoder import PUNCS, is_sil_phoneme
+
+
+ class EnG2p(G2p):
+     word_tokenize = TweetTokenizer().tokenize
+
+     def __call__(self, text):
+         # preprocessing
+         words = EnG2p.word_tokenize(text)
+         tokens = pos_tag(words)  # tuples of (word, tag)
+
+         # steps
+         prons = []
+         for word, pos in tokens:
+             if re.search("[a-z]", word) is None:
+                 pron = [word]
+             elif word in self.homograph2features:  # check homograph
+                 pron1, pron2, pos1 = self.homograph2features[word]
+                 if pos.startswith(pos1):
+                     pron = pron1
+                 else:
+                     pron = pron2
+             elif word in self.cmu:  # lookup CMU dict
+                 pron = self.cmu[word][0]
+             else:  # predict for OOV words
+                 pron = self.predict(word)
+
+             prons.extend(pron)
+             prons.extend([" "])
+
+         return prons[:-1]
+
+
+ @register_txt_processors('en')
+ class TxtProcessor(BaseTxtProcessor):
+     g2p = EnG2p()
+
+     @staticmethod
+     def preprocess_text(text):
+         text = normalize_numbers(text)
+         text = ''.join(char for char in unicodedata.normalize('NFD', text)
+                        if unicodedata.category(char) != 'Mn')  # strip accents
+         text = text.lower()
+         text = re.sub("[\'\"()]+", "", text)
+         text = re.sub("[-]+", " ", text)
+         text = re.sub(f"[^ a-z{PUNCS}]", "", text)
+         text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text)  # strip spaces around punctuation
+         text = re.sub(f"([{PUNCS}])+", r"\1", text)  # collapse repeats: !! -> !
+         text = text.replace("i.e.", "that is")
+         text = text.replace("etc.", "etc")
+         text = re.sub(f"([{PUNCS}])", r" \1 ", text)
+         text = re.sub(r"\s+", " ", text)
+         return text
+
+     @classmethod
+     def process(cls, txt, preprocess_args):
+         txt = cls.preprocess_text(txt).strip()
+         phs = cls.g2p(txt)
+         txt_struct = [[w, []] for w in txt.split(" ")]
+         i_word = 0
+         for p in phs:
+             if p == ' ':
+                 i_word += 1
+             else:
+                 txt_struct[i_word][1].append(p)
+         txt_struct = cls.postprocess(txt_struct, preprocess_args)
+         return txt_struct, txt
data_gen/tts/wav_processors/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from . import base_processor
+ from . import common_processors
data_gen/tts/wav_processors/base_processor.py ADDED
@@ -0,0 +1,25 @@
+ REGISTERED_WAV_PROCESSORS = {}
+
+
+ def register_wav_processors(name):
+     def _f(cls):
+         REGISTERED_WAV_PROCESSORS[name] = cls
+         return cls
+
+     return _f
+
+
+ def get_wav_processor_cls(name):
+     return REGISTERED_WAV_PROCESSORS.get(name, None)
+
+
+ class BaseWavProcessor:
+     @property
+     def name(self):
+         raise NotImplementedError
+
+     def output_fn(self, input_fn):
+         return f'{input_fn[:-4]}_{self.name}.wav'
+
+     def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+         raise NotImplementedError
data_gen/tts/wav_processors/common_processors.py ADDED
@@ -0,0 +1,86 @@
+ import os
+ import subprocess
+ import librosa
+ import numpy as np
+ from data_gen.tts.wav_processors.base_processor import BaseWavProcessor, register_wav_processors
+ from utils.audio import trim_long_silences
+ from utils.audio.io import save_wav
+ from utils.audio.rnnoise import rnnoise
+ from utils.commons.hparams import hparams
+
+
+ @register_wav_processors(name='sox_to_wav')
+ class ConvertToWavProcessor(BaseWavProcessor):
+     @property
+     def name(self):
+         return 'ToWav'
+
+     def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+         if input_fn[-4:] == '.wav':
+             return input_fn, sr
+         else:
+             output_fn = self.output_fn(input_fn)
+             subprocess.check_call(f'sox -v 0.95 "{input_fn}" -t wav "{output_fn}"', shell=True)
+             return output_fn, sr
+
+
+ @register_wav_processors(name='sox_resample')
+ class ResampleProcessor(BaseWavProcessor):
+     @property
+     def name(self):
+         return 'Resample'
+
+     def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+         output_fn = self.output_fn(input_fn)
+         sr_file = librosa.core.get_samplerate(input_fn)
+         if sr != sr_file:
+             subprocess.check_call(f'sox -v 0.95 "{input_fn}" -r{sr} "{output_fn}"', shell=True)
+             y, _ = librosa.core.load(input_fn, sr=sr)
+             y, _ = librosa.effects.trim(y)
+             save_wav(y, output_fn, sr)
+             return output_fn, sr
+         else:
+             return input_fn, sr
+
+
+ @register_wav_processors(name='trim_sil')
+ class TrimSILProcessor(BaseWavProcessor):
+     @property
+     def name(self):
+         return 'TrimSIL'
+
+     def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+         output_fn = self.output_fn(input_fn)
+         y, _ = librosa.core.load(input_fn, sr=sr)
+         y, _ = librosa.effects.trim(y)
+         save_wav(y, output_fn, sr)
+         return output_fn, sr
+
+
+ @register_wav_processors(name='trim_all_sil')
+ class TrimAllSILProcessor(BaseWavProcessor):
+     @property
+     def name(self):
+         return 'TrimAllSIL'
+
+     def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+         output_fn = self.output_fn(input_fn)
+         y, audio_mask, _ = trim_long_silences(
+             input_fn, vad_max_silence_length=preprocess_args.get('vad_max_silence_length', 12))
+         save_wav(y, output_fn, sr)
+         if preprocess_args['save_sil_mask']:
+             os.makedirs(f'{processed_dir}/sil_mask', exist_ok=True)
+             np.save(f'{processed_dir}/sil_mask/{item_name}.npy', audio_mask)
+         return output_fn, sr
+
+
+ @register_wav_processors(name='denoise')
+ class DenoiseProcessor(BaseWavProcessor):
+     @property
+     def name(self):
+         return 'Denoise'
+
+     def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
+         output_fn = self.output_fn(input_fn)
+         rnnoise(input_fn, output_fn, out_sample_rate=sr)
+         return output_fn, sr
docs/fastspeech2.md ADDED
@@ -0,0 +1,53 @@
+ # Run FastSpeech 2
+
+ ## Quick Start
+
+ ### Install Dependencies
+
+ Install dependencies following [readme.md](../readme.md).
+
+ ### Set Config Path and Experiment Name
+
+ ```bash
+ export CONFIG_NAME=egs/datasets/audio/lj/fs2_orig.yaml
+ export MY_EXP_NAME=fs2_exp
+ ```
+
+ ### Preprocess and Binarize Dataset
+
+ Prepare the dataset following [prepare_data.md](./prepare_data.md).
+
+ ### Prepare Vocoder
+
+ Prepare the vocoder following [prepare_vocoder.md](./prepare_vocoder.md).
+
+ ## Training
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --reset
+ ```
+
+ You can check the training and validation curves by opening TensorBoard via:
+
+ ```bash
+ tensorboard --logdir checkpoints/$MY_EXP_NAME
+ ```
+
+ ## Inference (Testing)
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --infer
+ ```
+
+ ## Citation
+
+ If you find this useful for your research, please cite the following.
+
+ ```
+ @inproceedings{ren2020fastspeech,
+   title={FastSpeech 2: Fast and High-Quality End-to-End Text to Speech},
+   author={Ren, Yi and Hu, Chenxu and Tan, Xu and Qin, Tao and Zhao, Sheng and Zhao, Zhou and Liu, Tie-Yan},
+   booktitle={International Conference on Learning Representations},
+   year={2020}
+ }
+ ```
docs/framework.md ADDED
@@ -0,0 +1,106 @@
+ # Framework of NATSpeech
+
+ NATSpeech is a simple framework for Non-Autoregressive Text-to-Speech.
+
+ ## Directory Structure
+
+ - `egs`: configuration files, which will be loaded by `utils/commons/hparams.py`
+ - `data_gen`: data binarization codes
+ - `modules`: modules and models
+ - `tasks`: the training and inference logic
+ - `utils`: commonly used utils
+ - `data`: data
+   - `raw`: raw data
+   - `processed`: data after preprocessing
+   - `binary`: binary data
+ - `checkpoints`: model checkpoints, tensorboard logs and generated results for all experiments
+
+ ## How to Add New Tasks and Run?
+
+ We show the basic steps of adding a new task/model and running the code (taking the LJSpeech dataset as an example).
+
+ ### Add the model
+
+ Add your model to `modules`.
+
+ ### Add the task
+
+ Task classes are used to manage the training and inference procedures.
+
+ A new task (e.g., `tasks.tts.fs.FastSpeechTask`) should inherit the base task class
+ (`tasks.tts.speech_base.TTSBaseTask`).
+
+ You must implement these methods:
+
+ - `build_tts_model`, which builds the model for your task;
+ - `run_model`, which defines how the model is used in training and inference.
+
+ You can override `test_step` and `save_valid_result` to change the validation/testing logic or add more plots to
+ tensorboard.
+
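+ For concreteness, here is a minimal sketch of a new task class. The method bodies and the `MyModel`
+ constructor are illustrative assumptions; see `tasks/tts/fs.py` for the real interface:
+
+ ```python
+ from tasks.tts.speech_base import TTSBaseTask
+ from utils.commons.hparams import hparams
+
+
+ class MyNewTask(TTSBaseTask):
+     def build_tts_model(self):
+         # construct your model here; MyModel is a hypothetical module from `modules`
+         self.model = MyModel(hidden_size=hparams['hidden_size'])
+
+     def run_model(self, sample, infer=False):
+         # training: compute and return losses; inference: return model outputs
+         raise NotImplementedError
+ ```
+
+ Then point `task_cls` in your config file to this class (see the next section).
+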
+ ### Add a new config file
+
+ Add a new config file in `egs/datasets/audio/lj/YOUR_TASK.yaml`. For example:
+
+ ```yaml
+ base_config: ./base_text2mel.yaml
+ task_cls: tasks.tts.fs.FastSpeechTask
+
+ # model configs
+ hidden_size: 256
+ dropout: 0.1
+
+ # some more configs .....
+ ```
+
+ If you use a new dataset `YOUR_DATASET`, you should also add a `YOUR_DATASET_Processor`
+ in `egs/datasets/audio/YOUR_DATASET/preprocess.py`, inheriting `data_gen.tts.base_preprocess.BasePreprocessor`,
+ which loads the meta information of the dataset, as shown in the sketch below.
+
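+ A minimal sketch of such a processor, modeled on the LJSpeech one in `egs/datasets/audio/lj/preprocess.py`
+ (the metadata file name and its two-column format here are assumptions for illustration):
+
+ ```python
+ from data_gen.tts.base_preprocess import BasePreprocessor
+
+
+ class YourDatasetProcessor(BasePreprocessor):
+     def meta_data(self):
+         # yield one dict per utterance: item name, wav path and transcript
+         for l in open(f'{self.raw_data_dir}/metadata.csv').readlines():
+             item_name, txt = l.strip().split("|")
+             wav_fn = f"{self.raw_data_dir}/wavs/{item_name}.wav"
+             yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt}
+ ```
+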
+ ### Preprocess and Binarize Dataset
+
+ ```bash
+ python data_gen/tts/runs/align_and_binarize.py --config egs/datasets/audio/lj/base_text2mel.yaml
+ ```
+
+ ### Training
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config YOUR_CONFIG --exp_name YOUR_EXP_NAME --reset
+ ```
+
+ You can open TensorBoard via:
+
+ ```bash
+ tensorboard --logdir checkpoints/EXP_NAME
+ ```
+
+ ### Inference (Testing)
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config egs/datasets/audio/lj/YOUR_TASK.yaml --exp_name YOUR_EXP_NAME --reset --infer
+ ```
+
+ ## Design Philosophy
+
+ ### Random-Access Binarized Dataset
+
+ To address the IO bottleneck of reading many small files, we design an `IndexedDataset` class
+ (_utils/commons/indexed_datasets.py_), sketched below.
+
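+ A sketch of the intended usage; the `IndexedDatasetBuilder` API (`add_item`/`finalize`), the integer
+ indexing, and the path below are assumptions, so check the module for the exact interface:
+
+ ```python
+ from utils.commons.indexed_datasets import IndexedDataset, IndexedDatasetBuilder
+
+ # build once: append per-utterance dicts together with an index for O(1) random access
+ builder = IndexedDatasetBuilder('data/binary/ljspeech/train')
+ for item in items:  # items: an iterable of per-utterance dicts, defined elsewhere
+     builder.add_item(item)
+ builder.finalize()
+
+ # read later: random access without opening thousands of small files
+ ds = IndexedDataset('data/binary/ljspeech/train')
+ sample = ds[0]
+ ```
+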
+ ### Global Config
+
+ We introduce a global config `hparams`, which is loaded from a `.yaml` config file and can be used anywhere.
+ However, we do not recommend using it in general-purpose modules.
+
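+ Typical usage, mirroring the run scripts in `data_gen/tts/runs` (the `binary_data_dir` key below is just an
+ example key from this repo's configs):
+
+ ```python
+ from utils.commons.hparams import hparams, set_hparams
+
+ set_hparams()  # parses CLI args such as --config / --exp_name and fills the global dict
+ print(hparams['binary_data_dir'])
+ ```
+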
+ ### BaseTrainer Framework
+
+ Our [base trainer](utils/commons/trainer.py) and [base task](utils/commons/base_task.py) classes follow the design of
+ [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning), which provides a commonly used
+ training/inference code structure. Our framework supports multi-process GPU training without any change to the
+ subclass code.
+
+ ### Checkpoint Saving
+
+ All checkpoints and tensorboard logs are saved in `checkpoints/EXP_NAME`, where `EXP_NAME` is set in the running
+ command: `python tasks/run.py .... --exp_name EXP_NAME`. You can use `tensorboard --logdir checkpoints/EXP_NAME` to open
+ TensorBoard and check the training loss curves, etc.
docs/portaspeech.md ADDED
@@ -0,0 +1,61 @@
+ # Run PortaSpeech
+
+ ## Quick Start
+
+ ### Install Dependencies
+
+ Install dependencies following [readme.md](../readme.md).
+
+ ### Set Config Path and Experiment Name
+
+ #### PortaSpeech (normal)
+ ```bash
+ export CONFIG_NAME=egs/datasets/audio/lj/ps_flow_nips2021.yaml
+ export MY_EXP_NAME=ps_normal_exp
+ ```
+
+ #### PortaSpeech (small)
+ ```bash
+ export CONFIG_NAME=egs/datasets/audio/lj/ps_flow_small_nips2021.yaml
+ export MY_EXP_NAME=ps_small_exp
+ ```
+
+ ### Preprocess and Binarize Dataset
+
+ Prepare the dataset following [prepare_data.md](./prepare_data.md).
+
+ ### Prepare Vocoder
+
+ Prepare the vocoder following [prepare_vocoder.md](./prepare_vocoder.md).
+
+ ## Training
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --reset
+ ```
+
+ You can check the training and validation curves by opening TensorBoard via:
+
+ ```bash
+ tensorboard --logdir checkpoints/$MY_EXP_NAME
+ ```
+
+ ## Inference (Testing)
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --infer
+ ```
+
+ ## Citation
+
+ If you find this useful for your research, please cite the following.
+
+ ```
+ @article{ren2021portaspeech,
+   title={PortaSpeech: Portable and High-Quality Generative Text-to-Speech},
+   author={Ren, Yi and Liu, Jinglin and Zhao, Zhou},
+   journal={Advances in Neural Information Processing Systems},
+   volume={34},
+   year={2021}
+ }
+ ```
docs/prepare_data.md ADDED
@@ -0,0 +1,25 @@
+ # Prepare Dataset
+
+ ## LJSpeech
+
+ ### Download Dataset
+ ```bash
+ mkdir -p data/raw/ljspeech
+ cd data/raw
+ wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
+ bzip2 -d LJSpeech-1.1.tar.bz2
+ tar -xvf LJSpeech-1.1.tar
+ cd ../../
+ ```
+
+ ### Forced Align and Preprocess Dataset
+ ```bash
+ # Preprocess step: normalize the text and unify the file structure.
+ python data_gen/tts/runs/preprocess.py --config $CONFIG_NAME
+ # Align step: MFA alignment.
+ python data_gen/tts/runs/train_mfa_align.py --config $CONFIG_NAME
+ # Binarization step: binarize data for fast IO. If you have already preprocessed and aligned the dataset, you only need to rerun this step when switching to a different task.
+ python data_gen/tts/runs/binarize.py --config $CONFIG_NAME
+ ```
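+
+ Alternatively, all three steps can be run with the single wrapper script
+ `data_gen/tts/runs/align_and_binarize.py` (it skips the MFA step when `use_mfa` is disabled in
+ `preprocess_args`):
+
+ ```bash
+ python data_gen/tts/runs/align_and_binarize.py --config $CONFIG_NAME
+ ```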
+
+ ## More datasets will be supported soon...
docs/prepare_vocoder.md ADDED
@@ -0,0 +1,49 @@
+ # Prepare Vocoder
+
+ We use [HiFi-GAN](https://github.com/jik876/hifi-gan) as the default vocoder.
+
+ ## LJSpeech
+
+ ### Use Pretrained Model
+
+ ```bash
+ wget https://github.com/xx/xx/releases/download/pretrain-model/hifi_lj.zip
+ unzip hifi_lj.zip
+ mv hifi_lj checkpoints/hifi_lj
+ ```
+
+ ### Train Your Vocoder
+
+ #### Set Config Path and Experiment Name
+
+ ```bash
+ export CONFIG_NAME=egs/datasets/audio/lj/hifigan.yaml
+ export MY_EXP_NAME=my_hifigan_exp
+ ```
+
+ #### Prepare Dataset
+
+ Prepare the dataset following [prepare_data.md](./prepare_data.md).
+
+ If you have already run the `prepare_data` step for an acoustic
+ model (e.g., FastSpeech 2 or PortaSpeech), you only need to binarize the dataset for the vocoder training:
+
+ ```bash
+ python data_gen/tts/runs/binarize.py --config $CONFIG_NAME
+ ```
+
+ #### Training
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --reset
+ ```
+
+ #### Inference (Testing)
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config $CONFIG_NAME --exp_name $MY_EXP_NAME --infer
+ ```
+
+ #### Use the trained vocoder
+ Point `vocoder_ckpt` in the config files of the acoustic models (e.g., `egs/datasets/audio/lj/base_text2mel.yaml`) to your experiment directory, as shown below.
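+
+ The relevant lines would then read as follows (`vocoder: HifiGAN` is already the default in
+ `egs/egs_bases/tts/base.yaml`):
+
+ ```yaml
+ vocoder: HifiGAN
+ vocoder_ckpt: checkpoints/my_hifigan_exp
+ ```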
egs/datasets/audio/lj/base_mel2wav.yaml ADDED
@@ -0,0 +1,4 @@
+ base_config: egs/egs_bases/tts/vocoder/base.yaml
+ raw_data_dir: 'data/raw/LJSpeech-1.1'
+ processed_data_dir: 'data/processed/ljspeech'
+ binary_data_dir: 'data/binary/ljspeech_wav'
egs/datasets/audio/lj/base_text2mel.yaml ADDED
@@ -0,0 +1,16 @@
+ base_config: egs/egs_bases/tts/base.yaml
+ raw_data_dir: 'data/raw/LJSpeech-1.1'
+ processed_data_dir: 'data/processed/ljspeech'
+ binary_data_dir: 'data/binary/ljspeech'
+ preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
+ binarization_args:
+   train_range: [ 871, -1 ]
+   test_range: [ 0, 523 ]
+   valid_range: [ 523, 871 ]
+ test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+             10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+             68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+             316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
+ f0_min: 80
+ f0_max: 600
+ vocoder_ckpt: checkpoints/hifi_lj
egs/datasets/audio/lj/fs.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+   - egs/egs_bases/tts/fs.yaml
+   - ./base_text2mel.yaml
egs/datasets/audio/lj/fs2_orig.yaml ADDED
@@ -0,0 +1,4 @@
+ base_config:
+   - egs/egs_bases/tts/fs2_orig.yaml
+   - ./base_text2mel.yaml
+ binary_data_dir: 'data/binary/ljspeech_cwt'
egs/datasets/audio/lj/hifigan.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+   - egs/egs_bases/tts/vocoder/hifigan.yaml
+   - ./base_mel2wav.yaml
egs/datasets/audio/lj/preprocess.py ADDED
@@ -0,0 +1,9 @@
+ from data_gen.tts.base_preprocess import BasePreprocessor
+
+
+ class LJPreprocess(BasePreprocessor):
+     def meta_data(self):
+         for l in open(f'{self.raw_data_dir}/metadata.csv').readlines():
+             item_name, _, txt = l.strip().split("|")
+             wav_fn = f"{self.raw_data_dir}/wavs/{item_name}.wav"
+             yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt}
egs/datasets/audio/lj/ps_flow.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+   - egs/egs_bases/tts/ps_flow.yaml
+   - ./base_text2mel.yaml
egs/datasets/audio/lj/ps_flow_nips2021.yaml ADDED
@@ -0,0 +1,11 @@
+ base_config:
+   - ./ps_flow.yaml
+ max_sentences: 64
+ dur_level: word
+ use_word_encoder: false
+ enc_prenet: true
+ enc_pre_ln: false
+ fvae_encoder_type: wn
+ fvae_decoder_type: wn
+ text_encoder_postnet: false
+ warmup_updates: 8000
egs/datasets/audio/lj/ps_flow_small.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+   - egs/egs_bases/tts/ps_flow_small.yaml
+   - ./base_text2mel.yaml
egs/datasets/audio/lj/ps_flow_small_nips2021.yaml ADDED
@@ -0,0 +1,11 @@
+ base_config:
+   - ./ps_flow_small.yaml
+ max_sentences: 128
+ dur_level: word
+ use_word_encoder: false
+ enc_prenet: true
+ enc_pre_ln: false
+ fvae_encoder_type: wn
+ fvae_decoder_type: wn
+ text_encoder_postnet: false
+ warmup_updates: 8000
egs/egs_bases/config_base.yaml ADDED
@@ -0,0 +1,41 @@
+ # task
+ binary_data_dir: ''
+ work_dir: '' # experiment directory.
+ infer: false # infer
+ amp: false
+ seed: 1234
+ debug: false
+ save_codes: ['tasks', 'modules', 'egs']
+
+ #############
+ # dataset
+ #############
+ ds_workers: 1
+ test_num: 100
+ endless_ds: true
+ sort_by_len: true
+
+ #########
+ # train and eval
+ #########
+ print_nan_grads: false
+ load_ckpt: ''
+ save_best: false
+ num_ckpt_keep: 3
+ clip_grad_norm: 0
+ accumulate_grad_batches: 1
+ tb_log_interval: 100
+ num_sanity_val_steps: 5 # steps of validation at the beginning
+ check_val_every_n_epoch: 10
+ val_check_interval: 2000
+ valid_monitor_key: 'val_loss'
+ valid_monitor_mode: 'min'
+ max_epochs: 1000
+ max_updates: 1000000
+ max_tokens: 40000
+ max_sentences: 100000
+ max_valid_tokens: -1
+ max_valid_sentences: -1
+ eval_max_batches: -1
+ resume_from_checkpoint: 0
+ rename_tmux: true
egs/egs_bases/tts/base.yaml ADDED
@@ -0,0 +1,56 @@
+ # task
+ base_config:
+   - ../config_base.yaml
+   - ./dataset_params.yaml
+
+ #############
+ # dataset in training
+ #############
+ endless_ds: true
+ min_frames: 0
+ max_frames: 1548
+ frames_multiple: 1
+ max_input_tokens: 1550
+ ds_workers: 1
+
+ #########
+ # model
+ #########
+ use_spk_id: false
+ use_spk_embed: false
+ mel_losses: "ssim:0.5|l1:0.5"
+
+ ###########
+ # optimization
+ ###########
+ lr: 0.0005
+ scheduler: warmup # rsqrt|warmup|none
+ warmup_updates: 4000
+ optimizer_adam_beta1: 0.9
+ optimizer_adam_beta2: 0.98
+ weight_decay: 0
+ clip_grad_norm: 1
+ clip_grad_value: 0
+
+ ###########
+ # train and eval
+ ###########
+ use_word_input: false
+ max_valid_sentences: 1
+ max_valid_tokens: 60000
+ valid_infer_interval: 10000
+ train_set_name: 'train'
+ train_sets: ''
+ valid_set_name: 'valid'
+ test_set_name: 'test'
+ num_valid_plots: 10
+ test_ids: [ ]
+ test_input_yaml: ''
+ vocoder: HifiGAN
+ vocoder_ckpt: ''
+ profile_infer: false
+ out_wav_norm: false
+ save_gt: true
+ save_f0: false
+ gen_dir_name: ''
egs/egs_bases/tts/dataset_params.yaml ADDED
@@ -0,0 +1,52 @@
+ audio_num_mel_bins: 80
+ audio_sample_rate: 22050
+ hop_size: 256 # for 22050Hz, 0.0125s ~= 276 samples; 256 is used here
+ win_size: 1024 # for 22050Hz, 0.05s ~= 1102 samples (if None, win_size = fft_size)
+ fft_size: 1024 # extra window size is filled with 0 paddings to match this parameter
+ fmin: 80 # set this to 55 if your speaker is male; if female, 95 should help remove noise (to be tuned per dataset; pitch info: male~[65, 260], female~[100, 525])
+ fmax: 7600 # to be increased/reduced depending on data
+ f0_min: 80
+ f0_max: 800
+ griffin_lim_iters: 30
+ pitch_extractor: parselmouth
+ num_spk: 1
+ mel_vmin: -6
+ mel_vmax: 1.5
+ loud_norm: false
+
+ raw_data_dir: ''
+ processed_data_dir: ''
+ binary_data_dir: ''
+ preprocess_cls: ''
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+ preprocess_args:
+   nsample_per_mfa_group: 1000
+   # text process
+   txt_processor: en
+   use_mfa: true
+   with_phsep: true
+   reset_phone_dict: true
+   reset_word_dict: true
+   add_eos_bos: true
+   # mfa
+   mfa_group_shuffle: false
+   mfa_offset: 0.02
+   # wav processors
+   wav_processors: [ ]
+   save_sil_mask: true
+   vad_max_silence_length: 12
+ binarization_args:
+   shuffle: false
+   with_wav: false
+   with_align: true
+   with_spk_embed: false
+   with_f0: true
+   with_f0cwt: false
+   with_linear: false
+   trim_eos_bos: false
+   min_sil_duration: 0.1
+   train_range: [ 200, -1 ]
+   test_range: [ 0, 100 ]
+   valid_range: [ 100, 200 ]
+ word_dict_size: 10000
+ pitch_key: pitch
egs/egs_bases/tts/fs.yaml ADDED
@@ -0,0 +1,75 @@
+ base_config: ./base.yaml
+ task_cls: tasks.tts.fs.FastSpeechTask
+
+ # model
+ hidden_size: 256
+ dropout: 0.0
+ encoder_type: rel_fft # rel_fft|fft|tacotron|tacotron2|conformer
+ decoder_type: conv # fft|rnn|conv|conformer|wn
+
+ # rnn enc/dec
+ encoder_K: 8
+ decoder_rnn_dim: 0 # for rnn decoder, 0 -> hidden_size * 2
+
+ # fft enc/dec
+ enc_layers: 4
+ enc_ffn_kernel_size: 9
+ enc_prenet: true
+ enc_pre_ln: true
+ dec_layers: 4
+ dec_ffn_kernel_size: 9
+ num_heads: 2
+ ffn_act: gelu
+ ffn_hidden_size: 1024
+ use_pos_embed: true
+
+ # conv enc/dec
+ enc_dec_norm: ln
+ conv_use_pos: false
+ layers_in_block: 2
+ enc_dilations: [ 1, 1, 1, 1 ]
+ enc_kernel_size: 5
+ enc_post_net_kernel: 3
+ dec_dilations: [ 1, 1, 1, 1 ] # for conv decoder
+ dec_kernel_size: 5
+ dec_post_net_kernel: 3
+
+ # duration
+ predictor_hidden: -1
+ predictor_kernel: 5
+ predictor_layers: 2
+ dur_predictor_kernel: 3
+ dur_predictor_layers: 2
+ predictor_dropout: 0.5
+
+ # pitch and energy
+ use_pitch_embed: false
+ pitch_type: frame # frame|ph|cwt
+ use_uv: true
+
+ # reference encoder and speaker embedding
+ lambda_commit: 0.25
+ ref_norm_layer: bn
+ dec_inp_add_noise: false
+
+ # mel
+ mel_losses: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5
+
+ # loss lambda
+ lambda_f0: 1.0
+ lambda_uv: 1.0
+ lambda_energy: 0.1
+ lambda_ph_dur: 0.1
+ lambda_sent_dur: 1.0
+ lambda_word_dur: 1.0
+ predictor_grad: 0.1
+
+ # train and eval
+ warmup_updates: 4000
+ max_tokens: 40000
+ max_sentences: 128
+ max_valid_sentences: 1
+ max_updates: 160000
+ use_gt_dur: false
+ use_gt_f0: false
+ ds_workers: 2