Rongjiehuang committed on
Commit
1f001bb
0 Parent(s):

First model version

Browse files
.gitattributes ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.npy filter=lfs diff=lfs merge=lfs -text
14
+ *.npz filter=lfs diff=lfs merge=lfs -text
15
+ *.onnx filter=lfs diff=lfs merge=lfs -text
16
+ *.ot filter=lfs diff=lfs merge=lfs -text
17
+ *.parquet filter=lfs diff=lfs merge=lfs -text
18
+ *.pb filter=lfs diff=lfs merge=lfs -text
19
+ *.pickle filter=lfs diff=lfs merge=lfs -text
20
+ *.pkl filter=lfs diff=lfs merge=lfs -text
21
+ *.pt filter=lfs diff=lfs merge=lfs -text
22
+ *.pth filter=lfs diff=lfs merge=lfs -text
23
+ *.rar filter=lfs diff=lfs merge=lfs -text
24
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
26
+ *.tflite filter=lfs diff=lfs merge=lfs -text
27
+ *.tgz filter=lfs diff=lfs merge=lfs -text
28
+ *.wasm filter=lfs diff=lfs merge=lfs -text
29
+ *.xz filter=lfs diff=lfs merge=lfs -text
30
+ *.zip filter=lfs diff=lfs merge=lfs -text
31
+ *.zst filter=lfs diff=lfs merge=lfs -text
32
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
33
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Project ignore
2
+
3
+ /ParallelWaveGAN
4
+ /wavegan_pretrained*
5
+ /pretrained_models
6
+ rsync
7
+ .idea
8
+ .DS_Store
9
+ bak
10
+ tmp
11
+ *.tar.gz
12
+ # mfa and kaldi
13
+ kaldi_align/exp
14
+ mfa
15
+ montreal-forced-aligner
16
+ mos
17
+ nbs
18
+ /configs_usr/*
19
+ !/configs_usr/.gitkeep
20
+ /fast_transformers
21
+ /rnnoise
22
+ /usr/*
23
+ !/usr/.gitkeep
24
+
25
+ # Created by .ignore support plugin (hsz.mobi)
26
+ ### Python template
27
+ # Byte-compiled / optimized / DLL files
28
+ __pycache__/
29
+ *.py[cod]
30
+ *$py.class
31
+
32
+ # C extensions
33
+ *.so
34
+
35
+ # Distribution / packaging
36
+ .Python
37
+ build/
38
+ develop-eggs/
39
+ dist/
40
+ downloads/
41
+ eggs/
42
+ .eggs/
43
+ lib/
44
+ lib64/
45
+ parts/
46
+ sdist/
47
+ var/
48
+ wheels/
49
+ pip-wheel-metadata/
50
+ share/python-wheels/
51
+ *.egg-info/
52
+ .installed.cfg
53
+ *.egg
54
+ MANIFEST
55
+
56
+ # PyInstaller
57
+ # Usually these files are written by a python script from a template
58
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
59
+ *.manifest
60
+ *.spec
61
+
62
+ # Installer logs
63
+ pip-log.txt
64
+ pip-delete-this-directory.txt
65
+
66
+ # Unit test / coverage reports
67
+ htmlcov/
68
+ .tox/
69
+ .nox/
70
+ .coverage
71
+ .coverage.*
72
+ .cache
73
+ nosetests.xml
74
+ coverage.xml
75
+ *.cover
76
+ .hypothesis/
77
+ .pytest_cache/
78
+
79
+ # Translations
80
+ *.mo
81
+ *.pot
82
+
83
+ # Django stuff:
84
+ *.log
85
+ local_settings.py
86
+ db.sqlite3
87
+ db.sqlite3-journal
88
+
89
+ # Flask stuff:
90
+ instance/
91
+ .webassets-cache
92
+
93
+ # Scrapy stuff:
94
+ .scrapy
95
+
96
+ # Sphinx documentation
97
+ docs/_build/
98
+
99
+ # PyBuilder
100
+ target/
101
+
102
+ # Jupyter Notebook
103
+ .ipynb_checkpoints
104
+
105
+ # IPython
106
+ profile_default/
107
+ ipython_config.py
108
+
109
+ # pyenv
110
+ .python-version
111
+
112
+ # pipenv
113
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
114
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
115
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
116
+ # install all needed dependencies.
117
+ #Pipfile.lock
118
+
119
+ # celery beat schedule file
120
+ celerybeat-schedule
121
+
122
+ # SageMath parsed files
123
+ *.sage.py
124
+
125
+ # Environments
126
+ .env
127
+ .venv
128
+ env/
129
+ venv/
130
+ ENV/
131
+ env.bak/
132
+ venv.bak/
133
+
134
+ # Spyder project settings
135
+ .spyderproject
136
+ .spyproject
137
+
138
+ # Rope project settings
139
+ .ropeproject
140
+
141
+ # mkdocs documentation
142
+ /site
143
+
144
+ # mypy
145
+ .mypy_cache/
146
+ .dmypy.json
147
+ dmypy.json
148
+
149
+ # Pyre type checker
150
+ .pyre/
151
+ # Will delete datasets/remi/test/  (stray `git clean` dry-run output; not an ignore pattern)
README.md ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ tags:
4
+ - text-to-speech
5
+ - neural-vocoder
6
+ inference: false
7
+ extra_gated_prompt: |-
8
+ One more step before getting this model.
9
+ This model is open access and available to all, with a license further specifying rights and usage.
10
+
11
+ Any organization or individual is prohibited from using any technology mentioned in this paper to generate someone's speech without his/her consent, including but not limited to government leaders, political figures, and celebrities. If you do not comply with this item, you could be in violation of copyright laws.
12
+
13
+
14
+ By clicking on "Access repository" below, you accept that your *contact information* (email address and username) can be shared with the model authors as well.
15
+
16
+ extra_gated_fields:
17
+ I have read the License and agree with its terms: checkbox
18
+ ---
19
+
20
+ # ProDiff and FastDiff Model Card
21
+
22
+ ## Key Features
23
+ - **Extremely-Fast** diffusion text-to-speech synthesis pipeline for potential **industrial deployment**.
24
+ - **Tutorial and code base** for speech diffusion models.
25
+ - Support for more **diffusion mechanisms** (e.g., guided diffusion) will be added.
26
+
27
+
28
+ ## Model Details
29
+ - **Developed by:** Rongjie Huang et al. (see the author lists in the citations below)
30
+ - **Model type:** Diffusion-based text-to-speech generation model
31
+ - **Language(s):** English
32
+ - **License:**
33
+ - **Model Description:** A conditional diffusion probabilistic model capable of generating high fidelity speech efficiently.
34
+ - **Resources for more information:** [FastDiff GitHub Repository](https://github.com/Rongjiehuang/FastDiff), [FastDiff Paper](https://arxiv.org/abs/2204.09934), [ProDiff GitHub Repository](https://github.com/Rongjiehuang/ProDiff), [ProDiff Paper](https://arxiv.org/abs/2207.06389).
35
+ - **Cite as:**
36
+
37
+ @inproceedings{huang2022prodiff,
38
+ title={ProDiff: Progressive Fast Diffusion Model For High-Quality Text-to-Speech},
39
+ author={Huang, Rongjie and Zhao, Zhou and Liu, Huadai and Liu, Jinglin and Cui, Chenye and Ren, Yi},
40
+ booktitle={Proceedings of the 30th ACM International Conference on Multimedia},
41
+ year={2022}
42
+ }
43
+ @inproceedings{huang2022fastdiff,
44
+ title={FastDiff: A Fast Conditional Diffusion Model for High-Quality Speech Synthesis},
45
+ author={Huang, Rongjie and Lam, Max WY and Wang, Jun and Su, Dan and Yu, Dong and Ren, Yi and Zhao, Zhou},
46
+ booktitle = {Proceedings of the Thirty-First International Joint Conference on Artificial Intelligence, {IJCAI-22}},
47
+ year={2022}
48
+ }
49
+
50
+
51
+ *This model card was written based on the [DALL-E Mini model card](https://huggingface.co/dalle-mini/dalle-mini).*
checkpoints/FastDiff/config.yaml ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ N: ''
2
+ T: 1000
3
+ accumulate_grad_batches: 1
4
+ amp: false
5
+ audio_channels: 1
6
+ audio_num_mel_bins: 80
7
+ audio_sample_rate: 22050
8
+ aux_context_window: 0
9
+ beta_0: 1.0e-06
10
+ beta_T: 0.01
11
+ binarization_args:
12
+ reset_phone_dict: true
13
+ reset_word_dict: true
14
+ shuffle: false
15
+ trim_eos_bos: false
16
+ with_align: false
17
+ with_f0: false
18
+ with_f0cwt: false
19
+ with_linear: false
20
+ with_spk_embed: false
21
+ with_spk_id: true
22
+ with_txt: false
23
+ with_wav: true
24
+ with_word: false
25
+ binarizer_cls: data_gen.tts.vocoder_binarizer.VocoderBinarizer
26
+ binary_data_dir: data/binary/LJSpeech
27
+ check_val_every_n_epoch: 10
28
+ clip_grad_norm: 1
29
+ clip_grad_value: 0
30
+ cond_channels: 80
31
+ debug: false
32
+ dec_ffn_kernel_size: 9
33
+ dec_layers: 4
34
+ dict_dir: ''
35
+ diffusion_step_embed_dim_in: 128
36
+ diffusion_step_embed_dim_mid: 512
37
+ diffusion_step_embed_dim_out: 512
38
+ disc_start_steps: 40000
39
+ discriminator_grad_norm: 1
40
+ dropout: 0.0
41
+ ds_workers: 1
42
+ enc_ffn_kernel_size: 9
43
+ enc_layers: 4
44
+ endless_ds: true
45
+ eval_max_batches: -1
46
+ ffn_act: gelu
47
+ ffn_padding: SAME
48
+ fft_size: 1024
49
+ fmax: 7600
50
+ fmin: 80
51
+ frames_multiple: 1
52
+ gen_dir_name: ''
53
+ generator_grad_norm: 10
54
+ griffin_lim_iters: 60
55
+ hidden_size: 256
56
+ hop_size: 256
57
+ infer: false
58
+ inner_channels: 32
59
+ kpnet_conv_size: 3
60
+ kpnet_hidden_channels: 64
61
+ load_ckpt: ''
62
+ loud_norm: false
63
+ lr: 2e-4
64
+ lvc_kernel_size: 3
65
+ lvc_layers_each_block: 4
66
+ max_epochs: 1000
67
+ max_frames: 1548
68
+ max_input_tokens: 1550
69
+ max_samples: 25600
70
+ max_sentences: 20
71
+ max_tokens: 30000
72
+ max_updates: 1000000
73
+ max_valid_sentences: 1
74
+ max_valid_tokens: 60000
75
+ mel_loss: l1
76
+ mel_vmax: 1.5
77
+ mel_vmin: -6
78
+ mfa_version: 2
79
+ min_frames: 0
80
+ min_level_db: -100
81
+ noise_schedule: ''
82
+ num_ckpt_keep: 3
83
+ num_heads: 2
84
+ num_mels: 80
85
+ num_sanity_val_steps: -1
86
+ num_spk: 400
87
+ num_test_samples: 0
88
+ num_valid_plots: 10
89
+ optimizer_adam_beta1: 0.9
90
+ optimizer_adam_beta2: 0.98
91
+ out_wav_norm: false
92
+ pitch_extractor: parselmouth
93
+ pre_align_args:
94
+ allow_no_txt: false
95
+ denoise: false
96
+ nsample_per_mfa_group: 1000
97
+ sox_resample: false
98
+ sox_to_wav: false
99
+ trim_sil: false
100
+ txt_processor: en
101
+ use_tone: true
102
+ pre_align_cls: egs.datasets.audio.pre_align.PreAlign
103
+ print_nan_grads: false
104
+ processed_data_dir: data/processed/LJSpeech
105
+ profile_infer: false
106
+ raw_data_dir: data/raw/LJSpeech-1.1
107
+ ref_level_db: 20
108
+ rename_tmux: true
109
+ resume_from_checkpoint: 0
110
+ save_best: true
111
+ save_codes: []
112
+ save_f0: false
113
+ save_gt: true
114
+ scheduler: rsqrt
115
+ seed: 1234
116
+ sort_by_len: true
117
+ task_cls: modules.FastDiff.task.FastDiff.FastDiffTask
118
+ tb_log_interval: 100
119
+ test_ids: []
120
+ test_input_dir: ''
121
+ test_mel_dir: ''
122
+ test_num: 100
123
+ test_set_name: test
124
+ train_set_name: train
125
+ train_sets: ''
126
+ upsample_ratios:
127
+ - 8
128
+ - 8
129
+ - 4
130
+ use_pitch_embed: false
131
+ use_spk_embed: false
132
+ use_spk_id: false
133
+ use_split_spk_id: false
134
+ use_wav: true
135
+ use_weight_norm: true
136
+ use_word_input: false
137
+ val_check_interval: 2000
138
+ valid_infer_interval: 10000
139
+ valid_monitor_key: val_loss
140
+ valid_monitor_mode: min
141
+ valid_set_name: valid
142
+ vocoder_denoise_c: 0.0
143
+ warmup_updates: 8000
144
+ weight_decay: 0
145
+ win_length: null
146
+ win_size: 1024
147
+ window: hann
148
+ word_size: 30000
149
+ work_dir: checkpoints/FastDiff
checkpoints/FastDiff/model_ckpt_steps_500000.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee7b6022e525c71a6025b41eeeafff9d6186b52cba76b580d6986bc8674902f3
3
+ size 183951271
checkpoints/ProDiff/config.yaml ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accumulate_grad_batches: 1
2
+ amp: false
3
+ audio_num_mel_bins: 80
4
+ audio_sample_rate: 22050
5
+ base_config:
6
+ - ./base.yaml
7
+ binarization_args:
8
+ reset_phone_dict: true
9
+ reset_word_dict: true
10
+ shuffle: false
11
+ trim_eos_bos: false
12
+ trim_sil: false
13
+ with_align: true
14
+ with_f0: true
15
+ with_f0cwt: false
16
+ with_linear: false
17
+ with_spk_embed: false
18
+ with_spk_id: true
19
+ with_txt: true
20
+ with_wav: false
21
+ with_word: true
22
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
23
+ binary_data_dir: data/binary/LJSpeech
24
+ check_val_every_n_epoch: 10
25
+ clip_grad_norm: 1
26
+ clip_grad_value: 0
27
+ conv_use_pos: false
28
+ cwt_add_f0_loss: false
29
+ cwt_hidden_size: 128
30
+ cwt_layers: 2
31
+ cwt_loss: l1
32
+ cwt_std_scale: 0.8
33
+ debug: false
34
+ dec_dilations:
35
+ - 1
36
+ - 1
37
+ - 1
38
+ - 1
39
+ dec_ffn_kernel_size: 9
40
+ dec_inp_add_noise: false
41
+ dec_kernel_size: 5
42
+ dec_layers: 4
43
+ dec_num_heads: 2
44
+ decoder_rnn_dim: 0
45
+ decoder_type: fft
46
+ dict_dir: ''
47
+ diff_decoder_type: wavenet
48
+ diff_loss_type: l1
49
+ dilation_cycle_length: 1
50
+ dropout: 0.1
51
+ ds_workers: 2
52
+ dur_enc_hidden_stride_kernel:
53
+ - 0,2,3
54
+ - 0,2,3
55
+ - 0,1,3
56
+ dur_loss: mse
57
+ dur_predictor_kernel: 3
58
+ dur_predictor_layers: 2
59
+ enc_dec_norm: ln
60
+ enc_dilations:
61
+ - 1
62
+ - 1
63
+ - 1
64
+ - 1
65
+ enc_ffn_kernel_size: 9
66
+ enc_kernel_size: 5
67
+ enc_layers: 4
68
+ encoder_K: 8
69
+ encoder_type: fft
70
+ endless_ds: true
71
+ ffn_act: gelu
72
+ ffn_hidden_size: 1024
73
+ ffn_padding: SAME
74
+ fft_size: 1024
75
+ fmax: 7600
76
+ fmin: 80
77
+ frames_multiple: 1
78
+ gen_dir_name: ''
79
+ gen_tgt_spk_id: -1
80
+ griffin_lim_iters: 60
81
+ hidden_size: 256
82
+ hop_size: 256
83
+ infer: false
84
+ keep_bins: 80
85
+ lambda_commit: 0.25
86
+ lambda_energy: 0.1
87
+ lambda_f0: 1.0
88
+ lambda_ph_dur: 0.1
89
+ lambda_sent_dur: 1.0
90
+ lambda_uv: 1.0
91
+ lambda_word_dur: 1.0
92
+ layers_in_block: 2
93
+ load_ckpt: ''
94
+ loud_norm: false
95
+ lr: 1.0
96
+ max_beta: 0.06
97
+ max_epochs: 1000
98
+ max_frames: 1548
99
+ max_input_tokens: 1550
100
+ max_sentences: 48
101
+ max_tokens: 32000
102
+ max_updates: 200000
103
+ max_valid_sentences: 1
104
+ max_valid_tokens: 60000
105
+ mel_loss: ssim:0.5|l1:0.5
106
+ mel_vmax: 1.5
107
+ mel_vmin: -6
108
+ min_frames: 0
109
+ min_level_db: -100
110
+ num_ckpt_keep: 3
111
+ num_heads: 2
112
+ num_sanity_val_steps: -1
113
+ num_spk: 1
114
+ num_test_samples: 0
115
+ num_valid_plots: 10
116
+ optimizer_adam_beta1: 0.9
117
+ optimizer_adam_beta2: 0.98
118
+ out_wav_norm: false
119
+ pitch_ar: false
120
+ pitch_embed_type: 0
121
+ pitch_enc_hidden_stride_kernel:
122
+ - 0,2,5
123
+ - 0,2,5
124
+ - 0,2,5
125
+ pitch_extractor: parselmouth
126
+ pitch_loss: l1
127
+ pitch_norm: standard
128
+ pitch_ssim_win: 11
129
+ pitch_type: frame
130
+ pre_align_args:
131
+ allow_no_txt: false
132
+ denoise: false
133
+ sox_resample: false
134
+ sox_to_wav: false
135
+ trim_sil: false
136
+ txt_processor: en
137
+ use_tone: true
138
+ pre_align_cls: ''
139
+ predictor_dropout: 0.5
140
+ predictor_grad: 0.1
141
+ predictor_hidden: -1
142
+ predictor_kernel: 5
143
+ predictor_layers: 2
144
+ pretrain_fs_ckpt: ''
145
+ print_nan_grads: false
146
+ processed_data_dir: data/processed/LJSpeech
147
+ profile_infer: false
148
+ raw_data_dir: data/raw/LJSpeech
149
+ ref_hidden_stride_kernel:
150
+ - 0,3,5
151
+ - 0,3,5
152
+ - 0,2,5
153
+ - 0,2,5
154
+ - 0,2,5
155
+ ref_level_db: 20
156
+ ref_norm_layer: bn
157
+ rename_tmux: true
158
+ residual_channels: 256
159
+ residual_layers: 20
160
+ resume_from_checkpoint: 0
161
+ save_best: true
162
+ save_codes: []
163
+ save_f0: false
164
+ save_gt: true
165
+ schedule_type: vpsde
166
+ scheduler: rsqrt
167
+ seed: 1234
168
+ sil_add_noise: false
169
+ sort_by_len: true
170
+ spec_max: []
171
+ spec_min: []
172
+ task_cls: modules.ProDiff.task.ProDiff_task.ProDiff_Task
173
+ tb_log_interval: 100
174
+ teacher_ckpt: checkpoints/ProDiff_Teacher/model_ckpt_steps_188000.ckpt
175
+ test_ids: []
176
+ test_input_dir: ''
177
+ test_num: 100
178
+ test_set_name: test
179
+ timesteps: 4
180
+ train_set_name: train
181
+ train_sets: ''
182
+ use_cond_disc: true
183
+ use_energy_embed: true
184
+ use_gt_dur: true
185
+ use_gt_f0: true
186
+ use_pitch_embed: true
187
+ use_pos_embed: true
188
+ use_ref_enc: false
189
+ use_spk_embed: false
190
+ use_spk_id: false
191
+ use_split_spk_id: false
192
+ use_uv: true
193
+ use_var_enc: false
194
+ val_check_interval: 2000
195
+ valid_infer_interval: 10000
196
+ valid_monitor_key: val_loss
197
+ valid_monitor_mode: min
198
+ valid_set_name: valid
199
+ var_enc_vq_codes: 64
200
+ vocoder_denoise_c: 0.0
201
+ warmup_updates: 2000
202
+ weight_decay: 0
203
+ win_size: 1024
204
+ word_size: 30000
205
+ work_dir: checkpoints/ProDiff
checkpoints/ProDiff/model_ckpt_steps_200000.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cc8aad355c297b010e2c362341f736b3477744af76e02f6c9965409a7e9113a
3
+ size 349055740
checkpoints/ProDiff_Teacher/config.yaml ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accumulate_grad_batches: 1
2
+ amp: false
3
+ audio_num_mel_bins: 80
4
+ audio_sample_rate: 22050
5
+ base_config:
6
+ - ./base.yaml
7
+ binarization_args:
8
+ reset_phone_dict: true
9
+ reset_word_dict: true
10
+ shuffle: false
11
+ trim_eos_bos: false
12
+ trim_sil: false
13
+ with_align: true
14
+ with_f0: true
15
+ with_f0cwt: false
16
+ with_linear: false
17
+ with_spk_embed: false
18
+ with_spk_id: true
19
+ with_txt: true
20
+ with_wav: false
21
+ with_word: true
22
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
23
+ binary_data_dir: data/binary/LJSpeech
24
+ check_val_every_n_epoch: 10
25
+ clip_grad_norm: 1
26
+ clip_grad_value: 0
27
+ conv_use_pos: false
28
+ cwt_add_f0_loss: false
29
+ cwt_hidden_size: 128
30
+ cwt_layers: 2
31
+ cwt_loss: l1
32
+ cwt_std_scale: 0.8
33
+ debug: false
34
+ dec_dilations:
35
+ - 1
36
+ - 1
37
+ - 1
38
+ - 1
39
+ dec_ffn_kernel_size: 9
40
+ dec_inp_add_noise: false
41
+ dec_kernel_size: 5
42
+ dec_layers: 4
43
+ dec_num_heads: 2
44
+ decoder_rnn_dim: 0
45
+ decoder_type: fft
46
+ dict_dir: ''
47
+ diff_decoder_type: wavenet
48
+ diff_loss_type: l1
49
+ dilation_cycle_length: 1
50
+ dropout: 0.1
51
+ ds_workers: 2
52
+ dur_enc_hidden_stride_kernel:
53
+ - 0,2,3
54
+ - 0,2,3
55
+ - 0,1,3
56
+ dur_loss: mse
57
+ dur_predictor_kernel: 3
58
+ dur_predictor_layers: 2
59
+ enc_dec_norm: ln
60
+ enc_dilations:
61
+ - 1
62
+ - 1
63
+ - 1
64
+ - 1
65
+ enc_ffn_kernel_size: 9
66
+ enc_kernel_size: 5
67
+ enc_layers: 4
68
+ encoder_K: 8
69
+ encoder_type: fft
70
+ endless_ds: true
71
+ ffn_act: gelu
72
+ ffn_hidden_size: 1024
73
+ ffn_padding: SAME
74
+ fft_size: 1024
75
+ fmax: 7600
76
+ fmin: 80
77
+ frames_multiple: 1
78
+ gen_dir_name: ''
79
+ gen_tgt_spk_id: -1
80
+ griffin_lim_iters: 60
81
+ hidden_size: 256
82
+ hop_size: 256
83
+ infer: false
84
+ keep_bins: 80
85
+ lambda_commit: 0.25
86
+ lambda_energy: 0.1
87
+ lambda_f0: 1.0
88
+ lambda_ph_dur: 0.1
89
+ lambda_sent_dur: 1.0
90
+ lambda_uv: 1.0
91
+ lambda_word_dur: 1.0
92
+ layers_in_block: 2
93
+ load_ckpt: ''
94
+ loud_norm: false
95
+ lr: 1.0
96
+ max_beta: 0.06
97
+ max_epochs: 1000
98
+ max_frames: 1548
99
+ max_input_tokens: 1550
100
+ max_sentences: 48
101
+ max_tokens: 32000
102
+ max_updates: 200000
103
+ max_valid_sentences: 1
104
+ max_valid_tokens: 60000
105
+ mel_loss: ssim:0.5|l1:0.5
106
+ mel_vmax: 1.5
107
+ mel_vmin: -6
108
+ min_frames: 0
109
+ min_level_db: -100
110
+ num_ckpt_keep: 3
111
+ num_heads: 2
112
+ num_sanity_val_steps: -1
113
+ num_spk: 1
114
+ num_test_samples: 20
115
+ num_valid_plots: 10
116
+ optimizer_adam_beta1: 0.9
117
+ optimizer_adam_beta2: 0.98
118
+ out_wav_norm: false
119
+ pitch_ar: false
120
+ pitch_embed_type: 0
121
+ pitch_enc_hidden_stride_kernel:
122
+ - 0,2,5
123
+ - 0,2,5
124
+ - 0,2,5
125
+ pitch_extractor: parselmouth
126
+ pitch_loss: l1
127
+ pitch_norm: standard
128
+ pitch_ssim_win: 11
129
+ pitch_type: frame
130
+ pre_align_args:
131
+ allow_no_txt: false
132
+ denoise: false
133
+ sox_resample: false
134
+ sox_to_wav: false
135
+ trim_sil: false
136
+ txt_processor: en
137
+ use_tone: true
138
+ pre_align_cls: egs.datasets.audio.lj.pre_align.LJPreAlign
139
+ predictor_dropout: 0.5
140
+ predictor_grad: 0.1
141
+ predictor_hidden: -1
142
+ predictor_kernel: 5
143
+ predictor_layers: 2
144
+ pretrain_fs_ckpt: ''
145
+ print_nan_grads: false
146
+ processed_data_dir: data/processed/LJSpeech
147
+ profile_infer: false
148
+ raw_data_dir: data/raw/LJSpeech
149
+ ref_hidden_stride_kernel:
150
+ - 0,3,5
151
+ - 0,3,5
152
+ - 0,2,5
153
+ - 0,2,5
154
+ - 0,2,5
155
+ ref_level_db: 20
156
+ ref_norm_layer: bn
157
+ rename_tmux: true
158
+ residual_channels: 256
159
+ residual_layers: 20
160
+ resume_from_checkpoint: 0
161
+ save_best: true
162
+ save_codes: []
163
+ save_f0: false
164
+ save_gt: true
165
+ schedule_type: vpsde
166
+ scheduler: rsqrt
167
+ seed: 1234
168
+ sil_add_noise: false
169
+ sort_by_len: true
170
+ spec_max: []
171
+ spec_min: []
172
+ task_cls: modules.ProDiff.task.ProDiff_teacher_task.ProDiff_teacher_Task
173
+ tb_log_interval: 100
174
+ test_ids: []
175
+ test_input_dir: ''
176
+ test_num: 100
177
+ test_set_name: test
178
+ timescale: 1
179
+ timesteps: 4
180
+ train_set_name: train
181
+ train_sets: ''
182
+ use_cond_disc: true
183
+ use_energy_embed: true
184
+ use_gt_dur: true
185
+ use_gt_f0: true
186
+ use_pitch_embed: true
187
+ use_pos_embed: true
188
+ use_ref_enc: false
189
+ use_spk_embed: false
190
+ use_spk_id: false
191
+ use_split_spk_id: false
192
+ use_uv: true
193
+ use_var_enc: false
194
+ val_check_interval: 2000
195
+ valid_infer_interval: 10000
196
+ valid_monitor_key: val_loss
197
+ valid_monitor_mode: min
198
+ valid_set_name: valid
199
+ var_enc_vq_codes: 64
200
+ vocoder_denoise_c: 0.0
201
+ warmup_updates: 2000
202
+ weight_decay: 0
203
+ win_size: 1024
204
+ word_size: 30000
205
+ work_dir: checkpoints/ProDiff_Teacher1
checkpoints/ProDiff_Teacher/model_ckpt_steps_188000.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d3d02a215431c69dd54c1413b9a02cdc32795e2039ad9be857b12e85c470eea
3
+ size 342252871
data/binary/LJSpeech/phone_set.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]
data/binary/LJSpeech/spk_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"SPK1": 0}
data/binary/LJSpeech/train_f0s_mean_std.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8790d5a84d77143690ae71a1f1e7fc81359e69ead263dc440366f2164c739efd
3
+ size 144