Rongjiehuang
committed on
Commit
·
1f001bb
0
Parent(s):
First model version
Browse files- .gitattributes +33 -0
- .gitignore +151 -0
- README.md +51 -0
- checkpoints/FastDiff/config.yaml +149 -0
- checkpoints/FastDiff/model_ckpt_steps_500000.ckpt +3 -0
- checkpoints/ProDiff/config.yaml +205 -0
- checkpoints/ProDiff/model_ckpt_steps_200000.ckpt +3 -0
- checkpoints/ProDiff_Teacher/config.yaml +205 -0
- checkpoints/ProDiff_Teacher/model_ckpt_steps_188000.ckpt +3 -0
- data/binary/LJSpeech/phone_set.json +1 -0
- data/binary/LJSpeech/spk_map.json +1 -0
- data/binary/LJSpeech/train_f0s_mean_std.npy +3 -0
.gitattributes
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
24 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
26 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### Project ignore
|
2 |
+
|
3 |
+
/ParallelWaveGAN
|
4 |
+
/wavegan_pretrained*
|
5 |
+
/pretrained_models
|
6 |
+
rsync
|
7 |
+
.idea
|
8 |
+
.DS_Store
|
9 |
+
bak
|
10 |
+
tmp
|
11 |
+
*.tar.gz
|
12 |
+
# mfa and kaldi
|
13 |
+
kaldi_align/exp
|
14 |
+
mfa
|
15 |
+
montreal-forced-aligner
|
16 |
+
mos
|
17 |
+
nbs
|
18 |
+
/configs_usr/*
|
19 |
+
!/configs_usr/.gitkeep
|
20 |
+
/fast_transformers
|
21 |
+
/rnnoise
|
22 |
+
/usr/*
|
23 |
+
!/usr/.gitkeep
|
24 |
+
|
25 |
+
# Created by .ignore support plugin (hsz.mobi)
|
26 |
+
### Python template
|
27 |
+
# Byte-compiled / optimized / DLL files
|
28 |
+
__pycache__/
|
29 |
+
*.py[cod]
|
30 |
+
*$py.class
|
31 |
+
|
32 |
+
# C extensions
|
33 |
+
*.so
|
34 |
+
|
35 |
+
# Distribution / packaging
|
36 |
+
.Python
|
37 |
+
build/
|
38 |
+
develop-eggs/
|
39 |
+
dist/
|
40 |
+
downloads/
|
41 |
+
eggs/
|
42 |
+
.eggs/
|
43 |
+
lib/
|
44 |
+
lib64/
|
45 |
+
parts/
|
46 |
+
sdist/
|
47 |
+
var/
|
48 |
+
wheels/
|
49 |
+
pip-wheel-metadata/
|
50 |
+
share/python-wheels/
|
51 |
+
*.egg-info/
|
52 |
+
.installed.cfg
|
53 |
+
*.egg
|
54 |
+
MANIFEST
|
55 |
+
|
56 |
+
# PyInstaller
|
57 |
+
# Usually these files are written by a python script from a template
|
58 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
59 |
+
*.manifest
|
60 |
+
*.spec
|
61 |
+
|
62 |
+
# Installer logs
|
63 |
+
pip-log.txt
|
64 |
+
pip-delete-this-directory.txt
|
65 |
+
|
66 |
+
# Unit test / coverage reports
|
67 |
+
htmlcov/
|
68 |
+
.tox/
|
69 |
+
.nox/
|
70 |
+
.coverage
|
71 |
+
.coverage.*
|
72 |
+
.cache
|
73 |
+
nosetests.xml
|
74 |
+
coverage.xml
|
75 |
+
*.cover
|
76 |
+
.hypothesis/
|
77 |
+
.pytest_cache/
|
78 |
+
|
79 |
+
# Translations
|
80 |
+
*.mo
|
81 |
+
*.pot
|
82 |
+
|
83 |
+
# Django stuff:
|
84 |
+
*.log
|
85 |
+
local_settings.py
|
86 |
+
db.sqlite3
|
87 |
+
db.sqlite3-journal
|
88 |
+
|
89 |
+
# Flask stuff:
|
90 |
+
instance/
|
91 |
+
.webassets-cache
|
92 |
+
|
93 |
+
# Scrapy stuff:
|
94 |
+
.scrapy
|
95 |
+
|
96 |
+
# Sphinx documentation
|
97 |
+
docs/_build/
|
98 |
+
|
99 |
+
# PyBuilder
|
100 |
+
target/
|
101 |
+
|
102 |
+
# Jupyter Notebook
|
103 |
+
.ipynb_checkpoints
|
104 |
+
|
105 |
+
# IPython
|
106 |
+
profile_default/
|
107 |
+
ipython_config.py
|
108 |
+
|
109 |
+
# pyenv
|
110 |
+
.python-version
|
111 |
+
|
112 |
+
# pipenv
|
113 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
114 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
115 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
116 |
+
# install all needed dependencies.
|
117 |
+
#Pipfile.lock
|
118 |
+
|
119 |
+
# celery beat schedule file
|
120 |
+
celerybeat-schedule
|
121 |
+
|
122 |
+
# SageMath parsed files
|
123 |
+
*.sage.py
|
124 |
+
|
125 |
+
# Environments
|
126 |
+
.env
|
127 |
+
.venv
|
128 |
+
env/
|
129 |
+
venv/
|
130 |
+
ENV/
|
131 |
+
env.bak/
|
132 |
+
venv.bak/
|
133 |
+
|
134 |
+
# Spyder project settings
|
135 |
+
.spyderproject
|
136 |
+
.spyproject
|
137 |
+
|
138 |
+
# Rope project settings
|
139 |
+
.ropeproject
|
140 |
+
|
141 |
+
# mkdocs documentation
|
142 |
+
/site
|
143 |
+
|
144 |
+
# mypy
|
145 |
+
.mypy_cache/
|
146 |
+
.dmypy.json
|
147 |
+
dmypy.json
|
148 |
+
|
149 |
+
# Pyre type checker
|
150 |
+
.pyre/
|
151 |
+
将删除 datasets/remi/test/
|
README.md
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: other
|
3 |
+
tags:
|
4 |
+
- text-to-speech
|
5 |
+
- neural-vocoder
|
6 |
+
inference: false
|
7 |
+
extra_gated_prompt: |-
|
8 |
+
One more step before getting this model.
|
9 |
+
This model is open access and available to all, with a license further specifying rights and usage.
|
10 |
+
|
11 |
+
Any organization or individual is prohibited from using any technology mentioned in this paper to generate someone's speech without his/her consent, including but not limited to government leaders, political figures, and celebrities. If you do not comply with this item, you could be in violation of copyright laws.
|
12 |
+
|
13 |
+
|
14 |
+
By clicking on "Access repository" below, you accept that your *contact information* (email address and username) can be shared with the model authors as well.
|
15 |
+
|
16 |
+
extra_gated_fields:
|
17 |
+
I have read the License and agree with its terms: checkbox
|
18 |
+
---
|
19 |
+
|
20 |
+
# ProDiff and FastDiff Model Card
|
21 |
+
|
22 |
+
## Key Features
|
23 |
+
- **Extremely-Fast** diffusion text-to-speech synthesis pipeline for potential **industrial deployment**.
|
24 |
+
- **Tutorial and code base** for speech diffusion models.
|
25 |
+
- Support for more **diffusion mechanisms** (e.g., guided diffusion) will be added.
|
26 |
+
|
27 |
+
|
28 |
+
## Model Details
|
29 |
+
- **Developed by:** Rongjie Huang and co-authors (see the author lists under "Cite as" below)
|
30 |
+
- **Model type:** Diffusion-based text-to-speech generation model
|
31 |
+
- **Language(s):** English
|
32 |
+
- **License:**
|
33 |
+
- **Model Description:** A conditional diffusion probabilistic model capable of generating high fidelity speech efficiently.
|
34 |
+
- **Resources for more information:** [FastDiff GitHub Repository](https://github.com/Rongjiehuang/FastDiff), [FastDiff Paper](https://arxiv.org/abs/2204.09934). [ProDiff GitHub Repository](https://github.com/Rongjiehuang/ProDiff), [ProDiff Paper](https://arxiv.org/abs/2207.06389).
|
35 |
+
- **Cite as:**
|
36 |
+
|
37 |
+
@inproceedings{huang2022prodiff,
|
38 |
+
title={ProDiff: Progressive Fast Diffusion Model For High-Quality Text-to-Speech},
|
39 |
+
author={Huang, Rongjie and Zhao, Zhou and Liu, Huadai and Liu, Jinglin and Cui, Chenye and Ren, Yi},
|
40 |
+
booktitle={Proceedings of the 30th ACM International Conference on Multimedia},
|
41 |
+
year={2022}}
|
42 |
+
|
43 |
+
@inproceedings{huang2022fastdiff,
|
44 |
+
title={FastDiff: A Fast Conditional Diffusion Model for High-Quality Speech Synthesis},
|
45 |
+
author={Huang, Rongjie and Lam, Max WY and Wang, Jun and Su, Dan and Yu, Dong and Ren, Yi and Zhao, Zhou},
|
46 |
+
booktitle = {Proceedings of the Thirty-First International Joint Conference on Artificial Intelligence, {IJCAI-22}},
|
47 |
+
year={2022}}
|
48 |
+
-
|
49 |
+
|
50 |
+
|
51 |
+
*This model card was written based on the [DALL-E Mini model card](https://huggingface.co/dalle-mini/dalle-mini).*
|
checkpoints/FastDiff/config.yaml
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
N: ''
|
2 |
+
T: 1000
|
3 |
+
accumulate_grad_batches: 1
|
4 |
+
amp: false
|
5 |
+
audio_channels: 1
|
6 |
+
audio_num_mel_bins: 80
|
7 |
+
audio_sample_rate: 22050
|
8 |
+
aux_context_window: 0
|
9 |
+
beta_0: 1.0e-06
|
10 |
+
beta_T: 0.01
|
11 |
+
binarization_args:
|
12 |
+
reset_phone_dict: true
|
13 |
+
reset_word_dict: true
|
14 |
+
shuffle: false
|
15 |
+
trim_eos_bos: false
|
16 |
+
with_align: false
|
17 |
+
with_f0: false
|
18 |
+
with_f0cwt: false
|
19 |
+
with_linear: false
|
20 |
+
with_spk_embed: false
|
21 |
+
with_spk_id: true
|
22 |
+
with_txt: false
|
23 |
+
with_wav: true
|
24 |
+
with_word: false
|
25 |
+
binarizer_cls: data_gen.tts.vocoder_binarizer.VocoderBinarizer
|
26 |
+
binary_data_dir: data/binary/LJSpeech
|
27 |
+
check_val_every_n_epoch: 10
|
28 |
+
clip_grad_norm: 1
|
29 |
+
clip_grad_value: 0
|
30 |
+
cond_channels: 80
|
31 |
+
debug: false
|
32 |
+
dec_ffn_kernel_size: 9
|
33 |
+
dec_layers: 4
|
34 |
+
dict_dir: ''
|
35 |
+
diffusion_step_embed_dim_in: 128
|
36 |
+
diffusion_step_embed_dim_mid: 512
|
37 |
+
diffusion_step_embed_dim_out: 512
|
38 |
+
disc_start_steps: 40000
|
39 |
+
discriminator_grad_norm: 1
|
40 |
+
dropout: 0.0
|
41 |
+
ds_workers: 1
|
42 |
+
enc_ffn_kernel_size: 9
|
43 |
+
enc_layers: 4
|
44 |
+
endless_ds: true
|
45 |
+
eval_max_batches: -1
|
46 |
+
ffn_act: gelu
|
47 |
+
ffn_padding: SAME
|
48 |
+
fft_size: 1024
|
49 |
+
fmax: 7600
|
50 |
+
fmin: 80
|
51 |
+
frames_multiple: 1
|
52 |
+
gen_dir_name: ''
|
53 |
+
generator_grad_norm: 10
|
54 |
+
griffin_lim_iters: 60
|
55 |
+
hidden_size: 256
|
56 |
+
hop_size: 256
|
57 |
+
infer: false
|
58 |
+
inner_channels: 32
|
59 |
+
kpnet_conv_size: 3
|
60 |
+
kpnet_hidden_channels: 64
|
61 |
+
load_ckpt: ''
|
62 |
+
loud_norm: false
|
63 |
+
lr: 2e-4
|
64 |
+
lvc_kernel_size: 3
|
65 |
+
lvc_layers_each_block: 4
|
66 |
+
max_epochs: 1000
|
67 |
+
max_frames: 1548
|
68 |
+
max_input_tokens: 1550
|
69 |
+
max_samples: 25600
|
70 |
+
max_sentences: 20
|
71 |
+
max_tokens: 30000
|
72 |
+
max_updates: 1000000
|
73 |
+
max_valid_sentences: 1
|
74 |
+
max_valid_tokens: 60000
|
75 |
+
mel_loss: l1
|
76 |
+
mel_vmax: 1.5
|
77 |
+
mel_vmin: -6
|
78 |
+
mfa_version: 2
|
79 |
+
min_frames: 0
|
80 |
+
min_level_db: -100
|
81 |
+
noise_schedule: ''
|
82 |
+
num_ckpt_keep: 3
|
83 |
+
num_heads: 2
|
84 |
+
num_mels: 80
|
85 |
+
num_sanity_val_steps: -1
|
86 |
+
num_spk: 400
|
87 |
+
num_test_samples: 0
|
88 |
+
num_valid_plots: 10
|
89 |
+
optimizer_adam_beta1: 0.9
|
90 |
+
optimizer_adam_beta2: 0.98
|
91 |
+
out_wav_norm: false
|
92 |
+
pitch_extractor: parselmouth
|
93 |
+
pre_align_args:
|
94 |
+
allow_no_txt: false
|
95 |
+
denoise: false
|
96 |
+
nsample_per_mfa_group: 1000
|
97 |
+
sox_resample: false
|
98 |
+
sox_to_wav: false
|
99 |
+
trim_sil: false
|
100 |
+
txt_processor: en
|
101 |
+
use_tone: true
|
102 |
+
pre_align_cls: egs.datasets.audio.pre_align.PreAlign
|
103 |
+
print_nan_grads: false
|
104 |
+
processed_data_dir: data/processed/LJSpeech
|
105 |
+
profile_infer: false
|
106 |
+
raw_data_dir: data/raw/LJSpeech-1.1
|
107 |
+
ref_level_db: 20
|
108 |
+
rename_tmux: true
|
109 |
+
resume_from_checkpoint: 0
|
110 |
+
save_best: true
|
111 |
+
save_codes: []
|
112 |
+
save_f0: false
|
113 |
+
save_gt: true
|
114 |
+
scheduler: rsqrt
|
115 |
+
seed: 1234
|
116 |
+
sort_by_len: true
|
117 |
+
task_cls: modules.FastDiff.task.FastDiff.FastDiffTask
|
118 |
+
tb_log_interval: 100
|
119 |
+
test_ids: []
|
120 |
+
test_input_dir: ''
|
121 |
+
test_mel_dir: ''
|
122 |
+
test_num: 100
|
123 |
+
test_set_name: test
|
124 |
+
train_set_name: train
|
125 |
+
train_sets: ''
|
126 |
+
upsample_ratios:
|
127 |
+
- 8
|
128 |
+
- 8
|
129 |
+
- 4
|
130 |
+
use_pitch_embed: false
|
131 |
+
use_spk_embed: false
|
132 |
+
use_spk_id: false
|
133 |
+
use_split_spk_id: false
|
134 |
+
use_wav: true
|
135 |
+
use_weight_norm: true
|
136 |
+
use_word_input: false
|
137 |
+
val_check_interval: 2000
|
138 |
+
valid_infer_interval: 10000
|
139 |
+
valid_monitor_key: val_loss
|
140 |
+
valid_monitor_mode: min
|
141 |
+
valid_set_name: valid
|
142 |
+
vocoder_denoise_c: 0.0
|
143 |
+
warmup_updates: 8000
|
144 |
+
weight_decay: 0
|
145 |
+
win_length: null
|
146 |
+
win_size: 1024
|
147 |
+
window: hann
|
148 |
+
word_size: 30000
|
149 |
+
work_dir: checkpoints/FastDiff
|
checkpoints/FastDiff/model_ckpt_steps_500000.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ee7b6022e525c71a6025b41eeeafff9d6186b52cba76b580d6986bc8674902f3
|
3 |
+
size 183951271
|
checkpoints/ProDiff/config.yaml
ADDED
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accumulate_grad_batches: 1
|
2 |
+
amp: false
|
3 |
+
audio_num_mel_bins: 80
|
4 |
+
audio_sample_rate: 22050
|
5 |
+
base_config:
|
6 |
+
- ./base.yaml
|
7 |
+
binarization_args:
|
8 |
+
reset_phone_dict: true
|
9 |
+
reset_word_dict: true
|
10 |
+
shuffle: false
|
11 |
+
trim_eos_bos: false
|
12 |
+
trim_sil: false
|
13 |
+
with_align: true
|
14 |
+
with_f0: true
|
15 |
+
with_f0cwt: false
|
16 |
+
with_linear: false
|
17 |
+
with_spk_embed: false
|
18 |
+
with_spk_id: true
|
19 |
+
with_txt: true
|
20 |
+
with_wav: false
|
21 |
+
with_word: true
|
22 |
+
binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
|
23 |
+
binary_data_dir: data/binary/LJSpeech
|
24 |
+
check_val_every_n_epoch: 10
|
25 |
+
clip_grad_norm: 1
|
26 |
+
clip_grad_value: 0
|
27 |
+
conv_use_pos: false
|
28 |
+
cwt_add_f0_loss: false
|
29 |
+
cwt_hidden_size: 128
|
30 |
+
cwt_layers: 2
|
31 |
+
cwt_loss: l1
|
32 |
+
cwt_std_scale: 0.8
|
33 |
+
debug: false
|
34 |
+
dec_dilations:
|
35 |
+
- 1
|
36 |
+
- 1
|
37 |
+
- 1
|
38 |
+
- 1
|
39 |
+
dec_ffn_kernel_size: 9
|
40 |
+
dec_inp_add_noise: false
|
41 |
+
dec_kernel_size: 5
|
42 |
+
dec_layers: 4
|
43 |
+
dec_num_heads: 2
|
44 |
+
decoder_rnn_dim: 0
|
45 |
+
decoder_type: fft
|
46 |
+
dict_dir: ''
|
47 |
+
diff_decoder_type: wavenet
|
48 |
+
diff_loss_type: l1
|
49 |
+
dilation_cycle_length: 1
|
50 |
+
dropout: 0.1
|
51 |
+
ds_workers: 2
|
52 |
+
dur_enc_hidden_stride_kernel:
|
53 |
+
- 0,2,3
|
54 |
+
- 0,2,3
|
55 |
+
- 0,1,3
|
56 |
+
dur_loss: mse
|
57 |
+
dur_predictor_kernel: 3
|
58 |
+
dur_predictor_layers: 2
|
59 |
+
enc_dec_norm: ln
|
60 |
+
enc_dilations:
|
61 |
+
- 1
|
62 |
+
- 1
|
63 |
+
- 1
|
64 |
+
- 1
|
65 |
+
enc_ffn_kernel_size: 9
|
66 |
+
enc_kernel_size: 5
|
67 |
+
enc_layers: 4
|
68 |
+
encoder_K: 8
|
69 |
+
encoder_type: fft
|
70 |
+
endless_ds: true
|
71 |
+
ffn_act: gelu
|
72 |
+
ffn_hidden_size: 1024
|
73 |
+
ffn_padding: SAME
|
74 |
+
fft_size: 1024
|
75 |
+
fmax: 7600
|
76 |
+
fmin: 80
|
77 |
+
frames_multiple: 1
|
78 |
+
gen_dir_name: ''
|
79 |
+
gen_tgt_spk_id: -1
|
80 |
+
griffin_lim_iters: 60
|
81 |
+
hidden_size: 256
|
82 |
+
hop_size: 256
|
83 |
+
infer: false
|
84 |
+
keep_bins: 80
|
85 |
+
lambda_commit: 0.25
|
86 |
+
lambda_energy: 0.1
|
87 |
+
lambda_f0: 1.0
|
88 |
+
lambda_ph_dur: 0.1
|
89 |
+
lambda_sent_dur: 1.0
|
90 |
+
lambda_uv: 1.0
|
91 |
+
lambda_word_dur: 1.0
|
92 |
+
layers_in_block: 2
|
93 |
+
load_ckpt: ''
|
94 |
+
loud_norm: false
|
95 |
+
lr: 1.0
|
96 |
+
max_beta: 0.06
|
97 |
+
max_epochs: 1000
|
98 |
+
max_frames: 1548
|
99 |
+
max_input_tokens: 1550
|
100 |
+
max_sentences: 48
|
101 |
+
max_tokens: 32000
|
102 |
+
max_updates: 200000
|
103 |
+
max_valid_sentences: 1
|
104 |
+
max_valid_tokens: 60000
|
105 |
+
mel_loss: ssim:0.5|l1:0.5
|
106 |
+
mel_vmax: 1.5
|
107 |
+
mel_vmin: -6
|
108 |
+
min_frames: 0
|
109 |
+
min_level_db: -100
|
110 |
+
num_ckpt_keep: 3
|
111 |
+
num_heads: 2
|
112 |
+
num_sanity_val_steps: -1
|
113 |
+
num_spk: 1
|
114 |
+
num_test_samples: 0
|
115 |
+
num_valid_plots: 10
|
116 |
+
optimizer_adam_beta1: 0.9
|
117 |
+
optimizer_adam_beta2: 0.98
|
118 |
+
out_wav_norm: false
|
119 |
+
pitch_ar: false
|
120 |
+
pitch_embed_type: 0
|
121 |
+
pitch_enc_hidden_stride_kernel:
|
122 |
+
- 0,2,5
|
123 |
+
- 0,2,5
|
124 |
+
- 0,2,5
|
125 |
+
pitch_extractor: parselmouth
|
126 |
+
pitch_loss: l1
|
127 |
+
pitch_norm: standard
|
128 |
+
pitch_ssim_win: 11
|
129 |
+
pitch_type: frame
|
130 |
+
pre_align_args:
|
131 |
+
allow_no_txt: false
|
132 |
+
denoise: false
|
133 |
+
sox_resample: false
|
134 |
+
sox_to_wav: false
|
135 |
+
trim_sil: false
|
136 |
+
txt_processor: en
|
137 |
+
use_tone: true
|
138 |
+
pre_align_cls: ''
|
139 |
+
predictor_dropout: 0.5
|
140 |
+
predictor_grad: 0.1
|
141 |
+
predictor_hidden: -1
|
142 |
+
predictor_kernel: 5
|
143 |
+
predictor_layers: 2
|
144 |
+
pretrain_fs_ckpt: ''
|
145 |
+
print_nan_grads: false
|
146 |
+
processed_data_dir: data/processed/LJSpeech
|
147 |
+
profile_infer: false
|
148 |
+
raw_data_dir: data/raw/LJSpeech
|
149 |
+
ref_hidden_stride_kernel:
|
150 |
+
- 0,3,5
|
151 |
+
- 0,3,5
|
152 |
+
- 0,2,5
|
153 |
+
- 0,2,5
|
154 |
+
- 0,2,5
|
155 |
+
ref_level_db: 20
|
156 |
+
ref_norm_layer: bn
|
157 |
+
rename_tmux: true
|
158 |
+
residual_channels: 256
|
159 |
+
residual_layers: 20
|
160 |
+
resume_from_checkpoint: 0
|
161 |
+
save_best: true
|
162 |
+
save_codes: []
|
163 |
+
save_f0: false
|
164 |
+
save_gt: true
|
165 |
+
schedule_type: vpsde
|
166 |
+
scheduler: rsqrt
|
167 |
+
seed: 1234
|
168 |
+
sil_add_noise: false
|
169 |
+
sort_by_len: true
|
170 |
+
spec_max: []
|
171 |
+
spec_min: []
|
172 |
+
task_cls: modules.ProDiff.task.ProDiff_task.ProDiff_Task
|
173 |
+
tb_log_interval: 100
|
174 |
+
teacher_ckpt: checkpoints/ProDiff_Teacher/model_ckpt_steps_188000.ckpt
|
175 |
+
test_ids: []
|
176 |
+
test_input_dir: ''
|
177 |
+
test_num: 100
|
178 |
+
test_set_name: test
|
179 |
+
timesteps: 4
|
180 |
+
train_set_name: train
|
181 |
+
train_sets: ''
|
182 |
+
use_cond_disc: true
|
183 |
+
use_energy_embed: true
|
184 |
+
use_gt_dur: true
|
185 |
+
use_gt_f0: true
|
186 |
+
use_pitch_embed: true
|
187 |
+
use_pos_embed: true
|
188 |
+
use_ref_enc: false
|
189 |
+
use_spk_embed: false
|
190 |
+
use_spk_id: false
|
191 |
+
use_split_spk_id: false
|
192 |
+
use_uv: true
|
193 |
+
use_var_enc: false
|
194 |
+
val_check_interval: 2000
|
195 |
+
valid_infer_interval: 10000
|
196 |
+
valid_monitor_key: val_loss
|
197 |
+
valid_monitor_mode: min
|
198 |
+
valid_set_name: valid
|
199 |
+
var_enc_vq_codes: 64
|
200 |
+
vocoder_denoise_c: 0.0
|
201 |
+
warmup_updates: 2000
|
202 |
+
weight_decay: 0
|
203 |
+
win_size: 1024
|
204 |
+
word_size: 30000
|
205 |
+
work_dir: checkpoints/ProDiff
|
checkpoints/ProDiff/model_ckpt_steps_200000.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8cc8aad355c297b010e2c362341f736b3477744af76e02f6c9965409a7e9113a
|
3 |
+
size 349055740
|
checkpoints/ProDiff_Teacher/config.yaml
ADDED
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accumulate_grad_batches: 1
|
2 |
+
amp: false
|
3 |
+
audio_num_mel_bins: 80
|
4 |
+
audio_sample_rate: 22050
|
5 |
+
base_config:
|
6 |
+
- ./base.yaml
|
7 |
+
binarization_args:
|
8 |
+
reset_phone_dict: true
|
9 |
+
reset_word_dict: true
|
10 |
+
shuffle: false
|
11 |
+
trim_eos_bos: false
|
12 |
+
trim_sil: false
|
13 |
+
with_align: true
|
14 |
+
with_f0: true
|
15 |
+
with_f0cwt: false
|
16 |
+
with_linear: false
|
17 |
+
with_spk_embed: false
|
18 |
+
with_spk_id: true
|
19 |
+
with_txt: true
|
20 |
+
with_wav: false
|
21 |
+
with_word: true
|
22 |
+
binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
|
23 |
+
binary_data_dir: data/binary/LJSpeech
|
24 |
+
check_val_every_n_epoch: 10
|
25 |
+
clip_grad_norm: 1
|
26 |
+
clip_grad_value: 0
|
27 |
+
conv_use_pos: false
|
28 |
+
cwt_add_f0_loss: false
|
29 |
+
cwt_hidden_size: 128
|
30 |
+
cwt_layers: 2
|
31 |
+
cwt_loss: l1
|
32 |
+
cwt_std_scale: 0.8
|
33 |
+
debug: false
|
34 |
+
dec_dilations:
|
35 |
+
- 1
|
36 |
+
- 1
|
37 |
+
- 1
|
38 |
+
- 1
|
39 |
+
dec_ffn_kernel_size: 9
|
40 |
+
dec_inp_add_noise: false
|
41 |
+
dec_kernel_size: 5
|
42 |
+
dec_layers: 4
|
43 |
+
dec_num_heads: 2
|
44 |
+
decoder_rnn_dim: 0
|
45 |
+
decoder_type: fft
|
46 |
+
dict_dir: ''
|
47 |
+
diff_decoder_type: wavenet
|
48 |
+
diff_loss_type: l1
|
49 |
+
dilation_cycle_length: 1
|
50 |
+
dropout: 0.1
|
51 |
+
ds_workers: 2
|
52 |
+
dur_enc_hidden_stride_kernel:
|
53 |
+
- 0,2,3
|
54 |
+
- 0,2,3
|
55 |
+
- 0,1,3
|
56 |
+
dur_loss: mse
|
57 |
+
dur_predictor_kernel: 3
|
58 |
+
dur_predictor_layers: 2
|
59 |
+
enc_dec_norm: ln
|
60 |
+
enc_dilations:
|
61 |
+
- 1
|
62 |
+
- 1
|
63 |
+
- 1
|
64 |
+
- 1
|
65 |
+
enc_ffn_kernel_size: 9
|
66 |
+
enc_kernel_size: 5
|
67 |
+
enc_layers: 4
|
68 |
+
encoder_K: 8
|
69 |
+
encoder_type: fft
|
70 |
+
endless_ds: true
|
71 |
+
ffn_act: gelu
|
72 |
+
ffn_hidden_size: 1024
|
73 |
+
ffn_padding: SAME
|
74 |
+
fft_size: 1024
|
75 |
+
fmax: 7600
|
76 |
+
fmin: 80
|
77 |
+
frames_multiple: 1
|
78 |
+
gen_dir_name: ''
|
79 |
+
gen_tgt_spk_id: -1
|
80 |
+
griffin_lim_iters: 60
|
81 |
+
hidden_size: 256
|
82 |
+
hop_size: 256
|
83 |
+
infer: false
|
84 |
+
keep_bins: 80
|
85 |
+
lambda_commit: 0.25
|
86 |
+
lambda_energy: 0.1
|
87 |
+
lambda_f0: 1.0
|
88 |
+
lambda_ph_dur: 0.1
|
89 |
+
lambda_sent_dur: 1.0
|
90 |
+
lambda_uv: 1.0
|
91 |
+
lambda_word_dur: 1.0
|
92 |
+
layers_in_block: 2
|
93 |
+
load_ckpt: ''
|
94 |
+
loud_norm: false
|
95 |
+
lr: 1.0
|
96 |
+
max_beta: 0.06
|
97 |
+
max_epochs: 1000
|
98 |
+
max_frames: 1548
|
99 |
+
max_input_tokens: 1550
|
100 |
+
max_sentences: 48
|
101 |
+
max_tokens: 32000
|
102 |
+
max_updates: 200000
|
103 |
+
max_valid_sentences: 1
|
104 |
+
max_valid_tokens: 60000
|
105 |
+
mel_loss: ssim:0.5|l1:0.5
|
106 |
+
mel_vmax: 1.5
|
107 |
+
mel_vmin: -6
|
108 |
+
min_frames: 0
|
109 |
+
min_level_db: -100
|
110 |
+
num_ckpt_keep: 3
|
111 |
+
num_heads: 2
|
112 |
+
num_sanity_val_steps: -1
|
113 |
+
num_spk: 1
|
114 |
+
num_test_samples: 20
|
115 |
+
num_valid_plots: 10
|
116 |
+
optimizer_adam_beta1: 0.9
|
117 |
+
optimizer_adam_beta2: 0.98
|
118 |
+
out_wav_norm: false
|
119 |
+
pitch_ar: false
|
120 |
+
pitch_embed_type: 0
|
121 |
+
pitch_enc_hidden_stride_kernel:
|
122 |
+
- 0,2,5
|
123 |
+
- 0,2,5
|
124 |
+
- 0,2,5
|
125 |
+
pitch_extractor: parselmouth
|
126 |
+
pitch_loss: l1
|
127 |
+
pitch_norm: standard
|
128 |
+
pitch_ssim_win: 11
|
129 |
+
pitch_type: frame
|
130 |
+
pre_align_args:
|
131 |
+
allow_no_txt: false
|
132 |
+
denoise: false
|
133 |
+
sox_resample: false
|
134 |
+
sox_to_wav: false
|
135 |
+
trim_sil: false
|
136 |
+
txt_processor: en
|
137 |
+
use_tone: true
|
138 |
+
pre_align_cls: egs.datasets.audio.lj.pre_align.LJPreAlign
|
139 |
+
predictor_dropout: 0.5
|
140 |
+
predictor_grad: 0.1
|
141 |
+
predictor_hidden: -1
|
142 |
+
predictor_kernel: 5
|
143 |
+
predictor_layers: 2
|
144 |
+
pretrain_fs_ckpt: ''
|
145 |
+
print_nan_grads: false
|
146 |
+
processed_data_dir: data/processed/LJSpeech
|
147 |
+
profile_infer: false
|
148 |
+
raw_data_dir: data/raw/LJSpeech
|
149 |
+
ref_hidden_stride_kernel:
|
150 |
+
- 0,3,5
|
151 |
+
- 0,3,5
|
152 |
+
- 0,2,5
|
153 |
+
- 0,2,5
|
154 |
+
- 0,2,5
|
155 |
+
ref_level_db: 20
|
156 |
+
ref_norm_layer: bn
|
157 |
+
rename_tmux: true
|
158 |
+
residual_channels: 256
|
159 |
+
residual_layers: 20
|
160 |
+
resume_from_checkpoint: 0
|
161 |
+
save_best: true
|
162 |
+
save_codes: []
|
163 |
+
save_f0: false
|
164 |
+
save_gt: true
|
165 |
+
schedule_type: vpsde
|
166 |
+
scheduler: rsqrt
|
167 |
+
seed: 1234
|
168 |
+
sil_add_noise: false
|
169 |
+
sort_by_len: true
|
170 |
+
spec_max: []
|
171 |
+
spec_min: []
|
172 |
+
task_cls: modules.ProDiff.task.ProDiff_teacher_task.ProDiff_teacher_Task
|
173 |
+
tb_log_interval: 100
|
174 |
+
test_ids: []
|
175 |
+
test_input_dir: ''
|
176 |
+
test_num: 100
|
177 |
+
test_set_name: test
|
178 |
+
timescale: 1
|
179 |
+
timesteps: 4
|
180 |
+
train_set_name: train
|
181 |
+
train_sets: ''
|
182 |
+
use_cond_disc: true
|
183 |
+
use_energy_embed: true
|
184 |
+
use_gt_dur: true
|
185 |
+
use_gt_f0: true
|
186 |
+
use_pitch_embed: true
|
187 |
+
use_pos_embed: true
|
188 |
+
use_ref_enc: false
|
189 |
+
use_spk_embed: false
|
190 |
+
use_spk_id: false
|
191 |
+
use_split_spk_id: false
|
192 |
+
use_uv: true
|
193 |
+
use_var_enc: false
|
194 |
+
val_check_interval: 2000
|
195 |
+
valid_infer_interval: 10000
|
196 |
+
valid_monitor_key: val_loss
|
197 |
+
valid_monitor_mode: min
|
198 |
+
valid_set_name: valid
|
199 |
+
var_enc_vq_codes: 64
|
200 |
+
vocoder_denoise_c: 0.0
|
201 |
+
warmup_updates: 2000
|
202 |
+
weight_decay: 0
|
203 |
+
win_size: 1024
|
204 |
+
word_size: 30000
|
205 |
+
work_dir: checkpoints/ProDiff_Teacher1
|
checkpoints/ProDiff_Teacher/model_ckpt_steps_188000.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5d3d02a215431c69dd54c1413b9a02cdc32795e2039ad9be857b12e85c470eea
|
3 |
+
size 342252871
|
data/binary/LJSpeech/phone_set.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]
|
data/binary/LJSpeech/spk_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"SPK1": 0}
|
data/binary/LJSpeech/train_f0s_mean_std.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8790d5a84d77143690ae71a1f1e7fc81359e69ead263dc440366f2164c739efd
|
3 |
+
size 144
|