VITS checkpoint trained on Hi-Fi TTS

#1
README.md CHANGED
@@ -1,3 +1,46 @@
1
  ---
2
  license: mit
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: mit
3
+ language:
4
+ - en
5
  ---
6
+
7
+ # Amphion Multi-Speaker TTS Pre-trained Model
8
+ ## Quick Start
9
+ We provide the pre-trained checkpoint of [VITS](https://github.com/open-mmlab/Amphion/tree/main/egs/tts/VITS), trained on [Hi-Fi TTS](https://www.openslr.org/109/), which consists of a total of 291.6 hours of audio contributed by 10 speakers, with at least 17 hours per speaker.
10
+ To utilize the pre-trained model, run the following commands:
11
+
12
+ ### Step1: Download the checkpoint
13
+ ```bash
14
+ git lfs install
15
+ git clone https://huggingface.co/amphion/vits_hifitts
16
+ ```
17
+
18
+ ### Step2: Clone Amphion's Source Code from GitHub
19
+ ```bash
20
+ git clone https://github.com/open-mmlab/Amphion.git
21
+ ```
22
+
23
+ ### Step3: Specify the checkpoint's path
24
+ Create a soft link to the checkpoint downloaded in the first step:
25
+
26
+ ```bash
27
+ cd Amphion
28
+ mkdir -p ckpts/tts
29
+ ln -s ../../../vits_hifitts ckpts/tts/
30
+ ```
31
+
32
+ ### Step4: Inference
33
+
34
+ You can follow the inference part of this [recipe](https://github.com/open-mmlab/Amphion/tree/main/egs/tts/VITS#4-inference) to generate speech from text. For example, if you want to synthesize a clip of speech with the text "This is a clip of generated speech with the given text from a TTS model.", just run:
35
+
36
+ ```bash
37
+ sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \
38
+ --config ckpts/tts/vits_hifitts/args.json \
39
+ --infer_expt_dir ckpts/tts/vits_hifitts/ \
40
+ --infer_output_dir ckpts/tts/vits_hifitts/result \
41
+ --infer_mode "single" \
42
+ --infer_text "This is a clip of generated speech with the given text from a TTS model." \
43
+ --infer_speaker_name "hifitts_92"
44
+ ```
45
+
46
+ **Note**: The supported `infer_speaker_name` values are listed in [spk2id.json](https://huggingface.co/amphion/vits_hifitts/blob/main/spk2id.json).
args.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/vits.json",
3
+ "dataset": [
4
+ "hifitts",
5
+ ],
6
+ "model": {
7
+ "filter_channels": 768,
8
+ "gin_channels": 256,
9
+ "hidden_channels": 192,
10
+ "inter_channels": 192,
11
+ "kernel_size": 3,
12
+ "n_heads": 2,
13
+ "n_layers": 6,
14
+ "n_layers_q": 3,
15
+ "n_speakers": 10,
16
+ "p_dropout": 0.1,
17
+ "resblock": "1",
18
+ "resblock_dilation_sizes": [
19
+ [
20
+ 1,
21
+ 3,
22
+ 5,
23
+ ],
24
+ [
25
+ 1,
26
+ 3,
27
+ 5,
28
+ ],
29
+ [
30
+ 1,
31
+ 3,
32
+ 5,
33
+ ],
34
+ ],
35
+ "resblock_kernel_sizes": [
36
+ 3,
37
+ 7,
38
+ 11,
39
+ ],
40
+ "text_token_num": 512,
41
+ "upsample_initial_channel": 512,
42
+ "upsample_kernel_sizes": [
43
+ 16,
44
+ 16,
45
+ 4,
46
+ 4,
47
+ ],
48
+ "upsample_rates": [
49
+ 8,
50
+ 8,
51
+ 2,
52
+ 2,
53
+ ],
54
+ "use_sdp": true,
55
+ "use_spectral_norm": false,
56
+ },
57
+ "model_type": "VITS",
58
+ "preprocess": {
59
+ "add_blank": true,
60
+ "align_mel_duration": false,
61
+ "audio_dir": "audios",
62
+ "bits": 8,
63
+ "contentvec_dir": "contentvec",
64
+ "data_augment": false,
65
+ "dur_dir": "durs",
66
+ "duration_dir": "duration",
67
+ "emo2id": "emo2id.json",
68
+ "energy_dir": "energys",
69
+ "energy_extract_mode": "from_mel",
70
+ "energy_norm": false,
71
+ "energy_remove_outlier": false,
72
+ "extract_acoustic_token": false,
73
+ "extract_amplitude_phase": false,
74
+ "extract_audio": true,
75
+ "extract_contentvec_feature": false,
76
+ "extract_duration": false,
77
+ "extract_energy": false,
78
+ "extract_label": false,
79
+ "extract_linear_spec": true,
80
+ "extract_mcep": false,
81
+ "extract_mel": true,
82
+ "extract_mert_feature": false,
83
+ "extract_phone": true,
84
+ "extract_pitch": false,
85
+ "extract_uv": false,
86
+ "extract_wenet_feature": false,
87
+ "extract_whisper_feature": false,
88
+ "file_lst": "file.lst",
89
+ "fmax": null,
90
+ "fmin": 0,
91
+ "hop_size": 256,
92
+ "imaginary_dir": "imaginarys",
93
+ "lab_dir": "labs",
94
+ "label_dir": "labels",
95
+ "language": "en-us",
96
+ "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
97
+ "linear_dir": "linears",
98
+ "log_amplitude_dir": "log_amplitudes",
99
+ "mcep_dir": "mcep",
100
+ "mel_dir": "mels",
101
+ "mel_extract_mode": "",
102
+ "mel_min_max_norm": false,
103
+ "min_level_db": -115,
104
+ "n_fft": 1024,
105
+ "n_mel": 80,
106
+ "num_silent_frames": 8,
107
+ "phase_dir": "phases",
108
+ "phone_dir": "phones",
109
+ "phone_energy_dir": "phone_energys",
110
+ "phone_extractor": "espeak",
111
+ "phone_pitch_dir": "phone_pitches",
112
+ "phone_seq_file": "phone_seq_file",
113
+ "pitch_dir": "pitches",
114
+ "pitch_extractor": "parselmouth",
115
+ "pitch_norm": false,
116
+ "pitch_remove_outlier": false,
117
+ "raw_data": "raw_data",
118
+ "real_dir": "reals",
119
+ "ref_level_db": 20,
120
+ "sample_rate": 24000,
121
+ "segment_size": 8192,
122
+ "spk2id": "spk2id.json",
123
+ "symbols_dict": "symbols.dict",
124
+ "text_cleaners": [
125
+ "english_cleaners",
126
+ ],
127
+ "train_file": "train.json",
128
+ "trim_fft_size": 512,
129
+ "trim_hop_size": 128,
130
+ "trim_silence": false,
131
+ "trim_top_db": 30,
132
+ "trimmed_wav_dir": "trimmed_wavs",
133
+ "use_amplitude_phase": false,
134
+ "use_audio": true,
135
+ "use_dur": false,
136
+ "use_emoid": false,
137
+ "use_frame_duration": false,
138
+ "use_frame_energy": false,
139
+ "use_frame_pitch": false,
140
+ "use_lab": false,
141
+ "use_label": false,
142
+ "use_linear": true,
143
+ "use_log_scale_energy": false,
144
+ "use_log_scale_pitch": false,
145
+ "use_mel": true,
146
+ "use_min_max_norm_mel": false,
147
+ "use_one_hot": false,
148
+ "use_phn_seq": false,
149
+ "use_phone": true,
150
+ "use_phone_duration": false,
151
+ "use_phone_energy": false,
152
+ "use_phone_pitch": false,
153
+ "use_spkid": true,
154
+ "use_text": false,
155
+ "use_uv": false,
156
+ "use_wav": false,
157
+ "use_wenet": false,
158
+ "utt2emo": "utt2emo",
159
+ "utt2spk": "utt2spk",
160
+ "uv_dir": "uvs",
161
+ "valid_file": "valid.json",
162
+ "wav_dir": "wavs",
163
+ "wenet_dir": "wenet",
164
+ "win_size": 1024,
165
+ },
166
+ "supported_model_type": [
167
+ "Fastspeech2",
168
+ "VITS",
169
+ "VALLE",
170
+ ],
171
+ "task_type": "tts",
172
+ "train": {
173
+ "AdamW": {
174
+ "betas": [
175
+ 0.8,
176
+ 0.99,
177
+ ],
178
+ "eps": 1e-09,
179
+ },
180
+ "adamw": {
181
+ "lr": 0.0004,
182
+ },
183
+ "batch_size": 16,
184
+ "betas": [
185
+ 0.8,
186
+ 0.99,
187
+ ],
188
+ "c_kl": 1.0,
189
+ "c_mel": 45,
190
+ "dataloader": {
191
+ "num_worker": 32,
192
+ "pin_memory": true,
193
+ },
194
+ "ddp": true,
195
+ "eps": 1e-09,
196
+ "fp16_run": true,
197
+ "gradient_accumulation_step": 1,
198
+ "init_lr_ratio": 1,
199
+ "keep_checkpoint_max": 5,
200
+ "keep_last": [
201
+ 3,
202
+ -1,
203
+ ],
204
+ "learning_rate": 0.0002,
205
+ "lr_decay": 0.999875,
206
+ "max_epoch": -1,
207
+ "max_steps": 1000000,
208
+ "multi_speaker_training": true,
209
+ "optimizer": "AdamW",
210
+ "random_seed": 10086,
211
+ "reducelronplateau": {
212
+ "factor": 0.8,
213
+ "min_lr": 0.0001,
214
+ "patience": 10,
215
+ },
216
+ "run_eval": [
217
+ false,
218
+ true,
219
+ ],
220
+ "sampler": {
221
+ "drop_last": true,
222
+ "holistic_shuffle": true,
223
+ },
224
+ "save_checkpoint_stride": [
225
+ 5,
226
+ 20,
227
+ ],
228
+ "save_checkpoints_steps": 10000,
229
+ "save_summary_steps": 500,
230
+ "scheduler": "ReduceLROnPlateau",
231
+ "total_training_steps": 50000,
232
+ "tracker": [
233
+ "tensorboard",
234
+ ],
235
+ "valid_interval": 10000,
236
+ "warmup_epochs": 0,
237
+ },
238
+ "use_custom_dataset": false,
239
+ }
checkpoint/epoch-0030_step-0312356_loss-38.448391/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd7ca5a98e57292908a7749488dfa1bee82e1f9cf560ec999906bdb72f03cce4
3
+ size 159044848
checkpoint/epoch-0030_step-0312356_loss-38.448391/model_1.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a35e0287f33fe21c2234fbab466fac8659bfa5759bf5914b873746a42308f916
3
+ size 187000096
checkpoint/epoch-0030_step-0312356_loss-38.448391/optimizer.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d23f14adfa8137ca14c9a7493556cc00848a169e11dca0b4b8bb182b711760c
3
+ size 318631531
checkpoint/epoch-0030_step-0312356_loss-38.448391/optimizer_1.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c687b8da88c7e166df5376dd5826837501ef848366b43270257e521478de5331
3
+ size 374071331
checkpoint/epoch-0030_step-0312356_loss-38.448391/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9365e6ef62ff302c74e6a0c36a5d057ff5879a317b20e2da80246dfd03e356f4
3
+ size 15691
checkpoint/epoch-0030_step-0312356_loss-38.448391/scheduler.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3790f159ec2bc847509acad910fae392b7bb974fa7dc7a2e52a5108e24b2484b
3
+ size 563
checkpoint/epoch-0030_step-0312356_loss-38.448391/scheduler_1.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:231b81d7e845b9b1247f789877d0bb85bd04e81c03468eba58f27e9c2664ad62
3
+ size 567
spk2id.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "hifitts_11614": 0,
3
+ "hifitts_11697": 1,
4
+ "hifitts_12787": 2,
5
+ "hifitts_6097": 3,
6
+ "hifitts_6670": 4,
7
+ "hifitts_6671": 5,
8
+ "hifitts_8051": 6,
9
+ "hifitts_9017": 7,
10
+ "hifitts_9136": 8,
11
+ "hifitts_92": 9
12
+ }
symbols.dict ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <eps> 0
2
+ ! 1
3
+ " 2
4
+ ( 3
5
+ ) 4
6
+ , 5
7
+ . 6
8
+ : 7
9
+ ; 8
10
+ ? 9
11
+ _ 10
12
+ a 11
13
+ aɪ 12
14
+ aɪə 13
15
+ aɪɚ 14
16
+ aɪʊ 15
17
+ aɪʊɹ 16
18
+ aʊ 17
19
+ b 18
20
+ d 19
21
+ dʒ 20
22
+ enus 21
23
+ es 22
24
+ eɪ 23
25
+ f 24
26
+ fr 25
27
+ h 26
28
+ i 27
29
+ iə 28
30
+ iː 29
31
+ j 30
32
+ k 31
33
+ l 32
34
+ m 33
35
+ n 34
36
+ nʲ 35
37
+ o 36
38
+ oʊ 37
39
+ oː 38
40
+ oːɹ 39
41
+ p 40
42
+ r 41
43
+ s 42
44
+ t 43
45
+ tʃ 44
46
+ uː 45
47
+ v 46
48
+ w 47
49
+ z 48
50
+ æ 49
51
+ ð 50
52
+ ø 51
53
+ ŋ 52
54
+ ɐ 53
55
+ ɑ 54
56
+ ɑː 55
57
+ ɑːɹ 56
58
+ ɔ 57
59
+ ɔɪ 58
60
+ ɔː 59
61
+ ɔːɹ 60
62
+ ə 61
63
+ əl 62
64
+ ɚ 63
65
+ ɛ 64
66
+ ɛɹ 65
67
+ ɜː 66
68
+ ɡ 67
69
+ ɪ 68
70
+ ɪɹ 69
71
+ ɫ 70
72
+ ɹ 71
73
+ ɾ 72
74
+ ʃ 73
75
+ ʊ 74
76
+ ʊɹ 75
77
+ ʌ 76
78
+ ʒ 77
79
+ ʔ 78
80
+ ̃ 79
81
+ ̩ 80
82
+ θ 81
83
+ ᵻ 82