ssiidd committed on
Commit
7537bc9
1 Parent(s): 4c4182c

Add tts model

Browse files
Files changed (25) hide show
  1. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/config.yaml +385 -0
  2. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_backward_time.png +0 -0
  3. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_fake_loss.png +0 -0
  4. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_forward_time.png +0 -0
  5. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_loss.png +0 -0
  6. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_optim_step_time.png +0 -0
  7. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_real_loss.png +0 -0
  8. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_train_time.png +0 -0
  9. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_adv_loss.png +0 -0
  10. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_backward_time.png +0 -0
  11. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_dur_loss.png +0 -0
  12. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_feat_match_loss.png +0 -0
  13. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_forward_time.png +0 -0
  14. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_kl_loss.png +0 -0
  15. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_loss.png +0 -0
  16. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_mel_loss.png +0 -0
  17. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_optim_step_time.png +0 -0
  18. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_train_time.png +0 -0
  19. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/gpu_max_cached_mem_GB.png +0 -0
  20. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/iter_time.png +0 -0
  21. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/optim0_lr0.png +0 -0
  22. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/optim1_lr0.png +0 -0
  23. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/train_time.png +0 -0
  24. tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_10best.pth +3 -0
  25. tts_model/meta.yaml +8 -0
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/config.yaml ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: ./conf/tuning/train_vits.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space
7
+ ngpu: 1
8
+ seed: 777
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 36133
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 2000
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - train
38
+ - total_count
39
+ - max
40
+ keep_nbest_models: 10
41
+ grad_clip: -1
42
+ grad_clip_type: 2.0
43
+ grad_noise: false
44
+ accum_grad: 1
45
+ no_forward_run: false
46
+ resume: true
47
+ train_dtype: float32
48
+ use_amp: false
49
+ log_interval: 50
50
+ use_tensorboard: true
51
+ use_wandb: false
52
+ wandb_project: null
53
+ wandb_id: null
54
+ wandb_entity: null
55
+ wandb_name: null
56
+ wandb_model_log_interval: -1
57
+ detect_anomaly: false
58
+ pretrain_path: null
59
+ init_param: []
60
+ ignore_init_mismatch: false
61
+ freeze_param: []
62
+ num_iters_per_epoch: 500
63
+ batch_size: 20
64
+ valid_batch_size: null
65
+ batch_bins: 5000000
66
+ valid_batch_bins: null
67
+ train_shape_file:
68
+ - exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/train/text_shape.phn
69
+ - exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/train/speech_shape
70
+ valid_shape_file:
71
+ - exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
72
+ - exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/valid/speech_shape
73
+ batch_type: numel
74
+ valid_batch_type: null
75
+ fold_length:
76
+ - 150
77
+ - 204800
78
+ sort_in_batch: descending
79
+ sort_batch: descending
80
+ multiple_iterator: false
81
+ chunk_length: 500
82
+ chunk_shift_ratio: 0.5
83
+ num_cache_chunks: 1024
84
+ train_data_path_and_name_and_type:
85
+ - - dump/raw/tr_no_dev/text
86
+ - text
87
+ - text
88
+ - - dump/raw/tr_no_dev/wav.scp
89
+ - speech
90
+ - sound
91
+ valid_data_path_and_name_and_type:
92
+ - - dump/raw/dev/text
93
+ - text
94
+ - text
95
+ - - dump/raw/dev/wav.scp
96
+ - speech
97
+ - sound
98
+ allow_variable_data_keys: false
99
+ max_cache_size: 0.0
100
+ max_cache_fd: 32
101
+ valid_max_cache_size: null
102
+ optim: adamw
103
+ optim_conf:
104
+ lr: 0.0002
105
+ betas:
106
+ - 0.8
107
+ - 0.99
108
+ eps: 1.0e-09
109
+ weight_decay: 0.0
110
+ scheduler: exponentiallr
111
+ scheduler_conf:
112
+ gamma: 0.999875
113
+ optim2: adamw
114
+ optim2_conf:
115
+ lr: 0.0002
116
+ betas:
117
+ - 0.8
118
+ - 0.99
119
+ eps: 1.0e-09
120
+ weight_decay: 0.0
121
+ scheduler2: exponentiallr
122
+ scheduler2_conf:
123
+ gamma: 0.999875
124
+ generator_first: false
125
+ token_list:
126
+ - <blank>
127
+ - <unk>
128
+ - AH0
129
+ - N
130
+ - T
131
+ - D
132
+ - S
133
+ - R
134
+ - L
135
+ - DH
136
+ - K
137
+ - Z
138
+ - IH1
139
+ - IH0
140
+ - M
141
+ - EH1
142
+ - W
143
+ - P
144
+ - AE1
145
+ - AH1
146
+ - V
147
+ - ER0
148
+ - F
149
+ - ','
150
+ - AA1
151
+ - B
152
+ - HH
153
+ - IY1
154
+ - UW1
155
+ - IY0
156
+ - AO1
157
+ - EY1
158
+ - AY1
159
+ - .
160
+ - OW1
161
+ - SH
162
+ - NG
163
+ - G
164
+ - ER1
165
+ - CH
166
+ - JH
167
+ - Y
168
+ - AW1
169
+ - TH
170
+ - UH1
171
+ - EH2
172
+ - OW0
173
+ - EY2
174
+ - AO0
175
+ - IH2
176
+ - AE2
177
+ - AY2
178
+ - AA2
179
+ - UW0
180
+ - EH0
181
+ - OY1
182
+ - EY0
183
+ - AO2
184
+ - ZH
185
+ - OW2
186
+ - AE0
187
+ - UW2
188
+ - AH2
189
+ - AY0
190
+ - IY2
191
+ - AW2
192
+ - AA0
193
+ - ''''
194
+ - ER2
195
+ - UH2
196
+ - '?'
197
+ - OY2
198
+ - '!'
199
+ - AW0
200
+ - UH0
201
+ - OY0
202
+ - ..
203
+ - <sos/eos>
204
+ odim: null
205
+ model_conf: {}
206
+ use_preprocessor: true
207
+ token_type: phn
208
+ bpemodel: null
209
+ non_linguistic_symbols: null
210
+ cleaner: tacotron
211
+ g2p: g2p_en_no_space
212
+ feats_extract: linear_spectrogram
213
+ feats_extract_conf:
214
+ n_fft: 1024
215
+ hop_length: 256
216
+ win_length: null
217
+ normalize: null
218
+ normalize_conf: {}
219
+ tts: vits
220
+ tts_conf:
221
+ generator_type: vits_generator
222
+ generator_params:
223
+ hidden_channels: 192
224
+ spks: -1
225
+ global_channels: -1
226
+ segment_size: 32
227
+ text_encoder_attention_heads: 2
228
+ text_encoder_ffn_expand: 4
229
+ text_encoder_blocks: 6
230
+ text_encoder_positionwise_layer_type: conv1d
231
+ text_encoder_positionwise_conv_kernel_size: 3
232
+ text_encoder_positional_encoding_layer_type: rel_pos
233
+ text_encoder_self_attention_layer_type: rel_selfattn
234
+ text_encoder_activation_type: swish
235
+ text_encoder_normalize_before: true
236
+ text_encoder_dropout_rate: 0.1
237
+ text_encoder_positional_dropout_rate: 0.0
238
+ text_encoder_attention_dropout_rate: 0.1
239
+ use_macaron_style_in_text_encoder: true
240
+ use_conformer_conv_in_text_encoder: false
241
+ text_encoder_conformer_kernel_size: -1
242
+ decoder_kernel_size: 7
243
+ decoder_channels: 512
244
+ decoder_upsample_scales:
245
+ - 8
246
+ - 8
247
+ - 2
248
+ - 2
249
+ decoder_upsample_kernel_sizes:
250
+ - 16
251
+ - 16
252
+ - 4
253
+ - 4
254
+ decoder_resblock_kernel_sizes:
255
+ - 3
256
+ - 7
257
+ - 11
258
+ decoder_resblock_dilations:
259
+ - - 1
260
+ - 3
261
+ - 5
262
+ - - 1
263
+ - 3
264
+ - 5
265
+ - - 1
266
+ - 3
267
+ - 5
268
+ use_weight_norm_in_decoder: true
269
+ posterior_encoder_kernel_size: 5
270
+ posterior_encoder_layers: 16
271
+ posterior_encoder_stacks: 1
272
+ posterior_encoder_base_dilation: 1
273
+ posterior_encoder_dropout_rate: 0.0
274
+ use_weight_norm_in_posterior_encoder: true
275
+ flow_flows: 4
276
+ flow_kernel_size: 5
277
+ flow_base_dilation: 1
278
+ flow_layers: 4
279
+ flow_dropout_rate: 0.0
280
+ use_weight_norm_in_flow: true
281
+ use_only_mean_in_flow: true
282
+ stochastic_duration_predictor_kernel_size: 3
283
+ stochastic_duration_predictor_dropout_rate: 0.5
284
+ stochastic_duration_predictor_flows: 4
285
+ stochastic_duration_predictor_dds_conv_layers: 3
286
+ vocabs: 78
287
+ aux_channels: 513
288
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
289
+ discriminator_params:
290
+ scales: 1
291
+ scale_downsample_pooling: AvgPool1d
292
+ scale_downsample_pooling_params:
293
+ kernel_size: 4
294
+ stride: 2
295
+ padding: 2
296
+ scale_discriminator_params:
297
+ in_channels: 1
298
+ out_channels: 1
299
+ kernel_sizes:
300
+ - 15
301
+ - 41
302
+ - 5
303
+ - 3
304
+ channels: 128
305
+ max_downsample_channels: 1024
306
+ max_groups: 16
307
+ bias: true
308
+ downsample_scales:
309
+ - 2
310
+ - 2
311
+ - 4
312
+ - 4
313
+ - 1
314
+ nonlinear_activation: LeakyReLU
315
+ nonlinear_activation_params:
316
+ negative_slope: 0.1
317
+ use_weight_norm: true
318
+ use_spectral_norm: false
319
+ follow_official_norm: false
320
+ periods:
321
+ - 2
322
+ - 3
323
+ - 5
324
+ - 7
325
+ - 11
326
+ period_discriminator_params:
327
+ in_channels: 1
328
+ out_channels: 1
329
+ kernel_sizes:
330
+ - 5
331
+ - 3
332
+ channels: 32
333
+ downsample_scales:
334
+ - 3
335
+ - 3
336
+ - 3
337
+ - 3
338
+ - 1
339
+ max_downsample_channels: 1024
340
+ bias: true
341
+ nonlinear_activation: LeakyReLU
342
+ nonlinear_activation_params:
343
+ negative_slope: 0.1
344
+ use_weight_norm: true
345
+ use_spectral_norm: false
346
+ generator_adv_loss_params:
347
+ average_by_discriminators: false
348
+ loss_type: mse
349
+ discriminator_adv_loss_params:
350
+ average_by_discriminators: false
351
+ loss_type: mse
352
+ feat_match_loss_params:
353
+ average_by_discriminators: false
354
+ average_by_layers: false
355
+ include_final_outputs: true
356
+ mel_loss_params:
357
+ fs: 22050
358
+ n_fft: 1024
359
+ hop_length: 256
360
+ win_length: null
361
+ window: hann
362
+ n_mels: 80
363
+ fmin: 0
364
+ fmax: null
365
+ log_base: null
366
+ lambda_adv: 1.0
367
+ lambda_mel: 45.0
368
+ lambda_feat_match: 2.0
369
+ lambda_dur: 1.0
370
+ lambda_kl: 1.0
371
+ sampling_rate: 22050
372
+ cache_generator_outputs: true
373
+ pitch_extract: null
374
+ pitch_extract_conf: {}
375
+ pitch_normalize: null
376
+ pitch_normalize_conf: {}
377
+ energy_extract: null
378
+ energy_extract_conf: {}
379
+ energy_normalize: null
380
+ energy_normalize_conf: {}
381
+ required:
382
+ - output_dir
383
+ - token_list
384
+ version: 0.10.3a1
385
+ distributed: true
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_backward_time.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_fake_loss.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_forward_time.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_loss.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_optim_step_time.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_real_loss.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_train_time.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_adv_loss.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_backward_time.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_dur_loss.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_feat_match_loss.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_forward_time.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_kl_loss.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_loss.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_mel_loss.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_optim_step_time.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_train_time.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/gpu_max_cached_mem_GB.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/iter_time.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/optim0_lr0.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/optim1_lr0.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/images/train_time.png ADDED
tts_model/exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_10best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84752ef48430c0cad8fc182d6255670bdaff38903e366c136b7aa8c63b64a97a
3
+ size 372559183
tts_model/meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.10.3a1
2
+ files:
3
+ model_file: exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_10best.pth
4
+ python: "3.7.3 (default, Mar 27 2019, 22:11:17) \n[GCC 7.3.0]"
5
+ timestamp: 1630751435.588649
6
+ torch: 1.7.1
7
+ yaml_files:
8
+ train_config: exp/tts_train_vits_raw_phn_tacotron_g2p_en_no_space/config.yaml