ftshijt committed on
Commit
1d79d00
1 Parent(s): 23b6d49

Update model

Browse files
Files changed (33) hide show
  1. README.md +554 -0
  2. dump/raw/org/tr_no_dev/spk2sid +31 -0
  3. exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz +3 -0
  4. exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz +3 -0
  5. exp/svs_train_visinger2_raw_phn_None_zh/500epoch.pth +3 -0
  6. exp/svs_train_visinger2_raw_phn_None_zh/config.yaml +474 -0
  7. exp/svs_train_visinger2_raw_phn_None_zh/images/discriminator_backward_time.png +0 -0
  8. exp/svs_train_visinger2_raw_phn_None_zh/images/discriminator_fake_loss.png +0 -0
  9. exp/svs_train_visinger2_raw_phn_None_zh/images/discriminator_forward_time.png +0 -0
  10. exp/svs_train_visinger2_raw_phn_None_zh/images/discriminator_loss.png +0 -0
  11. exp/svs_train_visinger2_raw_phn_None_zh/images/discriminator_optim_step_time.png +0 -0
  12. exp/svs_train_visinger2_raw_phn_None_zh/images/discriminator_real_loss.png +0 -0
  13. exp/svs_train_visinger2_raw_phn_None_zh/images/discriminator_train_time.png +0 -0
  14. exp/svs_train_visinger2_raw_phn_None_zh/images/generator_adv_loss.png +0 -0
  15. exp/svs_train_visinger2_raw_phn_None_zh/images/generator_backward_time.png +0 -0
  16. exp/svs_train_visinger2_raw_phn_None_zh/images/generator_feat_match_loss.png +0 -0
  17. exp/svs_train_visinger2_raw_phn_None_zh/images/generator_forward_time.png +0 -0
  18. exp/svs_train_visinger2_raw_phn_None_zh/images/generator_kl_loss.png +0 -0
  19. exp/svs_train_visinger2_raw_phn_None_zh/images/generator_loss.png +0 -0
  20. exp/svs_train_visinger2_raw_phn_None_zh/images/generator_mel_am_loss.png +0 -0
  21. exp/svs_train_visinger2_raw_phn_None_zh/images/generator_mel_ddsp_loss.png +0 -0
  22. exp/svs_train_visinger2_raw_phn_None_zh/images/generator_mel_loss.png +0 -0
  23. exp/svs_train_visinger2_raw_phn_None_zh/images/generator_optim_step_time.png +0 -0
  24. exp/svs_train_visinger2_raw_phn_None_zh/images/generator_phn_dur_loss.png +0 -0
  25. exp/svs_train_visinger2_raw_phn_None_zh/images/generator_pitch_loss.png +0 -0
  26. exp/svs_train_visinger2_raw_phn_None_zh/images/generator_score_dur_loss.png +0 -0
  27. exp/svs_train_visinger2_raw_phn_None_zh/images/generator_train_time.png +0 -0
  28. exp/svs_train_visinger2_raw_phn_None_zh/images/gpu_max_cached_mem_GB.png +0 -0
  29. exp/svs_train_visinger2_raw_phn_None_zh/images/iter_time.png +0 -0
  30. exp/svs_train_visinger2_raw_phn_None_zh/images/optim0_lr0.png +0 -0
  31. exp/svs_train_visinger2_raw_phn_None_zh/images/optim1_lr0.png +0 -0
  32. exp/svs_train_visinger2_raw_phn_None_zh/images/train_time.png +0 -0
  33. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,557 @@
1
  ---
 
 
 
 
 
 
 
2
  license: cc-by-4.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - singing-voice-synthesis
6
+ language: zh
7
+ datasets:
8
+ - acesinger
9
  license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 SVS model
13
+
14
+ ### `espnet/acesinger_opencpop_visinger2_44khz`
15
+
16
+ This model was trained by ftshijt using the acesinger recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 5c4d7cf7feba8461de2e1080bf82182f0efaef38
26
+ pip install -e .
27
+ cd egs2/acesinger/svs1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/acesinger_opencpop_visinger2_44khz
29
+ ```
30
+
31
+
32
+
33
+ ## SVS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/tuning/train_visinger2.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ drop_last_iter: false
42
+ dry_run: false
43
+ iterator_type: sequence
44
+ valid_iterator_type: null
45
+ output_dir: exp/svs_train_visinger2_raw_phn_None_zh
46
+ ngpu: 1
47
+ seed: 777
48
+ num_workers: 0
49
+ num_att_plot: 0
50
+ dist_backend: nccl
51
+ dist_init_method: env://
52
+ dist_world_size: null
53
+ dist_rank: null
54
+ local_rank: 0
55
+ dist_master_addr: null
56
+ dist_master_port: null
57
+ dist_launcher: null
58
+ multiprocessing_distributed: false
59
+ unused_parameters: true
60
+ sharded_ddp: false
61
+ cudnn_enabled: true
62
+ cudnn_benchmark: false
63
+ cudnn_deterministic: false
64
+ collect_stats: false
65
+ write_collected_feats: false
66
+ max_epoch: 500
67
+ patience: null
68
+ val_scheduler_criterion:
69
+ - valid
70
+ - loss
71
+ early_stopping_criterion:
72
+ - valid
73
+ - loss
74
+ - min
75
+ best_model_criterion:
76
+ - - train
77
+ - total_count
78
+ - max
79
+ keep_nbest_models: 10
80
+ nbest_averaging_interval: 0
81
+ grad_clip: -1
82
+ grad_clip_type: 2.0
83
+ grad_noise: false
84
+ accum_grad: 1
85
+ no_forward_run: false
86
+ resume: true
87
+ train_dtype: float32
88
+ use_amp: false
89
+ log_interval: 50
90
+ use_matplotlib: true
91
+ use_tensorboard: true
92
+ create_graph_in_tensorboard: false
93
+ use_wandb: false
94
+ wandb_project: null
95
+ wandb_id: null
96
+ wandb_entity: null
97
+ wandb_name: null
98
+ wandb_model_log_interval: -1
99
+ detect_anomaly: false
100
+ use_lora: false
101
+ save_lora_only: true
102
+ lora_conf: {}
103
+ pretrain_path: null
104
+ init_param: []
105
+ ignore_init_mismatch: false
106
+ freeze_param: []
107
+ num_iters_per_epoch: 1000
108
+ batch_size: 8
109
+ valid_batch_size: null
110
+ batch_bins: 1000000
111
+ valid_batch_bins: null
112
+ train_shape_file:
113
+ - exp/svs_stats_raw_phn_None_zh/train/text_shape.phn
114
+ - exp/svs_stats_raw_phn_None_zh/train/singing_shape
115
+ valid_shape_file:
116
+ - exp/svs_stats_raw_phn_None_zh/valid/text_shape.phn
117
+ - exp/svs_stats_raw_phn_None_zh/valid/singing_shape
118
+ batch_type: sorted
119
+ valid_batch_type: null
120
+ fold_length:
121
+ - 150
122
+ - 409600
123
+ sort_in_batch: descending
124
+ shuffle_within_batch: false
125
+ sort_batch: descending
126
+ multiple_iterator: false
127
+ chunk_length: 500
128
+ chunk_shift_ratio: 0.5
129
+ num_cache_chunks: 1024
130
+ chunk_excluded_key_prefixes: []
131
+ train_data_path_and_name_and_type:
132
+ - - dump/raw/tr_no_dev/text
133
+ - text
134
+ - text
135
+ - - dump/raw/tr_no_dev/wav.scp
136
+ - singing
137
+ - sound
138
+ - - dump/raw/tr_no_dev/label
139
+ - label
140
+ - duration
141
+ - - dump/raw/tr_no_dev/score.scp
142
+ - score
143
+ - score
144
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/pitch.scp
145
+ - pitch
146
+ - npy
147
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/feats.scp
148
+ - feats
149
+ - npy
150
+ - - dump/raw/tr_no_dev/utt2sid
151
+ - sids
152
+ - text_int
153
+ valid_data_path_and_name_and_type:
154
+ - - dump/raw/dev/text
155
+ - text
156
+ - text
157
+ - - dump/raw/dev/wav.scp
158
+ - singing
159
+ - sound
160
+ - - dump/raw/dev/label
161
+ - label
162
+ - duration
163
+ - - dump/raw/dev/score.scp
164
+ - score
165
+ - score
166
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/pitch.scp
167
+ - pitch
168
+ - npy
169
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/feats.scp
170
+ - feats
171
+ - npy
172
+ - - dump/raw/dev/utt2sid
173
+ - sids
174
+ - text_int
175
+ allow_variable_data_keys: false
176
+ max_cache_size: 0.0
177
+ max_cache_fd: 32
178
+ allow_multi_rates: false
179
+ valid_max_cache_size: null
180
+ exclude_weight_decay: false
181
+ exclude_weight_decay_conf: {}
182
+ optim: adamw
183
+ optim_conf:
184
+ lr: 0.0002
185
+ betas:
186
+ - 0.8
187
+ - 0.99
188
+ eps: 1.0e-09
189
+ weight_decay: 0.0
190
+ scheduler: exponentiallr
191
+ scheduler_conf:
192
+ gamma: 0.998
193
+ optim2: adamw
194
+ optim2_conf:
195
+ lr: 0.0002
196
+ betas:
197
+ - 0.8
198
+ - 0.99
199
+ eps: 1.0e-09
200
+ weight_decay: 0.0
201
+ scheduler2: exponentiallr
202
+ scheduler2_conf:
203
+ gamma: 0.998
204
+ generator_first: true
205
+ token_list:
206
+ - <blank>
207
+ - <unk>
208
+ - SP
209
+ - i
210
+ - AP
211
+ - e
212
+ - d
213
+ - y
214
+ - w
215
+ - sh
216
+ - ai
217
+ - n
218
+ - x
219
+ - j
220
+ - u
221
+ - ian
222
+ - l
223
+ - h
224
+ - b
225
+ - o
226
+ - zh
227
+ - ou
228
+ - an
229
+ - m
230
+ - q
231
+ - z
232
+ - en
233
+ - g
234
+ - ing
235
+ - ei
236
+ - ao
237
+ - uo
238
+ - ang
239
+ - eng
240
+ - t
241
+ - ong
242
+ - a
243
+ - ui
244
+ - f
245
+ - k
246
+ - r
247
+ - ch
248
+ - v
249
+ - iang
250
+ - in
251
+ - iao
252
+ - ie
253
+ - iu
254
+ - c
255
+ - s
256
+ - van
257
+ - p
258
+ - ve
259
+ - uan
260
+ - uang
261
+ - ia
262
+ - ua
263
+ - uai
264
+ - un
265
+ - er
266
+ - vn
267
+ - iong
268
+ - <sos/eos>
269
+ odim: null
270
+ model_conf: {}
271
+ use_preprocessor: true
272
+ token_type: phn
273
+ bpemodel: null
274
+ non_linguistic_symbols: null
275
+ cleaner: null
276
+ g2p: null
277
+ fs: 44100
278
+ score_feats_extract: syllable_score_feats
279
+ score_feats_extract_conf:
280
+ fs: 44100
281
+ n_fft: 2048
282
+ win_length: 2048
283
+ hop_length: 512
284
+ feats_extract: fbank
285
+ feats_extract_conf:
286
+ n_fft: 2048
287
+ hop_length: 512
288
+ win_length: 2048
289
+ fs: 44100
290
+ fmin: 80
291
+ fmax: 7600
292
+ n_mels: 80
293
+ normalize: global_mvn
294
+ normalize_conf:
295
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz
296
+ svs: vits
297
+ svs_conf:
298
+ generator_type: visinger2
299
+ vocoder_generator_type: visinger2
300
+ generator_params:
301
+ hidden_channels: 192
302
+ spks: 31
303
+ global_channels: 256
304
+ segment_size: 20
305
+ text_encoder_attention_heads: 2
306
+ text_encoder_ffn_expand: 4
307
+ text_encoder_blocks: 6
308
+ text_encoder_positionwise_layer_type: conv1d
309
+ text_encoder_positionwise_conv_kernel_size: 3
310
+ text_encoder_positional_encoding_layer_type: rel_pos
311
+ text_encoder_self_attention_layer_type: rel_selfattn
312
+ text_encoder_activation_type: swish
313
+ text_encoder_normalize_before: true
314
+ text_encoder_dropout_rate: 0.1
315
+ text_encoder_positional_dropout_rate: 0.0
316
+ text_encoder_attention_dropout_rate: 0.1
317
+ use_macaron_style_in_text_encoder: true
318
+ use_conformer_conv_in_text_encoder: false
319
+ text_encoder_conformer_kernel_size: -1
320
+ decoder_kernel_size: 7
321
+ decoder_channels: 256
322
+ decoder_upsample_scales:
323
+ - 8
324
+ - 8
325
+ - 4
326
+ - 2
327
+ decoder_upsample_kernel_sizes:
328
+ - 16
329
+ - 16
330
+ - 8
331
+ - 4
332
+ n_harmonic: 64
333
+ decoder_resblock_kernel_sizes:
334
+ - 3
335
+ - 7
336
+ - 11
337
+ decoder_resblock_dilations:
338
+ - - 1
339
+ - 3
340
+ - 5
341
+ - - 1
342
+ - 3
343
+ - 5
344
+ - - 1
345
+ - 3
346
+ - 5
347
+ use_weight_norm_in_decoder: true
348
+ posterior_encoder_kernel_size: 3
349
+ posterior_encoder_layers: 8
350
+ posterior_encoder_stacks: 1
351
+ posterior_encoder_base_dilation: 1
352
+ posterior_encoder_dropout_rate: 0.0
353
+ use_weight_norm_in_posterior_encoder: true
354
+ flow_flows: -1
355
+ flow_kernel_size: 5
356
+ flow_base_dilation: 1
357
+ flow_layers: 4
358
+ flow_dropout_rate: 0.0
359
+ use_weight_norm_in_flow: true
360
+ use_only_mean_in_flow: true
361
+ use_phoneme_predictor: false
362
+ vocabs: 63
363
+ aux_channels: 80
364
+ generator_type: visinger2
365
+ vocoder_generator_type: visinger2
366
+ fs: 44100
367
+ hop_length: 512
368
+ win_length: 2048
369
+ n_fft: 2048
370
+ discriminator_type: visinger2
371
+ discriminator_params:
372
+ scales: 1
373
+ scale_downsample_pooling: AvgPool1d
374
+ scale_downsample_pooling_params:
375
+ kernel_size: 4
376
+ stride: 2
377
+ padding: 2
378
+ scale_discriminator_params:
379
+ in_channels: 1
380
+ out_channels: 1
381
+ kernel_sizes:
382
+ - 15
383
+ - 41
384
+ - 5
385
+ - 3
386
+ channels: 128
387
+ max_downsample_channels: 1024
388
+ max_groups: 256
389
+ bias: true
390
+ downsample_scales:
391
+ - 4
392
+ - 4
393
+ - 4
394
+ - 4
395
+ nonlinear_activation: LeakyReLU
396
+ nonlinear_activation_params:
397
+ negative_slope: 0.1
398
+ use_weight_norm: true
399
+ use_spectral_norm: false
400
+ follow_official_norm: false
401
+ periods:
402
+ - 2
403
+ - 3
404
+ - 5
405
+ - 7
406
+ - 11
407
+ period_discriminator_params:
408
+ in_channels: 1
409
+ out_channels: 1
410
+ kernel_sizes:
411
+ - 5
412
+ - 3
413
+ channels: 32
414
+ downsample_scales:
415
+ - 3
416
+ - 3
417
+ - 3
418
+ - 3
419
+ - 1
420
+ max_downsample_channels: 1024
421
+ bias: true
422
+ nonlinear_activation: LeakyReLU
423
+ nonlinear_activation_params:
424
+ negative_slope: 0.1
425
+ use_weight_norm: true
426
+ use_spectral_norm: false
427
+ multi_freq_disc_params:
428
+ hidden_channels:
429
+ - 256
430
+ - 256
431
+ - 256
432
+ - 256
433
+ - 256
434
+ domain: double
435
+ mel_scale: true
436
+ divisors:
437
+ - 32
438
+ - 16
439
+ - 8
440
+ - 4
441
+ - 2
442
+ - 1
443
+ - 1
444
+ strides:
445
+ - 1
446
+ - 2
447
+ - 1
448
+ - 2
449
+ - 1
450
+ - 2
451
+ - 1
452
+ sample_rate: 44100
453
+ hop_lengths:
454
+ - 110
455
+ - 220
456
+ - 330
457
+ - 441
458
+ - 551
459
+ - 661
460
+ generator_adv_loss_params:
461
+ average_by_discriminators: false
462
+ loss_type: mse
463
+ discriminator_adv_loss_params:
464
+ average_by_discriminators: false
465
+ loss_type: mse
466
+ feat_match_loss_params:
467
+ average_by_discriminators: false
468
+ average_by_layers: false
469
+ include_final_outputs: true
470
+ mel_loss_params:
471
+ fs: 44100
472
+ n_fft: 2048
473
+ hop_length: 512
474
+ win_length: 2048
475
+ window: hann
476
+ n_mels: 80
477
+ fmin: 0
478
+ fmax: 22050
479
+ log_base: null
480
+ lambda_adv: 1.0
481
+ lambda_mel: 45.0
482
+ lambda_feat_match: 2.0
483
+ lambda_dur: 0.1
484
+ lambda_pitch: 10.0
485
+ lambda_phoneme: 1.0
486
+ lambda_kl: 1.0
487
+ sampling_rate: 44100
488
+ cache_generator_outputs: true
489
+ pitch_extract: dio
490
+ pitch_extract_conf:
491
+ use_token_averaged_f0: false
492
+ use_log_f0: false
493
+ fs: 44100
494
+ n_fft: 2048
495
+ hop_length: 512
496
+ f0max: 800
497
+ f0min: 80
498
+ pitch_normalize: null
499
+ pitch_normalize_conf:
500
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz
501
+ ying_extract: null
502
+ ying_extract_conf: {}
503
+ energy_extract: null
504
+ energy_extract_conf: {}
505
+ energy_normalize: null
506
+ energy_normalize_conf: {}
507
+ required:
508
+ - output_dir
509
+ - token_list
510
+ version: '202310'
511
+ distributed: false
512
+ ```
513
+
514
+ </details>
515
+
516
+
517
+
518
+ ### Citing ESPnet
519
+
520
+ ```bibtex
521
+ @inproceedings{watanabe2018espnet,
522
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
523
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
524
+ year={2018},
525
+ booktitle={Proceedings of Interspeech},
526
+ pages={2207--2211},
527
+ doi={10.21437/Interspeech.2018-1456},
528
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
529
+ }
530
+
531
+
532
+
533
+
534
+
535
+
536
+ @inproceedings{shi22d_interspeech,
537
+ author={Jiatong Shi and Shuai Guo and Tao Qian and Tomoki Hayashi and Yuning Wu and Fangzheng Xu and Xuankai Chang and Huazhe Li and Peter Wu and Shinji Watanabe and Qin Jin},
538
+ title={{Muskits: an End-to-end Music Processing Toolkit for Singing Voice Synthesis}},
539
+ year=2022,
540
+ booktitle={Proc. Interspeech 2022},
541
+ pages={4277--4281},
542
+ doi={10.21437/Interspeech.2022-10039}
543
+ }
544
+ ```
545
+
546
+ or arXiv:
547
+
548
+ ```bibtex
549
+ @misc{watanabe2018espnet,
550
+ title={ESPnet: End-to-End Speech Processing Toolkit},
551
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
552
+ year={2018},
553
+ eprint={1804.00015},
554
+ archivePrefix={arXiv},
555
+ primaryClass={cs.CL}
556
+ }
557
+ ```
dump/raw/org/tr_no_dev/spk2sid ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <unk> 0
2
+ 1 1
3
+ 10 2
4
+ 11 3
5
+ 12 4
6
+ 13 5
7
+ 14 6
8
+ 15 7
9
+ 16 8
10
+ 17 9
11
+ 18 10
12
+ 19 11
13
+ 2 12
14
+ 20 13
15
+ 21 14
16
+ 22 15
17
+ 23 16
18
+ 24 17
19
+ 25 18
20
+ 26 19
21
+ 27 20
22
+ 28 21
23
+ 29 22
24
+ 3 23
25
+ 30 24
26
+ 4 25
27
+ 5 26
28
+ 6 27
29
+ 7 28
30
+ 8 29
31
+ 9 30
exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91ed268c6ea2d7a005f9fd542e21509a3625f5f10b3b4624b7dd2f28f15ee830
3
+ size 1402
exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c47d2ada04809ecaf6335963a737fac108371d7bef101f5f6f9d2a0addf45bfb
3
+ size 770
exp/svs_train_visinger2_raw_phn_None_zh/500epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:081823d2488951735c39b94f9a1a56fc38b0ccc5bae7db9f7e2e89b04fb762d5
3
+ size 448199387
exp/svs_train_visinger2_raw_phn_None_zh/config.yaml ADDED
@@ -0,0 +1,474 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_visinger2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/svs_train_visinger2_raw_phn_None_zh
9
+ ngpu: 1
10
+ seed: 777
11
+ num_workers: 0
12
+ num_att_plot: 0
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: false
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 500
30
+ patience: null
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - train
40
+ - total_count
41
+ - max
42
+ keep_nbest_models: 10
43
+ nbest_averaging_interval: 0
44
+ grad_clip: -1
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 1
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: 50
53
+ use_matplotlib: true
54
+ use_tensorboard: true
55
+ create_graph_in_tensorboard: false
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ use_lora: false
64
+ save_lora_only: true
65
+ lora_conf: {}
66
+ pretrain_path: null
67
+ init_param: []
68
+ ignore_init_mismatch: false
69
+ freeze_param: []
70
+ num_iters_per_epoch: 1000
71
+ batch_size: 8
72
+ valid_batch_size: null
73
+ batch_bins: 1000000
74
+ valid_batch_bins: null
75
+ train_shape_file:
76
+ - exp/svs_stats_raw_phn_None_zh/train/text_shape.phn
77
+ - exp/svs_stats_raw_phn_None_zh/train/singing_shape
78
+ valid_shape_file:
79
+ - exp/svs_stats_raw_phn_None_zh/valid/text_shape.phn
80
+ - exp/svs_stats_raw_phn_None_zh/valid/singing_shape
81
+ batch_type: sorted
82
+ valid_batch_type: null
83
+ fold_length:
84
+ - 150
85
+ - 409600
86
+ sort_in_batch: descending
87
+ shuffle_within_batch: false
88
+ sort_batch: descending
89
+ multiple_iterator: false
90
+ chunk_length: 500
91
+ chunk_shift_ratio: 0.5
92
+ num_cache_chunks: 1024
93
+ chunk_excluded_key_prefixes: []
94
+ train_data_path_and_name_and_type:
95
+ - - dump/raw/tr_no_dev/text
96
+ - text
97
+ - text
98
+ - - dump/raw/tr_no_dev/wav.scp
99
+ - singing
100
+ - sound
101
+ - - dump/raw/tr_no_dev/label
102
+ - label
103
+ - duration
104
+ - - dump/raw/tr_no_dev/score.scp
105
+ - score
106
+ - score
107
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/pitch.scp
108
+ - pitch
109
+ - npy
110
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/feats.scp
111
+ - feats
112
+ - npy
113
+ - - dump/raw/tr_no_dev/utt2sid
114
+ - sids
115
+ - text_int
116
+ valid_data_path_and_name_and_type:
117
+ - - dump/raw/dev/text
118
+ - text
119
+ - text
120
+ - - dump/raw/dev/wav.scp
121
+ - singing
122
+ - sound
123
+ - - dump/raw/dev/label
124
+ - label
125
+ - duration
126
+ - - dump/raw/dev/score.scp
127
+ - score
128
+ - score
129
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/pitch.scp
130
+ - pitch
131
+ - npy
132
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/feats.scp
133
+ - feats
134
+ - npy
135
+ - - dump/raw/dev/utt2sid
136
+ - sids
137
+ - text_int
138
+ allow_variable_data_keys: false
139
+ max_cache_size: 0.0
140
+ max_cache_fd: 32
141
+ allow_multi_rates: false
142
+ valid_max_cache_size: null
143
+ exclude_weight_decay: false
144
+ exclude_weight_decay_conf: {}
145
+ optim: adamw
146
+ optim_conf:
147
+ lr: 0.0002
148
+ betas:
149
+ - 0.8
150
+ - 0.99
151
+ eps: 1.0e-09
152
+ weight_decay: 0.0
153
+ scheduler: exponentiallr
154
+ scheduler_conf:
155
+ gamma: 0.998
156
+ optim2: adamw
157
+ optim2_conf:
158
+ lr: 0.0002
159
+ betas:
160
+ - 0.8
161
+ - 0.99
162
+ eps: 1.0e-09
163
+ weight_decay: 0.0
164
+ scheduler2: exponentiallr
165
+ scheduler2_conf:
166
+ gamma: 0.998
167
+ generator_first: true
168
+ token_list:
169
+ - <blank>
170
+ - <unk>
171
+ - SP
172
+ - i
173
+ - AP
174
+ - e
175
+ - d
176
+ - y
177
+ - w
178
+ - sh
179
+ - ai
180
+ - n
181
+ - x
182
+ - j
183
+ - u
184
+ - ian
185
+ - l
186
+ - h
187
+ - b
188
+ - o
189
+ - zh
190
+ - ou
191
+ - an
192
+ - m
193
+ - q
194
+ - z
195
+ - en
196
+ - g
197
+ - ing
198
+ - ei
199
+ - ao
200
+ - uo
201
+ - ang
202
+ - eng
203
+ - t
204
+ - ong
205
+ - a
206
+ - ui
207
+ - f
208
+ - k
209
+ - r
210
+ - ch
211
+ - v
212
+ - iang
213
+ - in
214
+ - iao
215
+ - ie
216
+ - iu
217
+ - c
218
+ - s
219
+ - van
220
+ - p
221
+ - ve
222
+ - uan
223
+ - uang
224
+ - ia
225
+ - ua
226
+ - uai
227
+ - un
228
+ - er
229
+ - vn
230
+ - iong
231
+ - <sos/eos>
232
+ odim: null
233
+ model_conf: {}
234
+ use_preprocessor: true
235
+ token_type: phn
236
+ bpemodel: null
237
+ non_linguistic_symbols: null
238
+ cleaner: null
239
+ g2p: null
240
+ fs: 44100
241
+ score_feats_extract: syllable_score_feats
242
+ score_feats_extract_conf:
243
+ fs: 44100
244
+ n_fft: 2048
245
+ win_length: 2048
246
+ hop_length: 512
247
+ feats_extract: fbank
248
+ feats_extract_conf:
249
+ n_fft: 2048
250
+ hop_length: 512
251
+ win_length: 2048
252
+ fs: 44100
253
+ fmin: 80
254
+ fmax: 7600
255
+ n_mels: 80
256
+ normalize: global_mvn
257
+ normalize_conf:
258
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz
259
+ svs: vits
260
+ svs_conf:
261
+ generator_type: visinger2
262
+ vocoder_generator_type: visinger2
263
+ generator_params:
264
+ hidden_channels: 192
265
+ spks: 31
266
+ global_channels: 256
267
+ segment_size: 20
268
+ text_encoder_attention_heads: 2
269
+ text_encoder_ffn_expand: 4
270
+ text_encoder_blocks: 6
271
+ text_encoder_positionwise_layer_type: conv1d
272
+ text_encoder_positionwise_conv_kernel_size: 3
273
+ text_encoder_positional_encoding_layer_type: rel_pos
274
+ text_encoder_self_attention_layer_type: rel_selfattn
275
+ text_encoder_activation_type: swish
276
+ text_encoder_normalize_before: true
277
+ text_encoder_dropout_rate: 0.1
278
+ text_encoder_positional_dropout_rate: 0.0
279
+ text_encoder_attention_dropout_rate: 0.1
280
+ use_macaron_style_in_text_encoder: true
281
+ use_conformer_conv_in_text_encoder: false
282
+ text_encoder_conformer_kernel_size: -1
283
+ decoder_kernel_size: 7
284
+ decoder_channels: 256
285
+ decoder_upsample_scales:
286
+ - 8
287
+ - 8
288
+ - 4
289
+ - 2
290
+ decoder_upsample_kernel_sizes:
291
+ - 16
292
+ - 16
293
+ - 8
294
+ - 4
295
+ n_harmonic: 64
296
+ decoder_resblock_kernel_sizes:
297
+ - 3
298
+ - 7
299
+ - 11
300
+ decoder_resblock_dilations:
301
+ - - 1
302
+ - 3
303
+ - 5
304
+ - - 1
305
+ - 3
306
+ - 5
307
+ - - 1
308
+ - 3
309
+ - 5
310
+ use_weight_norm_in_decoder: true
311
+ posterior_encoder_kernel_size: 3
312
+ posterior_encoder_layers: 8
313
+ posterior_encoder_stacks: 1
314
+ posterior_encoder_base_dilation: 1
315
+ posterior_encoder_dropout_rate: 0.0
316
+ use_weight_norm_in_posterior_encoder: true
317
+ flow_flows: -1
318
+ flow_kernel_size: 5
319
+ flow_base_dilation: 1
320
+ flow_layers: 4
321
+ flow_dropout_rate: 0.0
322
+ use_weight_norm_in_flow: true
323
+ use_only_mean_in_flow: true
324
+ use_phoneme_predictor: false
325
+ vocabs: 63
326
+ aux_channels: 80
327
+ generator_type: visinger2
328
+ vocoder_generator_type: visinger2
329
+ fs: 44100
330
+ hop_length: 512
331
+ win_length: 2048
332
+ n_fft: 2048
333
+ discriminator_type: visinger2
334
+ discriminator_params:
335
+ scales: 1
336
+ scale_downsample_pooling: AvgPool1d
337
+ scale_downsample_pooling_params:
338
+ kernel_size: 4
339
+ stride: 2
340
+ padding: 2
341
+ scale_discriminator_params:
342
+ in_channels: 1
343
+ out_channels: 1
344
+ kernel_sizes:
345
+ - 15
346
+ - 41
347
+ - 5
348
+ - 3
349
+ channels: 128
350
+ max_downsample_channels: 1024
351
+ max_groups: 256
352
+ bias: true
353
+ downsample_scales:
354
+ - 4
355
+ - 4
356
+ - 4
357
+ - 4
358
+ nonlinear_activation: LeakyReLU
359
+ nonlinear_activation_params:
360
+ negative_slope: 0.1
361
+ use_weight_norm: true
362
+ use_spectral_norm: false
363
+ follow_official_norm: false
364
+ periods:
365
+ - 2
366
+ - 3
367
+ - 5
368
+ - 7
369
+ - 11
370
+ period_discriminator_params:
371
+ in_channels: 1
372
+ out_channels: 1
373
+ kernel_sizes:
374
+ - 5
375
+ - 3
376
+ channels: 32
377
+ downsample_scales:
378
+ - 3
379
+ - 3
380
+ - 3
381
+ - 3
382
+ - 1
383
+ max_downsample_channels: 1024
384
+ bias: true
385
+ nonlinear_activation: LeakyReLU
386
+ nonlinear_activation_params:
387
+ negative_slope: 0.1
388
+ use_weight_norm: true
389
+ use_spectral_norm: false
390
+ multi_freq_disc_params:
391
+ hidden_channels:
392
+ - 256
393
+ - 256
394
+ - 256
395
+ - 256
396
+ - 256
397
+ domain: double
398
+ mel_scale: true
399
+ divisors:
400
+ - 32
401
+ - 16
402
+ - 8
403
+ - 4
404
+ - 2
405
+ - 1
406
+ - 1
407
+ strides:
408
+ - 1
409
+ - 2
410
+ - 1
411
+ - 2
412
+ - 1
413
+ - 2
414
+ - 1
415
+ sample_rate: 44100
416
+ hop_lengths:
417
+ - 110
418
+ - 220
419
+ - 330
420
+ - 441
421
+ - 551
422
+ - 661
423
+ generator_adv_loss_params:
424
+ average_by_discriminators: false
425
+ loss_type: mse
426
+ discriminator_adv_loss_params:
427
+ average_by_discriminators: false
428
+ loss_type: mse
429
+ feat_match_loss_params:
430
+ average_by_discriminators: false
431
+ average_by_layers: false
432
+ include_final_outputs: true
433
+ mel_loss_params:
434
+ fs: 44100
435
+ n_fft: 2048
436
+ hop_length: 512
437
+ win_length: 2048
438
+ window: hann
439
+ n_mels: 80
440
+ fmin: 0
441
+ fmax: 22050
442
+ log_base: null
443
+ lambda_adv: 1.0
444
+ lambda_mel: 45.0
445
+ lambda_feat_match: 2.0
446
+ lambda_dur: 0.1
447
+ lambda_pitch: 10.0
448
+ lambda_phoneme: 1.0
449
+ lambda_kl: 1.0
450
+ sampling_rate: 44100
451
+ cache_generator_outputs: true
452
+ pitch_extract: dio
453
+ pitch_extract_conf:
454
+ use_token_averaged_f0: false
455
+ use_log_f0: false
456
+ fs: 44100
457
+ n_fft: 2048
458
+ hop_length: 512
459
+ f0max: 800
460
+ f0min: 80
461
+ pitch_normalize: null
462
+ pitch_normalize_conf:
463
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz
464
+ ying_extract: null
465
+ ying_extract_conf: {}
466
+ energy_extract: null
467
+ energy_extract_conf: {}
468
+ energy_normalize: null
469
+ energy_normalize_conf: {}
470
+ required:
471
+ - output_dir
472
+ - token_list
473
+ version: '202310'
474
+ distributed: false
exp/svs_train_visinger2_raw_phn_None_zh/images/discriminator_backward_time.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/discriminator_fake_loss.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/discriminator_forward_time.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/discriminator_loss.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/discriminator_optim_step_time.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/discriminator_real_loss.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/discriminator_train_time.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/generator_adv_loss.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/generator_backward_time.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/generator_feat_match_loss.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/generator_forward_time.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/generator_kl_loss.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/generator_loss.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/generator_mel_am_loss.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/generator_mel_ddsp_loss.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/generator_mel_loss.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/generator_optim_step_time.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/generator_phn_dur_loss.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/generator_pitch_loss.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/generator_score_dur_loss.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/generator_train_time.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/gpu_max_cached_mem_GB.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/iter_time.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/optim0_lr0.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/optim1_lr0.png ADDED
exp/svs_train_visinger2_raw_phn_None_zh/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202310'
2
+ files:
3
+ model_file: exp/svs_train_visinger2_raw_phn_None_zh/500epoch.pth
4
+ python: "3.9.16 (main, Mar 8 2023, 14:00:05) \n[GCC 11.2.0]"
5
+ timestamp: 1701182154.739273
6
+ torch: 1.13.1+cu117
7
+ yaml_files:
8
+ train_config: exp/svs_train_visinger2_raw_phn_None_zh/config.yaml