voidful committed on
Commit
63e3df4
1 Parent(s): 0b22158

Upload 21 files

Browse files
Files changed (21) hide show
  1. autoencoder/symAD_libritts_24000_hop300/checkpoint-1000000steps.pkl +3 -0
  2. autoencoder/symAD_libritts_24000_hop300/checkpoint-500000steps.pkl +3 -0
  3. autoencoder/symAD_libritts_24000_hop300/config.yml +189 -0
  4. autoencoder/symAD_vctk_48000_hop300/checkpoint-200000steps.pkl +3 -0
  5. autoencoder/symAD_vctk_48000_hop300/checkpoint-700000steps.pkl +3 -0
  6. autoencoder/symAD_vctk_48000_hop300/config.yml +194 -0
  7. autoencoder/symADuniv_vctk_48000_hop300/checkpoint-1000000steps.pkl +3 -0
  8. autoencoder/symADuniv_vctk_48000_hop300/checkpoint-500000steps.pkl +3 -0
  9. autoencoder/symADuniv_vctk_48000_hop300/config.yml +206 -0
  10. denoise/symAD_vctk_48000_hop300/checkpoint-200000steps.pkl +3 -0
  11. denoise/symAD_vctk_48000_hop300/config.yml +192 -0
  12. vocoder/AudioDec_v0_symAD_vctk_48000_hop300_clean/checkpoint-500000steps.pkl +3 -0
  13. vocoder/AudioDec_v0_symAD_vctk_48000_hop300_clean/config.yml +198 -0
  14. vocoder/AudioDec_v1_symAD_libritts_24000_hop300_clean/checkpoint-500000steps.pkl +3 -0
  15. vocoder/AudioDec_v1_symAD_libritts_24000_hop300_clean/config.yml +190 -0
  16. vocoder/AudioDec_v1_symAD_vctk_48000_hop300_clean/checkpoint-500000steps.pkl +3 -0
  17. vocoder/AudioDec_v1_symAD_vctk_48000_hop300_clean/config.yml +190 -0
  18. vocoder/AudioDec_v2_symAD_vctk_48000_hop300_clean/checkpoint-500000steps.pkl +3 -0
  19. vocoder/AudioDec_v2_symAD_vctk_48000_hop300_clean/config.yml +190 -0
  20. vocoder/AudioDec_v3_symADuniv_vctk_48000_hop300_clean/checkpoint-500000steps.pkl +3 -0
  21. vocoder/AudioDec_v3_symADuniv_vctk_48000_hop300_clean/config.yml +207 -0
autoencoder/symAD_libritts_24000_hop300/checkpoint-1000000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bd3718a90e2cc885b5eb755cc35809560ddfccead323c8f2c1f3a293bdc99e5
3
+ size 36366497
autoencoder/symAD_libritts_24000_hop300/checkpoint-500000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:563872c0c474205411cf637d3c512ba9c6d81f99526b29222a3bd81ae3be2a32
3
+ size 36366367
autoencoder/symAD_libritts_24000_hop300/config.yml ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ batch_length: 9600
2
+ batch_size: 16
3
+ config: config/autoencoder/symAD_libritts_24000_hop300.yaml
4
+ data:
5
+ path: /mnt/home/yichiaowu/datasets/LibriTTS/LibriTTS/24000
6
+ subset:
7
+ test: test-clean-1utt
8
+ train: train-clean-450
9
+ valid: dev-clean-1utt
10
+ disable_cudnn: 'False'
11
+ discriminator_adv_loss_params:
12
+ average_by_discriminators: false
13
+ discriminator_grad_norm: -1
14
+ discriminator_optimizer_params:
15
+ betas:
16
+ - 0.5
17
+ - 0.9
18
+ lr: 0.0002
19
+ weight_decay: 0.0
20
+ discriminator_optimizer_type: Adam
21
+ discriminator_params:
22
+ follow_official_norm: true
23
+ period_discriminator_params:
24
+ bias: true
25
+ channels: 32
26
+ downsample_scales:
27
+ - 3
28
+ - 3
29
+ - 3
30
+ - 3
31
+ - 1
32
+ in_channels: 1
33
+ kernel_sizes:
34
+ - 5
35
+ - 3
36
+ max_downsample_channels: 1024
37
+ nonlinear_activation: LeakyReLU
38
+ nonlinear_activation_params:
39
+ negative_slope: 0.1
40
+ out_channels: 1
41
+ use_spectral_norm: false
42
+ use_weight_norm: true
43
+ periods:
44
+ - 2
45
+ - 3
46
+ - 5
47
+ - 7
48
+ - 11
49
+ scale_discriminator_params:
50
+ bias: true
51
+ channels: 128
52
+ downsample_scales:
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 4
57
+ - 1
58
+ in_channels: 1
59
+ kernel_sizes:
60
+ - 15
61
+ - 41
62
+ - 5
63
+ - 3
64
+ max_downsample_channels: 1024
65
+ max_groups: 16
66
+ nonlinear_activation: LeakyReLU
67
+ nonlinear_activation_params:
68
+ negative_slope: 0.1
69
+ out_channels: 1
70
+ scale_downsample_pooling: AvgPool1d
71
+ scale_downsample_pooling_params:
72
+ kernel_size: 4
73
+ padding: 2
74
+ stride: 2
75
+ scales: 3
76
+ discriminator_scheduler_params:
77
+ gamma: 0.5
78
+ milestones:
79
+ - 200000
80
+ - 400000
81
+ - 600000
82
+ - 800000
83
+ discriminator_scheduler_type: MultiStepLR
84
+ eval_interval_steps: 1000
85
+ exp_root: exp
86
+ feat_match_loss_params:
87
+ average_by_discriminators: false
88
+ average_by_layers: false
89
+ include_final_outputs: false
90
+ generator_adv_loss_params:
91
+ average_by_discriminators: false
92
+ generator_grad_norm: -1
93
+ generator_optimizer_params:
94
+ betas:
95
+ - 0.5
96
+ - 0.9
97
+ lr: 0.0001
98
+ weight_decay: 0.0
99
+ generator_optimizer_type: Adam
100
+ generator_params:
101
+ bias: true
102
+ code_dim: 64
103
+ codebook_num: 8
104
+ codebook_size: 1024
105
+ codec: audiodec
106
+ dec_ratios:
107
+ - 16
108
+ - 8
109
+ - 4
110
+ - 2
111
+ dec_strides:
112
+ - 5
113
+ - 5
114
+ - 4
115
+ - 3
116
+ decode_channels: 32
117
+ enc_ratios:
118
+ - 2
119
+ - 4
120
+ - 8
121
+ - 16
122
+ enc_strides:
123
+ - 3
124
+ - 4
125
+ - 5
126
+ - 5
127
+ encode_channels: 32
128
+ input_channels: 1
129
+ mode: causal
130
+ output_channels: 1
131
+ projector: conv1d
132
+ quantier: residual_vq
133
+ generator_scheduler_params:
134
+ gamma: 1.0
135
+ step_size: 200000
136
+ generator_scheduler_type: StepLR
137
+ lambda_adv: 1.0
138
+ lambda_feat_match: 2.0
139
+ lambda_mel_loss: 45.0
140
+ lambda_shape_loss: 45.0
141
+ lambda_stft_loss: 45.0
142
+ lambda_vq_loss: 1.0
143
+ log_interval_steps: 100
144
+ mel_loss_params:
145
+ fft_size: 2048
146
+ fmax: 12000
147
+ fmin: 0
148
+ fs: 24000
149
+ hop_size: 300
150
+ log_base: null
151
+ num_mels: 80
152
+ win_length: null
153
+ window: hann
154
+ model_type: symAudioDec
155
+ num_workers: 2
156
+ outdir: exp/autoencoder/symAD_libritts_24000_hop300
157
+ paradigm: efficient
158
+ pin_memory: true
159
+ resume: ''
160
+ sampling_rate: 24000
161
+ save_interval_steps: 10000
162
+ seed: 1337
163
+ shape_loss_params:
164
+ winlen:
165
+ - 300
166
+ start_steps:
167
+ discriminator: 500000
168
+ generator: 0
169
+ stft_loss_params:
170
+ fft_sizes:
171
+ - 1024
172
+ - 2048
173
+ - 512
174
+ hop_sizes:
175
+ - 120
176
+ - 240
177
+ - 50
178
+ win_lengths:
179
+ - 600
180
+ - 1200
181
+ - 240
182
+ window: hann_window
183
+ tag: autoencoder/symAD_libritts_24000_hop300
184
+ train_max_steps: 1000000
185
+ train_mode: autoencoder
186
+ use_feat_match_loss: true
187
+ use_mel_loss: true
188
+ use_shape_loss: false
189
+ use_stft_loss: false
autoencoder/symAD_vctk_48000_hop300/checkpoint-200000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d591fa4e564f90d1d777aa6329a088c39fa3fca18a3cb0ea1c7515faa4e4db04
3
+ size 36366367
autoencoder/symAD_vctk_48000_hop300/checkpoint-700000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbbef9aefa87c68cb3416fdf805bd2e54afd6d7a9d6a65adf5dcaeb220b0c9df
3
+ size 36366367
autoencoder/symAD_vctk_48000_hop300/config.yml ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ adv_batch_length: 9600
2
+ adv_train_max_steps: 700000
3
+ batch_length: 9600
4
+ batch_size: 16
5
+ config: config/autoencoder/symAD_vctk_48000_hop300.yaml
6
+ data:
7
+ path: /mnt/home/yichiaowu/datasets/vctk_noisy/48000
8
+ subset:
9
+ test: clean_testset_wav
10
+ train: clean_trainset_84spk_wav
11
+ valid: clean_validset_84spk_wav
12
+ disable_cudnn: 'False'
13
+ discriminator_adv_loss_params:
14
+ average_by_discriminators: false
15
+ discriminator_grad_norm: -1
16
+ discriminator_optimizer_params:
17
+ betas:
18
+ - 0.5
19
+ - 0.9
20
+ lr: 0.0002
21
+ weight_decay: 0.0
22
+ discriminator_optimizer_type: Adam
23
+ discriminator_params:
24
+ follow_official_norm: true
25
+ period_discriminator_params:
26
+ bias: true
27
+ channels: 32
28
+ downsample_scales:
29
+ - 3
30
+ - 3
31
+ - 3
32
+ - 3
33
+ - 1
34
+ in_channels: 1
35
+ kernel_sizes:
36
+ - 5
37
+ - 3
38
+ max_downsample_channels: 1024
39
+ nonlinear_activation: LeakyReLU
40
+ nonlinear_activation_params:
41
+ negative_slope: 0.1
42
+ out_channels: 1
43
+ use_spectral_norm: false
44
+ use_weight_norm: true
45
+ periods:
46
+ - 2
47
+ - 3
48
+ - 5
49
+ - 7
50
+ - 11
51
+ scale_discriminator_params:
52
+ bias: true
53
+ channels: 128
54
+ downsample_scales:
55
+ - 4
56
+ - 4
57
+ - 4
58
+ - 4
59
+ - 1
60
+ in_channels: 1
61
+ kernel_sizes:
62
+ - 15
63
+ - 41
64
+ - 5
65
+ - 3
66
+ max_downsample_channels: 1024
67
+ max_groups: 16
68
+ nonlinear_activation: LeakyReLU
69
+ nonlinear_activation_params:
70
+ negative_slope: 0.1
71
+ out_channels: 1
72
+ scale_downsample_pooling: AvgPool1d
73
+ scale_downsample_pooling_params:
74
+ kernel_size: 4
75
+ padding: 2
76
+ stride: 2
77
+ scales: 3
78
+ discriminator_scheduler_params:
79
+ gamma: 0.5
80
+ milestones:
81
+ - 200000
82
+ - 400000
83
+ - 600000
84
+ - 800000
85
+ discriminator_scheduler_type: MultiStepLR
86
+ eval_interval_steps: 1000
87
+ exp_root: exp
88
+ feat_match_loss_params:
89
+ average_by_discriminators: false
90
+ average_by_layers: false
91
+ include_final_outputs: false
92
+ generator_adv_loss_params:
93
+ average_by_discriminators: false
94
+ generator_grad_norm: -1
95
+ generator_optimizer_params:
96
+ betas:
97
+ - 0.5
98
+ - 0.9
99
+ lr: 0.0001
100
+ weight_decay: 0.0
101
+ generator_optimizer_type: Adam
102
+ generator_params:
103
+ bias: true
104
+ code_dim: 64
105
+ codebook_num: 8
106
+ codebook_size: 1024
107
+ codec: audiodec
108
+ dec_ratios:
109
+ - 16
110
+ - 8
111
+ - 4
112
+ - 2
113
+ dec_strides:
114
+ - 5
115
+ - 5
116
+ - 4
117
+ - 3
118
+ decode_channels: 32
119
+ enc_ratios:
120
+ - 2
121
+ - 4
122
+ - 8
123
+ - 16
124
+ enc_strides:
125
+ - 3
126
+ - 4
127
+ - 5
128
+ - 5
129
+ encode_channels: 32
130
+ input_channels: 1
131
+ mode: causal
132
+ output_channels: 1
133
+ projector: conv1d
134
+ quantier: residual_vq
135
+ generator_scheduler_params:
136
+ gamma: 1.0
137
+ step_size: 200000
138
+ generator_scheduler_type: StepLR
139
+ lambda_adv: 1.0
140
+ lambda_feat_match: 2.0
141
+ lambda_mel_loss: 45.0
142
+ lambda_shape_loss: 45.0
143
+ lambda_stft_loss: 45.0
144
+ lambda_vq_loss: 1.0
145
+ log_interval_steps: 100
146
+ mel_loss_params:
147
+ fft_sizes:
148
+ - 2048
149
+ fmax: 24000
150
+ fmin: 0
151
+ fs: 48000
152
+ hop_sizes:
153
+ - 300
154
+ log_base: null
155
+ num_mels: 80
156
+ win_lengths:
157
+ - 2048
158
+ window: hann_window
159
+ model_type: symAudioDec
160
+ num_workers: 2
161
+ outdir: exp/autoencoder/symAD_vctk_48000_hop300
162
+ paradigm: efficient
163
+ pin_memory: true
164
+ resume: exp/autoencoder/symAD_vctk_48000_hop300/checkpoint-200000steps.pkl
165
+ sampling_rate: 48000
166
+ save_interval_steps: 100000
167
+ seed: 1337
168
+ shape_loss_params:
169
+ winlen:
170
+ - 300
171
+ start_steps:
172
+ discriminator: 200000
173
+ generator: 0
174
+ stft_loss_params:
175
+ fft_sizes:
176
+ - 1024
177
+ - 2048
178
+ - 512
179
+ hop_sizes:
180
+ - 120
181
+ - 240
182
+ - 50
183
+ win_lengths:
184
+ - 600
185
+ - 1200
186
+ - 240
187
+ window: hann_window
188
+ tag: autoencoder/symAD_vctk_48000_hop300
189
+ train_max_steps: 200000
190
+ train_mode: autoencoder
191
+ use_feat_match_loss: true
192
+ use_mel_loss: true
193
+ use_shape_loss: false
194
+ use_stft_loss: false
autoencoder/symADuniv_vctk_48000_hop300/checkpoint-1000000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccc9ef13cf2ff5a31f3b0d6e19252d5bd68e3ee83aeee3263c2e9fddf7ca8da6
3
+ size 36366497
autoencoder/symADuniv_vctk_48000_hop300/checkpoint-500000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:937d96b0d9e4d9c6413ebc3039ef2049fdac6008fa98168d14f31e49d07b5e63
3
+ size 36366367
autoencoder/symADuniv_vctk_48000_hop300/config.yml ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ batch_length: 9600
2
+ batch_size: 16
3
+ config: config/autoencoder/symADuniv_vctk_48000_hop300.yaml
4
+ data:
5
+ path: /mnt/home/yichiaowu/datasets/vctk_noisy/48000
6
+ subset:
7
+ test: clean_testset_wav
8
+ train: clean_trainset_84spk_wav
9
+ valid: clean_validset_84spk_wav
10
+ disable_cudnn: 'False'
11
+ discriminator_adv_loss_params:
12
+ average_by_discriminators: false
13
+ discriminator_grad_norm: -1
14
+ discriminator_optimizer_params:
15
+ betas:
16
+ - 0.5
17
+ - 0.9
18
+ lr: 0.0002
19
+ weight_decay: 0.0
20
+ discriminator_optimizer_type: Adam
21
+ discriminator_params:
22
+ fft_sizes:
23
+ - 1024
24
+ - 2048
25
+ - 512
26
+ hop_sizes:
27
+ - 120
28
+ - 240
29
+ - 50
30
+ period_discriminator_params:
31
+ bias: true
32
+ channels: 32
33
+ downsample_scales:
34
+ - 3
35
+ - 3
36
+ - 3
37
+ - 3
38
+ - 1
39
+ in_channels: 1
40
+ kernel_sizes:
41
+ - 5
42
+ - 3
43
+ max_downsample_channels: 1024
44
+ nonlinear_activation: LeakyReLU
45
+ nonlinear_activation_params:
46
+ negative_slope: 0.1
47
+ out_channels: 1
48
+ use_spectral_norm: false
49
+ use_weight_norm: true
50
+ periods:
51
+ - 2
52
+ - 3
53
+ - 5
54
+ - 7
55
+ - 11
56
+ spectral_discriminator_params:
57
+ bias: true
58
+ channels: 32
59
+ kernel_sizes:
60
+ - - 3
61
+ - 9
62
+ - - 3
63
+ - 9
64
+ - - 3
65
+ - 9
66
+ - - 3
67
+ - 9
68
+ - - 3
69
+ - 3
70
+ - - 3
71
+ - 3
72
+ nonlinear_activation: LeakyReLU
73
+ nonlinear_activation_params:
74
+ negative_slope: 0.2
75
+ strides:
76
+ - - 1
77
+ - 1
78
+ - - 1
79
+ - 2
80
+ - - 1
81
+ - 2
82
+ - - 1
83
+ - 2
84
+ - - 1
85
+ - 1
86
+ - - 1
87
+ - 1
88
+ win_lengths:
89
+ - 600
90
+ - 1200
91
+ - 240
92
+ window: hann_window
93
+ discriminator_scheduler_params:
94
+ gamma: 0.5
95
+ milestones:
96
+ - 200000
97
+ - 400000
98
+ - 600000
99
+ - 800000
100
+ discriminator_scheduler_type: MultiStepLR
101
+ eval_interval_steps: 1000
102
+ exp_root: exp
103
+ feat_match_loss_params:
104
+ average_by_discriminators: false
105
+ average_by_layers: false
106
+ include_final_outputs: false
107
+ generator_adv_loss_params:
108
+ average_by_discriminators: false
109
+ generator_grad_norm: -1
110
+ generator_optimizer_params:
111
+ betas:
112
+ - 0.5
113
+ - 0.9
114
+ lr: 0.0001
115
+ weight_decay: 0.0
116
+ generator_optimizer_type: Adam
117
+ generator_params:
118
+ bias: true
119
+ code_dim: 64
120
+ codebook_num: 8
121
+ codebook_size: 1024
122
+ codec: audiodec
123
+ dec_ratios:
124
+ - 16
125
+ - 8
126
+ - 4
127
+ - 2
128
+ dec_strides:
129
+ - 5
130
+ - 5
131
+ - 4
132
+ - 3
133
+ decode_channels: 32
134
+ enc_ratios:
135
+ - 2
136
+ - 4
137
+ - 8
138
+ - 16
139
+ enc_strides:
140
+ - 3
141
+ - 4
142
+ - 5
143
+ - 5
144
+ encode_channels: 32
145
+ input_channels: 1
146
+ mode: causal
147
+ output_channels: 1
148
+ projector: conv1d
149
+ quantier: residual_vq
150
+ generator_scheduler_params:
151
+ gamma: 1.0
152
+ step_size: 200000
153
+ generator_scheduler_type: StepLR
154
+ lambda_adv: 1.0
155
+ lambda_feat_match: 2.0
156
+ lambda_mel_loss: 45.0
157
+ lambda_shape_loss: 45.0
158
+ lambda_stft_loss: 45.0
159
+ lambda_vq_loss: 1.0
160
+ log_interval_steps: 100
161
+ mel_loss_params:
162
+ fft_size: 2048
163
+ fmax: 24000
164
+ fmin: 0
165
+ fs: 48000
166
+ hop_size: 300
167
+ log_base: null
168
+ num_mels: 80
169
+ win_length: null
170
+ window: hann
171
+ model_type: symAudioDecUniv
172
+ num_workers: 2
173
+ outdir: exp/autoencoder/symADuniv_vctk_48000_hop300
174
+ paradigm: efficient
175
+ pin_memory: true
176
+ resume: ''
177
+ sampling_rate: 48000
178
+ save_interval_steps: 10000
179
+ seed: 1337
180
+ shape_loss_params:
181
+ winlen:
182
+ - 300
183
+ start_steps:
184
+ discriminator: 500000
185
+ generator: 0
186
+ stft_loss_params:
187
+ fft_sizes:
188
+ - 1024
189
+ - 2048
190
+ - 512
191
+ hop_sizes:
192
+ - 120
193
+ - 240
194
+ - 50
195
+ win_lengths:
196
+ - 600
197
+ - 1200
198
+ - 240
199
+ window: hann_window
200
+ tag: autoencoder/symADuniv_vctk_48000_hop300
201
+ train_max_steps: 1000000
202
+ train_mode: autoencoder
203
+ use_feat_match_loss: true
204
+ use_mel_loss: true
205
+ use_shape_loss: false
206
+ use_stft_loss: false
denoise/symAD_vctk_48000_hop300/checkpoint-200000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04831490f33212cefdf58b53d50c612440bbfa1474a860ad9535a5dff99e0930
3
+ size 36366367
denoise/symAD_vctk_48000_hop300/config.yml ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ batch_length: 96000
2
+ batch_size: 16
3
+ config: config/denoise/symAD_vctk_48000_hop300.yaml
4
+ data:
5
+ path: /mnt/home/yichiaowu/datasets/vctk_noisy/48000
6
+ subset:
7
+ clean_test: clean_testset_wav
8
+ clean_train: clean_trainset_84spk_wav
9
+ clean_valid: clean_validset_84spk_wav
10
+ noisy_test: noisy_testset_wav
11
+ noisy_train: noisy_trainset_84spk_wav
12
+ noisy_valid: noisy_validset_84spk_wav
13
+ disable_cudnn: 'False'
14
+ discriminator_adv_loss_params:
15
+ average_by_discriminators: false
16
+ discriminator_grad_norm: -1
17
+ discriminator_optimizer_params:
18
+ betas:
19
+ - 0.5
20
+ - 0.9
21
+ lr: 0.0002
22
+ weight_decay: 0.0
23
+ discriminator_optimizer_type: Adam
24
+ discriminator_params:
25
+ follow_official_norm: true
26
+ period_discriminator_params:
27
+ bias: true
28
+ channels: 32
29
+ downsample_scales:
30
+ - 3
31
+ - 3
32
+ - 3
33
+ - 3
34
+ - 1
35
+ in_channels: 1
36
+ kernel_sizes:
37
+ - 5
38
+ - 3
39
+ max_downsample_channels: 1024
40
+ nonlinear_activation: LeakyReLU
41
+ nonlinear_activation_params:
42
+ negative_slope: 0.1
43
+ out_channels: 1
44
+ use_spectral_norm: false
45
+ use_weight_norm: true
46
+ periods:
47
+ - 2
48
+ - 3
49
+ - 5
50
+ - 7
51
+ - 11
52
+ scale_discriminator_params:
53
+ bias: true
54
+ channels: 128
55
+ downsample_scales:
56
+ - 4
57
+ - 4
58
+ - 4
59
+ - 4
60
+ - 1
61
+ in_channels: 1
62
+ kernel_sizes:
63
+ - 15
64
+ - 41
65
+ - 5
66
+ - 3
67
+ max_downsample_channels: 1024
68
+ max_groups: 16
69
+ nonlinear_activation: LeakyReLU
70
+ nonlinear_activation_params:
71
+ negative_slope: 0.1
72
+ out_channels: 1
73
+ scale_downsample_pooling: AvgPool1d
74
+ scale_downsample_pooling_params:
75
+ kernel_size: 4
76
+ padding: 2
77
+ stride: 2
78
+ scales: 3
79
+ discriminator_scheduler_params:
80
+ gamma: 0.5
81
+ milestones:
82
+ - 200000
83
+ - 400000
84
+ - 600000
85
+ - 800000
86
+ discriminator_scheduler_type: MultiStepLR
87
+ eval_interval_steps: 1000
88
+ exp_root: exp
89
+ feat_match_loss_params:
90
+ average_by_discriminators: false
91
+ average_by_layers: false
92
+ include_final_outputs: false
93
+ generator_adv_loss_params:
94
+ average_by_discriminators: false
95
+ generator_grad_norm: -1
96
+ generator_optimizer_params:
97
+ betas:
98
+ - 0.5
99
+ - 0.9
100
+ lr: 0.0001
101
+ weight_decay: 0.0
102
+ generator_optimizer_type: Adam
103
+ generator_params:
104
+ bias: true
105
+ code_dim: 64
106
+ codebook_num: 8
107
+ codebook_size: 1024
108
+ codec: audiodec
109
+ dec_ratios:
110
+ - 16
111
+ - 8
112
+ - 4
113
+ - 2
114
+ dec_strides:
115
+ - 5
116
+ - 5
117
+ - 4
118
+ - 3
119
+ decode_channels: 32
120
+ enc_ratios:
121
+ - 2
122
+ - 4
123
+ - 8
124
+ - 16
125
+ enc_strides:
126
+ - 3
127
+ - 4
128
+ - 5
129
+ - 5
130
+ encode_channels: 32
131
+ input_channels: 1
132
+ mode: causal
133
+ output_channels: 1
134
+ projector: conv1d
135
+ quantier: residual_vq
136
+ generator_scheduler_params:
137
+ gamma: 1.0
138
+ step_size: 200000
139
+ generator_scheduler_type: StepLR
140
+ initial: exp/autoencoder/symAD_vctk_48000_hop300/checkpoint-200000steps.pkl
141
+ lambda_adv: 1.0
142
+ lambda_feat_match: 2.0
143
+ lambda_mel_loss: 45.0
144
+ lambda_shape_loss: 45.0
145
+ lambda_stft_loss: 45.0
146
+ lambda_vq_loss: 1.0
147
+ log_interval_steps: 100
148
+ mel_loss_params:
149
+ fft_size: 2048
150
+ fmax: 24000
151
+ fmin: 0
152
+ fs: 48000
153
+ hop_size: 300
154
+ log_base: null
155
+ num_mels: 80
156
+ win_length: null
157
+ window: hann
158
+ model_type: symAudioDec
159
+ num_workers: 2
160
+ outdir: exp/denoise/symAD_vctk_48000_hop300
161
+ pin_memory: true
162
+ resume: ''
163
+ sampling_rate: 48000
164
+ save_interval_steps: 10000
165
+ seed: 1337
166
+ shape_loss_params:
167
+ winlen:
168
+ - 300
169
+ start_steps:
170
+ discriminator: 200000
171
+ generator: 0
172
+ stft_loss_params:
173
+ fft_sizes:
174
+ - 1024
175
+ - 2048
176
+ - 512
177
+ hop_sizes:
178
+ - 120
179
+ - 240
180
+ - 50
181
+ win_lengths:
182
+ - 600
183
+ - 1200
184
+ - 240
185
+ window: hann_window
186
+ tag: denoise/symAD_vctk_48000_hop300
187
+ train_max_steps: 200000
188
+ train_mode: denoise
189
+ use_feat_match_loss: true
190
+ use_mel_loss: true
191
+ use_shape_loss: false
192
+ use_stft_loss: false
vocoder/AudioDec_v0_symAD_vctk_48000_hop300_clean/checkpoint-500000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09cf10d7c23beba1eaead494e487890d5f74766f3e56c58783de5baddc1a1a2d
3
+ size 52266385
vocoder/AudioDec_v0_symAD_vctk_48000_hop300_clean/config.yml ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ analyzer: exp/autoencoder/symAD_vctk_48000_hop300/checkpoint-200000steps.pkl
2
+ batch_length: 9600
3
+ batch_size: 16
4
+ config: config/vocoder/AudioDec_v0_symAD_vctk_48000_hop300_clean.yaml
5
+ data:
6
+ path: /mnt/home/yichiaowu/datasets/vctk_noisy/48000
7
+ subset:
8
+ test: clean_testset_wav
9
+ train: clean_trainset_84spk_wav
10
+ valid: clean_validset_84spk_wav
11
+ disable_cudnn: 'False'
12
+ discriminator_adv_loss_params:
13
+ average_by_discriminators: false
14
+ discriminator_grad_norm: -1
15
+ discriminator_optimizer_params:
16
+ betas:
17
+ - 0.5
18
+ - 0.9
19
+ lr: 0.0002
20
+ weight_decay: 0.0
21
+ discriminator_optimizer_type: Adam
22
+ discriminator_params:
23
+ follow_official_norm: true
24
+ period_discriminator_params:
25
+ bias: true
26
+ channels: 32
27
+ downsample_scales:
28
+ - 3
29
+ - 3
30
+ - 3
31
+ - 3
32
+ - 1
33
+ in_channels: 1
34
+ kernel_sizes:
35
+ - 5
36
+ - 3
37
+ max_downsample_channels: 1024
38
+ nonlinear_activation: LeakyReLU
39
+ nonlinear_activation_params:
40
+ negative_slope: 0.1
41
+ out_channels: 1
42
+ use_spectral_norm: false
43
+ use_weight_norm: true
44
+ periods:
45
+ - 2
46
+ - 3
47
+ - 5
48
+ - 7
49
+ - 11
50
+ scale_discriminator_params:
51
+ bias: true
52
+ channels: 128
53
+ downsample_scales:
54
+ - 4
55
+ - 4
56
+ - 4
57
+ - 4
58
+ - 1
59
+ in_channels: 1
60
+ kernel_sizes:
61
+ - 15
62
+ - 41
63
+ - 5
64
+ - 3
65
+ max_downsample_channels: 1024
66
+ max_groups: 16
67
+ nonlinear_activation: LeakyReLU
68
+ nonlinear_activation_params:
69
+ negative_slope: 0.1
70
+ out_channels: 1
71
+ scale_downsample_pooling: AvgPool1d
72
+ scale_downsample_pooling_params:
73
+ kernel_size: 4
74
+ padding: 2
75
+ stride: 2
76
+ scales: 3
77
+ discriminator_scheduler_params:
78
+ gamma: 0.5
79
+ milestones:
80
+ - 200000
81
+ - 400000
82
+ - 600000
83
+ - 800000
84
+ discriminator_scheduler_type: MultiStepLR
85
+ discriminator_train_start_steps: 0
86
+ eval_interval_steps: 1000
87
+ exp_root: exp
88
+ feat_match_loss_params:
89
+ average_by_discriminators: false
90
+ average_by_layers: false
91
+ include_final_outputs: false
92
+ generator_adv_loss_params:
93
+ average_by_discriminators: false
94
+ generator_grad_norm: -1
95
+ generator_optimizer_params:
96
+ betas:
97
+ - 0.5
98
+ - 0.9
99
+ lr: 0.0002
100
+ weight_decay: 0.0
101
+ generator_optimizer_type: Adam
102
+ generator_params:
103
+ bias: true
104
+ channels: 512
105
+ groups: 1
106
+ in_channels: 64
107
+ kernel_size: 7
108
+ nonlinear_activation: LeakyReLU
109
+ nonlinear_activation_params:
110
+ negative_slope: 0.1
111
+ out_channels: 1
112
+ resblock_dilations:
113
+ - - 1
114
+ - 3
115
+ - 5
116
+ - - 1
117
+ - 3
118
+ - 5
119
+ - - 1
120
+ - 3
121
+ - 5
122
+ resblock_kernel_sizes:
123
+ - 3
124
+ - 7
125
+ - 11
126
+ stats: stats/symAD_vctk_48000_hop300_clean.npy
127
+ upsample_kernel_sizes:
128
+ - 10
129
+ - 10
130
+ - 8
131
+ - 6
132
+ upsample_scales:
133
+ - 5
134
+ - 5
135
+ - 4
136
+ - 3
137
+ use_additional_convs: true
138
+ use_weight_norm: true
139
+ generator_scheduler_params:
140
+ gamma: 0.5
141
+ milestones:
142
+ - 200000
143
+ - 400000
144
+ - 600000
145
+ - 800000
146
+ generator_scheduler_type: MultiStepLR
147
+ generator_train_start_steps: 1
148
+ lambda_adv: 1.0
149
+ lambda_feat_match: 2.0
150
+ lambda_mel_loss: 45.0
151
+ lambda_shape_loss: 45.0
152
+ lambda_stft_loss: 45.0
153
+ log_interval_steps: 100
154
+ mel_loss_params:
155
+ fft_sizes:
156
+ - 2048
157
+ fmax: 24000
158
+ fmin: 0
159
+ fs: 48000
160
+ hop_sizes:
161
+ - 300
162
+ log_base: null
163
+ num_mels: 80
164
+ win_lengths:
165
+ - 2048
166
+ window: hann_window
167
+ model_type: HiFiGAN
168
+ num_workers: 2
169
+ outdir: exp/vocoder/AudioDec_v0_symAD_vctk_48000_hop300_clean
170
+ pin_memory: true
171
+ resume: ''
172
+ sampling_rate: 48000
173
+ save_interval_steps: 100000
174
+ seed: 1337
175
+ shape_loss_params:
176
+ winlen:
177
+ - 300
178
+ stft_loss_params:
179
+ fft_sizes:
180
+ - 1024
181
+ - 2048
182
+ - 512
183
+ hop_sizes:
184
+ - 120
185
+ - 240
186
+ - 50
187
+ win_lengths:
188
+ - 600
189
+ - 1200
190
+ - 240
191
+ window: hann_window
192
+ tag: vocoder/AudioDec_v0_symAD_vctk_48000_hop300_clean
193
+ train_max_steps: 500000
194
+ train_mode: vocoder
195
+ use_feat_match_loss: true
196
+ use_mel_loss: true
197
+ use_shape_loss: false
198
+ use_stft_loss: false
vocoder/AudioDec_v1_symAD_libritts_24000_hop300_clean/checkpoint-500000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fa357fbad89213ab58c967598844f5f0ec77d6f0cfec31b285b252455eb5256
3
+ size 78587449
vocoder/AudioDec_v1_symAD_libritts_24000_hop300_clean/config.yml ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ analyzer: exp/autoencoder/symAD_libritts_24000_hop300/checkpoint-500000steps.pkl
2
+ batch_length: 9600
3
+ batch_size: 16
4
+ config: config/vocoder/AudioDec_v1_symAD_libritts_24000_hop300_clean.yaml
5
+ data:
6
+ path: /mnt/home/yichiaowu/datasets/LibriTTS/LibriTTS/24000
7
+ subset:
8
+ test: test-clean-1utt
9
+ train: train-clean-450
10
+ valid: dev-clean-1utt
11
+ disable_cudnn: 'False'
12
+ discriminator_adv_loss_params:
13
+ average_by_discriminators: false
14
+ discriminator_grad_norm: -1
15
+ discriminator_optimizer_params:
16
+ betas:
17
+ - 0.5
18
+ - 0.9
19
+ lr: 0.0002
20
+ weight_decay: 0.0
21
+ discriminator_optimizer_type: Adam
22
+ discriminator_params:
23
+ follow_official_norm: true
24
+ period_discriminator_params:
25
+ bias: true
26
+ channels: 32
27
+ downsample_scales:
28
+ - 3
29
+ - 3
30
+ - 3
31
+ - 3
32
+ - 1
33
+ in_channels: 1
34
+ kernel_sizes:
35
+ - 5
36
+ - 3
37
+ max_downsample_channels: 1024
38
+ nonlinear_activation: LeakyReLU
39
+ nonlinear_activation_params:
40
+ negative_slope: 0.1
41
+ out_channels: 1
42
+ use_spectral_norm: false
43
+ use_weight_norm: true
44
+ periods:
45
+ - 2
46
+ - 3
47
+ - 5
48
+ - 7
49
+ - 11
50
+ scale_discriminator_params:
51
+ bias: true
52
+ channels: 128
53
+ downsample_scales:
54
+ - 4
55
+ - 4
56
+ - 4
57
+ - 4
58
+ - 1
59
+ in_channels: 1
60
+ kernel_sizes:
61
+ - 15
62
+ - 41
63
+ - 5
64
+ - 3
65
+ max_downsample_channels: 1024
66
+ max_groups: 16
67
+ nonlinear_activation: LeakyReLU
68
+ nonlinear_activation_params:
69
+ negative_slope: 0.1
70
+ out_channels: 1
71
+ scale_downsample_pooling: AvgPool1d
72
+ scale_downsample_pooling_params:
73
+ kernel_size: 4
74
+ padding: 2
75
+ stride: 2
76
+ scales: 3
77
+ discriminator_scheduler_params:
78
+ gamma: 0.5
79
+ milestones:
80
+ - 200000
81
+ - 400000
82
+ - 600000
83
+ - 800000
84
+ discriminator_scheduler_type: MultiStepLR
85
+ discriminator_train_start_steps: 0
86
+ eval_interval_steps: 1000
87
+ exp_root: exp
88
+ feat_match_loss_params:
89
+ average_by_discriminators: false
90
+ average_by_layers: false
91
+ include_final_outputs: false
92
+ generator_adv_loss_params:
93
+ average_by_discriminators: false
94
+ generator_grad_norm: -1
95
+ generator_optimizer_params:
96
+ betas:
97
+ - 0.5
98
+ - 0.9
99
+ lr: 0.0002
100
+ weight_decay: 0.0
101
+ generator_optimizer_type: Adam
102
+ generator_params:
103
+ bias: true
104
+ channels: 512
105
+ groups: 3
106
+ in_channels: 64
107
+ kernel_size: 7
108
+ nonlinear_activation: LeakyReLU
109
+ nonlinear_activation_params:
110
+ negative_slope: 0.1
111
+ out_channels: 1
112
+ resblock_dilations:
113
+ - - 1
114
+ - 3
115
+ - 5
116
+ resblock_kernel_sizes:
117
+ - 11
118
+ stats: stats/symAD_libritts_24000_hop300_clean.npy
119
+ upsample_kernel_sizes:
120
+ - 10
121
+ - 10
122
+ - 8
123
+ - 6
124
+ upsample_scales:
125
+ - 5
126
+ - 5
127
+ - 4
128
+ - 3
129
+ use_additional_convs: true
130
+ use_weight_norm: true
131
+ generator_scheduler_params:
132
+ gamma: 0.5
133
+ milestones:
134
+ - 200000
135
+ - 400000
136
+ - 600000
137
+ - 800000
138
+ generator_scheduler_type: MultiStepLR
139
+ generator_train_start_steps: 1
140
+ lambda_adv: 1.0
141
+ lambda_feat_match: 2.0
142
+ lambda_mel_loss: 45.0
143
+ lambda_shape_loss: 45.0
144
+ lambda_stft_loss: 45.0
145
+ log_interval_steps: 100
146
+ mel_loss_params:
147
+ fft_sizes:
148
+ - 2048
149
+ fmax: 12000
150
+ fmin: 0
151
+ fs: 24000
152
+ hop_sizes:
153
+ - 300
154
+ log_base: null
155
+ num_mels: 80
156
+ win_lengths:
157
+ - 2048
158
+ window: hann_window
159
+ model_type: HiFiGAN
160
+ num_workers: 2
161
+ outdir: exp/vocoder/AudioDec_v1_symAD_libritts_24000_hop300_clean
162
+ pin_memory: true
163
+ resume: ''
164
+ sampling_rate: 24000
165
+ save_interval_steps: 100000
166
+ seed: 1337
167
+ shape_loss_params:
168
+ winlen:
169
+ - 300
170
+ stft_loss_params:
171
+ fft_sizes:
172
+ - 1024
173
+ - 2048
174
+ - 512
175
+ hop_sizes:
176
+ - 120
177
+ - 240
178
+ - 50
179
+ win_lengths:
180
+ - 600
181
+ - 1200
182
+ - 240
183
+ window: hann_window
184
+ tag: vocoder/AudioDec_v1_symAD_libritts_24000_hop300_clean
185
+ train_max_steps: 500000
186
+ train_mode: vocoder
187
+ use_feat_match_loss: true
188
+ use_mel_loss: true
189
+ use_shape_loss: false
190
+ use_stft_loss: false
vocoder/AudioDec_v1_symAD_vctk_48000_hop300_clean/checkpoint-500000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa8fc5367217723a24786bc5eb350e5f8e039255669797873938ca3ba0d22b2d
3
+ size 78587449
vocoder/AudioDec_v1_symAD_vctk_48000_hop300_clean/config.yml ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ analyzer: exp/autoencoder/symAD_vctk_48000_hop300/checkpoint-200000steps.pkl
2
+ batch_length: 9600
3
+ batch_size: 16
4
+ config: config/vocoder/AudioDec_v1_symAD_vctk_48000_hop300_clean.yaml
5
+ data:
6
+ path: /mnt/home/yichiaowu/datasets/vctk_noisy/48000
7
+ subset:
8
+ test: clean_testset_wav
9
+ train: clean_trainset_84spk_wav
10
+ valid: clean_validset_84spk_wav
11
+ disable_cudnn: 'False'
12
+ discriminator_adv_loss_params:
13
+ average_by_discriminators: false
14
+ discriminator_grad_norm: -1
15
+ discriminator_optimizer_params:
16
+ betas:
17
+ - 0.5
18
+ - 0.9
19
+ lr: 0.0002
20
+ weight_decay: 0.0
21
+ discriminator_optimizer_type: Adam
22
+ discriminator_params:
23
+ follow_official_norm: true
24
+ period_discriminator_params:
25
+ bias: true
26
+ channels: 32
27
+ downsample_scales:
28
+ - 3
29
+ - 3
30
+ - 3
31
+ - 3
32
+ - 1
33
+ in_channels: 1
34
+ kernel_sizes:
35
+ - 5
36
+ - 3
37
+ max_downsample_channels: 1024
38
+ nonlinear_activation: LeakyReLU
39
+ nonlinear_activation_params:
40
+ negative_slope: 0.1
41
+ out_channels: 1
42
+ use_spectral_norm: false
43
+ use_weight_norm: true
44
+ periods:
45
+ - 2
46
+ - 3
47
+ - 5
48
+ - 7
49
+ - 11
50
+ scale_discriminator_params:
51
+ bias: true
52
+ channels: 128
53
+ downsample_scales:
54
+ - 4
55
+ - 4
56
+ - 4
57
+ - 4
58
+ - 1
59
+ in_channels: 1
60
+ kernel_sizes:
61
+ - 15
62
+ - 41
63
+ - 5
64
+ - 3
65
+ max_downsample_channels: 1024
66
+ max_groups: 16
67
+ nonlinear_activation: LeakyReLU
68
+ nonlinear_activation_params:
69
+ negative_slope: 0.1
70
+ out_channels: 1
71
+ scale_downsample_pooling: AvgPool1d
72
+ scale_downsample_pooling_params:
73
+ kernel_size: 4
74
+ padding: 2
75
+ stride: 2
76
+ scales: 3
77
+ discriminator_scheduler_params:
78
+ gamma: 0.5
79
+ milestones:
80
+ - 200000
81
+ - 400000
82
+ - 600000
83
+ - 800000
84
+ discriminator_scheduler_type: MultiStepLR
85
+ discriminator_train_start_steps: 0
86
+ eval_interval_steps: 1000
87
+ exp_root: exp
88
+ feat_match_loss_params:
89
+ average_by_discriminators: false
90
+ average_by_layers: false
91
+ include_final_outputs: false
92
+ generator_adv_loss_params:
93
+ average_by_discriminators: false
94
+ generator_grad_norm: -1
95
+ generator_optimizer_params:
96
+ betas:
97
+ - 0.5
98
+ - 0.9
99
+ lr: 0.0002
100
+ weight_decay: 0.0
101
+ generator_optimizer_type: Adam
102
+ generator_params:
103
+ bias: true
104
+ channels: 512
105
+ groups: 3
106
+ in_channels: 64
107
+ kernel_size: 7
108
+ nonlinear_activation: LeakyReLU
109
+ nonlinear_activation_params:
110
+ negative_slope: 0.1
111
+ out_channels: 1
112
+ resblock_dilations:
113
+ - - 1
114
+ - 3
115
+ - 5
116
+ resblock_kernel_sizes:
117
+ - 11
118
+ stats: stats/symAD_vctk_48000_hop300_clean.npy
119
+ upsample_kernel_sizes:
120
+ - 10
121
+ - 10
122
+ - 8
123
+ - 6
124
+ upsample_scales:
125
+ - 5
126
+ - 5
127
+ - 4
128
+ - 3
129
+ use_additional_convs: true
130
+ use_weight_norm: true
131
+ generator_scheduler_params:
132
+ gamma: 0.5
133
+ milestones:
134
+ - 200000
135
+ - 400000
136
+ - 600000
137
+ - 800000
138
+ generator_scheduler_type: MultiStepLR
139
+ generator_train_start_steps: 1
140
+ lambda_adv: 1.0
141
+ lambda_feat_match: 2.0
142
+ lambda_mel_loss: 45.0
143
+ lambda_shape_loss: 45.0
144
+ lambda_stft_loss: 45.0
145
+ log_interval_steps: 100
146
+ mel_loss_params:
147
+ fft_sizes:
148
+ - 2048
149
+ fmax: 24000
150
+ fmin: 0
151
+ fs: 48000
152
+ hop_sizes:
153
+ - 300
154
+ log_base: null
155
+ num_mels: 80
156
+ win_lengths:
157
+ - 2048
158
+ window: hann_window
159
+ model_type: HiFiGAN
160
+ num_workers: 2
161
+ outdir: exp/vocoder/AudioDec_v1_symAD_vctk_48000_hop300_clean
162
+ pin_memory: true
163
+ resume: exp/vocoder/AudioDec_v1_symAD_vctk_48000_hop300_clean/checkpoint-100000steps.pkl
164
+ sampling_rate: 48000
165
+ save_interval_steps: 100000
166
+ seed: 1337
167
+ shape_loss_params:
168
+ winlen:
169
+ - 300
170
+ stft_loss_params:
171
+ fft_sizes:
172
+ - 1024
173
+ - 2048
174
+ - 512
175
+ hop_sizes:
176
+ - 120
177
+ - 240
178
+ - 50
179
+ win_lengths:
180
+ - 600
181
+ - 1200
182
+ - 240
183
+ window: hann_window
184
+ tag: vocoder/AudioDec_v1_symAD_vctk_48000_hop300_clean
185
+ train_max_steps: 500000
186
+ train_mode: vocoder
187
+ use_feat_match_loss: true
188
+ use_mel_loss: true
189
+ use_shape_loss: false
190
+ use_stft_loss: false
vocoder/AudioDec_v2_symAD_vctk_48000_hop300_clean/checkpoint-500000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57128a319bc38cbc77fada84dd10bca399f0b884d03f5e1635c17cef5fdfd1bd
3
+ size 27899385
vocoder/AudioDec_v2_symAD_vctk_48000_hop300_clean/config.yml ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ analyzer: exp/autoencoder/symAD_vctk_48000_hop300/checkpoint-200000steps.pkl
2
+ batch_length: 9600
3
+ batch_size: 16
4
+ config: config/vocoder/AudioDec_v2_symAD_vctk_48000_hop300_clean.yaml
5
+ data:
6
+ path: /mnt/home/yichiaowu/datasets/vctk_noisy/48000
7
+ subset:
8
+ test: clean_testset_wav
9
+ train: clean_trainset_84spk_wav
10
+ valid: clean_validset_84spk_wav
11
+ disable_cudnn: 'False'
12
+ discriminator_adv_loss_params:
13
+ average_by_discriminators: false
14
+ discriminator_grad_norm: -1
15
+ discriminator_optimizer_params:
16
+ betas:
17
+ - 0.5
18
+ - 0.9
19
+ lr: 0.0002
20
+ weight_decay: 0.0
21
+ discriminator_optimizer_type: Adam
22
+ discriminator_params:
23
+ follow_official_norm: true
24
+ period_discriminator_params:
25
+ bias: true
26
+ channels: 32
27
+ downsample_scales:
28
+ - 3
29
+ - 3
30
+ - 3
31
+ - 3
32
+ - 1
33
+ in_channels: 1
34
+ kernel_sizes:
35
+ - 5
36
+ - 3
37
+ max_downsample_channels: 1024
38
+ nonlinear_activation: LeakyReLU
39
+ nonlinear_activation_params:
40
+ negative_slope: 0.1
41
+ out_channels: 1
42
+ use_spectral_norm: false
43
+ use_weight_norm: true
44
+ periods:
45
+ - 2
46
+ - 3
47
+ - 5
48
+ - 7
49
+ - 11
50
+ scale_discriminator_params:
51
+ bias: true
52
+ channels: 128
53
+ downsample_scales:
54
+ - 4
55
+ - 4
56
+ - 4
57
+ - 4
58
+ - 1
59
+ in_channels: 1
60
+ kernel_sizes:
61
+ - 15
62
+ - 41
63
+ - 5
64
+ - 3
65
+ max_downsample_channels: 1024
66
+ max_groups: 16
67
+ nonlinear_activation: LeakyReLU
68
+ nonlinear_activation_params:
69
+ negative_slope: 0.1
70
+ out_channels: 1
71
+ scale_downsample_pooling: AvgPool1d
72
+ scale_downsample_pooling_params:
73
+ kernel_size: 4
74
+ padding: 2
75
+ stride: 2
76
+ scales: 3
77
+ discriminator_scheduler_params:
78
+ gamma: 0.5
79
+ milestones:
80
+ - 200000
81
+ - 400000
82
+ - 600000
83
+ - 800000
84
+ discriminator_scheduler_type: MultiStepLR
85
+ discriminator_train_start_steps: 0
86
+ eval_interval_steps: 1000
87
+ exp_root: exp
88
+ feat_match_loss_params:
89
+ average_by_discriminators: false
90
+ average_by_layers: false
91
+ include_final_outputs: false
92
+ generator_adv_loss_params:
93
+ average_by_discriminators: false
94
+ generator_grad_norm: -1
95
+ generator_optimizer_params:
96
+ betas:
97
+ - 0.5
98
+ - 0.9
99
+ lr: 0.0002
100
+ weight_decay: 0.0
101
+ generator_optimizer_type: Adam
102
+ generator_params:
103
+ bias: true
104
+ channels: 512
105
+ groups: 3
106
+ in_channels: 64
107
+ kernel_size: 7
108
+ nonlinear_activation: LeakyReLU
109
+ nonlinear_activation_params:
110
+ negative_slope: 0.1
111
+ out_channels: 1
112
+ resblock_dilations:
113
+ - - 1
114
+ - 3
115
+ - 5
116
+ resblock_kernel_sizes:
117
+ - 3
118
+ stats: stats/symAD_vctk_48000_hop300_clean.npy
119
+ upsample_kernel_sizes:
120
+ - 10
121
+ - 10
122
+ - 8
123
+ - 6
124
+ upsample_scales:
125
+ - 5
126
+ - 5
127
+ - 4
128
+ - 3
129
+ use_additional_convs: true
130
+ use_weight_norm: true
131
+ generator_scheduler_params:
132
+ gamma: 0.5
133
+ milestones:
134
+ - 200000
135
+ - 400000
136
+ - 600000
137
+ - 800000
138
+ generator_scheduler_type: MultiStepLR
139
+ generator_train_start_steps: 1
140
+ lambda_adv: 1.0
141
+ lambda_feat_match: 2.0
142
+ lambda_mel_loss: 45.0
143
+ lambda_shape_loss: 45.0
144
+ lambda_stft_loss: 45.0
145
+ log_interval_steps: 100
146
+ mel_loss_params:
147
+ fft_sizes:
148
+ - 2048
149
+ fmax: 24000
150
+ fmin: 0
151
+ fs: 48000
152
+ hop_sizes:
153
+ - 300
154
+ log_base: null
155
+ num_mels: 80
156
+ win_lengths:
157
+ - 2048
158
+ window: hann_window
159
+ model_type: HiFiGAN
160
+ num_workers: 2
161
+ outdir: exp/vocoder/AudioDec_v2_symAD_vctk_48000_hop300_clean
162
+ pin_memory: true
163
+ resume: ''
164
+ sampling_rate: 48000
165
+ save_interval_steps: 100000
166
+ seed: 1337
167
+ shape_loss_params:
168
+ winlen:
169
+ - 300
170
+ stft_loss_params:
171
+ fft_sizes:
172
+ - 1024
173
+ - 2048
174
+ - 512
175
+ hop_sizes:
176
+ - 120
177
+ - 240
178
+ - 50
179
+ win_lengths:
180
+ - 600
181
+ - 1200
182
+ - 240
183
+ window: hann_window
184
+ tag: vocoder/AudioDec_v2_symAD_vctk_48000_hop300_clean
185
+ train_max_steps: 500000
186
+ train_mode: vocoder
187
+ use_feat_match_loss: true
188
+ use_mel_loss: true
189
+ use_shape_loss: false
190
+ use_stft_loss: false
vocoder/AudioDec_v3_symADuniv_vctk_48000_hop300_clean/checkpoint-500000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ae8e353621a10bbd130d81e4b85131eea25228c83ea9d21629af981f6a572b8
3
+ size 78587449
vocoder/AudioDec_v3_symADuniv_vctk_48000_hop300_clean/config.yml ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ analyzer: exp/autoencoder/symADuniv_vctk_48000_hop300/checkpoint-500000steps.pkl
2
+ batch_length: 9600
3
+ batch_size: 16
4
+ config: config/vocoder/AudioDec_v3_symADuniv_vctk_48000_hop300_clean.yaml
5
+ data:
6
+ path: /mnt/home/yichiaowu/datasets/vctk_noisy/48000
7
+ subset:
8
+ test: clean_testset_wav
9
+ train: clean_trainset_84spk_wav
10
+ valid: clean_validset_84spk_wav
11
+ disable_cudnn: 'False'
12
+ discriminator_adv_loss_params:
13
+ average_by_discriminators: false
14
+ discriminator_grad_norm: -1
15
+ discriminator_optimizer_params:
16
+ betas:
17
+ - 0.5
18
+ - 0.9
19
+ lr: 0.0002
20
+ weight_decay: 0.0
21
+ discriminator_optimizer_type: Adam
22
+ discriminator_params:
23
+ fft_sizes:
24
+ - 1024
25
+ - 2048
26
+ - 512
27
+ hop_sizes:
28
+ - 120
29
+ - 240
30
+ - 50
31
+ period_discriminator_params:
32
+ bias: true
33
+ channels: 32
34
+ downsample_scales:
35
+ - 3
36
+ - 3
37
+ - 3
38
+ - 3
39
+ - 1
40
+ in_channels: 1
41
+ kernel_sizes:
42
+ - 5
43
+ - 3
44
+ max_downsample_channels: 1024
45
+ nonlinear_activation: LeakyReLU
46
+ nonlinear_activation_params:
47
+ negative_slope: 0.1
48
+ out_channels: 1
49
+ use_spectral_norm: false
50
+ use_weight_norm: true
51
+ periods:
52
+ - 2
53
+ - 3
54
+ - 5
55
+ - 7
56
+ - 11
57
+ spectral_discriminator_params:
58
+ bias: true
59
+ channels: 32
60
+ kernel_sizes:
61
+ - - 3
62
+ - 9
63
+ - - 3
64
+ - 9
65
+ - - 3
66
+ - 9
67
+ - - 3
68
+ - 9
69
+ - - 3
70
+ - 3
71
+ - - 3
72
+ - 3
73
+ nonlinear_activation: LeakyReLU
74
+ nonlinear_activation_params:
75
+ negative_slope: 0.2
76
+ strides:
77
+ - - 1
78
+ - 1
79
+ - - 1
80
+ - 2
81
+ - - 1
82
+ - 2
83
+ - - 1
84
+ - 2
85
+ - - 1
86
+ - 1
87
+ - - 1
88
+ - 1
89
+ win_lengths:
90
+ - 600
91
+ - 1200
92
+ - 240
93
+ window: hann_window
94
+ discriminator_scheduler_params:
95
+ gamma: 0.5
96
+ milestones:
97
+ - 200000
98
+ - 400000
99
+ - 600000
100
+ - 800000
101
+ discriminator_scheduler_type: MultiStepLR
102
+ discriminator_train_start_steps: 0
103
+ eval_interval_steps: 1000
104
+ exp_root: exp
105
+ feat_match_loss_params:
106
+ average_by_discriminators: false
107
+ average_by_layers: false
108
+ include_final_outputs: false
109
+ generator_adv_loss_params:
110
+ average_by_discriminators: false
111
+ generator_grad_norm: -1
112
+ generator_optimizer_params:
113
+ betas:
114
+ - 0.5
115
+ - 0.9
116
+ lr: 0.0002
117
+ weight_decay: 0.0
118
+ generator_optimizer_type: Adam
119
+ generator_params:
120
+ bias: true
121
+ channels: 512
122
+ groups: 3
123
+ in_channels: 64
124
+ kernel_size: 7
125
+ nonlinear_activation: LeakyReLU
126
+ nonlinear_activation_params:
127
+ negative_slope: 0.1
128
+ out_channels: 1
129
+ resblock_dilations:
130
+ - - 1
131
+ - 3
132
+ - 5
133
+ resblock_kernel_sizes:
134
+ - 11
135
+ stats: stats/symADuniv_vctk_48000_hop300_clean.npy
136
+ upsample_kernel_sizes:
137
+ - 10
138
+ - 10
139
+ - 8
140
+ - 6
141
+ upsample_scales:
142
+ - 5
143
+ - 5
144
+ - 4
145
+ - 3
146
+ use_additional_convs: true
147
+ use_weight_norm: true
148
+ generator_scheduler_params:
149
+ gamma: 0.5
150
+ milestones:
151
+ - 200000
152
+ - 400000
153
+ - 600000
154
+ - 800000
155
+ generator_scheduler_type: MultiStepLR
156
+ generator_train_start_steps: 1
157
+ lambda_adv: 1.0
158
+ lambda_feat_match: 2.0
159
+ lambda_mel_loss: 45.0
160
+ lambda_shape_loss: 45.0
161
+ lambda_stft_loss: 45.0
162
+ log_interval_steps: 100
163
+ mel_loss_params:
164
+ fft_sizes:
165
+ - 2048
166
+ fmax: 24000
167
+ fmin: 0
168
+ fs: 48000
169
+ hop_sizes:
170
+ - 300
171
+ log_base: null
172
+ num_mels: 80
173
+ win_lengths:
174
+ - 2048
175
+ window: hann_window
176
+ model_type: UnivNet
177
+ num_workers: 2
178
+ outdir: exp/vocoder/AudioDec_v3_symADuniv_vctk_48000_hop300_clean
179
+ pin_memory: true
180
+ resume: ''
181
+ sampling_rate: 48000
182
+ save_interval_steps: 100000
183
+ seed: 1337
184
+ shape_loss_params:
185
+ winlen:
186
+ - 300
187
+ stft_loss_params:
188
+ fft_sizes:
189
+ - 1024
190
+ - 2048
191
+ - 512
192
+ hop_sizes:
193
+ - 120
194
+ - 240
195
+ - 50
196
+ win_lengths:
197
+ - 600
198
+ - 1200
199
+ - 240
200
+ window: hann_window
201
+ tag: vocoder/AudioDec_v3_symADuniv_vctk_48000_hop300_clean
202
+ train_max_steps: 500000
203
+ train_mode: vocoder
204
+ use_feat_match_loss: true
205
+ use_mel_loss: true
206
+ use_shape_loss: false
207
+ use_stft_loss: false