ftshijt committed
Commit ceb5148
1 Parent(s): b75c8bc

Update model

Files changed (29)
  1. README.md +347 -3
  2. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/200epoch.pth +3 -0
  3. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/config.yaml +272 -0
  4. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/adv_loss.png +0 -0
  5. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/codec_commit_loss.png +0 -0
  6. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/codec_loss.png +0 -0
  7. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/codec_quantization_loss.png +0 -0
  8. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/discriminator_backward_time.png +0 -0
  9. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/discriminator_forward_time.png +0 -0
  10. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/discriminator_loss.png +0 -0
  11. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/discriminator_optim_step_time.png +0 -0
  12. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/discriminator_train_time.png +0 -0
  13. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/fake_loss.png +0 -0
  14. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/feat_match_loss.png +0 -0
  15. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/generator_backward_time.png +0 -0
  16. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/generator_forward_time.png +0 -0
  17. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/generator_optim_step_time.png +0 -0
  18. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/generator_train_time.png +0 -0
  19. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/gpu_max_cached_mem_GB.png +0 -0
  20. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/iter_time.png +0 -0
  21. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/loss.png +0 -0
  22. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/mel_loss.png +0 -0
  23. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/mel_loss_real.png +0 -0
  24. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/optim0_lr0.png +0 -0
  25. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/optim1_lr0.png +0 -0
  26. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/real_loss.png +0 -0
  27. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/reconstruct_loss.png +0 -0
  28. exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/train_time.png +0 -0
  29. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,347 @@
- ---
- license: apache-2.0
- ---
+ ---
+ tags:
+ - espnet
+ - audio
+ - codec
+ language: multilingual
+ datasets:
+ - amuse
+ license: cc-by-4.0
+ ---
+
+ ## ESPnet2 Codec model
+
+ ### `espnet/owsmdata_soundstream_16k_200epoch`
+
+ This model was trained by ftshijt using the amuse recipe in [espnet](https://github.com/espnet/espnet/).
+
+ ### Demo: How to use in ESPnet2
+
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
+ if you haven't done that already.
+
+ ```bash
+ cd espnet
+ git checkout 734f1235b3dd3c444822b6337fbb2e417e75e321
+ pip install -e .
+ cd egs2/amuse/codec_speechlm
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/owsmdata_soundstream_16k_200epoch
+ ```
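Outside of the recipe, the packed model can also be fetched programmatically. Below is a minimal sketch using `espnet_model_zoo`; the downloader call is standard, but the exact keys it returns for a codec model are not guaranteed here, so the sketch simply prints whatever files it unpacks.

```python
# Hedged sketch: fetch the packed model from the Hugging Face Hub via the
# espnet_model_zoo downloader and list the unpacked files. Key names such as
# "model_file" / "train_config" mirror meta.yaml but may differ by model type,
# so nothing is assumed beyond the returned dict.
from espnet_model_zoo.downloader import ModelDownloader

d = ModelDownloader()
files = d.download_and_unpack("espnet/owsmdata_soundstream_16k_200epoch")
for name, path in files.items():
    print(f"{name}: {path}")
```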
+
+
+
+ ## Codec config
+
+ <details><summary>expand</summary>
+
+ ```
+ config: conf/train_soundstream4_large_v1.1.yaml
+ print_config: false
+ log_level: INFO
+ drop_last_iter: false
+ dry_run: false
+ iterator_type: chunk
+ valid_iterator_type: null
+ output_dir: exp/codec_train_soundstream4_large_v1.1_raw_fs16000
+ ngpu: 1
+ seed: 777
+ num_workers: 1
+ num_att_plot: 0
+ dist_backend: nccl
+ dist_init_method: env://
+ dist_world_size: 4
+ dist_rank: 0
+ local_rank: 0
+ dist_master_addr: localhost
+ dist_master_port: 49939
+ dist_launcher: null
+ multiprocessing_distributed: true
+ unused_parameters: true
+ sharded_ddp: false
+ cudnn_enabled: true
+ cudnn_benchmark: false
+ cudnn_deterministic: false
+ use_tf32: true
+ collect_stats: false
+ write_collected_feats: false
+ max_epoch: 360
+ patience: null
+ val_scheduler_criterion:
+ - valid
+ - loss
+ early_stopping_criterion:
+ - valid
+ - loss
+ - min
+ best_model_criterion:
+ - - valid
+ - mel_loss
+ - min
+ - - train
+ - mel_loss
+ - min
+ - - train
+ - total_count
+ - max
+ keep_nbest_models: 5
+ nbest_averaging_interval: 0
+ grad_clip: -1
+ grad_clip_type: 2.0
+ grad_noise: false
+ accum_grad: 1
+ no_forward_run: false
+ resume: true
+ train_dtype: float32
+ use_amp: false
+ log_interval: 50
+ use_matplotlib: true
+ use_tensorboard: true
+ create_graph_in_tensorboard: false
+ use_wandb: false
+ wandb_project: null
+ wandb_id: null
+ wandb_entity: null
+ wandb_name: null
+ wandb_model_log_interval: -1
+ detect_anomaly: false
+ use_adapter: false
+ adapter: lora
+ save_strategy: all
+ adapter_conf: {}
+ pretrain_path: null
+ init_param: []
+ ignore_init_mismatch: false
+ freeze_param: []
+ num_iters_per_epoch: 5000
+ batch_size: 128
+ valid_batch_size: null
+ batch_bins: 1000000
+ valid_batch_bins: null
+ train_shape_file:
+ - exp/codec_stats_raw/train/audio_shape
+ valid_shape_file:
+ - exp/codec_stats_raw/valid/audio_shape
+ batch_type: unsorted
+ valid_batch_type: null
+ fold_length:
+ - 256000
+ sort_in_batch: descending
+ shuffle_within_batch: false
+ sort_batch: descending
+ multiple_iterator: false
+ truncate_audio: false
+ chunk_length: 32000
+ chunk_shift_ratio: 0.5
+ num_cache_chunks: 128
+ chunk_excluded_key_prefixes: []
+ chunk_default_fs: null
+ train_data_path_and_name_and_type:
+ - - dump/raw/train/wav.scp
+ - audio
+ - kaldi_ark
+ valid_data_path_and_name_and_type:
+ - - dump/raw/dev-small/wav.scp
+ - audio
+ - kaldi_ark
+ multi_task_dataset: false
+ allow_variable_data_keys: false
+ max_cache_size: 0.0
+ max_cache_fd: 32
+ allow_multi_rates: false
+ valid_max_cache_size: null
+ exclude_weight_decay: false
+ exclude_weight_decay_conf: {}
+ optim: adamw
+ optim_conf:
+ lr: 0.0002
+ betas:
+ - 0.5
+ - 0.9
+ eps: 1.0e-09
+ weight_decay: 0.0
+ scheduler: exponentiallr
+ scheduler_conf:
+ gamma: 0.999875
+ optim2: adamw
+ optim2_conf:
+ lr: 0.0002
+ betas:
+ - 0.5
+ - 0.9
+ eps: 1.0e-09
+ weight_decay: 0.0
+ scheduler2: exponentiallr
+ scheduler2_conf:
+ gamma: 0.999875
+ generator_first: true
+ skip_discriminator_prob: 0.0
+ model_conf: {}
+ use_preprocessor: true
+ codec: soundstream
+ codec_conf:
+ sampling_rate: 16000
+ generator_params:
+ hidden_dim: 512
+ encdec_channels: 1
+ encdec_n_filters: 32
+ encdec_n_residual_layers: 3
+ encdec_ratios:
+ - 8
+ - 5
+ - 4
+ - 2
+ encdec_activation: ELU
+ encdec_activation_params:
+ alpha: 1.0
+ encdec_norm: weight_norm
+ encdec_kernel_size: 7
+ encdec_residual_kernel_size: 7
+ encdec_last_kernel_size: 7
+ encdec_dilation_base: 2
+ encdec_causal: false
+ encdec_pad_mode: reflect
+ encdec_true_skip: false
+ encdec_compress: 2
+ encdec_lstm: 2
+ decoder_trim_right_ratio: 1.0
+ decoder_final_activation: null
+ decoder_final_activation_params: null
+ quantizer_n_q: 32
+ quantizer_bins: 1024
+ quantizer_decay: 0.99
+ quantizer_kmeans_init: true
+ quantizer_kmeans_iters: 50
+ quantizer_threshold_ema_dead_code: 2
+ quantizer_target_bandwidth:
+ - 2
+ - 4
+ - 8
+ - 16
+ - 32
+ sample_rate: 16000
+ discriminator_params:
+ scales: 3
+ scale_downsample_pooling: AvgPool1d
+ scale_downsample_pooling_params:
+ kernel_size: 4
+ stride: 2
+ padding: 2
+ scale_discriminator_params:
+ in_channels: 1
+ out_channels: 1
+ kernel_sizes:
+ - 15
+ - 41
+ - 5
+ - 3
+ channels: 128
+ max_downsample_channels: 1024
+ max_groups: 16
+ bias: true
+ downsample_scales:
+ - 2
+ - 2
+ - 4
+ - 4
+ - 1
+ nonlinear_activation: LeakyReLU
+ nonlinear_activation_params:
+ negative_slope: 0.1
+ scale_follow_official_norm: false
+ complexstft_discriminator_params:
+ in_channels: 1
+ channels: 32
+ strides:
+ - - 1
+ - 2
+ - - 2
+ - 2
+ - - 1
+ - 2
+ - - 2
+ - 2
+ - - 1
+ - 2
+ - - 2
+ - 2
+ chan_mults:
+ - 1
+ - 2
+ - 4
+ - 4
+ - 8
+ - 8
+ n_fft: 1024
+ hop_length: 256
+ win_length: 1024
+ stft_normalized: false
+ generator_adv_loss_params:
+ average_by_discriminators: false
+ loss_type: mse
+ discriminator_adv_loss_params:
+ average_by_discriminators: false
+ loss_type: mse
+ use_feat_match_loss: true
+ feat_match_loss_params:
+ average_by_discriminators: false
+ average_by_layers: false
+ include_final_outputs: true
+ use_mel_loss: true
+ mel_loss_params:
+ range_start: 6
+ range_end: 11
+ window: hann
+ n_mels: 80
+ fmin: 0
+ fmax: null
+ log_base: null
+ fs: 16000
+ lambda_quantization: 1.0
+ lambda_commit: 1.0
+ lambda_reconstruct: 1.0
+ lambda_adv: 1.0
+ lambda_mel: 45.0
+ lambda_feat_match: 2.0
+ cache_generator_outputs: true
+ required:
+ - output_dir
+ version: '202402'
+ distributed: true
+ ```
+
+ </details>
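A few numbers follow directly from the config above. The sketch below is a rough calculation, assuming the frame hop equals the product of `encdec_ratios` (the usual construction for SoundStream-style encoders); it reads the exported `config.yaml` with PyYAML and prints the implied token rate and the bitrate when all quantizers are active.

```python
# Hedged sketch: derive token rate and maximum bitrate from the training config.
# Assumption: the effective frame hop equals the product of the encoder strides.
import math
import yaml

with open("exp/codec_train_soundstream4_large_v1.1_raw_fs16000/config.yaml") as f:
    cfg = yaml.safe_load(f)

gen = cfg["codec_conf"]["generator_params"]
fs = cfg["codec_conf"]["sampling_rate"]           # 16000 Hz
hop = math.prod(gen["encdec_ratios"])             # 8 * 5 * 4 * 2 = 320 samples
frame_rate = fs / hop                             # 50 frames per second
bits_per_code = math.log2(gen["quantizer_bins"])  # 10 bits for 1024-entry codebooks
max_kbps = frame_rate * bits_per_code * gen["quantizer_n_q"] / 1000

print(f"frame rate: {frame_rate:.0f} Hz")
print(f"bitrate with all {gen['quantizer_n_q']} quantizers: {max_kbps:.1f} kbps")
```

Under that assumption the codec produces 50 token frames per second, about 0.5 kbps per codebook, and roughly 16 kbps when all 32 quantizers are used.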
313
+
314
+
315
+
316
+ ### Citing ESPnet
317
+
318
+ ```BibTex
319
+ @inproceedings{watanabe2018espnet,
320
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
321
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
322
+ year={2018},
323
+ booktitle={Proceedings of Interspeech},
324
+ pages={2207--2211},
325
+ doi={10.21437/Interspeech.2018-1456},
326
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
327
+ }
328
+
329
+
330
+
331
+
332
+
333
+
334
+ ```
335
+
336
+ or arXiv:
337
+
338
+ ```bibtex
339
+ @misc{watanabe2018espnet,
340
+ title={ESPnet: End-to-End Speech Processing Toolkit},
341
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
342
+ year={2018},
343
+ eprint={1804.00015},
344
+ archivePrefix={arXiv},
345
+ primaryClass={cs.CL}
346
+ }
347
+ ```
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/200epoch.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:69ea3d81d0672b959d5cd502eb4022f1dab1825f3f8826758fc6245bcd091c2f
+ size 354547787
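The checkpoint itself lives in Git LFS, so the three lines above are only the pointer. Below is a small sketch, assuming the file has already been pulled locally, that verifies it against the recorded SHA-256 and loads it onto CPU for a quick look; the internal layout of the state dict is not assumed.

```python
# Hedged sketch: check the pulled 200epoch.pth against the LFS pointer's sha256,
# then load it on CPU and report its top-level type and size.
import hashlib
import torch

path = "exp/codec_train_soundstream4_large_v1.1_raw_fs16000/200epoch.pth"
expected = "69ea3d81d0672b959d5cd502eb4022f1dab1825f3f8826758fc6245bcd091c2f"

digest = hashlib.sha256()
with open(path, "rb") as f:
    for block in iter(lambda: f.read(1 << 20), b""):
        digest.update(block)
assert digest.hexdigest() == expected, "checkpoint does not match the LFS pointer"

state = torch.load(path, map_location="cpu")
print(type(state), len(state) if hasattr(state, "__len__") else "")
```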
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/config.yaml ADDED
@@ -0,0 +1,272 @@
+ config: conf/train_soundstream4_large_v1.1.yaml
+ print_config: false
+ log_level: INFO
+ drop_last_iter: false
+ dry_run: false
+ iterator_type: chunk
+ valid_iterator_type: null
+ output_dir: exp/codec_train_soundstream4_large_v1.1_raw_fs16000
+ ngpu: 1
+ seed: 777
+ num_workers: 1
+ num_att_plot: 0
+ dist_backend: nccl
+ dist_init_method: env://
+ dist_world_size: 4
+ dist_rank: 0
+ local_rank: 0
+ dist_master_addr: localhost
+ dist_master_port: 49939
+ dist_launcher: null
+ multiprocessing_distributed: true
+ unused_parameters: true
+ sharded_ddp: false
+ cudnn_enabled: true
+ cudnn_benchmark: false
+ cudnn_deterministic: false
+ use_tf32: true
+ collect_stats: false
+ write_collected_feats: false
+ max_epoch: 360
+ patience: null
+ val_scheduler_criterion:
+ - valid
+ - loss
+ early_stopping_criterion:
+ - valid
+ - loss
+ - min
+ best_model_criterion:
+ - - valid
+ - mel_loss
+ - min
+ - - train
+ - mel_loss
+ - min
+ - - train
+ - total_count
+ - max
+ keep_nbest_models: 5
+ nbest_averaging_interval: 0
+ grad_clip: -1
+ grad_clip_type: 2.0
+ grad_noise: false
+ accum_grad: 1
+ no_forward_run: false
+ resume: true
+ train_dtype: float32
+ use_amp: false
+ log_interval: 50
+ use_matplotlib: true
+ use_tensorboard: true
+ create_graph_in_tensorboard: false
+ use_wandb: false
+ wandb_project: null
+ wandb_id: null
+ wandb_entity: null
+ wandb_name: null
+ wandb_model_log_interval: -1
+ detect_anomaly: false
+ use_adapter: false
+ adapter: lora
+ save_strategy: all
+ adapter_conf: {}
+ pretrain_path: null
+ init_param: []
+ ignore_init_mismatch: false
+ freeze_param: []
+ num_iters_per_epoch: 5000
+ batch_size: 128
+ valid_batch_size: null
+ batch_bins: 1000000
+ valid_batch_bins: null
+ train_shape_file:
+ - exp/codec_stats_raw/train/audio_shape
+ valid_shape_file:
+ - exp/codec_stats_raw/valid/audio_shape
+ batch_type: unsorted
+ valid_batch_type: null
+ fold_length:
+ - 256000
+ sort_in_batch: descending
+ shuffle_within_batch: false
+ sort_batch: descending
+ multiple_iterator: false
+ truncate_audio: false
+ chunk_length: 32000
+ chunk_shift_ratio: 0.5
+ num_cache_chunks: 128
+ chunk_excluded_key_prefixes: []
+ chunk_default_fs: null
+ train_data_path_and_name_and_type:
+ - - dump/raw/train/wav.scp
+ - audio
+ - kaldi_ark
+ valid_data_path_and_name_and_type:
+ - - dump/raw/dev-small/wav.scp
+ - audio
+ - kaldi_ark
+ multi_task_dataset: false
+ allow_variable_data_keys: false
+ max_cache_size: 0.0
+ max_cache_fd: 32
+ allow_multi_rates: false
+ valid_max_cache_size: null
+ exclude_weight_decay: false
+ exclude_weight_decay_conf: {}
+ optim: adamw
+ optim_conf:
+ lr: 0.0002
+ betas:
+ - 0.5
+ - 0.9
+ eps: 1.0e-09
+ weight_decay: 0.0
+ scheduler: exponentiallr
+ scheduler_conf:
+ gamma: 0.999875
+ optim2: adamw
+ optim2_conf:
+ lr: 0.0002
+ betas:
+ - 0.5
+ - 0.9
+ eps: 1.0e-09
+ weight_decay: 0.0
+ scheduler2: exponentiallr
+ scheduler2_conf:
+ gamma: 0.999875
+ generator_first: true
+ skip_discriminator_prob: 0.0
+ model_conf: {}
+ use_preprocessor: true
+ codec: soundstream
+ codec_conf:
+ sampling_rate: 16000
+ generator_params:
+ hidden_dim: 512
+ encdec_channels: 1
+ encdec_n_filters: 32
+ encdec_n_residual_layers: 3
+ encdec_ratios:
+ - 8
+ - 5
+ - 4
+ - 2
+ encdec_activation: ELU
+ encdec_activation_params:
+ alpha: 1.0
+ encdec_norm: weight_norm
+ encdec_kernel_size: 7
+ encdec_residual_kernel_size: 7
+ encdec_last_kernel_size: 7
+ encdec_dilation_base: 2
+ encdec_causal: false
+ encdec_pad_mode: reflect
+ encdec_true_skip: false
+ encdec_compress: 2
+ encdec_lstm: 2
+ decoder_trim_right_ratio: 1.0
+ decoder_final_activation: null
+ decoder_final_activation_params: null
+ quantizer_n_q: 32
+ quantizer_bins: 1024
+ quantizer_decay: 0.99
+ quantizer_kmeans_init: true
+ quantizer_kmeans_iters: 50
+ quantizer_threshold_ema_dead_code: 2
+ quantizer_target_bandwidth:
+ - 2
+ - 4
+ - 8
+ - 16
+ - 32
+ sample_rate: 16000
+ discriminator_params:
+ scales: 3
+ scale_downsample_pooling: AvgPool1d
+ scale_downsample_pooling_params:
+ kernel_size: 4
+ stride: 2
+ padding: 2
+ scale_discriminator_params:
+ in_channels: 1
+ out_channels: 1
+ kernel_sizes:
+ - 15
+ - 41
+ - 5
+ - 3
+ channels: 128
+ max_downsample_channels: 1024
+ max_groups: 16
+ bias: true
+ downsample_scales:
+ - 2
+ - 2
+ - 4
+ - 4
+ - 1
+ nonlinear_activation: LeakyReLU
+ nonlinear_activation_params:
+ negative_slope: 0.1
+ scale_follow_official_norm: false
+ complexstft_discriminator_params:
+ in_channels: 1
+ channels: 32
+ strides:
+ - - 1
+ - 2
+ - - 2
+ - 2
+ - - 1
+ - 2
+ - - 2
+ - 2
+ - - 1
+ - 2
+ - - 2
+ - 2
+ chan_mults:
+ - 1
+ - 2
+ - 4
+ - 4
+ - 8
+ - 8
+ n_fft: 1024
+ hop_length: 256
+ win_length: 1024
+ stft_normalized: false
+ generator_adv_loss_params:
+ average_by_discriminators: false
+ loss_type: mse
+ discriminator_adv_loss_params:
+ average_by_discriminators: false
+ loss_type: mse
+ use_feat_match_loss: true
+ feat_match_loss_params:
+ average_by_discriminators: false
+ average_by_layers: false
+ include_final_outputs: true
+ use_mel_loss: true
+ mel_loss_params:
+ range_start: 6
+ range_end: 11
+ window: hann
+ n_mels: 80
+ fmin: 0
+ fmax: null
+ log_base: null
+ fs: 16000
+ lambda_quantization: 1.0
+ lambda_commit: 1.0
+ lambda_reconstruct: 1.0
+ lambda_adv: 1.0
+ lambda_mel: 45.0
+ lambda_feat_match: 2.0
+ cache_generator_outputs: true
+ required:
+ - output_dir
+ version: '202402'
+ distributed: true
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/adv_loss.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/codec_commit_loss.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/codec_loss.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/codec_quantization_loss.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/discriminator_backward_time.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/discriminator_forward_time.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/discriminator_loss.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/discriminator_optim_step_time.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/discriminator_train_time.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/fake_loss.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/feat_match_loss.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/generator_backward_time.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/generator_forward_time.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/generator_optim_step_time.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/generator_train_time.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/gpu_max_cached_mem_GB.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/iter_time.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/loss.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/mel_loss.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/mel_loss_real.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/optim0_lr0.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/optim1_lr0.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/real_loss.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/reconstruct_loss.png ADDED
exp/codec_train_soundstream4_large_v1.1_raw_fs16000/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
+ espnet: '202402'
+ files:
+   model_file: exp/codec_train_soundstream4_large_v1.1_raw_fs16000/200epoch.pth
+ python: "3.9.19 (main, Mar 21 2024, 17:11:28) \n[GCC 11.2.0]"
+ timestamp: 1723443274.882881
+ torch: 2.1.0
+ yaml_files:
+   train_config: exp/codec_train_soundstream4_large_v1.1_raw_fs16000/config.yaml
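meta.yaml ties the package together by pointing at the checkpoint and the training config. A minimal sketch for resolving those paths with plain PyYAML (no ESPnet tooling assumed):

```python
# Hedged sketch: read meta.yaml and print the files it references.
import yaml

with open("meta.yaml") as f:
    meta = yaml.safe_load(f)

print(meta["files"]["model_file"])         # the packed checkpoint
print(meta["yaml_files"]["train_config"])  # the training configuration
print(meta["espnet"], meta["torch"])       # toolkit and torch versions at export
```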