ftshijt committed on
Commit
383be74
1 Parent(s): 1224e4a

Update model

Browse files
Files changed (29) hide show
  1. README.md +351 -0
  2. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/75epoch.pth +3 -0
  3. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/config.yaml +276 -0
  4. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/adv_loss.png +0 -0
  5. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/codec_commit_loss.png +0 -0
  6. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/codec_loss.png +0 -0
  7. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/codec_quantization_loss.png +0 -0
  8. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/discriminator_backward_time.png +0 -0
  9. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/discriminator_forward_time.png +0 -0
  10. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/discriminator_loss.png +0 -0
  11. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/discriminator_optim_step_time.png +0 -0
  12. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/discriminator_train_time.png +0 -0
  13. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/fake_loss.png +0 -0
  14. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/feat_match_loss.png +0 -0
  15. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/generator_backward_time.png +0 -0
  16. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/generator_forward_time.png +0 -0
  17. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/generator_optim_step_time.png +0 -0
  18. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/generator_train_time.png +0 -0
  19. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/gpu_max_cached_mem_GB.png +0 -0
  20. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/iter_time.png +0 -0
  21. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/loss.png +0 -0
  22. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/mel_loss.png +0 -0
  23. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/mel_loss_real.png +0 -0
  24. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/optim0_lr0.png +0 -0
  25. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/optim1_lr0.png +0 -0
  26. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/real_loss.png +0 -0
  27. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/reconstruct_loss.png +0 -0
  28. exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/train_time.png +0 -0
  29. meta.yaml +8 -0
README.md ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - codec
6
+ language: multilingual
7
+ datasets:
8
+ - amuse
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 Codec model
13
+
14
+ ### `ftshijt/espnet_codec_soundstream_large_v1.8`
15
+
16
+ This model was trained by ftshijt using the amuse recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 9baec3a7b10b784cb721849e19caed19e8ac45bc
26
+ pip install -e .
27
+ cd egs2/amuse/codec1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model ftshijt/espnet_codec_soundstream_large_v1.8
29
+ ```
30
+
31
+
32
+
33
+ ## Codec config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/train_soundstream4_large_v1.8.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ drop_last_iter: false
42
+ dry_run: false
43
+ iterator_type: category_chunk
44
+ valid_iterator_type: null
45
+ output_dir: exp/codec_train_soundstream4_large_v1.8_raw_fs16000
46
+ ngpu: 1
47
+ seed: 777
48
+ num_workers: 1
49
+ num_att_plot: 0
50
+ dist_backend: nccl
51
+ dist_init_method: env://
52
+ dist_world_size: 2
53
+ dist_rank: 0
54
+ local_rank: 0
55
+ dist_master_addr: localhost
56
+ dist_master_port: 59983
57
+ dist_launcher: null
58
+ multiprocessing_distributed: true
59
+ unused_parameters: true
60
+ sharded_ddp: false
61
+ use_deepspeed: false
62
+ deepspeed_config: null
63
+ cudnn_enabled: true
64
+ cudnn_benchmark: false
65
+ cudnn_deterministic: false
66
+ use_tf32: true
67
+ collect_stats: false
68
+ write_collected_feats: false
69
+ max_epoch: 360
70
+ patience: null
71
+ val_scheduler_criterion:
72
+ - valid
73
+ - loss
74
+ early_stopping_criterion:
75
+ - valid
76
+ - loss
77
+ - min
78
+ best_model_criterion:
79
+ - - valid
80
+ - mel_loss
81
+ - min
82
+ - - train
83
+ - mel_loss
84
+ - min
85
+ - - train
86
+ - total_count
87
+ - max
88
+ keep_nbest_models: 5
89
+ nbest_averaging_interval: 0
90
+ grad_clip: -1
91
+ grad_clip_type: 2.0
92
+ grad_noise: false
93
+ accum_grad: 1
94
+ no_forward_run: false
95
+ resume: true
96
+ train_dtype: float32
97
+ use_amp: false
98
+ log_interval: 50
99
+ use_matplotlib: true
100
+ use_tensorboard: true
101
+ create_graph_in_tensorboard: false
102
+ use_wandb: false
103
+ wandb_project: null
104
+ wandb_id: null
105
+ wandb_entity: null
106
+ wandb_name: null
107
+ wandb_model_log_interval: -1
108
+ detect_anomaly: false
109
+ use_adapter: false
110
+ adapter: lora
111
+ save_strategy: all
112
+ adapter_conf: {}
113
+ pretrain_path: null
114
+ init_param: []
115
+ ignore_init_mismatch: false
116
+ freeze_param: []
117
+ num_iters_per_epoch: 5000
118
+ batch_size: 64
119
+ valid_batch_size: null
120
+ batch_bins: 1000000
121
+ valid_batch_bins: null
122
+ category_sample_size: 3
123
+ train_shape_file:
124
+ - exp/codec_stats_raw/train/audio_shape
125
+ valid_shape_file:
126
+ - exp/codec_stats_raw/valid/audio_shape
127
+ batch_type: unsorted
128
+ valid_batch_type: null
129
+ fold_length:
130
+ - 256000
131
+ sort_in_batch: descending
132
+ shuffle_within_batch: false
133
+ sort_batch: descending
134
+ multiple_iterator: false
135
+ chunk_length: 32000
136
+ chunk_shift_ratio: 0.5
137
+ num_cache_chunks: 640
138
+ chunk_excluded_key_prefixes: []
139
+ chunk_default_fs: null
140
+ chunk_max_abs_length: null
141
+ chunk_discard_short_samples: true
142
+ train_data_path_and_name_and_type:
143
+ - - dump/raw/owsm_all/wav.scp
144
+ - audio
145
+ - kaldi_ark
146
+ valid_data_path_and_name_and_type:
147
+ - - dump/raw/dev-small/wav.scp
148
+ - audio
149
+ - kaldi_ark
150
+ multi_task_dataset: false
151
+ allow_variable_data_keys: false
152
+ max_cache_size: 0.0
153
+ max_cache_fd: 32
154
+ allow_multi_rates: false
155
+ valid_max_cache_size: null
156
+ exclude_weight_decay: false
157
+ exclude_weight_decay_conf: {}
158
+ optim: adamw
159
+ optim_conf:
160
+ lr: 0.0002
161
+ betas:
162
+ - 0.5
163
+ - 0.9
164
+ eps: 1.0e-09
165
+ weight_decay: 0.0
166
+ scheduler: exponentiallr
167
+ scheduler_conf:
168
+ gamma: 0.999875
169
+ optim2: adamw
170
+ optim2_conf:
171
+ lr: 0.0002
172
+ betas:
173
+ - 0.5
174
+ - 0.9
175
+ eps: 1.0e-09
176
+ weight_decay: 0.0
177
+ scheduler2: exponentiallr
178
+ scheduler2_conf:
179
+ gamma: 0.999875
180
+ generator_first: true
181
+ skip_discriminator_prob: 0.0
182
+ model_conf: {}
183
+ use_preprocessor: true
184
+ codec: soundstream
185
+ codec_conf:
186
+ sampling_rate: 16000
187
+ generator_params:
188
+ hidden_dim: 512
189
+ encdec_channels: 1
190
+ encdec_n_filters: 32
191
+ encdec_n_residual_layers: 3
192
+ encdec_ratios:
193
+ - 8
194
+ - 5
195
+ - 4
196
+ - 2
197
+ encdec_activation: ELU
198
+ encdec_activation_params:
199
+ alpha: 1.0
200
+ encdec_norm: weight_norm
201
+ encdec_kernel_size: 7
202
+ encdec_residual_kernel_size: 7
203
+ encdec_last_kernel_size: 7
204
+ encdec_dilation_base: 2
205
+ encdec_causal: false
206
+ encdec_pad_mode: reflect
207
+ encdec_true_skip: false
208
+ encdec_compress: 2
209
+ encdec_lstm: 2
210
+ decoder_trim_right_ratio: 1.0
211
+ decoder_final_activation: null
212
+ decoder_final_activation_params: null
213
+ quantizer_n_q: 8
214
+ quantizer_bins: 1024
215
+ quantizer_decay: 0.99
216
+ quantizer_kmeans_init: true
217
+ quantizer_kmeans_iters: 50
218
+ quantizer_threshold_ema_dead_code: 2
219
+ quantizer_target_bandwidth:
220
+ - 0.5
221
+ - 1.0
222
+ - 1.5
223
+ - 2
224
+ - 4
225
+ sample_rate: 16000
226
+ discriminator_params:
227
+ scales: 3
228
+ scale_downsample_pooling: AvgPool1d
229
+ scale_downsample_pooling_params:
230
+ kernel_size: 4
231
+ stride: 2
232
+ padding: 2
233
+ scale_discriminator_params:
234
+ in_channels: 1
235
+ out_channels: 1
236
+ kernel_sizes:
237
+ - 15
238
+ - 41
239
+ - 5
240
+ - 3
241
+ channels: 128
242
+ max_downsample_channels: 1024
243
+ max_groups: 16
244
+ bias: true
245
+ downsample_scales:
246
+ - 2
247
+ - 2
248
+ - 4
249
+ - 4
250
+ - 1
251
+ nonlinear_activation: LeakyReLU
252
+ nonlinear_activation_params:
253
+ negative_slope: 0.1
254
+ scale_follow_official_norm: false
255
+ complexstft_discriminator_params:
256
+ in_channels: 1
257
+ channels: 32
258
+ strides:
259
+ - - 1
260
+ - 2
261
+ - - 2
262
+ - 2
263
+ - - 1
264
+ - 2
265
+ - - 2
266
+ - 2
267
+ - - 1
268
+ - 2
269
+ - - 2
270
+ - 2
271
+ chan_mults:
272
+ - 1
273
+ - 2
274
+ - 4
275
+ - 4
276
+ - 8
277
+ - 8
278
+ n_fft: 1024
279
+ hop_length: 256
280
+ win_length: 1024
281
+ stft_normalized: false
282
+ generator_adv_loss_params:
283
+ average_by_discriminators: false
284
+ loss_type: mse
285
+ discriminator_adv_loss_params:
286
+ average_by_discriminators: false
287
+ loss_type: mse
288
+ use_feat_match_loss: true
289
+ feat_match_loss_params:
290
+ average_by_discriminators: false
291
+ average_by_layers: false
292
+ include_final_outputs: true
293
+ use_mel_loss: true
294
+ mel_loss_params:
295
+ range_start: 6
296
+ range_end: 11
297
+ window: hann
298
+ n_mels: 80
299
+ fmin: 0
300
+ fmax: null
301
+ log_base: null
302
+ fs: 16000
303
+ lambda_quantization: 1.0
304
+ lambda_commit: 1.0
305
+ lambda_reconstruct: 1.0
306
+ lambda_adv: 1.0
307
+ lambda_mel: 45.0
308
+ lambda_feat_match: 2.0
309
+ cache_generator_outputs: true
310
+ required:
311
+ - output_dir
312
+ version: '202402'
313
+ distributed: true
314
+ ```
315
+
316
+ </details>
317
+
318
+
319
+
320
+ ### Citing ESPnet
321
+
322
+ ```bibtex
323
+ @inproceedings{watanabe2018espnet,
324
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
325
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
326
+ year={2018},
327
+ booktitle={Proceedings of Interspeech},
328
+ pages={2207--2211},
329
+ doi={10.21437/Interspeech.2018-1456},
330
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
331
+ }
332
+
333
+
334
+
335
+
336
+
337
+
338
+ ```
339
+
340
+ or arXiv:
341
+
342
+ ```bibtex
343
+ @misc{watanabe2018espnet,
344
+ title={ESPnet: End-to-End Speech Processing Toolkit},
345
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
346
+ year={2018},
347
+ eprint={1804.00015},
348
+ archivePrefix={arXiv},
349
+ primaryClass={cs.CL}
350
+ }
351
+ ```
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/75epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b955418d3cd74169d26384198bb4591000d4eaf32b25324be550ce0959e763d
3
+ size 253744924
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/config.yaml ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train_soundstream4_large_v1.8.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: category_chunk
7
+ valid_iterator_type: null
8
+ output_dir: exp/codec_train_soundstream4_large_v1.8_raw_fs16000
9
+ ngpu: 1
10
+ seed: 777
11
+ num_workers: 1
12
+ num_att_plot: 0
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: 2
16
+ dist_rank: 0
17
+ local_rank: 0
18
+ dist_master_addr: localhost
19
+ dist_master_port: 59983
20
+ dist_launcher: null
21
+ multiprocessing_distributed: true
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ use_deepspeed: false
25
+ deepspeed_config: null
26
+ cudnn_enabled: true
27
+ cudnn_benchmark: false
28
+ cudnn_deterministic: false
29
+ use_tf32: true
30
+ collect_stats: false
31
+ write_collected_feats: false
32
+ max_epoch: 360
33
+ patience: null
34
+ val_scheduler_criterion:
35
+ - valid
36
+ - loss
37
+ early_stopping_criterion:
38
+ - valid
39
+ - loss
40
+ - min
41
+ best_model_criterion:
42
+ - - valid
43
+ - mel_loss
44
+ - min
45
+ - - train
46
+ - mel_loss
47
+ - min
48
+ - - train
49
+ - total_count
50
+ - max
51
+ keep_nbest_models: 5
52
+ nbest_averaging_interval: 0
53
+ grad_clip: -1
54
+ grad_clip_type: 2.0
55
+ grad_noise: false
56
+ accum_grad: 1
57
+ no_forward_run: false
58
+ resume: true
59
+ train_dtype: float32
60
+ use_amp: false
61
+ log_interval: 50
62
+ use_matplotlib: true
63
+ use_tensorboard: true
64
+ create_graph_in_tensorboard: false
65
+ use_wandb: false
66
+ wandb_project: null
67
+ wandb_id: null
68
+ wandb_entity: null
69
+ wandb_name: null
70
+ wandb_model_log_interval: -1
71
+ detect_anomaly: false
72
+ use_adapter: false
73
+ adapter: lora
74
+ save_strategy: all
75
+ adapter_conf: {}
76
+ pretrain_path: null
77
+ init_param: []
78
+ ignore_init_mismatch: false
79
+ freeze_param: []
80
+ num_iters_per_epoch: 5000
81
+ batch_size: 64
82
+ valid_batch_size: null
83
+ batch_bins: 1000000
84
+ valid_batch_bins: null
85
+ category_sample_size: 3
86
+ train_shape_file:
87
+ - exp/codec_stats_raw/train/audio_shape
88
+ valid_shape_file:
89
+ - exp/codec_stats_raw/valid/audio_shape
90
+ batch_type: unsorted
91
+ valid_batch_type: null
92
+ fold_length:
93
+ - 256000
94
+ sort_in_batch: descending
95
+ shuffle_within_batch: false
96
+ sort_batch: descending
97
+ multiple_iterator: false
98
+ chunk_length: 32000
99
+ chunk_shift_ratio: 0.5
100
+ num_cache_chunks: 640
101
+ chunk_excluded_key_prefixes: []
102
+ chunk_default_fs: null
103
+ chunk_max_abs_length: null
104
+ chunk_discard_short_samples: true
105
+ train_data_path_and_name_and_type:
106
+ - - dump/raw/owsm_all/wav.scp
107
+ - audio
108
+ - kaldi_ark
109
+ valid_data_path_and_name_and_type:
110
+ - - dump/raw/dev-small/wav.scp
111
+ - audio
112
+ - kaldi_ark
113
+ multi_task_dataset: false
114
+ allow_variable_data_keys: false
115
+ max_cache_size: 0.0
116
+ max_cache_fd: 32
117
+ allow_multi_rates: false
118
+ valid_max_cache_size: null
119
+ exclude_weight_decay: false
120
+ exclude_weight_decay_conf: {}
121
+ optim: adamw
122
+ optim_conf:
123
+ lr: 0.0002
124
+ betas:
125
+ - 0.5
126
+ - 0.9
127
+ eps: 1.0e-09
128
+ weight_decay: 0.0
129
+ scheduler: exponentiallr
130
+ scheduler_conf:
131
+ gamma: 0.999875
132
+ optim2: adamw
133
+ optim2_conf:
134
+ lr: 0.0002
135
+ betas:
136
+ - 0.5
137
+ - 0.9
138
+ eps: 1.0e-09
139
+ weight_decay: 0.0
140
+ scheduler2: exponentiallr
141
+ scheduler2_conf:
142
+ gamma: 0.999875
143
+ generator_first: true
144
+ skip_discriminator_prob: 0.0
145
+ model_conf: {}
146
+ use_preprocessor: true
147
+ codec: soundstream
148
+ codec_conf:
149
+ sampling_rate: 16000
150
+ generator_params:
151
+ hidden_dim: 512
152
+ encdec_channels: 1
153
+ encdec_n_filters: 32
154
+ encdec_n_residual_layers: 3
155
+ encdec_ratios:
156
+ - 8
157
+ - 5
158
+ - 4
159
+ - 2
160
+ encdec_activation: ELU
161
+ encdec_activation_params:
162
+ alpha: 1.0
163
+ encdec_norm: weight_norm
164
+ encdec_kernel_size: 7
165
+ encdec_residual_kernel_size: 7
166
+ encdec_last_kernel_size: 7
167
+ encdec_dilation_base: 2
168
+ encdec_causal: false
169
+ encdec_pad_mode: reflect
170
+ encdec_true_skip: false
171
+ encdec_compress: 2
172
+ encdec_lstm: 2
173
+ decoder_trim_right_ratio: 1.0
174
+ decoder_final_activation: null
175
+ decoder_final_activation_params: null
176
+ quantizer_n_q: 8
177
+ quantizer_bins: 1024
178
+ quantizer_decay: 0.99
179
+ quantizer_kmeans_init: true
180
+ quantizer_kmeans_iters: 50
181
+ quantizer_threshold_ema_dead_code: 2
182
+ quantizer_target_bandwidth:
183
+ - 0.5
184
+ - 1.0
185
+ - 1.5
186
+ - 2
187
+ - 4
188
+ sample_rate: 16000
189
+ discriminator_params:
190
+ scales: 3
191
+ scale_downsample_pooling: AvgPool1d
192
+ scale_downsample_pooling_params:
193
+ kernel_size: 4
194
+ stride: 2
195
+ padding: 2
196
+ scale_discriminator_params:
197
+ in_channels: 1
198
+ out_channels: 1
199
+ kernel_sizes:
200
+ - 15
201
+ - 41
202
+ - 5
203
+ - 3
204
+ channels: 128
205
+ max_downsample_channels: 1024
206
+ max_groups: 16
207
+ bias: true
208
+ downsample_scales:
209
+ - 2
210
+ - 2
211
+ - 4
212
+ - 4
213
+ - 1
214
+ nonlinear_activation: LeakyReLU
215
+ nonlinear_activation_params:
216
+ negative_slope: 0.1
217
+ scale_follow_official_norm: false
218
+ complexstft_discriminator_params:
219
+ in_channels: 1
220
+ channels: 32
221
+ strides:
222
+ - - 1
223
+ - 2
224
+ - - 2
225
+ - 2
226
+ - - 1
227
+ - 2
228
+ - - 2
229
+ - 2
230
+ - - 1
231
+ - 2
232
+ - - 2
233
+ - 2
234
+ chan_mults:
235
+ - 1
236
+ - 2
237
+ - 4
238
+ - 4
239
+ - 8
240
+ - 8
241
+ n_fft: 1024
242
+ hop_length: 256
243
+ win_length: 1024
244
+ stft_normalized: false
245
+ generator_adv_loss_params:
246
+ average_by_discriminators: false
247
+ loss_type: mse
248
+ discriminator_adv_loss_params:
249
+ average_by_discriminators: false
250
+ loss_type: mse
251
+ use_feat_match_loss: true
252
+ feat_match_loss_params:
253
+ average_by_discriminators: false
254
+ average_by_layers: false
255
+ include_final_outputs: true
256
+ use_mel_loss: true
257
+ mel_loss_params:
258
+ range_start: 6
259
+ range_end: 11
260
+ window: hann
261
+ n_mels: 80
262
+ fmin: 0
263
+ fmax: null
264
+ log_base: null
265
+ fs: 16000
266
+ lambda_quantization: 1.0
267
+ lambda_commit: 1.0
268
+ lambda_reconstruct: 1.0
269
+ lambda_adv: 1.0
270
+ lambda_mel: 45.0
271
+ lambda_feat_match: 2.0
272
+ cache_generator_outputs: true
273
+ required:
274
+ - output_dir
275
+ version: '202402'
276
+ distributed: true
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/adv_loss.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/codec_commit_loss.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/codec_loss.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/codec_quantization_loss.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/discriminator_backward_time.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/discriminator_forward_time.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/discriminator_loss.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/discriminator_optim_step_time.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/discriminator_train_time.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/fake_loss.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/feat_match_loss.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/generator_backward_time.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/generator_forward_time.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/generator_optim_step_time.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/generator_train_time.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/gpu_max_cached_mem_GB.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/iter_time.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/loss.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/mel_loss.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/mel_loss_real.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/optim0_lr0.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/optim1_lr0.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/real_loss.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/reconstruct_loss.png ADDED
exp/codec_train_soundstream4_large_v1.8_raw_fs16000/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202402'
2
+ files:
3
+ model_file: exp/codec_train_soundstream4_large_v1.8_raw_fs16000/75epoch.pth
4
+ python: 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:26:55) [GCC 12.3.0]
5
+ timestamp: 1729619370.963525
6
+ torch: 2.5.0.dev20240825+cu124
7
+ yaml_files:
8
+ train_config: exp/codec_train_soundstream4_large_v1.8_raw_fs16000/config.yaml