ftshijt committed on
Commit
4387365
1 Parent(s): 5d6d383

Update model

Browse files
Files changed (29) hide show
  1. README.md +319 -0
  2. exp/codec_train_encodec_large_v1.4_raw_fs16000/360epoch.pth +3 -0
  3. exp/codec_train_encodec_large_v1.4_raw_fs16000/config.yaml +244 -0
  4. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/adv_loss.png +0 -0
  5. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/codec_commit_loss.png +0 -0
  6. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/codec_loss.png +0 -0
  7. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/codec_quantization_loss.png +0 -0
  8. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/discriminator_backward_time.png +0 -0
  9. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/discriminator_forward_time.png +0 -0
  10. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/discriminator_loss.png +0 -0
  11. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/discriminator_optim_step_time.png +0 -0
  12. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/discriminator_train_time.png +0 -0
  13. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/fake_loss.png +0 -0
  14. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/feat_match_loss.png +0 -0
  15. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/generator_backward_time.png +0 -0
  16. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/generator_forward_time.png +0 -0
  17. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/generator_optim_step_time.png +0 -0
  18. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/generator_train_time.png +0 -0
  19. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/gpu_max_cached_mem_GB.png +0 -0
  20. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/iter_time.png +0 -0
  21. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/loss.png +0 -0
  22. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/mel_loss.png +0 -0
  23. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/mel_loss_real.png +0 -0
  24. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/optim0_lr0.png +0 -0
  25. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/optim1_lr0.png +0 -0
  26. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/real_loss.png +0 -0
  27. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/reconstruct_loss.png +0 -0
  28. exp/codec_train_encodec_large_v1.4_raw_fs16000/images/train_time.png +0 -0
  29. meta.yaml +8 -0
README.md ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - codec
6
+ language: multilingual
7
+ datasets:
8
+ - amuse
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 Codec model
13
+
14
+ ### `ftshijt/espnet_codec_encodec_large_v1.4`
15
+
16
+ This model was trained by ftshijt using the amuse recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 734f1235b3dd3c444822b6337fbb2e417e75e321
26
+ pip install -e .
27
+ cd egs2/amuse/codec_speechlm
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model ftshijt/espnet_codec_encodec_large_v1.4
29
+ ```
30
+
31
+
32
+
33
+ ## Codec config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/train_encodec_large_v1.4.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ drop_last_iter: false
42
+ dry_run: false
43
+ iterator_type: chunk
44
+ valid_iterator_type: null
45
+ output_dir: exp/codec_train_encodec_large_v1.4_raw_fs16000
46
+ ngpu: 1
47
+ seed: 777
48
+ num_workers: 1
49
+ num_att_plot: 0
50
+ dist_backend: nccl
51
+ dist_init_method: env://
52
+ dist_world_size: 4
53
+ dist_rank: 0
54
+ local_rank: 0
55
+ dist_master_addr: localhost
56
+ dist_master_port: 52547
57
+ dist_launcher: null
58
+ multiprocessing_distributed: true
59
+ unused_parameters: true
60
+ sharded_ddp: false
61
+ cudnn_enabled: true
62
+ cudnn_benchmark: false
63
+ cudnn_deterministic: false
64
+ use_tf32: false
65
+ collect_stats: false
66
+ write_collected_feats: false
67
+ max_epoch: 360
68
+ patience: null
69
+ val_scheduler_criterion:
70
+ - valid
71
+ - loss
72
+ early_stopping_criterion:
73
+ - valid
74
+ - loss
75
+ - min
76
+ best_model_criterion:
77
+ - - valid
78
+ - mel_loss
79
+ - min
80
+ - - train
81
+ - mel_loss
82
+ - min
83
+ - - train
84
+ - total_count
85
+ - max
86
+ keep_nbest_models: 5
87
+ nbest_averaging_interval: 0
88
+ grad_clip: -1
89
+ grad_clip_type: 2.0
90
+ grad_noise: false
91
+ accum_grad: 1
92
+ no_forward_run: false
93
+ resume: true
94
+ train_dtype: float32
95
+ use_amp: false
96
+ log_interval: 50
97
+ use_matplotlib: true
98
+ use_tensorboard: true
99
+ create_graph_in_tensorboard: false
100
+ use_wandb: false
101
+ wandb_project: null
102
+ wandb_id: null
103
+ wandb_entity: null
104
+ wandb_name: null
105
+ wandb_model_log_interval: -1
106
+ detect_anomaly: false
107
+ use_adapter: false
108
+ adapter: lora
109
+ save_strategy: all
110
+ adapter_conf: {}
111
+ pretrain_path: null
112
+ init_param: []
113
+ ignore_init_mismatch: false
114
+ freeze_param: []
115
+ num_iters_per_epoch: 5000
116
+ batch_size: 128
117
+ valid_batch_size: null
118
+ batch_bins: 1000000
119
+ valid_batch_bins: null
120
+ train_shape_file:
121
+ - exp/codec_stats_raw/train/audio_shape
122
+ valid_shape_file:
123
+ - exp/codec_stats_raw/valid/audio_shape
124
+ batch_type: unsorted
125
+ valid_batch_type: null
126
+ fold_length:
127
+ - 256000
128
+ sort_in_batch: descending
129
+ shuffle_within_batch: false
130
+ sort_batch: descending
131
+ multiple_iterator: false
132
+ truncate_audio: false
133
+ chunk_length: 32000
134
+ chunk_shift_ratio: 0.5
135
+ num_cache_chunks: 128
136
+ chunk_excluded_key_prefixes: []
137
+ chunk_default_fs: null
138
+ train_data_path_and_name_and_type:
139
+ - - dump/raw/train/wav.scp
140
+ - audio
141
+ - kaldi_ark
142
+ valid_data_path_and_name_and_type:
143
+ - - dump/raw/dev-small/wav.scp
144
+ - audio
145
+ - kaldi_ark
146
+ multi_task_dataset: false
147
+ allow_variable_data_keys: false
148
+ max_cache_size: 0.0
149
+ max_cache_fd: 32
150
+ allow_multi_rates: false
151
+ valid_max_cache_size: null
152
+ exclude_weight_decay: false
153
+ exclude_weight_decay_conf: {}
154
+ optim: adamw
155
+ optim_conf:
156
+ lr: 0.0002
157
+ betas:
158
+ - 0.5
159
+ - 0.9
160
+ eps: 1.0e-09
161
+ weight_decay: 0.0
162
+ scheduler: exponentiallr
163
+ scheduler_conf:
164
+ gamma: 0.999875
165
+ optim2: adamw
166
+ optim2_conf:
167
+ lr: 0.0002
168
+ betas:
169
+ - 0.5
170
+ - 0.9
171
+ eps: 1.0e-09
172
+ weight_decay: 0.0
173
+ scheduler2: exponentiallr
174
+ scheduler2_conf:
175
+ gamma: 0.999875
176
+ generator_first: true
177
+ skip_discriminator_prob: 0.3
178
+ model_conf: {}
179
+ use_preprocessor: true
180
+ codec: encodec
181
+ codec_conf:
182
+ sampling_rate: 16000
183
+ generator_params:
184
+ hidden_dim: 512
185
+ encdec_channels: 1
186
+ encdec_n_filters: 32
187
+ encdec_n_residual_layers: 3
188
+ encdec_ratios:
189
+ - 8
190
+ - 5
191
+ - 4
192
+ - 2
193
+ encdec_activation: ELU
194
+ encdec_activation_params:
195
+ alpha: 1.0
196
+ encdec_norm: weight_norm
197
+ encdec_kernel_size: 7
198
+ encdec_residual_kernel_size: 7
199
+ encdec_last_kernel_size: 7
200
+ encdec_dilation_base: 2
201
+ encdec_causal: false
202
+ encdec_pad_mode: reflect
203
+ encdec_true_skip: false
204
+ encdec_compress: 2
205
+ encdec_lstm: 2
206
+ decoder_trim_right_ratio: 1.0
207
+ decoder_final_activation: null
208
+ decoder_final_activation_params: null
209
+ quantizer_n_q: 8
210
+ quantizer_bins: 1024
211
+ quantizer_decay: 0.99
212
+ quantizer_kmeans_init: true
213
+ quantizer_kmeans_iters: 50
214
+ quantizer_threshold_ema_dead_code: 2
215
+ quantizer_target_bandwidth:
216
+ - 0.5
217
+ - 1
218
+ - 1.5
219
+ - 2.0
220
+ - 4
221
+ sample_rate: 16000
222
+ discriminator_params:
223
+ msstft_discriminator_params:
224
+ filters: 32
225
+ in_channels: 1
226
+ out_channels: 1
227
+ norm: weight_norm
228
+ n_ffts:
229
+ - 1024
230
+ - 2048
231
+ - 512
232
+ - 256
233
+ - 128
234
+ hop_lengths:
235
+ - 256
236
+ - 512
237
+ - 128
238
+ - 64
239
+ - 32
240
+ win_lengths:
241
+ - 1024
242
+ - 2048
243
+ - 512
244
+ - 256
245
+ - 128
246
+ activation: LeakyReLU
247
+ activation_params:
248
+ negative_slope: 0.3
249
+ generator_adv_loss_params:
250
+ average_by_discriminators: false
251
+ loss_type: mse
252
+ discriminator_adv_loss_params:
253
+ average_by_discriminators: false
254
+ loss_type: mse
255
+ use_feat_match_loss: true
256
+ feat_match_loss_params:
257
+ average_by_discriminators: false
258
+ average_by_layers: false
259
+ include_final_outputs: true
260
+ use_mel_loss: true
261
+ mel_loss_params:
262
+ range_start: 6
263
+ range_end: 11
264
+ window: hann
265
+ n_mels: 80
266
+ fmin: 0
267
+ fmax: null
268
+ log_base: null
269
+ fs: 16000
270
+ lambda_quantization: 1.0
271
+ lambda_commit: 1.0
272
+ lambda_reconstruct: 1.0
273
+ lambda_adv: 10.0
274
+ lambda_mel: 45.0
275
+ lambda_feat_match: 2.0
276
+ cache_generator_outputs: true
277
+ use_loss_balancer: false
278
+ required:
279
+ - output_dir
280
+ version: '202402'
281
+ distributed: true
282
+ ```
283
+
284
+ </details>
285
+
286
+
287
+
288
+ ### Citing ESPnet
289
+
290
+ ```bibtex
291
+ @inproceedings{watanabe2018espnet,
292
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
293
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
294
+ year={2018},
295
+ booktitle={Proceedings of Interspeech},
296
+ pages={2207--2211},
297
+ doi={10.21437/Interspeech.2018-1456},
298
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
299
+ }
300
+
301
+
302
+
303
+
304
+
305
+
306
+ ```
307
+
308
+ or arXiv:
309
+
310
+ ```bibtex
311
+ @misc{watanabe2018espnet,
312
+ title={ESPnet: End-to-End Speech Processing Toolkit},
313
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
314
+ year={2018},
315
+ eprint={1804.00015},
316
+ archivePrefix={arXiv},
317
+ primaryClass={cs.CL}
318
+ }
319
+ ```
exp/codec_train_encodec_large_v1.4_raw_fs16000/360epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52e767ac23251fcf336e62ce322d103086c7fbbffcd053695cb8b8ecb71a6e34
3
+ size 114842233
exp/codec_train_encodec_large_v1.4_raw_fs16000/config.yaml ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train_encodec_large_v1.4.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: chunk
7
+ valid_iterator_type: null
8
+ output_dir: exp/codec_train_encodec_large_v1.4_raw_fs16000
9
+ ngpu: 1
10
+ seed: 777
11
+ num_workers: 1
12
+ num_att_plot: 0
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: 4
16
+ dist_rank: 0
17
+ local_rank: 0
18
+ dist_master_addr: localhost
19
+ dist_master_port: 52547
20
+ dist_launcher: null
21
+ multiprocessing_distributed: true
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: false
27
+ use_tf32: false
28
+ collect_stats: false
29
+ write_collected_feats: false
30
+ max_epoch: 360
31
+ patience: null
32
+ val_scheduler_criterion:
33
+ - valid
34
+ - loss
35
+ early_stopping_criterion:
36
+ - valid
37
+ - loss
38
+ - min
39
+ best_model_criterion:
40
+ - - valid
41
+ - mel_loss
42
+ - min
43
+ - - train
44
+ - mel_loss
45
+ - min
46
+ - - train
47
+ - total_count
48
+ - max
49
+ keep_nbest_models: 5
50
+ nbest_averaging_interval: 0
51
+ grad_clip: -1
52
+ grad_clip_type: 2.0
53
+ grad_noise: false
54
+ accum_grad: 1
55
+ no_forward_run: false
56
+ resume: true
57
+ train_dtype: float32
58
+ use_amp: false
59
+ log_interval: 50
60
+ use_matplotlib: true
61
+ use_tensorboard: true
62
+ create_graph_in_tensorboard: false
63
+ use_wandb: false
64
+ wandb_project: null
65
+ wandb_id: null
66
+ wandb_entity: null
67
+ wandb_name: null
68
+ wandb_model_log_interval: -1
69
+ detect_anomaly: false
70
+ use_adapter: false
71
+ adapter: lora
72
+ save_strategy: all
73
+ adapter_conf: {}
74
+ pretrain_path: null
75
+ init_param: []
76
+ ignore_init_mismatch: false
77
+ freeze_param: []
78
+ num_iters_per_epoch: 5000
79
+ batch_size: 128
80
+ valid_batch_size: null
81
+ batch_bins: 1000000
82
+ valid_batch_bins: null
83
+ train_shape_file:
84
+ - exp/codec_stats_raw/train/audio_shape
85
+ valid_shape_file:
86
+ - exp/codec_stats_raw/valid/audio_shape
87
+ batch_type: unsorted
88
+ valid_batch_type: null
89
+ fold_length:
90
+ - 256000
91
+ sort_in_batch: descending
92
+ shuffle_within_batch: false
93
+ sort_batch: descending
94
+ multiple_iterator: false
95
+ truncate_audio: false
96
+ chunk_length: 32000
97
+ chunk_shift_ratio: 0.5
98
+ num_cache_chunks: 128
99
+ chunk_excluded_key_prefixes: []
100
+ chunk_default_fs: null
101
+ train_data_path_and_name_and_type:
102
+ - - dump/raw/train/wav.scp
103
+ - audio
104
+ - kaldi_ark
105
+ valid_data_path_and_name_and_type:
106
+ - - dump/raw/dev-small/wav.scp
107
+ - audio
108
+ - kaldi_ark
109
+ multi_task_dataset: false
110
+ allow_variable_data_keys: false
111
+ max_cache_size: 0.0
112
+ max_cache_fd: 32
113
+ allow_multi_rates: false
114
+ valid_max_cache_size: null
115
+ exclude_weight_decay: false
116
+ exclude_weight_decay_conf: {}
117
+ optim: adamw
118
+ optim_conf:
119
+ lr: 0.0002
120
+ betas:
121
+ - 0.5
122
+ - 0.9
123
+ eps: 1.0e-09
124
+ weight_decay: 0.0
125
+ scheduler: exponentiallr
126
+ scheduler_conf:
127
+ gamma: 0.999875
128
+ optim2: adamw
129
+ optim2_conf:
130
+ lr: 0.0002
131
+ betas:
132
+ - 0.5
133
+ - 0.9
134
+ eps: 1.0e-09
135
+ weight_decay: 0.0
136
+ scheduler2: exponentiallr
137
+ scheduler2_conf:
138
+ gamma: 0.999875
139
+ generator_first: true
140
+ skip_discriminator_prob: 0.3
141
+ model_conf: {}
142
+ use_preprocessor: true
143
+ codec: encodec
144
+ codec_conf:
145
+ sampling_rate: 16000
146
+ generator_params:
147
+ hidden_dim: 512
148
+ encdec_channels: 1
149
+ encdec_n_filters: 32
150
+ encdec_n_residual_layers: 3
151
+ encdec_ratios:
152
+ - 8
153
+ - 5
154
+ - 4
155
+ - 2
156
+ encdec_activation: ELU
157
+ encdec_activation_params:
158
+ alpha: 1.0
159
+ encdec_norm: weight_norm
160
+ encdec_kernel_size: 7
161
+ encdec_residual_kernel_size: 7
162
+ encdec_last_kernel_size: 7
163
+ encdec_dilation_base: 2
164
+ encdec_causal: false
165
+ encdec_pad_mode: reflect
166
+ encdec_true_skip: false
167
+ encdec_compress: 2
168
+ encdec_lstm: 2
169
+ decoder_trim_right_ratio: 1.0
170
+ decoder_final_activation: null
171
+ decoder_final_activation_params: null
172
+ quantizer_n_q: 8
173
+ quantizer_bins: 1024
174
+ quantizer_decay: 0.99
175
+ quantizer_kmeans_init: true
176
+ quantizer_kmeans_iters: 50
177
+ quantizer_threshold_ema_dead_code: 2
178
+ quantizer_target_bandwidth:
179
+ - 0.5
180
+ - 1
181
+ - 1.5
182
+ - 2.0
183
+ - 4
184
+ sample_rate: 16000
185
+ discriminator_params:
186
+ msstft_discriminator_params:
187
+ filters: 32
188
+ in_channels: 1
189
+ out_channels: 1
190
+ norm: weight_norm
191
+ n_ffts:
192
+ - 1024
193
+ - 2048
194
+ - 512
195
+ - 256
196
+ - 128
197
+ hop_lengths:
198
+ - 256
199
+ - 512
200
+ - 128
201
+ - 64
202
+ - 32
203
+ win_lengths:
204
+ - 1024
205
+ - 2048
206
+ - 512
207
+ - 256
208
+ - 128
209
+ activation: LeakyReLU
210
+ activation_params:
211
+ negative_slope: 0.3
212
+ generator_adv_loss_params:
213
+ average_by_discriminators: false
214
+ loss_type: mse
215
+ discriminator_adv_loss_params:
216
+ average_by_discriminators: false
217
+ loss_type: mse
218
+ use_feat_match_loss: true
219
+ feat_match_loss_params:
220
+ average_by_discriminators: false
221
+ average_by_layers: false
222
+ include_final_outputs: true
223
+ use_mel_loss: true
224
+ mel_loss_params:
225
+ range_start: 6
226
+ range_end: 11
227
+ window: hann
228
+ n_mels: 80
229
+ fmin: 0
230
+ fmax: null
231
+ log_base: null
232
+ fs: 16000
233
+ lambda_quantization: 1.0
234
+ lambda_commit: 1.0
235
+ lambda_reconstruct: 1.0
236
+ lambda_adv: 10.0
237
+ lambda_mel: 45.0
238
+ lambda_feat_match: 2.0
239
+ cache_generator_outputs: true
240
+ use_loss_balancer: false
241
+ required:
242
+ - output_dir
243
+ version: '202402'
244
+ distributed: true
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/adv_loss.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/codec_commit_loss.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/codec_loss.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/codec_quantization_loss.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/discriminator_backward_time.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/discriminator_forward_time.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/discriminator_loss.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/discriminator_optim_step_time.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/discriminator_train_time.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/fake_loss.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/feat_match_loss.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/generator_backward_time.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/generator_forward_time.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/generator_optim_step_time.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/generator_train_time.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/gpu_max_cached_mem_GB.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/iter_time.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/loss.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/mel_loss.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/mel_loss_real.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/optim0_lr0.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/optim1_lr0.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/real_loss.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/reconstruct_loss.png ADDED
exp/codec_train_encodec_large_v1.4_raw_fs16000/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202402'
2
+ files:
3
+ model_file: exp/codec_train_encodec_large_v1.4_raw_fs16000/360epoch.pth
4
+ python: "3.9.19 (main, Mar 21 2024, 17:11:28) \n[GCC 11.2.0]"
5
+ timestamp: 1729569537.320478
6
+ torch: 2.1.0
7
+ yaml_files:
8
+ train_config: exp/codec_train_encodec_large_v1.4_raw_fs16000/config.yaml