soumi-maiti committed on
Commit
1c70d1a
1 Parent(s): f5ada96

Update model

Browse files
Files changed (25) hide show
  1. README.md +350 -0
  2. exp/diar_enh_stats_8k/train/feats_stats.npz +3 -0
  3. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/57epoch.pth +3 -0
  4. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/DIAR_RESULTS.md +15 -0
  5. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/ENH_RESULTS.md +35 -0
  6. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/config.yaml +277 -0
  7. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/acc.png +0 -0
  8. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/backward_time.png +0 -0
  9. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/cf.png +0 -0
  10. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/der.png +0 -0
  11. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/fa.png +0 -0
  12. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/forward_time.png +0 -0
  13. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/gpu_max_cached_mem_GB.png +0 -0
  14. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/iter_time.png +0 -0
  15. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/loss.png +0 -0
  16. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/loss_att.png +0 -0
  17. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/loss_enh.png +0 -0
  18. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/loss_pit.png +0 -0
  19. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/mi.png +0 -0
  20. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/optim0_lr0.png +0 -0
  21. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/optim_step_time.png +0 -0
  22. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/sad_fr.png +0 -0
  23. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/sad_mr.png +0 -0
  24. exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/train_time.png +0 -0
  25. meta.yaml +8 -0
README.md ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - diarization
6
+ language: noinfo
7
+ datasets:
8
+ - librimix
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 DIAR model
13
+
14
+ ### `soumi-maiti/libri2mix_eend_ss`
15
+
16
+ This model was trained by soumimaiti using the librimix recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout d837c97c88f13ffe655a30bcff93d814f212b225
26
+ pip install -e .
27
+ cd egs2/librimix/enh_diar1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model soumi-maiti/libri2mix_eend_ss
29
+ ```
30
+
31
+
32
+
33
+ ## DIAR config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/tuning/train_diar_enh_convtasnet_concat_feats.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ dry_run: false
42
+ iterator_type: chunk
43
+ output_dir: exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw
44
+ ngpu: 1
45
+ seed: 0
46
+ num_workers: 4
47
+ num_att_plot: 3
48
+ dist_backend: nccl
49
+ dist_init_method: env://
50
+ dist_world_size: 2
51
+ dist_rank: 0
52
+ local_rank: 0
53
+ dist_master_addr: localhost
54
+ dist_master_port: 54493
55
+ dist_launcher: null
56
+ multiprocessing_distributed: true
57
+ unused_parameters: false
58
+ sharded_ddp: false
59
+ cudnn_enabled: true
60
+ cudnn_benchmark: false
61
+ cudnn_deterministic: true
62
+ collect_stats: false
63
+ write_collected_feats: false
64
+ max_epoch: 100
65
+ patience: 50
66
+ val_scheduler_criterion:
67
+ - valid
68
+ - loss
69
+ early_stopping_criterion:
70
+ - valid
71
+ - loss
72
+ - min
73
+ best_model_criterion:
74
+ - - valid
75
+ - loss_enh
76
+ - min
77
+ keep_nbest_models: 1
78
+ nbest_averaging_interval: 0
79
+ grad_clip: 5.0
80
+ grad_clip_type: 2.0
81
+ grad_noise: false
82
+ accum_grad: 2
83
+ no_forward_run: false
84
+ resume: true
85
+ train_dtype: float32
86
+ use_amp: false
87
+ log_interval: null
88
+ use_matplotlib: true
89
+ use_tensorboard: true
90
+ use_wandb: false
91
+ wandb_project: null
92
+ wandb_id: null
93
+ wandb_entity: null
94
+ wandb_name: null
95
+ wandb_model_log_interval: -1
96
+ detect_anomaly: false
97
+ pretrain_path: null
98
+ init_param: []
99
+ ignore_init_mismatch: false
100
+ freeze_param: []
101
+ num_iters_per_epoch: null
102
+ batch_size: 8
103
+ valid_batch_size: null
104
+ batch_bins: 1000000
105
+ valid_batch_bins: null
106
+ train_shape_file:
107
+ - exp/diar_enh_stats_8k/train/speech_shape
108
+ - exp/diar_enh_stats_8k/train/text_shape
109
+ - exp/diar_enh_stats_8k/train/speech_ref1_shape
110
+ - exp/diar_enh_stats_8k/train/speech_ref2_shape
111
+ - exp/diar_enh_stats_8k/train/noise_ref1_shape
112
+ valid_shape_file:
113
+ - exp/diar_enh_stats_8k/valid/speech_shape
114
+ - exp/diar_enh_stats_8k/valid/text_shape
115
+ - exp/diar_enh_stats_8k/valid/speech_ref1_shape
116
+ - exp/diar_enh_stats_8k/valid/speech_ref2_shape
117
+ - exp/diar_enh_stats_8k/valid/noise_ref1_shape
118
+ batch_type: folded
119
+ valid_batch_type: null
120
+ fold_length:
121
+ - 800
122
+ - 80000
123
+ - 80000
124
+ - 80000
125
+ - 80000
126
+ sort_in_batch: descending
127
+ sort_batch: descending
128
+ multiple_iterator: false
129
+ chunk_length: 24000
130
+ chunk_shift_ratio: 0.5
131
+ num_cache_chunks: 1024
132
+ train_data_path_and_name_and_type:
133
+ - - dump/raw/train/wav.scp
134
+ - speech
135
+ - sound
136
+ - - dump/raw/train/espnet_rttm
137
+ - text
138
+ - rttm
139
+ - - dump/raw/train/spk1.scp
140
+ - speech_ref1
141
+ - sound
142
+ - - dump/raw/train/spk2.scp
143
+ - speech_ref2
144
+ - sound
145
+ - - dump/raw/train/noise1.scp
146
+ - noise_ref1
147
+ - sound
148
+ valid_data_path_and_name_and_type:
149
+ - - dump/raw/dev/wav.scp
150
+ - speech
151
+ - sound
152
+ - - dump/raw/dev/espnet_rttm
153
+ - text
154
+ - rttm
155
+ - - dump/raw/dev/spk1.scp
156
+ - speech_ref1
157
+ - sound
158
+ - - dump/raw/dev/spk2.scp
159
+ - speech_ref2
160
+ - sound
161
+ - - dump/raw/dev/noise1.scp
162
+ - noise_ref1
163
+ - sound
164
+ allow_variable_data_keys: false
165
+ max_cache_size: 0.0
166
+ max_cache_fd: 32
167
+ valid_max_cache_size: null
168
+ optim: adam
169
+ optim_conf:
170
+ lr: 0.001
171
+ eps: 1.0e-07
172
+ weight_decay: 0
173
+ scheduler: reducelronplateau
174
+ scheduler_conf:
175
+ mode: min
176
+ factor: 0.5
177
+ patience: 50
178
+ token_list: null
179
+ src_token_list: null
180
+ init: xavier_uniform
181
+ input_size: null
182
+ ctc_conf:
183
+ dropout_rate: 0.0
184
+ ctc_type: builtin
185
+ reduce: true
186
+ ignore_nan_grad: null
187
+ zero_infinity: true
188
+ enh_criterions:
189
+ - name: si_snr
190
+ conf:
191
+ eps: 1.0e-07
192
+ wrapper: pit
193
+ wrapper_conf:
194
+ weight: 1.0
195
+ independent_perm: true
196
+ diar_num_spk: 2
197
+ diar_input_size: 128
198
+ enh_model_conf:
199
+ loss_type: si_snr
200
+ asr_model_conf:
201
+ ctc_weight: 0.5
202
+ interctc_weight: 0.0
203
+ ignore_id: -1
204
+ lsm_weight: 0.0
205
+ length_normalized_loss: false
206
+ report_cer: true
207
+ report_wer: true
208
+ sym_space: <space>
209
+ sym_blank: <blank>
210
+ extract_feats_in_collect_stats: true
211
+ st_model_conf:
212
+ stft_consistency: false
213
+ loss_type: mask_mse
214
+ mask_type: null
215
+ diar_model_conf:
216
+ diar_weight: 0.2
217
+ attractor_weight: 0.2
218
+ subtask_series:
219
+ - enh
220
+ - diar
221
+ model_conf:
222
+ calc_enh_loss: true
223
+ bypass_enh_prob: 0
224
+ use_preprocessor: true
225
+ token_type: bpe
226
+ bpemodel: null
227
+ src_token_type: bpe
228
+ src_bpemodel: null
229
+ non_linguistic_symbols: null
230
+ cleaner: null
231
+ g2p: null
232
+ enh_encoder: conv
233
+ enh_encoder_conf:
234
+ channel: 512
235
+ kernel_size: 16
236
+ stride: 8
237
+ enh_separator: tcn_nomask
238
+ enh_separator_conf:
239
+ layer: 8
240
+ stack: 3
241
+ bottleneck_dim: 128
242
+ hidden_dim: 512
243
+ kernel: 3
244
+ causal: false
245
+ norm_type: gLN
246
+ enh_decoder: conv
247
+ enh_decoder_conf:
248
+ channel: 512
249
+ kernel_size: 16
250
+ stride: 8
251
+ enh_mask_module: multi_mask
252
+ enh_mask_module_conf:
253
+ max_num_spk: 3
254
+ mask_nonlinear: relu
255
+ bottleneck_dim: 128
256
+ frontend: default
257
+ frontend_conf: {}
258
+ specaug: null
259
+ specaug_conf: {}
260
+ normalize: utterance_mvn
261
+ normalize_conf: {}
262
+ asr_preencoder: null
263
+ asr_preencoder_conf: {}
264
+ asr_encoder: rnn
265
+ asr_encoder_conf: {}
266
+ asr_postencoder: null
267
+ asr_postencoder_conf: {}
268
+ asr_decoder: rnn
269
+ asr_decoder_conf: {}
270
+ st_preencoder: null
271
+ st_preencoder_conf: {}
272
+ st_encoder: rnn
273
+ st_encoder_conf: {}
274
+ st_postencoder: null
275
+ st_postencoder_conf: {}
276
+ st_decoder: rnn
277
+ st_decoder_conf: {}
278
+ st_extra_asr_decoder: rnn
279
+ st_extra_asr_decoder_conf: {}
280
+ st_extra_mt_decoder: rnn
281
+ st_extra_mt_decoder_conf: {}
282
+ diar_frontend: default
283
+ diar_frontend_conf:
284
+ hop_length: 64
285
+ fs: 8000
286
+ diar_specaug: null
287
+ diar_specaug_conf: {}
288
+ diar_normalize: utterance_mvn
289
+ diar_normalize_conf: {}
290
+ diar_encoder: transformer
291
+ diar_encoder_conf:
292
+ input_layer: conv2d8
293
+ num_blocks: 4
294
+ linear_units: 512
295
+ dropout_rate: 0.1
296
+ output_size: 256
297
+ attention_heads: 4
298
+ attention_dropout_rate: 0.1
299
+ diar_decoder: linear
300
+ diar_decoder_conf: {}
301
+ label_aggregator: label_aggregator
302
+ label_aggregator_conf:
303
+ win_length: 256
304
+ hop_length: 64
305
+ diar_attractor: rnn
306
+ diar_attractor_conf:
307
+ unit: 256
308
+ layer: 1
309
+ dropout: 0.0
310
+ attractor_grad: true
311
+ required:
312
+ - output_dir
313
+ version: '202205'
314
+ distributed: true
315
+ ```
316
+
317
+ </details>
318
+
319
+
320
+
321
+ ### Citing ESPnet
322
+
323
+ ```bibtex
324
+ @inproceedings{watanabe2018espnet,
325
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
326
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
327
+ year={2018},
328
+ booktitle={Proceedings of Interspeech},
329
+ pages={2207--2211},
330
+ doi={10.21437/Interspeech.2018-1456},
331
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
332
+ }
333
+
334
+
335
+
336
+
337
+ ```
338
+
339
+ or arXiv:
340
+
341
+ ```bibtex
342
+ @misc{watanabe2018espnet,
343
+ title={ESPnet: End-to-End Speech Processing Toolkit},
344
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
345
+ year={2018},
346
+ eprint={1804.00015},
347
+ archivePrefix={arXiv},
348
+ primaryClass={cs.CL}
349
+ }
350
+ ```
exp/diar_enh_stats_8k/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9d63118338cef56e57e6b49c0dd8ac549ead5dbea1c7aedb51797930bada6d9
3
+ size 778
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/57epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8adef049fd941d6deb198a9dbfe05b4f8dd702a3d3ff496ae197c5608fce80ea
3
+ size 38983318
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/DIAR_RESULTS.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_diar_result.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Thu Sep 1 06:42:28 EDT 2022`
5
+ - python version: `3.9.7 (default, Sep 16 2021, 13:09:58) [GCC 7.5.0]`
6
+ - espnet version: `espnet 202205`
7
+ - pytorch version: `pytorch 1.8.1+cu102`
8
+ - Git hash: `d837c97c88f13ffe655a30bcff93d814f212b225`
9
+ - Commit date: `Wed Jun 29 12:04:57 2022 -0700`
10
+
11
+ ## diar_enh_train_diar_enh_convtasnet_concat_feats_raw
12
+ ### DER
13
+ diarized_enhanced_test_lat_enh
14
+ |threshold_median_collar|DER|
15
+ |---|---|
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/ENH_RESULTS.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_enh_score.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Thu Sep 1 06:50:40 EDT 2022`
5
+ - python version: `3.9.7 (default, Sep 16 2021, 13:09:58) [GCC 7.5.0]`
6
+ - espnet version: `espnet 202205`
7
+ - pytorch version: `pytorch 1.8.1+cu102`
8
+ - Git hash: `d837c97c88f13ffe655a30bcff93d814f212b225`
9
+ - Commit date: `Wed Jun 29 12:04:57 2022 -0700`
10
+
11
+
12
+ ## diar_enh_train_diar_enh_convtasnet_concat_feats_raw
13
+
14
+ config: conf/tuning/train_diar_enh_convtasnet_concat_feats.yaml
15
+
16
+ |dataset|STOI|SAR|SDR|SIR|SI_SNR|
17
+ |---|---|---|---|---|---|
18
+ |diarized_enhanced_dev|83.6928|10.6693|10.1626|21.8855|9.4331|
19
+ |diarized_enhanced_dev_wo_diar|83.7127|10.6663|10.1550|21.8520|9.4306|
20
+ |diarized_enhanced_test|83.7409|10.5066|9.9290|21.3136|9.1342|
21
+ |diarized_enhanced_test_ave_enh|83.7409|10.5066|9.9290|21.3136|9.1342|
22
+ |diarized_enhanced_test_ave_enh2|83.7560|10.5096|9.9249|21.2740|9.1355|
23
+ |diarized_enhanced_test_decode_diar|83.7560|10.5096|9.9249|21.2740|9.1355|
24
+ |diarized_enhanced_test_decode_diar_enh|83.7409|10.5066|9.9290|21.3136|9.1342|
25
+ |diarized_enhanced_test_decode_diar_enh2|83.7560|10.5096|9.9249|21.2740|9.1355|
26
+ |diarized_enhanced_test_lat_enh|83.7105|10.5050|9.9230|21.3135|9.1285|
27
+ |diarized_enhanced_test_lat_enh2|83.7299|10.5073|9.9189|21.2735|9.1314|
28
+ |diarized_enhanced_test_sparse_2_0.2_decode_diar_enh|86.6907|11.2629|10.9169|28.3233|10.0191|
29
+ |diarized_enhanced_test_sparse_2_0.4_decode_diar_enh|85.3315|10.7804|10.3049|24.7652|9.4069|
30
+ |diarized_enhanced_test_sparse_2_0.6_decode_diar_enh|84.6040|10.5808|10.0880|23.3730|9.2251|
31
+ |diarized_enhanced_test_sparse_2_0.8_decode_diar_enh|84.1088|10.3668|9.8149|22.3321|8.9487|
32
+ |diarized_enhanced_test_sparse_2_0_decode_diar_enh|87.6317|11.7109|11.3008|54.0680|10.3340|
33
+ |diarized_enhanced_test_sparse_2_1_decode_diar_enh|84.2412|10.2397|9.6977|21.8898|8.8254|
34
+ |diarized_enhanced_test_wo_diar|83.7560|10.5096|9.9249|21.2740|9.1355|
35
+
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/config.yaml ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_diar_enh_convtasnet_concat_feats.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: chunk
6
+ output_dir: exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 2
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 54493
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 100
28
+ patience: 50
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss_enh
39
+ - min
40
+ keep_nbest_models: 1
41
+ nbest_averaging_interval: 0
42
+ grad_clip: 5.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 2
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: null
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ use_wandb: false
54
+ wandb_project: null
55
+ wandb_id: null
56
+ wandb_entity: null
57
+ wandb_name: null
58
+ wandb_model_log_interval: -1
59
+ detect_anomaly: false
60
+ pretrain_path: null
61
+ init_param: []
62
+ ignore_init_mismatch: false
63
+ freeze_param: []
64
+ num_iters_per_epoch: null
65
+ batch_size: 8
66
+ valid_batch_size: null
67
+ batch_bins: 1000000
68
+ valid_batch_bins: null
69
+ train_shape_file:
70
+ - exp/diar_enh_stats_8k/train/speech_shape
71
+ - exp/diar_enh_stats_8k/train/text_shape
72
+ - exp/diar_enh_stats_8k/train/speech_ref1_shape
73
+ - exp/diar_enh_stats_8k/train/speech_ref2_shape
74
+ - exp/diar_enh_stats_8k/train/noise_ref1_shape
75
+ valid_shape_file:
76
+ - exp/diar_enh_stats_8k/valid/speech_shape
77
+ - exp/diar_enh_stats_8k/valid/text_shape
78
+ - exp/diar_enh_stats_8k/valid/speech_ref1_shape
79
+ - exp/diar_enh_stats_8k/valid/speech_ref2_shape
80
+ - exp/diar_enh_stats_8k/valid/noise_ref1_shape
81
+ batch_type: folded
82
+ valid_batch_type: null
83
+ fold_length:
84
+ - 800
85
+ - 80000
86
+ - 80000
87
+ - 80000
88
+ - 80000
89
+ sort_in_batch: descending
90
+ sort_batch: descending
91
+ multiple_iterator: false
92
+ chunk_length: 24000
93
+ chunk_shift_ratio: 0.5
94
+ num_cache_chunks: 1024
95
+ train_data_path_and_name_and_type:
96
+ - - dump/raw/train/wav.scp
97
+ - speech
98
+ - sound
99
+ - - dump/raw/train/espnet_rttm
100
+ - text
101
+ - rttm
102
+ - - dump/raw/train/spk1.scp
103
+ - speech_ref1
104
+ - sound
105
+ - - dump/raw/train/spk2.scp
106
+ - speech_ref2
107
+ - sound
108
+ - - dump/raw/train/noise1.scp
109
+ - noise_ref1
110
+ - sound
111
+ valid_data_path_and_name_and_type:
112
+ - - dump/raw/dev/wav.scp
113
+ - speech
114
+ - sound
115
+ - - dump/raw/dev/espnet_rttm
116
+ - text
117
+ - rttm
118
+ - - dump/raw/dev/spk1.scp
119
+ - speech_ref1
120
+ - sound
121
+ - - dump/raw/dev/spk2.scp
122
+ - speech_ref2
123
+ - sound
124
+ - - dump/raw/dev/noise1.scp
125
+ - noise_ref1
126
+ - sound
127
+ allow_variable_data_keys: false
128
+ max_cache_size: 0.0
129
+ max_cache_fd: 32
130
+ valid_max_cache_size: null
131
+ optim: adam
132
+ optim_conf:
133
+ lr: 0.001
134
+ eps: 1.0e-07
135
+ weight_decay: 0
136
+ scheduler: reducelronplateau
137
+ scheduler_conf:
138
+ mode: min
139
+ factor: 0.5
140
+ patience: 50
141
+ token_list: null
142
+ src_token_list: null
143
+ init: xavier_uniform
144
+ input_size: null
145
+ ctc_conf:
146
+ dropout_rate: 0.0
147
+ ctc_type: builtin
148
+ reduce: true
149
+ ignore_nan_grad: null
150
+ zero_infinity: true
151
+ enh_criterions:
152
+ - name: si_snr
153
+ conf:
154
+ eps: 1.0e-07
155
+ wrapper: pit
156
+ wrapper_conf:
157
+ weight: 1.0
158
+ independent_perm: true
159
+ diar_num_spk: 2
160
+ diar_input_size: 128
161
+ enh_model_conf:
162
+ loss_type: si_snr
163
+ asr_model_conf:
164
+ ctc_weight: 0.5
165
+ interctc_weight: 0.0
166
+ ignore_id: -1
167
+ lsm_weight: 0.0
168
+ length_normalized_loss: false
169
+ report_cer: true
170
+ report_wer: true
171
+ sym_space: <space>
172
+ sym_blank: <blank>
173
+ extract_feats_in_collect_stats: true
174
+ st_model_conf:
175
+ stft_consistency: false
176
+ loss_type: mask_mse
177
+ mask_type: null
178
+ diar_model_conf:
179
+ diar_weight: 0.2
180
+ attractor_weight: 0.2
181
+ subtask_series:
182
+ - enh
183
+ - diar
184
+ model_conf:
185
+ calc_enh_loss: true
186
+ bypass_enh_prob: 0
187
+ use_preprocessor: true
188
+ token_type: bpe
189
+ bpemodel: null
190
+ src_token_type: bpe
191
+ src_bpemodel: null
192
+ non_linguistic_symbols: null
193
+ cleaner: null
194
+ g2p: null
195
+ enh_encoder: conv
196
+ enh_encoder_conf:
197
+ channel: 512
198
+ kernel_size: 16
199
+ stride: 8
200
+ enh_separator: tcn_nomask
201
+ enh_separator_conf:
202
+ layer: 8
203
+ stack: 3
204
+ bottleneck_dim: 128
205
+ hidden_dim: 512
206
+ kernel: 3
207
+ causal: false
208
+ norm_type: gLN
209
+ enh_decoder: conv
210
+ enh_decoder_conf:
211
+ channel: 512
212
+ kernel_size: 16
213
+ stride: 8
214
+ enh_mask_module: multi_mask
215
+ enh_mask_module_conf:
216
+ max_num_spk: 3
217
+ mask_nonlinear: relu
218
+ bottleneck_dim: 128
219
+ frontend: default
220
+ frontend_conf: {}
221
+ specaug: null
222
+ specaug_conf: {}
223
+ normalize: utterance_mvn
224
+ normalize_conf: {}
225
+ asr_preencoder: null
226
+ asr_preencoder_conf: {}
227
+ asr_encoder: rnn
228
+ asr_encoder_conf: {}
229
+ asr_postencoder: null
230
+ asr_postencoder_conf: {}
231
+ asr_decoder: rnn
232
+ asr_decoder_conf: {}
233
+ st_preencoder: null
234
+ st_preencoder_conf: {}
235
+ st_encoder: rnn
236
+ st_encoder_conf: {}
237
+ st_postencoder: null
238
+ st_postencoder_conf: {}
239
+ st_decoder: rnn
240
+ st_decoder_conf: {}
241
+ st_extra_asr_decoder: rnn
242
+ st_extra_asr_decoder_conf: {}
243
+ st_extra_mt_decoder: rnn
244
+ st_extra_mt_decoder_conf: {}
245
+ diar_frontend: default
246
+ diar_frontend_conf:
247
+ hop_length: 64
248
+ fs: 8000
249
+ diar_specaug: null
250
+ diar_specaug_conf: {}
251
+ diar_normalize: utterance_mvn
252
+ diar_normalize_conf: {}
253
+ diar_encoder: transformer
254
+ diar_encoder_conf:
255
+ input_layer: conv2d8
256
+ num_blocks: 4
257
+ linear_units: 512
258
+ dropout_rate: 0.1
259
+ output_size: 256
260
+ attention_heads: 4
261
+ attention_dropout_rate: 0.1
262
+ diar_decoder: linear
263
+ diar_decoder_conf: {}
264
+ label_aggregator: label_aggregator
265
+ label_aggregator_conf:
266
+ win_length: 256
267
+ hop_length: 64
268
+ diar_attractor: rnn
269
+ diar_attractor_conf:
270
+ unit: 256
271
+ layer: 1
272
+ dropout: 0.0
273
+ attractor_grad: true
274
+ required:
275
+ - output_dir
276
+ version: '202205'
277
+ distributed: true
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/acc.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/backward_time.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/cf.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/der.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/fa.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/forward_time.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/gpu_max_cached_mem_GB.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/iter_time.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/loss.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/loss_att.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/loss_enh.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/loss_pit.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/mi.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/optim0_lr0.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/optim_step_time.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/sad_fr.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/sad_mr.png ADDED
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202205'
2
+ files:
3
+ model_file: exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/57epoch.pth
4
+ python: "3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]"
5
+ timestamp: 1676313507.836365
6
+ torch: 1.8.1+cu102
7
+ yaml_files:
8
+ train_config: exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/config.yaml