soumi-maiti committed on
Commit
9ae5af7
1 Parent(s): eac7f0a

Update model

Browse files
README.md ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - diarization
6
+ language: noinfo
7
+ datasets:
8
+ - librimix
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 DIAR model
13
+
14
+ ### `soumi-maiti/libri3mix_eend_ss`
15
+
16
+ This model was trained by soumimaiti using the librimix recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout d837c97c88f13ffe655a30bcff93d814f212b225
26
+ pip install -e .
27
+ cd egs2/librimix/enh_diar1_2
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model soumi-maiti/libri3mix_eend_ss
29
+ ```
30
+
31
+
32
+
33
+ ## DIAR config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/tuning/train_diar_enh_convtasnet_concat_feats.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ dry_run: false
42
+ iterator_type: chunk
43
+ output_dir: exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw
44
+ ngpu: 1
45
+ seed: 0
46
+ num_workers: 4
47
+ num_att_plot: 3
48
+ dist_backend: nccl
49
+ dist_init_method: env://
50
+ dist_world_size: null
51
+ dist_rank: null
52
+ local_rank: 0
53
+ dist_master_addr: null
54
+ dist_master_port: null
55
+ dist_launcher: null
56
+ multiprocessing_distributed: false
57
+ unused_parameters: false
58
+ sharded_ddp: false
59
+ cudnn_enabled: true
60
+ cudnn_benchmark: false
61
+ cudnn_deterministic: true
62
+ collect_stats: false
63
+ write_collected_feats: false
64
+ max_epoch: 100
65
+ patience: 50
66
+ val_scheduler_criterion:
67
+ - valid
68
+ - loss
69
+ early_stopping_criterion:
70
+ - valid
71
+ - loss
72
+ - min
73
+ best_model_criterion:
74
+ - - valid
75
+ - loss_enh
76
+ - min
77
+ keep_nbest_models: 1
78
+ nbest_averaging_interval: 0
79
+ grad_clip: 5.0
80
+ grad_clip_type: 2.0
81
+ grad_noise: false
82
+ accum_grad: 4
83
+ no_forward_run: false
84
+ resume: true
85
+ train_dtype: float32
86
+ use_amp: false
87
+ log_interval: null
88
+ use_matplotlib: true
89
+ use_tensorboard: true
90
+ use_wandb: false
91
+ wandb_project: null
92
+ wandb_id: null
93
+ wandb_entity: null
94
+ wandb_name: null
95
+ wandb_model_log_interval: -1
96
+ detect_anomaly: false
97
+ pretrain_path: null
98
+ init_param: []
99
+ ignore_init_mismatch: false
100
+ freeze_param: []
101
+ num_iters_per_epoch: null
102
+ batch_size: 4
103
+ valid_batch_size: null
104
+ batch_bins: 1000000
105
+ valid_batch_bins: null
106
+ train_shape_file:
107
+ - exp/diar_enh_stats_8k/train/speech_shape
108
+ - exp/diar_enh_stats_8k/train/text_shape
109
+ - exp/diar_enh_stats_8k/train/speech_ref1_shape
110
+ - exp/diar_enh_stats_8k/train/speech_ref2_shape
111
+ - exp/diar_enh_stats_8k/train/speech_ref3_shape
112
+ - exp/diar_enh_stats_8k/train/noise_ref1_shape
113
+ valid_shape_file:
114
+ - exp/diar_enh_stats_8k/valid/speech_shape
115
+ - exp/diar_enh_stats_8k/valid/text_shape
116
+ - exp/diar_enh_stats_8k/valid/speech_ref1_shape
117
+ - exp/diar_enh_stats_8k/valid/speech_ref2_shape
118
+ - exp/diar_enh_stats_8k/valid/speech_ref3_shape
119
+ - exp/diar_enh_stats_8k/valid/noise_ref1_shape
120
+ batch_type: folded
121
+ valid_batch_type: null
122
+ fold_length:
123
+ - 800
124
+ - 80000
125
+ - 80000
126
+ - 80000
127
+ - 80000
128
+ - 80000
129
+ sort_in_batch: descending
130
+ sort_batch: descending
131
+ multiple_iterator: false
132
+ chunk_length: 24000
133
+ chunk_shift_ratio: 0.5
134
+ num_cache_chunks: 1024
135
+ train_data_path_and_name_and_type:
136
+ - - dump/raw/train/wav.scp
137
+ - speech
138
+ - sound
139
+ - - dump/raw/train/espnet_rttm
140
+ - text
141
+ - rttm
142
+ - - dump/raw/train/spk1.scp
143
+ - speech_ref1
144
+ - sound
145
+ - - dump/raw/train/spk2.scp
146
+ - speech_ref2
147
+ - sound
148
+ - - dump/raw/train/spk3.scp
149
+ - speech_ref3
150
+ - sound
151
+ - - dump/raw/train/noise1.scp
152
+ - noise_ref1
153
+ - sound
154
+ valid_data_path_and_name_and_type:
155
+ - - dump/raw/dev/wav.scp
156
+ - speech
157
+ - sound
158
+ - - dump/raw/dev/espnet_rttm
159
+ - text
160
+ - rttm
161
+ - - dump/raw/dev/spk1.scp
162
+ - speech_ref1
163
+ - sound
164
+ - - dump/raw/dev/spk2.scp
165
+ - speech_ref2
166
+ - sound
167
+ - - dump/raw/dev/spk3.scp
168
+ - speech_ref3
169
+ - sound
170
+ - - dump/raw/dev/noise1.scp
171
+ - noise_ref1
172
+ - sound
173
+ allow_variable_data_keys: false
174
+ max_cache_size: 0.0
175
+ max_cache_fd: 32
176
+ valid_max_cache_size: null
177
+ optim: adam
178
+ optim_conf:
179
+ lr: 0.001
180
+ eps: 1.0e-07
181
+ weight_decay: 0
182
+ scheduler: reducelronplateau
183
+ scheduler_conf:
184
+ mode: min
185
+ factor: 0.5
186
+ patience: 1
187
+ token_list: null
188
+ src_token_list: null
189
+ init: xavier_uniform
190
+ input_size: null
191
+ ctc_conf:
192
+ dropout_rate: 0.0
193
+ ctc_type: builtin
194
+ reduce: true
195
+ ignore_nan_grad: null
196
+ zero_infinity: true
197
+ enh_criterions:
198
+ - name: si_snr
199
+ conf:
200
+ eps: 1.0e-07
201
+ wrapper: pit
202
+ wrapper_conf:
203
+ weight: 1.0
204
+ independent_perm: true
205
+ diar_num_spk: 3
206
+ diar_input_size: 128
207
+ enh_model_conf:
208
+ loss_type: si_snr
209
+ asr_model_conf:
210
+ ctc_weight: 0.5
211
+ interctc_weight: 0.0
212
+ ignore_id: -1
213
+ lsm_weight: 0.0
214
+ length_normalized_loss: false
215
+ report_cer: true
216
+ report_wer: true
217
+ sym_space: <space>
218
+ sym_blank: <blank>
219
+ extract_feats_in_collect_stats: true
220
+ st_model_conf:
221
+ stft_consistency: false
222
+ loss_type: mask_mse
223
+ mask_type: null
224
+ diar_model_conf:
225
+ diar_weight: 0.2
226
+ attractor_weight: 0.2
227
+ subtask_series:
228
+ - enh
229
+ - diar
230
+ model_conf:
231
+ calc_enh_loss: true
232
+ bypass_enh_prob: 0
233
+ use_preprocessor: true
234
+ token_type: bpe
235
+ bpemodel: null
236
+ src_token_type: bpe
237
+ src_bpemodel: null
238
+ non_linguistic_symbols: null
239
+ cleaner: null
240
+ g2p: null
241
+ enh_encoder: conv
242
+ enh_encoder_conf:
243
+ channel: 512
244
+ kernel_size: 16
245
+ stride: 8
246
+ enh_separator: tcn_nomask
247
+ enh_separator_conf:
248
+ layer: 8
249
+ stack: 3
250
+ bottleneck_dim: 128
251
+ hidden_dim: 512
252
+ kernel: 3
253
+ causal: false
254
+ norm_type: gLN
255
+ enh_decoder: conv
256
+ enh_decoder_conf:
257
+ channel: 512
258
+ kernel_size: 16
259
+ stride: 8
260
+ enh_mask_module: multi_mask
261
+ enh_mask_module_conf:
262
+ max_num_spk: 3
263
+ mask_nonlinear: relu
264
+ bottleneck_dim: 128
265
+ frontend: default
266
+ frontend_conf: {}
267
+ specaug: null
268
+ specaug_conf: {}
269
+ normalize: utterance_mvn
270
+ normalize_conf: {}
271
+ asr_preencoder: null
272
+ asr_preencoder_conf: {}
273
+ asr_encoder: rnn
274
+ asr_encoder_conf: {}
275
+ asr_postencoder: null
276
+ asr_postencoder_conf: {}
277
+ asr_decoder: rnn
278
+ asr_decoder_conf: {}
279
+ st_preencoder: null
280
+ st_preencoder_conf: {}
281
+ st_encoder: rnn
282
+ st_encoder_conf: {}
283
+ st_postencoder: null
284
+ st_postencoder_conf: {}
285
+ st_decoder: rnn
286
+ st_decoder_conf: {}
287
+ st_extra_asr_decoder: rnn
288
+ st_extra_asr_decoder_conf: {}
289
+ st_extra_mt_decoder: rnn
290
+ st_extra_mt_decoder_conf: {}
291
+ diar_frontend: default
292
+ diar_frontend_conf:
293
+ hop_length: 64
294
+ fs: 8000
295
+ diar_specaug: null
296
+ diar_specaug_conf: {}
297
+ diar_normalize: utterance_mvn
298
+ diar_normalize_conf: {}
299
+ diar_encoder: transformer
300
+ diar_encoder_conf:
301
+ input_layer: conv2d8
302
+ num_blocks: 4
303
+ linear_units: 512
304
+ dropout_rate: 0.1
305
+ output_size: 256
306
+ attention_heads: 4
307
+ attention_dropout_rate: 0.1
308
+ diar_decoder: linear
309
+ diar_decoder_conf: {}
310
+ label_aggregator: label_aggregator
311
+ label_aggregator_conf:
312
+ win_length: 256
313
+ hop_length: 64
314
+ diar_attractor: rnn
315
+ diar_attractor_conf:
316
+ unit: 256
317
+ layer: 1
318
+ dropout: 0.0
319
+ attractor_grad: true
320
+ required:
321
+ - output_dir
322
+ version: '202205'
323
+ distributed: false
324
+ ```
325
+
326
+ </details>
327
+
328
+
329
+
330
+ ### Citing ESPnet
331
+
332
+ ```bibtex
333
+ @inproceedings{watanabe2018espnet,
334
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
335
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
336
+ year={2018},
337
+ booktitle={Proceedings of Interspeech},
338
+ pages={2207--2211},
339
+ doi={10.21437/Interspeech.2018-1456},
340
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
341
+ }
342
+
343
+
344
+
345
+
346
+ ```
347
+
348
+ or arXiv:
349
+
350
+ ```bibtex
351
+ @misc{watanabe2018espnet,
352
+ title={ESPnet: End-to-End Speech Processing Toolkit},
353
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
354
+ year={2018},
355
+ eprint={1804.00015},
356
+ archivePrefix={arXiv},
357
+ primaryClass={cs.CL}
358
+ }
359
+ ```
exp/diar_enh_stats_8k/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bb5cfdb14dc06555acb20c7cf7aa9fe46a4ba08a4e108d9d4c8c594f69bcf20
3
+ size 778
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/DIAR_RESULTS.md ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_diar_result.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Thu Mar 9 16:50:29 EST 2023`
5
+ - python version: `3.9.12 (main, Apr 5 2022, 06:56:58) [GCC 7.5.0]`
6
+ - espnet version: `espnet 202205`
7
+ - pytorch version: `pytorch 1.8.1+cu111`
8
+ - Git hash: `f2778f798b76e102602078213a21b63aa592be70`
9
+ - Commit date: `Sun Jul 10 23:45:31 2022 +0000`
10
+
11
+ ## diar_enh_train_diar_enh_convtasnet_concat_feats_raw
12
+ ### DER
13
+ diarized_enhanced_test_decode_diar_enh_wo_diar
14
+ |threshold_median_collar|DER|
15
+ |---|---|
16
+ |result_th0.3_med11_collar0.0|6.14|
17
+ |result_th0.3_med1_collar0.0|6.18|
18
+ |result_th0.4_med11_collar0.0|5.91|
19
+ |result_th0.4_med1_collar0.0|5.97|
20
+ |result_th0.5_med11_collar0.0|5.81|
21
+ |result_th0.5_med1_collar0.0|5.90|
22
+ |result_th0.6_med11_collar0.0|5.89|
23
+ |result_th0.6_med1_collar0.0|6.01|
24
+ |result_th0.7_med11_collar0.0|6.24|
25
+ |result_th0.7_med1_collar0.0|6.39|
26
+ ## diar_enh_train_diar_enh_convtasnet_concat_feats_raw
27
+ ### DER
28
+ diarized_enhanced_test
29
+ |threshold_median_collar|DER|
30
+ |---|---|
31
+ |result_th0.3_med11_collar0.0|6.14|
32
+ |result_th0.3_med1_collar0.0|6.18|
33
+ |result_th0.4_med11_collar0.0|5.91|
34
+ |result_th0.4_med1_collar0.0|5.97|
35
+ |result_th0.5_med11_collar0.0|5.81|
36
+ |result_th0.5_med1_collar0.0|5.90|
37
+ |result_th0.6_med11_collar0.0|5.89|
38
+ |result_th0.6_med1_collar0.0|6.01|
39
+ |result_th0.7_med11_collar0.0|6.24|
40
+ |result_th0.7_med1_collar0.0|6.39|
41
+ ## diar_enh_train_diar_enh_convtasnet_concat_feats_raw
42
+ ### DER
43
+ diarized_enhanced_test_decode_diar_enh
44
+ |threshold_median_collar|DER|
45
+ |---|---|
46
+ |result_th0.3_med11_collar0.0|6.14|
47
+ |result_th0.3_med1_collar0.0|6.18|
48
+ |result_th0.4_med11_collar0.0|5.91|
49
+ |result_th0.4_med1_collar0.0|5.97|
50
+ |result_th0.5_med11_collar0.0|5.81|
51
+ |result_th0.5_med1_collar0.0|5.90|
52
+ |result_th0.6_med11_collar0.0|5.89|
53
+ |result_th0.6_med1_collar0.0|6.01|
54
+ |result_th0.7_med11_collar0.0|6.24|
55
+ |result_th0.7_med1_collar0.0|6.39|
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/ENH_RESULTS.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_enh_score.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Thu Mar 9 22:20:58 EST 2023`
5
+ - python version: `3.9.12 (main, Apr 5 2022, 06:56:58) [GCC 7.5.0]`
6
+ - espnet version: `espnet 202205`
7
+ - pytorch version: `pytorch 1.8.1+cu111`
8
+ - Git hash: `f2778f798b76e102602078213a21b63aa592be70`
9
+ - Commit date: `Sun Jul 10 23:45:31 2022 +0000`
10
+
11
+
12
+ ## diar_enh_train_diar_enh_convtasnet_concat_feats_raw
13
+
14
+ config: conf/tuning/train_diar_enh_convtasnet_concat_feats.yaml
15
+
16
+ |dataset|STOI|SAR|SDR|SIR|SI_SNR|
17
+ |---|---|---|---|---|---|
18
+ |diarized_enhanced_test|72.5039|6.7901|5.0607|12.4202|4.1435|
19
+ |diarized_enhanced_test_decode_diar_enh|72.5039|6.7901|5.0607|12.4202|4.1435|
20
+ |diarized_enhanced_test_decode_diar_enh_wo_diar|72.5187|6.7670|5.0186|12.3369|4.1165|
21
+
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/config.yaml ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_diar_enh_convtasnet_concat_feats.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: chunk
6
+ output_dir: exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 100
28
+ patience: 50
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss_enh
39
+ - min
40
+ keep_nbest_models: 1
41
+ nbest_averaging_interval: 0
42
+ grad_clip: 5.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 4
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: null
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ use_wandb: false
54
+ wandb_project: null
55
+ wandb_id: null
56
+ wandb_entity: null
57
+ wandb_name: null
58
+ wandb_model_log_interval: -1
59
+ detect_anomaly: false
60
+ pretrain_path: null
61
+ init_param: []
62
+ ignore_init_mismatch: false
63
+ freeze_param: []
64
+ num_iters_per_epoch: null
65
+ batch_size: 4
66
+ valid_batch_size: null
67
+ batch_bins: 1000000
68
+ valid_batch_bins: null
69
+ train_shape_file:
70
+ - exp/diar_enh_stats_8k/train/speech_shape
71
+ - exp/diar_enh_stats_8k/train/text_shape
72
+ - exp/diar_enh_stats_8k/train/speech_ref1_shape
73
+ - exp/diar_enh_stats_8k/train/speech_ref2_shape
74
+ - exp/diar_enh_stats_8k/train/speech_ref3_shape
75
+ - exp/diar_enh_stats_8k/train/noise_ref1_shape
76
+ valid_shape_file:
77
+ - exp/diar_enh_stats_8k/valid/speech_shape
78
+ - exp/diar_enh_stats_8k/valid/text_shape
79
+ - exp/diar_enh_stats_8k/valid/speech_ref1_shape
80
+ - exp/diar_enh_stats_8k/valid/speech_ref2_shape
81
+ - exp/diar_enh_stats_8k/valid/speech_ref3_shape
82
+ - exp/diar_enh_stats_8k/valid/noise_ref1_shape
83
+ batch_type: folded
84
+ valid_batch_type: null
85
+ fold_length:
86
+ - 800
87
+ - 80000
88
+ - 80000
89
+ - 80000
90
+ - 80000
91
+ - 80000
92
+ sort_in_batch: descending
93
+ sort_batch: descending
94
+ multiple_iterator: false
95
+ chunk_length: 24000
96
+ chunk_shift_ratio: 0.5
97
+ num_cache_chunks: 1024
98
+ train_data_path_and_name_and_type:
99
+ - - dump/raw/train/wav.scp
100
+ - speech
101
+ - sound
102
+ - - dump/raw/train/espnet_rttm
103
+ - text
104
+ - rttm
105
+ - - dump/raw/train/spk1.scp
106
+ - speech_ref1
107
+ - sound
108
+ - - dump/raw/train/spk2.scp
109
+ - speech_ref2
110
+ - sound
111
+ - - dump/raw/train/spk3.scp
112
+ - speech_ref3
113
+ - sound
114
+ - - dump/raw/train/noise1.scp
115
+ - noise_ref1
116
+ - sound
117
+ valid_data_path_and_name_and_type:
118
+ - - dump/raw/dev/wav.scp
119
+ - speech
120
+ - sound
121
+ - - dump/raw/dev/espnet_rttm
122
+ - text
123
+ - rttm
124
+ - - dump/raw/dev/spk1.scp
125
+ - speech_ref1
126
+ - sound
127
+ - - dump/raw/dev/spk2.scp
128
+ - speech_ref2
129
+ - sound
130
+ - - dump/raw/dev/spk3.scp
131
+ - speech_ref3
132
+ - sound
133
+ - - dump/raw/dev/noise1.scp
134
+ - noise_ref1
135
+ - sound
136
+ allow_variable_data_keys: false
137
+ max_cache_size: 0.0
138
+ max_cache_fd: 32
139
+ valid_max_cache_size: null
140
+ optim: adam
141
+ optim_conf:
142
+ lr: 0.001
143
+ eps: 1.0e-07
144
+ weight_decay: 0
145
+ scheduler: reducelronplateau
146
+ scheduler_conf:
147
+ mode: min
148
+ factor: 0.5
149
+ patience: 1
150
+ token_list: null
151
+ src_token_list: null
152
+ init: xavier_uniform
153
+ input_size: null
154
+ ctc_conf:
155
+ dropout_rate: 0.0
156
+ ctc_type: builtin
157
+ reduce: true
158
+ ignore_nan_grad: null
159
+ zero_infinity: true
160
+ enh_criterions:
161
+ - name: si_snr
162
+ conf:
163
+ eps: 1.0e-07
164
+ wrapper: pit
165
+ wrapper_conf:
166
+ weight: 1.0
167
+ independent_perm: true
168
+ diar_num_spk: 3
169
+ diar_input_size: 128
170
+ enh_model_conf:
171
+ loss_type: si_snr
172
+ asr_model_conf:
173
+ ctc_weight: 0.5
174
+ interctc_weight: 0.0
175
+ ignore_id: -1
176
+ lsm_weight: 0.0
177
+ length_normalized_loss: false
178
+ report_cer: true
179
+ report_wer: true
180
+ sym_space: <space>
181
+ sym_blank: <blank>
182
+ extract_feats_in_collect_stats: true
183
+ st_model_conf:
184
+ stft_consistency: false
185
+ loss_type: mask_mse
186
+ mask_type: null
187
+ diar_model_conf:
188
+ diar_weight: 0.2
189
+ attractor_weight: 0.2
190
+ subtask_series:
191
+ - enh
192
+ - diar
193
+ model_conf:
194
+ calc_enh_loss: true
195
+ bypass_enh_prob: 0
196
+ use_preprocessor: true
197
+ token_type: bpe
198
+ bpemodel: null
199
+ src_token_type: bpe
200
+ src_bpemodel: null
201
+ non_linguistic_symbols: null
202
+ cleaner: null
203
+ g2p: null
204
+ enh_encoder: conv
205
+ enh_encoder_conf:
206
+ channel: 512
207
+ kernel_size: 16
208
+ stride: 8
209
+ enh_separator: tcn_nomask
210
+ enh_separator_conf:
211
+ layer: 8
212
+ stack: 3
213
+ bottleneck_dim: 128
214
+ hidden_dim: 512
215
+ kernel: 3
216
+ causal: false
217
+ norm_type: gLN
218
+ enh_decoder: conv
219
+ enh_decoder_conf:
220
+ channel: 512
221
+ kernel_size: 16
222
+ stride: 8
223
+ enh_mask_module: multi_mask
224
+ enh_mask_module_conf:
225
+ max_num_spk: 3
226
+ mask_nonlinear: relu
227
+ bottleneck_dim: 128
228
+ frontend: default
229
+ frontend_conf: {}
230
+ specaug: null
231
+ specaug_conf: {}
232
+ normalize: utterance_mvn
233
+ normalize_conf: {}
234
+ asr_preencoder: null
235
+ asr_preencoder_conf: {}
236
+ asr_encoder: rnn
237
+ asr_encoder_conf: {}
238
+ asr_postencoder: null
239
+ asr_postencoder_conf: {}
240
+ asr_decoder: rnn
241
+ asr_decoder_conf: {}
242
+ st_preencoder: null
243
+ st_preencoder_conf: {}
244
+ st_encoder: rnn
245
+ st_encoder_conf: {}
246
+ st_postencoder: null
247
+ st_postencoder_conf: {}
248
+ st_decoder: rnn
249
+ st_decoder_conf: {}
250
+ st_extra_asr_decoder: rnn
251
+ st_extra_asr_decoder_conf: {}
252
+ st_extra_mt_decoder: rnn
253
+ st_extra_mt_decoder_conf: {}
254
+ diar_frontend: default
255
+ diar_frontend_conf:
256
+ hop_length: 64
257
+ fs: 8000
258
+ diar_specaug: null
259
+ diar_specaug_conf: {}
260
+ diar_normalize: utterance_mvn
261
+ diar_normalize_conf: {}
262
+ diar_encoder: transformer
263
+ diar_encoder_conf:
264
+ input_layer: conv2d8
265
+ num_blocks: 4
266
+ linear_units: 512
267
+ dropout_rate: 0.1
268
+ output_size: 256
269
+ attention_heads: 4
270
+ attention_dropout_rate: 0.1
271
+ diar_decoder: linear
272
+ diar_decoder_conf: {}
273
+ label_aggregator: label_aggregator
274
+ label_aggregator_conf:
275
+ win_length: 256
276
+ hop_length: 64
277
+ diar_attractor: rnn
278
+ diar_attractor_conf:
279
+ unit: 256
280
+ layer: 1
281
+ dropout: 0.0
282
+ attractor_grad: true
283
+ required:
284
+ - output_dir
285
+ version: '202205'
286
+ distributed: false
exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/valid.loss_enh.best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a699455f39d3e12299bcf466c363f8f68574f5711b270d253bffa1e669f53155
3
+ size 38983318
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202205'
2
+ files:
3
+ model_file: exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/valid.loss_enh.best.pth
4
+ python: "3.9.16 | packaged by conda-forge | (main, Feb 1 2023, 21:39:03) \n[GCC 11.3.0]"
5
+ timestamp: 1683210313.020129
6
+ torch: 1.8.1+cu102
7
+ yaml_files:
8
+ train_config: exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/config.yaml