soumi-maiti committed on
Commit
6a00062
1 Parent(s): a2007c5

Update model

Browse files
README.md CHANGED
@@ -35,12 +35,12 @@ cd egs2/librimix/enh_diar1_2
35
  <details><summary>expand</summary>
36
 
37
  ```
38
- config: conf/tuning/train_diar_enh_convtasnet_concat_feats.yaml
39
  print_config: false
40
  log_level: INFO
41
  dry_run: false
42
  iterator_type: chunk
43
- output_dir: exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw
44
  ngpu: 1
45
  seed: 0
46
  num_workers: 4
@@ -262,7 +262,7 @@ enh_mask_module_conf:
262
  max_num_spk: 3
263
  mask_nonlinear: relu
264
  bottleneck_dim: 128
265
- frontend: default
266
  frontend_conf: {}
267
  specaug: null
268
  specaug_conf: {}
@@ -288,10 +288,8 @@ st_extra_asr_decoder: rnn
288
  st_extra_asr_decoder_conf: {}
289
  st_extra_mt_decoder: rnn
290
  st_extra_mt_decoder_conf: {}
291
- diar_frontend: default
292
- diar_frontend_conf:
293
- hop_length: 64
294
- fs: 8000
295
  diar_specaug: null
296
  diar_specaug_conf: {}
297
  diar_normalize: utterance_mvn
 
35
  <details><summary>expand</summary>
36
 
37
  ```
38
+ config: conf/tuning/train_diar_enh_convtasnet_2.yaml
39
  print_config: false
40
  log_level: INFO
41
  dry_run: false
42
  iterator_type: chunk
43
+ output_dir: exp/diar_enh_train_diar_enh_convtasnet_2_raw
44
  ngpu: 1
45
  seed: 0
46
  num_workers: 4
 
262
  max_num_spk: 3
263
  mask_nonlinear: relu
264
  bottleneck_dim: 128
265
+ frontend: null
266
  frontend_conf: {}
267
  specaug: null
268
  specaug_conf: {}
 
288
  st_extra_asr_decoder_conf: {}
289
  st_extra_mt_decoder: rnn
290
  st_extra_mt_decoder_conf: {}
291
+ diar_frontend: null
292
+ diar_frontend_conf: {}
 
 
293
  diar_specaug: null
294
  diar_specaug_conf: {}
295
  diar_normalize: utterance_mvn
exp/diar_enh_train_diar_enh_convtasnet_2_raw/DIAR_RESULTS.md ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_diar_result.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Mon Mar 13 03:12:32 EDT 2023`
5
+ - python version: `3.9.12 (main, Apr 5 2022, 06:56:58) [GCC 7.5.0]`
6
+ - espnet version: `espnet 202205`
7
+ - pytorch version: `pytorch 1.8.1+cu111`
8
+ - Git hash: `f2778f798b76e102602078213a21b63aa592be70`
9
+ - Commit date: `Sun Jul 10 23:45:31 2022 +0000`
10
+
11
+ ## diar_enh_train_diar_enh_convtasnet_2_raw
12
+ ### DER
13
+ diarized_enhanced_test_decode_diar_enh_wo_diar
14
+ |threshold_median_collar|DER|
15
+ |---|---|
16
+ |result_th0.3_med11_collar0.0|6.19|
17
+ |result_th0.3_med1_collar0.0|6.23|
18
+ |result_th0.4_med11_collar0.0|6.05|
19
+ |result_th0.4_med1_collar0.0|6.13|
20
+ |result_th0.5_med11_collar0.0|6.20|
21
+ |result_th0.5_med1_collar0.0|6.30|
22
+ |result_th0.6_med11_collar0.0|6.66|
23
+ |result_th0.6_med1_collar0.0|6.80|
24
+ |result_th0.7_med11_collar0.0|7.62|
25
+ |result_th0.7_med1_collar0.0|7.77|
26
+ ## diar_enh_train_diar_enh_convtasnet_2_raw
27
+ ### DER
28
+ diarized_enhanced_test_decode_diar_enh
29
+ |threshold_median_collar|DER|
30
+ |---|---|
31
+ |result_th0.3_med11_collar0.0|6.20|
32
+ |result_th0.3_med1_collar0.0|6.24|
33
+ |result_th0.4_med11_collar0.0|6.04|
34
+ |result_th0.4_med1_collar0.0|6.11|
35
+ |result_th0.5_med11_collar0.0|6.16|
36
+ |result_th0.5_med1_collar0.0|6.26|
37
+ |result_th0.6_med11_collar0.0|6.59|
38
+ |result_th0.6_med1_collar0.0|6.72|
39
+ |result_th0.7_med11_collar0.0|7.46|
40
+ |result_th0.7_med1_collar0.0|7.60|
41
+ ## diar_enh_train_diar_enh_convtasnet_2_raw
42
+ ### DER
43
+ diarized_enhanced_test
44
+ |threshold_median_collar|DER|
45
+ |---|---|
46
+ |result_th0.3_med11_collar0.0|6.19|
47
+ |result_th0.3_med1_collar0.0|6.23|
48
+ |result_th0.4_med11_collar0.0|6.05|
49
+ |result_th0.4_med1_collar0.0|6.13|
50
+ |result_th0.5_med11_collar0.0|6.20|
51
+ |result_th0.5_med1_collar0.0|6.30|
52
+ |result_th0.6_med11_collar0.0|6.66|
53
+ |result_th0.6_med1_collar0.0|6.80|
54
+ |result_th0.7_med11_collar0.0|7.62|
55
+ |result_th0.7_med1_collar0.0|7.77|
exp/diar_enh_train_diar_enh_convtasnet_2_raw/ENH_RESULTS.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_enh_score.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Mon Mar 13 03:15:55 EDT 2023`
5
+ - python version: `3.9.12 (main, Apr 5 2022, 06:56:58) [GCC 7.5.0]`
6
+ - espnet version: `espnet 202205`
7
+ - pytorch version: `pytorch 1.8.1+cu111`
8
+ - Git hash: `f2778f798b76e102602078213a21b63aa592be70`
9
+ - Commit date: `Sun Jul 10 23:45:31 2022 +0000`
10
+
11
+
12
+ ## diar_enh_train_diar_enh_convtasnet_2_raw
13
+
14
+ config: conf/tuning/train_diar_enh_convtasnet_2.yaml
15
+
16
+ |dataset|STOI|SAR|SDR|SIR|SI_SNR|
17
+ |---|---|---|---|---|---|
18
+ |diarized_enhanced_test|70.1006|5.7082|3.8434|11.2483|2.7215|
19
+ |diarized_enhanced_test_decode_diar_enh|71.4425|6.3658|4.5344|11.8239|3.4320|
20
+ |diarized_enhanced_test_decode_diar_enh_wo_diar|71.5192|6.3575|4.4915|11.7014|3.4049|
21
+
exp/diar_enh_train_diar_enh_convtasnet_2_raw/config.yaml ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_diar_enh_convtasnet_2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: chunk
6
+ output_dir: exp/diar_enh_train_diar_enh_convtasnet_2_raw
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 100
28
+ patience: 50
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss_enh
39
+ - min
40
+ keep_nbest_models: 1
41
+ nbest_averaging_interval: 0
42
+ grad_clip: 5.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 4
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: null
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ use_wandb: false
54
+ wandb_project: null
55
+ wandb_id: null
56
+ wandb_entity: null
57
+ wandb_name: null
58
+ wandb_model_log_interval: -1
59
+ detect_anomaly: false
60
+ pretrain_path: null
61
+ init_param: []
62
+ ignore_init_mismatch: false
63
+ freeze_param: []
64
+ num_iters_per_epoch: null
65
+ batch_size: 4
66
+ valid_batch_size: null
67
+ batch_bins: 1000000
68
+ valid_batch_bins: null
69
+ train_shape_file:
70
+ - exp/diar_enh_stats_8k/train/speech_shape
71
+ - exp/diar_enh_stats_8k/train/text_shape
72
+ - exp/diar_enh_stats_8k/train/speech_ref1_shape
73
+ - exp/diar_enh_stats_8k/train/speech_ref2_shape
74
+ - exp/diar_enh_stats_8k/train/speech_ref3_shape
75
+ - exp/diar_enh_stats_8k/train/noise_ref1_shape
76
+ valid_shape_file:
77
+ - exp/diar_enh_stats_8k/valid/speech_shape
78
+ - exp/diar_enh_stats_8k/valid/text_shape
79
+ - exp/diar_enh_stats_8k/valid/speech_ref1_shape
80
+ - exp/diar_enh_stats_8k/valid/speech_ref2_shape
81
+ - exp/diar_enh_stats_8k/valid/speech_ref3_shape
82
+ - exp/diar_enh_stats_8k/valid/noise_ref1_shape
83
+ batch_type: folded
84
+ valid_batch_type: null
85
+ fold_length:
86
+ - 800
87
+ - 80000
88
+ - 80000
89
+ - 80000
90
+ - 80000
91
+ - 80000
92
+ sort_in_batch: descending
93
+ sort_batch: descending
94
+ multiple_iterator: false
95
+ chunk_length: 24000
96
+ chunk_shift_ratio: 0.5
97
+ num_cache_chunks: 1024
98
+ train_data_path_and_name_and_type:
99
+ - - dump/raw/train/wav.scp
100
+ - speech
101
+ - sound
102
+ - - dump/raw/train/espnet_rttm
103
+ - text
104
+ - rttm
105
+ - - dump/raw/train/spk1.scp
106
+ - speech_ref1
107
+ - sound
108
+ - - dump/raw/train/spk2.scp
109
+ - speech_ref2
110
+ - sound
111
+ - - dump/raw/train/spk3.scp
112
+ - speech_ref3
113
+ - sound
114
+ - - dump/raw/train/noise1.scp
115
+ - noise_ref1
116
+ - sound
117
+ valid_data_path_and_name_and_type:
118
+ - - dump/raw/dev/wav.scp
119
+ - speech
120
+ - sound
121
+ - - dump/raw/dev/espnet_rttm
122
+ - text
123
+ - rttm
124
+ - - dump/raw/dev/spk1.scp
125
+ - speech_ref1
126
+ - sound
127
+ - - dump/raw/dev/spk2.scp
128
+ - speech_ref2
129
+ - sound
130
+ - - dump/raw/dev/spk3.scp
131
+ - speech_ref3
132
+ - sound
133
+ - - dump/raw/dev/noise1.scp
134
+ - noise_ref1
135
+ - sound
136
+ allow_variable_data_keys: false
137
+ max_cache_size: 0.0
138
+ max_cache_fd: 32
139
+ valid_max_cache_size: null
140
+ optim: adam
141
+ optim_conf:
142
+ lr: 0.001
143
+ eps: 1.0e-07
144
+ weight_decay: 0
145
+ scheduler: reducelronplateau
146
+ scheduler_conf:
147
+ mode: min
148
+ factor: 0.5
149
+ patience: 1
150
+ token_list: null
151
+ src_token_list: null
152
+ init: xavier_uniform
153
+ input_size: null
154
+ ctc_conf:
155
+ dropout_rate: 0.0
156
+ ctc_type: builtin
157
+ reduce: true
158
+ ignore_nan_grad: null
159
+ zero_infinity: true
160
+ enh_criterions:
161
+ - name: si_snr
162
+ conf:
163
+ eps: 1.0e-07
164
+ wrapper: pit
165
+ wrapper_conf:
166
+ weight: 1.0
167
+ independent_perm: true
168
+ diar_num_spk: 3
169
+ diar_input_size: 128
170
+ enh_model_conf:
171
+ loss_type: si_snr
172
+ asr_model_conf:
173
+ ctc_weight: 0.5
174
+ interctc_weight: 0.0
175
+ ignore_id: -1
176
+ lsm_weight: 0.0
177
+ length_normalized_loss: false
178
+ report_cer: true
179
+ report_wer: true
180
+ sym_space: <space>
181
+ sym_blank: <blank>
182
+ extract_feats_in_collect_stats: true
183
+ st_model_conf:
184
+ stft_consistency: false
185
+ loss_type: mask_mse
186
+ mask_type: null
187
+ diar_model_conf:
188
+ diar_weight: 0.2
189
+ attractor_weight: 0.2
190
+ subtask_series:
191
+ - enh
192
+ - diar
193
+ model_conf:
194
+ calc_enh_loss: true
195
+ bypass_enh_prob: 0
196
+ use_preprocessor: true
197
+ token_type: bpe
198
+ bpemodel: null
199
+ src_token_type: bpe
200
+ src_bpemodel: null
201
+ non_linguistic_symbols: null
202
+ cleaner: null
203
+ g2p: null
204
+ enh_encoder: conv
205
+ enh_encoder_conf:
206
+ channel: 512
207
+ kernel_size: 16
208
+ stride: 8
209
+ enh_separator: tcn_nomask
210
+ enh_separator_conf:
211
+ layer: 8
212
+ stack: 3
213
+ bottleneck_dim: 128
214
+ hidden_dim: 512
215
+ kernel: 3
216
+ causal: false
217
+ norm_type: gLN
218
+ enh_decoder: conv
219
+ enh_decoder_conf:
220
+ channel: 512
221
+ kernel_size: 16
222
+ stride: 8
223
+ enh_mask_module: multi_mask
224
+ enh_mask_module_conf:
225
+ max_num_spk: 3
226
+ mask_nonlinear: relu
227
+ bottleneck_dim: 128
228
+ frontend: null
229
+ frontend_conf: {}
230
+ specaug: null
231
+ specaug_conf: {}
232
+ normalize: utterance_mvn
233
+ normalize_conf: {}
234
+ asr_preencoder: null
235
+ asr_preencoder_conf: {}
236
+ asr_encoder: rnn
237
+ asr_encoder_conf: {}
238
+ asr_postencoder: null
239
+ asr_postencoder_conf: {}
240
+ asr_decoder: rnn
241
+ asr_decoder_conf: {}
242
+ st_preencoder: null
243
+ st_preencoder_conf: {}
244
+ st_encoder: rnn
245
+ st_encoder_conf: {}
246
+ st_postencoder: null
247
+ st_postencoder_conf: {}
248
+ st_decoder: rnn
249
+ st_decoder_conf: {}
250
+ st_extra_asr_decoder: rnn
251
+ st_extra_asr_decoder_conf: {}
252
+ st_extra_mt_decoder: rnn
253
+ st_extra_mt_decoder_conf: {}
254
+ diar_frontend: null
255
+ diar_frontend_conf: {}
256
+ diar_specaug: null
257
+ diar_specaug_conf: {}
258
+ diar_normalize: utterance_mvn
259
+ diar_normalize_conf: {}
260
+ diar_encoder: transformer
261
+ diar_encoder_conf:
262
+ input_layer: conv2d8
263
+ num_blocks: 4
264
+ linear_units: 512
265
+ dropout_rate: 0.1
266
+ output_size: 256
267
+ attention_heads: 4
268
+ attention_dropout_rate: 0.1
269
+ diar_decoder: linear
270
+ diar_decoder_conf: {}
271
+ label_aggregator: label_aggregator
272
+ label_aggregator_conf:
273
+ win_length: 256
274
+ hop_length: 64
275
+ diar_attractor: rnn
276
+ diar_attractor_conf:
277
+ unit: 256
278
+ layer: 1
279
+ dropout: 0.0
280
+ attractor_grad: true
281
+ required:
282
+ - output_dir
283
+ version: '202205'
284
+ distributed: false
exp/diar_enh_train_diar_enh_convtasnet_2_raw/valid.loss_enh.best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64fe8f116078f977e694d571382cdcd28a7352f5dd3a829e06667190a3d227b1
3
+ size 36279117
meta.yaml CHANGED
@@ -1,8 +1,8 @@
1
  espnet: '202205'
2
  files:
3
- model_file: exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/valid.loss_enh.best.pth
4
  python: "3.9.16 | packaged by conda-forge | (main, Feb 1 2023, 21:39:03) \n[GCC 11.3.0]"
5
- timestamp: 1683210501.537852
6
  torch: 1.8.1+cu102
7
  yaml_files:
8
- train_config: exp/diar_enh_train_diar_enh_convtasnet_concat_feats_raw/config.yaml
 
1
  espnet: '202205'
2
  files:
3
+ model_file: exp/diar_enh_train_diar_enh_convtasnet_2_raw/valid.loss_enh.best.pth
4
  python: "3.9.16 | packaged by conda-forge | (main, Feb 1 2023, 21:39:03) \n[GCC 11.3.0]"
5
+ timestamp: 1683210536.396694
6
  torch: 1.8.1+cu102
7
  yaml_files:
8
+ train_config: exp/diar_enh_train_diar_enh_convtasnet_2_raw/config.yaml