popcornell committed
Commit 511d314
1 Parent(s): f73b8ee

Update model
README.md CHANGED

---
tags:
- espnet
- audio
- audio-to-audio
language: noinfo
datasets:
- clarity
license: cc-by-4.0
---

## ESPnet2 ENH model

### `popcornell/clarity21_train_enh_beamformer_mvdr`

This model was trained by popcornell using the clarity recipe in [espnet](https://github.com/espnet/espnet/).

### Demo: How to use in ESPnet2

```bash
cd espnet
pip install -e .
cd egs2/clarity/enh_2021
./run.sh --skip_data_prep false --skip_train true --download_model popcornell/clarity21_train_enh_beamformer_mvdr
```
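
The recipe above runs the full ESPnet pipeline with the downloaded model. For standalone inference, the packed model can also be loaded directly from Python via `espnet_model_zoo`. The sketch below is hedged: it assumes the usual `train_config`/`model_file` keys returned by `download_and_unpack` and the `SeparateSpeech` interface from `espnet2.bin.enh_inference`, whose argument names can differ between ESPnet versions; `mixture.wav` is a placeholder for a Clarity-style multi-channel mixture.

```python
# Minimal standalone inference sketch (assumes espnet, espnet_model_zoo and
# soundfile are installed). Argument names follow espnet2.bin.enh_inference but
# may vary across ESPnet versions; "mixture.wav" is a hypothetical input file.
import soundfile as sf
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.enh_inference import SeparateSpeech

d = ModelDownloader()
# Downloads the packed model and returns local paths (train_config, model_file).
model_files = d.download_and_unpack("popcornell/clarity21_train_enh_beamformer_mvdr")

separate_speech = SeparateSpeech(
    **model_files,
    normalize_output_wav=True,
    device="cpu",
)

# Multi-channel mixture read as (num_samples, num_channels).
mix, fs = sf.read("mixture.wav")
# Add a batch dimension; the call returns a list with one waveform per estimated source.
enhanced = separate_speech(mix[None, ...], fs=fs)
sf.write("enhanced.wav", enhanced[0].squeeze(), fs)
```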

<!-- Generated by ./scripts/utils/show_enh_score.sh -->
# RESULTS
## Environments
- date: `Tue Apr 12 20:54:54 UTC 2022`
- python version: `3.9.7 (default, Sep 16 2021, 13:09:58) [GCC 7.5.0]`
- espnet version: `espnet 0.10.5a1`
- pytorch version: `pytorch 1.10.1`
- Git hash: `46eaa5eb6bea11cc33927392dca7888921491d8c`
- Commit date: `Sat Mar 26 22:35:44 2022 +0100`

## ..

config: conf/tuning/train_enh_beamformer_mvdr.yaml

|dataset|STOI|SAR (dB)|SDR (dB)|SIR (dB)|
|---|---|---|---|---|
|enhanced_dev|0.96|13.02|13.02|0.00|

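The table above is produced by `./scripts/utils/show_enh_score.sh`. For a rough sense of what these metrics measure, the hedged single-utterance sketch below computes STOI and BSS-eval SDR/SIR/SAR with the third-party `pystoi` and `mir_eval` packages; it is not the scoring script used for the table, and the file names (`speech_ref1.wav`, `noise_ref1.wav`, `enhanced.wav`) are placeholders for mono 16 kHz signals.

```python
# Hedged illustration of the reported metrics using pystoi and mir_eval; this is
# NOT show_enh_score.sh. File names are placeholders for mono 16 kHz signals.
import numpy as np
import soundfile as sf
from mir_eval.separation import bss_eval_sources
from pystoi import stoi

clean, fs = sf.read("speech_ref1.wav")   # reference target speech
noise, _ = sf.read("noise_ref1.wav")     # reference noise
enh, _ = sf.read("enhanced.wav")         # model output

n = min(len(clean), len(noise), len(enh))
clean, noise, enh = clean[:n], noise[:n], enh[:n]

# STOI compares the enhanced signal against the clean reference (roughly 0..1).
print("STOI:", stoi(clean, enh, fs, extended=False))

# BSS-eval with speech and noise as the two references; row 0 corresponds to the
# target-speech estimate. The residual is a crude stand-in for a noise estimate.
refs = np.stack([clean, noise])
ests = np.stack([enh, (clean + noise) - enh])
sdr, sir, sar, _ = bss_eval_sources(refs, ests)
print(f"SDR {sdr[0]:.2f} dB  SIR {sir[0]:.2f} dB  SAR {sar[0]:.2f} dB")
```
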
## ENH config

<details><summary>expand</summary>

```yaml
config: conf/tuning/train_enh_beamformer_mvdr.yaml
print_config: false
log_level: INFO
dry_run: false
iterator_type: sequence
output_dir: exp/enh_train_enh_beamformer_mvdr_raw
ngpu: 1
seed: 0
num_workers: 4
num_att_plot: 3
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: true
collect_stats: false
write_collected_feats: false
max_epoch: 8
patience: 4
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
- - valid
  - si_snr
  - max
- - valid
  - loss
  - min
keep_nbest_models: 1
nbest_averaging_interval: 0
grad_clip: 5.0
grad_clip_type: 2.0
grad_noise: false
accum_grad: 1
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: null
use_matplotlib: true
use_tensorboard: true
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param: []
num_iters_per_epoch: null
batch_size: 1
valid_batch_size: null
batch_bins: 1000000
valid_batch_bins: null
train_shape_file:
- exp/enh_stats_16000/train/speech_mix_shape
- exp/enh_stats_16000/train/speech_ref1_shape
- exp/enh_stats_16000/train/noise_ref1_shape
valid_shape_file:
- exp/enh_stats_16000/valid/speech_mix_shape
- exp/enh_stats_16000/valid/speech_ref1_shape
- exp/enh_stats_16000/valid/noise_ref1_shape
batch_type: folded
valid_batch_type: null
fold_length:
- 80000
- 80000
- 80000
sort_in_batch: descending
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
train_data_path_and_name_and_type:
- - dump/raw/train/wav.scp
  - speech_mix
  - sound
- - dump/raw/train/spk1.scp
  - speech_ref1
  - sound
- - dump/raw/train/noise1.scp
  - noise_ref1
  - sound
valid_data_path_and_name_and_type:
- - dump/raw/dev/wav.scp
  - speech_mix
  - sound
- - dump/raw/dev/spk1.scp
  - speech_ref1
  - sound
- - dump/raw/dev/noise1.scp
  - noise_ref1
  - sound
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
valid_max_cache_size: null
optim: adam
optim_conf:
  lr: 0.001
  eps: 1.0e-08
  weight_decay: 0
scheduler: reducelronplateau
scheduler_conf:
  mode: min
  factor: 0.5
  patience: 1
init: xavier_uniform
model_conf:
  stft_consistency: false
  loss_type: mask_mse
  mask_type: null
criterions:
- name: snr
  conf:
    eps: 1e-8
  wrapper: fixed_order
  wrapper_conf:
    weight: 1.0
use_preprocessor: false
encoder: stft
encoder_conf:
  n_fft: 1024
  hop_length: 512
  use_builtin_complex: false
separator: wpe_beamformer
separator_conf:
  num_spk: 1
  loss_type: spectrum
  use_wpe: false
  use_beamformer: true
  bnet_type: blstmp
  blayers: 3
  bunits: 300
  bprojs: 320
  badim: 320
  ref_channel: 0
  use_noise_mask: true
  bnonlinear: sigmoid
  beamformer_type: mvdr_souden
  rtf_iterations: 2
  bdropout_rate: 0.0
  shared_power: true
  diagonal_loading: false
  diag_eps_wpe: 1e-4
  diag_eps_bf: 1e-4
  mask_flooring: false
  flooring_thres_wpe: 1e-6
  flooring_thres_bf: 1e-6
  use_torch_solver: true
decoder: stft
decoder_conf:
  n_fft: 1024
  hop_length: 512
required:
- output_dir
version: 0.10.5a1
distributed: false
```

</details>
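
In this configuration the separator estimates speech and noise masks with a 3-layer BLSTMP network and applies a Souden-style MVDR beamformer (`beamformer_type: mvdr_souden`, `ref_channel: 0`) on a 1024-point STFT. The numpy sketch below illustrates the Souden MVDR solution for a single frequency bin under those assumptions; it is a conceptual illustration with toy data, not ESPnet's implementation.

```python
# Conceptual Souden-MVDR sketch for one frequency bin (toy data, not ESPnet code).
import numpy as np

def mvdr_souden_weights(phi_speech, phi_noise, ref_channel=0):
    """MVDR weights w = (Phi_n^-1 Phi_s / trace(Phi_n^-1 Phi_s)) e_ref."""
    num = np.linalg.solve(phi_noise, phi_speech)   # Phi_n^{-1} Phi_s
    return num[:, ref_channel] / np.trace(num)

rng = np.random.default_rng(0)
# Toy multi-channel STFT frames at one frequency bin: (frames, channels).
X = rng.standard_normal((200, 6)) + 1j * rng.standard_normal((200, 6))
speech_mask = rng.uniform(size=200)                # would come from the BLSTMP mask net
noise_mask = 1.0 - speech_mask

# Mask-weighted spatial covariance matrices.
outer = np.einsum("tc,td->tcd", X, X.conj())
phi_s = (speech_mask[:, None, None] * outer).mean(0)
phi_n = (noise_mask[:, None, None] * outer).mean(0)

w = mvdr_souden_weights(phi_s, phi_n, ref_channel=0)
enhanced_bin = np.einsum("c,tc->t", w.conj(), X)   # beamformed output for this bin
```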


### Citing ESPnet

```bibtex
@inproceedings{watanabe2018espnet,
  author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
  title={{ESPnet}: End-to-End Speech Processing Toolkit},
  year={2018},
  booktitle={Proceedings of Interspeech},
  pages={2207--2211},
  doi={10.21437/Interspeech.2018-1456},
  url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
}

@inproceedings{ESPnet-SE,
  author = {Chenda Li and Jing Shi and Wangyou Zhang and Aswin Shanmugam Subramanian and Xuankai Chang and Naoyuki Kamo and Moto Hira and Tomoki Hayashi and Christoph B{\"{o}}ddeker and Zhuo Chen and Shinji Watanabe},
  title = {ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
  booktitle = {{IEEE} Spoken Language Technology Workshop, {SLT} 2021, Shenzhen, China, January 19-22, 2021},
  pages = {785--792},
  publisher = {{IEEE}},
  year = {2021},
  url = {https://doi.org/10.1109/SLT48900.2021.9383615},
  doi = {10.1109/SLT48900.2021.9383615},
  timestamp = {Mon, 12 Apr 2021 17:08:59 +0200},
  biburl = {https://dblp.org/rec/conf/slt/Li0ZSCKHHBC021.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
```

or arXiv:

```bibtex
@misc{watanabe2018espnet,
  title={ESPnet: End-to-End Speech Processing Toolkit},
  author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
  year={2018},
  eprint={1804.00015},
  archivePrefix={arXiv},
  primaryClass={cs.CL}
}
```

exp/enh_stats_16000/train/feats_stats.npz ADDED
Binary file (810 Bytes).

exp/enh_train_enh_beamformer_mvdr_raw/8epoch.pth ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:22e4b012ae75d134a0f9fb1306106725bd13dd5e0219554f2f6f92f2c051218a
size 23399166

exp/enh_train_enh_beamformer_mvdr_raw/RESULTS.md ADDED
(same content as the RESULTS section in README.md above)

exp/enh_train_enh_beamformer_mvdr_raw/config.yaml ADDED
(same content as the ENH config section in README.md above)

exp/enh_train_enh_beamformer_mvdr_raw/images/backward_time.png ADDED
exp/enh_train_enh_beamformer_mvdr_raw/images/forward_time.png ADDED
exp/enh_train_enh_beamformer_mvdr_raw/images/gpu_max_cached_mem_GB.png ADDED
exp/enh_train_enh_beamformer_mvdr_raw/images/iter_time.png ADDED
exp/enh_train_enh_beamformer_mvdr_raw/images/loss.png ADDED
exp/enh_train_enh_beamformer_mvdr_raw/images/optim0_lr0.png ADDED
exp/enh_train_enh_beamformer_mvdr_raw/images/optim_step_time.png ADDED
exp/enh_train_enh_beamformer_mvdr_raw/images/snr_loss.png ADDED
exp/enh_train_enh_beamformer_mvdr_raw/images/train_time.png ADDED
meta.yaml ADDED
espnet: 0.10.5a1
files:
  model_file: exp/enh_train_enh_beamformer_mvdr_raw/8epoch.pth
python: "3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]"
timestamp: 1649796895.876417
torch: 1.10.1
yaml_files:
  train_config: exp/enh_train_enh_beamformer_mvdr_raw/config.yaml
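
meta.yaml maps the packed archive to its training config and checkpoint. Below is a hedged sketch for inspecting those files locally after `git lfs pull`; it assumes, as is typical for ESPnet `{N}epoch.pth` files, that the checkpoint is a plain state_dict.

```python
# Hedged sketch: inspect the training config and checkpoint referenced by meta.yaml.
# Assumes the repo has been cloned, `git lfs pull` has fetched 8epoch.pth, and the
# checkpoint is a plain state_dict (typical for ESPnet "{N}epoch.pth" files).
import torch
import yaml

with open("exp/enh_train_enh_beamformer_mvdr_raw/config.yaml") as f:
    train_config = yaml.safe_load(f)
print(train_config["separator"], train_config["separator_conf"]["beamformer_type"])

state_dict = torch.load("exp/enh_train_enh_beamformer_mvdr_raw/8epoch.pth", map_location="cpu")
n_params = sum(t.numel() for t in state_dict.values())
print(f"{len(state_dict)} tensors, {n_params / 1e6:.2f}M parameters")
```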