subakany committed on
Commit
9a645d8
1 Parent(s): 249ecfd

pushing the model files

Browse files
Files changed (3) hide show
  1. encoder.ckpt +3 -0
  2. encoder_out.ckpt +3 -0
  3. hyperparams.yaml +282 -0
encoder.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76ff541f6154ad9891570aa899bac07cb8ea5645ad1e4f88bcabf8a87d7ee48b
3
+ size 1059081
encoder_out.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faaea7a87e2ef529e100b39b3402108a299a4623e9c89f1173eecb043e20fcf4
3
+ size 266067
hyperparams.yaml ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated 2021-09-17 from:
2
+ # /home/mila/s/subakany/speechbrain_new/recipes/WSJ0Mix/separation/snrestimator_yamls/timedom_convnet_whamr_v2_stnorm_manyseparators.yaml
3
+ # yamllint disable
4
+ # ################################
5
+ # Model: SNR estimator (encoder + encoder_out) trained on the outputs of SepFormer-style separators
6
+ # https://arxiv.org/abs/2010.13154
7
+ # Dataset : Libri2Mix, combined with WHAM!/WHAMR! data (see data_folder, wsj_data_folder and use_whamr_train below)
8
+ # ################################
9
+ #
10
+ # Basic parameters
11
+ # Seed needs to be set at top of yaml, before objects with parameters are made
12
+ #
13
+ seed: 1234
14
+ __set_seed: !apply:torch.manual_seed [1234]
15
+
16
+ # Data params
17
+
18
+ # root folder of the Libri2Mix data, e.g. '/yourpath/LibriMix/Libri2Mix/'
19
+ # (in the original WSJ0-mix recipe this path ends with 2speakers for wsj0-2mix or 3speakers for wsj0-3mix)
20
+ data_folder: /miniscratch/subakany/LibriMixData_new/Libri2Mix/
21
+
22
+ # the path for the processed LibriSpeech train-clean-360 folder -- only needed if dynamic mixing is used (the wsj0/si_tr_s/ path goes in base_folder_dm_whamr below)
23
+ # e.g. /yourpath/wsj0-processed/si_tr_s/
24
+ # you need to convert the original wsj0 to 8k
25
+ # you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py
26
+ base_folder_dm: /miniscratch/subakany/LibriMixData_new/LibriSpeech/train-clean-360_processed/
27
+ rir_path: /miniscratch/subakany/whamr_rirs_wav
28
+
29
+ experiment_name: snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators
30
+ output_folder: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234
31
+ train_log: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/train_log.txt
32
+ save_folder: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save
33
+ train_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/libri2mix_train-360.csv
34
+ valid_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/libri2mix_dev.csv
35
+ test_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/libri2mix_test.csv
36
+
37
+ wsj_data_folder: /network/tmp1/subakany/wham_original
38
+ train_wsj_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/wham_tr.csv
39
+ test_wsj_data: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save/wham_tt.csv
40
+ base_folder_dm_whamr: /network/tmp1/subakany/wsj0-processed/si_tr_s
41
+ use_whamr_train: true
42
+ whamr_proportion: 0.6
43
+
44
+ test_onwsj: false
45
+
46
+ skip_prep: false
47
+
48
+ ckpt_interval_minutes: 60
49
+
50
+ # Experiment params
51
+ auto_mix_prec: false # Set it to True for mixed precision
52
+ test_only: false
53
+ num_spks: 2 # set to 3 for wsj0-3mix
54
+ progressbar: true
55
+ save_audio: false # Save estimated sources on disk
56
+ sample_rate: 8000
57
+
58
+ # Training parameters
59
+ N_epochs: 200
60
+ batch_size: 1
61
+ lr: 0.0001
62
+ clip_grad_norm: 5
63
+ loss_upper_lim: 999999 # this is the upper limit for an acceptable loss
64
+ # if True, the training sequences are cut to a specified length
65
+ limit_training_signal_len: false
66
+ # this is the length of sequences if we choose to limit
67
+ # the signal length of training sequences
68
+ training_signal_len: 32000000
69
+
70
+ # Set it to True to dynamically create mixtures at training time
71
+ dynamic_mixing: true
72
+ use_wham_noise: true
73
+ use_reverb_augment: true
74
+
75
+ # Parameters for data augmentation
76
+ use_wavedrop: false
77
+ use_speedperturb: true
78
+ use_speedperturb_sameforeachsource: false
79
+ use_rand_shift: false
80
+ min_shift: -8000
81
+ max_shift: 8000
82
+
83
+ speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
84
+ perturb_prob: 1.0
85
+ drop_freq_prob: 0.0
86
+ drop_chunk_prob: 0.0
87
+ sample_rate: 8000
88
+ speeds: [95, 100, 105]
89
+
90
+ wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
91
+ perturb_prob: 0.0
92
+ drop_freq_prob: 1.0
93
+ drop_chunk_prob: 1.0
94
+ sample_rate: 8000
95
+
96
+ # loss thresholding -- this thresholds the training loss
97
+ threshold_byloss: true
98
+ threshold: -30
99
+
100
+ # Encoder parameters
101
+ N_encoder_out: 256
102
+ out_channels: 256
103
+ kernel_size: 16
104
+ kernel_stride: 8
105
+
106
+ # Dataloader options
107
+ dataloader_opts:
108
+ batch_size: 1
109
+ num_workers: 0
110
+
111
+
112
+ # Specifying the network
113
+ Encoder: &id003 !new:speechbrain.lobes.models.dual_path.Encoder
114
+ kernel_size: 16
115
+ out_channels: 256
116
+
117
+
118
+ SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
119
+ num_layers: 8
120
+ d_model: 256
121
+ nhead: 8
122
+ d_ffn: 1024
123
+ dropout: 0
124
+ use_positional_encoding: true
125
+ norm_before: true
126
+
127
+ SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
128
+ num_layers: 8
129
+ d_model: 256
130
+ nhead: 8
131
+ d_ffn: 1024
132
+ dropout: 0
133
+ use_positional_encoding: true
134
+ norm_before: true
135
+
136
+ MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
137
+ num_spks: 2
138
+ in_channels: 256
139
+ out_channels: 256
140
+ num_layers: 2
141
+ K: 250
142
+ intra_model: *id001
143
+ inter_model: *id002
144
+ norm: ln
145
+ linear_layer_after_inter_intra: false
146
+ skip_around_intra: true
147
+
148
+ Decoder: &id004 !new:speechbrain.lobes.models.dual_path.Decoder
149
+ in_channels: 256
150
+ out_channels: 1
151
+ kernel_size: 16
152
+ stride: 8
153
+ bias: false
154
+
155
+ snrmin: 0
156
+ snrmax: 10
157
+ out_n_neurons: 16
158
+ use_snr_compression: true
159
+ separation_norm_type: stnorm
160
+
161
+ # compute_features: !new:speechbrain.lobes.features.Fbank
162
+ # n_mels: !ref <n_mels>
163
+ # left_frames: 0
164
+ # right_frames: 0
165
+ # deltas: False
166
+
167
+ latent_dim: 128
168
+ n_inp: 256
169
+ encoder: &id006 !new:speechbrain.nnet.containers.Sequential
170
+ input_shape: [!!null '', 2, !!null '']
171
+ cnn1: !new:speechbrain.nnet.CNN.Conv1d
172
+ in_channels: 2
173
+ kernel_size: 4
174
+ out_channels: 128
175
+ stride: 1
176
+ skip_transpose: true
177
+ padding: valid
178
+ relu1: !new:torch.nn.ReLU
179
+ cnn2: !new:speechbrain.nnet.CNN.Conv1d
180
+ in_channels: 128
181
+ kernel_size: 4
182
+ out_channels: 128
183
+ stride: 2
184
+ skip_transpose: true
185
+ padding: valid
186
+ relu2: !new:torch.nn.ReLU
187
+ cnn3: !new:speechbrain.nnet.CNN.Conv1d
188
+ in_channels: 128
189
+ kernel_size: 4
190
+ out_channels: 128
191
+ stride: 2
192
+ skip_transpose: true
193
+ padding: valid
194
+ relu3: !new:torch.nn.ReLU
195
+ cnn4: !new:speechbrain.nnet.CNN.Conv1d
196
+ in_channels: 128
197
+ kernel_size: 4
198
+ out_channels: 128
199
+ stride: 2
200
+ skip_transpose: true
201
+ padding: valid
202
+ relu4: !new:torch.nn.ReLU
203
+ cnn5: !new:speechbrain.nnet.CNN.Conv1d
204
+ in_channels: 128
205
+ kernel_size: 4
206
+ out_channels: 128
207
+ stride: 2
208
+ skip_transpose: true
209
+ padding: valid
210
+
211
+ stat_pooling: !new:speechbrain.nnet.pooling.StatisticsPooling
212
+
213
+
214
+ # classifier_enc: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
215
+ # input_size: !ref <n_inp>
216
+ # channels: [1024, 1024, 1024, 1024, 3072]
217
+ # kernel_sizes: [5, 3, 3, 3, 1]
218
+ # dilations: [1, 2, 3, 4, 1]
219
+ # attention_channels: 128
220
+ # lin_neurons: 192
221
+
222
+ #classifier_out: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
223
+ # input_size: 192
224
+ # out_neurons: !ref <out_n_neurons>
225
+ #
226
+ # classifier_out: !new:speechbrain.nnet.linear.Linear
227
+ # input_size: 256
228
+ # n_neurons: 1
229
+
230
+ encoder_out: &id007 !new:speechbrain.nnet.containers.Sequential
231
+ # lr_scheduler: !ref <lr_scheduler>
232
+
233
+ input_shape: [!!null '', 256]
234
+ layer1: !new:speechbrain.nnet.linear.Linear
235
+ input_size: 256
236
+ n_neurons: 256
237
+ relu: !new:torch.nn.ReLU
238
+ layer2: !new:speechbrain.nnet.linear.Linear
239
+ input_size: 256
240
+ n_neurons: 1
241
+ sigm: !new:torch.nn.Sigmoid
242
+
243
+
244
+
245
+ classifier_loss: !new:torch.nn.CrossEntropyLoss
246
+
247
+ optimizer: !name:torch.optim.Adam
248
+ lr: 0.0001
249
+ weight_decay: 0
250
+
251
+ loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper
252
+
253
+ lr_scheduler: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
254
+ factor: 0.5
255
+ patience: 2
256
+ dont_halve_until_epoch: 95
257
+
258
+ epoch_counter: &id008 !new:speechbrain.utils.epoch_loop.EpochCounter
259
+ limit: 200
260
+
261
+ modules:
262
+ encoder: *id003  # NOTE(review): the key `encoder` is repeated later in this mapping (bound to *id006); YAML mapping keys must be unique -- confirm which module is intended here
263
+ decoder: *id004
264
+ masknet: *id005
265
+ encoder: *id006  # NOTE(review): duplicates the `encoder` key above (bound to *id003); depending on the loader this is rejected or silently overrides the earlier entry -- TODO confirm and rename/remove one
266
+ encoder_out: *id007
267
+ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
268
+ checkpoints_dir: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/save
269
+ recoverables:
270
+ counter: *id008
271
+ encoder: *id006
272
+ encoder_out: *id007
273
+ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
274
+ save_file: results/snrtrain-timedomain-sbpooling-wwhamr-lessstride-stnorm-manyseparators/1234/train_log.txt
275
+
276
+ num_separators_per_model: 3
277
+ separator_base_folder: /home/mila/s/subakany/speechbrain_new/recipes/WHAMandWHAMR/separation/results/
278
+
279
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
280
+ loadables:
281
+ encoder: !ref <encoder>
282
+ encoder_out: !ref <encoder_out>