gorinars committed on
Commit
60646eb
1 Parent(s): 3535ab0

first commit

Browse files
Files changed (7) hide show
  1. .gitattributes +2 -0
  2. config.json +3 -0
  3. embedding_model.ckpt +3 -0
  4. hyperparams.yaml +64 -0
  5. projector.ckpt +3 -0
  6. train_hyp.yaml +252 -0
  7. train_log.txt +50 -0
.gitattributes CHANGED
@@ -32,3 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ embedding_model.ckpt filter=lfs diff=lfs merge=lfs -text
36
+ projector.ckpt filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "speechbrain_interface": "SpeakerRecognition"
3
+ }
embedding_model.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34414582d17cf0b5f9b63e44b46fe1217343b0211b97a0c0b1e7b07da9f3b58f
3
+ size 84883955
hyperparams.yaml ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ############################################################################
2
+ # Model: ECAPA big for Speaker verification
3
+ # ############################################################################
4
+
5
+ # Feature parameters
6
+ n_mels: 80
7
+
8
+ # Pretrain folder (HuggingFace)
9
+ pretrained_path: gorinars/sb-ecapa-vggsound-uvgscl
10
+
11
+ # Output parameters
12
+ out_n_neurons: 308
13
+
14
+ # Model params
15
+ compute_features: !new:speechbrain.lobes.features.Fbank
16
+ n_mels: 80
17
+ left_frames: 0
18
+ right_frames: 0
19
+ deltas: false
20
+ sample_rate: 16000
21
+ n_fft: 400
22
+ win_length: 25
23
+ hop_length: 10
24
+ f_min: 0
25
+
26
+
27
+ mean_var_norm: !new:speechbrain.processing.features.InputNormalization
28
+ norm_type: sentence
29
+ std_norm: False
30
+
31
+
32
+ embedding_model: !new:speechbrain.nnet.containers.LengthsCapableSequential
33
+ input_shape: [null, 1, null]
34
+ embedding: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
35
+ input_size: !ref <n_mels>
36
+ channels: [1024, 1024, 1024, 1024, 3072]
37
+ kernel_sizes: [5, 3, 3, 3, 1]
38
+ dilations: [1, 2, 3, 4, 1]
39
+ groups: [1, 1, 1, 1, 1]
40
+ attention_channels: 128
41
+ lin_neurons: 256
42
+ projector: !new:crytorch.models.components.pann.SimSiamProjector
43
+ input_size: 256
44
+ hidden_size: 256
45
+ output_size: 256
46
+ norm_type: bn
47
+
48
+ modules:
49
+ compute_features: !ref <compute_features>
50
+ mean_var_norm: !ref <mean_var_norm>
51
+ embedding_model: !ref <embedding_model>
52
+
53
+ label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
54
+
55
+
56
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
57
+ loadables:
58
+ embedding: !ref <embedding_model.embedding>
59
+ projector: !ref <embedding_model.projector>
60
+ paths:
61
+ embedding: !ref <pretrained_path>/embedding_model.ckpt
62
+ projector: !ref <pretrained_path>/projector.ckpt
63
+
64
+
projector.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c890e8ca36066d9a30090ac1835ac6fafd6f03435dc4bdcfcafd44c64c02cbcf
3
+ size 538555
train_hyp.yaml ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated 2023-02-12 from:
2
+ # /home/agorin/cssl_sound/hparams/ecapa_vgg.yaml
3
+ # yamllint disable
4
+ # File : supclr_train.yaml
5
+ # Author : Zhepei Wang <zhepeiw2@illinois.edu>
6
+ # Date : 27.01.2022
7
+ # Last Modified Date: 31.03.2022
8
+ # Last Modified By : Zhepei Wang <zhepeiw2@illinois.edu>
9
+
10
+
11
+ seed: 2022
12
+ __set_seed: !apply:torch.manual_seed [2022]
13
+ np_rng: !new:numpy.random.RandomState [2022]
14
+
15
+ resume_interrupt: false
16
+ resume_task_idx: 0
17
+ balanced_cry: false
18
+
19
+ time_stamp: 2023-02-12+21-11-02
20
+ experiment_name: ecapa_vgg
21
+ # output_folder: !ref results/<experiment_name>/<seed>
22
+ output_base: results #/home/agorin/datasets/results_cssl
23
+ output_folder: results/2023-02-12+21-11-02_seed_2022+ecapa_vgg
24
+ train_log: results/2023-02-12+21-11-02_seed_2022+ecapa_vgg/train_log.txt
25
+ save_folder: results/2023-02-12+21-11-02_seed_2022+ecapa_vgg/save
26
+
27
+ # Number of classes
28
+ n_classes: 308
29
+ num_tasks: 1
30
+ # cont learning setup
31
+ task_classes: &id001 !apply:utils.prepare_task_classes
32
+ num_classes: 308
33
+ num_tasks: 1
34
+ seed: 2022
35
+ replay_num_keep: 0
36
+
37
+ use_mixup: false
38
+ mixup_alpha: 0.4
39
+ train_duration: 4.0
40
+
41
+ # Training parameters
42
+ number_of_epochs: 50
43
+ batch_size: 128
44
+ # lr: 0.001
45
+ # base_lr: 0.00000001
46
+ # max_lr: !ref <lr>
47
+ # step_size: 65000
48
+ warmup_epochs: 5
49
+ warmup_lr: 0.0
50
+ base_lr: 0.015
51
+ final_lr: 5e-09
52
+
53
+ # dataset
54
+ sample_rate: 16000
55
+
56
+ data_folder: /home/agorin/datasets/VGG-Sound
57
+ label_encoder_path: ./dataset/label_encoder_vggsound_ordered.txt
58
+ prepare_split_csv_fn: !name:dataset.prepare_vggsound2.prepare_split
59
+ root_dir: /home/agorin/datasets/VGG-Sound
60
+ output_dir: results/2023-02-12+21-11-02_seed_2022+ecapa_vgg/save
61
+ task_classes: *id001
62
+ train_split: 0.8
63
+ seed: 2022
64
+
65
+ train_dataloader_opts:
66
+ batch_size: 128
67
+ num_workers: 8
68
+ shuffle: true
69
+ drop_last: true
70
+
71
+
72
+ valid_dataloader_opts:
73
+ batch_size: 32
74
+ num_workers: 8
75
+
76
+
77
+
78
+ # Experiment params
79
+ auto_mix_prec: false # True # True # Set it to True for mixed precision
80
+
81
+
82
+ # Feature parameters
83
+ n_mels: 80
84
+ left_frames: 0
85
+ right_frames: 0
86
+ deltas: false
87
+ amp_to_db: false
88
+ normalize: true
89
+ win_length: 25
90
+ hop_length: 10
91
+ n_fft: 400
92
+ f_min: 0
93
+ use_time_roll: false
94
+ use_freq_shift: false
95
+ emb_dim: 256
96
+ emb_norm_type: bn
97
+ proj_norm_type: bn
98
+
99
+ # augmentation
100
+ # time_domain_aug: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
101
+ # sample_rate: !ref <sample_rate>
102
+ # # drop_chunk_count_high: 2
103
+ # # drop_chunk_noise_factor: 0.05
104
+ # speeds: [90, 95, 100, 105, 110]
105
+ # drop_freq_count_high: 4
106
+ # drop_chunk_count_high: 3
107
+ # # drop_chunk_length_low: 1000
108
+ # # drop_chunk_length_high: 5000
109
+ spec_domain_aug: !new:augmentation.TFAugmentation
110
+ time_warp: true
111
+ time_warp_window: 8
112
+ freq_mask: true
113
+ freq_mask_width: !tuple (0, 10)
114
+ n_freq_mask: 2
115
+ time_mask: true
116
+ time_mask_width: !tuple (0, 10)
117
+ n_time_mask: 2
118
+ replace_with_zero: true
119
+ time_roll: false
120
+ time_roll_limit: !tuple (0, 200)
121
+ freq_shift: false
122
+ freq_shift_limit: !tuple (-10, 10)
123
+
124
+
125
+ # Functions
126
+ compute_features: &id002 !new:speechbrain.lobes.features.Fbank
127
+ n_mels: 80
128
+ left_frames: 0
129
+ right_frames: 0
130
+ deltas: false
131
+ sample_rate: 16000
132
+ n_fft: 400
133
+ win_length: 25
134
+ hop_length: 10
135
+ f_min: 0
136
+
137
+ mean_var_norm: &id007 !new:speechbrain.processing.features.InputNormalization
138
+
139
+ norm_type: sentence
140
+ std_norm: false
141
+
142
+ embedding_model: &id003 !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
143
+ input_size: 80
144
+ channels: [1024, 1024, 1024, 1024, 3072]
145
+ kernel_sizes: [5, 3, 3, 3, 1]
146
+ dilations: [1, 2, 3, 4, 1]
147
+ groups: [1, 1, 1, 1, 1]
148
+ attention_channels: 128
149
+ lin_neurons: 256
150
+
151
+ # embedding_model: !new:models.pann.Cnn14
152
+ # mel_bins: !ref <n_mels>
153
+ # emb_dim: !ref <emb_dim>
154
+ # norm_type: !ref <emb_norm_type>
155
+
156
+ projector: &id005 !new:models.modules.SimSiamProjector
157
+ input_size: 256
158
+ hidden_size: 256
159
+ output_size: 256
160
+ norm_type: bn
161
+
162
+ predictor: &id006 !new:models.modules.SimSiamPredictor
163
+ input_size: 256
164
+ hidden_size: 128
165
+ norm_type: bn
166
+
167
+ classifier: &id004 !new:models.modules.Classifier
168
+ input_size: 256
169
+ output_size: 308
170
+
171
+ modules:
172
+ compute_features: *id002
173
+ embedding_model: *id003
174
+ classifier: *id004
175
+ projector: *id005
176
+ predictor: *id006
177
+ mean_var_norm: *id007
178
+ ssl_weight: 1.
179
+ compute_simclr_cost: !new:losses.SimCLRLoss
180
+ tau: 0.5
181
+
182
+ sup_weight: 0.
183
+ compute_sup_cost: !new:losses.LogSoftmaxWithProbWrapper
184
+ loss_fn: !new:torch.nn.Identity
185
+
186
+ dist_weight: 0
187
+ compute_dist_cost: !new:losses.SimCLRLoss
188
+ tau: 0.5
189
+
190
+
191
+ acc_metric: !name:speechbrain.utils.Accuracy.AccuracyStats
192
+
193
+ # opt_class: !name:torch.optim.Adam
194
+ # lr: !ref <base_lr>
195
+ # weight_decay: 0.0005
196
+ #
197
+ # lr_scheduler_fn: !name:speechbrain.nnet.schedulers.CyclicLRScheduler
198
+ # base_lr: !ref <final_lr>
199
+ # max_lr: !ref <base_lr>
200
+ # step_size: 888
201
+
202
+ opt_class: !name:torch.optim.SGD
203
+ lr: 0.015
204
+ weight_decay: 0.0005
205
+ momentum: 0.9
206
+
207
+ lr_scheduler_fn: !name:schedulers.SimSiamCosineScheduler
208
+ warmup_epochs: 5
209
+ warmup_lr: 0.0
210
+ num_epochs: 50
211
+ base_lr: 0.015
212
+ final_lr: 5e-09
213
+ steps_per_epoch: 200
214
+ constant_predictor_lr: true
215
+
216
+ epoch_counter_fn: !name:speechbrain.utils.epoch_loop.EpochCounter
217
+ limit: 50
218
+
219
+ datapoint_counter: &id008 !new:utils.DatapointCounter
220
+
221
+ #prev_checkpointer: null
222
+ #prev_checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
223
+ # checkpoints_dir: /home/agorin/vgg_offline/2022-04-13+23-33-21_seed_2022+ssl_offline/save/task0
224
+ # # Logging + checkpoints
225
+ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
226
+ checkpoints_dir: results/2023-02-12+21-11-02_seed_2022+ecapa_vgg/save
227
+
228
+ recoverables:
229
+ embedding_model: *id003
230
+ classifier: *id004
231
+ projector: *id005
232
+ predictor: *id006
233
+ normalizer: *id007
234
+ datapoint_counter: *id008
235
+ ssl_checkpoints_dir: # /home/agorin/vgg_offline/2022-04-13+23-33-21_seed_2022+ssl_offline/save
236
+
237
+ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
238
+ save_file: results/2023-02-12+21-11-02_seed_2022+ecapa_vgg/train_log.txt
239
+
240
+ # wandb
241
+ use_wandb: false
242
+ train_log_frequency: 20
243
+ wandb_logger_fn: !name:utils.MyWandBLogger
244
+ initializer: !name:wandb.init
245
+ entity: CAL
246
+ project: cssl_sound
247
+ name: 2023-02-12+21-11-02+seed_2022+ecapa_vgg
248
+ dir: results/2023-02-12+21-11-02_seed_2022+ecapa_vgg
249
+ reinit: true
250
+ yaml_config: hparams/vgg/supclr_train.yaml
251
+ resume: false
252
+
train_log.txt ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ epoch: 1, lr: 3.00e-03, datapoints_seen: 167808 - train loss: 4.13
2
+ epoch: 2, lr: 5.99e-03, datapoints_seen: 335616 - train loss: 3.95
3
+ epoch: 3, lr: 8.99e-03, datapoints_seen: 503424 - train loss: 3.90
4
+ epoch: 4, lr: 1.20e-02, datapoints_seen: 671232 - train loss: 3.87
5
+ epoch: 5, lr: 1.50e-02, datapoints_seen: 839040 - train loss: 3.85
6
+ epoch: 6, lr: 1.50e-02, datapoints_seen: 1006848 - train loss: 3.84
7
+ epoch: 7, lr: 1.49e-02, datapoints_seen: 1174656 - train loss: 3.82
8
+ epoch: 8, lr: 1.48e-02, datapoints_seen: 1342464 - train loss: 3.82
9
+ epoch: 9, lr: 1.47e-02, datapoints_seen: 1510272 - train loss: 3.81
10
+ epoch: 10, lr: 1.45e-02, datapoints_seen: 1678080 - train loss: 3.80
11
+ epoch: 11, lr: 1.44e-02, datapoints_seen: 1845888 - train loss: 3.80
12
+ epoch: 12, lr: 1.41e-02, datapoints_seen: 2013696 - train loss: 3.80
13
+ epoch: 13, lr: 1.39e-02, datapoints_seen: 2181504 - train loss: 3.79
14
+ epoch: 14, lr: 1.36e-02, datapoints_seen: 2349312 - train loss: 3.79
15
+ epoch: 15, lr: 1.32e-02, datapoints_seen: 2517120 - train loss: 3.79
16
+ epoch: 16, lr: 1.29e-02, datapoints_seen: 2684928 - train loss: 3.79
17
+ epoch: 17, lr: 1.25e-02, datapoints_seen: 2852736 - train loss: 3.78
18
+ epoch: 18, lr: 1.21e-02, datapoints_seen: 3020544 - train loss: 3.78
19
+ epoch: 19, lr: 1.17e-02, datapoints_seen: 3188352 - train loss: 3.78
20
+ epoch: 20, lr: 1.13e-02, datapoints_seen: 3356160 - train loss: 3.78
21
+ epoch: 21, lr: 1.08e-02, datapoints_seen: 3523968 - train loss: 3.78
22
+ epoch: 22, lr: 1.03e-02, datapoints_seen: 3691776 - train loss: 3.77
23
+ epoch: 23, lr: 9.83e-03, datapoints_seen: 3859584 - train loss: 3.77
24
+ epoch: 24, lr: 9.32e-03, datapoints_seen: 4027392 - train loss: 3.77
25
+ epoch: 25, lr: 8.81e-03, datapoints_seen: 4195200 - train loss: 3.77
26
+ epoch: 26, lr: 8.29e-03, datapoints_seen: 4363008 - train loss: 3.77
27
+ epoch: 27, lr: 7.77e-03, datapoints_seen: 4530816 - train loss: 3.76
28
+ epoch: 28, lr: 7.25e-03, datapoints_seen: 4698624 - train loss: 3.76
29
+ epoch: 29, lr: 6.73e-03, datapoints_seen: 4866432 - train loss: 3.76
30
+ epoch: 30, lr: 6.21e-03, datapoints_seen: 5034240 - train loss: 3.76
31
+ epoch: 31, lr: 5.70e-03, datapoints_seen: 5202048 - train loss: 3.76
32
+ epoch: 32, lr: 5.19e-03, datapoints_seen: 5369856 - train loss: 3.75
33
+ epoch: 33, lr: 4.70e-03, datapoints_seen: 5537664 - train loss: 3.75
34
+ epoch: 34, lr: 4.22e-03, datapoints_seen: 5705472 - train loss: 3.75
35
+ epoch: 35, lr: 3.76e-03, datapoints_seen: 5873280 - train loss: 3.75
36
+ epoch: 36, lr: 3.32e-03, datapoints_seen: 6041088 - train loss: 3.75
37
+ epoch: 37, lr: 2.89e-03, datapoints_seen: 6208896 - train loss: 3.74
38
+ epoch: 38, lr: 2.49e-03, datapoints_seen: 6376704 - train loss: 3.74
39
+ epoch: 39, lr: 2.12e-03, datapoints_seen: 6544512 - train loss: 3.74
40
+ epoch: 40, lr: 1.77e-03, datapoints_seen: 6712320 - train loss: 3.74
41
+ epoch: 41, lr: 1.44e-03, datapoints_seen: 6880128 - train loss: 3.73
42
+ epoch: 42, lr: 1.15e-03, datapoints_seen: 7047936 - train loss: 3.73
43
+ epoch: 43, lr: 8.86e-04, datapoints_seen: 7215744 - train loss: 3.73
44
+ epoch: 44, lr: 6.56e-04, datapoints_seen: 7383552 - train loss: 3.73
45
+ epoch: 45, lr: 4.59e-04, datapoints_seen: 7551360 - train loss: 3.73
46
+ epoch: 46, lr: 2.96e-04, datapoints_seen: 7719168 - train loss: 3.73
47
+ epoch: 47, lr: 1.68e-04, datapoints_seen: 7886976 - train loss: 3.73
48
+ epoch: 48, lr: 7.57e-05, datapoints_seen: 8054784 - train loss: 3.73
49
+ epoch: 49, lr: 1.97e-05, datapoints_seen: 8222592 - train loss: 3.72
50
+ epoch: 50, lr: 3.26e-08, datapoints_seen: 8390400 - train loss: 3.72