sanchit-gandhi committed on
Commit
d175c02
·
verified ·
1 Parent(s): 5d594ee

Training in progress, step 1000

Browse files
.gitattributes CHANGED
@@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- wandb/run-20240327_141033-golaq7b9/run-golaq7b9.wandb filter=lfs diff=lfs merge=lfs -text
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 22.52,
3
+ "train_loss": 0.17524469082718716,
4
+ "train_runtime": 15083.6622,
5
+ "train_samples": 7099,
6
+ "train_samples_per_second": 10.608,
7
+ "train_steps_per_second": 0.331
8
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:915a14d6cca1e5b43dd51645e21e653f0b7f7ef389ed27f01c02d53e9d5fbfaa
3
  size 3025686376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82b28427eca5db81abd6c0b41b5a828e9deac65e6e8d7d071ed00be3850a7dda
3
  size 3025686376
run.sh CHANGED
@@ -16,6 +16,7 @@ python run_speech_recognition_seq2seq.py \
16
  --eval_steps="1000" \
17
  --save_strategy="steps" \
18
  --save_steps="1000" \
 
19
  --generation_max_length="225" \
20
  --preprocessing_num_workers="1" \
21
  --dataloader_num_workers="4" \
 
16
  --eval_steps="1000" \
17
  --save_strategy="steps" \
18
  --save_steps="1000" \
19
+ --save_total_limit="1" \
20
  --generation_max_length="225" \
21
  --preprocessing_num_workers="1" \
22
  --dataloader_num_workers="4" \
runs/Mar27_19-04-58_hf-dgx-01/events.out.tfevents.1711562712.hf-dgx-01.1894903.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42d2088d006f005c7eb61e37b06f361c2c53843da4442f53136b5f5981f55f50
3
+ size 14123
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 22.52,
3
+ "train_loss": 0.17524469082718716,
4
+ "train_runtime": 15083.6622,
5
+ "train_samples": 7099,
6
+ "train_samples_per_second": 10.608,
7
+ "train_steps_per_second": 0.331
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 22.52252252252252,
5
+ "eval_steps": 1000,
6
+ "global_step": 5000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.11,
13
+ "grad_norm": 29.428333282470703,
14
+ "learning_rate": 4.000000000000001e-06,
15
+ "loss": 11.9112,
16
+ "step": 25
17
+ },
18
+ {
19
+ "epoch": 0.23,
20
+ "grad_norm": 12.572431564331055,
21
+ "learning_rate": 9e-06,
22
+ "loss": 5.9607,
23
+ "step": 50
24
+ },
25
+ {
26
+ "epoch": 0.34,
27
+ "grad_norm": 6.247668743133545,
28
+ "learning_rate": 1.4000000000000001e-05,
29
+ "loss": 2.7899,
30
+ "step": 75
31
+ },
32
+ {
33
+ "epoch": 0.45,
34
+ "grad_norm": 5.499792098999023,
35
+ "learning_rate": 1.9e-05,
36
+ "loss": 1.934,
37
+ "step": 100
38
+ },
39
+ {
40
+ "epoch": 0.56,
41
+ "grad_norm": 10.862707138061523,
42
+ "learning_rate": 2.4e-05,
43
+ "loss": 1.1845,
44
+ "step": 125
45
+ },
46
+ {
47
+ "epoch": 0.68,
48
+ "grad_norm": 6.8538055419921875,
49
+ "learning_rate": 2.9e-05,
50
+ "loss": 0.7883,
51
+ "step": 150
52
+ },
53
+ {
54
+ "epoch": 0.79,
55
+ "grad_norm": 8.127602577209473,
56
+ "learning_rate": 3.4000000000000007e-05,
57
+ "loss": 0.6147,
58
+ "step": 175
59
+ },
60
+ {
61
+ "epoch": 0.9,
62
+ "grad_norm": 4.003240585327148,
63
+ "learning_rate": 3.9000000000000006e-05,
64
+ "loss": 0.5233,
65
+ "step": 200
66
+ },
67
+ {
68
+ "epoch": 1.01,
69
+ "grad_norm": 3.650707483291626,
70
+ "learning_rate": 4.4000000000000006e-05,
71
+ "loss": 0.453,
72
+ "step": 225
73
+ },
74
+ {
75
+ "epoch": 1.13,
76
+ "grad_norm": 4.5928239822387695,
77
+ "learning_rate": 4.9e-05,
78
+ "loss": 0.3913,
79
+ "step": 250
80
+ },
81
+ {
82
+ "epoch": 1.24,
83
+ "grad_norm": 4.008325576782227,
84
+ "learning_rate": 5.4000000000000005e-05,
85
+ "loss": 0.3729,
86
+ "step": 275
87
+ },
88
+ {
89
+ "epoch": 1.35,
90
+ "grad_norm": 4.239988327026367,
91
+ "learning_rate": 5.9e-05,
92
+ "loss": 0.3544,
93
+ "step": 300
94
+ },
95
+ {
96
+ "epoch": 1.46,
97
+ "grad_norm": 3.8822410106658936,
98
+ "learning_rate": 6.400000000000001e-05,
99
+ "loss": 0.3229,
100
+ "step": 325
101
+ },
102
+ {
103
+ "epoch": 1.58,
104
+ "grad_norm": 3.0306766033172607,
105
+ "learning_rate": 6.9e-05,
106
+ "loss": 0.3357,
107
+ "step": 350
108
+ },
109
+ {
110
+ "epoch": 1.69,
111
+ "grad_norm": 2.7435803413391113,
112
+ "learning_rate": 7.4e-05,
113
+ "loss": 0.3148,
114
+ "step": 375
115
+ },
116
+ {
117
+ "epoch": 1.8,
118
+ "grad_norm": 3.684567928314209,
119
+ "learning_rate": 7.900000000000001e-05,
120
+ "loss": 0.2912,
121
+ "step": 400
122
+ },
123
+ {
124
+ "epoch": 1.91,
125
+ "grad_norm": 2.486985206604004,
126
+ "learning_rate": 8.4e-05,
127
+ "loss": 0.3058,
128
+ "step": 425
129
+ },
130
+ {
131
+ "epoch": 2.03,
132
+ "grad_norm": 2.5083959102630615,
133
+ "learning_rate": 8.900000000000001e-05,
134
+ "loss": 0.2651,
135
+ "step": 450
136
+ },
137
+ {
138
+ "epoch": 2.14,
139
+ "grad_norm": 4.557464599609375,
140
+ "learning_rate": 9.4e-05,
141
+ "loss": 0.2339,
142
+ "step": 475
143
+ },
144
+ {
145
+ "epoch": 2.25,
146
+ "grad_norm": 3.3180325031280518,
147
+ "learning_rate": 9.900000000000001e-05,
148
+ "loss": 0.2337,
149
+ "step": 500
150
+ },
151
+ {
152
+ "epoch": 2.36,
153
+ "grad_norm": 2.496147632598877,
154
+ "learning_rate": 9.955555555555556e-05,
155
+ "loss": 0.2372,
156
+ "step": 525
157
+ },
158
+ {
159
+ "epoch": 2.48,
160
+ "grad_norm": 2.2330338954925537,
161
+ "learning_rate": 9.900000000000001e-05,
162
+ "loss": 0.2219,
163
+ "step": 550
164
+ },
165
+ {
166
+ "epoch": 2.59,
167
+ "grad_norm": 3.0495846271514893,
168
+ "learning_rate": 9.844444444444444e-05,
169
+ "loss": 0.2323,
170
+ "step": 575
171
+ },
172
+ {
173
+ "epoch": 2.7,
174
+ "grad_norm": 2.3662843704223633,
175
+ "learning_rate": 9.78888888888889e-05,
176
+ "loss": 0.2324,
177
+ "step": 600
178
+ },
179
+ {
180
+ "epoch": 2.82,
181
+ "grad_norm": 1.981231451034546,
182
+ "learning_rate": 9.733333333333335e-05,
183
+ "loss": 0.2088,
184
+ "step": 625
185
+ },
186
+ {
187
+ "epoch": 2.93,
188
+ "grad_norm": 2.484710454940796,
189
+ "learning_rate": 9.677777777777778e-05,
190
+ "loss": 0.2195,
191
+ "step": 650
192
+ },
193
+ {
194
+ "epoch": 3.04,
195
+ "grad_norm": 1.7488161325454712,
196
+ "learning_rate": 9.622222222222222e-05,
197
+ "loss": 0.1868,
198
+ "step": 675
199
+ },
200
+ {
201
+ "epoch": 3.15,
202
+ "grad_norm": 2.266071081161499,
203
+ "learning_rate": 9.566666666666667e-05,
204
+ "loss": 0.1537,
205
+ "step": 700
206
+ },
207
+ {
208
+ "epoch": 3.27,
209
+ "grad_norm": 1.6045178174972534,
210
+ "learning_rate": 9.511111111111112e-05,
211
+ "loss": 0.157,
212
+ "step": 725
213
+ },
214
+ {
215
+ "epoch": 3.38,
216
+ "grad_norm": 1.8283653259277344,
217
+ "learning_rate": 9.455555555555556e-05,
218
+ "loss": 0.1516,
219
+ "step": 750
220
+ },
221
+ {
222
+ "epoch": 3.49,
223
+ "grad_norm": 2.1718389987945557,
224
+ "learning_rate": 9.4e-05,
225
+ "loss": 0.1657,
226
+ "step": 775
227
+ },
228
+ {
229
+ "epoch": 3.6,
230
+ "grad_norm": 2.778785467147827,
231
+ "learning_rate": 9.344444444444444e-05,
232
+ "loss": 0.1529,
233
+ "step": 800
234
+ },
235
+ {
236
+ "epoch": 3.72,
237
+ "grad_norm": 2.0423874855041504,
238
+ "learning_rate": 9.28888888888889e-05,
239
+ "loss": 0.153,
240
+ "step": 825
241
+ },
242
+ {
243
+ "epoch": 3.83,
244
+ "grad_norm": 1.7835185527801514,
245
+ "learning_rate": 9.233333333333333e-05,
246
+ "loss": 0.1514,
247
+ "step": 850
248
+ },
249
+ {
250
+ "epoch": 3.94,
251
+ "grad_norm": 2.091015100479126,
252
+ "learning_rate": 9.177777777777778e-05,
253
+ "loss": 0.151,
254
+ "step": 875
255
+ },
256
+ {
257
+ "epoch": 4.05,
258
+ "grad_norm": 1.47210693359375,
259
+ "learning_rate": 9.122222222222223e-05,
260
+ "loss": 0.1248,
261
+ "step": 900
262
+ },
263
+ {
264
+ "epoch": 4.17,
265
+ "grad_norm": 1.5700939893722534,
266
+ "learning_rate": 9.066666666666667e-05,
267
+ "loss": 0.0955,
268
+ "step": 925
269
+ },
270
+ {
271
+ "epoch": 4.28,
272
+ "grad_norm": 1.0798161029815674,
273
+ "learning_rate": 9.011111111111111e-05,
274
+ "loss": 0.0965,
275
+ "step": 950
276
+ },
277
+ {
278
+ "epoch": 4.39,
279
+ "grad_norm": 1.250017523765564,
280
+ "learning_rate": 8.955555555555556e-05,
281
+ "loss": 0.1029,
282
+ "step": 975
283
+ },
284
+ {
285
+ "epoch": 4.5,
286
+ "grad_norm": 1.3333516120910645,
287
+ "learning_rate": 8.900000000000001e-05,
288
+ "loss": 0.1015,
289
+ "step": 1000
290
+ },
291
+ {
292
+ "epoch": 4.5,
293
+ "eval_loss": 0.3065292239189148,
294
+ "eval_runtime": 1302.6648,
295
+ "eval_samples_per_second": 2.397,
296
+ "eval_steps_per_second": 0.075,
297
+ "eval_wer": 0.3243838368229931,
298
+ "step": 1000
299
+ },
300
+ {
301
+ "epoch": 4.62,
302
+ "grad_norm": 2.2534544467926025,
303
+ "learning_rate": 8.844444444444445e-05,
304
+ "loss": 0.1098,
305
+ "step": 1025
306
+ },
307
+ {
308
+ "epoch": 4.73,
309
+ "grad_norm": 1.6706323623657227,
310
+ "learning_rate": 8.78888888888889e-05,
311
+ "loss": 0.1066,
312
+ "step": 1050
313
+ },
314
+ {
315
+ "epoch": 4.84,
316
+ "grad_norm": 1.9353983402252197,
317
+ "learning_rate": 8.733333333333333e-05,
318
+ "loss": 0.1033,
319
+ "step": 1075
320
+ },
321
+ {
322
+ "epoch": 4.95,
323
+ "grad_norm": 1.833392858505249,
324
+ "learning_rate": 8.677777777777778e-05,
325
+ "loss": 0.1041,
326
+ "step": 1100
327
+ },
328
+ {
329
+ "epoch": 5.07,
330
+ "grad_norm": 1.094043254852295,
331
+ "learning_rate": 8.622222222222222e-05,
332
+ "loss": 0.0782,
333
+ "step": 1125
334
+ },
335
+ {
336
+ "epoch": 5.18,
337
+ "grad_norm": 1.6280676126480103,
338
+ "learning_rate": 8.566666666666667e-05,
339
+ "loss": 0.0604,
340
+ "step": 1150
341
+ },
342
+ {
343
+ "epoch": 5.29,
344
+ "grad_norm": 1.2326525449752808,
345
+ "learning_rate": 8.511111111111112e-05,
346
+ "loss": 0.0665,
347
+ "step": 1175
348
+ },
349
+ {
350
+ "epoch": 5.41,
351
+ "grad_norm": 1.186036467552185,
352
+ "learning_rate": 8.455555555555556e-05,
353
+ "loss": 0.0679,
354
+ "step": 1200
355
+ },
356
+ {
357
+ "epoch": 5.52,
358
+ "grad_norm": 1.3472570180892944,
359
+ "learning_rate": 8.4e-05,
360
+ "loss": 0.0656,
361
+ "step": 1225
362
+ },
363
+ {
364
+ "epoch": 5.63,
365
+ "grad_norm": 2.1403074264526367,
366
+ "learning_rate": 8.344444444444445e-05,
367
+ "loss": 0.0674,
368
+ "step": 1250
369
+ },
370
+ {
371
+ "epoch": 5.74,
372
+ "grad_norm": 1.0580947399139404,
373
+ "learning_rate": 8.28888888888889e-05,
374
+ "loss": 0.0713,
375
+ "step": 1275
376
+ },
377
+ {
378
+ "epoch": 5.86,
379
+ "grad_norm": 1.0808650255203247,
380
+ "learning_rate": 8.233333333333333e-05,
381
+ "loss": 0.0713,
382
+ "step": 1300
383
+ },
384
+ {
385
+ "epoch": 5.97,
386
+ "grad_norm": 1.0721344947814941,
387
+ "learning_rate": 8.177777777777778e-05,
388
+ "loss": 0.0707,
389
+ "step": 1325
390
+ },
391
+ {
392
+ "epoch": 6.08,
393
+ "grad_norm": 1.7433174848556519,
394
+ "learning_rate": 8.122222222222222e-05,
395
+ "loss": 0.0492,
396
+ "step": 1350
397
+ },
398
+ {
399
+ "epoch": 6.19,
400
+ "grad_norm": 0.9549305438995361,
401
+ "learning_rate": 8.066666666666667e-05,
402
+ "loss": 0.0418,
403
+ "step": 1375
404
+ },
405
+ {
406
+ "epoch": 6.31,
407
+ "grad_norm": 1.4030609130859375,
408
+ "learning_rate": 8.011111111111111e-05,
409
+ "loss": 0.0382,
410
+ "step": 1400
411
+ },
412
+ {
413
+ "epoch": 6.42,
414
+ "grad_norm": 0.9085283279418945,
415
+ "learning_rate": 7.955555555555556e-05,
416
+ "loss": 0.0369,
417
+ "step": 1425
418
+ },
419
+ {
420
+ "epoch": 6.53,
421
+ "grad_norm": 1.0393314361572266,
422
+ "learning_rate": 7.900000000000001e-05,
423
+ "loss": 0.0403,
424
+ "step": 1450
425
+ },
426
+ {
427
+ "epoch": 6.64,
428
+ "grad_norm": 0.675774872303009,
429
+ "learning_rate": 7.844444444444446e-05,
430
+ "loss": 0.0414,
431
+ "step": 1475
432
+ },
433
+ {
434
+ "epoch": 6.76,
435
+ "grad_norm": 0.8051535487174988,
436
+ "learning_rate": 7.788888888888888e-05,
437
+ "loss": 0.0426,
438
+ "step": 1500
439
+ },
440
+ {
441
+ "epoch": 6.87,
442
+ "grad_norm": 1.4626388549804688,
443
+ "learning_rate": 7.733333333333333e-05,
444
+ "loss": 0.0436,
445
+ "step": 1525
446
+ },
447
+ {
448
+ "epoch": 6.98,
449
+ "grad_norm": 0.8418045043945312,
450
+ "learning_rate": 7.677777777777778e-05,
451
+ "loss": 0.0442,
452
+ "step": 1550
453
+ },
454
+ {
455
+ "epoch": 7.09,
456
+ "grad_norm": 1.3747352361679077,
457
+ "learning_rate": 7.622222222222223e-05,
458
+ "loss": 0.0281,
459
+ "step": 1575
460
+ },
461
+ {
462
+ "epoch": 7.21,
463
+ "grad_norm": 0.5290963649749756,
464
+ "learning_rate": 7.566666666666667e-05,
465
+ "loss": 0.0237,
466
+ "step": 1600
467
+ },
468
+ {
469
+ "epoch": 7.32,
470
+ "grad_norm": 1.2137552499771118,
471
+ "learning_rate": 7.511111111111111e-05,
472
+ "loss": 0.0249,
473
+ "step": 1625
474
+ },
475
+ {
476
+ "epoch": 7.43,
477
+ "grad_norm": 0.7687398791313171,
478
+ "learning_rate": 7.455555555555556e-05,
479
+ "loss": 0.0261,
480
+ "step": 1650
481
+ },
482
+ {
483
+ "epoch": 7.55,
484
+ "grad_norm": 1.1545344591140747,
485
+ "learning_rate": 7.4e-05,
486
+ "loss": 0.0249,
487
+ "step": 1675
488
+ },
489
+ {
490
+ "epoch": 7.66,
491
+ "grad_norm": 0.7673143148422241,
492
+ "learning_rate": 7.344444444444445e-05,
493
+ "loss": 0.0248,
494
+ "step": 1700
495
+ },
496
+ {
497
+ "epoch": 7.77,
498
+ "grad_norm": 0.9905190467834473,
499
+ "learning_rate": 7.28888888888889e-05,
500
+ "loss": 0.0254,
501
+ "step": 1725
502
+ },
503
+ {
504
+ "epoch": 7.88,
505
+ "grad_norm": 1.764397382736206,
506
+ "learning_rate": 7.233333333333335e-05,
507
+ "loss": 0.0297,
508
+ "step": 1750
509
+ },
510
+ {
511
+ "epoch": 8.0,
512
+ "grad_norm": 0.9069448709487915,
513
+ "learning_rate": 7.177777777777777e-05,
514
+ "loss": 0.0275,
515
+ "step": 1775
516
+ },
517
+ {
518
+ "epoch": 8.11,
519
+ "grad_norm": 1.1385760307312012,
520
+ "learning_rate": 7.122222222222222e-05,
521
+ "loss": 0.0162,
522
+ "step": 1800
523
+ },
524
+ {
525
+ "epoch": 8.22,
526
+ "grad_norm": 0.5694571733474731,
527
+ "learning_rate": 7.066666666666667e-05,
528
+ "loss": 0.0149,
529
+ "step": 1825
530
+ },
531
+ {
532
+ "epoch": 8.33,
533
+ "grad_norm": 1.0839495658874512,
534
+ "learning_rate": 7.011111111111112e-05,
535
+ "loss": 0.0175,
536
+ "step": 1850
537
+ },
538
+ {
539
+ "epoch": 8.45,
540
+ "grad_norm": 0.7086426019668579,
541
+ "learning_rate": 6.955555555555556e-05,
542
+ "loss": 0.0189,
543
+ "step": 1875
544
+ },
545
+ {
546
+ "epoch": 8.56,
547
+ "grad_norm": 0.9548362493515015,
548
+ "learning_rate": 6.9e-05,
549
+ "loss": 0.0193,
550
+ "step": 1900
551
+ },
552
+ {
553
+ "epoch": 8.67,
554
+ "grad_norm": 0.9621508717536926,
555
+ "learning_rate": 6.844444444444445e-05,
556
+ "loss": 0.0186,
557
+ "step": 1925
558
+ },
559
+ {
560
+ "epoch": 8.78,
561
+ "grad_norm": 0.6629220843315125,
562
+ "learning_rate": 6.788888888888888e-05,
563
+ "loss": 0.0171,
564
+ "step": 1950
565
+ },
566
+ {
567
+ "epoch": 8.9,
568
+ "grad_norm": 0.7981088161468506,
569
+ "learning_rate": 6.733333333333333e-05,
570
+ "loss": 0.0175,
571
+ "step": 1975
572
+ },
573
+ {
574
+ "epoch": 9.01,
575
+ "grad_norm": 0.45495709776878357,
576
+ "learning_rate": 6.677777777777779e-05,
577
+ "loss": 0.0167,
578
+ "step": 2000
579
+ },
580
+ {
581
+ "epoch": 9.01,
582
+ "eval_loss": 0.3443203866481781,
583
+ "eval_runtime": 1269.8219,
584
+ "eval_samples_per_second": 2.459,
585
+ "eval_steps_per_second": 0.077,
586
+ "eval_wer": 0.2994668933013984,
587
+ "step": 2000
588
+ },
589
+ {
590
+ "epoch": 9.12,
591
+ "grad_norm": 1.0250108242034912,
592
+ "learning_rate": 6.622222222222224e-05,
593
+ "loss": 0.0124,
594
+ "step": 2025
595
+ },
596
+ {
597
+ "epoch": 9.23,
598
+ "grad_norm": 0.533909022808075,
599
+ "learning_rate": 6.566666666666666e-05,
600
+ "loss": 0.0128,
601
+ "step": 2050
602
+ },
603
+ {
604
+ "epoch": 9.35,
605
+ "grad_norm": 0.5022910237312317,
606
+ "learning_rate": 6.511111111111111e-05,
607
+ "loss": 0.0127,
608
+ "step": 2075
609
+ },
610
+ {
611
+ "epoch": 9.46,
612
+ "grad_norm": 1.3371328115463257,
613
+ "learning_rate": 6.455555555555556e-05,
614
+ "loss": 0.0116,
615
+ "step": 2100
616
+ },
617
+ {
618
+ "epoch": 9.57,
619
+ "grad_norm": 1.2396471500396729,
620
+ "learning_rate": 6.400000000000001e-05,
621
+ "loss": 0.0112,
622
+ "step": 2125
623
+ },
624
+ {
625
+ "epoch": 9.68,
626
+ "grad_norm": 1.2121708393096924,
627
+ "learning_rate": 6.344444444444445e-05,
628
+ "loss": 0.0107,
629
+ "step": 2150
630
+ },
631
+ {
632
+ "epoch": 9.8,
633
+ "grad_norm": 1.3228121995925903,
634
+ "learning_rate": 6.28888888888889e-05,
635
+ "loss": 0.0108,
636
+ "step": 2175
637
+ },
638
+ {
639
+ "epoch": 9.91,
640
+ "grad_norm": 0.6204155683517456,
641
+ "learning_rate": 6.233333333333334e-05,
642
+ "loss": 0.0123,
643
+ "step": 2200
644
+ },
645
+ {
646
+ "epoch": 10.02,
647
+ "grad_norm": 0.4221612811088562,
648
+ "learning_rate": 6.177777777777779e-05,
649
+ "loss": 0.0117,
650
+ "step": 2225
651
+ },
652
+ {
653
+ "epoch": 10.14,
654
+ "grad_norm": 0.8225328922271729,
655
+ "learning_rate": 6.122222222222222e-05,
656
+ "loss": 0.008,
657
+ "step": 2250
658
+ },
659
+ {
660
+ "epoch": 10.25,
661
+ "grad_norm": 0.22648921608924866,
662
+ "learning_rate": 6.066666666666667e-05,
663
+ "loss": 0.0075,
664
+ "step": 2275
665
+ },
666
+ {
667
+ "epoch": 10.36,
668
+ "grad_norm": 1.0620574951171875,
669
+ "learning_rate": 6.011111111111112e-05,
670
+ "loss": 0.0077,
671
+ "step": 2300
672
+ },
673
+ {
674
+ "epoch": 10.47,
675
+ "grad_norm": 0.5009572505950928,
676
+ "learning_rate": 5.9555555555555554e-05,
677
+ "loss": 0.008,
678
+ "step": 2325
679
+ },
680
+ {
681
+ "epoch": 10.59,
682
+ "grad_norm": 0.6466513872146606,
683
+ "learning_rate": 5.9e-05,
684
+ "loss": 0.0098,
685
+ "step": 2350
686
+ },
687
+ {
688
+ "epoch": 10.7,
689
+ "grad_norm": 0.2255641371011734,
690
+ "learning_rate": 5.844444444444445e-05,
691
+ "loss": 0.0094,
692
+ "step": 2375
693
+ },
694
+ {
695
+ "epoch": 10.81,
696
+ "grad_norm": 0.838545560836792,
697
+ "learning_rate": 5.788888888888889e-05,
698
+ "loss": 0.0089,
699
+ "step": 2400
700
+ },
701
+ {
702
+ "epoch": 10.92,
703
+ "grad_norm": 0.6793853044509888,
704
+ "learning_rate": 5.7333333333333336e-05,
705
+ "loss": 0.0087,
706
+ "step": 2425
707
+ },
708
+ {
709
+ "epoch": 11.04,
710
+ "grad_norm": 0.548841655254364,
711
+ "learning_rate": 5.6777777777777786e-05,
712
+ "loss": 0.0069,
713
+ "step": 2450
714
+ },
715
+ {
716
+ "epoch": 11.15,
717
+ "grad_norm": 0.22741466760635376,
718
+ "learning_rate": 5.622222222222222e-05,
719
+ "loss": 0.0065,
720
+ "step": 2475
721
+ },
722
+ {
723
+ "epoch": 11.26,
724
+ "grad_norm": 0.4155316650867462,
725
+ "learning_rate": 5.566666666666667e-05,
726
+ "loss": 0.0058,
727
+ "step": 2500
728
+ },
729
+ {
730
+ "epoch": 11.37,
731
+ "grad_norm": 0.48344260454177856,
732
+ "learning_rate": 5.511111111111111e-05,
733
+ "loss": 0.005,
734
+ "step": 2525
735
+ },
736
+ {
737
+ "epoch": 11.49,
738
+ "grad_norm": 0.9006750583648682,
739
+ "learning_rate": 5.455555555555556e-05,
740
+ "loss": 0.0045,
741
+ "step": 2550
742
+ },
743
+ {
744
+ "epoch": 11.6,
745
+ "grad_norm": 0.9966240525245667,
746
+ "learning_rate": 5.4000000000000005e-05,
747
+ "loss": 0.0047,
748
+ "step": 2575
749
+ },
750
+ {
751
+ "epoch": 11.71,
752
+ "grad_norm": 0.39858147501945496,
753
+ "learning_rate": 5.3444444444444455e-05,
754
+ "loss": 0.0053,
755
+ "step": 2600
756
+ },
757
+ {
758
+ "epoch": 11.82,
759
+ "grad_norm": 0.6118489503860474,
760
+ "learning_rate": 5.2888888888888885e-05,
761
+ "loss": 0.0053,
762
+ "step": 2625
763
+ },
764
+ {
765
+ "epoch": 11.94,
766
+ "grad_norm": 0.5074841976165771,
767
+ "learning_rate": 5.2333333333333336e-05,
768
+ "loss": 0.0057,
769
+ "step": 2650
770
+ },
771
+ {
772
+ "epoch": 12.05,
773
+ "grad_norm": 0.6888458728790283,
774
+ "learning_rate": 5.177777777777778e-05,
775
+ "loss": 0.0053,
776
+ "step": 2675
777
+ },
778
+ {
779
+ "epoch": 12.16,
780
+ "grad_norm": 0.7311161160469055,
781
+ "learning_rate": 5.122222222222223e-05,
782
+ "loss": 0.006,
783
+ "step": 2700
784
+ },
785
+ {
786
+ "epoch": 12.27,
787
+ "grad_norm": 0.47264620661735535,
788
+ "learning_rate": 5.0666666666666674e-05,
789
+ "loss": 0.0058,
790
+ "step": 2725
791
+ },
792
+ {
793
+ "epoch": 12.39,
794
+ "grad_norm": 0.6639235019683838,
795
+ "learning_rate": 5.011111111111111e-05,
796
+ "loss": 0.0052,
797
+ "step": 2750
798
+ },
799
+ {
800
+ "epoch": 12.5,
801
+ "grad_norm": 0.1161256805062294,
802
+ "learning_rate": 4.955555555555556e-05,
803
+ "loss": 0.0038,
804
+ "step": 2775
805
+ },
806
+ {
807
+ "epoch": 12.61,
808
+ "grad_norm": 0.4923400580883026,
809
+ "learning_rate": 4.9e-05,
810
+ "loss": 0.0036,
811
+ "step": 2800
812
+ },
813
+ {
814
+ "epoch": 12.73,
815
+ "grad_norm": 0.6149506568908691,
816
+ "learning_rate": 4.844444444444445e-05,
817
+ "loss": 0.0046,
818
+ "step": 2825
819
+ },
820
+ {
821
+ "epoch": 12.84,
822
+ "grad_norm": 0.16888651251792908,
823
+ "learning_rate": 4.7888888888888886e-05,
824
+ "loss": 0.0041,
825
+ "step": 2850
826
+ },
827
+ {
828
+ "epoch": 12.95,
829
+ "grad_norm": 1.0652014017105103,
830
+ "learning_rate": 4.7333333333333336e-05,
831
+ "loss": 0.0041,
832
+ "step": 2875
833
+ },
834
+ {
835
+ "epoch": 13.06,
836
+ "grad_norm": 0.21759897470474243,
837
+ "learning_rate": 4.677777777777778e-05,
838
+ "loss": 0.003,
839
+ "step": 2900
840
+ },
841
+ {
842
+ "epoch": 13.18,
843
+ "grad_norm": 0.23394200205802917,
844
+ "learning_rate": 4.6222222222222224e-05,
845
+ "loss": 0.0034,
846
+ "step": 2925
847
+ },
848
+ {
849
+ "epoch": 13.29,
850
+ "grad_norm": 0.05768038332462311,
851
+ "learning_rate": 4.566666666666667e-05,
852
+ "loss": 0.0037,
853
+ "step": 2950
854
+ },
855
+ {
856
+ "epoch": 13.4,
857
+ "grad_norm": 0.08611828088760376,
858
+ "learning_rate": 4.511111111111112e-05,
859
+ "loss": 0.0034,
860
+ "step": 2975
861
+ },
862
+ {
863
+ "epoch": 13.51,
864
+ "grad_norm": 0.1028035581111908,
865
+ "learning_rate": 4.4555555555555555e-05,
866
+ "loss": 0.0032,
867
+ "step": 3000
868
+ },
869
+ {
870
+ "epoch": 13.51,
871
+ "eval_loss": 0.3575945198535919,
872
+ "eval_runtime": 1297.8448,
873
+ "eval_samples_per_second": 2.406,
874
+ "eval_steps_per_second": 0.076,
875
+ "eval_wer": 0.27779494707563934,
876
+ "step": 3000
877
+ },
878
+ {
879
+ "epoch": 13.63,
880
+ "grad_norm": 0.23182912170886993,
881
+ "learning_rate": 4.4000000000000006e-05,
882
+ "loss": 0.0027,
883
+ "step": 3025
884
+ },
885
+ {
886
+ "epoch": 13.74,
887
+ "grad_norm": 0.100206658244133,
888
+ "learning_rate": 4.344444444444445e-05,
889
+ "loss": 0.0027,
890
+ "step": 3050
891
+ },
892
+ {
893
+ "epoch": 13.85,
894
+ "grad_norm": 0.9118719100952148,
895
+ "learning_rate": 4.2888888888888886e-05,
896
+ "loss": 0.003,
897
+ "step": 3075
898
+ },
899
+ {
900
+ "epoch": 13.96,
901
+ "grad_norm": 0.06793611496686935,
902
+ "learning_rate": 4.233333333333334e-05,
903
+ "loss": 0.003,
904
+ "step": 3100
905
+ },
906
+ {
907
+ "epoch": 14.08,
908
+ "grad_norm": 0.0683990940451622,
909
+ "learning_rate": 4.177777777777778e-05,
910
+ "loss": 0.0021,
911
+ "step": 3125
912
+ },
913
+ {
914
+ "epoch": 14.19,
915
+ "grad_norm": 0.19087089598178864,
916
+ "learning_rate": 4.1222222222222224e-05,
917
+ "loss": 0.0028,
918
+ "step": 3150
919
+ },
920
+ {
921
+ "epoch": 14.3,
922
+ "grad_norm": 0.14526407420635223,
923
+ "learning_rate": 4.066666666666667e-05,
924
+ "loss": 0.0025,
925
+ "step": 3175
926
+ },
927
+ {
928
+ "epoch": 14.41,
929
+ "grad_norm": 0.5902572870254517,
930
+ "learning_rate": 4.011111111111111e-05,
931
+ "loss": 0.0031,
932
+ "step": 3200
933
+ },
934
+ {
935
+ "epoch": 14.53,
936
+ "grad_norm": 0.1988796442747116,
937
+ "learning_rate": 3.9555555555555556e-05,
938
+ "loss": 0.0021,
939
+ "step": 3225
940
+ },
941
+ {
942
+ "epoch": 14.64,
943
+ "grad_norm": 0.178738534450531,
944
+ "learning_rate": 3.9000000000000006e-05,
945
+ "loss": 0.0031,
946
+ "step": 3250
947
+ },
948
+ {
949
+ "epoch": 14.75,
950
+ "grad_norm": 0.03732344135642052,
951
+ "learning_rate": 3.844444444444444e-05,
952
+ "loss": 0.0026,
953
+ "step": 3275
954
+ },
955
+ {
956
+ "epoch": 14.86,
957
+ "grad_norm": 0.047354888170957565,
958
+ "learning_rate": 3.7888888888888894e-05,
959
+ "loss": 0.0016,
960
+ "step": 3300
961
+ },
962
+ {
963
+ "epoch": 14.98,
964
+ "grad_norm": 0.058274924755096436,
965
+ "learning_rate": 3.733333333333334e-05,
966
+ "loss": 0.0019,
967
+ "step": 3325
968
+ },
969
+ {
970
+ "epoch": 15.09,
971
+ "grad_norm": 1.4180477857589722,
972
+ "learning_rate": 3.677777777777778e-05,
973
+ "loss": 0.0016,
974
+ "step": 3350
975
+ },
976
+ {
977
+ "epoch": 15.2,
978
+ "grad_norm": 0.03281378000974655,
979
+ "learning_rate": 3.6222222222222225e-05,
980
+ "loss": 0.0016,
981
+ "step": 3375
982
+ },
983
+ {
984
+ "epoch": 15.32,
985
+ "grad_norm": 0.2159404158592224,
986
+ "learning_rate": 3.566666666666667e-05,
987
+ "loss": 0.0026,
988
+ "step": 3400
989
+ },
990
+ {
991
+ "epoch": 15.43,
992
+ "grad_norm": 0.18890638649463654,
993
+ "learning_rate": 3.511111111111111e-05,
994
+ "loss": 0.0016,
995
+ "step": 3425
996
+ },
997
+ {
998
+ "epoch": 15.54,
999
+ "grad_norm": 0.022921651601791382,
1000
+ "learning_rate": 3.4555555555555556e-05,
1001
+ "loss": 0.0012,
1002
+ "step": 3450
1003
+ },
1004
+ {
1005
+ "epoch": 15.65,
1006
+ "grad_norm": 0.02838265895843506,
1007
+ "learning_rate": 3.4000000000000007e-05,
1008
+ "loss": 0.0014,
1009
+ "step": 3475
1010
+ },
1011
+ {
1012
+ "epoch": 15.77,
1013
+ "grad_norm": 0.04957688972353935,
1014
+ "learning_rate": 3.3444444444444443e-05,
1015
+ "loss": 0.0012,
1016
+ "step": 3500
1017
+ },
1018
+ {
1019
+ "epoch": 15.88,
1020
+ "grad_norm": 0.03910296410322189,
1021
+ "learning_rate": 3.2888888888888894e-05,
1022
+ "loss": 0.0008,
1023
+ "step": 3525
1024
+ },
1025
+ {
1026
+ "epoch": 15.99,
1027
+ "grad_norm": 0.3031899034976959,
1028
+ "learning_rate": 3.233333333333333e-05,
1029
+ "loss": 0.0015,
1030
+ "step": 3550
1031
+ },
1032
+ {
1033
+ "epoch": 16.1,
1034
+ "grad_norm": 0.026370937004685402,
1035
+ "learning_rate": 3.177777777777778e-05,
1036
+ "loss": 0.0009,
1037
+ "step": 3575
1038
+ },
1039
+ {
1040
+ "epoch": 16.22,
1041
+ "grad_norm": 0.04645024240016937,
1042
+ "learning_rate": 3.1222222222222225e-05,
1043
+ "loss": 0.0014,
1044
+ "step": 3600
1045
+ },
1046
+ {
1047
+ "epoch": 16.33,
1048
+ "grad_norm": 0.03346904739737511,
1049
+ "learning_rate": 3.066666666666667e-05,
1050
+ "loss": 0.001,
1051
+ "step": 3625
1052
+ },
1053
+ {
1054
+ "epoch": 16.44,
1055
+ "grad_norm": 0.41791531443595886,
1056
+ "learning_rate": 3.0111111111111113e-05,
1057
+ "loss": 0.0019,
1058
+ "step": 3650
1059
+ },
1060
+ {
1061
+ "epoch": 16.55,
1062
+ "grad_norm": 0.023621816188097,
1063
+ "learning_rate": 2.955555555555556e-05,
1064
+ "loss": 0.0009,
1065
+ "step": 3675
1066
+ },
1067
+ {
1068
+ "epoch": 16.67,
1069
+ "grad_norm": 0.020701350644230843,
1070
+ "learning_rate": 2.9e-05,
1071
+ "loss": 0.0009,
1072
+ "step": 3700
1073
+ },
1074
+ {
1075
+ "epoch": 16.78,
1076
+ "grad_norm": 0.018095409497618675,
1077
+ "learning_rate": 2.8444444444444447e-05,
1078
+ "loss": 0.0007,
1079
+ "step": 3725
1080
+ },
1081
+ {
1082
+ "epoch": 16.89,
1083
+ "grad_norm": 0.03800148516893387,
1084
+ "learning_rate": 2.788888888888889e-05,
1085
+ "loss": 0.001,
1086
+ "step": 3750
1087
+ },
1088
+ {
1089
+ "epoch": 17.0,
1090
+ "grad_norm": 0.0219491608440876,
1091
+ "learning_rate": 2.733333333333333e-05,
1092
+ "loss": 0.0012,
1093
+ "step": 3775
1094
+ },
1095
+ {
1096
+ "epoch": 17.12,
1097
+ "grad_norm": 0.19971542060375214,
1098
+ "learning_rate": 2.677777777777778e-05,
1099
+ "loss": 0.001,
1100
+ "step": 3800
1101
+ },
1102
+ {
1103
+ "epoch": 17.23,
1104
+ "grad_norm": 0.022324278950691223,
1105
+ "learning_rate": 2.6222222222222226e-05,
1106
+ "loss": 0.0005,
1107
+ "step": 3825
1108
+ },
1109
+ {
1110
+ "epoch": 17.34,
1111
+ "grad_norm": 0.014598184265196323,
1112
+ "learning_rate": 2.5666666666666666e-05,
1113
+ "loss": 0.0007,
1114
+ "step": 3850
1115
+ },
1116
+ {
1117
+ "epoch": 17.45,
1118
+ "grad_norm": 0.01482320111244917,
1119
+ "learning_rate": 2.5111111111111113e-05,
1120
+ "loss": 0.0008,
1121
+ "step": 3875
1122
+ },
1123
+ {
1124
+ "epoch": 17.57,
1125
+ "grad_norm": 0.019341906532645226,
1126
+ "learning_rate": 2.4555555555555557e-05,
1127
+ "loss": 0.0005,
1128
+ "step": 3900
1129
+ },
1130
+ {
1131
+ "epoch": 17.68,
1132
+ "grad_norm": 0.044308606535196304,
1133
+ "learning_rate": 2.4e-05,
1134
+ "loss": 0.0008,
1135
+ "step": 3925
1136
+ },
1137
+ {
1138
+ "epoch": 17.79,
1139
+ "grad_norm": 0.01700867898762226,
1140
+ "learning_rate": 2.3444444444444448e-05,
1141
+ "loss": 0.0009,
1142
+ "step": 3950
1143
+ },
1144
+ {
1145
+ "epoch": 17.91,
1146
+ "grad_norm": 0.01428561843931675,
1147
+ "learning_rate": 2.288888888888889e-05,
1148
+ "loss": 0.0004,
1149
+ "step": 3975
1150
+ },
1151
+ {
1152
+ "epoch": 18.02,
1153
+ "grad_norm": 0.011909844353795052,
1154
+ "learning_rate": 2.2333333333333335e-05,
1155
+ "loss": 0.0004,
1156
+ "step": 4000
1157
+ },
1158
+ {
1159
+ "epoch": 18.02,
1160
+ "eval_loss": 0.36695417761802673,
1161
+ "eval_runtime": 1296.9402,
1162
+ "eval_samples_per_second": 2.408,
1163
+ "eval_steps_per_second": 0.076,
1164
+ "eval_wer": 0.2677122769064359,
1165
+ "step": 4000
1166
+ },
1167
+ {
1168
+ "epoch": 18.13,
1169
+ "grad_norm": 0.011953528970479965,
1170
+ "learning_rate": 2.177777777777778e-05,
1171
+ "loss": 0.0004,
1172
+ "step": 4025
1173
+ },
1174
+ {
1175
+ "epoch": 18.24,
1176
+ "grad_norm": 0.013035556301474571,
1177
+ "learning_rate": 2.1222222222222223e-05,
1178
+ "loss": 0.0005,
1179
+ "step": 4050
1180
+ },
1181
+ {
1182
+ "epoch": 18.36,
1183
+ "grad_norm": 0.011018014512956142,
1184
+ "learning_rate": 2.0666666666666666e-05,
1185
+ "loss": 0.0003,
1186
+ "step": 4075
1187
+ },
1188
+ {
1189
+ "epoch": 18.47,
1190
+ "grad_norm": 0.011594709008932114,
1191
+ "learning_rate": 2.011111111111111e-05,
1192
+ "loss": 0.0004,
1193
+ "step": 4100
1194
+ },
1195
+ {
1196
+ "epoch": 18.58,
1197
+ "grad_norm": 0.01165748666971922,
1198
+ "learning_rate": 1.9555555555555557e-05,
1199
+ "loss": 0.0003,
1200
+ "step": 4125
1201
+ },
1202
+ {
1203
+ "epoch": 18.69,
1204
+ "grad_norm": 0.012751756235957146,
1205
+ "learning_rate": 1.9e-05,
1206
+ "loss": 0.0003,
1207
+ "step": 4150
1208
+ },
1209
+ {
1210
+ "epoch": 18.81,
1211
+ "grad_norm": 0.01092427410185337,
1212
+ "learning_rate": 1.8444444444444445e-05,
1213
+ "loss": 0.0003,
1214
+ "step": 4175
1215
+ },
1216
+ {
1217
+ "epoch": 18.92,
1218
+ "grad_norm": 0.010369419120252132,
1219
+ "learning_rate": 1.788888888888889e-05,
1220
+ "loss": 0.0007,
1221
+ "step": 4200
1222
+ },
1223
+ {
1224
+ "epoch": 19.03,
1225
+ "grad_norm": 0.009451022371649742,
1226
+ "learning_rate": 1.7333333333333336e-05,
1227
+ "loss": 0.0004,
1228
+ "step": 4225
1229
+ },
1230
+ {
1231
+ "epoch": 19.14,
1232
+ "grad_norm": 0.010264468379318714,
1233
+ "learning_rate": 1.677777777777778e-05,
1234
+ "loss": 0.0003,
1235
+ "step": 4250
1236
+ },
1237
+ {
1238
+ "epoch": 19.26,
1239
+ "grad_norm": 0.009353878907859325,
1240
+ "learning_rate": 1.6222222222222223e-05,
1241
+ "loss": 0.0003,
1242
+ "step": 4275
1243
+ },
1244
+ {
1245
+ "epoch": 19.37,
1246
+ "grad_norm": 0.007795905694365501,
1247
+ "learning_rate": 1.5666666666666667e-05,
1248
+ "loss": 0.0003,
1249
+ "step": 4300
1250
+ },
1251
+ {
1252
+ "epoch": 19.48,
1253
+ "grad_norm": 0.009554468095302582,
1254
+ "learning_rate": 1.5111111111111112e-05,
1255
+ "loss": 0.0004,
1256
+ "step": 4325
1257
+ },
1258
+ {
1259
+ "epoch": 19.59,
1260
+ "grad_norm": 0.009386077523231506,
1261
+ "learning_rate": 1.4555555555555556e-05,
1262
+ "loss": 0.0003,
1263
+ "step": 4350
1264
+ },
1265
+ {
1266
+ "epoch": 19.71,
1267
+ "grad_norm": 0.007565716747194529,
1268
+ "learning_rate": 1.4000000000000001e-05,
1269
+ "loss": 0.0004,
1270
+ "step": 4375
1271
+ },
1272
+ {
1273
+ "epoch": 19.82,
1274
+ "grad_norm": 0.011739292182028294,
1275
+ "learning_rate": 1.3444444444444445e-05,
1276
+ "loss": 0.0003,
1277
+ "step": 4400
1278
+ },
1279
+ {
1280
+ "epoch": 19.93,
1281
+ "grad_norm": 0.011955379508435726,
1282
+ "learning_rate": 1.2888888888888889e-05,
1283
+ "loss": 0.0004,
1284
+ "step": 4425
1285
+ },
1286
+ {
1287
+ "epoch": 20.05,
1288
+ "grad_norm": 0.007369581609964371,
1289
+ "learning_rate": 1.2333333333333334e-05,
1290
+ "loss": 0.0004,
1291
+ "step": 4450
1292
+ },
1293
+ {
1294
+ "epoch": 20.16,
1295
+ "grad_norm": 0.010209435597062111,
1296
+ "learning_rate": 1.1777777777777778e-05,
1297
+ "loss": 0.0003,
1298
+ "step": 4475
1299
+ },
1300
+ {
1301
+ "epoch": 20.27,
1302
+ "grad_norm": 0.009368482045829296,
1303
+ "learning_rate": 1.1222222222222224e-05,
1304
+ "loss": 0.0003,
1305
+ "step": 4500
1306
+ },
1307
+ {
1308
+ "epoch": 20.38,
1309
+ "grad_norm": 0.008915912359952927,
1310
+ "learning_rate": 1.0666666666666667e-05,
1311
+ "loss": 0.0003,
1312
+ "step": 4525
1313
+ },
1314
+ {
1315
+ "epoch": 20.5,
1316
+ "grad_norm": 0.01048735436052084,
1317
+ "learning_rate": 1.0111111111111111e-05,
1318
+ "loss": 0.0003,
1319
+ "step": 4550
1320
+ },
1321
+ {
1322
+ "epoch": 20.61,
1323
+ "grad_norm": 0.010569226928055286,
1324
+ "learning_rate": 9.555555555555556e-06,
1325
+ "loss": 0.0003,
1326
+ "step": 4575
1327
+ },
1328
+ {
1329
+ "epoch": 20.72,
1330
+ "grad_norm": 0.008401792496442795,
1331
+ "learning_rate": 9e-06,
1332
+ "loss": 0.0003,
1333
+ "step": 4600
1334
+ },
1335
+ {
1336
+ "epoch": 20.83,
1337
+ "grad_norm": 0.01062182616442442,
1338
+ "learning_rate": 8.444444444444446e-06,
1339
+ "loss": 0.0003,
1340
+ "step": 4625
1341
+ },
1342
+ {
1343
+ "epoch": 20.95,
1344
+ "grad_norm": 0.007442856673151255,
1345
+ "learning_rate": 7.88888888888889e-06,
1346
+ "loss": 0.0004,
1347
+ "step": 4650
1348
+ },
1349
+ {
1350
+ "epoch": 21.06,
1351
+ "grad_norm": 0.007747430354356766,
1352
+ "learning_rate": 7.333333333333334e-06,
1353
+ "loss": 0.0003,
1354
+ "step": 4675
1355
+ },
1356
+ {
1357
+ "epoch": 21.17,
1358
+ "grad_norm": 0.008953329175710678,
1359
+ "learning_rate": 6.777777777777779e-06,
1360
+ "loss": 0.0003,
1361
+ "step": 4700
1362
+ },
1363
+ {
1364
+ "epoch": 21.28,
1365
+ "grad_norm": 0.0087329912930727,
1366
+ "learning_rate": 6.222222222222222e-06,
1367
+ "loss": 0.0003,
1368
+ "step": 4725
1369
+ },
1370
+ {
1371
+ "epoch": 21.4,
1372
+ "grad_norm": 0.007937785238027573,
1373
+ "learning_rate": 5.666666666666667e-06,
1374
+ "loss": 0.0003,
1375
+ "step": 4750
1376
+ },
1377
+ {
1378
+ "epoch": 21.51,
1379
+ "grad_norm": 0.007708992809057236,
1380
+ "learning_rate": 5.1111111111111115e-06,
1381
+ "loss": 0.0003,
1382
+ "step": 4775
1383
+ },
1384
+ {
1385
+ "epoch": 21.62,
1386
+ "grad_norm": 0.011778591200709343,
1387
+ "learning_rate": 4.555555555555556e-06,
1388
+ "loss": 0.0003,
1389
+ "step": 4800
1390
+ },
1391
+ {
1392
+ "epoch": 21.73,
1393
+ "grad_norm": 0.00828944519162178,
1394
+ "learning_rate": 4.000000000000001e-06,
1395
+ "loss": 0.0002,
1396
+ "step": 4825
1397
+ },
1398
+ {
1399
+ "epoch": 21.85,
1400
+ "grad_norm": 0.007438404019922018,
1401
+ "learning_rate": 3.4444444444444444e-06,
1402
+ "loss": 0.0003,
1403
+ "step": 4850
1404
+ },
1405
+ {
1406
+ "epoch": 21.96,
1407
+ "grad_norm": 0.007443991024047136,
1408
+ "learning_rate": 2.888888888888889e-06,
1409
+ "loss": 0.0003,
1410
+ "step": 4875
1411
+ },
1412
+ {
1413
+ "epoch": 22.07,
1414
+ "grad_norm": 0.008769960142672062,
1415
+ "learning_rate": 2.3333333333333336e-06,
1416
+ "loss": 0.0003,
1417
+ "step": 4900
1418
+ },
1419
+ {
1420
+ "epoch": 22.18,
1421
+ "grad_norm": 0.008519369177520275,
1422
+ "learning_rate": 1.777777777777778e-06,
1423
+ "loss": 0.0003,
1424
+ "step": 4925
1425
+ },
1426
+ {
1427
+ "epoch": 22.3,
1428
+ "grad_norm": 0.007310151122510433,
1429
+ "learning_rate": 1.2222222222222223e-06,
1430
+ "loss": 0.0002,
1431
+ "step": 4950
1432
+ },
1433
+ {
1434
+ "epoch": 22.41,
1435
+ "grad_norm": 0.0072664907202124596,
1436
+ "learning_rate": 6.666666666666667e-07,
1437
+ "loss": 0.0002,
1438
+ "step": 4975
1439
+ },
1440
+ {
1441
+ "epoch": 22.52,
1442
+ "grad_norm": 0.00765978591516614,
1443
+ "learning_rate": 1.1111111111111112e-07,
1444
+ "loss": 0.0003,
1445
+ "step": 5000
1446
+ },
1447
+ {
1448
+ "epoch": 22.52,
1449
+ "eval_loss": 0.3728739619255066,
1450
+ "eval_runtime": 1237.2295,
1451
+ "eval_samples_per_second": 2.524,
1452
+ "eval_steps_per_second": 0.079,
1453
+ "eval_wer": 0.2660897782585181,
1454
+ "step": 5000
1455
+ },
1456
+ {
1457
+ "epoch": 22.52,
1458
+ "step": 5000,
1459
+ "total_flos": 2.532745423355904e+20,
1460
+ "train_loss": 0.17524469082718716,
1461
+ "train_runtime": 15083.6622,
1462
+ "train_samples_per_second": 10.608,
1463
+ "train_steps_per_second": 0.331
1464
+ }
1465
+ ],
1466
+ "logging_steps": 25,
1467
+ "max_steps": 5000,
1468
+ "num_input_tokens_seen": 0,
1469
+ "num_train_epochs": 23,
1470
+ "save_steps": 1000,
1471
+ "total_flos": 2.532745423355904e+20,
1472
+ "train_batch_size": 32,
1473
+ "trial_name": null,
1474
+ "trial_params": null
1475
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f27ea4e0d69ad73da18d7df2aac11132046f87eda8cb3c5ff28639d1fba157c7
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad2da03ce66289217d1fa50c2801f63f453b9bfc44d54b73414b3331a94379e0
3
  size 5048
wandb/debug-internal.log CHANGED
The diff for this file is too large to render. See raw diff
 
wandb/debug.log CHANGED
@@ -1,28 +1,28 @@
1
- 2024-03-27 14:10:33,211 INFO MainThread:1482814 [wandb_setup.py:_flush():76] Current SDK version is 0.16.2
2
- 2024-03-27 14:10:33,211 INFO MainThread:1482814 [wandb_setup.py:_flush():76] Configure stats pid to 1482814
3
- 2024-03-27 14:10:33,211 INFO MainThread:1482814 [wandb_setup.py:_flush():76] Loading settings from /home/sanchit/.config/wandb/settings
4
- 2024-03-27 14:10:33,211 INFO MainThread:1482814 [wandb_setup.py:_flush():76] Loading settings from /home/sanchit/distil-large-v3-hi-ft-frozen-encoder/wandb/settings
5
- 2024-03-27 14:10:33,211 INFO MainThread:1482814 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
6
- 2024-03-27 14:10:33,211 INFO MainThread:1482814 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
- 2024-03-27 14:10:33,211 INFO MainThread:1482814 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_speech_recognition_seq2seq.py', 'program_abspath': '/home/sanchit/distil-large-v3-hi-ft-frozen-encoder/run_speech_recognition_seq2seq.py', 'program': 'run_speech_recognition_seq2seq.py'}
8
- 2024-03-27 14:10:33,211 INFO MainThread:1482814 [wandb_init.py:_log_setup():526] Logging user logs to /home/sanchit/distil-large-v3-hi-ft-frozen-encoder/wandb/run-20240327_141033-golaq7b9/logs/debug.log
9
- 2024-03-27 14:10:33,211 INFO MainThread:1482814 [wandb_init.py:_log_setup():527] Logging internal logs to /home/sanchit/distil-large-v3-hi-ft-frozen-encoder/wandb/run-20240327_141033-golaq7b9/logs/debug-internal.log
10
- 2024-03-27 14:10:33,211 INFO MainThread:1482814 [wandb_init.py:init():566] calling init triggers
11
- 2024-03-27 14:10:33,211 INFO MainThread:1482814 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
  config: {}
13
- 2024-03-27 14:10:33,211 INFO MainThread:1482814 [wandb_init.py:init():616] starting backend
14
- 2024-03-27 14:10:33,211 INFO MainThread:1482814 [wandb_init.py:init():620] setting up manager
15
- 2024-03-27 14:10:33,212 INFO MainThread:1482814 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
- 2024-03-27 14:10:33,213 INFO MainThread:1482814 [wandb_init.py:init():628] backend started and connected
17
- 2024-03-27 14:10:33,217 INFO MainThread:1482814 [wandb_init.py:init():720] updated telemetry
18
- 2024-03-27 14:10:33,272 INFO MainThread:1482814 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
- 2024-03-27 14:10:33,578 INFO MainThread:1482814 [wandb_run.py:_on_init():2254] communicating current version
20
- 2024-03-27 14:10:33,602 INFO MainThread:1482814 [wandb_run.py:_on_init():2263] got version response upgrade_message: "wandb version 0.16.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
 
22
- 2024-03-27 14:10:33,602 INFO MainThread:1482814 [wandb_init.py:init():804] starting run threads in backend
23
- 2024-03-27 14:10:34,006 INFO MainThread:1482814 [wandb_run.py:_console_start():2233] atexit reg
24
- 2024-03-27 14:10:34,006 INFO MainThread:1482814 [wandb_run.py:_redirect():2088] redirect: wrap_raw
25
- 2024-03-27 14:10:34,006 INFO MainThread:1482814 [wandb_run.py:_redirect():2153] Wrapping output streams.
26
- 2024-03-27 14:10:34,007 INFO MainThread:1482814 [wandb_run.py:_redirect():2178] Redirects installed.
27
- 2024-03-27 14:10:34,007 INFO MainThread:1482814 [wandb_init.py:init():847] run started, returning control to user process
28
- 2024-03-27 14:10:34,009 INFO MainThread:1482814 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 51866, 'num_mel_bins': 128, 'd_model': 1280, 'encoder_layers': 32, 'encoder_attention_heads': 20, 'decoder_layers': 2, 'decoder_attention_heads': 20, 'decoder_ffn_dim': 5120, 'encoder_ffn_dim': 5120, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.0, 'activation_function': 'gelu', 'init_std': 0.02, 'encoder_layerdrop': 0.0, 'decoder_layerdrop': 0.0, 'use_cache': True, 'num_hidden_layers': 32, 'scale_embedding': False, 'max_source_positions': 1500, 'max_target_positions': 448, 'classifier_proj_size': 256, 'use_weighted_layer_sum': False, 'apply_spec_augment': False, 'mask_time_prob': 0.05, 'mask_time_length': 10, 'mask_time_min_masks': 2, 'mask_feature_prob': 0.0, 'mask_feature_length': 10, 'mask_feature_min_masks': 0, 'median_filter_width': 7, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': True, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 448, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': [220, 50257], 'architectures': ['WhisperForConditionalGeneration'], 'finetuning_task': 
None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 50257, 'pad_token_id': 50256, 'eos_token_id': 50257, 'sep_token_id': None, 'decoder_start_token_id': 50258, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'distil-whisper/distil-large-v3', 'transformers_version': '4.40.0.dev0', 'model_type': 'whisper', 'forced_decoder_ids': None, 'output_dir': './', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 5000, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 500, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Mar27_14-10-22_hf-dgx-01', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 1000, 'dataloader_num_workers': 4, 
'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'input_length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'sortish_sampler': False, 'predict_with_generate': True, 'generation_max_length': 225, 'generation_num_beams': None, 'generation_config': 
None}
 
1
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_setup.py:_flush():76] Current SDK version is 0.16.2
2
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_setup.py:_flush():76] Configure stats pid to 1894903
3
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_setup.py:_flush():76] Loading settings from /home/sanchit/.config/wandb/settings
4
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_setup.py:_flush():76] Loading settings from /home/sanchit/distil-large-v3-hi-ft-frozen-encoder/wandb/settings
5
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
6
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_speech_recognition_seq2seq.py', 'program_abspath': '/home/sanchit/distil-large-v3-hi-ft-frozen-encoder/run_speech_recognition_seq2seq.py', 'program': 'run_speech_recognition_seq2seq.py'}
8
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_init.py:_log_setup():526] Logging user logs to /home/sanchit/distil-large-v3-hi-ft-frozen-encoder/wandb/run-20240327_190513-7p2x8a0l/logs/debug.log
9
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_init.py:_log_setup():527] Logging internal logs to /home/sanchit/distil-large-v3-hi-ft-frozen-encoder/wandb/run-20240327_190513-7p2x8a0l/logs/debug-internal.log
10
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_init.py:init():566] calling init triggers
11
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
  config: {}
13
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_init.py:init():616] starting backend
14
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_init.py:init():620] setting up manager
15
+ 2024-03-27 19:05:13,646 INFO MainThread:1894903 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-03-27 19:05:13,647 INFO MainThread:1894903 [wandb_init.py:init():628] backend started and connected
17
+ 2024-03-27 19:05:13,651 INFO MainThread:1894903 [wandb_init.py:init():720] updated telemetry
18
+ 2024-03-27 19:05:13,720 INFO MainThread:1894903 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-03-27 19:05:14,027 INFO MainThread:1894903 [wandb_run.py:_on_init():2254] communicating current version
20
+ 2024-03-27 19:05:14,056 INFO MainThread:1894903 [wandb_run.py:_on_init():2263] got version response upgrade_message: "wandb version 0.16.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
 
22
+ 2024-03-27 19:05:14,056 INFO MainThread:1894903 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-03-27 19:05:14,727 INFO MainThread:1894903 [wandb_run.py:_console_start():2233] atexit reg
24
+ 2024-03-27 19:05:14,727 INFO MainThread:1894903 [wandb_run.py:_redirect():2088] redirect: wrap_raw
25
+ 2024-03-27 19:05:14,727 INFO MainThread:1894903 [wandb_run.py:_redirect():2153] Wrapping output streams.
26
+ 2024-03-27 19:05:14,727 INFO MainThread:1894903 [wandb_run.py:_redirect():2178] Redirects installed.
27
+ 2024-03-27 19:05:14,727 INFO MainThread:1894903 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-03-27 19:05:14,729 INFO MainThread:1894903 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 51866, 'num_mel_bins': 128, 'd_model': 1280, 'encoder_layers': 32, 'encoder_attention_heads': 20, 'decoder_layers': 2, 'decoder_attention_heads': 20, 'decoder_ffn_dim': 5120, 'encoder_ffn_dim': 5120, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.0, 'activation_function': 'gelu', 'init_std': 0.02, 'encoder_layerdrop': 0.0, 'decoder_layerdrop': 0.0, 'use_cache': True, 'num_hidden_layers': 32, 'scale_embedding': False, 'max_source_positions': 1500, 'max_target_positions': 448, 'classifier_proj_size': 256, 'use_weighted_layer_sum': False, 'apply_spec_augment': False, 'mask_time_prob': 0.05, 'mask_time_length': 10, 'mask_time_min_masks': 2, 'mask_feature_prob': 0.0, 'mask_feature_length': 10, 'mask_feature_min_masks': 0, 'median_filter_width': 7, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': True, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 448, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': [220, 50257], 'architectures': ['WhisperForConditionalGeneration'], 'finetuning_task': 
None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 50257, 'pad_token_id': 50256, 'eos_token_id': 50257, 'sep_token_id': None, 'decoder_start_token_id': 50258, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'distil-whisper/distil-large-v3', 'transformers_version': '4.40.0.dev0', 'model_type': 'whisper', 'forced_decoder_ids': None, 'output_dir': './', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 5000, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 500, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Mar27_19-04-58_hf-dgx-01', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 1000, 'dataloader_num_workers': 4, 
'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'input_length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'sortish_sampler': False, 'predict_with_generate': True, 'generation_max_length': 225, 'generation_num_beams': None, 'generation_config': 
None}
wandb/run-20240327_190513-7p2x8a0l/files/config.yaml ADDED
@@ -0,0 +1,751 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ python_version: 3.8.10
7
+ cli_version: 0.16.2
8
+ framework: huggingface
9
+ huggingface_version: 4.40.0.dev0
10
+ is_jupyter_run: false
11
+ is_kaggle_kernel: false
12
+ start_time: 1711562713.647808
13
+ t:
14
+ 1:
15
+ - 1
16
+ - 2
17
+ - 3
18
+ - 5
19
+ - 11
20
+ - 12
21
+ - 49
22
+ - 51
23
+ - 53
24
+ - 55
25
+ - 71
26
+ - 98
27
+ - 100
28
+ 2:
29
+ - 1
30
+ - 2
31
+ - 3
32
+ - 5
33
+ - 11
34
+ - 12
35
+ - 49
36
+ - 51
37
+ - 53
38
+ - 55
39
+ - 71
40
+ - 98
41
+ - 100
42
+ 3:
43
+ - 7
44
+ - 23
45
+ 4: 3.8.10
46
+ 5: 0.16.2
47
+ 6: 4.40.0.dev0
48
+ 8:
49
+ - 5
50
+ 9:
51
+ 1: transformers_trainer
52
+ 13: linux-x86_64
53
+ m:
54
+ - 1: train/global_step
55
+ 6:
56
+ - 3
57
+ - 1: train/loss
58
+ 5: 1
59
+ 6:
60
+ - 1
61
+ - 1: train/grad_norm
62
+ 5: 1
63
+ 6:
64
+ - 1
65
+ - 1: train/learning_rate
66
+ 5: 1
67
+ 6:
68
+ - 1
69
+ - 1: train/epoch
70
+ 5: 1
71
+ 6:
72
+ - 1
73
+ - 1: eval/loss
74
+ 5: 1
75
+ 6:
76
+ - 1
77
+ - 1: eval/wer
78
+ 5: 1
79
+ 6:
80
+ - 1
81
+ - 1: eval/runtime
82
+ 5: 1
83
+ 6:
84
+ - 1
85
+ - 1: eval/samples_per_second
86
+ 5: 1
87
+ 6:
88
+ - 1
89
+ - 1: eval/steps_per_second
90
+ 5: 1
91
+ 6:
92
+ - 1
93
+ vocab_size:
94
+ desc: null
95
+ value: 51866
96
+ num_mel_bins:
97
+ desc: null
98
+ value: 128
99
+ d_model:
100
+ desc: null
101
+ value: 1280
102
+ encoder_layers:
103
+ desc: null
104
+ value: 32
105
+ encoder_attention_heads:
106
+ desc: null
107
+ value: 20
108
+ decoder_layers:
109
+ desc: null
110
+ value: 2
111
+ decoder_attention_heads:
112
+ desc: null
113
+ value: 20
114
+ decoder_ffn_dim:
115
+ desc: null
116
+ value: 5120
117
+ encoder_ffn_dim:
118
+ desc: null
119
+ value: 5120
120
+ dropout:
121
+ desc: null
122
+ value: 0.0
123
+ attention_dropout:
124
+ desc: null
125
+ value: 0.0
126
+ activation_dropout:
127
+ desc: null
128
+ value: 0.0
129
+ activation_function:
130
+ desc: null
131
+ value: gelu
132
+ init_std:
133
+ desc: null
134
+ value: 0.02
135
+ encoder_layerdrop:
136
+ desc: null
137
+ value: 0.0
138
+ decoder_layerdrop:
139
+ desc: null
140
+ value: 0.0
141
+ use_cache:
142
+ desc: null
143
+ value: true
144
+ num_hidden_layers:
145
+ desc: null
146
+ value: 32
147
+ scale_embedding:
148
+ desc: null
149
+ value: false
150
+ max_source_positions:
151
+ desc: null
152
+ value: 1500
153
+ max_target_positions:
154
+ desc: null
155
+ value: 448
156
+ classifier_proj_size:
157
+ desc: null
158
+ value: 256
159
+ use_weighted_layer_sum:
160
+ desc: null
161
+ value: false
162
+ apply_spec_augment:
163
+ desc: null
164
+ value: false
165
+ mask_time_prob:
166
+ desc: null
167
+ value: 0.05
168
+ mask_time_length:
169
+ desc: null
170
+ value: 10
171
+ mask_time_min_masks:
172
+ desc: null
173
+ value: 2
174
+ mask_feature_prob:
175
+ desc: null
176
+ value: 0.0
177
+ mask_feature_length:
178
+ desc: null
179
+ value: 10
180
+ mask_feature_min_masks:
181
+ desc: null
182
+ value: 0
183
+ median_filter_width:
184
+ desc: null
185
+ value: 7
186
+ return_dict:
187
+ desc: null
188
+ value: true
189
+ output_hidden_states:
190
+ desc: null
191
+ value: false
192
+ output_attentions:
193
+ desc: null
194
+ value: false
195
+ torchscript:
196
+ desc: null
197
+ value: false
198
+ torch_dtype:
199
+ desc: null
200
+ value: float16
201
+ use_bfloat16:
202
+ desc: null
203
+ value: false
204
+ tf_legacy_loss:
205
+ desc: null
206
+ value: false
207
+ pruned_heads:
208
+ desc: null
209
+ value: {}
210
+ tie_word_embeddings:
211
+ desc: null
212
+ value: true
213
+ chunk_size_feed_forward:
214
+ desc: null
215
+ value: 0
216
+ is_encoder_decoder:
217
+ desc: null
218
+ value: true
219
+ is_decoder:
220
+ desc: null
221
+ value: false
222
+ cross_attention_hidden_size:
223
+ desc: null
224
+ value: null
225
+ add_cross_attention:
226
+ desc: null
227
+ value: false
228
+ tie_encoder_decoder:
229
+ desc: null
230
+ value: false
231
+ max_length:
232
+ desc: null
233
+ value: 448
234
+ min_length:
235
+ desc: null
236
+ value: 0
237
+ do_sample:
238
+ desc: null
239
+ value: false
240
+ early_stopping:
241
+ desc: null
242
+ value: false
243
+ num_beams:
244
+ desc: null
245
+ value: 1
246
+ num_beam_groups:
247
+ desc: null
248
+ value: 1
249
+ diversity_penalty:
250
+ desc: null
251
+ value: 0.0
252
+ temperature:
253
+ desc: null
254
+ value: 1.0
255
+ top_k:
256
+ desc: null
257
+ value: 50
258
+ top_p:
259
+ desc: null
260
+ value: 1.0
261
+ typical_p:
262
+ desc: null
263
+ value: 1.0
264
+ repetition_penalty:
265
+ desc: null
266
+ value: 1.0
267
+ length_penalty:
268
+ desc: null
269
+ value: 1.0
270
+ no_repeat_ngram_size:
271
+ desc: null
272
+ value: 0
273
+ encoder_no_repeat_ngram_size:
274
+ desc: null
275
+ value: 0
276
+ bad_words_ids:
277
+ desc: null
278
+ value: null
279
+ num_return_sequences:
280
+ desc: null
281
+ value: 1
282
+ output_scores:
283
+ desc: null
284
+ value: false
285
+ return_dict_in_generate:
286
+ desc: null
287
+ value: false
288
+ forced_bos_token_id:
289
+ desc: null
290
+ value: null
291
+ forced_eos_token_id:
292
+ desc: null
293
+ value: null
294
+ remove_invalid_values:
295
+ desc: null
296
+ value: false
297
+ exponential_decay_length_penalty:
298
+ desc: null
299
+ value: null
300
+ suppress_tokens:
301
+ desc: null
302
+ value: null
303
+ begin_suppress_tokens:
304
+ desc: null
305
+ value:
306
+ - 220
307
+ - 50257
308
+ architectures:
309
+ desc: null
310
+ value:
311
+ - WhisperForConditionalGeneration
312
+ finetuning_task:
313
+ desc: null
314
+ value: null
315
+ id2label:
316
+ desc: null
317
+ value:
318
+ '0': LABEL_0
319
+ '1': LABEL_1
320
+ label2id:
321
+ desc: null
322
+ value:
323
+ LABEL_0: 0
324
+ LABEL_1: 1
325
+ tokenizer_class:
326
+ desc: null
327
+ value: null
328
+ prefix:
329
+ desc: null
330
+ value: null
331
+ bos_token_id:
332
+ desc: null
333
+ value: 50257
334
+ pad_token_id:
335
+ desc: null
336
+ value: 50256
337
+ eos_token_id:
338
+ desc: null
339
+ value: 50257
340
+ sep_token_id:
341
+ desc: null
342
+ value: null
343
+ decoder_start_token_id:
344
+ desc: null
345
+ value: 50258
346
+ task_specific_params:
347
+ desc: null
348
+ value: null
349
+ problem_type:
350
+ desc: null
351
+ value: null
352
+ _name_or_path:
353
+ desc: null
354
+ value: distil-whisper/distil-large-v3
355
+ transformers_version:
356
+ desc: null
357
+ value: 4.40.0.dev0
358
+ model_type:
359
+ desc: null
360
+ value: whisper
361
+ forced_decoder_ids:
362
+ desc: null
363
+ value: null
364
+ output_dir:
365
+ desc: null
366
+ value: ./
367
+ overwrite_output_dir:
368
+ desc: null
369
+ value: true
370
+ do_train:
371
+ desc: null
372
+ value: true
373
+ do_eval:
374
+ desc: null
375
+ value: true
376
+ do_predict:
377
+ desc: null
378
+ value: false
379
+ evaluation_strategy:
380
+ desc: null
381
+ value: steps
382
+ prediction_loss_only:
383
+ desc: null
384
+ value: false
385
+ per_device_train_batch_size:
386
+ desc: null
387
+ value: 32
388
+ per_device_eval_batch_size:
389
+ desc: null
390
+ value: 32
391
+ per_gpu_train_batch_size:
392
+ desc: null
393
+ value: null
394
+ per_gpu_eval_batch_size:
395
+ desc: null
396
+ value: null
397
+ gradient_accumulation_steps:
398
+ desc: null
399
+ value: 1
400
+ eval_accumulation_steps:
401
+ desc: null
402
+ value: null
403
+ eval_delay:
404
+ desc: null
405
+ value: 0
406
+ learning_rate:
407
+ desc: null
408
+ value: 0.0001
409
+ weight_decay:
410
+ desc: null
411
+ value: 0.0
412
+ adam_beta1:
413
+ desc: null
414
+ value: 0.9
415
+ adam_beta2:
416
+ desc: null
417
+ value: 0.999
418
+ adam_epsilon:
419
+ desc: null
420
+ value: 1.0e-08
421
+ max_grad_norm:
422
+ desc: null
423
+ value: 1.0
424
+ num_train_epochs:
425
+ desc: null
426
+ value: 3.0
427
+ max_steps:
428
+ desc: null
429
+ value: 5000
430
+ lr_scheduler_type:
431
+ desc: null
432
+ value: linear
433
+ lr_scheduler_kwargs:
434
+ desc: null
435
+ value: {}
436
+ warmup_ratio:
437
+ desc: null
438
+ value: 0.0
439
+ warmup_steps:
440
+ desc: null
441
+ value: 500
442
+ log_level:
443
+ desc: null
444
+ value: passive
445
+ log_level_replica:
446
+ desc: null
447
+ value: warning
448
+ log_on_each_node:
449
+ desc: null
450
+ value: true
451
+ logging_dir:
452
+ desc: null
453
+ value: ./runs/Mar27_19-04-58_hf-dgx-01
454
+ logging_strategy:
455
+ desc: null
456
+ value: steps
457
+ logging_first_step:
458
+ desc: null
459
+ value: false
460
+ logging_steps:
461
+ desc: null
462
+ value: 25
463
+ logging_nan_inf_filter:
464
+ desc: null
465
+ value: true
466
+ save_strategy:
467
+ desc: null
468
+ value: steps
469
+ save_steps:
470
+ desc: null
471
+ value: 1000
472
+ save_total_limit:
473
+ desc: null
474
+ value: 1
475
+ save_safetensors:
476
+ desc: null
477
+ value: true
478
+ save_on_each_node:
479
+ desc: null
480
+ value: false
481
+ save_only_model:
482
+ desc: null
483
+ value: false
484
+ no_cuda:
485
+ desc: null
486
+ value: false
487
+ use_cpu:
488
+ desc: null
489
+ value: false
490
+ use_mps_device:
491
+ desc: null
492
+ value: false
493
+ seed:
494
+ desc: null
495
+ value: 42
496
+ data_seed:
497
+ desc: null
498
+ value: null
499
+ jit_mode_eval:
500
+ desc: null
501
+ value: false
502
+ use_ipex:
503
+ desc: null
504
+ value: false
505
+ bf16:
506
+ desc: null
507
+ value: false
508
+ fp16:
509
+ desc: null
510
+ value: true
511
+ fp16_opt_level:
512
+ desc: null
513
+ value: O1
514
+ half_precision_backend:
515
+ desc: null
516
+ value: auto
517
+ bf16_full_eval:
518
+ desc: null
519
+ value: false
520
+ fp16_full_eval:
521
+ desc: null
522
+ value: false
523
+ tf32:
524
+ desc: null
525
+ value: null
526
+ local_rank:
527
+ desc: null
528
+ value: 0
529
+ ddp_backend:
530
+ desc: null
531
+ value: null
532
+ tpu_num_cores:
533
+ desc: null
534
+ value: null
535
+ tpu_metrics_debug:
536
+ desc: null
537
+ value: false
538
+ debug:
539
+ desc: null
540
+ value: []
541
+ dataloader_drop_last:
542
+ desc: null
543
+ value: false
544
+ eval_steps:
545
+ desc: null
546
+ value: 1000
547
+ dataloader_num_workers:
548
+ desc: null
549
+ value: 4
550
+ dataloader_prefetch_factor:
551
+ desc: null
552
+ value: null
553
+ past_index:
554
+ desc: null
555
+ value: -1
556
+ run_name:
557
+ desc: null
558
+ value: ./
559
+ disable_tqdm:
560
+ desc: null
561
+ value: false
562
+ remove_unused_columns:
563
+ desc: null
564
+ value: true
565
+ label_names:
566
+ desc: null
567
+ value: null
568
+ load_best_model_at_end:
569
+ desc: null
570
+ value: false
571
+ metric_for_best_model:
572
+ desc: null
573
+ value: null
574
+ greater_is_better:
575
+ desc: null
576
+ value: null
577
+ ignore_data_skip:
578
+ desc: null
579
+ value: false
580
+ fsdp:
581
+ desc: null
582
+ value: []
583
+ fsdp_min_num_params:
584
+ desc: null
585
+ value: 0
586
+ fsdp_config:
587
+ desc: null
588
+ value:
589
+ min_num_params: 0
590
+ xla: false
591
+ xla_fsdp_v2: false
592
+ xla_fsdp_grad_ckpt: false
593
+ fsdp_transformer_layer_cls_to_wrap:
594
+ desc: null
595
+ value: null
596
+ accelerator_config:
597
+ desc: null
598
+ value:
599
+ split_batches: false
600
+ dispatch_batches: null
601
+ even_batches: true
602
+ use_seedable_sampler: true
603
+ deepspeed:
604
+ desc: null
605
+ value: null
606
+ label_smoothing_factor:
607
+ desc: null
608
+ value: 0.0
609
+ optim:
610
+ desc: null
611
+ value: adamw_torch
612
+ optim_args:
613
+ desc: null
614
+ value: null
615
+ adafactor:
616
+ desc: null
617
+ value: false
618
+ group_by_length:
619
+ desc: null
620
+ value: false
621
+ length_column_name:
622
+ desc: null
623
+ value: input_length
624
+ report_to:
625
+ desc: null
626
+ value:
627
+ - tensorboard
628
+ - wandb
629
+ ddp_find_unused_parameters:
630
+ desc: null
631
+ value: null
632
+ ddp_bucket_cap_mb:
633
+ desc: null
634
+ value: null
635
+ ddp_broadcast_buffers:
636
+ desc: null
637
+ value: null
638
+ dataloader_pin_memory:
639
+ desc: null
640
+ value: true
641
+ dataloader_persistent_workers:
642
+ desc: null
643
+ value: false
644
+ skip_memory_metrics:
645
+ desc: null
646
+ value: true
647
+ use_legacy_prediction_loop:
648
+ desc: null
649
+ value: false
650
+ push_to_hub:
651
+ desc: null
652
+ value: true
653
+ resume_from_checkpoint:
654
+ desc: null
655
+ value: null
656
+ hub_model_id:
657
+ desc: null
658
+ value: null
659
+ hub_strategy:
660
+ desc: null
661
+ value: every_save
662
+ hub_token:
663
+ desc: null
664
+ value: <HUB_TOKEN>
665
+ hub_private_repo:
666
+ desc: null
667
+ value: false
668
+ hub_always_push:
669
+ desc: null
670
+ value: false
671
+ gradient_checkpointing:
672
+ desc: null
673
+ value: true
674
+ gradient_checkpointing_kwargs:
675
+ desc: null
676
+ value: null
677
+ include_inputs_for_metrics:
678
+ desc: null
679
+ value: false
680
+ fp16_backend:
681
+ desc: null
682
+ value: auto
683
+ push_to_hub_model_id:
684
+ desc: null
685
+ value: null
686
+ push_to_hub_organization:
687
+ desc: null
688
+ value: null
689
+ push_to_hub_token:
690
+ desc: null
691
+ value: <PUSH_TO_HUB_TOKEN>
692
+ mp_parameters:
693
+ desc: null
694
+ value: ''
695
+ auto_find_batch_size:
696
+ desc: null
697
+ value: false
698
+ full_determinism:
699
+ desc: null
700
+ value: false
701
+ torchdynamo:
702
+ desc: null
703
+ value: null
704
+ ray_scope:
705
+ desc: null
706
+ value: last
707
+ ddp_timeout:
708
+ desc: null
709
+ value: 1800
710
+ torch_compile:
711
+ desc: null
712
+ value: false
713
+ torch_compile_backend:
714
+ desc: null
715
+ value: null
716
+ torch_compile_mode:
717
+ desc: null
718
+ value: null
719
+ dispatch_batches:
720
+ desc: null
721
+ value: null
722
+ split_batches:
723
+ desc: null
724
+ value: null
725
+ include_tokens_per_second:
726
+ desc: null
727
+ value: false
728
+ include_num_input_tokens_seen:
729
+ desc: null
730
+ value: false
731
+ neftune_noise_alpha:
732
+ desc: null
733
+ value: null
734
+ optim_target_modules:
735
+ desc: null
736
+ value: null
737
+ sortish_sampler:
738
+ desc: null
739
+ value: false
740
+ predict_with_generate:
741
+ desc: null
742
+ value: true
743
+ generation_max_length:
744
+ desc: null
745
+ value: 225
746
+ generation_num_beams:
747
+ desc: null
748
+ value: null
749
+ generation_config:
750
+ desc: null
751
+ value: null
wandb/run-20240327_190513-7p2x8a0l/files/output.log ADDED
@@ -0,0 +1,1033 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ 0%| | 0/5000 [00:00<?, ?it/s]/home/sanchit/hf/lib/python3.8/site-packages/torch/utils/checkpoint.py:460: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
3
+ warnings.warn(
4
+ /home/sanchit/hf/lib/python3.8/site-packages/torch/utils/checkpoint.py:90: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
5
+ warnings.warn(
6
+ [WARNING|logging.py:329] 2024-03-27 19:05:28,423 >> `use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+ 0%|▍ | 25/5000 [00:55<2:17:36, 1.66s/it]
28
+
29
+
30
+
31
+
32
+
33
+
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+ 1%|▊ | 49/5000 [01:35<2:16:42, 1.66s/it]
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+
70
+ 2%|█▏ | 75/5000 [02:18<2:16:00, 1.66s/it]
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
80
+
81
+
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+ 2%|█▌ | 100/5000 [02:59<2:12:59, 1.63s/it]
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+ 2%|█▉ | 124/5000 [03:39<2:14:58, 1.66s/it]
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+ 3%|██▎ | 149/5000 [04:21<2:14:36, 1.66s/it]
136
+
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+
147
+
148
+
149
+
150
+
151
+
152
+
153
+
154
+
155
+
156
+
157
+ 4%|██▋ | 175/5000 [05:04<2:09:58, 1.62s/it]
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
166
+
167
+
168
+
169
+
170
+
171
+
172
+
173
+
174
+
175
+
176
+
177
+
178
+
179
+ 4%|███ | 200/5000 [05:45<2:12:57, 1.66s/it]
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+
189
+
190
+
191
+
192
+
193
+
194
+
195
+
196
+
197
+
198
+
199
+
200
+
201
+
202
+ 4%|███▌ | 225/5000 [06:40<4:28:32, 3.37s/it]
203
+
204
+
205
+
206
+
207
+
208
+
209
+
210
+
211
+
212
+
213
+
214
+
215
+
216
+
217
+
218
+
219
+
220
+
221
+
222
+
223
+
224
+ 5%|███▉ | 250/5000 [07:21<2:11:07, 1.66s/it]
225
+
226
+
227
+
228
+
229
+
230
+
231
+
232
+
233
+
234
+
235
+
236
+
237
+
238
+
239
+
240
+
241
+
242
+
243
+
244
+
245
+
246
+ 6%|████▎ | 275/5000 [08:03<2:10:40, 1.66s/it]
247
+
248
+
249
+
250
+
251
+
252
+
253
+
254
+
255
+
256
+
257
+
258
+
259
+
260
+
261
+
262
+
263
+
264
+
265
+
266
+
267
+ 6%|████▋ | 300/5000 [08:44<2:09:22, 1.65s/it]
268
+
269
+
270
+
271
+
272
+
273
+
274
+
275
+
276
+
277
+
278
+
279
+
280
+
281
+
282
+
283
+
284
+
285
+
286
+
287
+
288
+
289
+ 6%|█████ | 325/5000 [09:26<2:09:34, 1.66s/it]
290
+
291
+
292
+
293
+
294
+
295
+
296
+
297
+
298
+
299
+
300
+
301
+
302
+
303
+
304
+
305
+
306
+
307
+
308
+
309
+
310
+
311
+ 7%|█████▍ | 350/5000 [10:08<2:09:00, 1.66s/it]
312
+
313
+
314
+
315
+
316
+
317
+
318
+
319
+
320
+
321
+
322
+
323
+
324
+
325
+
326
+
327
+
328
+
329
+
330
+
331
+
332
+
333
+ 8%|█████▊ | 375/5000 [10:49<2:08:16, 1.66s/it]
334
+
335
+
336
+
337
+
338
+
339
+
340
+
341
+
342
+
343
+
344
+
345
+
346
+
347
+
348
+
349
+
350
+
351
+
352
+
353
+
354
+ 8%|██████▏ | 399/5000 [11:29<2:07:37, 1.66s/it]
355
+
356
+
357
+
358
+
359
+
360
+
361
+
362
+
363
+
364
+
365
+
366
+
367
+
368
+
369
+
370
+
371
+
372
+
373
+
374
+
375
+
376
+ 8%|██████▋ | 425/5000 [12:12<2:07:02, 1.67s/it]
377
+
378
+
379
+
380
+
381
+
382
+
383
+
384
+
385
+
386
+
387
+
388
+
389
+
390
+
391
+
392
+
393
+
394
+
395
+
396
+
397
+ 9%|███████ | 450/5000 [13:07<2:57:10, 2.34s/it]
398
+
399
+
400
+
401
+
402
+
403
+
404
+
405
+
406
+
407
+
408
+
409
+
410
+
411
+
412
+
413
+
414
+
415
+
416
+
417
+
418
+
419
+ 10%|███████▍ | 475/5000 [13:48<2:05:43, 1.67s/it]
420
+
421
+
422
+
423
+
424
+
425
+
426
+
427
+
428
+
429
+
430
+
431
+
432
+
433
+
434
+
435
+
436
+
437
+
438
+
439
+
440
+
441
+ 10%|███████▊ | 500/5000 [14:30<2:05:03, 1.67s/it]
442
+
443
+
444
+
445
+
446
+
447
+
448
+
449
+
450
+
451
+
452
+
453
+
454
+
455
+
456
+
457
+
458
+
459
+
460
+
461
+
462
+ 10%|████████▏ | 524/5000 [15:10<2:04:10, 1.66s/it]
463
+
464
+
465
+
466
+
467
+
468
+
469
+
470
+
471
+
472
+
473
+
474
+
475
+
476
+
477
+
478
+
479
+
480
+
481
+
482
+
483
+
484
+ 11%|████████▌ | 549/5000 [15:51<2:03:08, 1.66s/it]
485
+
486
+
487
+
488
+
489
+
490
+
491
+
492
+
493
+
494
+
495
+
496
+
497
+
498
+
499
+
500
+
501
+
502
+
503
+
504
+
505
+
506
+ 12%|████████▉ | 575/5000 [16:35<2:02:47, 1.66s/it]
507
+
508
+
509
+
510
+
511
+
512
+
513
+
514
+
515
+
516
+
517
+
518
+
519
+
520
+
521
+
522
+
523
+
524
+
525
+
526
+
527
+
528
+ 12%|█████████▎ | 600/5000 [17:16<2:02:00, 1.66s/it]
529
+
530
+
531
+
532
+
533
+
534
+
535
+
536
+
537
+
538
+
539
+
540
+
541
+
542
+
543
+
544
+
545
+
546
+
547
+
548
+
549
+
550
+ 12%|█████████▊ | 625/5000 [17:58<2:01:53, 1.67s/it]
551
+
552
+
553
+
554
+
555
+
556
+
557
+
558
+
559
+
560
+
561
+
562
+
563
+
564
+
565
+
566
+
567
+
568
+
569
+
570
+
571
+ 13%|██████████ | 649/5000 [18:38<2:01:13, 1.67s/it]
572
+
573
+
574
+
575
+
576
+
577
+
578
+
579
+
580
+
581
+
582
+
583
+
584
+
585
+
586
+
587
+
588
+
589
+
590
+
591
+
592
+
593
+ 14%|██████████▌ | 675/5000 [19:33<2:15:32, 1.88s/it]
594
+
595
+
596
+
597
+
598
+
599
+
600
+
601
+
602
+
603
+
604
+
605
+
606
+
607
+
608
+
609
+
610
+
611
+
612
+
613
+
614
+ 14%|██████████▉ | 700/5000 [20:15<2:52:11, 2.40s/it]
615
+
616
+
617
+
618
+
619
+
620
+
621
+
622
+
623
+
624
+
625
+
626
+
627
+
628
+
629
+
630
+
631
+
632
+
633
+
634
+
635
+
636
+ 14%|███████████▎ | 725/5000 [21:00<1:59:02, 1.67s/it]
637
+
638
+
639
+
640
+
641
+
642
+
643
+
644
+
645
+
646
+
647
+
648
+
649
+
650
+
651
+
652
+
653
+
654
+
655
+
656
+
657
+ 15%|███████████▋ | 749/5000 [21:40<1:58:00, 1.67s/it]
658
+
659
+
660
+
661
+
662
+
663
+
664
+
665
+
666
+
667
+
668
+
669
+
670
+
671
+
672
+
673
+
674
+
675
+
676
+
677
+
678
+
679
+ 16%|████████████ | 775/5000 [22:23<1:57:00, 1.66s/it]
680
+
681
+
682
+
683
+
684
+
685
+
686
+
687
+
688
+
689
+
690
+
691
+
692
+
693
+
694
+
695
+
696
+
697
+
698
+
699
+
700
+
701
+ 16%|████████████▍ | 800/5000 [23:05<1:56:25, 1.66s/it]
702
+
703
+
704
+
705
+
706
+
707
+
708
+
709
+
710
+
711
+
712
+
713
+
714
+
715
+
716
+
717
+
718
+
719
+
720
+
721
+
722
+ 16%|████████████▊ | 824/5000 [23:45<1:55:40, 1.66s/it]
723
+
724
+
725
+
726
+
727
+
728
+
729
+
730
+
731
+
732
+
733
+
734
+
735
+
736
+
737
+
738
+
739
+
740
+
741
+
742
+
743
+
744
+ 17%|█████████████▏ | 849/5000 [24:26<1:55:06, 1.66s/it]
745
+
746
+
747
+
748
+
749
+
750
+
751
+
752
+
753
+
754
+
755
+
756
+
757
+
758
+
759
+
760
+
761
+
762
+
763
+
764
+
765
+
766
+ 18%|█████████████▋ | 875/5000 [25:10<1:54:31, 1.67s/it]
767
+
768
+
769
+
770
+
771
+
772
+
773
+
774
+
775
+
776
+
777
+
778
+
779
+
780
+
781
+
782
+
783
+
784
+
785
+
786
+
787
+
788
+ 18%|██████████████ | 900/5000 [26:03<1:59:10, 1.74s/it]
789
+
790
+
791
+
792
+
793
+
794
+
795
+
796
+
797
+
798
+
799
+
800
+
801
+
802
+
803
+
804
+
805
+
806
+
807
+
808
+
809
+ 18%|██████████████▍ | 924/5000 [26:43<1:52:44, 1.66s/it]
810
+
811
+
812
+
813
+
814
+
815
+
816
+
817
+
818
+
819
+
820
+
821
+
822
+
823
+
824
+
825
+
826
+
827
+
828
+
829
+
830
+
831
+ 19%|██████████████▊ | 950/5000 [27:26<1:52:20, 1.66s/it]
832
+
833
+
834
+
835
+
836
+
837
+
838
+
839
+
840
+
841
+
842
+
843
+
844
+
845
+
846
+
847
+
848
+
849
+
850
+
851
+
852
+
853
+ 20%|███████████████▏ | 975/5000 [28:08<1:51:49, 1.67s/it]
854
+
855
+
856
+
857
+
858
+
859
+
860
+
861
+
862
+
863
+
864
+
865
+
866
+
867
+
868
+
869
+
870
+
871
+
872
+
873
+
874
+ 20%|███████████████▍ | 1000/5000 [28:49<1:50:58, 1.66s/it][INFO|trainer.py:768] 2024-03-27 19:34:04,504 >> The following columns in the evaluation set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`, you can safely ignore this message.
875
+ [INFO|trainer.py:3544] 2024-03-27 19:34:04,506 >> ***** Running Evaluation *****
876
+ [INFO|trainer.py:3546] 2024-03-27 19:34:04,507 >> Num examples = 3123
877
+ [INFO|trainer.py:3549] 2024-03-27 19:34:04,507 >> Batch size = 32
878
+ {'loss': 0.1035, 'grad_norm': 1.2479132413864136, 'learning_rate': 8.900000000000001e-05, 'epoch': 4.5}
879
+ [INFO|generation_whisper.py:1111] 2024-03-27 19:34:16,924 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
880
+ [INFO|generation_whisper.py:1111] 2024-03-27 19:34:32,221 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
881
+ 0%| | 0/98 [00:00<?, ?it/s][INFO|generation_whisper.py:1111] 2024-03-27 19:34:46,567 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
882
+ 2%|█▋ | 2/98 [00:14<11:28, 7.17s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:35:00,902 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
883
+ 3%|██▌ | 3/98 [00:28<16:04, 10.16s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:35:15,181 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
884
+ 4%|███▍ | 4/98 [00:42<18:19, 11.69s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:35:29,903 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
885
+ 5%|████▎ | 5/98 [00:57<19:45, 12.75s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:35:43,657 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
886
+ 6%|█████▏ | 6/98 [01:11<20:03, 13.08s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:35:57,833 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
887
+ 7%|██████ | 7/98 [01:25<20:22, 13.43s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:36:12,133 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
888
+ 8%|██████▊ | 8/98 [01:39<20:33, 13.71s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:36:26,263 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
889
+ 9%|███████▋ | 9/98 [01:54<20:31, 13.84s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:36:40,303 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
890
+ 10%|████████▍ | 10/98 [02:08<20:23, 13.90s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:36:54,524 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
891
+ 11%|█████████▎ | 11/98 [02:22<20:17, 14.00s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:37:08,502 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
892
+ 12%|██████████▏ | 12/98 [02:36<20:03, 13.99s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:37:22,202 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
893
+ 13%|███████████ | 13/98 [02:49<19:41, 13.90s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:37:36,108 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
894
+ 14%|███████████▊ | 14/98 [03:03<19:27, 13.90s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:37:50,111 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
895
+ 15%|████████████▋ | 15/98 [03:17<19:16, 13.93s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:38:03,743 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
896
+ 16%|█████████████▌ | 16/98 [03:31<18:55, 13.84s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:38:17,662 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
897
+ 17%|██████████████▍ | 17/98 [03:45<18:43, 13.87s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:38:31,417 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
898
+ 18%|███████████████▏ | 18/98 [03:59<18:26, 13.83s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:38:45,386 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
899
+ 19%|████████████████ | 19/98 [04:13<18:15, 13.87s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:38:59,266 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
900
+ 20%|████████████████▉ | 20/98 [04:27<18:02, 13.88s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:39:13,359 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
901
+ 21%|█████████████████▊ | 21/98 [04:41<17:53, 13.94s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:39:27,370 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
902
+ 22%|██████████████████▋ | 22/98 [04:55<17:41, 13.96s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:39:41,500 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
903
+ 23%|███████████████████▍ | 23/98 [05:09<17:30, 14.01s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:39:55,547 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
904
+ 24%|████████████████████▎ | 24/98 [05:23<17:17, 14.02s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:40:09,901 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
905
+ 26%|█████████████████████▏ | 25/98 [05:37<17:10, 14.12s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:40:24,028 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
906
+ 27%|██████████████████████ | 26/98 [05:51<16:56, 14.12s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:40:38,068 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
907
+ 28%|██████████████████████▊ | 27/98 [06:05<16:41, 14.10s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:40:52,103 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
908
+ 29%|███████████████████████▋ | 28/98 [06:19<16:25, 14.08s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:41:06,018 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
909
+ 30%|████████████████████████▌ | 29/98 [06:33<16:08, 14.03s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:41:20,035 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
910
+ 31%|█████████████████████████▍ | 30/98 [06:47<15:53, 14.03s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:41:33,986 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
911
+ 32%|██████████████████████████▎ | 31/98 [07:01<15:38, 14.00s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:41:47,777 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
912
+ 33%|███████████████████████████ | 32/98 [07:15<15:20, 13.94s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:42:01,578 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
913
+ 34%|███████████████████████████▉ | 33/98 [07:29<15:03, 13.90s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:42:15,086 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
914
+ 35%|████████████████████████████▊ | 34/98 [07:42<14:41, 13.78s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:42:29,489 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
915
+ 36%|█████████████████████████████▋ | 35/98 [07:57<14:40, 13.97s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:42:43,167 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
916
+ 37%|██████████████████████████████▍ | 36/98 [08:10<14:20, 13.88s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:42:57,438 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
917
+ 38%|███████████████████████████████▎ | 37/98 [08:25<14:13, 14.00s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:43:11,313 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
918
+ 39%|████████████████████████████████▏ | 38/98 [08:39<13:57, 13.96s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:43:25,386 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
919
+ 40%|█████████████████████████████████ | 39/98 [08:53<13:45, 13.99s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:43:39,191 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
920
+ 41%|█████████████████████████████████▉ | 40/98 [09:06<13:28, 13.94s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:43:53,137 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
921
+ 42%|██████████████████████████████████▋ | 41/98 [09:20<13:14, 13.94s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:44:06,953 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
922
+ 43%|███████████████████████████████████▌ | 42/98 [09:34<12:58, 13.90s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:44:20,954 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
923
+ 44%|████████████████████████████████████▍ | 43/98 [09:48<12:46, 13.93s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:44:34,912 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
924
+ 45%|█████████████████████████████████████▎ | 44/98 [10:02<12:32, 13.94s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:44:48,785 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
925
+ 46%|██████████████████████████████████████ | 45/98 [10:16<12:17, 13.92s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:45:02,804 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
926
+ 47%|██████████████████████████████████████▉ | 46/98 [10:30<12:05, 13.95s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:45:17,142 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
927
+ 48%|███████████████████████████████████████▊ | 47/98 [10:44<11:57, 14.07s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:45:31,387 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
928
+ 49%|████████████████████████████████████████▋ | 48/98 [10:59<11:45, 14.12s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:45:45,556 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
929
+ 50%|█████████████████████████████████████████▌ | 49/98 [11:13<11:32, 14.13s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:45:57,797 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
930
+ 51%|██████████████████████████████████████████▎ | 50/98 [11:25<10:51, 13.57s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:46:03,379 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
931
+ 52%|███████████████████████████████████████████▏ | 51/98 [11:31<08:45, 11.17s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:46:09,863 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
932
+ 53%|████████████████████████████████████████████ | 52/98 [11:37<07:29, 9.76s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:46:24,315 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
933
+ 54%|████████████████████████████████████████████▉ | 53/98 [11:52<08:22, 11.17s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:46:38,546 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
934
+ 55%|█████████████████████████████████████████████▋ | 54/98 [12:06<08:51, 12.09s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:46:52,468 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
935
+ 56%|██████████████████████████████████████████████▌ | 55/98 [12:20<09:03, 12.64s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:47:06,484 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
936
+ 57%|███████████████████████████████████████████████▍ | 56/98 [12:34<09:08, 13.05s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:47:20,369 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
937
+ 58%|████████████████████████████████████████████████▎ | 57/98 [12:48<09:05, 13.30s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:47:34,273 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
938
+ 59%|█████████████████████████████████████████████████ | 58/98 [13:02<08:59, 13.48s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:47:48,252 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
939
+ 60%|█████████████████████████████████████████████████▉ | 59/98 [13:16<08:51, 13.63s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:48:02,183 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
940
+ 61%|██████████████████████████████████████████████████▊ | 60/98 [13:29<08:41, 13.72s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:48:15,984 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
941
+ 62%|███████████████████████████████████████████████████▋ | 61/98 [13:43<08:28, 13.75s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:48:29,807 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
942
+ 63%|████████████████████████████████████████████████████▌ | 62/98 [13:57<08:15, 13.77s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:48:43,858 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
943
+ 64%|█████████████████████████████████████████████████████▎ | 63/98 [14:11<08:04, 13.85s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:48:58,084 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
944
+ 65%|██████████████████████████████████████████████████████▏ | 64/98 [14:25<07:54, 13.97s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:49:11,939 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
945
+ 66%|███████████████████████████████████████████████████████ | 65/98 [14:39<07:39, 13.93s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:49:26,109 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
946
+ 67%|███████████████████████████████████████████████████████▉ | 66/98 [14:53<07:28, 14.00s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:49:40,079 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
947
+ 68%|████████████████████████████████████████████████████████▋ | 67/98 [15:07<07:13, 13.99s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:49:53,707 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
948
+ 69%|█████████████████████████████████████████████████████████▌ | 68/98 [15:21<06:56, 13.88s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:50:07,852 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
949
+ 70%|██████████████████████████████████████████████████████████▍ | 69/98 [15:35<06:44, 13.96s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:50:21,891 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
950
+ 71%|███████████████████████████████████████████████████████████▎ | 70/98 [15:49<06:31, 13.99s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:50:35,042 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
951
+ 72%|████████████████████████████████████████████████████████████▏ | 71/98 [16:02<06:10, 13.73s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:50:49,446 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
952
+ 73%|████████████████████████████████████████████████████████████▉ | 72/98 [16:17<06:02, 13.94s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:51:03,382 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
953
+ 74%|█████████████████████████████████████████████████████████████��� | 73/98 [16:31<05:48, 13.94s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:51:17,823 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
954
+ 76%|██████████████████████████████████████████████████████████████▋ | 74/98 [16:45<05:38, 14.09s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:51:31,733 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
955
+ 77%|███████████████████████████████████████████████████████████████▌ | 75/98 [16:59<05:22, 14.03s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:51:45,415 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
956
+ 78%|████████████████████████████████████████████████████████████████▎ | 76/98 [17:13<05:06, 13.93s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:51:59,435 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
957
+ 79%|█████████████████████████████████████████████████████████████████▏ | 77/98 [17:27<04:53, 13.96s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:52:12,864 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
958
+ 80%|██████████████████████████████████████████████████████████████████ | 78/98 [17:40<04:35, 13.80s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:52:27,026 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
959
+ 81%|██████████████████████████████████████████████████████████████████▉ | 79/98 [17:54<04:24, 13.91s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:52:40,949 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
960
+ 82%|███████████████████████████████████████████████████████████████████▊ | 80/98 [18:08<04:10, 13.91s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:52:54,527 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
961
+ 83%|████████████████████████████████████████████████████████████████████▌ | 81/98 [18:22<03:54, 13.81s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:53:08,717 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
962
+ 84%|█████████████████████████████████████████████████████████████████████▍ | 82/98 [18:36<03:42, 13.93s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:53:22,833 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
963
+ 85%|██████████████████████████████████████████████████████████████████████▎ | 83/98 [18:50<03:29, 13.98s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:53:36,626 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
964
+ 86%|███████████████████████████████████████████████████████████████████████▏ | 84/98 [19:04<03:14, 13.93s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:53:50,526 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
965
+ 87%|███████████████████████████████████████████████████████████████████████▉ | 85/98 [19:18<03:00, 13.92s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:54:03,498 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
966
+ 88%|████████████████████████████████████████████████████████████████████████▊ | 86/98 [19:31<02:43, 13.63s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:54:17,511 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
967
+ 89%|█████████████████████████████████████████████████████████████████████████▋ | 87/98 [19:45<02:31, 13.75s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:54:31,136 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
968
+ 90%|██████████████████████████████████████████████████████████████████████████▌ | 88/98 [19:58<02:17, 13.71s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:54:44,966 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
969
+ 91%|███████████████████████████████████████████████████████████████████████████▍ | 89/98 [20:12<02:03, 13.75s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:54:58,947 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
970
+ 93%|█████████████████████████████████████████████████████████████████████████████ | 91/98 [20:40<01:36, 13.71s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:55:12,423 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
971
+ 94%|█████████████████████████████████████████████████████████████████████████████▉ | 92/98 [20:53<01:22, 13.73s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:55:26,178 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
972
+ 95%|██████████████████████████████████████████████████████████████████████████████▊ | 93/98 [21:07<01:08, 13.72s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:55:39,889 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
973
+ 96%|███████████████████████████████████████████████████████████████████████████████▌ | 94/98 [21:21<00:54, 13.67s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:55:53,445 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
974
+ 97%|████████████████████████████████████████████████████████████████████████████████▍ | 95/98 [21:35<00:41, 13.72s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:56:07,282 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
975
+ 98%|█████████████████████████████████████████████████████████████████████████████████▎ | 96/98 [21:49<00:27, 13.80s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:56:21,266 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
976
+ 99%|██████████████████████████████████████████████████████████████████████████████████▏| 97/98 [22:03<00:13, 13.87s/it][INFO|generation_whisper.py:1111] 2024-03-27 19:56:35,449 >> You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
977
+ [WARNING|configuration_utils.py:447] 2024-03-27 19:56:44,906 >> Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.of task=transcribe.
978
+ [WARNING|configuration_utils.py:447] 2024-03-27 19:56:44,906 >> Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.of task=transcribe.
979
+ Non-default generation parameters: {'max_length': 448, 'begin_suppress_tokens': [220, 50257]}
980
+ [INFO|configuration_utils.py:471] 2024-03-27 19:56:44,906 >> Configuration saved in ./checkpoint-1000/config.json
981
+ [INFO|configuration_utils.py:697] 2024-03-27 19:56:44,907 >> Configuration saved in ./checkpoint-1000/generation_config.json
982
+ {'eval_loss': 0.30151915550231934, 'eval_wer': 0.3249633006258209, 'eval_runtime': 1360.3971, 'eval_samples_per_second': 2.296, 'eval_steps_per_second': 0.072, 'epoch': 4.5}
983
+ [INFO|modeling_utils.py:2475] 2024-03-27 19:56:52,939 >> Model weights saved in ./checkpoint-1000/model.safetensors
984
+ [INFO|feature_extraction_utils.py:424] 2024-03-27 19:56:52,940 >> Feature extractor saved in ./checkpoint-1000/preprocessor_config.json
985
+ [INFO|feature_extraction_utils.py:424] 2024-03-27 19:57:03,422 >> Feature extractor saved in ./preprocessor_config.json
986
+ /home/sanchit/hf/lib/python3.8/site-packages/torch/utils/checkpoint.py:460: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
987
+ warnings.warn(
988
+ /home/sanchit/hf/lib/python3.8/site-packages/torch/utils/checkpoint.py:90: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
989
+ warnings.warn(
990
+ 20%|██████████████▊ | 1002/5000 [51:52<323:26:46, 291.25s/it]
991
+ 20%|██████████████▊ | 1003/5000 [51:53<226:54:23, 204.37s/it]
992
+ 20%|██████████████▊ | 1004/5000 [51:55<159:21:05, 143.56s/it]
993
+ 20%|██████████████▊ | 1005/5000 [51:57<112:04:19, 100.99s/it]
994
+ 20%|███████████████▎ | 1007/5000 [52:00<55:49:31, 50.33s/it]
995
+ 20%|███████████████▎ | 1008/5000 [52:02<39:37:18, 35.73s/it]
996
+ 20%|███████████████▎ | 1009/5000 [52:03<28:16:59, 25.51s/it]
997
+ 20%|███████████████▎ | 1010/5000 [52:05<20:20:35, 18.35s/it]
998
+ 20%|███████████████▎ | 1011/5000 [52:07<14:47:29, 13.35s/it]
999
+ 20%|███████████████▌ | 1013/5000 [52:10<8:11:11, 7.39s/it]
1000
+ 20%|███████████████▌ | 1014/5000 [52:12<6:16:40, 5.67s/it]
1001
+ 20%|███████████████▋ | 1015/5000 [52:13<4:56:55, 4.47s/it]
1002
+ 20%|███████████████▋ | 1016/5000 [52:15<4:00:59, 3.63s/it]
1003
+ 20%|███████████████▋ | 1017/5000 [52:17<3:21:51, 3.04s/it]
1004
+ 20%|███████████████▋ | 1019/5000 [52:20<2:35:03, 2.34s/it]
1005
+ 20%|███████████████▋ | 1020/5000 [52:21<2:21:35, 2.13s/it]
1006
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1007
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1008
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1009
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1010
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1011
+ {'loss': 0.1114, 'grad_norm': 1.8175023794174194, 'learning_rate': 8.844444444444445e-05, 'epoch': 4.62}
1012
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1013
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1014
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1015
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1016
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1017
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1018
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1019
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1020
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1021
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1022
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1023
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1024
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1025
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1026
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1027
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1028
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1029
+ 20%|█████████���█████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1030
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1031
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1032
+ 20%|███████████████▋ | 1021/5000 [52:23<2:11:25, 1.98s/it]
1033
+ {'loss': 0.1059, 'grad_norm': 1.7170511484146118, 'learning_rate': 8.78888888888889e-05, 'epoch': 4.73}
wandb/run-20240327_190513-7p2x8a0l/files/requirements.txt ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ accelerate==0.27.2
3
+ aiohttp==3.9.3
4
+ aiosignal==1.3.1
5
+ anyio==4.2.0
6
+ appdirs==1.4.4
7
+ argon2-cffi-bindings==21.2.0
8
+ argon2-cffi==23.1.0
9
+ arrow==1.3.0
10
+ asttokens==2.4.1
11
+ astunparse==1.6.3
12
+ async-lru==2.0.4
13
+ async-timeout==4.0.3
14
+ attrs==23.2.0
15
+ audioread==3.0.1
16
+ av==11.0.0
17
+ babel==2.14.0
18
+ backcall==0.2.0
19
+ beautifulsoup4==4.12.3
20
+ bitsandbytes==0.42.0
21
+ bleach==6.1.0
22
+ cached-property==1.5.2
23
+ cachetools==5.3.2
24
+ certifi==2024.2.2
25
+ cffi==1.16.0
26
+ charset-normalizer==3.3.2
27
+ chex==0.1.7
28
+ click==8.1.7
29
+ coloredlogs==15.0.1
30
+ comm==0.2.1
31
+ contourpy==1.1.1
32
+ ctranslate2==4.1.0
33
+ cycler==0.12.1
34
+ datasets==2.18.0
35
+ debugpy==1.8.0
36
+ decorator==5.1.1
37
+ defusedxml==0.7.1
38
+ dill==0.3.7
39
+ dm-tree==0.1.8
40
+ docker-pycreds==0.4.0
41
+ docstring-parser==0.15
42
+ einops==0.7.0
43
+ etils==1.3.0
44
+ evaluate==0.4.1
45
+ exceptiongroup==1.2.0
46
+ executing==2.0.1
47
+ fastjsonschema==2.19.1
48
+ filelock==3.13.1
49
+ flash-attn==2.5.3
50
+ flatbuffers==23.5.26
51
+ flax==0.7.2
52
+ fonttools==4.48.1
53
+ fqdn==1.5.1
54
+ frozenlist==1.4.1
55
+ fsspec==2024.2.0
56
+ gast==0.4.0
57
+ gitdb==4.0.11
58
+ gitpython==3.1.41
59
+ google-auth-oauthlib==1.0.0
60
+ google-auth==2.27.0
61
+ google-pasta==0.2.0
62
+ grpcio==1.60.1
63
+ h11==0.14.0
64
+ h5py==3.10.0
65
+ httpcore==1.0.2
66
+ httpx==0.26.0
67
+ huggingface-hub==0.21.4
68
+ humanfriendly==10.0
69
+ idna==3.6
70
+ importlib-metadata==7.0.1
71
+ importlib-resources==6.1.1
72
+ iniconfig==2.0.0
73
+ ipdb==0.13.13
74
+ ipykernel==6.29.2
75
+ ipython==8.12.3
76
+ isoduration==20.11.0
77
+ jax==0.4.13
78
+ jaxlib==0.4.13
79
+ jedi==0.19.1
80
+ jinja2==3.1.2
81
+ jiwer==3.0.3
82
+ joblib==1.3.2
83
+ json5==0.9.14
84
+ jsonpointer==2.4
85
+ jsonschema-specifications==2023.12.1
86
+ jsonschema==4.21.1
87
+ jupyter-client==8.6.0
88
+ jupyter-core==5.7.1
89
+ jupyter-events==0.9.0
90
+ jupyter-lsp==2.2.2
91
+ jupyter-server-terminals==0.5.2
92
+ jupyter-server==2.12.5
93
+ jupyterlab-pygments==0.3.0
94
+ jupyterlab-server==2.25.2
95
+ jupyterlab==4.1.0
96
+ keras==2.13.1
97
+ kiwisolver==1.4.5
98
+ lazy-loader==0.3
99
+ libclang==16.0.6
100
+ librosa==0.10.1
101
+ llvmlite==0.41.1
102
+ markdown-it-py==3.0.0
103
+ markdown==3.5.2
104
+ markupsafe==2.1.3
105
+ matplotlib-inline==0.1.6
106
+ matplotlib==3.7.4
107
+ mdurl==0.1.2
108
+ mistune==3.0.2
109
+ ml-dtypes==0.2.0
110
+ more-itertools==10.2.0
111
+ mpmath==1.2.1
112
+ msclap==1.3.3
113
+ msgpack==1.0.7
114
+ multidict==6.0.5
115
+ multiprocess==0.70.15
116
+ nbclient==0.9.0
117
+ nbconvert==7.16.0
118
+ nbformat==5.9.2
119
+ nest-asyncio==1.6.0
120
+ networkx==3.0rc1
121
+ ninja==1.11.1.1
122
+ notebook-shim==0.2.3
123
+ numba==0.58.1
124
+ numpy==1.24.3
125
+ nvidia-cublas-cu12==12.1.3.1
126
+ nvidia-cuda-cupti-cu12==12.1.105
127
+ nvidia-cuda-nvrtc-cu12==12.1.105
128
+ nvidia-cuda-runtime-cu12==12.1.105
129
+ nvidia-cudnn-cu12==8.9.2.26
130
+ nvidia-cufft-cu12==11.0.2.54
131
+ nvidia-curand-cu12==10.3.2.106
132
+ nvidia-cusolver-cu12==11.4.5.107
133
+ nvidia-cusparse-cu12==12.1.0.106
134
+ nvidia-nccl-cu12==2.19.3
135
+ nvidia-nvjitlink-cu12==12.1.105
136
+ nvidia-nvtx-cu12==12.1.105
137
+ oauthlib==3.2.2
138
+ onnxruntime==1.17.1
139
+ openai-whisper==20231117
140
+ opt-einsum==3.3.0
141
+ optax==0.1.8
142
+ orbax-checkpoint==0.2.3
143
+ overrides==7.7.0
144
+ packaging==23.2
145
+ pandas==2.0.3
146
+ pandocfilters==1.5.1
147
+ parameterized==0.9.0
148
+ parso==0.8.3
149
+ peft==0.8.2
150
+ pexpect==4.9.0
151
+ pickleshare==0.7.5
152
+ pillow==9.3.0
153
+ pip==24.0
154
+ pkg-resources==0.0.0
155
+ pkgutil-resolve-name==1.3.10
156
+ platformdirs==4.2.0
157
+ pluggy==1.4.0
158
+ pooch==1.8.0
159
+ prometheus-client==0.19.0
160
+ prompt-toolkit==3.0.43
161
+ protobuf==4.25.2
162
+ psutil==5.9.8
163
+ ptyprocess==0.7.0
164
+ pure-eval==0.2.2
165
+ pyarrow-hotfix==0.6
166
+ pyarrow==15.0.0
167
+ pyasn1-modules==0.3.0
168
+ pyasn1==0.5.1
169
+ pycparser==2.21
170
+ pygments==2.17.2
171
+ pyparsing==3.1.1
172
+ pytest==7.4.4
173
+ python-dateutil==2.8.2
174
+ python-json-logger==2.0.7
175
+ pytorch-triton==3.0.0+901819d2b6
176
+ pytz==2024.1
177
+ pyyaml==6.0.1
178
+ pyzmq==25.1.2
179
+ rapidfuzz==3.6.1
180
+ referencing==0.33.0
181
+ regex==2023.12.25
182
+ requests-oauthlib==1.3.1
183
+ requests==2.31.0
184
+ responses==0.18.0
185
+ rfc3339-validator==0.1.4
186
+ rfc3986-validator==0.1.1
187
+ rich==13.7.0
188
+ rpds-py==0.17.1
189
+ rsa==4.9
190
+ safetensors==0.4.2
191
+ scikit-learn==1.3.2
192
+ scipy==1.10.1
193
+ send2trash==1.8.2
194
+ sentry-sdk==1.40.0
195
+ setproctitle==1.3.3
196
+ setuptools==44.0.0
197
+ shtab==1.7.0
198
+ six==1.16.0
199
+ smmap==5.0.1
200
+ sniffio==1.3.0
201
+ soundfile==0.12.1
202
+ soupsieve==2.5
203
+ soxr==0.3.7
204
+ stack-data==0.6.3
205
+ sympy==1.11.1
206
+ tensorboard-data-server==0.7.2
207
+ tensorboard==2.13.0
208
+ tensorflow-cpu==2.13.1
209
+ tensorflow-estimator==2.13.0
210
+ tensorflow-io-gcs-filesystem==0.34.0
211
+ tensorstore==0.1.45
212
+ termcolor==2.4.0
213
+ terminado==0.18.0
214
+ threadpoolctl==3.2.0
215
+ tiktoken==0.6.0
216
+ tinycss2==1.2.1
217
+ tokenizers==0.15.1
218
+ tomli==2.0.1
219
+ toolz==0.12.1
220
+ torch==2.2.1
221
+ torchaudio==2.2.1
222
+ torchlibrosa==0.1.0
223
+ torchvision==0.17.1
224
+ tornado==6.4
225
+ tqdm==4.66.1
226
+ traitlets==5.14.1
227
+ transformers==4.39.0.dev0
228
+ triton==2.2.0
229
+ trl==0.7.11
230
+ types-python-dateutil==2.8.19.20240106
231
+ typing-extensions==4.9.0
232
+ tyro==0.7.3
233
+ tzdata==2023.4
234
+ uri-template==1.3.0
235
+ urllib3==2.2.0
236
+ wandb==0.16.2
237
+ wcwidth==0.2.13
238
+ webcolors==1.13
239
+ webencodings==0.5.1
240
+ websocket-client==1.7.0
241
+ werkzeug==3.0.1
242
+ wheel==0.42.0
243
+ wrapt==1.16.0
244
+ xxhash==3.4.1
245
+ yarl==1.9.4
246
+ zipp==3.17.0
wandb/run-20240327_190513-7p2x8a0l/files/wandb-metadata.json ADDED
@@ -0,0 +1,739 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.4.0-166-generic-x86_64-with-glibc2.29",
3
+ "python": "3.8.10",
4
+ "heartbeatAt": "2024-03-27T18:05:14.699269",
5
+ "startedAt": "2024-03-27T18:05:13.643873",
6
+ "docker": null,
7
+ "cuda": null,
8
+ "args": [
9
+ "--model_name_or_path=distil-whisper/distil-large-v3",
10
+ "--dataset_name=mozilla-foundation/common_voice_16_1",
11
+ "--dataset_config_name=hi",
12
+ "--language=hindi",
13
+ "--train_split_name=train+validation",
14
+ "--eval_split_name=test",
15
+ "--max_steps=5000",
16
+ "--output_dir=./",
17
+ "--per_device_train_batch_size=32",
18
+ "--per_device_eval_batch_size=32",
19
+ "--logging_steps=25",
20
+ "--learning_rate=1e-4",
21
+ "--warmup_steps=500",
22
+ "--evaluation_strategy=steps",
23
+ "--eval_steps=1000",
24
+ "--save_strategy=steps",
25
+ "--save_steps=1000",
26
+ "--save_total_limit=1",
27
+ "--generation_max_length=225",
28
+ "--preprocessing_num_workers=1",
29
+ "--dataloader_num_workers=4",
30
+ "--length_column_name=input_length",
31
+ "--max_duration_in_seconds=30",
32
+ "--text_column_name=sentence",
33
+ "--freeze_feature_encoder=False",
34
+ "--freeze_encoder",
35
+ "--gradient_checkpointing",
36
+ "--fp16",
37
+ "--overwrite_output_dir",
38
+ "--do_train",
39
+ "--do_eval",
40
+ "--predict_with_generate",
41
+ "--use_auth_token",
42
+ "--push_to_hub"
43
+ ],
44
+ "state": "running",
45
+ "program": "run_speech_recognition_seq2seq.py",
46
+ "codePathLocal": "run_speech_recognition_seq2seq.py",
47
+ "codePath": "run_speech_recognition_seq2seq.py",
48
+ "git": {
49
+ "remote": "https://huggingface.co/sanchit-gandhi/distil-large-v3-hi-ft-frozen-encoder",
50
+ "commit": "e7946df277d73ac75c34c2017b01c6d39e0275cd"
51
+ },
52
+ "email": "sanchit@huggingface.co",
53
+ "root": "/home/sanchit/distil-large-v3-hi-ft-frozen-encoder",
54
+ "host": "hf-dgx-01",
55
+ "username": "sanchit",
56
+ "executable": "/home/sanchit/hf/bin/python",
57
+ "cpu_count": 64,
58
+ "cpu_count_logical": 128,
59
+ "cpu_freq": {
60
+ "current": 2055.302406250001,
61
+ "min": 1500.0,
62
+ "max": 2250.0
63
+ },
64
+ "cpu_freq_per_core": [
65
+ {
66
+ "current": 1785.051,
67
+ "min": 1500.0,
68
+ "max": 2250.0
69
+ },
70
+ {
71
+ "current": 1714.027,
72
+ "min": 1500.0,
73
+ "max": 2250.0
74
+ },
75
+ {
76
+ "current": 1716.844,
77
+ "min": 1500.0,
78
+ "max": 2250.0
79
+ },
80
+ {
81
+ "current": 2279.976,
82
+ "min": 1500.0,
83
+ "max": 2250.0
84
+ },
85
+ {
86
+ "current": 3328.296,
87
+ "min": 1500.0,
88
+ "max": 2250.0
89
+ },
90
+ {
91
+ "current": 2778.772,
92
+ "min": 1500.0,
93
+ "max": 2250.0
94
+ },
95
+ {
96
+ "current": 1670.514,
97
+ "min": 1500.0,
98
+ "max": 2250.0
99
+ },
100
+ {
101
+ "current": 1679.17,
102
+ "min": 1500.0,
103
+ "max": 2250.0
104
+ },
105
+ {
106
+ "current": 1666.11,
107
+ "min": 1500.0,
108
+ "max": 2250.0
109
+ },
110
+ {
111
+ "current": 3342.154,
112
+ "min": 1500.0,
113
+ "max": 2250.0
114
+ },
115
+ {
116
+ "current": 1667.105,
117
+ "min": 1500.0,
118
+ "max": 2250.0
119
+ },
120
+ {
121
+ "current": 1667.65,
122
+ "min": 1500.0,
123
+ "max": 2250.0
124
+ },
125
+ {
126
+ "current": 1665.492,
127
+ "min": 1500.0,
128
+ "max": 2250.0
129
+ },
130
+ {
131
+ "current": 3344.267,
132
+ "min": 1500.0,
133
+ "max": 2250.0
134
+ },
135
+ {
136
+ "current": 1991.129,
137
+ "min": 1500.0,
138
+ "max": 2250.0
139
+ },
140
+ {
141
+ "current": 1669.832,
142
+ "min": 1500.0,
143
+ "max": 2250.0
144
+ },
145
+ {
146
+ "current": 1681.287,
147
+ "min": 1500.0,
148
+ "max": 2250.0
149
+ },
150
+ {
151
+ "current": 1667.975,
152
+ "min": 1500.0,
153
+ "max": 2250.0
154
+ },
155
+ {
156
+ "current": 1665.691,
157
+ "min": 1500.0,
158
+ "max": 2250.0
159
+ },
160
+ {
161
+ "current": 3343.923,
162
+ "min": 1500.0,
163
+ "max": 2250.0
164
+ },
165
+ {
166
+ "current": 1714.613,
167
+ "min": 1500.0,
168
+ "max": 2250.0
169
+ },
170
+ {
171
+ "current": 1711.757,
172
+ "min": 1500.0,
173
+ "max": 2250.0
174
+ },
175
+ {
176
+ "current": 2605.829,
177
+ "min": 1500.0,
178
+ "max": 2250.0
179
+ },
180
+ {
181
+ "current": 1715.095,
182
+ "min": 1500.0,
183
+ "max": 2250.0
184
+ },
185
+ {
186
+ "current": 1868.607,
187
+ "min": 1500.0,
188
+ "max": 2250.0
189
+ },
190
+ {
191
+ "current": 1761.225,
192
+ "min": 1500.0,
193
+ "max": 2250.0
194
+ },
195
+ {
196
+ "current": 1722.608,
197
+ "min": 1500.0,
198
+ "max": 2250.0
199
+ },
200
+ {
201
+ "current": 2446.611,
202
+ "min": 1500.0,
203
+ "max": 2250.0
204
+ },
205
+ {
206
+ "current": 1795.419,
207
+ "min": 1500.0,
208
+ "max": 2250.0
209
+ },
210
+ {
211
+ "current": 1794.485,
212
+ "min": 1500.0,
213
+ "max": 2250.0
214
+ },
215
+ {
216
+ "current": 1796.608,
217
+ "min": 1500.0,
218
+ "max": 2250.0
219
+ },
220
+ {
221
+ "current": 1774.005,
222
+ "min": 1500.0,
223
+ "max": 2250.0
224
+ },
225
+ {
226
+ "current": 3318.879,
227
+ "min": 1500.0,
228
+ "max": 2250.0
229
+ },
230
+ {
231
+ "current": 3356.822,
232
+ "min": 1500.0,
233
+ "max": 2250.0
234
+ },
235
+ {
236
+ "current": 1674.163,
237
+ "min": 1500.0,
238
+ "max": 2250.0
239
+ },
240
+ {
241
+ "current": 1676.528,
242
+ "min": 1500.0,
243
+ "max": 2250.0
244
+ },
245
+ {
246
+ "current": 3346.209,
247
+ "min": 1500.0,
248
+ "max": 2250.0
249
+ },
250
+ {
251
+ "current": 1679.409,
252
+ "min": 1500.0,
253
+ "max": 2250.0
254
+ },
255
+ {
256
+ "current": 1673.8,
257
+ "min": 1500.0,
258
+ "max": 2250.0
259
+ },
260
+ {
261
+ "current": 1675.44,
262
+ "min": 1500.0,
263
+ "max": 2250.0
264
+ },
265
+ {
266
+ "current": 1722.367,
267
+ "min": 1500.0,
268
+ "max": 2250.0
269
+ },
270
+ {
271
+ "current": 1721.361,
272
+ "min": 1500.0,
273
+ "max": 2250.0
274
+ },
275
+ {
276
+ "current": 1719.425,
277
+ "min": 1500.0,
278
+ "max": 2250.0
279
+ },
280
+ {
281
+ "current": 2130.252,
282
+ "min": 1500.0,
283
+ "max": 2250.0
284
+ },
285
+ {
286
+ "current": 1792.838,
287
+ "min": 1500.0,
288
+ "max": 2250.0
289
+ },
290
+ {
291
+ "current": 1794.535,
292
+ "min": 1500.0,
293
+ "max": 2250.0
294
+ },
295
+ {
296
+ "current": 1793.69,
297
+ "min": 1500.0,
298
+ "max": 2250.0
299
+ },
300
+ {
301
+ "current": 1792.257,
302
+ "min": 1500.0,
303
+ "max": 2250.0
304
+ },
305
+ {
306
+ "current": 1794.417,
307
+ "min": 1500.0,
308
+ "max": 2250.0
309
+ },
310
+ {
311
+ "current": 1794.058,
312
+ "min": 1500.0,
313
+ "max": 2250.0
314
+ },
315
+ {
316
+ "current": 1795.79,
317
+ "min": 1500.0,
318
+ "max": 2250.0
319
+ },
320
+ {
321
+ "current": 1795.254,
322
+ "min": 1500.0,
323
+ "max": 2250.0
324
+ },
325
+ {
326
+ "current": 1665.759,
327
+ "min": 1500.0,
328
+ "max": 2250.0
329
+ },
330
+ {
331
+ "current": 3285.802,
332
+ "min": 1500.0,
333
+ "max": 2250.0
334
+ },
335
+ {
336
+ "current": 3327.549,
337
+ "min": 1500.0,
338
+ "max": 2250.0
339
+ },
340
+ {
341
+ "current": 1666.454,
342
+ "min": 1500.0,
343
+ "max": 2250.0
344
+ },
345
+ {
346
+ "current": 2291.123,
347
+ "min": 1500.0,
348
+ "max": 2250.0
349
+ },
350
+ {
351
+ "current": 1845.287,
352
+ "min": 1500.0,
353
+ "max": 2250.0
354
+ },
355
+ {
356
+ "current": 1737.335,
357
+ "min": 1500.0,
358
+ "max": 2250.0
359
+ },
360
+ {
361
+ "current": 1736.798,
362
+ "min": 1500.0,
363
+ "max": 2250.0
364
+ },
365
+ {
366
+ "current": 1665.761,
367
+ "min": 1500.0,
368
+ "max": 2250.0
369
+ },
370
+ {
371
+ "current": 2172.943,
372
+ "min": 1500.0,
373
+ "max": 2250.0
374
+ },
375
+ {
376
+ "current": 2109.384,
377
+ "min": 1500.0,
378
+ "max": 2250.0
379
+ },
380
+ {
381
+ "current": 1666.332,
382
+ "min": 1500.0,
383
+ "max": 2250.0
384
+ },
385
+ {
386
+ "current": 2173.511,
387
+ "min": 1500.0,
388
+ "max": 2250.0
389
+ },
390
+ {
391
+ "current": 2187.364,
392
+ "min": 1500.0,
393
+ "max": 2250.0
394
+ },
395
+ {
396
+ "current": 2152.119,
397
+ "min": 1500.0,
398
+ "max": 2250.0
399
+ },
400
+ {
401
+ "current": 3315.314,
402
+ "min": 1500.0,
403
+ "max": 2250.0
404
+ },
405
+ {
406
+ "current": 3333.811,
407
+ "min": 1500.0,
408
+ "max": 2250.0
409
+ },
410
+ {
411
+ "current": 1966.133,
412
+ "min": 1500.0,
413
+ "max": 2250.0
414
+ },
415
+ {
416
+ "current": 1730.353,
417
+ "min": 1500.0,
418
+ "max": 2250.0
419
+ },
420
+ {
421
+ "current": 2183.755,
422
+ "min": 1500.0,
423
+ "max": 2250.0
424
+ },
425
+ {
426
+ "current": 1836.471,
427
+ "min": 1500.0,
428
+ "max": 2250.0
429
+ },
430
+ {
431
+ "current": 3319.659,
432
+ "min": 1500.0,
433
+ "max": 2250.0
434
+ },
435
+ {
436
+ "current": 1666.1,
437
+ "min": 1500.0,
438
+ "max": 2250.0
439
+ },
440
+ {
441
+ "current": 3156.296,
442
+ "min": 1500.0,
443
+ "max": 2250.0
444
+ },
445
+ {
446
+ "current": 1661.519,
447
+ "min": 1500.0,
448
+ "max": 2250.0
449
+ },
450
+ {
451
+ "current": 3259.492,
452
+ "min": 1500.0,
453
+ "max": 2250.0
454
+ },
455
+ {
456
+ "current": 2267.628,
457
+ "min": 1500.0,
458
+ "max": 2250.0
459
+ },
460
+ {
461
+ "current": 1666.441,
462
+ "min": 1500.0,
463
+ "max": 2250.0
464
+ },
465
+ {
466
+ "current": 1666.329,
467
+ "min": 1500.0,
468
+ "max": 2250.0
469
+ },
470
+ {
471
+ "current": 1661.974,
472
+ "min": 1500.0,
473
+ "max": 2250.0
474
+ },
475
+ {
476
+ "current": 1663.269,
477
+ "min": 1500.0,
478
+ "max": 2250.0
479
+ },
480
+ {
481
+ "current": 3315.801,
482
+ "min": 1500.0,
483
+ "max": 2250.0
484
+ },
485
+ {
486
+ "current": 1663.948,
487
+ "min": 1500.0,
488
+ "max": 2250.0
489
+ },
490
+ {
491
+ "current": 1665.171,
492
+ "min": 1500.0,
493
+ "max": 2250.0
494
+ },
495
+ {
496
+ "current": 3147.217,
497
+ "min": 1500.0,
498
+ "max": 2250.0
499
+ },
500
+ {
501
+ "current": 1663.184,
502
+ "min": 1500.0,
503
+ "max": 2250.0
504
+ },
505
+ {
506
+ "current": 2145.9,
507
+ "min": 1500.0,
508
+ "max": 2250.0
509
+ },
510
+ {
511
+ "current": 2302.183,
512
+ "min": 1500.0,
513
+ "max": 2250.0
514
+ },
515
+ {
516
+ "current": 1664.105,
517
+ "min": 1500.0,
518
+ "max": 2250.0
519
+ },
520
+ {
521
+ "current": 1812.149,
522
+ "min": 1500.0,
523
+ "max": 2250.0
524
+ },
525
+ {
526
+ "current": 1739.416,
527
+ "min": 1500.0,
528
+ "max": 2250.0
529
+ },
530
+ {
531
+ "current": 1735.942,
532
+ "min": 1500.0,
533
+ "max": 2250.0
534
+ },
535
+ {
536
+ "current": 1735.725,
537
+ "min": 1500.0,
538
+ "max": 2250.0
539
+ },
540
+ {
541
+ "current": 1846.358,
542
+ "min": 1500.0,
543
+ "max": 2250.0
544
+ },
545
+ {
546
+ "current": 3324.686,
547
+ "min": 1500.0,
548
+ "max": 2250.0
549
+ },
550
+ {
551
+ "current": 3313.397,
552
+ "min": 1500.0,
553
+ "max": 2250.0
554
+ },
555
+ {
556
+ "current": 1664.908,
557
+ "min": 1500.0,
558
+ "max": 2250.0
559
+ },
560
+ {
561
+ "current": 1666.332,
562
+ "min": 1500.0,
563
+ "max": 2250.0
564
+ },
565
+ {
566
+ "current": 3357.779,
567
+ "min": 1500.0,
568
+ "max": 2250.0
569
+ },
570
+ {
571
+ "current": 2196.764,
572
+ "min": 1500.0,
573
+ "max": 2250.0
574
+ },
575
+ {
576
+ "current": 1629.818,
577
+ "min": 1500.0,
578
+ "max": 2250.0
579
+ },
580
+ {
581
+ "current": 1548.482,
582
+ "min": 1500.0,
583
+ "max": 2250.0
584
+ },
585
+ {
586
+ "current": 1550.408,
587
+ "min": 1500.0,
588
+ "max": 2250.0
589
+ },
590
+ {
591
+ "current": 1551.062,
592
+ "min": 1500.0,
593
+ "max": 2250.0
594
+ },
595
+ {
596
+ "current": 1733.314,
597
+ "min": 1500.0,
598
+ "max": 2250.0
599
+ },
600
+ {
601
+ "current": 2797.062,
602
+ "min": 1500.0,
603
+ "max": 2250.0
604
+ },
605
+ {
606
+ "current": 1556.743,
607
+ "min": 1500.0,
608
+ "max": 2250.0
609
+ },
610
+ {
611
+ "current": 2040.048,
612
+ "min": 1500.0,
613
+ "max": 2250.0
614
+ },
615
+ {
616
+ "current": 1557.951,
617
+ "min": 1500.0,
618
+ "max": 2250.0
619
+ },
620
+ {
621
+ "current": 1548.578,
622
+ "min": 1500.0,
623
+ "max": 2250.0
624
+ },
625
+ {
626
+ "current": 2204.931,
627
+ "min": 1500.0,
628
+ "max": 2250.0
629
+ },
630
+ {
631
+ "current": 2191.05,
632
+ "min": 1500.0,
633
+ "max": 2250.0
634
+ },
635
+ {
636
+ "current": 2206.083,
637
+ "min": 1500.0,
638
+ "max": 2250.0
639
+ },
640
+ {
641
+ "current": 2179.071,
642
+ "min": 1500.0,
643
+ "max": 2250.0
644
+ },
645
+ {
646
+ "current": 2137.078,
647
+ "min": 1500.0,
648
+ "max": 2250.0
649
+ },
650
+ {
651
+ "current": 2908.115,
652
+ "min": 1500.0,
653
+ "max": 2250.0
654
+ },
655
+ {
656
+ "current": 3329.978,
657
+ "min": 1500.0,
658
+ "max": 2250.0
659
+ },
660
+ {
661
+ "current": 2146.791,
662
+ "min": 1500.0,
663
+ "max": 2250.0
664
+ },
665
+ {
666
+ "current": 1835.104,
667
+ "min": 1500.0,
668
+ "max": 2250.0
669
+ },
670
+ {
671
+ "current": 3279.409,
672
+ "min": 1500.0,
673
+ "max": 2250.0
674
+ },
675
+ {
676
+ "current": 2340.495,
677
+ "min": 1500.0,
678
+ "max": 2250.0
679
+ },
680
+ {
681
+ "current": 2350.105,
682
+ "min": 1500.0,
683
+ "max": 2250.0
684
+ },
685
+ {
686
+ "current": 1860.834,
687
+ "min": 1500.0,
688
+ "max": 2250.0
689
+ },
690
+ {
691
+ "current": 1928.589,
692
+ "min": 1500.0,
693
+ "max": 2250.0
694
+ },
695
+ {
696
+ "current": 3299.491,
697
+ "min": 1500.0,
698
+ "max": 2250.0
699
+ },
700
+ {
701
+ "current": 1897.289,
702
+ "min": 1500.0,
703
+ "max": 2250.0
704
+ }
705
+ ],
706
+ "disk": {
707
+ "/": {
708
+ "total": 1757.8785285949707,
709
+ "used": 1610.3510513305664
710
+ }
711
+ },
712
+ "gpu": "NVIDIA A100-SXM4-80GB",
713
+ "gpu_count": 5,
714
+ "gpu_devices": [
715
+ {
716
+ "name": "NVIDIA A100-SXM4-80GB",
717
+ "memory_total": 85899345920
718
+ },
719
+ {
720
+ "name": "NVIDIA A100-SXM4-80GB",
721
+ "memory_total": 85899345920
722
+ },
723
+ {
724
+ "name": "NVIDIA A100-SXM4-80GB",
725
+ "memory_total": 85899345920
726
+ },
727
+ {
728
+ "name": "NVIDIA DGX Display",
729
+ "memory_total": 4294967296
730
+ },
731
+ {
732
+ "name": "NVIDIA A100-SXM4-80GB",
733
+ "memory_total": 85899345920
734
+ }
735
+ ],
736
+ "memory": {
737
+ "total": 503.5396919250488
738
+ }
739
+ }
wandb/run-20240327_190513-7p2x8a0l/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/loss": 0.1059, "train/grad_norm": 1.7170511484146118, "train/learning_rate": 8.78888888888889e-05, "train/epoch": 4.73, "train/global_step": 1050, "_timestamp": 1711565906.6150107, "_runtime": 3192.9672026634216, "_step": 42, "eval/loss": 0.30151915550231934, "eval/wer": 0.3249633006258209, "eval/runtime": 1360.3971, "eval/samples_per_second": 2.296, "eval/steps_per_second": 0.072}
wandb/run-20240327_190513-7p2x8a0l/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/run-20240327_190513-7p2x8a0l/logs/debug.log ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_setup.py:_flush():76] Current SDK version is 0.16.2
2
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_setup.py:_flush():76] Configure stats pid to 1894903
3
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_setup.py:_flush():76] Loading settings from /home/sanchit/.config/wandb/settings
4
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_setup.py:_flush():76] Loading settings from /home/sanchit/distil-large-v3-hi-ft-frozen-encoder/wandb/settings
5
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
6
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_speech_recognition_seq2seq.py', 'program_abspath': '/home/sanchit/distil-large-v3-hi-ft-frozen-encoder/run_speech_recognition_seq2seq.py', 'program': 'run_speech_recognition_seq2seq.py'}
8
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_init.py:_log_setup():526] Logging user logs to /home/sanchit/distil-large-v3-hi-ft-frozen-encoder/wandb/run-20240327_190513-7p2x8a0l/logs/debug.log
9
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_init.py:_log_setup():527] Logging internal logs to /home/sanchit/distil-large-v3-hi-ft-frozen-encoder/wandb/run-20240327_190513-7p2x8a0l/logs/debug-internal.log
10
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_init.py:init():566] calling init triggers
11
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_init.py:init():573] wandb.init called with sweep_config: {}
12
+ config: {}
13
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_init.py:init():616] starting backend
14
+ 2024-03-27 19:05:13,645 INFO MainThread:1894903 [wandb_init.py:init():620] setting up manager
15
+ 2024-03-27 19:05:13,646 INFO MainThread:1894903 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
16
+ 2024-03-27 19:05:13,647 INFO MainThread:1894903 [wandb_init.py:init():628] backend started and connected
17
+ 2024-03-27 19:05:13,651 INFO MainThread:1894903 [wandb_init.py:init():720] updated telemetry
18
+ 2024-03-27 19:05:13,720 INFO MainThread:1894903 [wandb_init.py:init():753] communicating run to backend with 90.0 second timeout
19
+ 2024-03-27 19:05:14,027 INFO MainThread:1894903 [wandb_run.py:_on_init():2254] communicating current version
20
+ 2024-03-27 19:05:14,056 INFO MainThread:1894903 [wandb_run.py:_on_init():2263] got version response upgrade_message: "wandb version 0.16.5 is available! To upgrade, please run:\n $ pip install wandb --upgrade"
21
+
22
+ 2024-03-27 19:05:14,056 INFO MainThread:1894903 [wandb_init.py:init():804] starting run threads in backend
23
+ 2024-03-27 19:05:14,727 INFO MainThread:1894903 [wandb_run.py:_console_start():2233] atexit reg
24
+ 2024-03-27 19:05:14,727 INFO MainThread:1894903 [wandb_run.py:_redirect():2088] redirect: wrap_raw
25
+ 2024-03-27 19:05:14,727 INFO MainThread:1894903 [wandb_run.py:_redirect():2153] Wrapping output streams.
26
+ 2024-03-27 19:05:14,727 INFO MainThread:1894903 [wandb_run.py:_redirect():2178] Redirects installed.
27
+ 2024-03-27 19:05:14,727 INFO MainThread:1894903 [wandb_init.py:init():847] run started, returning control to user process
28
+ 2024-03-27 19:05:14,729 INFO MainThread:1894903 [wandb_run.py:_config_callback():1342] config_cb None None {'vocab_size': 51866, 'num_mel_bins': 128, 'd_model': 1280, 'encoder_layers': 32, 'encoder_attention_heads': 20, 'decoder_layers': 2, 'decoder_attention_heads': 20, 'decoder_ffn_dim': 5120, 'encoder_ffn_dim': 5120, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.0, 'activation_function': 'gelu', 'init_std': 0.02, 'encoder_layerdrop': 0.0, 'decoder_layerdrop': 0.0, 'use_cache': True, 'num_hidden_layers': 32, 'scale_embedding': False, 'max_source_positions': 1500, 'max_target_positions': 448, 'classifier_proj_size': 256, 'use_weighted_layer_sum': False, 'apply_spec_augment': False, 'mask_time_prob': 0.05, 'mask_time_length': 10, 'mask_time_min_masks': 2, 'mask_feature_prob': 0.0, 'mask_feature_length': 10, 'mask_feature_min_masks': 0, 'median_filter_width': 7, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': True, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 448, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': [220, 50257], 'architectures': ['WhisperForConditionalGeneration'], 'finetuning_task': 
None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 50257, 'pad_token_id': 50256, 'eos_token_id': 50257, 'sep_token_id': None, 'decoder_start_token_id': 50258, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'distil-whisper/distil-large-v3', 'transformers_version': '4.40.0.dev0', 'model_type': 'whisper', 'forced_decoder_ids': None, 'output_dir': './', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'evaluation_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 5000, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 500, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Mar27_19-04-58_hf-dgx-01', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 25, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 1000, 'dataloader_num_workers': 4, 
'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'input_length', 'report_to': ['tensorboard', 'wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'sortish_sampler': False, 'predict_with_generate': True, 'generation_max_length': 225, 'generation_num_beams': None, 'generation_config': 
None}
wandb/run-20240327_190513-7p2x8a0l/run-7p2x8a0l.wandb ADDED
Binary file (865 kB). View file