RodrigoFardin commited on
Commit
fdf4d85
1 Parent(s): 0520b5c

End of training

Browse files
Files changed (3) hide show
  1. all_results.json +8 -0
  2. test_results.json +8 -0
  3. trainer_state.json +1038 -0
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 4.005917159763314,
3
+ "eval_accuracy": 0.890927624872579,
4
+ "eval_loss": 0.4576520621776581,
5
+ "eval_runtime": 1124.6542,
6
+ "eval_samples_per_second": 0.872,
7
+ "eval_steps_per_second": 0.437
8
+ }
test_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 4.005917159763314,
3
+ "eval_accuracy": 0.890927624872579,
4
+ "eval_loss": 0.4576520621776581,
5
+ "eval_runtime": 1124.6542,
6
+ "eval_samples_per_second": 0.872,
7
+ "eval_steps_per_second": 0.437
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1038 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8978675645342312,
3
+ "best_model_checkpoint": "videomae-base-finetuned-dd\\checkpoint-1344",
4
+ "epoch": 4.005917159763314,
5
+ "eval_steps": 500,
6
+ "global_step": 1352,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0073964497041420114,
13
+ "grad_norm": 22.897619247436523,
14
+ "learning_rate": 3.6764705882352942e-06,
15
+ "loss": 0.7648,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.014792899408284023,
20
+ "grad_norm": 6.239708423614502,
21
+ "learning_rate": 7.3529411764705884e-06,
22
+ "loss": 0.6619,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.022189349112426034,
27
+ "grad_norm": 11.775946617126465,
28
+ "learning_rate": 1.1029411764705883e-05,
29
+ "loss": 0.6844,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.029585798816568046,
34
+ "grad_norm": 12.948445320129395,
35
+ "learning_rate": 1.4705882352941177e-05,
36
+ "loss": 0.6158,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.03698224852071006,
41
+ "grad_norm": 18.087614059448242,
42
+ "learning_rate": 1.8382352941176472e-05,
43
+ "loss": 0.6569,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.04437869822485207,
48
+ "grad_norm": 10.075726509094238,
49
+ "learning_rate": 2.2058823529411766e-05,
50
+ "loss": 0.6572,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.051775147928994084,
55
+ "grad_norm": 4.17144250869751,
56
+ "learning_rate": 2.5735294117647057e-05,
57
+ "loss": 0.3788,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.05917159763313609,
62
+ "grad_norm": 24.00593376159668,
63
+ "learning_rate": 2.9411764705882354e-05,
64
+ "loss": 0.5752,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.06656804733727811,
69
+ "grad_norm": 0.1779569834470749,
70
+ "learning_rate": 3.308823529411765e-05,
71
+ "loss": 0.488,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.07396449704142012,
76
+ "grad_norm": 2.1989564895629883,
77
+ "learning_rate": 3.6764705882352945e-05,
78
+ "loss": 1.0225,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.08136094674556213,
83
+ "grad_norm": 16.50670623779297,
84
+ "learning_rate": 4.044117647058824e-05,
85
+ "loss": 1.0776,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.08875739644970414,
90
+ "grad_norm": 36.278076171875,
91
+ "learning_rate": 4.411764705882353e-05,
92
+ "loss": 1.1623,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.09615384615384616,
97
+ "grad_norm": 8.829483032226562,
98
+ "learning_rate": 4.7794117647058826e-05,
99
+ "loss": 0.9484,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.10355029585798817,
104
+ "grad_norm": 0.47137251496315,
105
+ "learning_rate": 4.983552631578948e-05,
106
+ "loss": 0.2775,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.11094674556213018,
111
+ "grad_norm": 1.3937591314315796,
112
+ "learning_rate": 4.942434210526316e-05,
113
+ "loss": 0.959,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.11834319526627218,
118
+ "grad_norm": 14.168859481811523,
119
+ "learning_rate": 4.901315789473684e-05,
120
+ "loss": 1.2776,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.1257396449704142,
125
+ "grad_norm": 209.46127319335938,
126
+ "learning_rate": 4.860197368421053e-05,
127
+ "loss": 1.1028,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.13313609467455623,
132
+ "grad_norm": 0.22284063696861267,
133
+ "learning_rate": 4.819078947368421e-05,
134
+ "loss": 0.2779,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.14053254437869822,
139
+ "grad_norm": 0.16761070489883423,
140
+ "learning_rate": 4.7779605263157896e-05,
141
+ "loss": 1.05,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.14792899408284024,
146
+ "grad_norm": 33.303768157958984,
147
+ "learning_rate": 4.736842105263158e-05,
148
+ "loss": 0.5783,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.15532544378698224,
153
+ "grad_norm": 1.759582281112671,
154
+ "learning_rate": 4.6957236842105265e-05,
155
+ "loss": 0.5052,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.16272189349112426,
160
+ "grad_norm": 0.16995219886302948,
161
+ "learning_rate": 4.654605263157895e-05,
162
+ "loss": 0.8945,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.17011834319526628,
167
+ "grad_norm": 5.4278740882873535,
168
+ "learning_rate": 4.6134868421052635e-05,
169
+ "loss": 1.1972,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.17751479289940827,
174
+ "grad_norm": 5.3494038581848145,
175
+ "learning_rate": 4.572368421052632e-05,
176
+ "loss": 0.5657,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.1849112426035503,
181
+ "grad_norm": 38.29714584350586,
182
+ "learning_rate": 4.5312500000000004e-05,
183
+ "loss": 0.6919,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.19230769230769232,
188
+ "grad_norm": 31.136438369750977,
189
+ "learning_rate": 4.490131578947369e-05,
190
+ "loss": 0.4502,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.1997041420118343,
195
+ "grad_norm": 16.622379302978516,
196
+ "learning_rate": 4.449013157894737e-05,
197
+ "loss": 0.4002,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.20710059171597633,
202
+ "grad_norm": 0.7369871735572815,
203
+ "learning_rate": 4.407894736842105e-05,
204
+ "loss": 0.7867,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.21449704142011836,
209
+ "grad_norm": 1.566986083984375,
210
+ "learning_rate": 4.3667763157894735e-05,
211
+ "loss": 0.9427,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.22189349112426035,
216
+ "grad_norm": 20.51339340209961,
217
+ "learning_rate": 4.3256578947368426e-05,
218
+ "loss": 1.3318,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 0.22928994082840237,
223
+ "grad_norm": 16.819499969482422,
224
+ "learning_rate": 4.284539473684211e-05,
225
+ "loss": 0.7456,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.23668639053254437,
230
+ "grad_norm": 0.06786404550075531,
231
+ "learning_rate": 4.2434210526315796e-05,
232
+ "loss": 0.0428,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 0.2440828402366864,
237
+ "grad_norm": 18.684722900390625,
238
+ "learning_rate": 4.202302631578947e-05,
239
+ "loss": 0.5777,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 0.2485207100591716,
244
+ "eval_accuracy": 0.8372615039281706,
245
+ "eval_loss": 0.8213610649108887,
246
+ "eval_runtime": 1015.4502,
247
+ "eval_samples_per_second": 0.877,
248
+ "eval_steps_per_second": 0.439,
249
+ "step": 336
250
+ },
251
+ {
252
+ "epoch": 1.0029585798816567,
253
+ "grad_norm": 0.05207992345094681,
254
+ "learning_rate": 4.161184210526316e-05,
255
+ "loss": 0.3122,
256
+ "step": 340
257
+ },
258
+ {
259
+ "epoch": 1.0103550295857988,
260
+ "grad_norm": 0.08761809021234512,
261
+ "learning_rate": 4.120065789473684e-05,
262
+ "loss": 0.8974,
263
+ "step": 350
264
+ },
265
+ {
266
+ "epoch": 1.017751479289941,
267
+ "grad_norm": 2.162889003753662,
268
+ "learning_rate": 4.078947368421053e-05,
269
+ "loss": 0.5468,
270
+ "step": 360
271
+ },
272
+ {
273
+ "epoch": 1.0251479289940828,
274
+ "grad_norm": 0.3578147888183594,
275
+ "learning_rate": 4.037828947368421e-05,
276
+ "loss": 0.4065,
277
+ "step": 370
278
+ },
279
+ {
280
+ "epoch": 1.032544378698225,
281
+ "grad_norm": 0.1941654235124588,
282
+ "learning_rate": 3.9967105263157896e-05,
283
+ "loss": 0.5691,
284
+ "step": 380
285
+ },
286
+ {
287
+ "epoch": 1.0399408284023668,
288
+ "grad_norm": 0.4009058475494385,
289
+ "learning_rate": 3.955592105263158e-05,
290
+ "loss": 1.0974,
291
+ "step": 390
292
+ },
293
+ {
294
+ "epoch": 1.047337278106509,
295
+ "grad_norm": 1.8374804258346558,
296
+ "learning_rate": 3.9144736842105265e-05,
297
+ "loss": 0.3989,
298
+ "step": 400
299
+ },
300
+ {
301
+ "epoch": 1.054733727810651,
302
+ "grad_norm": 0.1476195603609085,
303
+ "learning_rate": 3.873355263157895e-05,
304
+ "loss": 0.9985,
305
+ "step": 410
306
+ },
307
+ {
308
+ "epoch": 1.0621301775147929,
309
+ "grad_norm": 0.23249651491641998,
310
+ "learning_rate": 3.8322368421052634e-05,
311
+ "loss": 0.0116,
312
+ "step": 420
313
+ },
314
+ {
315
+ "epoch": 1.069526627218935,
316
+ "grad_norm": 120.95174407958984,
317
+ "learning_rate": 3.791118421052632e-05,
318
+ "loss": 0.2631,
319
+ "step": 430
320
+ },
321
+ {
322
+ "epoch": 1.0769230769230769,
323
+ "grad_norm": 0.18491333723068237,
324
+ "learning_rate": 3.7500000000000003e-05,
325
+ "loss": 0.6313,
326
+ "step": 440
327
+ },
328
+ {
329
+ "epoch": 1.084319526627219,
330
+ "grad_norm": 139.33804321289062,
331
+ "learning_rate": 3.708881578947369e-05,
332
+ "loss": 0.8515,
333
+ "step": 450
334
+ },
335
+ {
336
+ "epoch": 1.0917159763313609,
337
+ "grad_norm": 0.22990567982196808,
338
+ "learning_rate": 3.6677631578947366e-05,
339
+ "loss": 0.7119,
340
+ "step": 460
341
+ },
342
+ {
343
+ "epoch": 1.099112426035503,
344
+ "grad_norm": 1.84480881690979,
345
+ "learning_rate": 3.626644736842105e-05,
346
+ "loss": 0.2762,
347
+ "step": 470
348
+ },
349
+ {
350
+ "epoch": 1.106508875739645,
351
+ "grad_norm": 12.12067699432373,
352
+ "learning_rate": 3.5855263157894735e-05,
353
+ "loss": 0.3234,
354
+ "step": 480
355
+ },
356
+ {
357
+ "epoch": 1.113905325443787,
358
+ "grad_norm": 25.058439254760742,
359
+ "learning_rate": 3.5444078947368426e-05,
360
+ "loss": 0.869,
361
+ "step": 490
362
+ },
363
+ {
364
+ "epoch": 1.121301775147929,
365
+ "grad_norm": 20.12900161743164,
366
+ "learning_rate": 3.503289473684211e-05,
367
+ "loss": 1.085,
368
+ "step": 500
369
+ },
370
+ {
371
+ "epoch": 1.128698224852071,
372
+ "grad_norm": 12.086252212524414,
373
+ "learning_rate": 3.4621710526315795e-05,
374
+ "loss": 0.7208,
375
+ "step": 510
376
+ },
377
+ {
378
+ "epoch": 1.136094674556213,
379
+ "grad_norm": 25.66025161743164,
380
+ "learning_rate": 3.421052631578947e-05,
381
+ "loss": 0.6842,
382
+ "step": 520
383
+ },
384
+ {
385
+ "epoch": 1.143491124260355,
386
+ "grad_norm": 15.72556209564209,
387
+ "learning_rate": 3.379934210526316e-05,
388
+ "loss": 0.7922,
389
+ "step": 530
390
+ },
391
+ {
392
+ "epoch": 1.150887573964497,
393
+ "grad_norm": 0.22508259117603302,
394
+ "learning_rate": 3.338815789473684e-05,
395
+ "loss": 0.4978,
396
+ "step": 540
397
+ },
398
+ {
399
+ "epoch": 1.1582840236686391,
400
+ "grad_norm": 0.3129032254219055,
401
+ "learning_rate": 3.297697368421053e-05,
402
+ "loss": 0.9733,
403
+ "step": 550
404
+ },
405
+ {
406
+ "epoch": 1.165680473372781,
407
+ "grad_norm": 2.5810675621032715,
408
+ "learning_rate": 3.256578947368421e-05,
409
+ "loss": 1.0189,
410
+ "step": 560
411
+ },
412
+ {
413
+ "epoch": 1.1730769230769231,
414
+ "grad_norm": 0.23227569460868835,
415
+ "learning_rate": 3.2154605263157896e-05,
416
+ "loss": 0.1902,
417
+ "step": 570
418
+ },
419
+ {
420
+ "epoch": 1.180473372781065,
421
+ "grad_norm": 41.80963134765625,
422
+ "learning_rate": 3.174342105263158e-05,
423
+ "loss": 0.676,
424
+ "step": 580
425
+ },
426
+ {
427
+ "epoch": 1.1878698224852071,
428
+ "grad_norm": 0.23230111598968506,
429
+ "learning_rate": 3.1332236842105265e-05,
430
+ "loss": 0.7534,
431
+ "step": 590
432
+ },
433
+ {
434
+ "epoch": 1.195266272189349,
435
+ "grad_norm": 0.22635290026664734,
436
+ "learning_rate": 3.092105263157895e-05,
437
+ "loss": 0.4239,
438
+ "step": 600
439
+ },
440
+ {
441
+ "epoch": 1.202662721893491,
442
+ "grad_norm": 1.2736923694610596,
443
+ "learning_rate": 3.0509868421052634e-05,
444
+ "loss": 0.1946,
445
+ "step": 610
446
+ },
447
+ {
448
+ "epoch": 1.2100591715976332,
449
+ "grad_norm": 138.90115356445312,
450
+ "learning_rate": 3.009868421052632e-05,
451
+ "loss": 0.3972,
452
+ "step": 620
453
+ },
454
+ {
455
+ "epoch": 1.217455621301775,
456
+ "grad_norm": 0.8171183466911316,
457
+ "learning_rate": 2.96875e-05,
458
+ "loss": 0.3475,
459
+ "step": 630
460
+ },
461
+ {
462
+ "epoch": 1.2248520710059172,
463
+ "grad_norm": 0.14913396537303925,
464
+ "learning_rate": 2.9276315789473684e-05,
465
+ "loss": 0.0097,
466
+ "step": 640
467
+ },
468
+ {
469
+ "epoch": 1.232248520710059,
470
+ "grad_norm": 0.16601596772670746,
471
+ "learning_rate": 2.886513157894737e-05,
472
+ "loss": 0.7906,
473
+ "step": 650
474
+ },
475
+ {
476
+ "epoch": 1.2396449704142012,
477
+ "grad_norm": 0.0358225516974926,
478
+ "learning_rate": 2.8453947368421054e-05,
479
+ "loss": 1.0754,
480
+ "step": 660
481
+ },
482
+ {
483
+ "epoch": 1.2470414201183433,
484
+ "grad_norm": 18.65558433532715,
485
+ "learning_rate": 2.8042763157894735e-05,
486
+ "loss": 1.144,
487
+ "step": 670
488
+ },
489
+ {
490
+ "epoch": 1.2492603550295858,
491
+ "eval_accuracy": 0.8754208754208754,
492
+ "eval_loss": 0.4453337490558624,
493
+ "eval_runtime": 1018.2198,
494
+ "eval_samples_per_second": 0.875,
495
+ "eval_steps_per_second": 0.438,
496
+ "step": 673
497
+ },
498
+ {
499
+ "epoch": 2.0051775147928996,
500
+ "grad_norm": 0.26922106742858887,
501
+ "learning_rate": 2.7631578947368426e-05,
502
+ "loss": 0.0545,
503
+ "step": 680
504
+ },
505
+ {
506
+ "epoch": 2.0125739644970415,
507
+ "grad_norm": 0.08718305081129074,
508
+ "learning_rate": 2.7220394736842107e-05,
509
+ "loss": 0.2609,
510
+ "step": 690
511
+ },
512
+ {
513
+ "epoch": 2.0199704142011834,
514
+ "grad_norm": 0.062128640711307526,
515
+ "learning_rate": 2.6809210526315792e-05,
516
+ "loss": 0.205,
517
+ "step": 700
518
+ },
519
+ {
520
+ "epoch": 2.0273668639053253,
521
+ "grad_norm": 0.03102479875087738,
522
+ "learning_rate": 2.6398026315789476e-05,
523
+ "loss": 0.2414,
524
+ "step": 710
525
+ },
526
+ {
527
+ "epoch": 2.0347633136094676,
528
+ "grad_norm": 0.1195179671049118,
529
+ "learning_rate": 2.598684210526316e-05,
530
+ "loss": 0.8337,
531
+ "step": 720
532
+ },
533
+ {
534
+ "epoch": 2.0421597633136095,
535
+ "grad_norm": 0.18848393857479095,
536
+ "learning_rate": 2.5575657894736842e-05,
537
+ "loss": 0.938,
538
+ "step": 730
539
+ },
540
+ {
541
+ "epoch": 2.0495562130177514,
542
+ "grad_norm": 0.30771324038505554,
543
+ "learning_rate": 2.5164473684210527e-05,
544
+ "loss": 0.5181,
545
+ "step": 740
546
+ },
547
+ {
548
+ "epoch": 2.0569526627218937,
549
+ "grad_norm": 45.85493469238281,
550
+ "learning_rate": 2.4753289473684215e-05,
551
+ "loss": 0.9593,
552
+ "step": 750
553
+ },
554
+ {
555
+ "epoch": 2.0643491124260356,
556
+ "grad_norm": 0.8830978870391846,
557
+ "learning_rate": 2.4342105263157896e-05,
558
+ "loss": 0.0696,
559
+ "step": 760
560
+ },
561
+ {
562
+ "epoch": 2.0717455621301775,
563
+ "grad_norm": 0.23656447231769562,
564
+ "learning_rate": 2.393092105263158e-05,
565
+ "loss": 0.3063,
566
+ "step": 770
567
+ },
568
+ {
569
+ "epoch": 2.0791420118343193,
570
+ "grad_norm": 14.187068939208984,
571
+ "learning_rate": 2.3519736842105265e-05,
572
+ "loss": 1.1477,
573
+ "step": 780
574
+ },
575
+ {
576
+ "epoch": 2.0865384615384617,
577
+ "grad_norm": 20.795085906982422,
578
+ "learning_rate": 2.3108552631578946e-05,
579
+ "loss": 0.6215,
580
+ "step": 790
581
+ },
582
+ {
583
+ "epoch": 2.0939349112426036,
584
+ "grad_norm": 0.27157312631607056,
585
+ "learning_rate": 2.2697368421052634e-05,
586
+ "loss": 0.271,
587
+ "step": 800
588
+ },
589
+ {
590
+ "epoch": 2.1013313609467454,
591
+ "grad_norm": 26.25171661376953,
592
+ "learning_rate": 2.228618421052632e-05,
593
+ "loss": 0.9012,
594
+ "step": 810
595
+ },
596
+ {
597
+ "epoch": 2.1087278106508878,
598
+ "grad_norm": 0.18786455690860748,
599
+ "learning_rate": 2.1875e-05,
600
+ "loss": 0.1285,
601
+ "step": 820
602
+ },
603
+ {
604
+ "epoch": 2.1161242603550297,
605
+ "grad_norm": 0.09338750690221786,
606
+ "learning_rate": 2.1463815789473684e-05,
607
+ "loss": 0.6236,
608
+ "step": 830
609
+ },
610
+ {
611
+ "epoch": 2.1235207100591715,
612
+ "grad_norm": 0.15658584237098694,
613
+ "learning_rate": 2.105263157894737e-05,
614
+ "loss": 0.2534,
615
+ "step": 840
616
+ },
617
+ {
618
+ "epoch": 2.1309171597633134,
619
+ "grad_norm": 0.03897108510136604,
620
+ "learning_rate": 2.0641447368421053e-05,
621
+ "loss": 0.0317,
622
+ "step": 850
623
+ },
624
+ {
625
+ "epoch": 2.1383136094674557,
626
+ "grad_norm": 0.17228557169437408,
627
+ "learning_rate": 2.0230263157894738e-05,
628
+ "loss": 0.5894,
629
+ "step": 860
630
+ },
631
+ {
632
+ "epoch": 2.1457100591715976,
633
+ "grad_norm": 37.38359069824219,
634
+ "learning_rate": 1.9819078947368423e-05,
635
+ "loss": 0.3589,
636
+ "step": 870
637
+ },
638
+ {
639
+ "epoch": 2.1531065088757395,
640
+ "grad_norm": 14.766074180603027,
641
+ "learning_rate": 1.9407894736842107e-05,
642
+ "loss": 0.2452,
643
+ "step": 880
644
+ },
645
+ {
646
+ "epoch": 2.160502958579882,
647
+ "grad_norm": 0.09724285453557968,
648
+ "learning_rate": 1.8996710526315788e-05,
649
+ "loss": 0.4429,
650
+ "step": 890
651
+ },
652
+ {
653
+ "epoch": 2.1678994082840237,
654
+ "grad_norm": 0.20985926687717438,
655
+ "learning_rate": 1.8585526315789476e-05,
656
+ "loss": 0.3037,
657
+ "step": 900
658
+ },
659
+ {
660
+ "epoch": 2.1752958579881656,
661
+ "grad_norm": 17.582971572875977,
662
+ "learning_rate": 1.8174342105263157e-05,
663
+ "loss": 0.8678,
664
+ "step": 910
665
+ },
666
+ {
667
+ "epoch": 2.1826923076923075,
668
+ "grad_norm": 0.10595466196537018,
669
+ "learning_rate": 1.7763157894736842e-05,
670
+ "loss": 0.5378,
671
+ "step": 920
672
+ },
673
+ {
674
+ "epoch": 2.19008875739645,
675
+ "grad_norm": 0.08267045021057129,
676
+ "learning_rate": 1.7351973684210527e-05,
677
+ "loss": 0.3613,
678
+ "step": 930
679
+ },
680
+ {
681
+ "epoch": 2.1974852071005917,
682
+ "grad_norm": 0.33162182569503784,
683
+ "learning_rate": 1.694078947368421e-05,
684
+ "loss": 0.4509,
685
+ "step": 940
686
+ },
687
+ {
688
+ "epoch": 2.2048816568047336,
689
+ "grad_norm": 0.070041224360466,
690
+ "learning_rate": 1.6529605263157896e-05,
691
+ "loss": 0.2242,
692
+ "step": 950
693
+ },
694
+ {
695
+ "epoch": 2.212278106508876,
696
+ "grad_norm": 0.07664915174245834,
697
+ "learning_rate": 1.611842105263158e-05,
698
+ "loss": 0.0046,
699
+ "step": 960
700
+ },
701
+ {
702
+ "epoch": 2.219674556213018,
703
+ "grad_norm": 84.51457977294922,
704
+ "learning_rate": 1.5707236842105265e-05,
705
+ "loss": 0.4331,
706
+ "step": 970
707
+ },
708
+ {
709
+ "epoch": 2.2270710059171597,
710
+ "grad_norm": 0.13595078885555267,
711
+ "learning_rate": 1.5296052631578946e-05,
712
+ "loss": 0.2439,
713
+ "step": 980
714
+ },
715
+ {
716
+ "epoch": 2.234467455621302,
717
+ "grad_norm": 107.51094818115234,
718
+ "learning_rate": 1.4884868421052634e-05,
719
+ "loss": 0.1195,
720
+ "step": 990
721
+ },
722
+ {
723
+ "epoch": 2.241863905325444,
724
+ "grad_norm": 6.981244087219238,
725
+ "learning_rate": 1.4473684210526317e-05,
726
+ "loss": 0.2644,
727
+ "step": 1000
728
+ },
729
+ {
730
+ "epoch": 2.2492603550295858,
731
+ "grad_norm": 0.01494936365634203,
732
+ "learning_rate": 1.4062500000000001e-05,
733
+ "loss": 0.2783,
734
+ "step": 1010
735
+ },
736
+ {
737
+ "epoch": 2.2492603550295858,
738
+ "eval_accuracy": 0.8170594837261503,
739
+ "eval_loss": 0.8930483460426331,
740
+ "eval_runtime": 977.9049,
741
+ "eval_samples_per_second": 0.911,
742
+ "eval_steps_per_second": 0.456,
743
+ "step": 1010
744
+ },
745
+ {
746
+ "epoch": 3.007396449704142,
747
+ "grad_norm": 2.669276714324951,
748
+ "learning_rate": 1.3651315789473684e-05,
749
+ "loss": 0.6067,
750
+ "step": 1020
751
+ },
752
+ {
753
+ "epoch": 3.014792899408284,
754
+ "grad_norm": 0.0429365374147892,
755
+ "learning_rate": 1.3240131578947369e-05,
756
+ "loss": 0.2312,
757
+ "step": 1030
758
+ },
759
+ {
760
+ "epoch": 3.022189349112426,
761
+ "grad_norm": 0.0598057359457016,
762
+ "learning_rate": 1.2828947368421055e-05,
763
+ "loss": 0.0344,
764
+ "step": 1040
765
+ },
766
+ {
767
+ "epoch": 3.029585798816568,
768
+ "grad_norm": 0.0926346629858017,
769
+ "learning_rate": 1.2417763157894738e-05,
770
+ "loss": 0.9614,
771
+ "step": 1050
772
+ },
773
+ {
774
+ "epoch": 3.03698224852071,
775
+ "grad_norm": 0.01585511490702629,
776
+ "learning_rate": 1.200657894736842e-05,
777
+ "loss": 0.7316,
778
+ "step": 1060
779
+ },
780
+ {
781
+ "epoch": 3.044378698224852,
782
+ "grad_norm": 0.10066540539264679,
783
+ "learning_rate": 1.1595394736842107e-05,
784
+ "loss": 0.0041,
785
+ "step": 1070
786
+ },
787
+ {
788
+ "epoch": 3.051775147928994,
789
+ "grad_norm": 0.06809567660093307,
790
+ "learning_rate": 1.118421052631579e-05,
791
+ "loss": 0.0092,
792
+ "step": 1080
793
+ },
794
+ {
795
+ "epoch": 3.059171597633136,
796
+ "grad_norm": 0.14008688926696777,
797
+ "learning_rate": 1.0773026315789474e-05,
798
+ "loss": 1.0007,
799
+ "step": 1090
800
+ },
801
+ {
802
+ "epoch": 3.0665680473372783,
803
+ "grad_norm": 0.030784226953983307,
804
+ "learning_rate": 1.0361842105263159e-05,
805
+ "loss": 0.4198,
806
+ "step": 1100
807
+ },
808
+ {
809
+ "epoch": 3.07396449704142,
810
+ "grad_norm": 0.07336018979549408,
811
+ "learning_rate": 9.950657894736842e-06,
812
+ "loss": 0.2221,
813
+ "step": 1110
814
+ },
815
+ {
816
+ "epoch": 3.081360946745562,
817
+ "grad_norm": 0.06217151880264282,
818
+ "learning_rate": 9.539473684210528e-06,
819
+ "loss": 0.2991,
820
+ "step": 1120
821
+ },
822
+ {
823
+ "epoch": 3.088757396449704,
824
+ "grad_norm": 0.07419371604919434,
825
+ "learning_rate": 9.128289473684211e-06,
826
+ "loss": 0.6161,
827
+ "step": 1130
828
+ },
829
+ {
830
+ "epoch": 3.0961538461538463,
831
+ "grad_norm": 0.09845346957445145,
832
+ "learning_rate": 8.717105263157894e-06,
833
+ "loss": 0.204,
834
+ "step": 1140
835
+ },
836
+ {
837
+ "epoch": 3.103550295857988,
838
+ "grad_norm": 0.020927123725414276,
839
+ "learning_rate": 8.30592105263158e-06,
840
+ "loss": 0.5403,
841
+ "step": 1150
842
+ },
843
+ {
844
+ "epoch": 3.11094674556213,
845
+ "grad_norm": 0.07396041601896286,
846
+ "learning_rate": 7.894736842105263e-06,
847
+ "loss": 0.4637,
848
+ "step": 1160
849
+ },
850
+ {
851
+ "epoch": 3.1183431952662723,
852
+ "grad_norm": 0.11606968194246292,
853
+ "learning_rate": 7.483552631578948e-06,
854
+ "loss": 1.3375,
855
+ "step": 1170
856
+ },
857
+ {
858
+ "epoch": 3.1257396449704142,
859
+ "grad_norm": 0.08246757835149765,
860
+ "learning_rate": 7.072368421052632e-06,
861
+ "loss": 0.4418,
862
+ "step": 1180
863
+ },
864
+ {
865
+ "epoch": 3.133136094674556,
866
+ "grad_norm": 0.17207257449626923,
867
+ "learning_rate": 6.661184210526317e-06,
868
+ "loss": 0.2426,
869
+ "step": 1190
870
+ },
871
+ {
872
+ "epoch": 3.140532544378698,
873
+ "grad_norm": 0.49036145210266113,
874
+ "learning_rate": 6.25e-06,
875
+ "loss": 0.2545,
876
+ "step": 1200
877
+ },
878
+ {
879
+ "epoch": 3.1479289940828403,
880
+ "grad_norm": 68.61939239501953,
881
+ "learning_rate": 5.838815789473685e-06,
882
+ "loss": 0.5084,
883
+ "step": 1210
884
+ },
885
+ {
886
+ "epoch": 3.155325443786982,
887
+ "grad_norm": 0.08831863105297089,
888
+ "learning_rate": 5.4276315789473686e-06,
889
+ "loss": 0.0049,
890
+ "step": 1220
891
+ },
892
+ {
893
+ "epoch": 3.162721893491124,
894
+ "grad_norm": 0.10537251830101013,
895
+ "learning_rate": 5.016447368421053e-06,
896
+ "loss": 0.1406,
897
+ "step": 1230
898
+ },
899
+ {
900
+ "epoch": 3.1701183431952664,
901
+ "grad_norm": 17.6816349029541,
902
+ "learning_rate": 4.605263157894737e-06,
903
+ "loss": 0.486,
904
+ "step": 1240
905
+ },
906
+ {
907
+ "epoch": 3.1775147928994083,
908
+ "grad_norm": 0.461302250623703,
909
+ "learning_rate": 4.194078947368421e-06,
910
+ "loss": 0.0042,
911
+ "step": 1250
912
+ },
913
+ {
914
+ "epoch": 3.18491124260355,
915
+ "grad_norm": 0.04792853444814682,
916
+ "learning_rate": 3.7828947368421055e-06,
917
+ "loss": 0.719,
918
+ "step": 1260
919
+ },
920
+ {
921
+ "epoch": 3.1923076923076925,
922
+ "grad_norm": 0.058049630373716354,
923
+ "learning_rate": 3.3717105263157897e-06,
924
+ "loss": 0.2919,
925
+ "step": 1270
926
+ },
927
+ {
928
+ "epoch": 3.1997041420118344,
929
+ "grad_norm": 13.063404083251953,
930
+ "learning_rate": 2.960526315789474e-06,
931
+ "loss": 0.2856,
932
+ "step": 1280
933
+ },
934
+ {
935
+ "epoch": 3.2071005917159763,
936
+ "grad_norm": 0.4628206193447113,
937
+ "learning_rate": 2.549342105263158e-06,
938
+ "loss": 0.0601,
939
+ "step": 1290
940
+ },
941
+ {
942
+ "epoch": 3.214497041420118,
943
+ "grad_norm": 0.045592982321977615,
944
+ "learning_rate": 2.138157894736842e-06,
945
+ "loss": 0.53,
946
+ "step": 1300
947
+ },
948
+ {
949
+ "epoch": 3.2218934911242605,
950
+ "grad_norm": 0.04378387704491615,
951
+ "learning_rate": 1.7269736842105266e-06,
952
+ "loss": 0.0033,
953
+ "step": 1310
954
+ },
955
+ {
956
+ "epoch": 3.2292899408284024,
957
+ "grad_norm": 0.052677396684885025,
958
+ "learning_rate": 1.3157894736842106e-06,
959
+ "loss": 0.003,
960
+ "step": 1320
961
+ },
962
+ {
963
+ "epoch": 3.2366863905325443,
964
+ "grad_norm": 0.07824493199586868,
965
+ "learning_rate": 9.046052631578948e-07,
966
+ "loss": 0.3084,
967
+ "step": 1330
968
+ },
969
+ {
970
+ "epoch": 3.2440828402366866,
971
+ "grad_norm": 0.05082060024142265,
972
+ "learning_rate": 4.934210526315789e-07,
973
+ "loss": 0.0037,
974
+ "step": 1340
975
+ },
976
+ {
977
+ "epoch": 3.247041420118343,
978
+ "eval_accuracy": 0.8978675645342312,
979
+ "eval_loss": 0.40774551033973694,
980
+ "eval_runtime": 1030.7199,
981
+ "eval_samples_per_second": 0.864,
982
+ "eval_steps_per_second": 0.433,
983
+ "step": 1344
984
+ },
985
+ {
986
+ "epoch": 4.004437869822485,
987
+ "grad_norm": 0.045703090727329254,
988
+ "learning_rate": 8.223684210526316e-08,
989
+ "loss": 0.2113,
990
+ "step": 1350
991
+ },
992
+ {
993
+ "epoch": 4.005917159763314,
994
+ "eval_accuracy": 0.8978675645342312,
995
+ "eval_loss": 0.40793702006340027,
996
+ "eval_runtime": 1018.6267,
997
+ "eval_samples_per_second": 0.875,
998
+ "eval_steps_per_second": 0.438,
999
+ "step": 1352
1000
+ },
1001
+ {
1002
+ "epoch": 4.005917159763314,
1003
+ "step": 1352,
1004
+ "total_flos": 3.3668665207526523e+18,
1005
+ "train_loss": 0.5293050300165978,
1006
+ "train_runtime": 11070.5286,
1007
+ "train_samples_per_second": 0.244,
1008
+ "train_steps_per_second": 0.122
1009
+ },
1010
+ {
1011
+ "epoch": 4.005917159763314,
1012
+ "eval_accuracy": 0.890927624872579,
1013
+ "eval_loss": 0.4576520323753357,
1014
+ "eval_runtime": 1130.3017,
1015
+ "eval_samples_per_second": 0.868,
1016
+ "eval_steps_per_second": 0.434,
1017
+ "step": 1352
1018
+ },
1019
+ {
1020
+ "epoch": 4.005917159763314,
1021
+ "eval_accuracy": 0.890927624872579,
1022
+ "eval_loss": 0.4576520621776581,
1023
+ "eval_runtime": 1124.6542,
1024
+ "eval_samples_per_second": 0.872,
1025
+ "eval_steps_per_second": 0.437,
1026
+ "step": 1352
1027
+ }
1028
+ ],
1029
+ "logging_steps": 10,
1030
+ "max_steps": 1352,
1031
+ "num_input_tokens_seen": 0,
1032
+ "num_train_epochs": 9223372036854775807,
1033
+ "save_steps": 500,
1034
+ "total_flos": 3.3668665207526523e+18,
1035
+ "train_batch_size": 2,
1036
+ "trial_name": null,
1037
+ "trial_params": null
1038
+ }