qubvel-hf HF staff commited on
Commit
e47c1ba
1 Parent(s): 9eaf3be

End of training

Browse files
README.md CHANGED
@@ -3,6 +3,8 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: timm/resnet18.a1_in1k
5
  tags:
 
 
6
  - generated_from_trainer
7
  metrics:
8
  - accuracy
@@ -16,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # vit-base-beans
18
 
19
- This model is a fine-tuned version of [timm/resnet18.a1_in1k](https://huggingface.co/timm/resnet18.a1_in1k) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 0.7389
22
  - Accuracy: 0.8045
 
3
  license: apache-2.0
4
  base_model: timm/resnet18.a1_in1k
5
  tags:
6
+ - image-classification
7
+ - vision
8
  - generated_from_trainer
9
  metrics:
10
  - accuracy
 
18
 
19
  # vit-base-beans
20
 
21
+ This model is a fine-tuned version of [timm/resnet18.a1_in1k](https://huggingface.co/timm/resnet18.a1_in1k) on the beans dataset.
22
  It achieves the following results on the evaluation set:
23
  - Loss: 0.7389
24
  - Accuracy: 0.8045
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 15.0,
3
- "eval_accuracy": 0.7894736842105263,
4
- "eval_loss": 0.8550169467926025,
5
- "eval_runtime": 0.7658,
6
- "eval_samples_per_second": 173.669,
7
- "eval_steps_per_second": 22.198,
8
  "total_flos": 1.5658365504595968e+17,
9
- "train_loss": 0.6299933981284117,
10
- "train_runtime": 92.772,
11
- "train_samples_per_second": 167.184,
12
- "train_steps_per_second": 21.019
13
  }
 
1
  {
2
  "epoch": 15.0,
3
+ "eval_accuracy": 0.8045112781954887,
4
+ "eval_loss": 0.7388833165168762,
5
+ "eval_runtime": 0.856,
6
+ "eval_samples_per_second": 155.372,
7
+ "eval_steps_per_second": 19.86,
8
  "total_flos": 1.5658365504595968e+17,
9
+ "train_loss": 0.923483537771763,
10
+ "train_runtime": 151.8082,
11
+ "train_samples_per_second": 102.168,
12
+ "train_steps_per_second": 12.845
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 15.0,
3
- "eval_accuracy": 0.7894736842105263,
4
- "eval_loss": 0.8550169467926025,
5
- "eval_runtime": 0.7658,
6
- "eval_samples_per_second": 173.669,
7
- "eval_steps_per_second": 22.198
8
  }
 
1
  {
2
  "epoch": 15.0,
3
+ "eval_accuracy": 0.8045112781954887,
4
+ "eval_loss": 0.7388833165168762,
5
+ "eval_runtime": 0.856,
6
+ "eval_samples_per_second": 155.372,
7
+ "eval_steps_per_second": 19.86
8
  }
runs/Nov05_18-37-10_ip-10-90-1-182/events.out.tfevents.1730831994.ip-10-90-1-182.2476579.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a5ac7adbfdb9f91ce2c30406e678fcd7d233d78c1800655312a0c129f6660d1
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 15.0,
3
  "total_flos": 1.5658365504595968e+17,
4
- "train_loss": 0.6299933981284117,
5
- "train_runtime": 92.772,
6
- "train_samples_per_second": 167.184,
7
- "train_steps_per_second": 21.019
8
  }
 
1
  {
2
  "epoch": 15.0,
3
  "total_flos": 1.5658365504595968e+17,
4
+ "train_loss": 0.923483537771763,
5
+ "train_runtime": 151.8082,
6
+ "train_samples_per_second": 102.168,
7
+ "train_steps_per_second": 12.845
8
  }
trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 0.8550169467926025,
3
  "best_model_checkpoint": "./beans_outputs/checkpoint-1950",
4
  "epoch": 15.0,
5
  "eval_steps": 500,
@@ -10,1512 +10,1512 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.07692307692307693,
13
- "grad_norm": 2.0955963134765625,
14
- "learning_rate": 1.9692307692307696e-05,
15
  "loss": 1.1239,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.15384615384615385,
20
- "grad_norm": 1.8236017227172852,
21
- "learning_rate": 1.9384615384615386e-05,
22
- "loss": 1.1222,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.23076923076923078,
27
- "grad_norm": 1.9866633415222168,
28
- "learning_rate": 1.907692307692308e-05,
29
- "loss": 1.1165,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.3076923076923077,
34
- "grad_norm": 2.4556338787078857,
35
- "learning_rate": 1.876923076923077e-05,
36
- "loss": 1.1047,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.38461538461538464,
41
- "grad_norm": 1.6562598943710327,
42
- "learning_rate": 1.8461538461538465e-05,
43
- "loss": 1.1085,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.46153846153846156,
48
- "grad_norm": 1.501336932182312,
49
- "learning_rate": 1.8153846153846155e-05,
50
- "loss": 1.1048,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.5384615384615384,
55
- "grad_norm": 2.606107234954834,
56
- "learning_rate": 1.784615384615385e-05,
57
- "loss": 1.1031,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.6153846153846154,
62
- "grad_norm": 2.634584426879883,
63
- "learning_rate": 1.753846153846154e-05,
64
- "loss": 1.0916,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.6923076923076923,
69
- "grad_norm": 2.039886236190796,
70
- "learning_rate": 1.7230769230769234e-05,
71
- "loss": 1.095,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.7692307692307693,
76
- "grad_norm": 2.4800949096679688,
77
- "learning_rate": 1.6923076923076924e-05,
78
- "loss": 1.0916,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.8461538461538461,
83
- "grad_norm": 1.6766040325164795,
84
- "learning_rate": 1.6615384615384618e-05,
85
- "loss": 1.0959,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.9230769230769231,
90
- "grad_norm": 2.2848353385925293,
91
- "learning_rate": 1.630769230769231e-05,
92
- "loss": 1.1005,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 1.0,
97
- "grad_norm": 4.07199239730835,
98
- "learning_rate": 1.6000000000000003e-05,
99
- "loss": 1.0881,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 1.0,
104
- "eval_accuracy": 0.41353383458646614,
105
- "eval_loss": 1.0901767015457153,
106
- "eval_runtime": 0.7894,
107
- "eval_samples_per_second": 168.48,
108
- "eval_steps_per_second": 21.535,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 1.0769230769230769,
113
- "grad_norm": 2.0711560249328613,
114
- "learning_rate": 1.5692307692307693e-05,
115
- "loss": 1.0794,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 1.1538461538461537,
120
- "grad_norm": 2.499732494354248,
121
- "learning_rate": 1.5384615384615387e-05,
122
- "loss": 1.0755,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.2307692307692308,
127
- "grad_norm": 1.4569323062896729,
128
- "learning_rate": 1.5076923076923078e-05,
129
- "loss": 1.0807,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.3076923076923077,
134
- "grad_norm": 2.351478338241577,
135
- "learning_rate": 1.4769230769230772e-05,
136
- "loss": 1.0965,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.3846153846153846,
141
- "grad_norm": 2.1514322757720947,
142
- "learning_rate": 1.4461538461538462e-05,
143
- "loss": 1.0839,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.4615384615384617,
148
- "grad_norm": 2.151601791381836,
149
- "learning_rate": 1.4153846153846156e-05,
150
- "loss": 1.0837,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.5384615384615383,
155
- "grad_norm": 1.797500729560852,
156
- "learning_rate": 1.3846153846153847e-05,
157
- "loss": 1.0807,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.6153846153846154,
162
- "grad_norm": 1.7583892345428467,
163
- "learning_rate": 1.353846153846154e-05,
164
- "loss": 1.073,
165
  "step": 210
166
  },
167
  {
168
  "epoch": 1.6923076923076923,
169
- "grad_norm": 3.5006496906280518,
170
- "learning_rate": 1.3230769230769231e-05,
171
- "loss": 1.0741,
172
  "step": 220
173
  },
174
  {
175
  "epoch": 1.7692307692307692,
176
- "grad_norm": 2.0179672241210938,
177
- "learning_rate": 1.2923076923076925e-05,
178
- "loss": 1.077,
179
  "step": 230
180
  },
181
  {
182
  "epoch": 1.8461538461538463,
183
- "grad_norm": 2.057086944580078,
184
- "learning_rate": 1.2615384615384616e-05,
185
- "loss": 1.0925,
186
  "step": 240
187
  },
188
  {
189
  "epoch": 1.9230769230769231,
190
- "grad_norm": 1.9710103273391724,
191
- "learning_rate": 1.230769230769231e-05,
192
- "loss": 1.0757,
193
  "step": 250
194
  },
195
  {
196
  "epoch": 2.0,
197
- "grad_norm": 2.9315028190612793,
198
- "learning_rate": 1.2e-05,
199
- "loss": 1.0716,
200
  "step": 260
201
  },
202
  {
203
  "epoch": 2.0,
204
- "eval_accuracy": 0.5037593984962406,
205
- "eval_loss": 1.0685255527496338,
206
- "eval_runtime": 0.7711,
207
- "eval_samples_per_second": 172.471,
208
- "eval_steps_per_second": 22.045,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 2.076923076923077,
213
- "grad_norm": 2.183527946472168,
214
- "learning_rate": 1.1692307692307694e-05,
215
- "loss": 1.0786,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 2.1538461538461537,
220
- "grad_norm": 2.379652500152588,
221
- "learning_rate": 1.1384615384615385e-05,
222
- "loss": 1.0676,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 2.230769230769231,
227
- "grad_norm": 2.387296438217163,
228
- "learning_rate": 1.1076923076923079e-05,
229
- "loss": 1.064,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 2.3076923076923075,
234
- "grad_norm": 2.8164947032928467,
235
- "learning_rate": 1.076923076923077e-05,
236
- "loss": 1.0548,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 2.3846153846153846,
241
- "grad_norm": 1.849363088607788,
242
- "learning_rate": 1.0461538461538463e-05,
243
- "loss": 1.0693,
244
  "step": 310
245
  },
246
  {
247
  "epoch": 2.4615384615384617,
248
- "grad_norm": 2.0274300575256348,
249
- "learning_rate": 1.0153846153846154e-05,
250
- "loss": 1.0604,
251
  "step": 320
252
  },
253
  {
254
  "epoch": 2.5384615384615383,
255
- "grad_norm": 1.4544223546981812,
256
- "learning_rate": 9.846153846153848e-06,
257
- "loss": 1.0458,
258
  "step": 330
259
  },
260
  {
261
  "epoch": 2.6153846153846154,
262
- "grad_norm": 2.1712183952331543,
263
- "learning_rate": 9.53846153846154e-06,
264
- "loss": 1.0609,
265
  "step": 340
266
  },
267
  {
268
  "epoch": 2.6923076923076925,
269
- "grad_norm": 1.922677993774414,
270
- "learning_rate": 9.230769230769232e-06,
271
- "loss": 1.0646,
272
  "step": 350
273
  },
274
  {
275
  "epoch": 2.769230769230769,
276
- "grad_norm": 1.8288860321044922,
277
- "learning_rate": 8.923076923076925e-06,
278
- "loss": 1.0597,
279
  "step": 360
280
  },
281
  {
282
  "epoch": 2.8461538461538463,
283
- "grad_norm": 2.123480796813965,
284
- "learning_rate": 8.615384615384617e-06,
285
- "loss": 1.052,
286
  "step": 370
287
  },
288
  {
289
  "epoch": 2.9230769230769234,
290
- "grad_norm": 1.820168137550354,
291
- "learning_rate": 8.307692307692309e-06,
292
- "loss": 1.0434,
293
  "step": 380
294
  },
295
  {
296
  "epoch": 3.0,
297
- "grad_norm": 4.973505020141602,
298
- "learning_rate": 8.000000000000001e-06,
299
- "loss": 1.061,
300
  "step": 390
301
  },
302
  {
303
  "epoch": 3.0,
304
- "eval_accuracy": 0.6240601503759399,
305
- "eval_loss": 1.0459144115447998,
306
- "eval_runtime": 0.7618,
307
- "eval_samples_per_second": 174.588,
308
- "eval_steps_per_second": 22.316,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 3.076923076923077,
313
- "grad_norm": 2.1673223972320557,
314
- "learning_rate": 7.692307692307694e-06,
315
- "loss": 1.0616,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 3.1538461538461537,
320
- "grad_norm": 1.8567888736724854,
321
- "learning_rate": 7.384615384615386e-06,
322
- "loss": 1.0501,
323
  "step": 410
324
  },
325
  {
326
  "epoch": 3.230769230769231,
327
- "grad_norm": 2.0640571117401123,
328
- "learning_rate": 7.076923076923078e-06,
329
- "loss": 1.0611,
330
  "step": 420
331
  },
332
  {
333
  "epoch": 3.3076923076923075,
334
- "grad_norm": 2.215384006500244,
335
- "learning_rate": 6.76923076923077e-06,
336
- "loss": 1.0469,
337
  "step": 430
338
  },
339
  {
340
  "epoch": 3.3846153846153846,
341
- "grad_norm": 2.1000049114227295,
342
- "learning_rate": 6.461538461538463e-06,
343
- "loss": 1.0499,
344
  "step": 440
345
  },
346
  {
347
  "epoch": 3.4615384615384617,
348
- "grad_norm": 1.7218382358551025,
349
- "learning_rate": 6.153846153846155e-06,
350
- "loss": 1.0564,
351
  "step": 450
352
  },
353
  {
354
  "epoch": 3.5384615384615383,
355
- "grad_norm": 2.3569300174713135,
356
- "learning_rate": 5.846153846153847e-06,
357
- "loss": 1.0599,
358
  "step": 460
359
  },
360
  {
361
  "epoch": 3.6153846153846154,
362
- "grad_norm": 1.5210909843444824,
363
- "learning_rate": 5.538461538461539e-06,
364
- "loss": 1.0367,
365
  "step": 470
366
  },
367
  {
368
  "epoch": 3.6923076923076925,
369
- "grad_norm": 2.7621657848358154,
370
- "learning_rate": 5.230769230769232e-06,
371
- "loss": 1.0421,
372
  "step": 480
373
  },
374
  {
375
  "epoch": 3.769230769230769,
376
- "grad_norm": 1.5097808837890625,
377
- "learning_rate": 4.923076923076924e-06,
378
- "loss": 1.0362,
379
  "step": 490
380
  },
381
  {
382
  "epoch": 3.8461538461538463,
383
- "grad_norm": 1.5118447542190552,
384
- "learning_rate": 4.615384615384616e-06,
385
- "loss": 1.0572,
386
  "step": 500
387
  },
388
  {
389
  "epoch": 3.9230769230769234,
390
- "grad_norm": 1.7513490915298462,
391
- "learning_rate": 4.307692307692308e-06,
392
- "loss": 1.0361,
393
  "step": 510
394
  },
395
  {
396
  "epoch": 4.0,
397
- "grad_norm": 5.398025035858154,
398
- "learning_rate": 4.000000000000001e-06,
399
- "loss": 1.0514,
400
  "step": 520
401
  },
402
  {
403
  "epoch": 4.0,
404
- "eval_accuracy": 0.6015037593984962,
405
- "eval_loss": 1.0407124757766724,
406
- "eval_runtime": 0.7726,
407
- "eval_samples_per_second": 172.155,
408
- "eval_steps_per_second": 22.005,
409
  "step": 520
410
  },
411
  {
412
  "epoch": 4.076923076923077,
413
- "grad_norm": 2.5516345500946045,
414
- "learning_rate": 3.692307692307693e-06,
415
- "loss": 1.0529,
416
  "step": 530
417
  },
418
  {
419
  "epoch": 4.153846153846154,
420
- "grad_norm": 1.6976008415222168,
421
- "learning_rate": 3.384615384615385e-06,
422
- "loss": 1.0472,
423
  "step": 540
424
  },
425
  {
426
  "epoch": 4.230769230769231,
427
- "grad_norm": 2.5672519207000732,
428
- "learning_rate": 3.0769230769230774e-06,
429
- "loss": 1.0565,
430
  "step": 550
431
  },
432
  {
433
  "epoch": 4.3076923076923075,
434
- "grad_norm": 2.166529655456543,
435
- "learning_rate": 2.7692307692307697e-06,
436
- "loss": 1.0619,
437
  "step": 560
438
  },
439
  {
440
  "epoch": 4.384615384615385,
441
- "grad_norm": 1.961472511291504,
442
- "learning_rate": 2.461538461538462e-06,
443
- "loss": 1.0322,
444
  "step": 570
445
  },
446
  {
447
  "epoch": 4.461538461538462,
448
- "grad_norm": 2.392319440841675,
449
- "learning_rate": 2.153846153846154e-06,
450
- "loss": 1.0388,
451
  "step": 580
452
  },
453
  {
454
  "epoch": 4.538461538461538,
455
- "grad_norm": 2.3034205436706543,
456
- "learning_rate": 1.8461538461538465e-06,
457
- "loss": 1.0358,
458
  "step": 590
459
  },
460
  {
461
  "epoch": 4.615384615384615,
462
- "grad_norm": 2.037050247192383,
463
- "learning_rate": 1.5384615384615387e-06,
464
- "loss": 1.0334,
465
  "step": 600
466
  },
467
  {
468
  "epoch": 4.6923076923076925,
469
- "grad_norm": 3.0737335681915283,
470
- "learning_rate": 1.230769230769231e-06,
471
- "loss": 1.0506,
472
  "step": 610
473
  },
474
  {
475
  "epoch": 4.769230769230769,
476
- "grad_norm": 2.1824796199798584,
477
- "learning_rate": 9.230769230769232e-07,
478
- "loss": 1.0516,
479
  "step": 620
480
  },
481
  {
482
  "epoch": 4.846153846153846,
483
- "grad_norm": 1.9239214658737183,
484
- "learning_rate": 6.153846153846155e-07,
485
- "loss": 1.0399,
486
  "step": 630
487
  },
488
  {
489
  "epoch": 4.923076923076923,
490
- "grad_norm": 2.267302989959717,
491
- "learning_rate": 3.0769230769230774e-07,
492
- "loss": 1.0374,
493
  "step": 640
494
  },
495
  {
496
  "epoch": 5.0,
497
- "grad_norm": 4.4957404136657715,
498
- "learning_rate": 0.0,
499
- "loss": 1.05,
500
  "step": 650
501
  },
502
  {
503
  "epoch": 5.0,
504
- "eval_accuracy": 0.6766917293233082,
505
- "eval_loss": 1.0332472324371338,
506
- "eval_runtime": 0.8208,
507
- "eval_samples_per_second": 162.035,
508
- "eval_steps_per_second": 20.711,
509
  "step": 650
510
  },
511
  {
512
  "epoch": 5.076923076923077,
513
- "grad_norm": 1.9310355186462402,
514
  "learning_rate": 1.3230769230769231e-05,
515
- "loss": 1.0281,
516
  "step": 660
517
  },
518
  {
519
  "epoch": 5.153846153846154,
520
- "grad_norm": 2.36603045463562,
521
  "learning_rate": 1.312820512820513e-05,
522
- "loss": 1.0587,
523
  "step": 670
524
  },
525
  {
526
  "epoch": 5.230769230769231,
527
- "grad_norm": 2.0758917331695557,
528
  "learning_rate": 1.3025641025641027e-05,
529
- "loss": 1.0534,
530
  "step": 680
531
  },
532
  {
533
  "epoch": 5.3076923076923075,
534
- "grad_norm": 2.549725294113159,
535
  "learning_rate": 1.2923076923076925e-05,
536
- "loss": 1.0421,
537
  "step": 690
538
  },
539
  {
540
  "epoch": 5.384615384615385,
541
- "grad_norm": 2.7183680534362793,
542
  "learning_rate": 1.2820512820512823e-05,
543
- "loss": 1.035,
544
  "step": 700
545
  },
546
  {
547
  "epoch": 5.461538461538462,
548
- "grad_norm": 1.7176955938339233,
549
  "learning_rate": 1.2717948717948718e-05,
550
- "loss": 1.0268,
551
  "step": 710
552
  },
553
  {
554
  "epoch": 5.538461538461538,
555
- "grad_norm": 1.8257861137390137,
556
  "learning_rate": 1.2615384615384616e-05,
557
- "loss": 1.0302,
558
  "step": 720
559
  },
560
  {
561
  "epoch": 5.615384615384615,
562
- "grad_norm": 2.50368595123291,
563
  "learning_rate": 1.2512820512820514e-05,
564
- "loss": 1.0331,
565
  "step": 730
566
  },
567
  {
568
  "epoch": 5.6923076923076925,
569
- "grad_norm": 2.4315121173858643,
570
  "learning_rate": 1.2410256410256412e-05,
571
- "loss": 1.0208,
572
  "step": 740
573
  },
574
  {
575
  "epoch": 5.769230769230769,
576
- "grad_norm": 2.043854236602783,
577
  "learning_rate": 1.230769230769231e-05,
578
- "loss": 1.0241,
579
  "step": 750
580
  },
581
  {
582
  "epoch": 5.846153846153846,
583
- "grad_norm": 2.0800740718841553,
584
  "learning_rate": 1.2205128205128208e-05,
585
- "loss": 1.0251,
586
  "step": 760
587
  },
588
  {
589
  "epoch": 5.923076923076923,
590
- "grad_norm": 3.4356396198272705,
591
  "learning_rate": 1.2102564102564102e-05,
592
- "loss": 1.0104,
593
  "step": 770
594
  },
595
  {
596
  "epoch": 6.0,
597
- "grad_norm": 4.49416971206665,
598
  "learning_rate": 1.2e-05,
599
- "loss": 1.0357,
600
  "step": 780
601
  },
602
  {
603
  "epoch": 6.0,
604
- "eval_accuracy": 0.6541353383458647,
605
- "eval_loss": 1.0109117031097412,
606
- "eval_runtime": 0.7801,
607
- "eval_samples_per_second": 170.483,
608
- "eval_steps_per_second": 21.791,
609
  "step": 780
610
  },
611
  {
612
  "epoch": 6.076923076923077,
613
- "grad_norm": 2.650513172149658,
614
  "learning_rate": 1.1897435897435898e-05,
615
- "loss": 1.0148,
616
  "step": 790
617
  },
618
  {
619
  "epoch": 6.153846153846154,
620
- "grad_norm": 2.3682632446289062,
621
  "learning_rate": 1.1794871794871796e-05,
622
- "loss": 1.0094,
623
  "step": 800
624
  },
625
  {
626
  "epoch": 6.230769230769231,
627
- "grad_norm": 1.6716077327728271,
628
  "learning_rate": 1.1692307692307694e-05,
629
- "loss": 1.0097,
630
  "step": 810
631
  },
632
  {
633
  "epoch": 6.3076923076923075,
634
- "grad_norm": 2.4839890003204346,
635
  "learning_rate": 1.1589743589743592e-05,
636
- "loss": 1.01,
637
  "step": 820
638
  },
639
  {
640
  "epoch": 6.384615384615385,
641
- "grad_norm": 2.4004769325256348,
642
  "learning_rate": 1.1487179487179487e-05,
643
- "loss": 1.0104,
644
  "step": 830
645
  },
646
  {
647
  "epoch": 6.461538461538462,
648
- "grad_norm": 2.9597084522247314,
649
  "learning_rate": 1.1384615384615385e-05,
650
- "loss": 1.0137,
651
  "step": 840
652
  },
653
  {
654
  "epoch": 6.538461538461538,
655
- "grad_norm": 2.680335760116577,
656
  "learning_rate": 1.1282051282051283e-05,
657
- "loss": 1.0147,
658
  "step": 850
659
  },
660
  {
661
  "epoch": 6.615384615384615,
662
- "grad_norm": 1.7677160501480103,
663
  "learning_rate": 1.117948717948718e-05,
664
- "loss": 0.9947,
665
  "step": 860
666
  },
667
  {
668
  "epoch": 6.6923076923076925,
669
- "grad_norm": 2.0484132766723633,
670
  "learning_rate": 1.1076923076923079e-05,
671
- "loss": 1.0169,
672
  "step": 870
673
  },
674
  {
675
  "epoch": 6.769230769230769,
676
- "grad_norm": 2.1910479068756104,
677
  "learning_rate": 1.0974358974358977e-05,
678
- "loss": 1.0024,
679
  "step": 880
680
  },
681
  {
682
  "epoch": 6.846153846153846,
683
- "grad_norm": 2.181236743927002,
684
  "learning_rate": 1.0871794871794871e-05,
685
- "loss": 0.9962,
686
  "step": 890
687
  },
688
  {
689
  "epoch": 6.923076923076923,
690
- "grad_norm": 2.898885488510132,
691
  "learning_rate": 1.076923076923077e-05,
692
- "loss": 1.012,
693
  "step": 900
694
  },
695
  {
696
  "epoch": 7.0,
697
- "grad_norm": 4.950052738189697,
698
  "learning_rate": 1.0666666666666667e-05,
699
- "loss": 1.0012,
700
  "step": 910
701
  },
702
  {
703
  "epoch": 7.0,
704
- "eval_accuracy": 0.7368421052631579,
705
- "eval_loss": 0.981479287147522,
706
- "eval_runtime": 0.7693,
707
- "eval_samples_per_second": 172.886,
708
- "eval_steps_per_second": 22.098,
709
  "step": 910
710
  },
711
  {
712
  "epoch": 7.076923076923077,
713
- "grad_norm": 2.692753553390503,
714
  "learning_rate": 1.0564102564102565e-05,
715
- "loss": 0.9889,
716
  "step": 920
717
  },
718
  {
719
  "epoch": 7.153846153846154,
720
- "grad_norm": 2.9175124168395996,
721
  "learning_rate": 1.0461538461538463e-05,
722
- "loss": 0.9911,
723
  "step": 930
724
  },
725
  {
726
  "epoch": 7.230769230769231,
727
- "grad_norm": 3.221527099609375,
728
  "learning_rate": 1.0358974358974361e-05,
729
- "loss": 0.9827,
730
  "step": 940
731
  },
732
  {
733
  "epoch": 7.3076923076923075,
734
- "grad_norm": 2.507923126220703,
735
  "learning_rate": 1.0256410256410256e-05,
736
- "loss": 0.9919,
737
  "step": 950
738
  },
739
  {
740
  "epoch": 7.384615384615385,
741
- "grad_norm": 2.4533870220184326,
742
  "learning_rate": 1.0153846153846154e-05,
743
- "loss": 0.9962,
744
  "step": 960
745
  },
746
  {
747
  "epoch": 7.461538461538462,
748
- "grad_norm": 2.1032631397247314,
749
  "learning_rate": 1.0051282051282052e-05,
750
- "loss": 0.9751,
751
  "step": 970
752
  },
753
  {
754
  "epoch": 7.538461538461538,
755
- "grad_norm": 2.5848186016082764,
756
  "learning_rate": 9.94871794871795e-06,
757
- "loss": 0.9939,
758
  "step": 980
759
  },
760
  {
761
  "epoch": 7.615384615384615,
762
- "grad_norm": 2.17742919921875,
763
  "learning_rate": 9.846153846153848e-06,
764
- "loss": 0.9745,
765
  "step": 990
766
  },
767
  {
768
  "epoch": 7.6923076923076925,
769
- "grad_norm": 1.9953967332839966,
770
  "learning_rate": 9.743589743589744e-06,
771
- "loss": 0.9665,
772
  "step": 1000
773
  },
774
  {
775
  "epoch": 7.769230769230769,
776
- "grad_norm": 3.0263218879699707,
777
  "learning_rate": 9.641025641025642e-06,
778
- "loss": 0.9646,
779
  "step": 1010
780
  },
781
  {
782
  "epoch": 7.846153846153846,
783
- "grad_norm": 2.3735406398773193,
784
  "learning_rate": 9.53846153846154e-06,
785
- "loss": 0.9836,
786
  "step": 1020
787
  },
788
  {
789
  "epoch": 7.923076923076923,
790
- "grad_norm": 2.548480272293091,
791
  "learning_rate": 9.435897435897436e-06,
792
- "loss": 0.9546,
793
  "step": 1030
794
  },
795
  {
796
  "epoch": 8.0,
797
- "grad_norm": 3.8450028896331787,
798
  "learning_rate": 9.333333333333334e-06,
799
- "loss": 0.9932,
800
  "step": 1040
801
  },
802
  {
803
  "epoch": 8.0,
804
- "eval_accuracy": 0.7669172932330827,
805
- "eval_loss": 0.9549766778945923,
806
- "eval_runtime": 0.7536,
807
- "eval_samples_per_second": 176.486,
808
- "eval_steps_per_second": 22.558,
809
  "step": 1040
810
  },
811
  {
812
  "epoch": 8.076923076923077,
813
- "grad_norm": 1.6870847940444946,
814
  "learning_rate": 9.230769230769232e-06,
815
- "loss": 0.9752,
816
  "step": 1050
817
  },
818
  {
819
  "epoch": 8.153846153846153,
820
- "grad_norm": 2.1122217178344727,
821
  "learning_rate": 9.128205128205129e-06,
822
- "loss": 0.9573,
823
  "step": 1060
824
  },
825
  {
826
  "epoch": 8.23076923076923,
827
- "grad_norm": 4.1552886962890625,
828
  "learning_rate": 9.025641025641027e-06,
829
- "loss": 0.9764,
830
  "step": 1070
831
  },
832
  {
833
  "epoch": 8.307692307692308,
834
- "grad_norm": 1.7864203453063965,
835
  "learning_rate": 8.923076923076925e-06,
836
- "loss": 0.9434,
837
  "step": 1080
838
  },
839
  {
840
  "epoch": 8.384615384615385,
841
- "grad_norm": 2.2091946601867676,
842
  "learning_rate": 8.820512820512821e-06,
843
- "loss": 0.974,
844
  "step": 1090
845
  },
846
  {
847
  "epoch": 8.461538461538462,
848
- "grad_norm": 2.4063644409179688,
849
  "learning_rate": 8.717948717948719e-06,
850
- "loss": 0.9576,
851
  "step": 1100
852
  },
853
  {
854
  "epoch": 8.538461538461538,
855
- "grad_norm": 1.6061931848526,
856
  "learning_rate": 8.615384615384617e-06,
857
- "loss": 0.9588,
858
  "step": 1110
859
  },
860
  {
861
  "epoch": 8.615384615384615,
862
- "grad_norm": 2.8999595642089844,
863
  "learning_rate": 8.512820512820513e-06,
864
- "loss": 0.9791,
865
  "step": 1120
866
  },
867
  {
868
  "epoch": 8.692307692307692,
869
- "grad_norm": 3.6554131507873535,
870
  "learning_rate": 8.410256410256411e-06,
871
- "loss": 0.9629,
872
  "step": 1130
873
  },
874
  {
875
  "epoch": 8.76923076923077,
876
- "grad_norm": 1.7246966361999512,
877
  "learning_rate": 8.307692307692309e-06,
878
- "loss": 0.9707,
879
  "step": 1140
880
  },
881
  {
882
  "epoch": 8.846153846153847,
883
- "grad_norm": 2.3160033226013184,
884
  "learning_rate": 8.205128205128205e-06,
885
- "loss": 0.9578,
886
  "step": 1150
887
  },
888
  {
889
  "epoch": 8.923076923076923,
890
- "grad_norm": 2.685718059539795,
891
  "learning_rate": 8.102564102564103e-06,
892
- "loss": 0.972,
893
  "step": 1160
894
  },
895
  {
896
  "epoch": 9.0,
897
- "grad_norm": 3.6465442180633545,
898
  "learning_rate": 8.000000000000001e-06,
899
- "loss": 0.9748,
900
  "step": 1170
901
  },
902
  {
903
  "epoch": 9.0,
904
- "eval_accuracy": 0.7669172932330827,
905
- "eval_loss": 0.9408761858940125,
906
- "eval_runtime": 0.7615,
907
- "eval_samples_per_second": 174.644,
908
- "eval_steps_per_second": 22.323,
909
  "step": 1170
910
  },
911
  {
912
  "epoch": 9.076923076923077,
913
- "grad_norm": 1.8668149709701538,
914
  "learning_rate": 7.897435897435898e-06,
915
- "loss": 0.9712,
916
  "step": 1180
917
  },
918
  {
919
  "epoch": 9.153846153846153,
920
- "grad_norm": 2.042644739151001,
921
  "learning_rate": 7.794871794871796e-06,
922
- "loss": 0.9407,
923
  "step": 1190
924
  },
925
  {
926
  "epoch": 9.23076923076923,
927
- "grad_norm": 1.967020869255066,
928
  "learning_rate": 7.692307692307694e-06,
929
- "loss": 0.9457,
930
  "step": 1200
931
  },
932
  {
933
  "epoch": 9.307692307692308,
934
- "grad_norm": 2.147862672805786,
935
  "learning_rate": 7.58974358974359e-06,
936
- "loss": 0.9442,
937
  "step": 1210
938
  },
939
  {
940
  "epoch": 9.384615384615385,
941
- "grad_norm": 1.8528053760528564,
942
  "learning_rate": 7.487179487179488e-06,
943
- "loss": 0.9526,
944
  "step": 1220
945
  },
946
  {
947
  "epoch": 9.461538461538462,
948
- "grad_norm": 3.2000551223754883,
949
  "learning_rate": 7.384615384615386e-06,
950
- "loss": 0.9465,
951
  "step": 1230
952
  },
953
  {
954
  "epoch": 9.538461538461538,
955
- "grad_norm": 2.259323835372925,
956
  "learning_rate": 7.282051282051282e-06,
957
- "loss": 0.9503,
958
  "step": 1240
959
  },
960
  {
961
  "epoch": 9.615384615384615,
962
- "grad_norm": 2.4054858684539795,
963
  "learning_rate": 7.17948717948718e-06,
964
- "loss": 0.9274,
965
  "step": 1250
966
  },
967
  {
968
  "epoch": 9.692307692307692,
969
- "grad_norm": 3.4811408519744873,
970
  "learning_rate": 7.076923076923078e-06,
971
- "loss": 0.943,
972
  "step": 1260
973
  },
974
  {
975
  "epoch": 9.76923076923077,
976
- "grad_norm": 1.7080141305923462,
977
  "learning_rate": 6.974358974358974e-06,
978
- "loss": 0.9247,
979
  "step": 1270
980
  },
981
  {
982
  "epoch": 9.846153846153847,
983
- "grad_norm": 2.0476508140563965,
984
  "learning_rate": 6.871794871794872e-06,
985
- "loss": 0.9194,
986
  "step": 1280
987
  },
988
  {
989
  "epoch": 9.923076923076923,
990
- "grad_norm": 2.149641990661621,
991
  "learning_rate": 6.76923076923077e-06,
992
- "loss": 0.9269,
993
  "step": 1290
994
  },
995
  {
996
  "epoch": 10.0,
997
- "grad_norm": 5.121323108673096,
998
  "learning_rate": 6.666666666666667e-06,
999
- "loss": 0.9113,
1000
  "step": 1300
1001
  },
1002
  {
1003
  "epoch": 10.0,
1004
- "eval_accuracy": 0.7819548872180451,
1005
- "eval_loss": 0.9149269461631775,
1006
- "eval_runtime": 0.7775,
1007
- "eval_samples_per_second": 171.06,
1008
- "eval_steps_per_second": 21.865,
1009
  "step": 1300
1010
  },
1011
  {
1012
  "epoch": 10.076923076923077,
1013
- "grad_norm": 2.06109881401062,
1014
  "learning_rate": 6.564102564102565e-06,
1015
- "loss": 0.925,
1016
  "step": 1310
1017
  },
1018
  {
1019
  "epoch": 10.153846153846153,
1020
- "grad_norm": 1.9137018918991089,
1021
  "learning_rate": 6.461538461538463e-06,
1022
- "loss": 0.9657,
1023
  "step": 1320
1024
  },
1025
  {
1026
  "epoch": 10.23076923076923,
1027
- "grad_norm": 2.0686280727386475,
1028
  "learning_rate": 6.358974358974359e-06,
1029
- "loss": 0.9565,
1030
  "step": 1330
1031
  },
1032
  {
1033
  "epoch": 10.307692307692308,
1034
- "grad_norm": 2.046623945236206,
1035
  "learning_rate": 6.256410256410257e-06,
1036
- "loss": 0.918,
1037
  "step": 1340
1038
  },
1039
  {
1040
  "epoch": 10.384615384615385,
1041
- "grad_norm": 2.281343936920166,
1042
  "learning_rate": 6.153846153846155e-06,
1043
- "loss": 0.9118,
1044
  "step": 1350
1045
  },
1046
  {
1047
  "epoch": 10.461538461538462,
1048
- "grad_norm": 2.694427728652954,
1049
  "learning_rate": 6.051282051282051e-06,
1050
- "loss": 0.9377,
1051
  "step": 1360
1052
  },
1053
  {
1054
  "epoch": 10.538461538461538,
1055
- "grad_norm": 2.3148765563964844,
1056
  "learning_rate": 5.948717948717949e-06,
1057
- "loss": 0.911,
1058
  "step": 1370
1059
  },
1060
  {
1061
  "epoch": 10.615384615384615,
1062
- "grad_norm": 2.595669746398926,
1063
  "learning_rate": 5.846153846153847e-06,
1064
- "loss": 0.9146,
1065
  "step": 1380
1066
  },
1067
  {
1068
  "epoch": 10.692307692307692,
1069
- "grad_norm": 2.136301279067993,
1070
  "learning_rate": 5.743589743589743e-06,
1071
- "loss": 0.9061,
1072
  "step": 1390
1073
  },
1074
  {
1075
  "epoch": 10.76923076923077,
1076
- "grad_norm": 3.0159363746643066,
1077
  "learning_rate": 5.641025641025641e-06,
1078
- "loss": 0.9365,
1079
  "step": 1400
1080
  },
1081
  {
1082
  "epoch": 10.846153846153847,
1083
- "grad_norm": 2.1136507987976074,
1084
  "learning_rate": 5.538461538461539e-06,
1085
- "loss": 0.9232,
1086
  "step": 1410
1087
  },
1088
  {
1089
  "epoch": 10.923076923076923,
1090
- "grad_norm": 1.713663101196289,
1091
  "learning_rate": 5.435897435897436e-06,
1092
- "loss": 0.9344,
1093
  "step": 1420
1094
  },
1095
  {
1096
  "epoch": 11.0,
1097
- "grad_norm": 4.04538631439209,
1098
  "learning_rate": 5.333333333333334e-06,
1099
- "loss": 0.9255,
1100
  "step": 1430
1101
  },
1102
  {
1103
  "epoch": 11.0,
1104
- "eval_accuracy": 0.7894736842105263,
1105
- "eval_loss": 0.8905543088912964,
1106
- "eval_runtime": 0.7659,
1107
- "eval_samples_per_second": 173.659,
1108
- "eval_steps_per_second": 22.197,
1109
  "step": 1430
1110
  },
1111
  {
1112
  "epoch": 11.076923076923077,
1113
- "grad_norm": 2.04194974899292,
1114
  "learning_rate": 5.230769230769232e-06,
1115
- "loss": 0.9333,
1116
  "step": 1440
1117
  },
1118
  {
1119
  "epoch": 11.153846153846153,
1120
- "grad_norm": 3.108344554901123,
1121
  "learning_rate": 5.128205128205128e-06,
1122
- "loss": 0.9174,
1123
  "step": 1450
1124
  },
1125
  {
1126
  "epoch": 11.23076923076923,
1127
- "grad_norm": 2.406233072280884,
1128
  "learning_rate": 5.025641025641026e-06,
1129
- "loss": 0.8948,
1130
  "step": 1460
1131
  },
1132
  {
1133
  "epoch": 11.307692307692308,
1134
- "grad_norm": 2.4100501537323,
1135
  "learning_rate": 4.923076923076924e-06,
1136
- "loss": 0.9155,
1137
  "step": 1470
1138
  },
1139
  {
1140
  "epoch": 11.384615384615385,
1141
- "grad_norm": 2.7117860317230225,
1142
  "learning_rate": 4.820512820512821e-06,
1143
- "loss": 0.9075,
1144
  "step": 1480
1145
  },
1146
  {
1147
  "epoch": 11.461538461538462,
1148
- "grad_norm": 2.0159695148468018,
1149
  "learning_rate": 4.717948717948718e-06,
1150
- "loss": 0.9338,
1151
  "step": 1490
1152
  },
1153
  {
1154
  "epoch": 11.538461538461538,
1155
- "grad_norm": 3.280245304107666,
1156
  "learning_rate": 4.615384615384616e-06,
1157
- "loss": 0.9243,
1158
  "step": 1500
1159
  },
1160
  {
1161
  "epoch": 11.615384615384615,
1162
- "grad_norm": 3.1355690956115723,
1163
  "learning_rate": 4.512820512820513e-06,
1164
- "loss": 0.9185,
1165
  "step": 1510
1166
  },
1167
  {
1168
  "epoch": 11.692307692307692,
1169
- "grad_norm": 3.0900094509124756,
1170
  "learning_rate": 4.4102564102564104e-06,
1171
- "loss": 0.937,
1172
  "step": 1520
1173
  },
1174
  {
1175
  "epoch": 11.76923076923077,
1176
- "grad_norm": 1.8758033514022827,
1177
  "learning_rate": 4.307692307692308e-06,
1178
- "loss": 0.9052,
1179
  "step": 1530
1180
  },
1181
  {
1182
  "epoch": 11.846153846153847,
1183
- "grad_norm": 2.0586955547332764,
1184
  "learning_rate": 4.2051282051282055e-06,
1185
- "loss": 0.8874,
1186
  "step": 1540
1187
  },
1188
  {
1189
  "epoch": 11.923076923076923,
1190
- "grad_norm": 2.0720062255859375,
1191
  "learning_rate": 4.102564102564103e-06,
1192
- "loss": 0.9141,
1193
  "step": 1550
1194
  },
1195
  {
1196
  "epoch": 12.0,
1197
- "grad_norm": 3.183523416519165,
1198
  "learning_rate": 4.000000000000001e-06,
1199
- "loss": 0.8877,
1200
  "step": 1560
1201
  },
1202
  {
1203
  "epoch": 12.0,
1204
- "eval_accuracy": 0.7894736842105263,
1205
- "eval_loss": 0.8749483823776245,
1206
- "eval_runtime": 0.7374,
1207
- "eval_samples_per_second": 180.372,
1208
- "eval_steps_per_second": 23.055,
1209
  "step": 1560
1210
  },
1211
  {
1212
  "epoch": 12.076923076923077,
1213
- "grad_norm": 2.0058720111846924,
1214
  "learning_rate": 3.897435897435898e-06,
1215
- "loss": 0.8829,
1216
  "step": 1570
1217
  },
1218
  {
1219
  "epoch": 12.153846153846153,
1220
- "grad_norm": 2.2991676330566406,
1221
  "learning_rate": 3.794871794871795e-06,
1222
- "loss": 0.9152,
1223
  "step": 1580
1224
  },
1225
  {
1226
  "epoch": 12.23076923076923,
1227
- "grad_norm": 1.5903538465499878,
1228
  "learning_rate": 3.692307692307693e-06,
1229
- "loss": 0.9149,
1230
  "step": 1590
1231
  },
1232
  {
1233
  "epoch": 12.307692307692308,
1234
- "grad_norm": 1.7883615493774414,
1235
  "learning_rate": 3.58974358974359e-06,
1236
- "loss": 0.9163,
1237
  "step": 1600
1238
  },
1239
  {
1240
  "epoch": 12.384615384615385,
1241
- "grad_norm": 2.2841601371765137,
1242
  "learning_rate": 3.487179487179487e-06,
1243
- "loss": 0.8958,
1244
  "step": 1610
1245
  },
1246
  {
1247
  "epoch": 12.461538461538462,
1248
- "grad_norm": 2.3814501762390137,
1249
  "learning_rate": 3.384615384615385e-06,
1250
- "loss": 0.8918,
1251
  "step": 1620
1252
  },
1253
  {
1254
  "epoch": 12.538461538461538,
1255
- "grad_norm": 1.9848734140396118,
1256
  "learning_rate": 3.2820512820512823e-06,
1257
- "loss": 0.889,
1258
  "step": 1630
1259
  },
1260
  {
1261
  "epoch": 12.615384615384615,
1262
- "grad_norm": 1.7236778736114502,
1263
  "learning_rate": 3.1794871794871795e-06,
1264
- "loss": 0.8979,
1265
  "step": 1640
1266
  },
1267
  {
1268
  "epoch": 12.692307692307692,
1269
- "grad_norm": 3.340665102005005,
1270
  "learning_rate": 3.0769230769230774e-06,
1271
- "loss": 0.8695,
1272
  "step": 1650
1273
  },
1274
  {
1275
  "epoch": 12.76923076923077,
1276
- "grad_norm": 2.127927780151367,
1277
  "learning_rate": 2.9743589743589746e-06,
1278
- "loss": 0.9323,
1279
  "step": 1660
1280
  },
1281
  {
1282
  "epoch": 12.846153846153847,
1283
- "grad_norm": 1.8213707208633423,
1284
  "learning_rate": 2.8717948717948717e-06,
1285
- "loss": 0.9178,
1286
  "step": 1670
1287
  },
1288
  {
1289
  "epoch": 12.923076923076923,
1290
- "grad_norm": 2.0011963844299316,
1291
  "learning_rate": 2.7692307692307697e-06,
1292
- "loss": 0.8872,
1293
  "step": 1680
1294
  },
1295
  {
1296
  "epoch": 13.0,
1297
- "grad_norm": 3.812871217727661,
1298
  "learning_rate": 2.666666666666667e-06,
1299
- "loss": 0.9032,
1300
  "step": 1690
1301
  },
1302
  {
1303
  "epoch": 13.0,
1304
- "eval_accuracy": 0.7969924812030075,
1305
- "eval_loss": 0.8698711395263672,
1306
- "eval_runtime": 0.7423,
1307
- "eval_samples_per_second": 179.165,
1308
- "eval_steps_per_second": 22.901,
1309
  "step": 1690
1310
  },
1311
  {
1312
  "epoch": 13.076923076923077,
1313
- "grad_norm": 2.8741540908813477,
1314
  "learning_rate": 2.564102564102564e-06,
1315
- "loss": 0.8842,
1316
  "step": 1700
1317
  },
1318
  {
1319
  "epoch": 13.153846153846153,
1320
- "grad_norm": 2.3278818130493164,
1321
  "learning_rate": 2.461538461538462e-06,
1322
- "loss": 0.9131,
1323
  "step": 1710
1324
  },
1325
  {
1326
  "epoch": 13.23076923076923,
1327
- "grad_norm": 2.8419501781463623,
1328
  "learning_rate": 2.358974358974359e-06,
1329
- "loss": 0.8965,
1330
  "step": 1720
1331
  },
1332
  {
1333
  "epoch": 13.307692307692308,
1334
- "grad_norm": 1.8506221771240234,
1335
  "learning_rate": 2.2564102564102566e-06,
1336
- "loss": 0.8967,
1337
  "step": 1730
1338
  },
1339
  {
1340
  "epoch": 13.384615384615385,
1341
- "grad_norm": 2.6166839599609375,
1342
  "learning_rate": 2.153846153846154e-06,
1343
- "loss": 0.8785,
1344
  "step": 1740
1345
  },
1346
  {
1347
  "epoch": 13.461538461538462,
1348
- "grad_norm": 4.287515640258789,
1349
  "learning_rate": 2.0512820512820513e-06,
1350
- "loss": 0.914,
1351
  "step": 1750
1352
  },
1353
  {
1354
  "epoch": 13.538461538461538,
1355
- "grad_norm": 2.516889810562134,
1356
  "learning_rate": 1.948717948717949e-06,
1357
- "loss": 0.9286,
1358
  "step": 1760
1359
  },
1360
  {
1361
  "epoch": 13.615384615384615,
1362
- "grad_norm": 1.8332946300506592,
1363
  "learning_rate": 1.8461538461538465e-06,
1364
- "loss": 0.8995,
1365
  "step": 1770
1366
  },
1367
  {
1368
  "epoch": 13.692307692307692,
1369
- "grad_norm": 2.2418551445007324,
1370
  "learning_rate": 1.7435897435897436e-06,
1371
- "loss": 0.8818,
1372
  "step": 1780
1373
  },
1374
  {
1375
  "epoch": 13.76923076923077,
1376
- "grad_norm": 1.794832706451416,
1377
  "learning_rate": 1.6410256410256412e-06,
1378
- "loss": 0.9044,
1379
  "step": 1790
1380
  },
1381
  {
1382
  "epoch": 13.846153846153847,
1383
- "grad_norm": 3.0142152309417725,
1384
  "learning_rate": 1.5384615384615387e-06,
1385
- "loss": 0.8826,
1386
  "step": 1800
1387
  },
1388
  {
1389
  "epoch": 13.923076923076923,
1390
- "grad_norm": 2.5891315937042236,
1391
  "learning_rate": 1.4358974358974359e-06,
1392
- "loss": 0.8387,
1393
  "step": 1810
1394
  },
1395
  {
1396
  "epoch": 14.0,
1397
- "grad_norm": 5.37412691116333,
1398
  "learning_rate": 1.3333333333333334e-06,
1399
- "loss": 0.9001,
1400
  "step": 1820
1401
  },
1402
  {
1403
  "epoch": 14.0,
1404
- "eval_accuracy": 0.7819548872180451,
1405
- "eval_loss": 0.8673797845840454,
1406
- "eval_runtime": 0.7642,
1407
- "eval_samples_per_second": 174.027,
1408
- "eval_steps_per_second": 22.244,
1409
  "step": 1820
1410
  },
1411
  {
1412
  "epoch": 14.076923076923077,
1413
- "grad_norm": 1.8213236331939697,
1414
  "learning_rate": 1.230769230769231e-06,
1415
- "loss": 0.9047,
1416
  "step": 1830
1417
  },
1418
  {
1419
  "epoch": 14.153846153846153,
1420
- "grad_norm": 1.8006333112716675,
1421
  "learning_rate": 1.1282051282051283e-06,
1422
- "loss": 0.8768,
1423
  "step": 1840
1424
  },
1425
  {
1426
  "epoch": 14.23076923076923,
1427
- "grad_norm": 2.691574811935425,
1428
  "learning_rate": 1.0256410256410257e-06,
1429
- "loss": 0.8757,
1430
  "step": 1850
1431
  },
1432
  {
1433
  "epoch": 14.307692307692308,
1434
- "grad_norm": 5.015848636627197,
1435
  "learning_rate": 9.230769230769232e-07,
1436
- "loss": 0.8734,
1437
  "step": 1860
1438
  },
1439
  {
1440
  "epoch": 14.384615384615385,
1441
- "grad_norm": 2.5233821868896484,
1442
  "learning_rate": 8.205128205128206e-07,
1443
- "loss": 0.8787,
1444
  "step": 1870
1445
  },
1446
  {
1447
  "epoch": 14.461538461538462,
1448
- "grad_norm": 2.1718924045562744,
1449
  "learning_rate": 7.179487179487179e-07,
1450
- "loss": 0.8767,
1451
  "step": 1880
1452
  },
1453
  {
1454
  "epoch": 14.538461538461538,
1455
- "grad_norm": 3.0364015102386475,
1456
  "learning_rate": 6.153846153846155e-07,
1457
- "loss": 0.873,
1458
  "step": 1890
1459
  },
1460
  {
1461
  "epoch": 14.615384615384615,
1462
- "grad_norm": 2.5152034759521484,
1463
  "learning_rate": 5.128205128205128e-07,
1464
- "loss": 0.9096,
1465
  "step": 1900
1466
  },
1467
  {
1468
  "epoch": 14.692307692307692,
1469
- "grad_norm": 1.819096565246582,
1470
  "learning_rate": 4.102564102564103e-07,
1471
- "loss": 0.892,
1472
  "step": 1910
1473
  },
1474
  {
1475
  "epoch": 14.76923076923077,
1476
- "grad_norm": 3.512732982635498,
1477
  "learning_rate": 3.0769230769230774e-07,
1478
- "loss": 0.8937,
1479
  "step": 1920
1480
  },
1481
  {
1482
  "epoch": 14.846153846153847,
1483
- "grad_norm": 2.917677879333496,
1484
  "learning_rate": 2.0512820512820514e-07,
1485
- "loss": 0.9231,
1486
  "step": 1930
1487
  },
1488
  {
1489
  "epoch": 14.923076923076923,
1490
- "grad_norm": 2.0683395862579346,
1491
  "learning_rate": 1.0256410256410257e-07,
1492
- "loss": 0.8613,
1493
  "step": 1940
1494
  },
1495
  {
1496
  "epoch": 15.0,
1497
- "grad_norm": 4.704519271850586,
1498
  "learning_rate": 0.0,
1499
- "loss": 0.8842,
1500
  "step": 1950
1501
  },
1502
  {
1503
  "epoch": 15.0,
1504
- "eval_accuracy": 0.7894736842105263,
1505
- "eval_loss": 0.8550169467926025,
1506
- "eval_runtime": 0.7718,
1507
- "eval_samples_per_second": 172.329,
1508
- "eval_steps_per_second": 22.027,
1509
  "step": 1950
1510
  },
1511
  {
1512
  "epoch": 15.0,
1513
  "step": 1950,
1514
  "total_flos": 1.5658365504595968e+17,
1515
- "train_loss": 0.6299933981284117,
1516
- "train_runtime": 92.772,
1517
- "train_samples_per_second": 167.184,
1518
- "train_steps_per_second": 21.019
1519
  }
1520
  ],
1521
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.7388833165168762,
3
  "best_model_checkpoint": "./beans_outputs/checkpoint-1950",
4
  "epoch": 15.0,
5
  "eval_steps": 500,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.07692307692307693,
13
+ "grad_norm": 2.1216182708740234,
14
+ "learning_rate": 1.98974358974359e-05,
15
  "loss": 1.1239,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.15384615384615385,
20
+ "grad_norm": 1.8308407068252563,
21
+ "learning_rate": 1.9794871794871798e-05,
22
+ "loss": 1.1221,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.23076923076923078,
27
+ "grad_norm": 1.9811038970947266,
28
+ "learning_rate": 1.9692307692307696e-05,
29
+ "loss": 1.1163,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.3076923076923077,
34
+ "grad_norm": 2.4690616130828857,
35
+ "learning_rate": 1.958974358974359e-05,
36
+ "loss": 1.1046,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.38461538461538464,
41
+ "grad_norm": 1.6778465509414673,
42
+ "learning_rate": 1.9487179487179488e-05,
43
+ "loss": 1.1082,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.46153846153846156,
48
+ "grad_norm": 1.4839682579040527,
49
+ "learning_rate": 1.9384615384615386e-05,
50
+ "loss": 1.1043,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.5384615384615384,
55
+ "grad_norm": 2.6265501976013184,
56
+ "learning_rate": 1.9282051282051284e-05,
57
+ "loss": 1.1028,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.6153846153846154,
62
+ "grad_norm": 2.565593719482422,
63
+ "learning_rate": 1.9179487179487182e-05,
64
+ "loss": 1.0909,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.6923076923076923,
69
+ "grad_norm": 2.020240545272827,
70
+ "learning_rate": 1.907692307692308e-05,
71
+ "loss": 1.0939,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.7692307692307693,
76
+ "grad_norm": 2.49810791015625,
77
+ "learning_rate": 1.8974358974358975e-05,
78
+ "loss": 1.0917,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.8461538461538461,
83
+ "grad_norm": 1.8923050165176392,
84
+ "learning_rate": 1.8871794871794873e-05,
85
+ "loss": 1.0937,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.9230769230769231,
90
+ "grad_norm": 2.2869341373443604,
91
+ "learning_rate": 1.876923076923077e-05,
92
+ "loss": 1.1001,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 1.0,
97
+ "grad_norm": 4.069129467010498,
98
+ "learning_rate": 1.866666666666667e-05,
99
+ "loss": 1.0863,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 1.0,
104
+ "eval_accuracy": 0.42857142857142855,
105
+ "eval_loss": 1.0881553888320923,
106
+ "eval_runtime": 0.9817,
107
+ "eval_samples_per_second": 135.479,
108
+ "eval_steps_per_second": 17.317,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 1.0769230769230769,
113
+ "grad_norm": 1.7320090532302856,
114
+ "learning_rate": 1.8564102564102567e-05,
115
+ "loss": 1.0792,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 1.1538461538461537,
120
+ "grad_norm": 2.5494987964630127,
121
+ "learning_rate": 1.8461538461538465e-05,
122
+ "loss": 1.072,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.2307692307692308,
127
+ "grad_norm": 1.4740803241729736,
128
+ "learning_rate": 1.835897435897436e-05,
129
+ "loss": 1.0773,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.3076923076923077,
134
+ "grad_norm": 2.3478026390075684,
135
+ "learning_rate": 1.8256410256410257e-05,
136
+ "loss": 1.0907,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.3846153846153846,
141
+ "grad_norm": 2.1786012649536133,
142
+ "learning_rate": 1.8153846153846155e-05,
143
+ "loss": 1.08,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.4615384615384617,
148
+ "grad_norm": 2.1593339443206787,
149
+ "learning_rate": 1.8051282051282053e-05,
150
+ "loss": 1.0789,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.5384615384615383,
155
+ "grad_norm": 2.1174795627593994,
156
+ "learning_rate": 1.794871794871795e-05,
157
+ "loss": 1.0779,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.6153846153846154,
162
+ "grad_norm": 1.8251662254333496,
163
+ "learning_rate": 1.784615384615385e-05,
164
+ "loss": 1.067,
165
  "step": 210
166
  },
167
  {
168
  "epoch": 1.6923076923076923,
169
+ "grad_norm": 3.535820245742798,
170
+ "learning_rate": 1.7743589743589744e-05,
171
+ "loss": 1.0701,
172
  "step": 220
173
  },
174
  {
175
  "epoch": 1.7692307692307692,
176
+ "grad_norm": 2.0242528915405273,
177
+ "learning_rate": 1.7641025641025642e-05,
178
+ "loss": 1.0714,
179
  "step": 230
180
  },
181
  {
182
  "epoch": 1.8461538461538463,
183
+ "grad_norm": 2.1979262828826904,
184
+ "learning_rate": 1.753846153846154e-05,
185
+ "loss": 1.0868,
186
  "step": 240
187
  },
188
  {
189
  "epoch": 1.9230769230769231,
190
+ "grad_norm": 2.096327543258667,
191
+ "learning_rate": 1.7435897435897438e-05,
192
+ "loss": 1.0682,
193
  "step": 250
194
  },
195
  {
196
  "epoch": 2.0,
197
+ "grad_norm": 3.008599042892456,
198
+ "learning_rate": 1.7333333333333336e-05,
199
+ "loss": 1.063,
200
  "step": 260
201
  },
202
  {
203
  "epoch": 2.0,
204
+ "eval_accuracy": 0.5413533834586466,
205
+ "eval_loss": 1.0589852333068848,
206
+ "eval_runtime": 0.7866,
207
+ "eval_samples_per_second": 169.088,
208
+ "eval_steps_per_second": 21.613,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 2.076923076923077,
213
+ "grad_norm": 2.03446888923645,
214
+ "learning_rate": 1.7230769230769234e-05,
215
+ "loss": 1.0712,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 2.1538461538461537,
220
+ "grad_norm": 2.4047350883483887,
221
+ "learning_rate": 1.7128205128205128e-05,
222
+ "loss": 1.0593,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 2.230769230769231,
227
+ "grad_norm": 2.37992262840271,
228
+ "learning_rate": 1.7025641025641026e-05,
229
+ "loss": 1.0577,
230
  "step": 290
231
  },
232
  {
233
  "epoch": 2.3076923076923075,
234
+ "grad_norm": 2.940575361251831,
235
+ "learning_rate": 1.6923076923076924e-05,
236
+ "loss": 1.044,
237
  "step": 300
238
  },
239
  {
240
  "epoch": 2.3846153846153846,
241
+ "grad_norm": 1.983780026435852,
242
+ "learning_rate": 1.6820512820512822e-05,
243
+ "loss": 1.0573,
244
  "step": 310
245
  },
246
  {
247
  "epoch": 2.4615384615384617,
248
+ "grad_norm": 1.9737600088119507,
249
+ "learning_rate": 1.671794871794872e-05,
250
+ "loss": 1.048,
251
  "step": 320
252
  },
253
  {
254
  "epoch": 2.5384615384615383,
255
+ "grad_norm": 1.4561364650726318,
256
+ "learning_rate": 1.6615384615384618e-05,
257
+ "loss": 1.0317,
258
  "step": 330
259
  },
260
  {
261
  "epoch": 2.6153846153846154,
262
+ "grad_norm": 2.187842845916748,
263
+ "learning_rate": 1.6512820512820513e-05,
264
+ "loss": 1.0482,
265
  "step": 340
266
  },
267
  {
268
  "epoch": 2.6923076923076925,
269
+ "grad_norm": 1.9264109134674072,
270
+ "learning_rate": 1.641025641025641e-05,
271
+ "loss": 1.0491,
272
  "step": 350
273
  },
274
  {
275
  "epoch": 2.769230769230769,
276
+ "grad_norm": 1.8492430448532104,
277
+ "learning_rate": 1.630769230769231e-05,
278
+ "loss": 1.0409,
279
  "step": 360
280
  },
281
  {
282
  "epoch": 2.8461538461538463,
283
+ "grad_norm": 2.1464972496032715,
284
+ "learning_rate": 1.6205128205128207e-05,
285
+ "loss": 1.0359,
286
  "step": 370
287
  },
288
  {
289
  "epoch": 2.9230769230769234,
290
+ "grad_norm": 1.8184483051300049,
291
+ "learning_rate": 1.6102564102564105e-05,
292
+ "loss": 1.0237,
293
  "step": 380
294
  },
295
  {
296
  "epoch": 3.0,
297
+ "grad_norm": 4.10977029800415,
298
+ "learning_rate": 1.6000000000000003e-05,
299
+ "loss": 1.0447,
300
  "step": 390
301
  },
302
  {
303
  "epoch": 3.0,
304
+ "eval_accuracy": 0.6992481203007519,
305
+ "eval_loss": 1.022922396659851,
306
+ "eval_runtime": 0.831,
307
+ "eval_samples_per_second": 160.057,
308
+ "eval_steps_per_second": 20.458,
309
  "step": 390
310
  },
311
  {
312
  "epoch": 3.076923076923077,
313
+ "grad_norm": 2.4164669513702393,
314
+ "learning_rate": 1.5897435897435897e-05,
315
+ "loss": 1.0417,
316
  "step": 400
317
  },
318
  {
319
  "epoch": 3.1538461538461537,
320
+ "grad_norm": 1.9611194133758545,
321
+ "learning_rate": 1.5794871794871795e-05,
322
+ "loss": 1.03,
323
  "step": 410
324
  },
325
  {
326
  "epoch": 3.230769230769231,
327
+ "grad_norm": 2.1152236461639404,
328
+ "learning_rate": 1.5692307692307693e-05,
329
+ "loss": 1.038,
330
  "step": 420
331
  },
332
  {
333
  "epoch": 3.3076923076923075,
334
+ "grad_norm": 2.2656948566436768,
335
+ "learning_rate": 1.558974358974359e-05,
336
+ "loss": 1.0163,
337
  "step": 430
338
  },
339
  {
340
  "epoch": 3.3846153846153846,
341
+ "grad_norm": 2.1608102321624756,
342
+ "learning_rate": 1.548717948717949e-05,
343
+ "loss": 1.0277,
344
  "step": 440
345
  },
346
  {
347
  "epoch": 3.4615384615384617,
348
+ "grad_norm": 1.7433905601501465,
349
+ "learning_rate": 1.5384615384615387e-05,
350
+ "loss": 1.0379,
351
  "step": 450
352
  },
353
  {
354
  "epoch": 3.5384615384615383,
355
+ "grad_norm": 2.625889778137207,
356
+ "learning_rate": 1.5282051282051282e-05,
357
+ "loss": 1.0314,
358
  "step": 460
359
  },
360
  {
361
  "epoch": 3.6153846153846154,
362
+ "grad_norm": 1.6532872915267944,
363
+ "learning_rate": 1.517948717948718e-05,
364
+ "loss": 1.0052,
365
  "step": 470
366
  },
367
  {
368
  "epoch": 3.6923076923076925,
369
+ "grad_norm": 2.715625524520874,
370
+ "learning_rate": 1.5076923076923078e-05,
371
+ "loss": 1.0101,
372
  "step": 480
373
  },
374
  {
375
  "epoch": 3.769230769230769,
376
+ "grad_norm": 1.6237256526947021,
377
+ "learning_rate": 1.4974358974358976e-05,
378
+ "loss": 0.9981,
379
  "step": 490
380
  },
381
  {
382
  "epoch": 3.8461538461538463,
383
+ "grad_norm": 1.582321286201477,
384
+ "learning_rate": 1.4871794871794874e-05,
385
+ "loss": 1.0226,
386
  "step": 500
387
  },
388
  {
389
  "epoch": 3.9230769230769234,
390
+ "grad_norm": 1.7656166553497314,
391
+ "learning_rate": 1.4769230769230772e-05,
392
+ "loss": 0.9989,
393
  "step": 510
394
  },
395
  {
396
  "epoch": 4.0,
397
+ "grad_norm": 5.592857837677002,
398
+ "learning_rate": 1.4666666666666666e-05,
399
+ "loss": 1.0223,
400
  "step": 520
401
  },
402
  {
403
  "epoch": 4.0,
404
+ "eval_accuracy": 0.6917293233082706,
405
+ "eval_loss": 0.9968266487121582,
406
+ "eval_runtime": 0.8013,
407
+ "eval_samples_per_second": 165.971,
408
+ "eval_steps_per_second": 21.214,
409
  "step": 520
410
  },
411
  {
412
  "epoch": 4.076923076923077,
413
+ "grad_norm": 2.7708559036254883,
414
+ "learning_rate": 1.4564102564102564e-05,
415
+ "loss": 1.0136,
416
  "step": 530
417
  },
418
  {
419
  "epoch": 4.153846153846154,
420
+ "grad_norm": 1.880313515663147,
421
+ "learning_rate": 1.4461538461538462e-05,
422
+ "loss": 1.0166,
423
  "step": 540
424
  },
425
  {
426
  "epoch": 4.230769230769231,
427
+ "grad_norm": 2.6722023487091064,
428
+ "learning_rate": 1.435897435897436e-05,
429
+ "loss": 1.0238,
430
  "step": 550
431
  },
432
  {
433
  "epoch": 4.3076923076923075,
434
+ "grad_norm": 2.3504159450531006,
435
+ "learning_rate": 1.4256410256410258e-05,
436
+ "loss": 1.0338,
437
  "step": 560
438
  },
439
  {
440
  "epoch": 4.384615384615385,
441
+ "grad_norm": 2.2466416358947754,
442
+ "learning_rate": 1.4153846153846156e-05,
443
+ "loss": 0.9838,
444
  "step": 570
445
  },
446
  {
447
  "epoch": 4.461538461538462,
448
+ "grad_norm": 2.590592861175537,
449
+ "learning_rate": 1.405128205128205e-05,
450
+ "loss": 0.9915,
451
  "step": 580
452
  },
453
  {
454
  "epoch": 4.538461538461538,
455
+ "grad_norm": 2.3943469524383545,
456
+ "learning_rate": 1.3948717948717949e-05,
457
+ "loss": 0.9885,
458
  "step": 590
459
  },
460
  {
461
  "epoch": 4.615384615384615,
462
+ "grad_norm": 1.8981530666351318,
463
+ "learning_rate": 1.3846153846153847e-05,
464
+ "loss": 0.9802,
465
  "step": 600
466
  },
467
  {
468
  "epoch": 4.6923076923076925,
469
+ "grad_norm": 3.2020835876464844,
470
+ "learning_rate": 1.3743589743589745e-05,
471
+ "loss": 0.9993,
472
  "step": 610
473
  },
474
  {
475
  "epoch": 4.769230769230769,
476
+ "grad_norm": 2.171642541885376,
477
+ "learning_rate": 1.3641025641025643e-05,
478
+ "loss": 1.0033,
479
  "step": 620
480
  },
481
  {
482
  "epoch": 4.846153846153846,
483
+ "grad_norm": 1.8594719171524048,
484
+ "learning_rate": 1.353846153846154e-05,
485
+ "loss": 0.9689,
486
  "step": 630
487
  },
488
  {
489
  "epoch": 4.923076923076923,
490
+ "grad_norm": 2.3601503372192383,
491
+ "learning_rate": 1.3435897435897435e-05,
492
+ "loss": 0.9703,
493
  "step": 640
494
  },
495
  {
496
  "epoch": 5.0,
497
+ "grad_norm": 5.1482744216918945,
498
+ "learning_rate": 1.3333333333333333e-05,
499
+ "loss": 1.0,
500
  "step": 650
501
  },
502
  {
503
  "epoch": 5.0,
504
+ "eval_accuracy": 0.7518796992481203,
505
+ "eval_loss": 0.9575085639953613,
506
+ "eval_runtime": 0.8114,
507
+ "eval_samples_per_second": 163.92,
508
+ "eval_steps_per_second": 20.952,
509
  "step": 650
510
  },
511
  {
512
  "epoch": 5.076923076923077,
513
+ "grad_norm": 1.8753620386123657,
514
  "learning_rate": 1.3230769230769231e-05,
515
+ "loss": 0.9664,
516
  "step": 660
517
  },
518
  {
519
  "epoch": 5.153846153846154,
520
+ "grad_norm": 2.668313503265381,
521
  "learning_rate": 1.312820512820513e-05,
522
+ "loss": 1.0055,
523
  "step": 670
524
  },
525
  {
526
  "epoch": 5.230769230769231,
527
+ "grad_norm": 2.544369697570801,
528
  "learning_rate": 1.3025641025641027e-05,
529
+ "loss": 0.9936,
530
  "step": 680
531
  },
532
  {
533
  "epoch": 5.3076923076923075,
534
+ "grad_norm": 2.7921175956726074,
535
  "learning_rate": 1.2923076923076925e-05,
536
+ "loss": 0.9852,
537
  "step": 690
538
  },
539
  {
540
  "epoch": 5.384615384615385,
541
+ "grad_norm": 2.6936609745025635,
542
  "learning_rate": 1.2820512820512823e-05,
543
+ "loss": 0.9725,
544
  "step": 700
545
  },
546
  {
547
  "epoch": 5.461538461538462,
548
+ "grad_norm": 2.4392247200012207,
549
  "learning_rate": 1.2717948717948718e-05,
550
+ "loss": 0.9653,
551
  "step": 710
552
  },
553
  {
554
  "epoch": 5.538461538461538,
555
+ "grad_norm": 1.9628013372421265,
556
  "learning_rate": 1.2615384615384616e-05,
557
+ "loss": 0.9497,
558
  "step": 720
559
  },
560
  {
561
  "epoch": 5.615384615384615,
562
+ "grad_norm": 2.3558847904205322,
563
  "learning_rate": 1.2512820512820514e-05,
564
+ "loss": 0.9646,
565
  "step": 730
566
  },
567
  {
568
  "epoch": 5.6923076923076925,
569
+ "grad_norm": 2.4973325729370117,
570
  "learning_rate": 1.2410256410256412e-05,
571
+ "loss": 0.9458,
572
  "step": 740
573
  },
574
  {
575
  "epoch": 5.769230769230769,
576
+ "grad_norm": 2.091280221939087,
577
  "learning_rate": 1.230769230769231e-05,
578
+ "loss": 0.9506,
579
  "step": 750
580
  },
581
  {
582
  "epoch": 5.846153846153846,
583
+ "grad_norm": 2.1584551334381104,
584
  "learning_rate": 1.2205128205128208e-05,
585
+ "loss": 0.9517,
586
  "step": 760
587
  },
588
  {
589
  "epoch": 5.923076923076923,
590
+ "grad_norm": 3.8231847286224365,
591
  "learning_rate": 1.2102564102564102e-05,
592
+ "loss": 0.9375,
593
  "step": 770
594
  },
595
  {
596
  "epoch": 6.0,
597
+ "grad_norm": 4.746103286743164,
598
  "learning_rate": 1.2e-05,
599
+ "loss": 0.9726,
600
  "step": 780
601
  },
602
  {
603
  "epoch": 6.0,
604
+ "eval_accuracy": 0.7744360902255639,
605
+ "eval_loss": 0.92984938621521,
606
+ "eval_runtime": 0.8125,
607
+ "eval_samples_per_second": 163.686,
608
+ "eval_steps_per_second": 20.922,
609
  "step": 780
610
  },
611
  {
612
  "epoch": 6.076923076923077,
613
+ "grad_norm": 2.8764736652374268,
614
  "learning_rate": 1.1897435897435898e-05,
615
+ "loss": 0.9424,
616
  "step": 790
617
  },
618
  {
619
  "epoch": 6.153846153846154,
620
+ "grad_norm": 2.512113332748413,
621
  "learning_rate": 1.1794871794871796e-05,
622
+ "loss": 0.9378,
623
  "step": 800
624
  },
625
  {
626
  "epoch": 6.230769230769231,
627
+ "grad_norm": 1.8056284189224243,
628
  "learning_rate": 1.1692307692307694e-05,
629
+ "loss": 0.9309,
630
  "step": 810
631
  },
632
  {
633
  "epoch": 6.3076923076923075,
634
+ "grad_norm": 2.3125550746917725,
635
  "learning_rate": 1.1589743589743592e-05,
636
+ "loss": 0.9316,
637
  "step": 820
638
  },
639
  {
640
  "epoch": 6.384615384615385,
641
+ "grad_norm": 2.485017776489258,
642
  "learning_rate": 1.1487179487179487e-05,
643
+ "loss": 0.9359,
644
  "step": 830
645
  },
646
  {
647
  "epoch": 6.461538461538462,
648
+ "grad_norm": 3.3460640907287598,
649
  "learning_rate": 1.1384615384615385e-05,
650
+ "loss": 0.9316,
651
  "step": 840
652
  },
653
  {
654
  "epoch": 6.538461538461538,
655
+ "grad_norm": 2.9212265014648438,
656
  "learning_rate": 1.1282051282051283e-05,
657
+ "loss": 0.9417,
658
  "step": 850
659
  },
660
  {
661
  "epoch": 6.615384615384615,
662
+ "grad_norm": 1.9060611724853516,
663
  "learning_rate": 1.117948717948718e-05,
664
+ "loss": 0.9127,
665
  "step": 860
666
  },
667
  {
668
  "epoch": 6.6923076923076925,
669
+ "grad_norm": 2.729116201400757,
670
  "learning_rate": 1.1076923076923079e-05,
671
+ "loss": 0.9581,
672
  "step": 870
673
  },
674
  {
675
  "epoch": 6.769230769230769,
676
+ "grad_norm": 2.170494794845581,
677
  "learning_rate": 1.0974358974358977e-05,
678
+ "loss": 0.9267,
679
  "step": 880
680
  },
681
  {
682
  "epoch": 6.846153846153846,
683
+ "grad_norm": 2.36336350440979,
684
  "learning_rate": 1.0871794871794871e-05,
685
+ "loss": 0.9209,
686
  "step": 890
687
  },
688
  {
689
  "epoch": 6.923076923076923,
690
+ "grad_norm": 3.067629098892212,
691
  "learning_rate": 1.076923076923077e-05,
692
+ "loss": 0.9425,
693
  "step": 900
694
  },
695
  {
696
  "epoch": 7.0,
697
+ "grad_norm": 4.193312168121338,
698
  "learning_rate": 1.0666666666666667e-05,
699
+ "loss": 0.9258,
700
  "step": 910
701
  },
702
  {
703
  "epoch": 7.0,
704
+ "eval_accuracy": 0.8045112781954887,
705
+ "eval_loss": 0.8871035575866699,
706
+ "eval_runtime": 0.9311,
707
+ "eval_samples_per_second": 142.845,
708
+ "eval_steps_per_second": 18.258,
709
  "step": 910
710
  },
711
  {
712
  "epoch": 7.076923076923077,
713
+ "grad_norm": 2.743016004562378,
714
  "learning_rate": 1.0564102564102565e-05,
715
+ "loss": 0.9111,
716
  "step": 920
717
  },
718
  {
719
  "epoch": 7.153846153846154,
720
+ "grad_norm": 3.4307682514190674,
721
  "learning_rate": 1.0461538461538463e-05,
722
+ "loss": 0.9108,
723
  "step": 930
724
  },
725
  {
726
  "epoch": 7.230769230769231,
727
+ "grad_norm": 3.426872968673706,
728
  "learning_rate": 1.0358974358974361e-05,
729
+ "loss": 0.9011,
730
  "step": 940
731
  },
732
  {
733
  "epoch": 7.3076923076923075,
734
+ "grad_norm": 2.8081014156341553,
735
  "learning_rate": 1.0256410256410256e-05,
736
+ "loss": 0.9076,
737
  "step": 950
738
  },
739
  {
740
  "epoch": 7.384615384615385,
741
+ "grad_norm": 2.9387893676757812,
742
  "learning_rate": 1.0153846153846154e-05,
743
+ "loss": 0.9236,
744
  "step": 960
745
  },
746
  {
747
  "epoch": 7.461538461538462,
748
+ "grad_norm": 2.192082643508911,
749
  "learning_rate": 1.0051282051282052e-05,
750
+ "loss": 0.8889,
751
  "step": 970
752
  },
753
  {
754
  "epoch": 7.538461538461538,
755
+ "grad_norm": 2.60426926612854,
756
  "learning_rate": 9.94871794871795e-06,
757
+ "loss": 0.9243,
758
  "step": 980
759
  },
760
  {
761
  "epoch": 7.615384615384615,
762
+ "grad_norm": 2.2395219802856445,
763
  "learning_rate": 9.846153846153848e-06,
764
+ "loss": 0.8859,
765
  "step": 990
766
  },
767
  {
768
  "epoch": 7.6923076923076925,
769
+ "grad_norm": 2.0519468784332275,
770
  "learning_rate": 9.743589743589744e-06,
771
+ "loss": 0.8821,
772
  "step": 1000
773
  },
774
  {
775
  "epoch": 7.769230769230769,
776
+ "grad_norm": 3.1931142807006836,
777
  "learning_rate": 9.641025641025642e-06,
778
+ "loss": 0.8825,
779
  "step": 1010
780
  },
781
  {
782
  "epoch": 7.846153846153846,
783
+ "grad_norm": 2.4902703762054443,
784
  "learning_rate": 9.53846153846154e-06,
785
+ "loss": 0.9106,
786
  "step": 1020
787
  },
788
  {
789
  "epoch": 7.923076923076923,
790
+ "grad_norm": 2.7248635292053223,
791
  "learning_rate": 9.435897435897436e-06,
792
+ "loss": 0.8597,
793
  "step": 1030
794
  },
795
  {
796
  "epoch": 8.0,
797
+ "grad_norm": 4.114623069763184,
798
  "learning_rate": 9.333333333333334e-06,
799
+ "loss": 0.9203,
800
  "step": 1040
801
  },
802
  {
803
  "epoch": 8.0,
804
+ "eval_accuracy": 0.8345864661654135,
805
+ "eval_loss": 0.8486866354942322,
806
+ "eval_runtime": 0.8033,
807
+ "eval_samples_per_second": 165.565,
808
+ "eval_steps_per_second": 21.162,
809
  "step": 1040
810
  },
811
  {
812
  "epoch": 8.076923076923077,
813
+ "grad_norm": 1.7978770732879639,
814
  "learning_rate": 9.230769230769232e-06,
815
+ "loss": 0.8922,
816
  "step": 1050
817
  },
818
  {
819
  "epoch": 8.153846153846153,
820
+ "grad_norm": 2.1388049125671387,
821
  "learning_rate": 9.128205128205129e-06,
822
+ "loss": 0.8696,
823
  "step": 1060
824
  },
825
  {
826
  "epoch": 8.23076923076923,
827
+ "grad_norm": 3.935715913772583,
828
  "learning_rate": 9.025641025641027e-06,
829
+ "loss": 0.9006,
830
  "step": 1070
831
  },
832
  {
833
  "epoch": 8.307692307692308,
834
+ "grad_norm": 2.129542112350464,
835
  "learning_rate": 8.923076923076925e-06,
836
+ "loss": 0.8603,
837
  "step": 1080
838
  },
839
  {
840
  "epoch": 8.384615384615385,
841
+ "grad_norm": 2.4084651470184326,
842
  "learning_rate": 8.820512820512821e-06,
843
+ "loss": 0.8989,
844
  "step": 1090
845
  },
846
  {
847
  "epoch": 8.461538461538462,
848
+ "grad_norm": 2.7610435485839844,
849
  "learning_rate": 8.717948717948719e-06,
850
+ "loss": 0.8732,
851
  "step": 1100
852
  },
853
  {
854
  "epoch": 8.538461538461538,
855
+ "grad_norm": 1.6848160028457642,
856
  "learning_rate": 8.615384615384617e-06,
857
+ "loss": 0.8715,
858
  "step": 1110
859
  },
860
  {
861
  "epoch": 8.615384615384615,
862
+ "grad_norm": 3.277689218521118,
863
  "learning_rate": 8.512820512820513e-06,
864
+ "loss": 0.8989,
865
  "step": 1120
866
  },
867
  {
868
  "epoch": 8.692307692307692,
869
+ "grad_norm": 3.7184839248657227,
870
  "learning_rate": 8.410256410256411e-06,
871
+ "loss": 0.8822,
872
  "step": 1130
873
  },
874
  {
875
  "epoch": 8.76923076923077,
876
+ "grad_norm": 2.3165555000305176,
877
  "learning_rate": 8.307692307692309e-06,
878
+ "loss": 0.889,
879
  "step": 1140
880
  },
881
  {
882
  "epoch": 8.846153846153847,
883
+ "grad_norm": 2.629028558731079,
884
  "learning_rate": 8.205128205128205e-06,
885
+ "loss": 0.8753,
886
  "step": 1150
887
  },
888
  {
889
  "epoch": 8.923076923076923,
890
+ "grad_norm": 2.8620731830596924,
891
  "learning_rate": 8.102564102564103e-06,
892
+ "loss": 0.8855,
893
  "step": 1160
894
  },
895
  {
896
  "epoch": 9.0,
897
+ "grad_norm": 3.924490451812744,
898
  "learning_rate": 8.000000000000001e-06,
899
+ "loss": 0.9038,
900
  "step": 1170
901
  },
902
  {
903
  "epoch": 9.0,
904
+ "eval_accuracy": 0.8120300751879699,
905
+ "eval_loss": 0.8329855799674988,
906
+ "eval_runtime": 0.781,
907
+ "eval_samples_per_second": 170.303,
908
+ "eval_steps_per_second": 21.768,
909
  "step": 1170
910
  },
911
  {
912
  "epoch": 9.076923076923077,
913
+ "grad_norm": 1.9303114414215088,
914
  "learning_rate": 7.897435897435898e-06,
915
+ "loss": 0.8952,
916
  "step": 1180
917
  },
918
  {
919
  "epoch": 9.153846153846153,
920
+ "grad_norm": 2.1960716247558594,
921
  "learning_rate": 7.794871794871796e-06,
922
+ "loss": 0.8401,
923
  "step": 1190
924
  },
925
  {
926
  "epoch": 9.23076923076923,
927
+ "grad_norm": 2.7040772438049316,
928
  "learning_rate": 7.692307692307694e-06,
929
+ "loss": 0.8488,
930
  "step": 1200
931
  },
932
  {
933
  "epoch": 9.307692307692308,
934
+ "grad_norm": 2.097287654876709,
935
  "learning_rate": 7.58974358974359e-06,
936
+ "loss": 0.8544,
937
  "step": 1210
938
  },
939
  {
940
  "epoch": 9.384615384615385,
941
+ "grad_norm": 2.012044906616211,
942
  "learning_rate": 7.487179487179488e-06,
943
+ "loss": 0.8679,
944
  "step": 1220
945
  },
946
  {
947
  "epoch": 9.461538461538462,
948
+ "grad_norm": 3.2651612758636475,
949
  "learning_rate": 7.384615384615386e-06,
950
+ "loss": 0.8558,
951
  "step": 1230
952
  },
953
  {
954
  "epoch": 9.538461538461538,
955
+ "grad_norm": 2.5597023963928223,
956
  "learning_rate": 7.282051282051282e-06,
957
+ "loss": 0.8719,
958
  "step": 1240
959
  },
960
  {
961
  "epoch": 9.615384615384615,
962
+ "grad_norm": 2.756129503250122,
963
  "learning_rate": 7.17948717948718e-06,
964
+ "loss": 0.8272,
965
  "step": 1250
966
  },
967
  {
968
  "epoch": 9.692307692307692,
969
+ "grad_norm": 2.7830724716186523,
970
  "learning_rate": 7.076923076923078e-06,
971
+ "loss": 0.854,
972
  "step": 1260
973
  },
974
  {
975
  "epoch": 9.76923076923077,
976
+ "grad_norm": 1.841320514678955,
977
  "learning_rate": 6.974358974358974e-06,
978
+ "loss": 0.8283,
979
  "step": 1270
980
  },
981
  {
982
  "epoch": 9.846153846153847,
983
+ "grad_norm": 2.4447691440582275,
984
  "learning_rate": 6.871794871794872e-06,
985
+ "loss": 0.8179,
986
  "step": 1280
987
  },
988
  {
989
  "epoch": 9.923076923076923,
990
+ "grad_norm": 2.266535758972168,
991
  "learning_rate": 6.76923076923077e-06,
992
+ "loss": 0.8368,
993
  "step": 1290
994
  },
995
  {
996
  "epoch": 10.0,
997
+ "grad_norm": 4.2985944747924805,
998
  "learning_rate": 6.666666666666667e-06,
999
+ "loss": 0.8112,
1000
  "step": 1300
1001
  },
1002
  {
1003
  "epoch": 10.0,
1004
+ "eval_accuracy": 0.8345864661654135,
1005
+ "eval_loss": 0.8083619475364685,
1006
+ "eval_runtime": 0.8269,
1007
+ "eval_samples_per_second": 160.851,
1008
+ "eval_steps_per_second": 20.56,
1009
  "step": 1300
1010
  },
1011
  {
1012
  "epoch": 10.076923076923077,
1013
+ "grad_norm": 1.931143045425415,
1014
  "learning_rate": 6.564102564102565e-06,
1015
+ "loss": 0.8378,
1016
  "step": 1310
1017
  },
1018
  {
1019
  "epoch": 10.153846153846153,
1020
+ "grad_norm": 2.4910976886749268,
1021
  "learning_rate": 6.461538461538463e-06,
1022
+ "loss": 0.8909,
1023
  "step": 1320
1024
  },
1025
  {
1026
  "epoch": 10.23076923076923,
1027
+ "grad_norm": 2.3511204719543457,
1028
  "learning_rate": 6.358974358974359e-06,
1029
+ "loss": 0.8715,
1030
  "step": 1330
1031
  },
1032
  {
1033
  "epoch": 10.307692307692308,
1034
+ "grad_norm": 2.0618457794189453,
1035
  "learning_rate": 6.256410256410257e-06,
1036
+ "loss": 0.833,
1037
  "step": 1340
1038
  },
1039
  {
1040
  "epoch": 10.384615384615385,
1041
+ "grad_norm": 2.4037184715270996,
1042
  "learning_rate": 6.153846153846155e-06,
1043
+ "loss": 0.809,
1044
  "step": 1350
1045
  },
1046
  {
1047
  "epoch": 10.461538461538462,
1048
+ "grad_norm": 2.6920347213745117,
1049
  "learning_rate": 6.051282051282051e-06,
1050
+ "loss": 0.8441,
1051
  "step": 1360
1052
  },
1053
  {
1054
  "epoch": 10.538461538461538,
1055
+ "grad_norm": 2.5570709705352783,
1056
  "learning_rate": 5.948717948717949e-06,
1057
+ "loss": 0.8166,
1058
  "step": 1370
1059
  },
1060
  {
1061
  "epoch": 10.615384615384615,
1062
+ "grad_norm": 2.6978418827056885,
1063
  "learning_rate": 5.846153846153847e-06,
1064
+ "loss": 0.8193,
1065
  "step": 1380
1066
  },
1067
  {
1068
  "epoch": 10.692307692307692,
1069
+ "grad_norm": 2.370861291885376,
1070
  "learning_rate": 5.743589743589743e-06,
1071
+ "loss": 0.81,
1072
  "step": 1390
1073
  },
1074
  {
1075
  "epoch": 10.76923076923077,
1076
+ "grad_norm": 3.260789394378662,
1077
  "learning_rate": 5.641025641025641e-06,
1078
+ "loss": 0.8671,
1079
  "step": 1400
1080
  },
1081
  {
1082
  "epoch": 10.846153846153847,
1083
+ "grad_norm": 2.27559494972229,
1084
  "learning_rate": 5.538461538461539e-06,
1085
+ "loss": 0.8321,
1086
  "step": 1410
1087
  },
1088
  {
1089
  "epoch": 10.923076923076923,
1090
+ "grad_norm": 1.8184912204742432,
1091
  "learning_rate": 5.435897435897436e-06,
1092
+ "loss": 0.8388,
1093
  "step": 1420
1094
  },
1095
  {
1096
  "epoch": 11.0,
1097
+ "grad_norm": 4.08992862701416,
1098
  "learning_rate": 5.333333333333334e-06,
1099
+ "loss": 0.8335,
1100
  "step": 1430
1101
  },
1102
  {
1103
  "epoch": 11.0,
1104
+ "eval_accuracy": 0.8345864661654135,
1105
+ "eval_loss": 0.7785491943359375,
1106
+ "eval_runtime": 0.8255,
1107
+ "eval_samples_per_second": 161.106,
1108
+ "eval_steps_per_second": 20.592,
1109
  "step": 1430
1110
  },
1111
  {
1112
  "epoch": 11.076923076923077,
1113
+ "grad_norm": 2.0166661739349365,
1114
  "learning_rate": 5.230769230769232e-06,
1115
+ "loss": 0.8471,
1116
  "step": 1440
1117
  },
1118
  {
1119
  "epoch": 11.153846153846153,
1120
+ "grad_norm": 3.2678654193878174,
1121
  "learning_rate": 5.128205128205128e-06,
1122
+ "loss": 0.8298,
1123
  "step": 1450
1124
  },
1125
  {
1126
  "epoch": 11.23076923076923,
1127
+ "grad_norm": 2.545058488845825,
1128
  "learning_rate": 5.025641025641026e-06,
1129
+ "loss": 0.7991,
1130
  "step": 1460
1131
  },
1132
  {
1133
  "epoch": 11.307692307692308,
1134
+ "grad_norm": 2.469082832336426,
1135
  "learning_rate": 4.923076923076924e-06,
1136
+ "loss": 0.8256,
1137
  "step": 1470
1138
  },
1139
  {
1140
  "epoch": 11.384615384615385,
1141
+ "grad_norm": 2.8207569122314453,
1142
  "learning_rate": 4.820512820512821e-06,
1143
+ "loss": 0.8172,
1144
  "step": 1480
1145
  },
1146
  {
1147
  "epoch": 11.461538461538462,
1148
+ "grad_norm": 2.142630100250244,
1149
  "learning_rate": 4.717948717948718e-06,
1150
+ "loss": 0.8448,
1151
  "step": 1490
1152
  },
1153
  {
1154
  "epoch": 11.538461538461538,
1155
+ "grad_norm": 2.3974075317382812,
1156
  "learning_rate": 4.615384615384616e-06,
1157
+ "loss": 0.8292,
1158
  "step": 1500
1159
  },
1160
  {
1161
  "epoch": 11.615384615384615,
1162
+ "grad_norm": 3.240954875946045,
1163
  "learning_rate": 4.512820512820513e-06,
1164
+ "loss": 0.8275,
1165
  "step": 1510
1166
  },
1167
  {
1168
  "epoch": 11.692307692307692,
1169
+ "grad_norm": 3.3133740425109863,
1170
  "learning_rate": 4.4102564102564104e-06,
1171
+ "loss": 0.8569,
1172
  "step": 1520
1173
  },
1174
  {
1175
  "epoch": 11.76923076923077,
1176
+ "grad_norm": 2.0835375785827637,
1177
  "learning_rate": 4.307692307692308e-06,
1178
+ "loss": 0.8086,
1179
  "step": 1530
1180
  },
1181
  {
1182
  "epoch": 11.846153846153847,
1183
+ "grad_norm": 2.20538067817688,
1184
  "learning_rate": 4.2051282051282055e-06,
1185
+ "loss": 0.797,
1186
  "step": 1540
1187
  },
1188
  {
1189
  "epoch": 11.923076923076923,
1190
+ "grad_norm": 2.1278247833251953,
1191
  "learning_rate": 4.102564102564103e-06,
1192
+ "loss": 0.8301,
1193
  "step": 1550
1194
  },
1195
  {
1196
  "epoch": 12.0,
1197
+ "grad_norm": 3.4031107425689697,
1198
  "learning_rate": 4.000000000000001e-06,
1199
+ "loss": 0.8062,
1200
  "step": 1560
1201
  },
1202
  {
1203
  "epoch": 12.0,
1204
+ "eval_accuracy": 0.8345864661654135,
1205
+ "eval_loss": 0.7569367289543152,
1206
+ "eval_runtime": 0.799,
1207
+ "eval_samples_per_second": 166.458,
1208
+ "eval_steps_per_second": 21.277,
1209
  "step": 1560
1210
  },
1211
  {
1212
  "epoch": 12.076923076923077,
1213
+ "grad_norm": 2.181518793106079,
1214
  "learning_rate": 3.897435897435898e-06,
1215
+ "loss": 0.7903,
1216
  "step": 1570
1217
  },
1218
  {
1219
  "epoch": 12.153846153846153,
1220
+ "grad_norm": 2.4533674716949463,
1221
  "learning_rate": 3.794871794871795e-06,
1222
+ "loss": 0.8228,
1223
  "step": 1580
1224
  },
1225
  {
1226
  "epoch": 12.23076923076923,
1227
+ "grad_norm": 1.6895153522491455,
1228
  "learning_rate": 3.692307692307693e-06,
1229
+ "loss": 0.8242,
1230
  "step": 1590
1231
  },
1232
  {
1233
  "epoch": 12.307692307692308,
1234
+ "grad_norm": 2.1413424015045166,
1235
  "learning_rate": 3.58974358974359e-06,
1236
+ "loss": 0.8378,
1237
  "step": 1600
1238
  },
1239
  {
1240
  "epoch": 12.384615384615385,
1241
+ "grad_norm": 2.416987657546997,
1242
  "learning_rate": 3.487179487179487e-06,
1243
+ "loss": 0.8088,
1244
  "step": 1610
1245
  },
1246
  {
1247
  "epoch": 12.461538461538462,
1248
+ "grad_norm": 1.704318881034851,
1249
  "learning_rate": 3.384615384615385e-06,
1250
+ "loss": 0.7953,
1251
  "step": 1620
1252
  },
1253
  {
1254
  "epoch": 12.538461538461538,
1255
+ "grad_norm": 2.015375852584839,
1256
  "learning_rate": 3.2820512820512823e-06,
1257
+ "loss": 0.799,
1258
  "step": 1630
1259
  },
1260
  {
1261
  "epoch": 12.615384615384615,
1262
+ "grad_norm": 1.851975917816162,
1263
  "learning_rate": 3.1794871794871795e-06,
1264
+ "loss": 0.813,
1265
  "step": 1640
1266
  },
1267
  {
1268
  "epoch": 12.692307692307692,
1269
+ "grad_norm": 3.352241277694702,
1270
  "learning_rate": 3.0769230769230774e-06,
1271
+ "loss": 0.763,
1272
  "step": 1650
1273
  },
1274
  {
1275
  "epoch": 12.76923076923077,
1276
+ "grad_norm": 2.4369592666625977,
1277
  "learning_rate": 2.9743589743589746e-06,
1278
+ "loss": 0.8583,
1279
  "step": 1660
1280
  },
1281
  {
1282
  "epoch": 12.846153846153847,
1283
+ "grad_norm": 1.7659847736358643,
1284
  "learning_rate": 2.8717948717948717e-06,
1285
+ "loss": 0.8354,
1286
  "step": 1670
1287
  },
1288
  {
1289
  "epoch": 12.923076923076923,
1290
+ "grad_norm": 1.8726952075958252,
1291
  "learning_rate": 2.7692307692307697e-06,
1292
+ "loss": 0.7972,
1293
  "step": 1680
1294
  },
1295
  {
1296
  "epoch": 13.0,
1297
+ "grad_norm": 3.766446352005005,
1298
  "learning_rate": 2.666666666666667e-06,
1299
+ "loss": 0.8141,
1300
  "step": 1690
1301
  },
1302
  {
1303
  "epoch": 13.0,
1304
+ "eval_accuracy": 0.849624060150376,
1305
+ "eval_loss": 0.7535560131072998,
1306
+ "eval_runtime": 0.8236,
1307
+ "eval_samples_per_second": 161.479,
1308
+ "eval_steps_per_second": 20.64,
1309
  "step": 1690
1310
  },
1311
  {
1312
  "epoch": 13.076923076923077,
1313
+ "grad_norm": 3.104832649230957,
1314
  "learning_rate": 2.564102564102564e-06,
1315
+ "loss": 0.7937,
1316
  "step": 1700
1317
  },
1318
  {
1319
  "epoch": 13.153846153846153,
1320
+ "grad_norm": 2.46419095993042,
1321
  "learning_rate": 2.461538461538462e-06,
1322
+ "loss": 0.822,
1323
  "step": 1710
1324
  },
1325
  {
1326
  "epoch": 13.23076923076923,
1327
+ "grad_norm": 3.035752296447754,
1328
  "learning_rate": 2.358974358974359e-06,
1329
+ "loss": 0.8127,
1330
  "step": 1720
1331
  },
1332
  {
1333
  "epoch": 13.307692307692308,
1334
+ "grad_norm": 1.9470882415771484,
1335
  "learning_rate": 2.2564102564102566e-06,
1336
+ "loss": 0.8151,
1337
  "step": 1730
1338
  },
1339
  {
1340
  "epoch": 13.384615384615385,
1341
+ "grad_norm": 2.086946964263916,
1342
  "learning_rate": 2.153846153846154e-06,
1343
+ "loss": 0.7773,
1344
  "step": 1740
1345
  },
1346
  {
1347
  "epoch": 13.461538461538462,
1348
+ "grad_norm": 3.7044293880462646,
1349
  "learning_rate": 2.0512820512820513e-06,
1350
+ "loss": 0.8276,
1351
  "step": 1750
1352
  },
1353
  {
1354
  "epoch": 13.538461538461538,
1355
+ "grad_norm": 2.735135555267334,
1356
  "learning_rate": 1.948717948717949e-06,
1357
+ "loss": 0.8462,
1358
  "step": 1760
1359
  },
1360
  {
1361
  "epoch": 13.615384615384615,
1362
+ "grad_norm": 2.065619468688965,
1363
  "learning_rate": 1.8461538461538465e-06,
1364
+ "loss": 0.8024,
1365
  "step": 1770
1366
  },
1367
  {
1368
  "epoch": 13.692307692307692,
1369
+ "grad_norm": 2.519625663757324,
1370
  "learning_rate": 1.7435897435897436e-06,
1371
+ "loss": 0.7813,
1372
  "step": 1780
1373
  },
1374
  {
1375
  "epoch": 13.76923076923077,
1376
+ "grad_norm": 1.8556421995162964,
1377
  "learning_rate": 1.6410256410256412e-06,
1378
+ "loss": 0.8218,
1379
  "step": 1790
1380
  },
1381
  {
1382
  "epoch": 13.846153846153847,
1383
+ "grad_norm": 3.1282896995544434,
1384
  "learning_rate": 1.5384615384615387e-06,
1385
+ "loss": 0.7891,
1386
  "step": 1800
1387
  },
1388
  {
1389
  "epoch": 13.923076923076923,
1390
+ "grad_norm": 2.363374948501587,
1391
  "learning_rate": 1.4358974358974359e-06,
1392
+ "loss": 0.7341,
1393
  "step": 1810
1394
  },
1395
  {
1396
  "epoch": 14.0,
1397
+ "grad_norm": 4.762740612030029,
1398
  "learning_rate": 1.3333333333333334e-06,
1399
+ "loss": 0.8172,
1400
  "step": 1820
1401
  },
1402
  {
1403
  "epoch": 14.0,
1404
+ "eval_accuracy": 0.8270676691729323,
1405
+ "eval_loss": 0.7531744837760925,
1406
+ "eval_runtime": 0.8266,
1407
+ "eval_samples_per_second": 160.91,
1408
+ "eval_steps_per_second": 20.567,
1409
  "step": 1820
1410
  },
1411
  {
1412
  "epoch": 14.076923076923077,
1413
+ "grad_norm": 1.8959217071533203,
1414
  "learning_rate": 1.230769230769231e-06,
1415
+ "loss": 0.8201,
1416
  "step": 1830
1417
  },
1418
  {
1419
  "epoch": 14.153846153846153,
1420
+ "grad_norm": 2.1285548210144043,
1421
  "learning_rate": 1.1282051282051283e-06,
1422
+ "loss": 0.7906,
1423
  "step": 1840
1424
  },
1425
  {
1426
  "epoch": 14.23076923076923,
1427
+ "grad_norm": 2.916303873062134,
1428
  "learning_rate": 1.0256410256410257e-06,
1429
+ "loss": 0.7793,
1430
  "step": 1850
1431
  },
1432
  {
1433
  "epoch": 14.307692307692308,
1434
+ "grad_norm": 2.9460341930389404,
1435
  "learning_rate": 9.230769230769232e-07,
1436
+ "loss": 0.773,
1437
  "step": 1860
1438
  },
1439
  {
1440
  "epoch": 14.384615384615385,
1441
+ "grad_norm": 2.4907443523406982,
1442
  "learning_rate": 8.205128205128206e-07,
1443
+ "loss": 0.7887,
1444
  "step": 1870
1445
  },
1446
  {
1447
  "epoch": 14.461538461538462,
1448
+ "grad_norm": 2.431607961654663,
1449
  "learning_rate": 7.179487179487179e-07,
1450
+ "loss": 0.7886,
1451
  "step": 1880
1452
  },
1453
  {
1454
  "epoch": 14.538461538461538,
1455
+ "grad_norm": 3.005627155303955,
1456
  "learning_rate": 6.153846153846155e-07,
1457
+ "loss": 0.7789,
1458
  "step": 1890
1459
  },
1460
  {
1461
  "epoch": 14.615384615384615,
1462
+ "grad_norm": 2.981405735015869,
1463
  "learning_rate": 5.128205128205128e-07,
1464
+ "loss": 0.8262,
1465
  "step": 1900
1466
  },
1467
  {
1468
  "epoch": 14.692307692307692,
1469
+ "grad_norm": 1.9199621677398682,
1470
  "learning_rate": 4.102564102564103e-07,
1471
+ "loss": 0.8015,
1472
  "step": 1910
1473
  },
1474
  {
1475
  "epoch": 14.76923076923077,
1476
+ "grad_norm": 3.6987714767456055,
1477
  "learning_rate": 3.0769230769230774e-07,
1478
+ "loss": 0.7998,
1479
  "step": 1920
1480
  },
1481
  {
1482
  "epoch": 14.846153846153847,
1483
+ "grad_norm": 3.5848920345306396,
1484
  "learning_rate": 2.0512820512820514e-07,
1485
+ "loss": 0.8552,
1486
  "step": 1930
1487
  },
1488
  {
1489
  "epoch": 14.923076923076923,
1490
+ "grad_norm": 2.4082224369049072,
1491
  "learning_rate": 1.0256410256410257e-07,
1492
+ "loss": 0.759,
1493
  "step": 1940
1494
  },
1495
  {
1496
  "epoch": 15.0,
1497
+ "grad_norm": 4.4640116691589355,
1498
  "learning_rate": 0.0,
1499
+ "loss": 0.7896,
1500
  "step": 1950
1501
  },
1502
  {
1503
  "epoch": 15.0,
1504
+ "eval_accuracy": 0.8045112781954887,
1505
+ "eval_loss": 0.7388833165168762,
1506
+ "eval_runtime": 0.8692,
1507
+ "eval_samples_per_second": 153.016,
1508
+ "eval_steps_per_second": 19.558,
1509
  "step": 1950
1510
  },
1511
  {
1512
  "epoch": 15.0,
1513
  "step": 1950,
1514
  "total_flos": 1.5658365504595968e+17,
1515
+ "train_loss": 0.923483537771763,
1516
+ "train_runtime": 151.8082,
1517
+ "train_samples_per_second": 102.168,
1518
+ "train_steps_per_second": 12.845
1519
  }
1520
  ],
1521
  "logging_steps": 10,