matthieulel committed on
Commit
0e9a7b6
1 Parent(s): c0e2beb

End of training

README.md CHANGED
@@ -2,6 +2,8 @@
 license: apache-2.0
 base_model: facebook/convnextv2-tiny-1k-224
 tags:
+- image-classification
+- vision
 - generated_from_trainer
 metrics:
 - accuracy
@@ -15,7 +17,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # convnextv2-tiny-1k-224-finetuned-galaxy10-decals
 
-This model is a fine-tuned version of [facebook/convnextv2-tiny-1k-224](https://huggingface.co/facebook/convnextv2-tiny-1k-224) on an unknown dataset.
+This model is a fine-tuned version of [facebook/convnextv2-tiny-1k-224](https://huggingface.co/facebook/convnextv2-tiny-1k-224) on the matthieulel/galaxy10_decals dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.4261
 - Accuracy: 0.8703
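For context, a minimal sketch of how the fine-tuned checkpoint described in the README could be loaded for inference; the repository id and the local image path below are illustrative assumptions, not part of this commit.

```python
# Minimal sketch, assuming the checkpoint is published under the repo id below
# (the repo id and the image path are illustrative, not taken from this commit).
from transformers import pipeline

classifier = pipeline(
    "image-classification",
    model="matthieulel/convnextv2-tiny-1k-224-finetuned-galaxy10-decals",  # assumed repo id
)

# Prints the top predicted Galaxy10 DECaLS classes with confidence scores.
for prediction in classifier("galaxy.jpg"):  # hypothetical local image
    print(f"{prediction['label']}: {prediction['score']:.3f}")
```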
all_results.json CHANGED
@@ -1,13 +1,13 @@
 {
-    "epoch": 9.977728285077951,
-    "eval_accuracy": 0.8672510958046337,
-    "eval_loss": 0.41685062646865845,
-    "eval_runtime": 12.3483,
-    "eval_samples_per_second": 129.329,
-    "eval_steps_per_second": 4.049,
-    "total_flos": 3.6084187126879396e+18,
-    "train_loss": 0.7042633635657174,
-    "train_runtime": 2366.3213,
-    "train_samples_per_second": 60.706,
-    "train_steps_per_second": 0.473
+    "epoch": 19.879759519038075,
+    "eval_accuracy": 0.8703494926719278,
+    "eval_loss": 0.4261245131492615,
+    "eval_runtime": 25.4134,
+    "eval_samples_per_second": 69.806,
+    "eval_steps_per_second": 2.204,
+    "total_flos": 7.988705158075343e+18,
+    "train_loss": 0.5653726263392356,
+    "train_runtime": 5886.8979,
+    "train_samples_per_second": 54.229,
+    "train_steps_per_second": 0.421
 }
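As a side note, a small sketch of how the updated metrics file could be read locally; it assumes a clone of the repository with all_results.json in the working directory.

```python
# Minimal sketch, assuming all_results.json from this commit is in the working directory.
import json

with open("all_results.json") as f:
    results = json.load(f)

# Final aggregate metrics logged by the Trainer for this run.
print(f"epochs trained: {results['epoch']:.2f}")
print(f"eval accuracy : {results['eval_accuracy']:.4f}")
print(f"eval loss     : {results['eval_loss']:.4f}")
print(f"train loss    : {results['train_loss']:.4f}")
```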
eval_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-    "epoch": 9.977728285077951,
-    "eval_accuracy": 0.8672510958046337,
-    "eval_loss": 0.41685062646865845,
-    "eval_runtime": 12.3483,
-    "eval_samples_per_second": 129.329,
-    "eval_steps_per_second": 4.049
+    "epoch": 19.879759519038075,
+    "eval_accuracy": 0.8703494926719278,
+    "eval_loss": 0.4261245131492615,
+    "eval_runtime": 25.4134,
+    "eval_samples_per_second": 69.806,
+    "eval_steps_per_second": 2.204
 }
runs/May06_10-03-49_nmjti7f45r/events.out.tfevents.1714995748.nmjti7f45r.226.1 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d906b679414f6f7b01893920792ceeed5c78b6db17d357bece5b695a5dc7498
+size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-    "epoch": 9.977728285077951,
-    "total_flos": 3.6084187126879396e+18,
-    "train_loss": 0.7042633635657174,
-    "train_runtime": 2366.3213,
-    "train_samples_per_second": 60.706,
-    "train_steps_per_second": 0.473
+    "epoch": 19.879759519038075,
+    "total_flos": 7.988705158075343e+18,
+    "train_loss": 0.5653726263392356,
+    "train_runtime": 5886.8979,
+    "train_samples_per_second": 54.229,
+    "train_steps_per_second": 0.421
 }
trainer_state.json CHANGED
@@ -1,903 +1,1945 @@
1
  {
2
- "best_metric": 0.8672510958046337,
3
- "best_model_checkpoint": "convnextv2-tiny-1k-224-finetuned-galaxy10-decals/checkpoint-785",
4
- "epoch": 9.977728285077951,
5
  "eval_steps": 500,
6
- "global_step": 1120,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.08908685968819599,
13
- "grad_norm": 2.9069173336029053,
14
- "learning_rate": 4.464285714285715e-06,
15
- "loss": 2.3159,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.17817371937639198,
20
- "grad_norm": 4.669096946716309,
21
- "learning_rate": 8.92857142857143e-06,
22
- "loss": 2.2657,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.267260579064588,
27
- "grad_norm": 5.892533302307129,
28
- "learning_rate": 1.3392857142857144e-05,
29
- "loss": 2.1852,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.35634743875278396,
34
- "grad_norm": 4.657855033874512,
35
- "learning_rate": 1.785714285714286e-05,
36
- "loss": 2.0843,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.44543429844098,
41
- "grad_norm": 4.414278030395508,
42
- "learning_rate": 2.2321428571428575e-05,
43
- "loss": 1.9644,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.534521158129176,
48
- "grad_norm": 6.150153636932373,
49
- "learning_rate": 2.6785714285714288e-05,
50
- "loss": 1.7921,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.623608017817372,
55
- "grad_norm": 7.54302978515625,
56
- "learning_rate": 3.125e-05,
57
- "loss": 1.5743,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.7126948775055679,
62
- "grad_norm": 11.624669075012207,
63
- "learning_rate": 3.571428571428572e-05,
64
- "loss": 1.42,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.8017817371937639,
69
- "grad_norm": 11.175118446350098,
70
- "learning_rate": 4.017857142857143e-05,
71
- "loss": 1.301,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.89086859688196,
76
- "grad_norm": 13.484392166137695,
77
- "learning_rate": 4.464285714285715e-05,
78
- "loss": 1.2246,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.9799554565701559,
83
- "grad_norm": 22.607799530029297,
84
- "learning_rate": 4.910714285714286e-05,
85
- "loss": 1.0664,
86
  "step": 110
87
  },
88
  {
89
- "epoch": 0.9977728285077951,
90
- "eval_accuracy": 0.6725109580463369,
91
- "eval_loss": 0.9818494915962219,
92
- "eval_runtime": 12.2808,
93
- "eval_samples_per_second": 130.041,
94
- "eval_steps_per_second": 4.071,
95
- "step": 112
96
  },
97
  {
98
- "epoch": 1.069042316258352,
99
- "grad_norm": 11.895453453063965,
100
- "learning_rate": 4.960317460317461e-05,
101
- "loss": 1.0415,
102
- "step": 120
 
 
103
  },
104
  {
105
- "epoch": 1.158129175946548,
106
- "grad_norm": 15.956673622131348,
107
- "learning_rate": 4.910714285714286e-05,
108
- "loss": 1.0021,
109
  "step": 130
110
  },
111
  {
112
- "epoch": 1.247216035634744,
113
- "grad_norm": 12.564276695251465,
114
- "learning_rate": 4.8611111111111115e-05,
115
- "loss": 0.9518,
116
  "step": 140
117
  },
118
  {
119
- "epoch": 1.3363028953229399,
120
- "grad_norm": 15.347941398620605,
121
- "learning_rate": 4.811507936507937e-05,
122
- "loss": 0.9316,
123
  "step": 150
124
  },
125
  {
126
- "epoch": 1.4253897550111359,
127
- "grad_norm": 13.298521041870117,
128
- "learning_rate": 4.761904761904762e-05,
129
- "loss": 0.8316,
130
  "step": 160
131
  },
132
  {
133
- "epoch": 1.5144766146993318,
134
- "grad_norm": 12.444368362426758,
135
- "learning_rate": 4.7123015873015876e-05,
136
- "loss": 0.8863,
137
  "step": 170
138
  },
139
  {
140
- "epoch": 1.6035634743875278,
141
- "grad_norm": 10.18455982208252,
142
- "learning_rate": 4.662698412698413e-05,
143
- "loss": 0.8304,
144
  "step": 180
145
  },
146
  {
147
- "epoch": 1.692650334075724,
148
- "grad_norm": 13.635807991027832,
149
- "learning_rate": 4.613095238095239e-05,
150
- "loss": 0.7748,
151
  "step": 190
152
  },
153
  {
154
- "epoch": 1.7817371937639197,
155
- "grad_norm": 9.778557777404785,
156
- "learning_rate": 4.563492063492064e-05,
157
- "loss": 0.7561,
158
  "step": 200
159
  },
160
  {
161
- "epoch": 1.8708240534521159,
162
- "grad_norm": 16.08139419555664,
163
- "learning_rate": 4.5138888888888894e-05,
164
- "loss": 0.8051,
165
  "step": 210
166
  },
167
  {
168
- "epoch": 1.9599109131403119,
169
- "grad_norm": 12.761980056762695,
170
- "learning_rate": 4.464285714285715e-05,
171
- "loss": 0.8019,
172
  "step": 220
173
  },
174
  {
175
- "epoch": 1.9955456570155903,
176
- "eval_accuracy": 0.7989981214777708,
177
- "eval_loss": 0.6332681775093079,
178
- "eval_runtime": 12.3035,
179
- "eval_samples_per_second": 129.8,
180
- "eval_steps_per_second": 4.064,
181
- "step": 224
182
- },
183
- {
184
- "epoch": 2.048997772828508,
185
- "grad_norm": 10.136615753173828,
186
- "learning_rate": 4.41468253968254e-05,
187
- "loss": 0.7221,
188
  "step": 230
189
  },
190
  {
191
- "epoch": 2.138084632516704,
192
- "grad_norm": 13.145553588867188,
193
- "learning_rate": 4.3650793650793655e-05,
194
- "loss": 0.6877,
195
  "step": 240
196
  },
197
  {
198
- "epoch": 2.2271714922048997,
199
- "grad_norm": 14.941648483276367,
200
- "learning_rate": 4.315476190476191e-05,
201
- "loss": 0.7462,
 
 
 
 
 
 
 
 
 
202
  "step": 250
203
  },
204
  {
205
- "epoch": 2.316258351893096,
206
- "grad_norm": 9.842031478881836,
207
- "learning_rate": 4.265873015873016e-05,
208
- "loss": 0.6886,
209
  "step": 260
210
  },
211
  {
212
- "epoch": 2.4053452115812917,
213
- "grad_norm": 11.978471755981445,
214
- "learning_rate": 4.2162698412698416e-05,
215
- "loss": 0.6843,
216
  "step": 270
217
  },
218
  {
219
- "epoch": 2.494432071269488,
220
- "grad_norm": 12.24606990814209,
221
- "learning_rate": 4.166666666666667e-05,
222
- "loss": 0.7038,
223
  "step": 280
224
  },
225
  {
226
- "epoch": 2.5835189309576836,
227
- "grad_norm": 12.357331275939941,
228
- "learning_rate": 4.117063492063492e-05,
229
- "loss": 0.696,
230
  "step": 290
231
  },
232
  {
233
- "epoch": 2.6726057906458798,
234
- "grad_norm": 16.765913009643555,
235
- "learning_rate": 4.067460317460318e-05,
236
- "loss": 0.7174,
237
  "step": 300
238
  },
239
  {
240
- "epoch": 2.7616926503340755,
241
- "grad_norm": 13.57442569732666,
242
- "learning_rate": 4.017857142857143e-05,
243
- "loss": 0.7305,
244
  "step": 310
245
  },
246
  {
247
- "epoch": 2.8507795100222717,
248
- "grad_norm": 13.48181438446045,
249
- "learning_rate": 3.968253968253968e-05,
250
- "loss": 0.6957,
251
  "step": 320
252
  },
253
  {
254
- "epoch": 2.939866369710468,
255
- "grad_norm": 9.809232711791992,
256
- "learning_rate": 3.918650793650794e-05,
257
- "loss": 0.6524,
258
  "step": 330
259
  },
260
  {
261
- "epoch": 2.9933184855233854,
262
- "eval_accuracy": 0.8340638697557922,
263
- "eval_loss": 0.5247990489006042,
264
- "eval_runtime": 12.2883,
265
- "eval_samples_per_second": 129.961,
266
- "eval_steps_per_second": 4.069,
267
- "step": 336
268
- },
269
- {
270
- "epoch": 3.0289532293986636,
271
- "grad_norm": 16.003482818603516,
272
- "learning_rate": 3.8690476190476195e-05,
273
- "loss": 0.6928,
274
  "step": 340
275
  },
276
  {
277
- "epoch": 3.11804008908686,
278
- "grad_norm": 23.448598861694336,
279
- "learning_rate": 3.8194444444444444e-05,
280
- "loss": 0.6696,
281
  "step": 350
282
  },
283
  {
284
- "epoch": 3.2071269487750556,
285
- "grad_norm": 12.516254425048828,
286
- "learning_rate": 3.76984126984127e-05,
287
- "loss": 0.6665,
288
  "step": 360
289
  },
290
  {
291
- "epoch": 3.2962138084632517,
292
- "grad_norm": 13.503238677978516,
293
- "learning_rate": 3.7202380952380956e-05,
294
- "loss": 0.6189,
295
  "step": 370
296
  },
297
  {
298
- "epoch": 3.3853006681514475,
299
- "grad_norm": 14.721902847290039,
300
- "learning_rate": 3.6706349206349205e-05,
301
- "loss": 0.6405,
 
 
 
 
 
 
 
 
 
302
  "step": 380
303
  },
304
  {
305
- "epoch": 3.4743875278396437,
306
- "grad_norm": 11.428637504577637,
307
- "learning_rate": 3.621031746031746e-05,
308
- "loss": 0.6502,
309
  "step": 390
310
  },
311
  {
312
- "epoch": 3.5634743875278394,
313
- "grad_norm": 8.628026008605957,
314
- "learning_rate": 3.571428571428572e-05,
315
- "loss": 0.6335,
316
  "step": 400
317
  },
318
  {
319
- "epoch": 3.6525612472160356,
320
- "grad_norm": 12.637211799621582,
321
- "learning_rate": 3.521825396825397e-05,
322
- "loss": 0.6457,
323
  "step": 410
324
  },
325
  {
326
- "epoch": 3.7416481069042318,
327
- "grad_norm": 13.72917652130127,
328
- "learning_rate": 3.472222222222222e-05,
329
- "loss": 0.6338,
330
  "step": 420
331
  },
332
  {
333
- "epoch": 3.8307349665924275,
334
- "grad_norm": 14.159635543823242,
335
- "learning_rate": 3.422619047619048e-05,
336
- "loss": 0.6318,
337
  "step": 430
338
  },
339
  {
340
- "epoch": 3.9198218262806237,
341
- "grad_norm": 12.676724433898926,
342
- "learning_rate": 3.3730158730158734e-05,
343
- "loss": 0.6339,
344
  "step": 440
345
  },
346
  {
347
- "epoch": 4.0,
348
- "eval_accuracy": 0.8447088290544772,
349
- "eval_loss": 0.4730662703514099,
350
- "eval_runtime": 12.4926,
351
- "eval_samples_per_second": 127.835,
352
- "eval_steps_per_second": 4.002,
353
- "step": 449
354
- },
355
- {
356
- "epoch": 4.008908685968819,
357
- "grad_norm": 11.317361831665039,
358
- "learning_rate": 3.3234126984126983e-05,
359
- "loss": 0.613,
360
  "step": 450
361
  },
362
  {
363
- "epoch": 4.097995545657016,
364
- "grad_norm": 14.402922630310059,
365
- "learning_rate": 3.273809523809524e-05,
366
- "loss": 0.6124,
367
  "step": 460
368
  },
369
  {
370
- "epoch": 4.187082405345212,
371
- "grad_norm": 9.939033508300781,
372
- "learning_rate": 3.2242063492063495e-05,
373
- "loss": 0.5868,
374
  "step": 470
375
  },
376
  {
377
- "epoch": 4.276169265033408,
378
- "grad_norm": 10.611005783081055,
379
- "learning_rate": 3.1746031746031745e-05,
380
- "loss": 0.5786,
381
  "step": 480
382
  },
383
  {
384
- "epoch": 4.365256124721603,
385
- "grad_norm": 11.104296684265137,
386
- "learning_rate": 3.125e-05,
387
- "loss": 0.544,
388
  "step": 490
389
  },
390
  {
391
- "epoch": 4.4543429844097995,
392
- "grad_norm": 14.008048057556152,
393
- "learning_rate": 3.075396825396826e-05,
394
- "loss": 0.6175,
 
 
 
 
 
 
 
 
 
395
  "step": 500
396
  },
397
  {
398
- "epoch": 4.543429844097996,
399
- "grad_norm": 9.320144653320312,
400
- "learning_rate": 3.0257936507936506e-05,
401
- "loss": 0.5999,
402
  "step": 510
403
  },
404
  {
405
- "epoch": 4.632516703786192,
406
- "grad_norm": 9.274946212768555,
407
- "learning_rate": 2.9761904761904762e-05,
408
- "loss": 0.5709,
409
  "step": 520
410
  },
411
  {
412
- "epoch": 4.721603563474387,
413
- "grad_norm": 12.640064239501953,
414
- "learning_rate": 2.9265873015873018e-05,
415
- "loss": 0.6231,
416
  "step": 530
417
  },
418
  {
419
- "epoch": 4.810690423162583,
420
- "grad_norm": 11.968724250793457,
421
- "learning_rate": 2.876984126984127e-05,
422
- "loss": 0.6206,
423
  "step": 540
424
  },
425
  {
426
- "epoch": 4.8997772828507795,
427
- "grad_norm": 11.681157112121582,
428
- "learning_rate": 2.8273809523809523e-05,
429
- "loss": 0.6031,
430
  "step": 550
431
  },
432
  {
433
- "epoch": 4.988864142538976,
434
- "grad_norm": 13.320256233215332,
435
- "learning_rate": 2.777777777777778e-05,
436
- "loss": 0.5178,
437
  "step": 560
438
  },
439
  {
440
- "epoch": 4.997772828507795,
441
- "eval_accuracy": 0.8503443957420163,
442
- "eval_loss": 0.4537416100502014,
443
- "eval_runtime": 12.2913,
444
- "eval_samples_per_second": 129.93,
445
- "eval_steps_per_second": 4.068,
446
- "step": 561
447
- },
448
- {
449
- "epoch": 5.077951002227172,
450
- "grad_norm": 15.332996368408203,
451
- "learning_rate": 2.7281746031746032e-05,
452
- "loss": 0.5617,
453
  "step": 570
454
  },
455
  {
456
- "epoch": 5.167037861915367,
457
- "grad_norm": 14.994087219238281,
458
- "learning_rate": 2.6785714285714288e-05,
459
- "loss": 0.5797,
460
  "step": 580
461
  },
462
  {
463
- "epoch": 5.256124721603563,
464
- "grad_norm": 13.461969375610352,
465
- "learning_rate": 2.628968253968254e-05,
466
- "loss": 0.5524,
467
  "step": 590
468
  },
469
  {
470
- "epoch": 5.3452115812917596,
471
- "grad_norm": 12.29080581665039,
472
- "learning_rate": 2.5793650793650796e-05,
473
- "loss": 0.5824,
474
  "step": 600
475
  },
476
  {
477
- "epoch": 5.434298440979956,
478
- "grad_norm": 11.07197380065918,
479
- "learning_rate": 2.529761904761905e-05,
480
- "loss": 0.554,
481
  "step": 610
482
  },
483
  {
484
- "epoch": 5.523385300668151,
485
- "grad_norm": 9.797560691833496,
486
- "learning_rate": 2.4801587301587305e-05,
487
- "loss": 0.5108,
488
  "step": 620
489
  },
490
  {
491
- "epoch": 5.612472160356347,
492
- "grad_norm": 10.469209671020508,
493
- "learning_rate": 2.4305555555555558e-05,
494
- "loss": 0.5586,
 
 
 
 
 
 
 
 
 
495
  "step": 630
496
  },
497
  {
498
- "epoch": 5.701559020044543,
499
- "grad_norm": 13.22735595703125,
500
- "learning_rate": 2.380952380952381e-05,
501
- "loss": 0.5358,
502
  "step": 640
503
  },
504
  {
505
- "epoch": 5.79064587973274,
506
- "grad_norm": 8.305063247680664,
507
- "learning_rate": 2.3313492063492066e-05,
508
- "loss": 0.5295,
509
  "step": 650
510
  },
511
  {
512
- "epoch": 5.879732739420936,
513
- "grad_norm": 18.399051666259766,
514
- "learning_rate": 2.281746031746032e-05,
515
- "loss": 0.5442,
516
  "step": 660
517
  },
518
  {
519
- "epoch": 5.968819599109131,
520
- "grad_norm": 8.103595733642578,
521
- "learning_rate": 2.2321428571428575e-05,
522
- "loss": 0.5907,
523
  "step": 670
524
  },
525
  {
526
- "epoch": 5.99554565701559,
527
- "eval_accuracy": 0.8472135253600501,
528
- "eval_loss": 0.4555535316467285,
529
- "eval_runtime": 12.2927,
530
- "eval_samples_per_second": 129.914,
531
- "eval_steps_per_second": 4.067,
532
- "step": 673
533
- },
534
- {
535
- "epoch": 6.057906458797327,
536
- "grad_norm": 10.681763648986816,
537
- "learning_rate": 2.1825396825396827e-05,
538
- "loss": 0.5332,
539
  "step": 680
540
  },
541
  {
542
- "epoch": 6.146993318485523,
543
- "grad_norm": 10.129424095153809,
544
- "learning_rate": 2.132936507936508e-05,
545
- "loss": 0.4747,
546
  "step": 690
547
  },
548
  {
549
- "epoch": 6.23608017817372,
550
- "grad_norm": 16.834814071655273,
551
- "learning_rate": 2.0833333333333336e-05,
552
- "loss": 0.5576,
553
  "step": 700
554
  },
555
  {
556
- "epoch": 6.325167037861915,
557
- "grad_norm": 11.258397102355957,
558
- "learning_rate": 2.033730158730159e-05,
559
- "loss": 0.5063,
560
  "step": 710
561
  },
562
  {
563
- "epoch": 6.414253897550111,
564
- "grad_norm": 15.159914016723633,
565
- "learning_rate": 1.984126984126984e-05,
566
- "loss": 0.5385,
567
  "step": 720
568
  },
569
  {
570
- "epoch": 6.503340757238307,
571
- "grad_norm": 10.242027282714844,
572
- "learning_rate": 1.9345238095238097e-05,
573
- "loss": 0.5046,
574
  "step": 730
575
  },
576
  {
577
- "epoch": 6.5924276169265035,
578
- "grad_norm": 10.377813339233398,
579
- "learning_rate": 1.884920634920635e-05,
580
- "loss": 0.5247,
581
  "step": 740
582
  },
583
  {
584
- "epoch": 6.6815144766147,
585
- "grad_norm": 12.55459213256836,
586
- "learning_rate": 1.8353174603174602e-05,
587
- "loss": 0.529,
 
 
 
 
 
 
 
 
 
588
  "step": 750
589
  },
590
  {
591
- "epoch": 6.770601336302895,
592
- "grad_norm": 16.02656364440918,
593
- "learning_rate": 1.785714285714286e-05,
594
- "loss": 0.5073,
595
  "step": 760
596
  },
597
  {
598
- "epoch": 6.859688195991091,
599
- "grad_norm": 16.140487670898438,
600
- "learning_rate": 1.736111111111111e-05,
601
- "loss": 0.5414,
602
  "step": 770
603
  },
604
  {
605
- "epoch": 6.948775055679287,
606
- "grad_norm": 17.321931838989258,
607
- "learning_rate": 1.6865079365079367e-05,
608
- "loss": 0.5292,
609
  "step": 780
610
  },
611
  {
612
- "epoch": 6.993318485523385,
613
- "eval_accuracy": 0.8672510958046337,
614
- "eval_loss": 0.41685062646865845,
615
- "eval_runtime": 12.3673,
616
- "eval_samples_per_second": 129.131,
617
- "eval_steps_per_second": 4.043,
618
- "step": 785
619
- },
620
- {
621
- "epoch": 7.0378619153674835,
622
- "grad_norm": 9.479693412780762,
623
- "learning_rate": 1.636904761904762e-05,
624
- "loss": 0.4586,
625
  "step": 790
626
  },
627
  {
628
- "epoch": 7.12694877505568,
629
- "grad_norm": 11.711000442504883,
630
- "learning_rate": 1.5873015873015872e-05,
631
- "loss": 0.5188,
632
  "step": 800
633
  },
634
  {
635
- "epoch": 7.216035634743875,
636
- "grad_norm": 11.616864204406738,
637
- "learning_rate": 1.537698412698413e-05,
638
- "loss": 0.5024,
639
  "step": 810
640
  },
641
  {
642
- "epoch": 7.305122494432071,
643
- "grad_norm": 10.370725631713867,
644
- "learning_rate": 1.4880952380952381e-05,
645
- "loss": 0.4902,
646
  "step": 820
647
  },
648
  {
649
- "epoch": 7.394209354120267,
650
- "grad_norm": 14.04218864440918,
651
- "learning_rate": 1.4384920634920635e-05,
652
- "loss": 0.5149,
653
  "step": 830
654
  },
655
  {
656
- "epoch": 7.4832962138084635,
657
- "grad_norm": 13.194646835327148,
658
- "learning_rate": 1.388888888888889e-05,
659
- "loss": 0.5562,
660
  "step": 840
661
  },
662
  {
663
- "epoch": 7.57238307349666,
664
- "grad_norm": 9.960190773010254,
665
- "learning_rate": 1.3392857142857144e-05,
666
- "loss": 0.4921,
667
  "step": 850
668
  },
669
  {
670
- "epoch": 7.661469933184855,
671
- "grad_norm": 15.14493465423584,
672
- "learning_rate": 1.2896825396825398e-05,
673
- "loss": 0.471,
674
  "step": 860
675
  },
676
  {
677
- "epoch": 7.750556792873051,
678
- "grad_norm": 11.185235977172852,
679
- "learning_rate": 1.2400793650793652e-05,
680
- "loss": 0.4963,
681
  "step": 870
682
  },
683
  {
684
- "epoch": 7.839643652561247,
685
- "grad_norm": 12.782095909118652,
686
- "learning_rate": 1.1904761904761905e-05,
687
- "loss": 0.4915,
 
 
 
 
 
 
 
 
 
688
  "step": 880
689
  },
690
  {
691
- "epoch": 7.928730512249444,
692
- "grad_norm": 11.89919376373291,
693
- "learning_rate": 1.140873015873016e-05,
694
- "loss": 0.5017,
695
  "step": 890
696
  },
697
  {
698
- "epoch": 8.0,
699
- "eval_accuracy": 0.8597370068879149,
700
- "eval_loss": 0.4106651544570923,
701
- "eval_runtime": 12.3902,
702
- "eval_samples_per_second": 128.892,
703
- "eval_steps_per_second": 4.035,
704
- "step": 898
705
- },
706
- {
707
- "epoch": 8.017817371937639,
708
- "grad_norm": 12.601805686950684,
709
- "learning_rate": 1.0912698412698414e-05,
710
- "loss": 0.5064,
711
  "step": 900
712
  },
713
  {
714
- "epoch": 8.106904231625835,
715
- "grad_norm": 8.723831176757812,
716
- "learning_rate": 1.0416666666666668e-05,
717
- "loss": 0.4181,
718
  "step": 910
719
  },
720
  {
721
- "epoch": 8.195991091314031,
722
- "grad_norm": 12.781538963317871,
723
- "learning_rate": 9.92063492063492e-06,
724
- "loss": 0.4427,
725
  "step": 920
726
  },
727
  {
728
- "epoch": 8.285077951002227,
729
- "grad_norm": 12.263012886047363,
730
- "learning_rate": 9.424603174603175e-06,
731
- "loss": 0.5087,
732
  "step": 930
733
  },
734
  {
735
- "epoch": 8.374164810690424,
736
- "grad_norm": 17.41984748840332,
737
- "learning_rate": 8.92857142857143e-06,
738
- "loss": 0.5301,
739
  "step": 940
740
  },
741
  {
742
- "epoch": 8.46325167037862,
743
- "grad_norm": 10.731024742126465,
744
- "learning_rate": 8.432539682539684e-06,
745
- "loss": 0.4987,
746
  "step": 950
747
  },
748
  {
749
- "epoch": 8.552338530066816,
750
- "grad_norm": 15.722013473510742,
751
- "learning_rate": 7.936507936507936e-06,
752
- "loss": 0.4613,
753
  "step": 960
754
  },
755
  {
756
- "epoch": 8.64142538975501,
757
- "grad_norm": 11.301126480102539,
758
- "learning_rate": 7.4404761904761905e-06,
759
- "loss": 0.5136,
760
  "step": 970
761
  },
762
  {
763
- "epoch": 8.730512249443207,
764
- "grad_norm": 21.23493194580078,
765
- "learning_rate": 6.944444444444445e-06,
766
- "loss": 0.494,
767
  "step": 980
768
  },
769
  {
770
- "epoch": 8.819599109131403,
771
- "grad_norm": 10.211363792419434,
772
- "learning_rate": 6.448412698412699e-06,
773
- "loss": 0.4619,
774
  "step": 990
775
  },
776
  {
777
- "epoch": 8.908685968819599,
778
- "grad_norm": 12.277856826782227,
779
- "learning_rate": 5.9523809523809525e-06,
780
- "loss": 0.4361,
 
 
 
 
 
 
 
 
 
781
  "step": 1000
782
  },
783
  {
784
- "epoch": 8.997772828507795,
785
- "grad_norm": 12.28085708618164,
786
- "learning_rate": 5.456349206349207e-06,
787
- "loss": 0.4605,
788
- "step": 1010
789
- },
790
- {
791
- "epoch": 8.997772828507795,
792
- "eval_accuracy": 0.8634940513462742,
793
- "eval_loss": 0.40621063113212585,
794
- "eval_runtime": 12.3051,
795
- "eval_samples_per_second": 129.783,
796
- "eval_steps_per_second": 4.063,
797
  "step": 1010
798
  },
799
  {
800
- "epoch": 9.086859688195991,
801
- "grad_norm": 8.358485221862793,
802
- "learning_rate": 4.96031746031746e-06,
803
- "loss": 0.4403,
804
  "step": 1020
805
  },
806
  {
807
- "epoch": 9.175946547884188,
808
- "grad_norm": 8.35409164428711,
809
- "learning_rate": 4.464285714285715e-06,
810
- "loss": 0.4514,
811
  "step": 1030
812
  },
813
  {
814
- "epoch": 9.265033407572384,
815
- "grad_norm": 10.057600021362305,
816
- "learning_rate": 3.968253968253968e-06,
817
- "loss": 0.427,
818
  "step": 1040
819
  },
820
  {
821
- "epoch": 9.35412026726058,
822
- "grad_norm": 7.57137393951416,
823
- "learning_rate": 3.4722222222222224e-06,
824
- "loss": 0.4619,
825
  "step": 1050
826
  },
827
  {
828
- "epoch": 9.443207126948774,
829
- "grad_norm": 9.249728202819824,
830
- "learning_rate": 2.9761904761904763e-06,
831
- "loss": 0.4606,
832
  "step": 1060
833
  },
834
  {
835
- "epoch": 9.53229398663697,
836
- "grad_norm": 10.303194046020508,
837
- "learning_rate": 2.48015873015873e-06,
838
- "loss": 0.508,
839
  "step": 1070
840
  },
841
  {
842
- "epoch": 9.621380846325167,
843
- "grad_norm": 11.307740211486816,
844
- "learning_rate": 1.984126984126984e-06,
845
- "loss": 0.4667,
846
  "step": 1080
847
  },
848
  {
849
- "epoch": 9.710467706013363,
850
- "grad_norm": 11.34073543548584,
851
- "learning_rate": 1.4880952380952381e-06,
852
- "loss": 0.4803,
853
  "step": 1090
854
  },
855
  {
856
- "epoch": 9.799554565701559,
857
- "grad_norm": 12.684574127197266,
858
- "learning_rate": 9.92063492063492e-07,
859
- "loss": 0.4613,
860
  "step": 1100
861
  },
862
  {
863
- "epoch": 9.888641425389755,
864
- "grad_norm": 12.430156707763672,
865
- "learning_rate": 4.96031746031746e-07,
866
- "loss": 0.434,
867
  "step": 1110
868
  },
869
  {
870
- "epoch": 9.977728285077951,
871
- "grad_norm": 19.223121643066406,
872
- "learning_rate": 0.0,
873
- "loss": 0.4765,
874
  "step": 1120
875
  },
876
  {
877
- "epoch": 9.977728285077951,
878
- "eval_accuracy": 0.8647463994990607,
879
- "eval_loss": 0.3980247676372528,
880
- "eval_runtime": 12.312,
881
- "eval_samples_per_second": 129.711,
882
- "eval_steps_per_second": 4.061,
883
- "step": 1120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
884
  },
885
  {
886
- "epoch": 9.977728285077951,
887
- "step": 1120,
888
- "total_flos": 3.6084187126879396e+18,
889
- "train_loss": 0.7042633635657174,
890
- "train_runtime": 2366.3213,
891
- "train_samples_per_second": 60.706,
892
- "train_steps_per_second": 0.473
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
893
  }
894
  ],
895
  "logging_steps": 10,
896
- "max_steps": 1120,
897
  "num_input_tokens_seen": 0,
898
- "num_train_epochs": 10,
899
  "save_steps": 500,
900
- "total_flos": 3.6084187126879396e+18,
901
  "train_batch_size": 32,
902
  "trial_name": null,
903
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.8703494926719278,
3
+ "best_model_checkpoint": "convnextv2-tiny-1k-224-finetuned-galaxy10-decals/checkpoint-2480",
4
+ "epoch": 19.879759519038075,
5
  "eval_steps": 500,
6
+ "global_step": 2480,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.08016032064128256,
13
+ "grad_norm": 5.443685531616211,
14
+ "learning_rate": 2.0161290322580646e-06,
15
+ "loss": 2.3476,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.16032064128256512,
20
+ "grad_norm": 5.888722896575928,
21
+ "learning_rate": 4.032258064516129e-06,
22
+ "loss": 2.3134,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.24048096192384769,
27
+ "grad_norm": 3.9038140773773193,
28
+ "learning_rate": 6.048387096774194e-06,
29
+ "loss": 2.2707,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.32064128256513025,
34
+ "grad_norm": 2.5614984035491943,
35
+ "learning_rate": 8.064516129032258e-06,
36
+ "loss": 2.2183,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.40080160320641284,
41
+ "grad_norm": 5.785397052764893,
42
+ "learning_rate": 1.0080645161290323e-05,
43
+ "loss": 2.1604,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.48096192384769537,
48
+ "grad_norm": 5.09072208404541,
49
+ "learning_rate": 1.2096774193548388e-05,
50
+ "loss": 2.0969,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.561122244488978,
55
+ "grad_norm": 2.9109528064727783,
56
+ "learning_rate": 1.4112903225806454e-05,
57
+ "loss": 2.0283,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.6412825651302605,
62
+ "grad_norm": 7.426329135894775,
63
+ "learning_rate": 1.6129032258064517e-05,
64
+ "loss": 1.9125,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.7214428857715431,
69
+ "grad_norm": 8.441859245300293,
70
+ "learning_rate": 1.8145161290322583e-05,
71
+ "loss": 1.7735,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.8016032064128257,
76
+ "grad_norm": 5.191440582275391,
77
+ "learning_rate": 2.0161290322580645e-05,
78
+ "loss": 1.6402,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.8817635270541082,
83
+ "grad_norm": 6.325778007507324,
84
+ "learning_rate": 2.217741935483871e-05,
85
+ "loss": 1.5225,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 0.9619238476953907,
90
+ "grad_norm": 7.641424655914307,
91
+ "learning_rate": 2.4193548387096777e-05,
92
+ "loss": 1.4287,
93
+ "step": 120
 
 
94
  },
95
  {
96
+ "epoch": 0.9939879759519038,
97
+ "eval_accuracy": 0.5851183765501691,
98
+ "eval_loss": 1.2978211641311646,
99
+ "eval_runtime": 23.4495,
100
+ "eval_samples_per_second": 75.652,
101
+ "eval_steps_per_second": 2.388,
102
+ "step": 124
103
  },
104
  {
105
+ "epoch": 1.0420841683366733,
106
+ "grad_norm": 9.333320617675781,
107
+ "learning_rate": 2.620967741935484e-05,
108
+ "loss": 1.3617,
109
  "step": 130
110
  },
111
  {
112
+ "epoch": 1.122244488977956,
113
+ "grad_norm": 10.036263465881348,
114
+ "learning_rate": 2.822580645161291e-05,
115
+ "loss": 1.3084,
116
  "step": 140
117
  },
118
  {
119
+ "epoch": 1.2024048096192386,
120
+ "grad_norm": 11.795063018798828,
121
+ "learning_rate": 3.024193548387097e-05,
122
+ "loss": 1.2472,
123
  "step": 150
124
  },
125
  {
126
+ "epoch": 1.282565130260521,
127
+ "grad_norm": 11.583420753479004,
128
+ "learning_rate": 3.2258064516129034e-05,
129
+ "loss": 1.1335,
130
  "step": 160
131
  },
132
  {
133
+ "epoch": 1.3627254509018036,
134
+ "grad_norm": 14.882524490356445,
135
+ "learning_rate": 3.427419354838709e-05,
136
+ "loss": 1.102,
137
  "step": 170
138
  },
139
  {
140
+ "epoch": 1.4428857715430863,
141
+ "grad_norm": 14.157336235046387,
142
+ "learning_rate": 3.6290322580645165e-05,
143
+ "loss": 1.049,
144
  "step": 180
145
  },
146
  {
147
+ "epoch": 1.5230460921843687,
148
+ "grad_norm": 10.484189987182617,
149
+ "learning_rate": 3.8306451612903224e-05,
150
+ "loss": 1.0445,
151
  "step": 190
152
  },
153
  {
154
+ "epoch": 1.6032064128256514,
155
+ "grad_norm": 14.128747940063477,
156
+ "learning_rate": 4.032258064516129e-05,
157
+ "loss": 0.995,
158
  "step": 200
159
  },
160
  {
161
+ "epoch": 1.6833667334669338,
162
+ "grad_norm": 9.768001556396484,
163
+ "learning_rate": 4.2338709677419356e-05,
164
+ "loss": 0.9573,
165
  "step": 210
166
  },
167
  {
168
+ "epoch": 1.7635270541082164,
169
+ "grad_norm": 13.823319435119629,
170
+ "learning_rate": 4.435483870967742e-05,
171
+ "loss": 0.9072,
172
  "step": 220
173
  },
174
  {
175
+ "epoch": 1.843687374749499,
176
+ "grad_norm": 9.132129669189453,
177
+ "learning_rate": 4.637096774193548e-05,
178
+ "loss": 0.8156,
 
 
 
 
 
 
 
 
 
179
  "step": 230
180
  },
181
  {
182
+ "epoch": 1.9238476953907817,
183
+ "grad_norm": 18.744123458862305,
184
+ "learning_rate": 4.8387096774193554e-05,
185
+ "loss": 0.8329,
186
  "step": 240
187
  },
188
  {
189
+ "epoch": 1.9959919839679359,
190
+ "eval_accuracy": 0.7728297632468997,
191
+ "eval_loss": 0.6986980438232422,
192
+ "eval_runtime": 11.7756,
193
+ "eval_samples_per_second": 150.65,
194
+ "eval_steps_per_second": 4.756,
195
+ "step": 249
196
+ },
197
+ {
198
+ "epoch": 2.004008016032064,
199
+ "grad_norm": 8.23229694366455,
200
+ "learning_rate": 4.995519713261649e-05,
201
+ "loss": 0.8108,
202
  "step": 250
203
  },
204
  {
205
+ "epoch": 2.0841683366733466,
206
+ "grad_norm": 13.277573585510254,
207
+ "learning_rate": 4.973118279569893e-05,
208
+ "loss": 0.7733,
209
  "step": 260
210
  },
211
  {
212
+ "epoch": 2.164328657314629,
213
+ "grad_norm": 13.641548156738281,
214
+ "learning_rate": 4.950716845878137e-05,
215
+ "loss": 0.7904,
216
  "step": 270
217
  },
218
  {
219
+ "epoch": 2.244488977955912,
220
+ "grad_norm": 18.69782829284668,
221
+ "learning_rate": 4.92831541218638e-05,
222
+ "loss": 0.7419,
223
  "step": 280
224
  },
225
  {
226
+ "epoch": 2.3246492985971945,
227
+ "grad_norm": 15.437073707580566,
228
+ "learning_rate": 4.905913978494624e-05,
229
+ "loss": 0.7647,
230
  "step": 290
231
  },
232
  {
233
+ "epoch": 2.404809619238477,
234
+ "grad_norm": 21.065357208251953,
235
+ "learning_rate": 4.8835125448028677e-05,
236
+ "loss": 0.7303,
237
  "step": 300
238
  },
239
  {
240
+ "epoch": 2.4849699398797593,
241
+ "grad_norm": 16.6332950592041,
242
+ "learning_rate": 4.8611111111111115e-05,
243
+ "loss": 0.764,
244
  "step": 310
245
  },
246
  {
247
+ "epoch": 2.565130260521042,
248
+ "grad_norm": 13.331892967224121,
249
+ "learning_rate": 4.8387096774193554e-05,
250
+ "loss": 0.7586,
251
  "step": 320
252
  },
253
  {
254
+ "epoch": 2.6452905811623246,
255
+ "grad_norm": 18.221023559570312,
256
+ "learning_rate": 4.8163082437275986e-05,
257
+ "loss": 0.7078,
258
  "step": 330
259
  },
260
  {
261
+ "epoch": 2.7254509018036073,
262
+ "grad_norm": 16.339580535888672,
263
+ "learning_rate": 4.7939068100358424e-05,
264
+ "loss": 0.7008,
 
 
 
 
 
 
 
 
 
265
  "step": 340
266
  },
267
  {
268
+ "epoch": 2.80561122244489,
269
+ "grad_norm": 12.270729064941406,
270
+ "learning_rate": 4.771505376344086e-05,
271
+ "loss": 0.7941,
272
  "step": 350
273
  },
274
  {
275
+ "epoch": 2.8857715430861726,
276
+ "grad_norm": 10.448567390441895,
277
+ "learning_rate": 4.74910394265233e-05,
278
+ "loss": 0.6575,
279
  "step": 360
280
  },
281
  {
282
+ "epoch": 2.9659318637274548,
283
+ "grad_norm": 12.076117515563965,
284
+ "learning_rate": 4.726702508960574e-05,
285
+ "loss": 0.7348,
286
  "step": 370
287
  },
288
  {
289
+ "epoch": 2.997995991983968,
290
+ "eval_accuracy": 0.8179255918827508,
291
+ "eval_loss": 0.5658715963363647,
292
+ "eval_runtime": 12.6995,
293
+ "eval_samples_per_second": 139.691,
294
+ "eval_steps_per_second": 4.41,
295
+ "step": 374
296
+ },
297
+ {
298
+ "epoch": 3.0460921843687374,
299
+ "grad_norm": 10.130627632141113,
300
+ "learning_rate": 4.704301075268818e-05,
301
+ "loss": 0.6752,
302
  "step": 380
303
  },
304
  {
305
+ "epoch": 3.12625250501002,
306
+ "grad_norm": 15.763223648071289,
307
+ "learning_rate": 4.681899641577061e-05,
308
+ "loss": 0.6367,
309
  "step": 390
310
  },
311
  {
312
+ "epoch": 3.2064128256513027,
313
+ "grad_norm": 13.580248832702637,
314
+ "learning_rate": 4.659498207885305e-05,
315
+ "loss": 0.6239,
316
  "step": 400
317
  },
318
  {
319
+ "epoch": 3.2865731462925853,
320
+ "grad_norm": 11.084284782409668,
321
+ "learning_rate": 4.637096774193548e-05,
322
+ "loss": 0.6857,
323
  "step": 410
324
  },
325
  {
326
+ "epoch": 3.3667334669338675,
327
+ "grad_norm": 12.55604362487793,
328
+ "learning_rate": 4.614695340501792e-05,
329
+ "loss": 0.6603,
330
  "step": 420
331
  },
332
  {
333
+ "epoch": 3.44689378757515,
334
+ "grad_norm": 9.201930046081543,
335
+ "learning_rate": 4.5922939068100365e-05,
336
+ "loss": 0.6514,
337
  "step": 430
338
  },
339
  {
340
+ "epoch": 3.527054108216433,
341
+ "grad_norm": 12.60409927368164,
342
+ "learning_rate": 4.56989247311828e-05,
343
+ "loss": 0.6237,
344
  "step": 440
345
  },
346
  {
347
+ "epoch": 3.6072144288577155,
348
+ "grad_norm": 21.13867950439453,
349
+ "learning_rate": 4.5474910394265236e-05,
350
+ "loss": 0.6698,
 
 
 
 
 
 
 
 
 
351
  "step": 450
352
  },
353
  {
354
+ "epoch": 3.687374749498998,
355
+ "grad_norm": 15.415425300598145,
356
+ "learning_rate": 4.5250896057347674e-05,
357
+ "loss": 0.6016,
358
  "step": 460
359
  },
360
  {
361
+ "epoch": 3.7675350701402808,
362
+ "grad_norm": 10.865119934082031,
363
+ "learning_rate": 4.5026881720430106e-05,
364
+ "loss": 0.633,
365
  "step": 470
366
  },
367
  {
368
+ "epoch": 3.847695390781563,
369
+ "grad_norm": 12.04747200012207,
370
+ "learning_rate": 4.4802867383512545e-05,
371
+ "loss": 0.686,
372
  "step": 480
373
  },
374
  {
375
+ "epoch": 3.9278557114228456,
376
+ "grad_norm": 15.560898780822754,
377
+ "learning_rate": 4.4578853046594983e-05,
378
+ "loss": 0.611,
379
  "step": 490
380
  },
381
  {
382
+ "epoch": 4.0,
383
+ "eval_accuracy": 0.8297632468996617,
384
+ "eval_loss": 0.5378695726394653,
385
+ "eval_runtime": 12.4744,
386
+ "eval_samples_per_second": 142.212,
387
+ "eval_steps_per_second": 4.489,
388
+ "step": 499
389
+ },
390
+ {
391
+ "epoch": 4.008016032064128,
392
+ "grad_norm": 12.36653995513916,
393
+ "learning_rate": 4.435483870967742e-05,
394
+ "loss": 0.6783,
395
  "step": 500
396
  },
397
  {
398
+ "epoch": 4.0881763527054105,
399
+ "grad_norm": 12.612045288085938,
400
+ "learning_rate": 4.413082437275986e-05,
401
+ "loss": 0.6591,
402
  "step": 510
403
  },
404
  {
405
+ "epoch": 4.168336673346693,
406
+ "grad_norm": 9.516498565673828,
407
+ "learning_rate": 4.390681003584229e-05,
408
+ "loss": 0.6409,
409
  "step": 520
410
  },
411
  {
412
+ "epoch": 4.248496993987976,
413
+ "grad_norm": 10.227922439575195,
414
+ "learning_rate": 4.368279569892473e-05,
415
+ "loss": 0.6113,
416
  "step": 530
417
  },
418
  {
419
+ "epoch": 4.328657314629258,
420
+ "grad_norm": 9.941215515136719,
421
+ "learning_rate": 4.345878136200717e-05,
422
+ "loss": 0.5545,
423
  "step": 540
424
  },
425
  {
426
+ "epoch": 4.408817635270541,
427
+ "grad_norm": 11.132833480834961,
428
+ "learning_rate": 4.323476702508961e-05,
429
+ "loss": 0.5876,
430
  "step": 550
431
  },
432
  {
433
+ "epoch": 4.488977955911824,
434
+ "grad_norm": 17.30998992919922,
435
+ "learning_rate": 4.301075268817205e-05,
436
+ "loss": 0.6059,
437
  "step": 560
438
  },
439
  {
440
+ "epoch": 4.569138276553106,
441
+ "grad_norm": 12.370113372802734,
442
+ "learning_rate": 4.2786738351254486e-05,
443
+ "loss": 0.572,
 
 
 
 
 
 
 
 
 
444
  "step": 570
445
  },
446
  {
447
+ "epoch": 4.649298597194389,
448
+ "grad_norm": 8.4649019241333,
449
+ "learning_rate": 4.256272401433692e-05,
450
+ "loss": 0.5474,
451
  "step": 580
452
  },
453
  {
454
+ "epoch": 4.729458917835672,
455
+ "grad_norm": 13.911017417907715,
456
+ "learning_rate": 4.2338709677419356e-05,
457
+ "loss": 0.5745,
458
  "step": 590
459
  },
460
  {
461
+ "epoch": 4.809619238476954,
462
+ "grad_norm": 13.061511993408203,
463
+ "learning_rate": 4.2114695340501795e-05,
464
+ "loss": 0.6151,
465
  "step": 600
466
  },
467
  {
468
+ "epoch": 4.889779559118237,
469
+ "grad_norm": 17.543981552124023,
470
+ "learning_rate": 4.1890681003584233e-05,
471
+ "loss": 0.5702,
472
  "step": 610
473
  },
474
  {
475
+ "epoch": 4.969939879759519,
476
+ "grad_norm": 14.352049827575684,
477
+ "learning_rate": 4.166666666666667e-05,
478
+ "loss": 0.5929,
479
  "step": 620
480
  },
481
  {
482
+ "epoch": 4.993987975951904,
483
+ "eval_accuracy": 0.8376550169109357,
484
+ "eval_loss": 0.4972485899925232,
485
+ "eval_runtime": 30.5452,
486
+ "eval_samples_per_second": 58.078,
487
+ "eval_steps_per_second": 1.833,
488
+ "step": 623
489
+ },
490
+ {
491
+ "epoch": 5.050100200400801,
492
+ "grad_norm": 12.319650650024414,
493
+ "learning_rate": 4.1442652329749104e-05,
494
+ "loss": 0.6066,
495
  "step": 630
496
  },
497
  {
498
+ "epoch": 5.130260521042084,
499
+ "grad_norm": 13.590569496154785,
500
+ "learning_rate": 4.121863799283154e-05,
501
+ "loss": 0.561,
502
  "step": 640
503
  },
504
  {
505
+ "epoch": 5.210420841683367,
506
+ "grad_norm": 6.548098087310791,
507
+ "learning_rate": 4.099462365591398e-05,
508
+ "loss": 0.5747,
509
  "step": 650
510
  },
511
  {
512
+ "epoch": 5.290581162324649,
513
+ "grad_norm": 11.081100463867188,
514
+ "learning_rate": 4.077060931899642e-05,
515
+ "loss": 0.5469,
516
  "step": 660
517
  },
518
  {
519
+ "epoch": 5.370741482965932,
520
+ "grad_norm": 11.028233528137207,
521
+ "learning_rate": 4.054659498207886e-05,
522
+ "loss": 0.5913,
523
  "step": 670
524
  },
525
  {
526
+ "epoch": 5.4509018036072145,
527
+ "grad_norm": 16.77172088623047,
528
+ "learning_rate": 4.032258064516129e-05,
529
+ "loss": 0.5454,
 
 
 
 
 
 
 
 
 
530
  "step": 680
531
  },
532
  {
533
+ "epoch": 5.531062124248497,
534
+ "grad_norm": 12.664057731628418,
535
+ "learning_rate": 4.009856630824373e-05,
536
+ "loss": 0.5376,
537
  "step": 690
538
  },
539
  {
540
+ "epoch": 5.61122244488978,
541
+ "grad_norm": 16.970354080200195,
542
+ "learning_rate": 3.987455197132617e-05,
543
+ "loss": 0.6195,
544
  "step": 700
545
  },
546
  {
547
+ "epoch": 5.6913827655310625,
548
+ "grad_norm": 20.813093185424805,
549
+ "learning_rate": 3.96505376344086e-05,
550
+ "loss": 0.5759,
551
  "step": 710
552
  },
553
  {
554
+ "epoch": 5.771543086172345,
555
+ "grad_norm": 12.778873443603516,
556
+ "learning_rate": 3.9426523297491045e-05,
557
+ "loss": 0.5636,
558
  "step": 720
559
  },
560
  {
561
+ "epoch": 5.851703406813627,
562
+ "grad_norm": 9.49085521697998,
563
+ "learning_rate": 3.9202508960573483e-05,
564
+ "loss": 0.5213,
565
  "step": 730
566
  },
567
  {
568
+ "epoch": 5.9318637274549095,
569
+ "grad_norm": 16.53606605529785,
570
+ "learning_rate": 3.8978494623655915e-05,
571
+ "loss": 0.5227,
572
  "step": 740
573
  },
574
  {
575
+ "epoch": 5.995991983967936,
576
+ "eval_accuracy": 0.8478015783540023,
577
+ "eval_loss": 0.4714604616165161,
578
+ "eval_runtime": 11.6058,
579
+ "eval_samples_per_second": 152.855,
580
+ "eval_steps_per_second": 4.825,
581
+ "step": 748
582
+ },
583
+ {
584
+ "epoch": 6.012024048096192,
585
+ "grad_norm": 15.601022720336914,
586
+ "learning_rate": 3.8754480286738354e-05,
587
+ "loss": 0.5287,
588
  "step": 750
589
  },
590
  {
591
+ "epoch": 6.092184368737475,
592
+ "grad_norm": 10.821127891540527,
593
+ "learning_rate": 3.8530465949820786e-05,
594
+ "loss": 0.5278,
595
  "step": 760
596
  },
597
  {
598
+ "epoch": 6.1723446893787575,
599
+ "grad_norm": 15.051136016845703,
600
+ "learning_rate": 3.8306451612903224e-05,
601
+ "loss": 0.5491,
602
  "step": 770
603
  },
604
  {
605
+ "epoch": 6.25250501002004,
606
+ "grad_norm": 13.785345077514648,
607
+ "learning_rate": 3.808243727598566e-05,
608
+ "loss": 0.4941,
609
  "step": 780
610
  },
611
  {
612
+ "epoch": 6.332665330661323,
613
+ "grad_norm": 12.571328163146973,
614
+ "learning_rate": 3.78584229390681e-05,
615
+ "loss": 0.4668,
 
 
 
 
 
 
 
 
 
616
  "step": 790
617
  },
618
  {
619
+ "epoch": 6.412825651302605,
620
+ "grad_norm": 14.443199157714844,
621
+ "learning_rate": 3.763440860215054e-05,
622
+ "loss": 0.5502,
623
  "step": 800
624
  },
625
  {
626
+ "epoch": 6.492985971943888,
627
+ "grad_norm": 12.781950950622559,
628
+ "learning_rate": 3.741039426523298e-05,
629
+ "loss": 0.5049,
630
  "step": 810
631
  },
632
  {
633
+ "epoch": 6.573146292585171,
634
+ "grad_norm": 8.832810401916504,
635
+ "learning_rate": 3.718637992831541e-05,
636
+ "loss": 0.5654,
637
  "step": 820
638
  },
639
  {
640
+ "epoch": 6.653306613226453,
641
+ "grad_norm": 13.026018142700195,
642
+ "learning_rate": 3.696236559139785e-05,
643
+ "loss": 0.5289,
644
  "step": 830
645
  },
646
  {
647
+ "epoch": 6.733466933867735,
648
+ "grad_norm": 11.173068046569824,
649
+ "learning_rate": 3.673835125448029e-05,
650
+ "loss": 0.5262,
651
  "step": 840
652
  },
653
  {
654
+ "epoch": 6.813627254509018,
655
+ "grad_norm": 15.73713207244873,
656
+ "learning_rate": 3.651433691756273e-05,
657
+ "loss": 0.5239,
658
  "step": 850
659
  },
660
  {
661
+ "epoch": 6.8937875751503,
662
+ "grad_norm": 11.182281494140625,
663
+ "learning_rate": 3.6290322580645165e-05,
664
+ "loss": 0.5269,
665
  "step": 860
666
  },
667
  {
668
+ "epoch": 6.973947895791583,
669
+ "grad_norm": 11.750397682189941,
670
+ "learning_rate": 3.60663082437276e-05,
671
+ "loss": 0.5166,
672
  "step": 870
673
  },
674
  {
675
+ "epoch": 6.997995991983968,
676
+ "eval_accuracy": 0.8494926719278467,
677
+ "eval_loss": 0.47609812021255493,
678
+ "eval_runtime": 22.6677,
679
+ "eval_samples_per_second": 78.261,
680
+ "eval_steps_per_second": 2.47,
681
+ "step": 873
682
+ },
683
+ {
684
+ "epoch": 7.054108216432866,
685
+ "grad_norm": 13.769631385803223,
686
+ "learning_rate": 3.5842293906810036e-05,
687
+ "loss": 0.5207,
688
  "step": 880
689
  },
690
  {
691
+ "epoch": 7.134268537074148,
692
+ "grad_norm": 13.10180377960205,
693
+ "learning_rate": 3.5618279569892474e-05,
694
+ "loss": 0.5231,
695
  "step": 890
696
  },
697
  {
698
+ "epoch": 7.214428857715431,
699
+ "grad_norm": 9.971457481384277,
700
+ "learning_rate": 3.539426523297491e-05,
701
+ "loss": 0.518,
 
 
 
 
 
 
 
 
 
702
  "step": 900
703
  },
704
  {
705
+ "epoch": 7.294589178356714,
706
+ "grad_norm": 12.092657089233398,
707
+ "learning_rate": 3.517025089605735e-05,
708
+ "loss": 0.5034,
709
  "step": 910
710
  },
711
  {
712
+ "epoch": 7.374749498997996,
713
+ "grad_norm": 19.348663330078125,
714
+ "learning_rate": 3.494623655913979e-05,
715
+ "loss": 0.4927,
716
  "step": 920
717
  },
718
  {
719
+ "epoch": 7.454909819639279,
720
+ "grad_norm": 10.206799507141113,
721
+ "learning_rate": 3.472222222222222e-05,
722
+ "loss": 0.5062,
723
  "step": 930
724
  },
725
  {
726
+ "epoch": 7.5350701402805615,
727
+ "grad_norm": 9.899465560913086,
728
+ "learning_rate": 3.449820788530466e-05,
729
+ "loss": 0.5037,
730
  "step": 940
731
  },
732
  {
733
+ "epoch": 7.615230460921843,
734
+ "grad_norm": 13.186443328857422,
735
+ "learning_rate": 3.427419354838709e-05,
736
+ "loss": 0.5159,
737
  "step": 950
738
  },
739
  {
740
+ "epoch": 7.695390781563126,
741
+ "grad_norm": 9.82767391204834,
742
+ "learning_rate": 3.405017921146954e-05,
743
+ "loss": 0.479,
744
  "step": 960
745
  },
746
  {
747
+ "epoch": 7.775551102204409,
748
+ "grad_norm": 9.09422492980957,
749
+ "learning_rate": 3.382616487455198e-05,
750
+ "loss": 0.5146,
751
  "step": 970
752
  },
753
  {
754
+ "epoch": 7.855711422845691,
755
+ "grad_norm": 11.1051025390625,
756
+ "learning_rate": 3.360215053763441e-05,
757
+ "loss": 0.4908,
758
  "step": 980
759
  },
760
  {
761
+ "epoch": 7.935871743486974,
762
+ "grad_norm": 9.16980266571045,
763
+ "learning_rate": 3.337813620071685e-05,
764
+ "loss": 0.4992,
765
  "step": 990
766
  },
767
  {
768
+ "epoch": 8.0,
769
+ "eval_accuracy": 0.8562570462232244,
770
+ "eval_loss": 0.432047575712204,
771
+ "eval_runtime": 17.8346,
772
+ "eval_samples_per_second": 99.469,
773
+ "eval_steps_per_second": 3.14,
774
+ "step": 998
775
+ },
776
+ {
777
+ "epoch": 8.016032064128256,
778
+ "grad_norm": 8.514852523803711,
779
+ "learning_rate": 3.3154121863799286e-05,
780
+ "loss": 0.4702,
781
  "step": 1000
782
  },
783
  {
784
+ "epoch": 8.09619238476954,
785
+ "grad_norm": 10.90427017211914,
786
+ "learning_rate": 3.293010752688172e-05,
787
+ "loss": 0.4809,
 
 
 
 
 
 
 
 
 
788
  "step": 1010
789
  },
790
  {
791
+ "epoch": 8.176352705410821,
792
+ "grad_norm": 13.75596809387207,
793
+ "learning_rate": 3.270609318996416e-05,
794
+ "loss": 0.4769,
795
  "step": 1020
796
  },
797
  {
798
+ "epoch": 8.256513026052104,
799
+ "grad_norm": 14.506204605102539,
800
+ "learning_rate": 3.24820788530466e-05,
801
+ "loss": 0.5157,
802
  "step": 1030
803
  },
804
  {
805
+ "epoch": 8.336673346693386,
806
+ "grad_norm": 10.074383735656738,
807
+ "learning_rate": 3.2258064516129034e-05,
808
+ "loss": 0.4916,
809
  "step": 1040
810
  },
811
  {
812
+ "epoch": 8.41683366733467,
813
+ "grad_norm": 13.06877326965332,
814
+ "learning_rate": 3.203405017921147e-05,
815
+ "loss": 0.4953,
816
  "step": 1050
817
  },
818
  {
819
+ "epoch": 8.496993987975952,
820
+ "grad_norm": 7.9596171379089355,
821
+ "learning_rate": 3.1810035842293904e-05,
822
+ "loss": 0.4879,
823
  "step": 1060
824
  },
825
  {
826
+ "epoch": 8.577154308617235,
827
+ "grad_norm": 11.05156135559082,
828
+ "learning_rate": 3.158602150537634e-05,
829
+ "loss": 0.4397,
830
  "step": 1070
831
  },
832
  {
833
+ "epoch": 8.657314629258517,
834
+ "grad_norm": 9.935453414916992,
835
+ "learning_rate": 3.136200716845878e-05,
836
+ "loss": 0.5141,
837
  "step": 1080
838
  },
839
  {
840
+ "epoch": 8.7374749498998,
841
+ "grad_norm": 9.928804397583008,
842
+ "learning_rate": 3.113799283154122e-05,
843
+ "loss": 0.4781,
844
  "step": 1090
845
  },
846
  {
847
+ "epoch": 8.817635270541082,
848
+ "grad_norm": 7.301691055297852,
849
+ "learning_rate": 3.091397849462366e-05,
850
+ "loss": 0.4484,
851
  "step": 1100
852
  },
853
  {
854
+ "epoch": 8.897795591182366,
855
+ "grad_norm": 13.609901428222656,
856
+ "learning_rate": 3.06899641577061e-05,
857
+ "loss": 0.4656,
858
  "step": 1110
859
  },
860
  {
861
+ "epoch": 8.977955911823647,
862
+ "grad_norm": 10.269015312194824,
863
+ "learning_rate": 3.046594982078853e-05,
864
+ "loss": 0.4528,
865
  "step": 1120
866
  },
867
  {
868
+ "epoch": 8.993987975951903,
869
+ "eval_accuracy": 0.8641488162344984,
870
+ "eval_loss": 0.4410019814968109,
871
+ "eval_runtime": 27.5679,
872
+ "eval_samples_per_second": 64.35,
873
+ "eval_steps_per_second": 2.031,
874
+ "step": 1122
875
+ },
876
+ {
877
+ "epoch": 9.05811623246493,
878
+ "grad_norm": 8.7766695022583,
879
+ "learning_rate": 3.024193548387097e-05,
880
+ "loss": 0.4843,
881
+ "step": 1130
882
+ },
883
+ {
884
+ "epoch": 9.138276553106213,
885
+ "grad_norm": 10.279314994812012,
886
+ "learning_rate": 3.0017921146953403e-05,
887
+ "loss": 0.5017,
888
+ "step": 1140
889
+ },
890
+ {
891
+ "epoch": 9.218436873747494,
892
+ "grad_norm": 13.840995788574219,
893
+ "learning_rate": 2.979390681003584e-05,
894
+ "loss": 0.4443,
895
+ "step": 1150
896
+ },
897
+ {
898
+ "epoch": 9.298597194388778,
899
+ "grad_norm": 14.21786117553711,
900
+ "learning_rate": 2.9569892473118284e-05,
901
+ "loss": 0.4319,
902
+ "step": 1160
903
+ },
904
+ {
905
+ "epoch": 9.37875751503006,
906
+ "grad_norm": 9.682762145996094,
907
+ "learning_rate": 2.9345878136200715e-05,
908
+ "loss": 0.4692,
909
+ "step": 1170
910
+ },
911
+ {
912
+ "epoch": 9.458917835671343,
913
+ "grad_norm": 12.985733985900879,
914
+ "learning_rate": 2.9121863799283154e-05,
915
+ "loss": 0.4221,
916
+ "step": 1180
917
+ },
918
+ {
919
+ "epoch": 9.539078156312625,
920
+ "grad_norm": 12.35405445098877,
921
+ "learning_rate": 2.8897849462365596e-05,
922
+ "loss": 0.4835,
923
+ "step": 1190
924
+ },
925
+ {
926
+ "epoch": 9.619238476953909,
927
+ "grad_norm": 7.067807197570801,
928
+ "learning_rate": 2.8673835125448028e-05,
929
+ "loss": 0.4548,
930
+ "step": 1200
931
+ },
932
+ {
933
+ "epoch": 9.69939879759519,
934
+ "grad_norm": 10.279123306274414,
935
+ "learning_rate": 2.8449820788530467e-05,
936
+ "loss": 0.4791,
937
+ "step": 1210
938
+ },
939
+ {
940
+ "epoch": 9.779559118236474,
941
+ "grad_norm": 12.814294815063477,
942
+ "learning_rate": 2.822580645161291e-05,
943
+ "loss": 0.4551,
944
+ "step": 1220
945
+ },
946
+ {
947
+ "epoch": 9.859719438877756,
948
+ "grad_norm": 15.132489204406738,
949
+ "learning_rate": 2.800179211469534e-05,
950
+ "loss": 0.3957,
951
+ "step": 1230
952
+ },
953
+ {
954
+ "epoch": 9.939879759519037,
955
+ "grad_norm": 11.959942817687988,
956
+ "learning_rate": 2.777777777777778e-05,
957
+ "loss": 0.4566,
958
+ "step": 1240
959
+ },
960
+ {
961
+ "epoch": 9.995991983967937,
962
+ "eval_accuracy": 0.8641488162344984,
963
+ "eval_loss": 0.42970511317253113,
964
+ "eval_runtime": 16.9001,
965
+ "eval_samples_per_second": 104.97,
966
+ "eval_steps_per_second": 3.314,
967
+ "step": 1247
968
+ },
969
+ {
970
+ "epoch": 10.02004008016032,
971
+ "grad_norm": 12.989178657531738,
972
+ "learning_rate": 2.7553763440860214e-05,
973
+ "loss": 0.4402,
974
+ "step": 1250
975
+ },
976
+ {
977
+ "epoch": 10.100200400801603,
978
+ "grad_norm": 11.985700607299805,
979
+ "learning_rate": 2.7329749103942653e-05,
980
+ "loss": 0.4544,
981
+ "step": 1260
982
+ },
983
+ {
984
+ "epoch": 10.180360721442886,
985
+ "grad_norm": 17.515514373779297,
986
+ "learning_rate": 2.710573476702509e-05,
987
+ "loss": 0.3676,
988
+ "step": 1270
989
+ },
990
+ {
991
+ "epoch": 10.260521042084168,
992
+ "grad_norm": 11.017400741577148,
993
+ "learning_rate": 2.6881720430107527e-05,
994
+ "loss": 0.4534,
995
+ "step": 1280
996
+ },
997
+ {
998
+ "epoch": 10.340681362725451,
999
+ "grad_norm": 7.939273357391357,
1000
+ "learning_rate": 2.6657706093189965e-05,
1001
+ "loss": 0.4719,
1002
+ "step": 1290
1003
+ },
1004
+ {
1005
+ "epoch": 10.420841683366733,
1006
+ "grad_norm": 13.53430461883545,
1007
+ "learning_rate": 2.6433691756272404e-05,
1008
+ "loss": 0.4162,
1009
+ "step": 1300
1010
+ },
1011
+ {
1012
+ "epoch": 10.501002004008017,
1013
+ "grad_norm": 9.599760055541992,
1014
+ "learning_rate": 2.620967741935484e-05,
1015
+ "loss": 0.46,
1016
+ "step": 1310
1017
+ },
1018
+ {
1019
+ "epoch": 10.581162324649299,
1020
+ "grad_norm": 7.481749057769775,
1021
+ "learning_rate": 2.5985663082437278e-05,
1022
+ "loss": 0.405,
1023
+ "step": 1320
1024
+ },
1025
+ {
1026
+ "epoch": 10.661322645290582,
1027
+ "grad_norm": 17.151025772094727,
1028
+ "learning_rate": 2.5761648745519713e-05,
1029
+ "loss": 0.4484,
1030
+ "step": 1330
1031
+ },
1032
+ {
1033
+ "epoch": 10.741482965931864,
1034
+ "grad_norm": 11.18791389465332,
1035
+ "learning_rate": 2.5537634408602152e-05,
1036
+ "loss": 0.4461,
1037
+ "step": 1340
1038
+ },
1039
+ {
1040
+ "epoch": 10.821643286573146,
1041
+ "grad_norm": 9.898661613464355,
1042
+ "learning_rate": 2.531362007168459e-05,
1043
+ "loss": 0.3958,
1044
+ "step": 1350
1045
+ },
1046
+ {
1047
+ "epoch": 10.901803607214429,
1048
+ "grad_norm": 9.442924499511719,
1049
+ "learning_rate": 2.5089605734767026e-05,
1050
+ "loss": 0.4279,
1051
+ "step": 1360
1052
+ },
1053
+ {
1054
+ "epoch": 10.98196392785571,
1055
+ "grad_norm": 11.045487403869629,
1056
+ "learning_rate": 2.4865591397849464e-05,
1057
+ "loss": 0.4294,
1058
+ "step": 1370
1059
+ },
1060
+ {
1061
+ "epoch": 10.997995991983968,
1062
+ "eval_accuracy": 0.8607666290868095,
1063
+ "eval_loss": 0.42823219299316406,
1064
+ "eval_runtime": 26.2766,
1065
+ "eval_samples_per_second": 67.512,
1066
+ "eval_steps_per_second": 2.131,
1067
+ "step": 1372
1068
+ },
1069
+ {
1070
+ "epoch": 11.062124248496994,
1071
+ "grad_norm": 9.318482398986816,
1072
+ "learning_rate": 2.46415770609319e-05,
1073
+ "loss": 0.4314,
1074
+ "step": 1380
1075
+ },
1076
+ {
1077
+ "epoch": 11.142284569138276,
1078
+ "grad_norm": 16.068525314331055,
1079
+ "learning_rate": 2.4417562724014338e-05,
1080
+ "loss": 0.3944,
1081
+ "step": 1390
1082
+ },
1083
+ {
1084
+ "epoch": 11.22244488977956,
1085
+ "grad_norm": 6.959997177124023,
1086
+ "learning_rate": 2.4193548387096777e-05,
1087
+ "loss": 0.4251,
1088
+ "step": 1400
1089
+ },
1090
+ {
1091
+ "epoch": 11.302605210420841,
1092
+ "grad_norm": 11.282358169555664,
1093
+ "learning_rate": 2.3969534050179212e-05,
1094
+ "loss": 0.4074,
1095
+ "step": 1410
1096
+ },
1097
+ {
1098
+ "epoch": 11.382765531062125,
1099
+ "grad_norm": 8.684910774230957,
1100
+ "learning_rate": 2.374551971326165e-05,
1101
+ "loss": 0.4426,
1102
+ "step": 1420
1103
+ },
1104
+ {
1105
+ "epoch": 11.462925851703407,
1106
+ "grad_norm": 11.480581283569336,
1107
+ "learning_rate": 2.352150537634409e-05,
1108
+ "loss": 0.413,
1109
+ "step": 1430
1110
+ },
1111
+ {
1112
+ "epoch": 11.54308617234469,
1113
+ "grad_norm": 10.927531242370605,
1114
+ "learning_rate": 2.3297491039426525e-05,
1115
+ "loss": 0.4338,
1116
+ "step": 1440
1117
+ },
1118
+ {
1119
+ "epoch": 11.623246492985972,
1120
+ "grad_norm": 10.118310928344727,
1121
+ "learning_rate": 2.307347670250896e-05,
1122
+ "loss": 0.4536,
1123
+ "step": 1450
1124
+ },
1125
+ {
1126
+ "epoch": 11.703406813627254,
1127
+ "grad_norm": 10.131954193115234,
1128
+ "learning_rate": 2.28494623655914e-05,
1129
+ "loss": 0.4164,
1130
+ "step": 1460
1131
+ },
1132
+ {
1133
+ "epoch": 11.783567134268537,
1134
+ "grad_norm": 14.58598804473877,
1135
+ "learning_rate": 2.2625448028673837e-05,
1136
+ "loss": 0.4342,
1137
+ "step": 1470
1138
+ },
1139
+ {
1140
+ "epoch": 11.863727454909819,
1141
+ "grad_norm": 12.672148704528809,
1142
+ "learning_rate": 2.2401433691756272e-05,
1143
+ "loss": 0.4393,
1144
+ "step": 1480
1145
+ },
1146
+ {
1147
+ "epoch": 11.943887775551103,
1148
+ "grad_norm": 11.871652603149414,
1149
+ "learning_rate": 2.217741935483871e-05,
1150
+ "loss": 0.3771,
1151
+ "step": 1490
1152
+ },
1153
+ {
1154
+ "epoch": 12.0,
1155
+ "eval_accuracy": 0.85456595264938,
1156
+ "eval_loss": 0.4546312391757965,
1157
+ "eval_runtime": 21.2281,
1158
+ "eval_samples_per_second": 83.569,
1159
+ "eval_steps_per_second": 2.638,
1160
+ "step": 1497
1161
+ },
1162
+ {
1163
+ "epoch": 12.024048096192384,
1164
+ "grad_norm": 12.87192440032959,
1165
+ "learning_rate": 2.1953405017921146e-05,
1166
+ "loss": 0.404,
1167
+ "step": 1500
1168
+ },
1169
+ {
1170
+ "epoch": 12.104208416833668,
1171
+ "grad_norm": 11.67623519897461,
1172
+ "learning_rate": 2.1729390681003585e-05,
1173
+ "loss": 0.3968,
1174
+ "step": 1510
1175
+ },
1176
+ {
1177
+ "epoch": 12.18436873747495,
1178
+ "grad_norm": 11.608409881591797,
1179
+ "learning_rate": 2.1505376344086024e-05,
1180
+ "loss": 0.3809,
1181
+ "step": 1520
1182
+ },
1183
+ {
1184
+ "epoch": 12.264529058116233,
1185
+ "grad_norm": 9.568375587463379,
1186
+ "learning_rate": 2.128136200716846e-05,
1187
+ "loss": 0.4135,
1188
+ "step": 1530
1189
+ },
1190
+ {
1191
+ "epoch": 12.344689378757515,
1192
+ "grad_norm": 10.64120864868164,
1193
+ "learning_rate": 2.1057347670250897e-05,
1194
+ "loss": 0.411,
1195
+ "step": 1540
1196
+ },
1197
+ {
1198
+ "epoch": 12.424849699398798,
1199
+ "grad_norm": 9.730778694152832,
1200
+ "learning_rate": 2.0833333333333336e-05,
1201
+ "loss": 0.3948,
1202
+ "step": 1550
1203
+ },
1204
+ {
1205
+ "epoch": 12.50501002004008,
1206
+ "grad_norm": 11.325265884399414,
1207
+ "learning_rate": 2.060931899641577e-05,
1208
+ "loss": 0.403,
1209
+ "step": 1560
1210
+ },
1211
+ {
1212
+ "epoch": 12.585170340681362,
1213
+ "grad_norm": 13.892471313476562,
1214
+ "learning_rate": 2.038530465949821e-05,
1215
+ "loss": 0.4393,
1216
+ "step": 1570
1217
+ },
1218
+ {
1219
+ "epoch": 12.665330661322646,
1220
+ "grad_norm": 14.78463363647461,
1221
+ "learning_rate": 2.0161290322580645e-05,
1222
+ "loss": 0.3784,
1223
+ "step": 1580
1224
+ },
1225
+ {
1226
+ "epoch": 12.745490981963927,
1227
+ "grad_norm": 14.130833625793457,
1228
+ "learning_rate": 1.9937275985663084e-05,
1229
+ "loss": 0.425,
1230
+ "step": 1590
1231
+ },
1232
+ {
1233
+ "epoch": 12.82565130260521,
1234
+ "grad_norm": 8.856616973876953,
1235
+ "learning_rate": 1.9713261648745522e-05,
1236
+ "loss": 0.3804,
1237
+ "step": 1600
1238
+ },
1239
+ {
1240
+ "epoch": 12.905811623246493,
1241
+ "grad_norm": 10.982331275939941,
1242
+ "learning_rate": 1.9489247311827958e-05,
1243
+ "loss": 0.4252,
1244
+ "step": 1610
1245
+ },
1246
+ {
1247
+ "epoch": 12.985971943887776,
1248
+ "grad_norm": 9.618106842041016,
1249
+ "learning_rate": 1.9265232974910393e-05,
1250
+ "loss": 0.4224,
1251
+ "step": 1620
1252
+ },
1253
+ {
1254
+ "epoch": 12.993987975951903,
1255
+ "eval_accuracy": 0.8624577226606539,
1256
+ "eval_loss": 0.448898583650589,
1257
+ "eval_runtime": 17.755,
1258
+ "eval_samples_per_second": 99.916,
1259
+ "eval_steps_per_second": 3.154,
1260
+ "step": 1621
1261
+ },
1262
+ {
1263
+ "epoch": 13.066132264529058,
1264
+ "grad_norm": 18.403636932373047,
1265
+ "learning_rate": 1.904121863799283e-05,
1266
+ "loss": 0.4142,
1267
+ "step": 1630
1268
+ },
1269
+ {
1270
+ "epoch": 13.146292585170341,
1271
+ "grad_norm": 12.012832641601562,
1272
+ "learning_rate": 1.881720430107527e-05,
1273
+ "loss": 0.3976,
1274
+ "step": 1640
1275
+ },
1276
+ {
1277
+ "epoch": 13.226452905811623,
1278
+ "grad_norm": 10.503453254699707,
1279
+ "learning_rate": 1.8593189964157705e-05,
1280
+ "loss": 0.3707,
1281
+ "step": 1650
1282
+ },
1283
+ {
1284
+ "epoch": 13.306613226452907,
1285
+ "grad_norm": 12.286235809326172,
1286
+ "learning_rate": 1.8369175627240144e-05,
1287
+ "loss": 0.4056,
1288
+ "step": 1660
1289
+ },
1290
+ {
1291
+ "epoch": 13.386773547094188,
1292
+ "grad_norm": 9.312376976013184,
1293
+ "learning_rate": 1.8145161290322583e-05,
1294
+ "loss": 0.4226,
1295
+ "step": 1670
1296
+ },
1297
+ {
1298
+ "epoch": 13.46693386773547,
1299
+ "grad_norm": 9.602310180664062,
1300
+ "learning_rate": 1.7921146953405018e-05,
1301
+ "loss": 0.3638,
1302
+ "step": 1680
1303
+ },
1304
+ {
1305
+ "epoch": 13.547094188376754,
1306
+ "grad_norm": 10.584216117858887,
1307
+ "learning_rate": 1.7697132616487457e-05,
1308
+ "loss": 0.3955,
1309
+ "step": 1690
1310
+ },
1311
+ {
1312
+ "epoch": 13.627254509018035,
1313
+ "grad_norm": 9.666451454162598,
1314
+ "learning_rate": 1.7473118279569895e-05,
1315
+ "loss": 0.3895,
1316
+ "step": 1700
1317
  },
1318
  {
1319
+ "epoch": 13.707414829659319,
1320
+ "grad_norm": 14.383480072021484,
1321
+ "learning_rate": 1.724910394265233e-05,
1322
+ "loss": 0.4076,
1323
+ "step": 1710
1324
+ },
1325
+ {
1326
+ "epoch": 13.7875751503006,
1327
+ "grad_norm": 9.302132606506348,
1328
+ "learning_rate": 1.702508960573477e-05,
1329
+ "loss": 0.4342,
1330
+ "step": 1720
1331
+ },
1332
+ {
1333
+ "epoch": 13.867735470941884,
1334
+ "grad_norm": 24.193918228149414,
1335
+ "learning_rate": 1.6801075268817204e-05,
1336
+ "loss": 0.4005,
1337
+ "step": 1730
1338
+ },
1339
+ {
1340
+ "epoch": 13.947895791583166,
1341
+ "grad_norm": 14.272506713867188,
1342
+ "learning_rate": 1.6577060931899643e-05,
1343
+ "loss": 0.4099,
1344
+ "step": 1740
1345
+ },
1346
+ {
1347
+ "epoch": 13.995991983967937,
1348
+ "eval_accuracy": 0.8624577226606539,
1349
+ "eval_loss": 0.4411380887031555,
1350
+ "eval_runtime": 11.1197,
1351
+ "eval_samples_per_second": 159.537,
1352
+ "eval_steps_per_second": 5.036,
1353
+ "step": 1746
1354
+ },
1355
+ {
1356
+ "epoch": 14.02805611222445,
1357
+ "grad_norm": 10.529751777648926,
1358
+ "learning_rate": 1.635304659498208e-05,
1359
+ "loss": 0.3849,
1360
+ "step": 1750
1361
+ },
1362
+ {
1363
+ "epoch": 14.108216432865731,
1364
+ "grad_norm": 9.820696830749512,
1365
+ "learning_rate": 1.6129032258064517e-05,
1366
+ "loss": 0.4455,
1367
+ "step": 1760
1368
+ },
1369
+ {
1370
+ "epoch": 14.188376753507015,
1371
+ "grad_norm": 8.576085090637207,
1372
+ "learning_rate": 1.5905017921146952e-05,
1373
+ "loss": 0.3852,
1374
+ "step": 1770
1375
+ },
1376
+ {
1377
+ "epoch": 14.268537074148297,
1378
+ "grad_norm": 11.380485534667969,
1379
+ "learning_rate": 1.568100358422939e-05,
1380
+ "loss": 0.367,
1381
+ "step": 1780
1382
+ },
1383
+ {
1384
+ "epoch": 14.348697394789578,
1385
+ "grad_norm": 12.20594310760498,
1386
+ "learning_rate": 1.545698924731183e-05,
1387
+ "loss": 0.3974,
1388
+ "step": 1790
1389
+ },
1390
+ {
1391
+ "epoch": 14.428857715430862,
1392
+ "grad_norm": 11.483406066894531,
1393
+ "learning_rate": 1.5232974910394265e-05,
1394
+ "loss": 0.3949,
1395
+ "step": 1800
1396
+ },
1397
+ {
1398
+ "epoch": 14.509018036072144,
1399
+ "grad_norm": 9.642448425292969,
1400
+ "learning_rate": 1.5008960573476701e-05,
1401
+ "loss": 0.4097,
1402
+ "step": 1810
1403
+ },
1404
+ {
1405
+ "epoch": 14.589178356713427,
1406
+ "grad_norm": 10.316274642944336,
1407
+ "learning_rate": 1.4784946236559142e-05,
1408
+ "loss": 0.3393,
1409
+ "step": 1820
1410
+ },
1411
+ {
1412
+ "epoch": 14.669338677354709,
1413
+ "grad_norm": 10.923069953918457,
1414
+ "learning_rate": 1.4560931899641577e-05,
1415
+ "loss": 0.3562,
1416
+ "step": 1830
1417
+ },
1418
+ {
1419
+ "epoch": 14.749498997995993,
1420
+ "grad_norm": 9.884988784790039,
1421
+ "learning_rate": 1.4336917562724014e-05,
1422
+ "loss": 0.3581,
1423
+ "step": 1840
1424
+ },
1425
+ {
1426
+ "epoch": 14.829659318637274,
1427
+ "grad_norm": 8.461724281311035,
1428
+ "learning_rate": 1.4112903225806454e-05,
1429
+ "loss": 0.3775,
1430
+ "step": 1850
1431
+ },
1432
+ {
1433
+ "epoch": 14.909819639278558,
1434
+ "grad_norm": 8.028816223144531,
1435
+ "learning_rate": 1.388888888888889e-05,
1436
+ "loss": 0.3968,
1437
+ "step": 1860
1438
+ },
1439
+ {
1440
+ "epoch": 14.98997995991984,
1441
+ "grad_norm": 13.816071510314941,
1442
+ "learning_rate": 1.3664874551971326e-05,
1443
+ "loss": 0.3759,
1444
+ "step": 1870
1445
+ },
1446
+ {
1447
+ "epoch": 14.997995991983968,
1448
+ "eval_accuracy": 0.8652762119503946,
1449
+ "eval_loss": 0.43173447251319885,
1450
+ "eval_runtime": 13.0155,
1451
+ "eval_samples_per_second": 136.299,
1452
+ "eval_steps_per_second": 4.303,
1453
+ "step": 1871
1454
+ },
1455
+ {
1456
+ "epoch": 15.070140280561123,
1457
+ "grad_norm": 10.104082107543945,
1458
+ "learning_rate": 1.3440860215053763e-05,
1459
+ "loss": 0.3643,
1460
+ "step": 1880
1461
+ },
1462
+ {
1463
+ "epoch": 15.150300601202405,
1464
+ "grad_norm": 9.357542991638184,
1465
+ "learning_rate": 1.3216845878136202e-05,
1466
+ "loss": 0.3912,
1467
+ "step": 1890
1468
+ },
1469
+ {
1470
+ "epoch": 15.230460921843687,
1471
+ "grad_norm": 9.616016387939453,
1472
+ "learning_rate": 1.2992831541218639e-05,
1473
+ "loss": 0.3554,
1474
+ "step": 1900
1475
+ },
1476
+ {
1477
+ "epoch": 15.31062124248497,
1478
+ "grad_norm": 11.787483215332031,
1479
+ "learning_rate": 1.2768817204301076e-05,
1480
+ "loss": 0.3695,
1481
+ "step": 1910
1482
+ },
1483
+ {
1484
+ "epoch": 15.390781563126252,
1485
+ "grad_norm": 12.30813980102539,
1486
+ "learning_rate": 1.2544802867383513e-05,
1487
+ "loss": 0.4177,
1488
+ "step": 1920
1489
+ },
1490
+ {
1491
+ "epoch": 15.470941883767535,
1492
+ "grad_norm": 12.10972785949707,
1493
+ "learning_rate": 1.232078853046595e-05,
1494
+ "loss": 0.3606,
1495
+ "step": 1930
1496
+ },
1497
+ {
1498
+ "epoch": 15.551102204408817,
1499
+ "grad_norm": 9.871501922607422,
1500
+ "learning_rate": 1.2096774193548388e-05,
1501
+ "loss": 0.3627,
1502
+ "step": 1940
1503
+ },
1504
+ {
1505
+ "epoch": 15.6312625250501,
1506
+ "grad_norm": 9.91009521484375,
1507
+ "learning_rate": 1.1872759856630825e-05,
1508
+ "loss": 0.363,
1509
+ "step": 1950
1510
+ },
1511
+ {
1512
+ "epoch": 15.711422845691382,
1513
+ "grad_norm": 9.586908340454102,
1514
+ "learning_rate": 1.1648745519713262e-05,
1515
+ "loss": 0.3761,
1516
+ "step": 1960
1517
+ },
1518
+ {
1519
+ "epoch": 15.791583166332666,
1520
+ "grad_norm": 13.530887603759766,
1521
+ "learning_rate": 1.14247311827957e-05,
1522
+ "loss": 0.3437,
1523
+ "step": 1970
1524
+ },
1525
+ {
1526
+ "epoch": 15.871743486973948,
1527
+ "grad_norm": 10.21368408203125,
1528
+ "learning_rate": 1.1200716845878136e-05,
1529
+ "loss": 0.3575,
1530
+ "step": 1980
1531
+ },
1532
+ {
1533
+ "epoch": 15.951903807615231,
1534
+ "grad_norm": 10.05925178527832,
1535
+ "learning_rate": 1.0976702508960573e-05,
1536
+ "loss": 0.3692,
1537
+ "step": 1990
1538
+ },
1539
+ {
1540
+ "epoch": 16.0,
1541
+ "eval_accuracy": 0.863021420518602,
1542
+ "eval_loss": 0.43041756749153137,
1543
+ "eval_runtime": 13.2621,
1544
+ "eval_samples_per_second": 133.765,
1545
+ "eval_steps_per_second": 4.223,
1546
+ "step": 1996
1547
+ },
1548
+ {
1549
+ "epoch": 16.03206412825651,
1550
+ "grad_norm": 8.98168659210205,
1551
+ "learning_rate": 1.0752688172043012e-05,
1552
+ "loss": 0.3292,
1553
+ "step": 2000
1554
+ },
1555
+ {
1556
+ "epoch": 16.112224448897795,
1557
+ "grad_norm": 13.636373519897461,
1558
+ "learning_rate": 1.0528673835125449e-05,
1559
+ "loss": 0.3546,
1560
+ "step": 2010
1561
+ },
1562
+ {
1563
+ "epoch": 16.19238476953908,
1564
+ "grad_norm": 8.504558563232422,
1565
+ "learning_rate": 1.0304659498207886e-05,
1566
+ "loss": 0.3346,
1567
+ "step": 2020
1568
+ },
1569
+ {
1570
+ "epoch": 16.272545090180362,
1571
+ "grad_norm": 12.934548377990723,
1572
+ "learning_rate": 1.0080645161290323e-05,
1573
+ "loss": 0.3453,
1574
+ "step": 2030
1575
+ },
1576
+ {
1577
+ "epoch": 16.352705410821642,
1578
+ "grad_norm": 10.473663330078125,
1579
+ "learning_rate": 9.856630824372761e-06,
1580
+ "loss": 0.3545,
1581
+ "step": 2040
1582
+ },
1583
+ {
1584
+ "epoch": 16.432865731462925,
1585
+ "grad_norm": 12.263561248779297,
1586
+ "learning_rate": 9.632616487455196e-06,
1587
+ "loss": 0.3487,
1588
+ "step": 2050
1589
+ },
1590
+ {
1591
+ "epoch": 16.51302605210421,
1592
+ "grad_norm": 8.574410438537598,
1593
+ "learning_rate": 9.408602150537635e-06,
1594
+ "loss": 0.4023,
1595
+ "step": 2060
1596
+ },
1597
+ {
1598
+ "epoch": 16.593186372745492,
1599
+ "grad_norm": 20.604251861572266,
1600
+ "learning_rate": 9.184587813620072e-06,
1601
+ "loss": 0.3941,
1602
+ "step": 2070
1603
+ },
1604
+ {
1605
+ "epoch": 16.673346693386772,
1606
+ "grad_norm": 15.173372268676758,
1607
+ "learning_rate": 8.960573476702509e-06,
1608
+ "loss": 0.3716,
1609
+ "step": 2080
1610
+ },
1611
+ {
1612
+ "epoch": 16.753507014028056,
1613
+ "grad_norm": 10.56511116027832,
1614
+ "learning_rate": 8.736559139784948e-06,
1615
+ "loss": 0.3873,
1616
+ "step": 2090
1617
+ },
1618
+ {
1619
+ "epoch": 16.83366733466934,
1620
+ "grad_norm": 12.560215950012207,
1621
+ "learning_rate": 8.512544802867385e-06,
1622
+ "loss": 0.332,
1623
+ "step": 2100
1624
+ },
1625
+ {
1626
+ "epoch": 16.91382765531062,
1627
+ "grad_norm": 12.049774169921875,
1628
+ "learning_rate": 8.288530465949821e-06,
1629
+ "loss": 0.3394,
1630
+ "step": 2110
1631
+ },
1632
+ {
1633
+ "epoch": 16.993987975951903,
1634
+ "grad_norm": 12.533961296081543,
1635
+ "learning_rate": 8.064516129032258e-06,
1636
+ "loss": 0.364,
1637
+ "step": 2120
1638
+ },
1639
+ {
1640
+ "epoch": 16.993987975951903,
1641
+ "eval_accuracy": 0.8664036076662909,
1642
+ "eval_loss": 0.4329654276371002,
1643
+ "eval_runtime": 11.1171,
1644
+ "eval_samples_per_second": 159.574,
1645
+ "eval_steps_per_second": 5.037,
1646
+ "step": 2120
1647
+ },
1648
+ {
1649
+ "epoch": 17.074148296593187,
1650
+ "grad_norm": 11.729970932006836,
1651
+ "learning_rate": 7.840501792114695e-06,
1652
+ "loss": 0.3719,
1653
+ "step": 2130
1654
+ },
1655
+ {
1656
+ "epoch": 17.15430861723447,
1657
+ "grad_norm": 8.87394905090332,
1658
+ "learning_rate": 7.616487455197132e-06,
1659
+ "loss": 0.3165,
1660
+ "step": 2140
1661
+ },
1662
+ {
1663
+ "epoch": 17.23446893787575,
1664
+ "grad_norm": 11.172150611877441,
1665
+ "learning_rate": 7.392473118279571e-06,
1666
+ "loss": 0.371,
1667
+ "step": 2150
1668
+ },
1669
+ {
1670
+ "epoch": 17.314629258517034,
1671
+ "grad_norm": 17.362049102783203,
1672
+ "learning_rate": 7.168458781362007e-06,
1673
+ "loss": 0.3607,
1674
+ "step": 2160
1675
+ },
1676
+ {
1677
+ "epoch": 17.394789579158317,
1678
+ "grad_norm": 9.348087310791016,
1679
+ "learning_rate": 6.944444444444445e-06,
1680
+ "loss": 0.3837,
1681
+ "step": 2170
1682
+ },
1683
+ {
1684
+ "epoch": 17.4749498997996,
1685
+ "grad_norm": 11.758851051330566,
1686
+ "learning_rate": 6.720430107526882e-06,
1687
+ "loss": 0.3847,
1688
+ "step": 2180
1689
+ },
1690
+ {
1691
+ "epoch": 17.55511022044088,
1692
+ "grad_norm": 12.436318397521973,
1693
+ "learning_rate": 6.4964157706093195e-06,
1694
+ "loss": 0.3541,
1695
+ "step": 2190
1696
+ },
1697
+ {
1698
+ "epoch": 17.635270541082164,
1699
+ "grad_norm": 8.334653854370117,
1700
+ "learning_rate": 6.2724014336917564e-06,
1701
+ "loss": 0.3638,
1702
+ "step": 2200
1703
+ },
1704
+ {
1705
+ "epoch": 17.715430861723448,
1706
+ "grad_norm": 13.402957916259766,
1707
+ "learning_rate": 6.048387096774194e-06,
1708
+ "loss": 0.3178,
1709
+ "step": 2210
1710
+ },
1711
+ {
1712
+ "epoch": 17.79559118236473,
1713
+ "grad_norm": 7.792669296264648,
1714
+ "learning_rate": 5.824372759856631e-06,
1715
+ "loss": 0.3553,
1716
+ "step": 2220
1717
+ },
1718
+ {
1719
+ "epoch": 17.87575150300601,
1720
+ "grad_norm": 16.891786575317383,
1721
+ "learning_rate": 5.600358422939068e-06,
1722
+ "loss": 0.357,
1723
+ "step": 2230
1724
+ },
1725
+ {
1726
+ "epoch": 17.955911823647295,
1727
+ "grad_norm": 8.470565795898438,
1728
+ "learning_rate": 5.376344086021506e-06,
1729
+ "loss": 0.3636,
1730
+ "step": 2240
1731
+ },
1732
+ {
1733
+ "epoch": 17.995991983967937,
1734
+ "eval_accuracy": 0.8680947012401353,
1735
+ "eval_loss": 0.4249646067619324,
1736
+ "eval_runtime": 23.3665,
1737
+ "eval_samples_per_second": 75.921,
1738
+ "eval_steps_per_second": 2.397,
1739
+ "step": 2245
1740
+ },
1741
+ {
1742
+ "epoch": 18.03607214428858,
1743
+ "grad_norm": 8.499645233154297,
1744
+ "learning_rate": 5.152329749103943e-06,
1745
+ "loss": 0.3596,
1746
+ "step": 2250
1747
+ },
1748
+ {
1749
+ "epoch": 18.11623246492986,
1750
+ "grad_norm": 12.444930076599121,
1751
+ "learning_rate": 4.928315412186381e-06,
1752
+ "loss": 0.2935,
1753
+ "step": 2260
1754
+ },
1755
+ {
1756
+ "epoch": 18.196392785571142,
1757
+ "grad_norm": 11.180887222290039,
1758
+ "learning_rate": 4.7043010752688175e-06,
1759
+ "loss": 0.3253,
1760
+ "step": 2270
1761
+ },
1762
+ {
1763
+ "epoch": 18.276553106212425,
1764
+ "grad_norm": 8.559268951416016,
1765
+ "learning_rate": 4.4802867383512545e-06,
1766
+ "loss": 0.3427,
1767
+ "step": 2280
1768
+ },
1769
+ {
1770
+ "epoch": 18.35671342685371,
1771
+ "grad_norm": 15.954803466796875,
1772
+ "learning_rate": 4.256272401433692e-06,
1773
+ "loss": 0.3657,
1774
+ "step": 2290
1775
+ },
1776
+ {
1777
+ "epoch": 18.43687374749499,
1778
+ "grad_norm": 12.122089385986328,
1779
+ "learning_rate": 4.032258064516129e-06,
1780
+ "loss": 0.3101,
1781
+ "step": 2300
1782
+ },
1783
+ {
1784
+ "epoch": 18.517034068136272,
1785
+ "grad_norm": 7.729813575744629,
1786
+ "learning_rate": 3.808243727598566e-06,
1787
+ "loss": 0.3266,
1788
+ "step": 2310
1789
+ },
1790
+ {
1791
+ "epoch": 18.597194388777556,
1792
+ "grad_norm": 8.754435539245605,
1793
+ "learning_rate": 3.5842293906810035e-06,
1794
+ "loss": 0.3337,
1795
+ "step": 2320
1796
+ },
1797
+ {
1798
+ "epoch": 18.677354709418836,
1799
+ "grad_norm": 12.22318172454834,
1800
+ "learning_rate": 3.360215053763441e-06,
1801
+ "loss": 0.3262,
1802
+ "step": 2330
1803
+ },
1804
+ {
1805
+ "epoch": 18.75751503006012,
1806
+ "grad_norm": 11.190741539001465,
1807
+ "learning_rate": 3.1362007168458782e-06,
1808
+ "loss": 0.3537,
1809
+ "step": 2340
1810
+ },
1811
+ {
1812
+ "epoch": 18.837675350701403,
1813
+ "grad_norm": 12.088068008422852,
1814
+ "learning_rate": 2.9121863799283156e-06,
1815
+ "loss": 0.3034,
1816
+ "step": 2350
1817
+ },
1818
+ {
1819
+ "epoch": 18.917835671342687,
1820
+ "grad_norm": 8.4877347946167,
1821
+ "learning_rate": 2.688172043010753e-06,
1822
+ "loss": 0.3537,
1823
+ "step": 2360
1824
+ },
1825
+ {
1826
+ "epoch": 18.997995991983966,
1827
+ "grad_norm": 12.145883560180664,
1828
+ "learning_rate": 2.4641577060931903e-06,
1829
+ "loss": 0.3396,
1830
+ "step": 2370
1831
+ },
1832
+ {
1833
+ "epoch": 18.997995991983966,
1834
+ "eval_accuracy": 0.8675310033821871,
1835
+ "eval_loss": 0.4275255799293518,
1836
+ "eval_runtime": 20.9813,
1837
+ "eval_samples_per_second": 84.551,
1838
+ "eval_steps_per_second": 2.669,
1839
+ "step": 2370
1840
+ },
1841
+ {
1842
+ "epoch": 19.07815631262525,
1843
+ "grad_norm": 12.138285636901855,
1844
+ "learning_rate": 2.2401433691756272e-06,
1845
+ "loss": 0.3464,
1846
+ "step": 2380
1847
+ },
1848
+ {
1849
+ "epoch": 19.158316633266534,
1850
+ "grad_norm": 15.26742172241211,
1851
+ "learning_rate": 2.0161290322580646e-06,
1852
+ "loss": 0.342,
1853
+ "step": 2390
1854
+ },
1855
+ {
1856
+ "epoch": 19.238476953907817,
1857
+ "grad_norm": 11.72075080871582,
1858
+ "learning_rate": 1.7921146953405017e-06,
1859
+ "loss": 0.3159,
1860
+ "step": 2400
1861
+ },
1862
+ {
1863
+ "epoch": 19.318637274549097,
1864
+ "grad_norm": 8.406167984008789,
1865
+ "learning_rate": 1.5681003584229391e-06,
1866
+ "loss": 0.2976,
1867
+ "step": 2410
1868
+ },
1869
+ {
1870
+ "epoch": 19.39879759519038,
1871
+ "grad_norm": 9.558813095092773,
1872
+ "learning_rate": 1.3440860215053765e-06,
1873
+ "loss": 0.3713,
1874
+ "step": 2420
1875
+ },
1876
+ {
1877
+ "epoch": 19.478957915831664,
1878
+ "grad_norm": 12.249772071838379,
1879
+ "learning_rate": 1.1200716845878136e-06,
1880
+ "loss": 0.3353,
1881
+ "step": 2430
1882
+ },
1883
+ {
1884
+ "epoch": 19.559118236472948,
1885
+ "grad_norm": 14.776098251342773,
1886
+ "learning_rate": 8.960573476702509e-07,
1887
+ "loss": 0.3507,
1888
+ "step": 2440
1889
+ },
1890
+ {
1891
+ "epoch": 19.639278557114228,
1892
+ "grad_norm": 12.802459716796875,
1893
+ "learning_rate": 6.720430107526882e-07,
1894
+ "loss": 0.3144,
1895
+ "step": 2450
1896
+ },
1897
+ {
1898
+ "epoch": 19.71943887775551,
1899
+ "grad_norm": 11.719756126403809,
1900
+ "learning_rate": 4.4802867383512544e-07,
1901
+ "loss": 0.3283,
1902
+ "step": 2460
1903
+ },
1904
+ {
1905
+ "epoch": 19.799599198396795,
1906
+ "grad_norm": 11.714187622070312,
1907
+ "learning_rate": 2.2401433691756272e-07,
1908
+ "loss": 0.3198,
1909
+ "step": 2470
1910
+ },
1911
+ {
1912
+ "epoch": 19.879759519038075,
1913
+ "grad_norm": 12.341629028320312,
1914
+ "learning_rate": 0.0,
1915
+ "loss": 0.3057,
1916
+ "step": 2480
1917
+ },
1918
+ {
1919
+ "epoch": 19.879759519038075,
1920
+ "eval_accuracy": 0.8703494926719278,
1921
+ "eval_loss": 0.4261245131492615,
1922
+ "eval_runtime": 20.7326,
1923
+ "eval_samples_per_second": 85.566,
1924
+ "eval_steps_per_second": 2.701,
1925
+ "step": 2480
1926
+ },
1927
+ {
1928
+ "epoch": 19.879759519038075,
1929
+ "step": 2480,
1930
+ "total_flos": 7.988705158075343e+18,
1931
+ "train_loss": 0.5653726263392356,
1932
+ "train_runtime": 5886.8979,
1933
+ "train_samples_per_second": 54.229,
1934
+ "train_steps_per_second": 0.421
1935
  }
1936
  ],
1937
  "logging_steps": 10,
1938
+ "max_steps": 2480,
1939
  "num_input_tokens_seen": 0,
1940
+ "num_train_epochs": 20,
1941
  "save_steps": 500,
1942
+ "total_flos": 7.988705158075343e+18,
1943
  "train_batch_size": 32,
1944
  "trial_name": null,
1945
  "trial_params": null