siddharth963 commited on
Commit
6d01da9
1 Parent(s): e7b3a7d

End of training

Browse files
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_accuracy": 0.8761682242990654,
4
- "eval_loss": 0.38753223419189453,
5
- "eval_runtime": 92.4939,
6
- "eval_samples_per_second": 46.273,
7
- "eval_steps_per_second": 5.784,
8
- "total_flos": 1.3264660513609667e+19,
9
- "train_loss": 0.36025063641717503,
10
- "train_runtime": 7147.2488,
11
- "train_samples_per_second": 23.949,
12
- "train_steps_per_second": 0.749
13
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.8705607476635514,
4
+ "eval_loss": 0.3742366135120392,
5
+ "eval_runtime": 43.3876,
6
+ "eval_samples_per_second": 49.323,
7
+ "eval_steps_per_second": 1.544,
8
+ "total_flos": 1.4918616518411741e+19,
9
+ "train_loss": 0.4060697093009949,
10
+ "train_runtime": 6764.8563,
11
+ "train_samples_per_second": 28.466,
12
+ "train_steps_per_second": 0.222
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_accuracy": 0.8761682242990654,
4
- "eval_loss": 0.38753223419189453,
5
- "eval_runtime": 92.4939,
6
- "eval_samples_per_second": 46.273,
7
- "eval_steps_per_second": 5.784
8
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.8705607476635514,
4
+ "eval_loss": 0.3742366135120392,
5
+ "eval_runtime": 43.3876,
6
+ "eval_samples_per_second": 49.323,
7
+ "eval_steps_per_second": 1.544
8
  }
runs/Oct29_06-12-06_ddfe0230610b/events.out.tfevents.1667031626.ddfe0230610b.17.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:769c6fe8669e6bb736e79135e8df32945b0e27eec7d942ad4e06d3f0449edcf5
3
+ size 363
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
- "total_flos": 1.3264660513609667e+19,
4
- "train_loss": 0.36025063641717503,
5
- "train_runtime": 7147.2488,
6
- "train_samples_per_second": 23.949,
7
- "train_steps_per_second": 0.749
8
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "total_flos": 1.4918616518411741e+19,
4
+ "train_loss": 0.4060697093009949,
5
+ "train_runtime": 6764.8563,
6
+ "train_samples_per_second": 28.466,
7
+ "train_steps_per_second": 0.222
8
  }
trainer_state.json CHANGED
@@ -1,3334 +1,1015 @@
1
  {
2
- "best_metric": 0.8761682242990654,
3
- "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-cassava/checkpoint-4280",
4
- "epoch": 10.0,
5
- "global_step": 5350,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.02,
12
- "learning_rate": 9.345794392523364e-07,
13
- "loss": 1.6171,
14
  "step": 10
15
  },
16
  {
17
- "epoch": 0.04,
18
- "learning_rate": 1.8691588785046728e-06,
19
- "loss": 1.6057,
20
  "step": 20
21
  },
22
  {
23
- "epoch": 0.06,
24
- "learning_rate": 2.8037383177570094e-06,
25
- "loss": 1.5864,
26
  "step": 30
27
  },
28
  {
29
- "epoch": 0.07,
30
- "learning_rate": 3.7383177570093455e-06,
31
- "loss": 1.5484,
32
  "step": 40
33
  },
34
  {
35
- "epoch": 0.09,
36
- "learning_rate": 4.6728971962616825e-06,
37
- "loss": 1.4915,
38
  "step": 50
39
  },
40
  {
41
- "epoch": 0.11,
42
- "learning_rate": 5.607476635514019e-06,
43
- "loss": 1.4295,
44
  "step": 60
45
  },
46
  {
47
- "epoch": 0.13,
48
- "learning_rate": 6.542056074766355e-06,
49
- "loss": 1.306,
50
  "step": 70
51
  },
52
  {
53
- "epoch": 0.15,
54
- "learning_rate": 7.476635514018691e-06,
55
- "loss": 1.2262,
56
  "step": 80
57
  },
58
  {
59
- "epoch": 0.17,
60
- "learning_rate": 8.411214953271028e-06,
61
- "loss": 1.1393,
62
  "step": 90
63
  },
64
  {
65
- "epoch": 0.19,
66
- "learning_rate": 9.345794392523365e-06,
67
- "loss": 1.0983,
68
  "step": 100
69
  },
70
  {
71
- "epoch": 0.21,
72
- "learning_rate": 1.02803738317757e-05,
73
- "loss": 1.1016,
74
  "step": 110
75
  },
76
  {
77
- "epoch": 0.22,
78
- "learning_rate": 1.1214953271028037e-05,
79
- "loss": 1.0129,
80
  "step": 120
81
  },
82
  {
83
- "epoch": 0.24,
84
- "learning_rate": 1.2149532710280374e-05,
85
- "loss": 1.0331,
86
  "step": 130
87
  },
88
  {
89
- "epoch": 0.26,
90
- "learning_rate": 1.308411214953271e-05,
91
- "loss": 0.9059,
92
  "step": 140
93
  },
94
  {
95
- "epoch": 0.28,
96
- "learning_rate": 1.4018691588785047e-05,
97
- "loss": 0.9411,
 
 
 
 
 
 
 
 
 
98
  "step": 150
99
  },
100
  {
101
- "epoch": 0.3,
102
- "learning_rate": 1.4953271028037382e-05,
103
- "loss": 0.8249,
104
  "step": 160
105
  },
106
  {
107
- "epoch": 0.32,
108
- "learning_rate": 1.588785046728972e-05,
109
- "loss": 0.8729,
110
  "step": 170
111
  },
112
  {
113
- "epoch": 0.34,
114
- "learning_rate": 1.6822429906542056e-05,
115
- "loss": 0.8896,
116
  "step": 180
117
  },
118
  {
119
- "epoch": 0.36,
120
- "learning_rate": 1.775700934579439e-05,
121
- "loss": 0.8313,
122
  "step": 190
123
  },
124
  {
125
- "epoch": 0.37,
126
- "learning_rate": 1.869158878504673e-05,
127
- "loss": 0.7836,
128
  "step": 200
129
  },
130
  {
131
- "epoch": 0.39,
132
- "learning_rate": 1.9626168224299065e-05,
133
- "loss": 0.7587,
134
  "step": 210
135
  },
136
  {
137
- "epoch": 0.41,
138
- "learning_rate": 2.05607476635514e-05,
139
- "loss": 0.6824,
140
  "step": 220
141
  },
142
  {
143
- "epoch": 0.43,
144
- "learning_rate": 2.149532710280374e-05,
145
- "loss": 0.7291,
146
  "step": 230
147
  },
148
  {
149
- "epoch": 0.45,
150
- "learning_rate": 2.2429906542056075e-05,
151
- "loss": 0.7577,
152
  "step": 240
153
  },
154
  {
155
- "epoch": 0.47,
156
- "learning_rate": 2.3364485981308414e-05,
157
- "loss": 0.7538,
158
  "step": 250
159
  },
160
  {
161
- "epoch": 0.49,
162
- "learning_rate": 2.429906542056075e-05,
163
- "loss": 0.6234,
164
  "step": 260
165
  },
166
  {
167
- "epoch": 0.5,
168
- "learning_rate": 2.5233644859813084e-05,
169
- "loss": 0.6444,
170
  "step": 270
171
  },
172
  {
173
- "epoch": 0.52,
174
- "learning_rate": 2.616822429906542e-05,
175
- "loss": 0.6348,
176
  "step": 280
177
  },
178
  {
179
- "epoch": 0.54,
180
- "learning_rate": 2.7102803738317755e-05,
181
- "loss": 0.6086,
182
  "step": 290
183
  },
184
  {
185
- "epoch": 0.56,
186
- "learning_rate": 2.8037383177570094e-05,
187
- "loss": 0.6258,
 
 
 
 
 
 
 
 
 
188
  "step": 300
189
  },
190
  {
191
- "epoch": 0.58,
192
- "learning_rate": 2.897196261682243e-05,
193
- "loss": 0.6055,
194
  "step": 310
195
  },
196
  {
197
- "epoch": 0.6,
198
- "learning_rate": 2.9906542056074764e-05,
199
- "loss": 0.5749,
200
  "step": 320
201
  },
202
  {
203
- "epoch": 0.62,
204
- "learning_rate": 3.08411214953271e-05,
205
- "loss": 0.595,
206
  "step": 330
207
  },
208
  {
209
- "epoch": 0.64,
210
- "learning_rate": 3.177570093457944e-05,
211
- "loss": 0.541,
212
  "step": 340
213
  },
214
  {
215
- "epoch": 0.65,
216
- "learning_rate": 3.2710280373831774e-05,
217
- "loss": 0.597,
218
  "step": 350
219
  },
220
  {
221
- "epoch": 0.67,
222
- "learning_rate": 3.364485981308411e-05,
223
- "loss": 0.465,
224
  "step": 360
225
  },
226
  {
227
- "epoch": 0.69,
228
- "learning_rate": 3.457943925233645e-05,
229
- "loss": 0.5728,
230
  "step": 370
231
  },
232
  {
233
- "epoch": 0.71,
234
- "learning_rate": 3.551401869158878e-05,
235
- "loss": 0.5068,
236
  "step": 380
237
  },
238
  {
239
- "epoch": 0.73,
240
- "learning_rate": 3.644859813084112e-05,
241
- "loss": 0.5928,
242
  "step": 390
243
  },
244
  {
245
- "epoch": 0.75,
246
- "learning_rate": 3.738317757009346e-05,
247
- "loss": 0.5488,
248
  "step": 400
249
  },
250
  {
251
- "epoch": 0.77,
252
- "learning_rate": 3.831775700934579e-05,
253
- "loss": 0.5565,
254
  "step": 410
255
  },
256
  {
257
- "epoch": 0.79,
258
- "learning_rate": 3.925233644859813e-05,
259
- "loss": 0.5147,
260
  "step": 420
261
  },
262
  {
263
- "epoch": 0.8,
264
- "learning_rate": 4.018691588785047e-05,
265
- "loss": 0.5709,
266
  "step": 430
267
  },
268
  {
269
- "epoch": 0.82,
270
- "learning_rate": 4.11214953271028e-05,
271
- "loss": 0.5086,
272
  "step": 440
273
  },
274
  {
275
- "epoch": 0.84,
276
- "learning_rate": 4.205607476635514e-05,
277
- "loss": 0.4941,
278
  "step": 450
279
  },
280
  {
281
- "epoch": 0.86,
282
- "learning_rate": 4.299065420560748e-05,
283
- "loss": 0.492,
 
 
 
 
 
 
 
 
 
284
  "step": 460
285
  },
286
  {
287
- "epoch": 0.88,
288
- "learning_rate": 4.392523364485982e-05,
289
- "loss": 0.522,
290
  "step": 470
291
  },
292
  {
293
- "epoch": 0.9,
294
- "learning_rate": 4.485981308411215e-05,
295
- "loss": 0.5502,
296
  "step": 480
297
  },
298
  {
299
- "epoch": 0.92,
300
- "learning_rate": 4.579439252336449e-05,
301
- "loss": 0.5002,
302
  "step": 490
303
  },
304
  {
305
- "epoch": 0.93,
306
- "learning_rate": 4.672897196261683e-05,
307
- "loss": 0.5522,
308
  "step": 500
309
  },
310
  {
311
- "epoch": 0.95,
312
- "learning_rate": 4.766355140186916e-05,
313
- "loss": 0.5166,
314
  "step": 510
315
  },
316
  {
317
- "epoch": 0.97,
318
- "learning_rate": 4.85981308411215e-05,
319
- "loss": 0.5825,
320
  "step": 520
321
  },
322
  {
323
- "epoch": 0.99,
324
- "learning_rate": 4.9532710280373836e-05,
325
- "loss": 0.5531,
326
  "step": 530
327
  },
328
  {
329
- "epoch": 1.0,
330
- "eval_accuracy": 0.8336448598130841,
331
- "eval_loss": 0.4937918782234192,
332
- "eval_runtime": 96.0141,
333
- "eval_samples_per_second": 44.577,
334
- "eval_steps_per_second": 5.572,
335
- "step": 535
336
- },
337
- {
338
- "epoch": 1.01,
339
- "learning_rate": 4.994807892004154e-05,
340
- "loss": 0.491,
341
  "step": 540
342
  },
343
  {
344
- "epoch": 1.03,
345
- "learning_rate": 4.9844236760124614e-05,
346
- "loss": 0.4679,
347
  "step": 550
348
  },
349
  {
350
- "epoch": 1.05,
351
- "learning_rate": 4.974039460020769e-05,
352
- "loss": 0.4584,
353
  "step": 560
354
  },
355
  {
356
- "epoch": 1.07,
357
- "learning_rate": 4.963655244029076e-05,
358
- "loss": 0.5556,
359
  "step": 570
360
  },
361
  {
362
- "epoch": 1.08,
363
- "learning_rate": 4.9532710280373836e-05,
364
- "loss": 0.4374,
365
  "step": 580
366
  },
367
  {
368
- "epoch": 1.1,
369
- "learning_rate": 4.9428868120456904e-05,
370
- "loss": 0.4499,
371
  "step": 590
372
  },
373
  {
374
- "epoch": 1.12,
375
- "learning_rate": 4.9325025960539985e-05,
376
- "loss": 0.5201,
 
 
 
 
 
 
 
 
 
377
  "step": 600
378
  },
379
  {
380
- "epoch": 1.14,
381
- "learning_rate": 4.922118380062305e-05,
382
- "loss": 0.6573,
383
  "step": 610
384
  },
385
  {
386
- "epoch": 1.16,
387
- "learning_rate": 4.9117341640706127e-05,
388
- "loss": 0.5029,
389
  "step": 620
390
  },
391
  {
392
- "epoch": 1.18,
393
- "learning_rate": 4.901349948078921e-05,
394
- "loss": 0.5457,
395
  "step": 630
396
  },
397
  {
398
- "epoch": 1.2,
399
- "learning_rate": 4.8909657320872275e-05,
400
- "loss": 0.492,
401
  "step": 640
402
  },
403
  {
404
- "epoch": 1.21,
405
- "learning_rate": 4.880581516095535e-05,
406
- "loss": 0.3407,
407
  "step": 650
408
  },
409
  {
410
- "epoch": 1.23,
411
- "learning_rate": 4.8701973001038423e-05,
412
- "loss": 0.5129,
413
  "step": 660
414
  },
415
  {
416
- "epoch": 1.25,
417
- "learning_rate": 4.85981308411215e-05,
418
- "loss": 0.4626,
419
  "step": 670
420
  },
421
  {
422
- "epoch": 1.27,
423
- "learning_rate": 4.849428868120457e-05,
424
- "loss": 0.5528,
425
  "step": 680
426
  },
427
  {
428
- "epoch": 1.29,
429
- "learning_rate": 4.8390446521287646e-05,
430
- "loss": 0.4977,
431
  "step": 690
432
  },
433
  {
434
- "epoch": 1.31,
435
- "learning_rate": 4.828660436137072e-05,
436
- "loss": 0.387,
437
  "step": 700
438
  },
439
  {
440
- "epoch": 1.33,
441
- "learning_rate": 4.818276220145379e-05,
442
- "loss": 0.5349,
443
  "step": 710
444
  },
445
  {
446
- "epoch": 1.35,
447
- "learning_rate": 4.807892004153687e-05,
448
- "loss": 0.4265,
449
  "step": 720
450
  },
451
  {
452
- "epoch": 1.36,
453
- "learning_rate": 4.797507788161994e-05,
454
- "loss": 0.4719,
455
  "step": 730
456
  },
457
  {
458
- "epoch": 1.38,
459
- "learning_rate": 4.787123572170301e-05,
460
- "loss": 0.5005,
461
  "step": 740
462
  },
463
  {
464
- "epoch": 1.4,
465
- "learning_rate": 4.776739356178609e-05,
466
- "loss": 0.5112,
 
 
 
 
 
 
 
 
 
467
  "step": 750
468
  },
469
  {
470
- "epoch": 1.42,
471
- "learning_rate": 4.766355140186916e-05,
472
- "loss": 0.5112,
473
  "step": 760
474
  },
475
  {
476
- "epoch": 1.44,
477
- "learning_rate": 4.755970924195223e-05,
478
- "loss": 0.3827,
479
  "step": 770
480
  },
481
  {
482
- "epoch": 1.46,
483
- "learning_rate": 4.745586708203531e-05,
484
- "loss": 0.5576,
485
  "step": 780
486
  },
487
  {
488
- "epoch": 1.48,
489
- "learning_rate": 4.735202492211838e-05,
490
- "loss": 0.3974,
491
  "step": 790
492
  },
493
  {
494
- "epoch": 1.5,
495
- "learning_rate": 4.7248182762201456e-05,
496
- "loss": 0.5209,
497
  "step": 800
498
  },
499
  {
500
- "epoch": 1.51,
501
- "learning_rate": 4.714434060228453e-05,
502
- "loss": 0.4721,
503
  "step": 810
504
  },
505
  {
506
- "epoch": 1.53,
507
- "learning_rate": 4.7040498442367604e-05,
508
- "loss": 0.385,
509
  "step": 820
510
  },
511
  {
512
- "epoch": 1.55,
513
- "learning_rate": 4.693665628245067e-05,
514
- "loss": 0.4874,
515
  "step": 830
516
  },
517
  {
518
- "epoch": 1.57,
519
- "learning_rate": 4.683281412253375e-05,
520
- "loss": 0.4409,
521
  "step": 840
522
  },
523
  {
524
- "epoch": 1.59,
525
- "learning_rate": 4.672897196261683e-05,
526
- "loss": 0.506,
527
  "step": 850
528
  },
529
  {
530
- "epoch": 1.61,
531
- "learning_rate": 4.6625129802699895e-05,
532
- "loss": 0.4446,
533
  "step": 860
534
  },
535
  {
536
- "epoch": 1.63,
537
- "learning_rate": 4.6521287642782976e-05,
538
- "loss": 0.4086,
539
  "step": 870
540
  },
541
  {
542
- "epoch": 1.64,
543
- "learning_rate": 4.641744548286604e-05,
544
- "loss": 0.3869,
545
  "step": 880
546
  },
547
  {
548
- "epoch": 1.66,
549
- "learning_rate": 4.631360332294912e-05,
550
- "loss": 0.395,
551
  "step": 890
552
  },
553
  {
554
- "epoch": 1.68,
555
- "learning_rate": 4.620976116303219e-05,
556
- "loss": 0.464,
 
 
 
 
 
 
 
 
 
557
  "step": 900
558
  },
559
  {
560
- "epoch": 1.7,
561
- "learning_rate": 4.6105919003115266e-05,
562
- "loss": 0.3581,
563
  "step": 910
564
  },
565
  {
566
- "epoch": 1.72,
567
- "learning_rate": 4.600207684319834e-05,
568
- "loss": 0.387,
569
  "step": 920
570
  },
571
  {
572
- "epoch": 1.74,
573
- "learning_rate": 4.5898234683281414e-05,
574
- "loss": 0.4066,
575
  "step": 930
576
  },
577
  {
578
- "epoch": 1.76,
579
- "learning_rate": 4.579439252336449e-05,
580
- "loss": 0.3997,
581
  "step": 940
582
  },
583
  {
584
- "epoch": 1.78,
585
- "learning_rate": 4.569055036344756e-05,
586
- "loss": 0.4705,
587
  "step": 950
588
  },
589
  {
590
- "epoch": 1.79,
591
- "learning_rate": 4.558670820353064e-05,
592
- "loss": 0.4063,
593
  "step": 960
594
  },
595
  {
596
- "epoch": 1.81,
597
- "learning_rate": 4.548286604361371e-05,
598
- "loss": 0.4227,
599
  "step": 970
600
  },
601
  {
602
- "epoch": 1.83,
603
- "learning_rate": 4.537902388369678e-05,
604
- "loss": 0.4844,
605
  "step": 980
606
  },
607
  {
608
- "epoch": 1.85,
609
- "learning_rate": 4.527518172377986e-05,
610
- "loss": 0.4631,
611
  "step": 990
612
  },
613
  {
614
- "epoch": 1.87,
615
- "learning_rate": 4.517133956386293e-05,
616
- "loss": 0.4847,
617
  "step": 1000
618
  },
619
  {
620
- "epoch": 1.89,
621
- "learning_rate": 4.5067497403946e-05,
622
- "loss": 0.4112,
623
  "step": 1010
624
  },
625
  {
626
- "epoch": 1.91,
627
- "learning_rate": 4.496365524402908e-05,
628
- "loss": 0.3964,
629
  "step": 1020
630
  },
631
  {
632
- "epoch": 1.93,
633
- "learning_rate": 4.485981308411215e-05,
634
- "loss": 0.4341,
635
  "step": 1030
636
  },
637
  {
638
- "epoch": 1.94,
639
- "learning_rate": 4.4755970924195224e-05,
640
- "loss": 0.5106,
641
  "step": 1040
642
  },
643
  {
644
- "epoch": 1.96,
645
- "learning_rate": 4.46521287642783e-05,
646
- "loss": 0.4269,
647
  "step": 1050
648
  },
649
  {
650
- "epoch": 1.98,
651
- "learning_rate": 4.454828660436137e-05,
652
- "loss": 0.413,
653
- "step": 1060
 
 
 
654
  },
655
  {
656
- "epoch": 2.0,
657
- "learning_rate": 4.4444444444444447e-05,
658
- "loss": 0.4139,
659
- "step": 1070
660
  },
661
  {
662
- "epoch": 2.0,
663
- "eval_accuracy": 0.861214953271028,
664
- "eval_loss": 0.40707165002822876,
665
- "eval_runtime": 99.8719,
666
- "eval_samples_per_second": 42.855,
667
- "eval_steps_per_second": 5.357,
668
  "step": 1070
669
  },
670
  {
671
- "epoch": 2.02,
672
- "learning_rate": 4.434060228452752e-05,
673
- "loss": 0.395,
674
  "step": 1080
675
  },
676
  {
677
- "epoch": 2.04,
678
- "learning_rate": 4.4236760124610595e-05,
679
- "loss": 0.3672,
680
  "step": 1090
681
  },
682
  {
683
- "epoch": 2.06,
684
- "learning_rate": 4.413291796469366e-05,
685
- "loss": 0.3994,
686
  "step": 1100
687
  },
688
  {
689
- "epoch": 2.07,
690
- "learning_rate": 4.4029075804776743e-05,
691
- "loss": 0.3818,
692
  "step": 1110
693
  },
694
  {
695
- "epoch": 2.09,
696
- "learning_rate": 4.392523364485982e-05,
697
- "loss": 0.4013,
698
  "step": 1120
699
  },
700
  {
701
- "epoch": 2.11,
702
- "learning_rate": 4.3821391484942885e-05,
703
- "loss": 0.3409,
704
  "step": 1130
705
  },
706
  {
707
- "epoch": 2.13,
708
- "learning_rate": 4.3717549325025966e-05,
709
- "loss": 0.4753,
710
  "step": 1140
711
  },
712
  {
713
- "epoch": 2.15,
714
- "learning_rate": 4.3613707165109034e-05,
715
- "loss": 0.3671,
716
  "step": 1150
717
  },
718
  {
719
- "epoch": 2.17,
720
- "learning_rate": 4.350986500519211e-05,
721
- "loss": 0.462,
722
  "step": 1160
723
  },
724
  {
725
- "epoch": 2.19,
726
- "learning_rate": 4.340602284527518e-05,
727
- "loss": 0.3729,
728
  "step": 1170
729
  },
730
  {
731
- "epoch": 2.21,
732
- "learning_rate": 4.3302180685358256e-05,
733
- "loss": 0.3876,
734
  "step": 1180
735
  },
736
  {
737
- "epoch": 2.22,
738
- "learning_rate": 4.319833852544133e-05,
739
- "loss": 0.3794,
740
  "step": 1190
741
  },
742
  {
743
- "epoch": 2.24,
744
- "learning_rate": 4.3094496365524405e-05,
745
- "loss": 0.3609,
 
 
 
 
 
 
 
 
 
746
  "step": 1200
747
  },
748
  {
749
- "epoch": 2.26,
750
- "learning_rate": 4.299065420560748e-05,
751
- "loss": 0.4161,
752
  "step": 1210
753
  },
754
  {
755
- "epoch": 2.28,
756
- "learning_rate": 4.2886812045690546e-05,
757
- "loss": 0.5202,
758
  "step": 1220
759
  },
760
  {
761
- "epoch": 2.3,
762
- "learning_rate": 4.278296988577363e-05,
763
- "loss": 0.4579,
764
  "step": 1230
765
  },
766
  {
767
- "epoch": 2.32,
768
- "learning_rate": 4.26791277258567e-05,
769
- "loss": 0.408,
770
  "step": 1240
771
  },
772
  {
773
- "epoch": 2.34,
774
- "learning_rate": 4.257528556593977e-05,
775
- "loss": 0.36,
776
  "step": 1250
777
  },
778
  {
779
- "epoch": 2.36,
780
- "learning_rate": 4.247144340602285e-05,
781
- "loss": 0.3109,
782
  "step": 1260
783
  },
784
  {
785
- "epoch": 2.37,
786
- "learning_rate": 4.236760124610592e-05,
787
- "loss": 0.3713,
788
  "step": 1270
789
  },
790
  {
791
- "epoch": 2.39,
792
- "learning_rate": 4.226375908618899e-05,
793
- "loss": 0.4329,
794
  "step": 1280
795
  },
796
  {
797
- "epoch": 2.41,
798
- "learning_rate": 4.2159916926272066e-05,
799
- "loss": 0.3612,
800
  "step": 1290
801
  },
802
  {
803
- "epoch": 2.43,
804
- "learning_rate": 4.205607476635514e-05,
805
- "loss": 0.387,
806
  "step": 1300
807
  },
808
  {
809
- "epoch": 2.45,
810
- "learning_rate": 4.1952232606438215e-05,
811
- "loss": 0.4461,
812
  "step": 1310
813
  },
814
  {
815
- "epoch": 2.47,
816
- "learning_rate": 4.184839044652129e-05,
817
- "loss": 0.2903,
818
  "step": 1320
819
  },
820
  {
821
- "epoch": 2.49,
822
- "learning_rate": 4.174454828660436e-05,
823
- "loss": 0.328,
824
  "step": 1330
825
  },
826
  {
827
- "epoch": 2.5,
828
- "learning_rate": 4.164070612668744e-05,
829
- "loss": 0.4151,
830
  "step": 1340
831
  },
832
  {
833
- "epoch": 2.52,
834
- "learning_rate": 4.153686396677051e-05,
835
- "loss": 0.3888,
836
  "step": 1350
837
  },
838
  {
839
- "epoch": 2.54,
840
- "learning_rate": 4.1433021806853586e-05,
841
- "loss": 0.3871,
 
 
 
 
 
 
 
 
 
842
  "step": 1360
843
  },
844
  {
845
- "epoch": 2.56,
846
- "learning_rate": 4.132917964693666e-05,
847
- "loss": 0.4304,
848
  "step": 1370
849
  },
850
  {
851
- "epoch": 2.58,
852
- "learning_rate": 4.1225337487019734e-05,
853
- "loss": 0.387,
854
  "step": 1380
855
  },
856
  {
857
- "epoch": 2.6,
858
- "learning_rate": 4.11214953271028e-05,
859
- "loss": 0.3658,
860
  "step": 1390
861
  },
862
  {
863
- "epoch": 2.62,
864
- "learning_rate": 4.101765316718588e-05,
865
- "loss": 0.4494,
866
  "step": 1400
867
  },
868
  {
869
- "epoch": 2.64,
870
- "learning_rate": 4.091381100726896e-05,
871
- "loss": 0.3355,
872
  "step": 1410
873
  },
874
  {
875
- "epoch": 2.65,
876
- "learning_rate": 4.0809968847352024e-05,
877
- "loss": 0.3368,
878
  "step": 1420
879
  },
880
  {
881
- "epoch": 2.67,
882
- "learning_rate": 4.0706126687435105e-05,
883
- "loss": 0.4087,
884
  "step": 1430
885
  },
886
  {
887
- "epoch": 2.69,
888
- "learning_rate": 4.060228452751817e-05,
889
- "loss": 0.4279,
890
  "step": 1440
891
  },
892
  {
893
- "epoch": 2.71,
894
- "learning_rate": 4.049844236760125e-05,
895
- "loss": 0.3548,
896
  "step": 1450
897
  },
898
  {
899
- "epoch": 2.73,
900
- "learning_rate": 4.039460020768432e-05,
901
- "loss": 0.4466,
902
  "step": 1460
903
  },
904
  {
905
- "epoch": 2.75,
906
- "learning_rate": 4.0290758047767395e-05,
907
- "loss": 0.4133,
908
  "step": 1470
909
  },
910
  {
911
- "epoch": 2.77,
912
- "learning_rate": 4.018691588785047e-05,
913
- "loss": 0.3845,
914
  "step": 1480
915
  },
916
  {
917
- "epoch": 2.79,
918
- "learning_rate": 4.0083073727933544e-05,
919
- "loss": 0.4471,
920
  "step": 1490
921
  },
922
  {
923
- "epoch": 2.8,
924
- "learning_rate": 3.997923156801662e-05,
925
- "loss": 0.4421,
926
  "step": 1500
927
  },
928
- {
929
- "epoch": 2.82,
930
- "learning_rate": 3.987538940809969e-05,
931
- "loss": 0.4383,
932
- "step": 1510
933
- },
934
- {
935
- "epoch": 2.84,
936
- "learning_rate": 3.9771547248182767e-05,
937
- "loss": 0.3326,
938
- "step": 1520
939
- },
940
- {
941
- "epoch": 2.86,
942
- "learning_rate": 3.966770508826584e-05,
943
- "loss": 0.4058,
944
- "step": 1530
945
- },
946
- {
947
- "epoch": 2.88,
948
- "learning_rate": 3.956386292834891e-05,
949
- "loss": 0.3653,
950
- "step": 1540
951
- },
952
- {
953
- "epoch": 2.9,
954
- "learning_rate": 3.946002076843199e-05,
955
- "loss": 0.4157,
956
- "step": 1550
957
- },
958
- {
959
- "epoch": 2.92,
960
- "learning_rate": 3.935617860851506e-05,
961
- "loss": 0.3944,
962
- "step": 1560
963
- },
964
- {
965
- "epoch": 2.93,
966
- "learning_rate": 3.925233644859813e-05,
967
- "loss": 0.4103,
968
- "step": 1570
969
- },
970
- {
971
- "epoch": 2.95,
972
- "learning_rate": 3.914849428868121e-05,
973
- "loss": 0.4601,
974
- "step": 1580
975
- },
976
- {
977
- "epoch": 2.97,
978
- "learning_rate": 3.904465212876428e-05,
979
- "loss": 0.3561,
980
- "step": 1590
981
- },
982
- {
983
- "epoch": 2.99,
984
- "learning_rate": 3.8940809968847354e-05,
985
- "loss": 0.287,
986
- "step": 1600
987
- },
988
- {
989
- "epoch": 3.0,
990
- "eval_accuracy": 0.8642523364485981,
991
- "eval_loss": 0.39544418454170227,
992
- "eval_runtime": 95.8501,
993
- "eval_samples_per_second": 44.653,
994
- "eval_steps_per_second": 5.582,
995
- "step": 1605
996
- },
997
- {
998
- "epoch": 3.01,
999
- "learning_rate": 3.883696780893043e-05,
1000
- "loss": 0.3781,
1001
- "step": 1610
1002
- },
1003
- {
1004
- "epoch": 3.03,
1005
- "learning_rate": 3.87331256490135e-05,
1006
- "loss": 0.3999,
1007
- "step": 1620
1008
- },
1009
- {
1010
- "epoch": 3.05,
1011
- "learning_rate": 3.8629283489096576e-05,
1012
- "loss": 0.3507,
1013
- "step": 1630
1014
- },
1015
- {
1016
- "epoch": 3.07,
1017
- "learning_rate": 3.852544132917965e-05,
1018
- "loss": 0.354,
1019
- "step": 1640
1020
- },
1021
- {
1022
- "epoch": 3.08,
1023
- "learning_rate": 3.8421599169262725e-05,
1024
- "loss": 0.4233,
1025
- "step": 1650
1026
- },
1027
- {
1028
- "epoch": 3.1,
1029
- "learning_rate": 3.831775700934579e-05,
1030
- "loss": 0.3766,
1031
- "step": 1660
1032
- },
1033
- {
1034
- "epoch": 3.12,
1035
- "learning_rate": 3.821391484942887e-05,
1036
- "loss": 0.4494,
1037
- "step": 1670
1038
- },
1039
- {
1040
- "epoch": 3.14,
1041
- "learning_rate": 3.811007268951195e-05,
1042
- "loss": 0.3772,
1043
- "step": 1680
1044
- },
1045
- {
1046
- "epoch": 3.16,
1047
- "learning_rate": 3.8006230529595015e-05,
1048
- "loss": 0.4079,
1049
- "step": 1690
1050
- },
1051
- {
1052
- "epoch": 3.18,
1053
- "learning_rate": 3.7902388369678096e-05,
1054
- "loss": 0.3796,
1055
- "step": 1700
1056
- },
1057
- {
1058
- "epoch": 3.2,
1059
- "learning_rate": 3.779854620976116e-05,
1060
- "loss": 0.2899,
1061
- "step": 1710
1062
- },
1063
- {
1064
- "epoch": 3.21,
1065
- "learning_rate": 3.769470404984424e-05,
1066
- "loss": 0.357,
1067
- "step": 1720
1068
- },
1069
- {
1070
- "epoch": 3.23,
1071
- "learning_rate": 3.759086188992731e-05,
1072
- "loss": 0.3922,
1073
- "step": 1730
1074
- },
1075
- {
1076
- "epoch": 3.25,
1077
- "learning_rate": 3.7487019730010386e-05,
1078
- "loss": 0.3743,
1079
- "step": 1740
1080
- },
1081
- {
1082
- "epoch": 3.27,
1083
- "learning_rate": 3.738317757009346e-05,
1084
- "loss": 0.3275,
1085
- "step": 1750
1086
- },
1087
- {
1088
- "epoch": 3.29,
1089
- "learning_rate": 3.7279335410176535e-05,
1090
- "loss": 0.3128,
1091
- "step": 1760
1092
- },
1093
- {
1094
- "epoch": 3.31,
1095
- "learning_rate": 3.717549325025961e-05,
1096
- "loss": 0.3351,
1097
- "step": 1770
1098
- },
1099
- {
1100
- "epoch": 3.33,
1101
- "learning_rate": 3.7071651090342676e-05,
1102
- "loss": 0.354,
1103
- "step": 1780
1104
- },
1105
- {
1106
- "epoch": 3.35,
1107
- "learning_rate": 3.696780893042576e-05,
1108
- "loss": 0.3604,
1109
- "step": 1790
1110
- },
1111
- {
1112
- "epoch": 3.36,
1113
- "learning_rate": 3.686396677050883e-05,
1114
- "loss": 0.389,
1115
- "step": 1800
1116
- },
1117
- {
1118
- "epoch": 3.38,
1119
- "learning_rate": 3.67601246105919e-05,
1120
- "loss": 0.3892,
1121
- "step": 1810
1122
- },
1123
- {
1124
- "epoch": 3.4,
1125
- "learning_rate": 3.665628245067498e-05,
1126
- "loss": 0.3525,
1127
- "step": 1820
1128
- },
1129
- {
1130
- "epoch": 3.42,
1131
- "learning_rate": 3.655244029075805e-05,
1132
- "loss": 0.3725,
1133
- "step": 1830
1134
- },
1135
- {
1136
- "epoch": 3.44,
1137
- "learning_rate": 3.644859813084112e-05,
1138
- "loss": 0.3909,
1139
- "step": 1840
1140
- },
1141
- {
1142
- "epoch": 3.46,
1143
- "learning_rate": 3.6344755970924196e-05,
1144
- "loss": 0.2884,
1145
- "step": 1850
1146
- },
1147
- {
1148
- "epoch": 3.48,
1149
- "learning_rate": 3.624091381100727e-05,
1150
- "loss": 0.3384,
1151
- "step": 1860
1152
- },
1153
- {
1154
- "epoch": 3.5,
1155
- "learning_rate": 3.6137071651090344e-05,
1156
- "loss": 0.3645,
1157
- "step": 1870
1158
- },
1159
- {
1160
- "epoch": 3.51,
1161
- "learning_rate": 3.603322949117342e-05,
1162
- "loss": 0.3407,
1163
- "step": 1880
1164
- },
1165
- {
1166
- "epoch": 3.53,
1167
- "learning_rate": 3.592938733125649e-05,
1168
- "loss": 0.3847,
1169
- "step": 1890
1170
- },
1171
- {
1172
- "epoch": 3.55,
1173
- "learning_rate": 3.582554517133957e-05,
1174
- "loss": 0.3862,
1175
- "step": 1900
1176
- },
1177
- {
1178
- "epoch": 3.57,
1179
- "learning_rate": 3.572170301142264e-05,
1180
- "loss": 0.3831,
1181
- "step": 1910
1182
- },
1183
- {
1184
- "epoch": 3.59,
1185
- "learning_rate": 3.5617860851505715e-05,
1186
- "loss": 0.3147,
1187
- "step": 1920
1188
- },
1189
- {
1190
- "epoch": 3.61,
1191
- "learning_rate": 3.551401869158878e-05,
1192
- "loss": 0.3686,
1193
- "step": 1930
1194
- },
1195
- {
1196
- "epoch": 3.63,
1197
- "learning_rate": 3.5410176531671864e-05,
1198
- "loss": 0.3913,
1199
- "step": 1940
1200
- },
1201
- {
1202
- "epoch": 3.64,
1203
- "learning_rate": 3.530633437175493e-05,
1204
- "loss": 0.3717,
1205
- "step": 1950
1206
- },
1207
- {
1208
- "epoch": 3.66,
1209
- "learning_rate": 3.5202492211838006e-05,
1210
- "loss": 0.4135,
1211
- "step": 1960
1212
- },
1213
- {
1214
- "epoch": 3.68,
1215
- "learning_rate": 3.5098650051921087e-05,
1216
- "loss": 0.372,
1217
- "step": 1970
1218
- },
1219
- {
1220
- "epoch": 3.7,
1221
- "learning_rate": 3.4994807892004154e-05,
1222
- "loss": 0.3738,
1223
- "step": 1980
1224
- },
1225
- {
1226
- "epoch": 3.72,
1227
- "learning_rate": 3.489096573208723e-05,
1228
- "loss": 0.3847,
1229
- "step": 1990
1230
- },
1231
- {
1232
- "epoch": 3.74,
1233
- "learning_rate": 3.47871235721703e-05,
1234
- "loss": 0.3893,
1235
- "step": 2000
1236
- },
1237
- {
1238
- "epoch": 3.76,
1239
- "learning_rate": 3.468328141225338e-05,
1240
- "loss": 0.3045,
1241
- "step": 2010
1242
- },
1243
- {
1244
- "epoch": 3.78,
1245
- "learning_rate": 3.457943925233645e-05,
1246
- "loss": 0.3242,
1247
- "step": 2020
1248
- },
1249
- {
1250
- "epoch": 3.79,
1251
- "learning_rate": 3.4475597092419525e-05,
1252
- "loss": 0.3417,
1253
- "step": 2030
1254
- },
1255
- {
1256
- "epoch": 3.81,
1257
- "learning_rate": 3.43717549325026e-05,
1258
- "loss": 0.3568,
1259
- "step": 2040
1260
- },
1261
- {
1262
- "epoch": 3.83,
1263
- "learning_rate": 3.426791277258567e-05,
1264
- "loss": 0.3225,
1265
- "step": 2050
1266
- },
1267
- {
1268
- "epoch": 3.85,
1269
- "learning_rate": 3.416407061266875e-05,
1270
- "loss": 0.3341,
1271
- "step": 2060
1272
- },
1273
- {
1274
- "epoch": 3.87,
1275
- "learning_rate": 3.406022845275182e-05,
1276
- "loss": 0.326,
1277
- "step": 2070
1278
- },
1279
- {
1280
- "epoch": 3.89,
1281
- "learning_rate": 3.395638629283489e-05,
1282
- "loss": 0.3508,
1283
- "step": 2080
1284
- },
1285
- {
1286
- "epoch": 3.91,
1287
- "learning_rate": 3.385254413291797e-05,
1288
- "loss": 0.333,
1289
- "step": 2090
1290
- },
1291
- {
1292
- "epoch": 3.93,
1293
- "learning_rate": 3.374870197300104e-05,
1294
- "loss": 0.4373,
1295
- "step": 2100
1296
- },
1297
- {
1298
- "epoch": 3.94,
1299
- "learning_rate": 3.364485981308411e-05,
1300
- "loss": 0.3769,
1301
- "step": 2110
1302
- },
1303
- {
1304
- "epoch": 3.96,
1305
- "learning_rate": 3.3541017653167186e-05,
1306
- "loss": 0.3697,
1307
- "step": 2120
1308
- },
1309
- {
1310
- "epoch": 3.98,
1311
- "learning_rate": 3.343717549325026e-05,
1312
- "loss": 0.3187,
1313
- "step": 2130
1314
- },
1315
- {
1316
- "epoch": 4.0,
1317
- "learning_rate": 3.3333333333333335e-05,
1318
- "loss": 0.4211,
1319
- "step": 2140
1320
- },
1321
- {
1322
- "epoch": 4.0,
1323
- "eval_accuracy": 0.8700934579439252,
1324
- "eval_loss": 0.39060959219932556,
1325
- "eval_runtime": 93.1195,
1326
- "eval_samples_per_second": 45.962,
1327
- "eval_steps_per_second": 5.745,
1328
- "step": 2140
1329
- },
1330
- {
1331
- "epoch": 4.02,
1332
- "learning_rate": 3.322949117341641e-05,
1333
- "loss": 0.2857,
1334
- "step": 2150
1335
- },
1336
- {
1337
- "epoch": 4.04,
1338
- "learning_rate": 3.3125649013499483e-05,
1339
- "loss": 0.3271,
1340
- "step": 2160
1341
- },
1342
- {
1343
- "epoch": 4.06,
1344
- "learning_rate": 3.302180685358255e-05,
1345
- "loss": 0.3109,
1346
- "step": 2170
1347
- },
1348
- {
1349
- "epoch": 4.07,
1350
- "learning_rate": 3.291796469366563e-05,
1351
- "loss": 0.2679,
1352
- "step": 2180
1353
- },
1354
- {
1355
- "epoch": 4.09,
1356
- "learning_rate": 3.2814122533748706e-05,
1357
- "loss": 0.3242,
1358
- "step": 2190
1359
- },
1360
- {
1361
- "epoch": 4.11,
1362
- "learning_rate": 3.2710280373831774e-05,
1363
- "loss": 0.3083,
1364
- "step": 2200
1365
- },
1366
- {
1367
- "epoch": 4.13,
1368
- "learning_rate": 3.2606438213914855e-05,
1369
- "loss": 0.3325,
1370
- "step": 2210
1371
- },
1372
- {
1373
- "epoch": 4.15,
1374
- "learning_rate": 3.250259605399792e-05,
1375
- "loss": 0.3989,
1376
- "step": 2220
1377
- },
1378
- {
1379
- "epoch": 4.17,
1380
- "learning_rate": 3.2398753894080996e-05,
1381
- "loss": 0.3044,
1382
- "step": 2230
1383
- },
1384
- {
1385
- "epoch": 4.19,
1386
- "learning_rate": 3.229491173416408e-05,
1387
- "loss": 0.3389,
1388
- "step": 2240
1389
- },
1390
- {
1391
- "epoch": 4.21,
1392
- "learning_rate": 3.2191069574247145e-05,
1393
- "loss": 0.3284,
1394
- "step": 2250
1395
- },
1396
- {
1397
- "epoch": 4.22,
1398
- "learning_rate": 3.208722741433022e-05,
1399
- "loss": 0.2777,
1400
- "step": 2260
1401
- },
1402
- {
1403
- "epoch": 4.24,
1404
- "learning_rate": 3.198338525441329e-05,
1405
- "loss": 0.3531,
1406
- "step": 2270
1407
- },
1408
- {
1409
- "epoch": 4.26,
1410
- "learning_rate": 3.187954309449637e-05,
1411
- "loss": 0.3578,
1412
- "step": 2280
1413
- },
1414
- {
1415
- "epoch": 4.28,
1416
- "learning_rate": 3.177570093457944e-05,
1417
- "loss": 0.375,
1418
- "step": 2290
1419
- },
1420
- {
1421
- "epoch": 4.3,
1422
- "learning_rate": 3.1671858774662516e-05,
1423
- "loss": 0.3912,
1424
- "step": 2300
1425
- },
1426
- {
1427
- "epoch": 4.32,
1428
- "learning_rate": 3.156801661474559e-05,
1429
- "loss": 0.3632,
1430
- "step": 2310
1431
- },
1432
- {
1433
- "epoch": 4.34,
1434
- "learning_rate": 3.146417445482866e-05,
1435
- "loss": 0.3453,
1436
- "step": 2320
1437
- },
1438
- {
1439
- "epoch": 4.36,
1440
- "learning_rate": 3.136033229491174e-05,
1441
- "loss": 0.3412,
1442
- "step": 2330
1443
- },
1444
- {
1445
- "epoch": 4.37,
1446
- "learning_rate": 3.1256490134994806e-05,
1447
- "loss": 0.3506,
1448
- "step": 2340
1449
- },
1450
- {
1451
- "epoch": 4.39,
1452
- "learning_rate": 3.115264797507788e-05,
1453
- "loss": 0.3178,
1454
- "step": 2350
1455
- },
1456
- {
1457
- "epoch": 4.41,
1458
- "learning_rate": 3.104880581516096e-05,
1459
- "loss": 0.3787,
1460
- "step": 2360
1461
- },
1462
- {
1463
- "epoch": 4.43,
1464
- "learning_rate": 3.094496365524403e-05,
1465
- "loss": 0.2522,
1466
- "step": 2370
1467
- },
1468
- {
1469
- "epoch": 4.45,
1470
- "learning_rate": 3.08411214953271e-05,
1471
- "loss": 0.2986,
1472
- "step": 2380
1473
- },
1474
- {
1475
- "epoch": 4.47,
1476
- "learning_rate": 3.073727933541018e-05,
1477
- "loss": 0.3647,
1478
- "step": 2390
1479
- },
1480
- {
1481
- "epoch": 4.49,
1482
- "learning_rate": 3.063343717549325e-05,
1483
- "loss": 0.2927,
1484
- "step": 2400
1485
- },
1486
- {
1487
- "epoch": 4.5,
1488
- "learning_rate": 3.0529595015576326e-05,
1489
- "loss": 0.3868,
1490
- "step": 2410
1491
- },
1492
- {
1493
- "epoch": 4.52,
1494
- "learning_rate": 3.04257528556594e-05,
1495
- "loss": 0.2927,
1496
- "step": 2420
1497
- },
1498
- {
1499
- "epoch": 4.54,
1500
- "learning_rate": 3.0321910695742474e-05,
1501
- "loss": 0.2755,
1502
- "step": 2430
1503
- },
1504
- {
1505
- "epoch": 4.56,
1506
- "learning_rate": 3.0218068535825545e-05,
1507
- "loss": 0.2953,
1508
- "step": 2440
1509
- },
1510
- {
1511
- "epoch": 4.58,
1512
- "learning_rate": 3.0114226375908622e-05,
1513
- "loss": 0.2946,
1514
- "step": 2450
1515
- },
1516
- {
1517
- "epoch": 4.6,
1518
- "learning_rate": 3.0010384215991693e-05,
1519
- "loss": 0.3302,
1520
- "step": 2460
1521
- },
1522
- {
1523
- "epoch": 4.62,
1524
- "learning_rate": 2.9906542056074764e-05,
1525
- "loss": 0.406,
1526
- "step": 2470
1527
- },
1528
- {
1529
- "epoch": 4.64,
1530
- "learning_rate": 2.9802699896157842e-05,
1531
- "loss": 0.323,
1532
- "step": 2480
1533
- },
1534
- {
1535
- "epoch": 4.65,
1536
- "learning_rate": 2.9698857736240916e-05,
1537
- "loss": 0.3808,
1538
- "step": 2490
1539
- },
1540
- {
1541
- "epoch": 4.67,
1542
- "learning_rate": 2.9595015576323987e-05,
1543
- "loss": 0.3269,
1544
- "step": 2500
1545
- },
1546
- {
1547
- "epoch": 4.69,
1548
- "learning_rate": 2.9491173416407064e-05,
1549
- "loss": 0.3703,
1550
- "step": 2510
1551
- },
1552
- {
1553
- "epoch": 4.71,
1554
- "learning_rate": 2.9387331256490135e-05,
1555
- "loss": 0.3173,
1556
- "step": 2520
1557
- },
1558
- {
1559
- "epoch": 4.73,
1560
- "learning_rate": 2.9283489096573206e-05,
1561
- "loss": 0.2749,
1562
- "step": 2530
1563
- },
1564
- {
1565
- "epoch": 4.75,
1566
- "learning_rate": 2.9179646936656284e-05,
1567
- "loss": 0.3128,
1568
- "step": 2540
1569
- },
1570
- {
1571
- "epoch": 4.77,
1572
- "learning_rate": 2.9075804776739358e-05,
1573
- "loss": 0.3725,
1574
- "step": 2550
1575
- },
1576
- {
1577
- "epoch": 4.79,
1578
- "learning_rate": 2.897196261682243e-05,
1579
- "loss": 0.3052,
1580
- "step": 2560
1581
- },
1582
- {
1583
- "epoch": 4.8,
1584
- "learning_rate": 2.8868120456905506e-05,
1585
- "loss": 0.3159,
1586
- "step": 2570
1587
- },
1588
- {
1589
- "epoch": 4.82,
1590
- "learning_rate": 2.8764278296988577e-05,
1591
- "loss": 0.27,
1592
- "step": 2580
1593
- },
1594
- {
1595
- "epoch": 4.84,
1596
- "learning_rate": 2.866043613707165e-05,
1597
- "loss": 0.3566,
1598
- "step": 2590
1599
- },
1600
- {
1601
- "epoch": 4.86,
1602
- "learning_rate": 2.855659397715473e-05,
1603
- "loss": 0.3622,
1604
- "step": 2600
1605
- },
1606
- {
1607
- "epoch": 4.88,
1608
- "learning_rate": 2.84527518172378e-05,
1609
- "loss": 0.3318,
1610
- "step": 2610
1611
- },
1612
- {
1613
- "epoch": 4.9,
1614
- "learning_rate": 2.834890965732087e-05,
1615
- "loss": 0.334,
1616
- "step": 2620
1617
- },
1618
- {
1619
- "epoch": 4.92,
1620
- "learning_rate": 2.824506749740395e-05,
1621
- "loss": 0.3084,
1622
- "step": 2630
1623
- },
1624
- {
1625
- "epoch": 4.93,
1626
- "learning_rate": 2.814122533748702e-05,
1627
- "loss": 0.3524,
1628
- "step": 2640
1629
- },
1630
- {
1631
- "epoch": 4.95,
1632
- "learning_rate": 2.8037383177570094e-05,
1633
- "loss": 0.3473,
1634
- "step": 2650
1635
- },
1636
- {
1637
- "epoch": 4.97,
1638
- "learning_rate": 2.793354101765317e-05,
1639
- "loss": 0.3593,
1640
- "step": 2660
1641
- },
1642
- {
1643
- "epoch": 4.99,
1644
- "learning_rate": 2.7829698857736242e-05,
1645
- "loss": 0.316,
1646
- "step": 2670
1647
- },
1648
- {
1649
- "epoch": 5.0,
1650
- "eval_accuracy": 0.8754672897196262,
1651
- "eval_loss": 0.37160804867744446,
1652
- "eval_runtime": 95.1891,
1653
- "eval_samples_per_second": 44.963,
1654
- "eval_steps_per_second": 5.62,
1655
- "step": 2675
1656
- },
1657
- {
1658
- "epoch": 5.01,
1659
- "learning_rate": 2.7725856697819313e-05,
1660
- "loss": 0.2738,
1661
- "step": 2680
1662
- },
1663
- {
1664
- "epoch": 5.03,
1665
- "learning_rate": 2.762201453790239e-05,
1666
- "loss": 0.342,
1667
- "step": 2690
1668
- },
1669
- {
1670
- "epoch": 5.05,
1671
- "learning_rate": 2.751817237798546e-05,
1672
- "loss": 0.3448,
1673
- "step": 2700
1674
- },
1675
- {
1676
- "epoch": 5.07,
1677
- "learning_rate": 2.7414330218068536e-05,
1678
- "loss": 0.3099,
1679
- "step": 2710
1680
- },
1681
- {
1682
- "epoch": 5.08,
1683
- "learning_rate": 2.7310488058151613e-05,
1684
- "loss": 0.3368,
1685
- "step": 2720
1686
- },
1687
- {
1688
- "epoch": 5.1,
1689
- "learning_rate": 2.7206645898234684e-05,
1690
- "loss": 0.2743,
1691
- "step": 2730
1692
- },
1693
- {
1694
- "epoch": 5.12,
1695
- "learning_rate": 2.7102803738317755e-05,
1696
- "loss": 0.3064,
1697
- "step": 2740
1698
- },
1699
- {
1700
- "epoch": 5.14,
1701
- "learning_rate": 2.6998961578400832e-05,
1702
- "loss": 0.2109,
1703
- "step": 2750
1704
- },
1705
- {
1706
- "epoch": 5.16,
1707
- "learning_rate": 2.6895119418483907e-05,
1708
- "loss": 0.3611,
1709
- "step": 2760
1710
- },
1711
- {
1712
- "epoch": 5.18,
1713
- "learning_rate": 2.6791277258566978e-05,
1714
- "loss": 0.3456,
1715
- "step": 2770
1716
- },
1717
- {
1718
- "epoch": 5.2,
1719
- "learning_rate": 2.6687435098650055e-05,
1720
- "loss": 0.2599,
1721
- "step": 2780
1722
- },
1723
- {
1724
- "epoch": 5.21,
1725
- "learning_rate": 2.6583592938733126e-05,
1726
- "loss": 0.2951,
1727
- "step": 2790
1728
- },
1729
- {
1730
- "epoch": 5.23,
1731
- "learning_rate": 2.6479750778816197e-05,
1732
- "loss": 0.1794,
1733
- "step": 2800
1734
- },
1735
- {
1736
- "epoch": 5.25,
1737
- "learning_rate": 2.6375908618899274e-05,
1738
- "loss": 0.2954,
1739
- "step": 2810
1740
- },
1741
- {
1742
- "epoch": 5.27,
1743
- "learning_rate": 2.627206645898235e-05,
1744
- "loss": 0.298,
1745
- "step": 2820
1746
- },
1747
- {
1748
- "epoch": 5.29,
1749
- "learning_rate": 2.616822429906542e-05,
1750
- "loss": 0.3187,
1751
- "step": 2830
1752
- },
1753
- {
1754
- "epoch": 5.31,
1755
- "learning_rate": 2.6064382139148497e-05,
1756
- "loss": 0.3297,
1757
- "step": 2840
1758
- },
1759
- {
1760
- "epoch": 5.33,
1761
- "learning_rate": 2.5960539979231568e-05,
1762
- "loss": 0.2431,
1763
- "step": 2850
1764
- },
1765
- {
1766
- "epoch": 5.35,
1767
- "learning_rate": 2.585669781931464e-05,
1768
- "loss": 0.3967,
1769
- "step": 2860
1770
- },
1771
- {
1772
- "epoch": 5.36,
1773
- "learning_rate": 2.5752855659397716e-05,
1774
- "loss": 0.3722,
1775
- "step": 2870
1776
- },
1777
- {
1778
- "epoch": 5.38,
1779
- "learning_rate": 2.564901349948079e-05,
1780
- "loss": 0.2752,
1781
- "step": 2880
1782
- },
1783
- {
1784
- "epoch": 5.4,
1785
- "learning_rate": 2.554517133956386e-05,
1786
- "loss": 0.2971,
1787
- "step": 2890
1788
- },
1789
- {
1790
- "epoch": 5.42,
1791
- "learning_rate": 2.544132917964694e-05,
1792
- "loss": 0.2804,
1793
- "step": 2900
1794
- },
1795
- {
1796
- "epoch": 5.44,
1797
- "learning_rate": 2.533748701973001e-05,
1798
- "loss": 0.3715,
1799
- "step": 2910
1800
- },
1801
- {
1802
- "epoch": 5.46,
1803
- "learning_rate": 2.5233644859813084e-05,
1804
- "loss": 0.2833,
1805
- "step": 2920
1806
- },
1807
- {
1808
- "epoch": 5.48,
1809
- "learning_rate": 2.512980269989616e-05,
1810
- "loss": 0.2891,
1811
- "step": 2930
1812
- },
1813
- {
1814
- "epoch": 5.5,
1815
- "learning_rate": 2.5025960539979233e-05,
1816
- "loss": 0.3122,
1817
- "step": 2940
1818
- },
1819
- {
1820
- "epoch": 5.51,
1821
- "learning_rate": 2.4922118380062307e-05,
1822
- "loss": 0.2891,
1823
- "step": 2950
1824
- },
1825
- {
1826
- "epoch": 5.53,
1827
- "learning_rate": 2.481827622014538e-05,
1828
- "loss": 0.3167,
1829
- "step": 2960
1830
- },
1831
- {
1832
- "epoch": 5.55,
1833
- "learning_rate": 2.4714434060228452e-05,
1834
- "loss": 0.3505,
1835
- "step": 2970
1836
- },
1837
- {
1838
- "epoch": 5.57,
1839
- "learning_rate": 2.4610591900311526e-05,
1840
- "loss": 0.3255,
1841
- "step": 2980
1842
- },
1843
- {
1844
- "epoch": 5.59,
1845
- "learning_rate": 2.4506749740394604e-05,
1846
- "loss": 0.2708,
1847
- "step": 2990
1848
- },
1849
- {
1850
- "epoch": 5.61,
1851
- "learning_rate": 2.4402907580477675e-05,
1852
- "loss": 0.3709,
1853
- "step": 3000
1854
- },
1855
- {
1856
- "epoch": 5.63,
1857
- "learning_rate": 2.429906542056075e-05,
1858
- "loss": 0.3623,
1859
- "step": 3010
1860
- },
1861
- {
1862
- "epoch": 5.64,
1863
- "learning_rate": 2.4195223260643823e-05,
1864
- "loss": 0.2488,
1865
- "step": 3020
1866
- },
1867
- {
1868
- "epoch": 5.66,
1869
- "learning_rate": 2.4091381100726894e-05,
1870
- "loss": 0.3012,
1871
- "step": 3030
1872
- },
1873
- {
1874
- "epoch": 5.68,
1875
- "learning_rate": 2.398753894080997e-05,
1876
- "loss": 0.318,
1877
- "step": 3040
1878
- },
1879
- {
1880
- "epoch": 5.7,
1881
- "learning_rate": 2.3883696780893046e-05,
1882
- "loss": 0.3596,
1883
- "step": 3050
1884
- },
1885
- {
1886
- "epoch": 5.72,
1887
- "learning_rate": 2.3779854620976117e-05,
1888
- "loss": 0.2923,
1889
- "step": 3060
1890
- },
1891
- {
1892
- "epoch": 5.74,
1893
- "learning_rate": 2.367601246105919e-05,
1894
- "loss": 0.3677,
1895
- "step": 3070
1896
- },
1897
- {
1898
- "epoch": 5.76,
1899
- "learning_rate": 2.3572170301142265e-05,
1900
- "loss": 0.2508,
1901
- "step": 3080
1902
- },
1903
- {
1904
- "epoch": 5.78,
1905
- "learning_rate": 2.3468328141225336e-05,
1906
- "loss": 0.2464,
1907
- "step": 3090
1908
- },
1909
- {
1910
- "epoch": 5.79,
1911
- "learning_rate": 2.3364485981308414e-05,
1912
- "loss": 0.3364,
1913
- "step": 3100
1914
- },
1915
- {
1916
- "epoch": 5.81,
1917
- "learning_rate": 2.3260643821391488e-05,
1918
- "loss": 0.2931,
1919
- "step": 3110
1920
- },
1921
- {
1922
- "epoch": 5.83,
1923
- "learning_rate": 2.315680166147456e-05,
1924
- "loss": 0.2962,
1925
- "step": 3120
1926
- },
1927
- {
1928
- "epoch": 5.85,
1929
- "learning_rate": 2.3052959501557633e-05,
1930
- "loss": 0.282,
1931
- "step": 3130
1932
- },
1933
- {
1934
- "epoch": 5.87,
1935
- "learning_rate": 2.2949117341640707e-05,
1936
- "loss": 0.2978,
1937
- "step": 3140
1938
- },
1939
- {
1940
- "epoch": 5.89,
1941
- "learning_rate": 2.284527518172378e-05,
1942
- "loss": 0.2524,
1943
- "step": 3150
1944
- },
1945
- {
1946
- "epoch": 5.91,
1947
- "learning_rate": 2.2741433021806856e-05,
1948
- "loss": 0.3384,
1949
- "step": 3160
1950
- },
1951
- {
1952
- "epoch": 5.93,
1953
- "learning_rate": 2.263759086188993e-05,
1954
- "loss": 0.2936,
1955
- "step": 3170
1956
- },
1957
- {
1958
- "epoch": 5.94,
1959
- "learning_rate": 2.2533748701973e-05,
1960
- "loss": 0.2521,
1961
- "step": 3180
1962
- },
1963
- {
1964
- "epoch": 5.96,
1965
- "learning_rate": 2.2429906542056075e-05,
1966
- "loss": 0.3685,
1967
- "step": 3190
1968
- },
1969
- {
1970
- "epoch": 5.98,
1971
- "learning_rate": 2.232606438213915e-05,
1972
- "loss": 0.2391,
1973
- "step": 3200
1974
- },
1975
- {
1976
- "epoch": 6.0,
1977
- "learning_rate": 2.2222222222222223e-05,
1978
- "loss": 0.2709,
1979
- "step": 3210
1980
- },
1981
- {
1982
- "epoch": 6.0,
1983
- "eval_accuracy": 0.8735981308411215,
1984
- "eval_loss": 0.37839841842651367,
1985
- "eval_runtime": 92.127,
1986
- "eval_samples_per_second": 46.458,
1987
- "eval_steps_per_second": 5.807,
1988
- "step": 3210
1989
- },
1990
- {
1991
- "epoch": 6.02,
1992
- "learning_rate": 2.2118380062305298e-05,
1993
- "loss": 0.2305,
1994
- "step": 3220
1995
- },
1996
- {
1997
- "epoch": 6.04,
1998
- "learning_rate": 2.2014537902388372e-05,
1999
- "loss": 0.2758,
2000
- "step": 3230
2001
- },
2002
- {
2003
- "epoch": 6.06,
2004
- "learning_rate": 2.1910695742471443e-05,
2005
- "loss": 0.2602,
2006
- "step": 3240
2007
- },
2008
- {
2009
- "epoch": 6.07,
2010
- "learning_rate": 2.1806853582554517e-05,
2011
- "loss": 0.311,
2012
- "step": 3250
2013
- },
2014
- {
2015
- "epoch": 6.09,
2016
- "learning_rate": 2.170301142263759e-05,
2017
- "loss": 0.3315,
2018
- "step": 3260
2019
- },
2020
- {
2021
- "epoch": 6.11,
2022
- "learning_rate": 2.1599169262720665e-05,
2023
- "loss": 0.3078,
2024
- "step": 3270
2025
- },
2026
- {
2027
- "epoch": 6.13,
2028
- "learning_rate": 2.149532710280374e-05,
2029
- "loss": 0.319,
2030
- "step": 3280
2031
- },
2032
- {
2033
- "epoch": 6.15,
2034
- "learning_rate": 2.1391484942886814e-05,
2035
- "loss": 0.3326,
2036
- "step": 3290
2037
- },
2038
- {
2039
- "epoch": 6.17,
2040
- "learning_rate": 2.1287642782969885e-05,
2041
- "loss": 0.2892,
2042
- "step": 3300
2043
- },
2044
- {
2045
- "epoch": 6.19,
2046
- "learning_rate": 2.118380062305296e-05,
2047
- "loss": 0.2669,
2048
- "step": 3310
2049
- },
2050
- {
2051
- "epoch": 6.21,
2052
- "learning_rate": 2.1079958463136033e-05,
2053
- "loss": 0.2829,
2054
- "step": 3320
2055
- },
2056
- {
2057
- "epoch": 6.22,
2058
- "learning_rate": 2.0976116303219107e-05,
2059
- "loss": 0.2434,
2060
- "step": 3330
2061
- },
2062
- {
2063
- "epoch": 6.24,
2064
- "learning_rate": 2.087227414330218e-05,
2065
- "loss": 0.2422,
2066
- "step": 3340
2067
- },
2068
- {
2069
- "epoch": 6.26,
2070
- "learning_rate": 2.0768431983385256e-05,
2071
- "loss": 0.3655,
2072
- "step": 3350
2073
- },
2074
- {
2075
- "epoch": 6.28,
2076
- "learning_rate": 2.066458982346833e-05,
2077
- "loss": 0.2772,
2078
- "step": 3360
2079
- },
2080
- {
2081
- "epoch": 6.3,
2082
- "learning_rate": 2.05607476635514e-05,
2083
- "loss": 0.2542,
2084
- "step": 3370
2085
- },
2086
- {
2087
- "epoch": 6.32,
2088
- "learning_rate": 2.045690550363448e-05,
2089
- "loss": 0.2978,
2090
- "step": 3380
2091
- },
2092
- {
2093
- "epoch": 6.34,
2094
- "learning_rate": 2.0353063343717553e-05,
2095
- "loss": 0.2518,
2096
- "step": 3390
2097
- },
2098
- {
2099
- "epoch": 6.36,
2100
- "learning_rate": 2.0249221183800623e-05,
2101
- "loss": 0.2697,
2102
- "step": 3400
2103
- },
2104
- {
2105
- "epoch": 6.37,
2106
- "learning_rate": 2.0145379023883698e-05,
2107
- "loss": 0.2326,
2108
- "step": 3410
2109
- },
2110
- {
2111
- "epoch": 6.39,
2112
- "learning_rate": 2.0041536863966772e-05,
2113
- "loss": 0.29,
2114
- "step": 3420
2115
- },
2116
- {
2117
- "epoch": 6.41,
2118
- "learning_rate": 1.9937694704049846e-05,
2119
- "loss": 0.2858,
2120
- "step": 3430
2121
- },
2122
- {
2123
- "epoch": 6.43,
2124
- "learning_rate": 1.983385254413292e-05,
2125
- "loss": 0.2665,
2126
- "step": 3440
2127
- },
2128
- {
2129
- "epoch": 6.45,
2130
- "learning_rate": 1.9730010384215995e-05,
2131
- "loss": 0.2583,
2132
- "step": 3450
2133
- },
2134
- {
2135
- "epoch": 6.47,
2136
- "learning_rate": 1.9626168224299065e-05,
2137
- "loss": 0.3213,
2138
- "step": 3460
2139
- },
2140
- {
2141
- "epoch": 6.49,
2142
- "learning_rate": 1.952232606438214e-05,
2143
- "loss": 0.3385,
2144
- "step": 3470
2145
- },
2146
- {
2147
- "epoch": 6.5,
2148
- "learning_rate": 1.9418483904465214e-05,
2149
- "loss": 0.2492,
2150
- "step": 3480
2151
- },
2152
- {
2153
- "epoch": 6.52,
2154
- "learning_rate": 1.9314641744548288e-05,
2155
- "loss": 0.2829,
2156
- "step": 3490
2157
- },
2158
- {
2159
- "epoch": 6.54,
2160
- "learning_rate": 1.9210799584631362e-05,
2161
- "loss": 0.2651,
2162
- "step": 3500
2163
- },
2164
- {
2165
- "epoch": 6.56,
2166
- "learning_rate": 1.9106957424714437e-05,
2167
- "loss": 0.2671,
2168
- "step": 3510
2169
- },
2170
- {
2171
- "epoch": 6.58,
2172
- "learning_rate": 1.9003115264797507e-05,
2173
- "loss": 0.3116,
2174
- "step": 3520
2175
- },
2176
- {
2177
- "epoch": 6.6,
2178
- "learning_rate": 1.889927310488058e-05,
2179
- "loss": 0.3116,
2180
- "step": 3530
2181
- },
2182
- {
2183
- "epoch": 6.62,
2184
- "learning_rate": 1.8795430944963656e-05,
2185
- "loss": 0.2305,
2186
- "step": 3540
2187
- },
2188
- {
2189
- "epoch": 6.64,
2190
- "learning_rate": 1.869158878504673e-05,
2191
- "loss": 0.3035,
2192
- "step": 3550
2193
- },
2194
- {
2195
- "epoch": 6.65,
2196
- "learning_rate": 1.8587746625129804e-05,
2197
- "loss": 0.2388,
2198
- "step": 3560
2199
- },
2200
- {
2201
- "epoch": 6.67,
2202
- "learning_rate": 1.848390446521288e-05,
2203
- "loss": 0.2424,
2204
- "step": 3570
2205
- },
2206
- {
2207
- "epoch": 6.69,
2208
- "learning_rate": 1.838006230529595e-05,
2209
- "loss": 0.219,
2210
- "step": 3580
2211
- },
2212
- {
2213
- "epoch": 6.71,
2214
- "learning_rate": 1.8276220145379024e-05,
2215
- "loss": 0.212,
2216
- "step": 3590
2217
- },
2218
- {
2219
- "epoch": 6.73,
2220
- "learning_rate": 1.8172377985462098e-05,
2221
- "loss": 0.3682,
2222
- "step": 3600
2223
- },
2224
- {
2225
- "epoch": 6.75,
2226
- "learning_rate": 1.8068535825545172e-05,
2227
- "loss": 0.2882,
2228
- "step": 3610
2229
- },
2230
- {
2231
- "epoch": 6.77,
2232
- "learning_rate": 1.7964693665628246e-05,
2233
- "loss": 0.2846,
2234
- "step": 3620
2235
- },
2236
- {
2237
- "epoch": 6.79,
2238
- "learning_rate": 1.786085150571132e-05,
2239
- "loss": 0.276,
2240
- "step": 3630
2241
- },
2242
- {
2243
- "epoch": 6.8,
2244
- "learning_rate": 1.775700934579439e-05,
2245
- "loss": 0.214,
2246
- "step": 3640
2247
- },
2248
- {
2249
- "epoch": 6.82,
2250
- "learning_rate": 1.7653167185877466e-05,
2251
- "loss": 0.2351,
2252
- "step": 3650
2253
- },
2254
- {
2255
- "epoch": 6.84,
2256
- "learning_rate": 1.7549325025960543e-05,
2257
- "loss": 0.2428,
2258
- "step": 3660
2259
- },
2260
- {
2261
- "epoch": 6.86,
2262
- "learning_rate": 1.7445482866043614e-05,
2263
- "loss": 0.2289,
2264
- "step": 3670
2265
- },
2266
- {
2267
- "epoch": 6.88,
2268
- "learning_rate": 1.734164070612669e-05,
2269
- "loss": 0.2087,
2270
- "step": 3680
2271
- },
2272
- {
2273
- "epoch": 6.9,
2274
- "learning_rate": 1.7237798546209763e-05,
2275
- "loss": 0.2492,
2276
- "step": 3690
2277
- },
2278
- {
2279
- "epoch": 6.92,
2280
- "learning_rate": 1.7133956386292833e-05,
2281
- "loss": 0.2855,
2282
- "step": 3700
2283
- },
2284
- {
2285
- "epoch": 6.93,
2286
- "learning_rate": 1.703011422637591e-05,
2287
- "loss": 0.2701,
2288
- "step": 3710
2289
- },
2290
- {
2291
- "epoch": 6.95,
2292
- "learning_rate": 1.6926272066458985e-05,
2293
- "loss": 0.2635,
2294
- "step": 3720
2295
- },
2296
- {
2297
- "epoch": 6.97,
2298
- "learning_rate": 1.6822429906542056e-05,
2299
- "loss": 0.267,
2300
- "step": 3730
2301
- },
2302
- {
2303
- "epoch": 6.99,
2304
- "learning_rate": 1.671858774662513e-05,
2305
- "loss": 0.177,
2306
- "step": 3740
2307
- },
2308
- {
2309
- "epoch": 7.0,
2310
- "eval_accuracy": 0.8745327102803738,
2311
- "eval_loss": 0.37715595960617065,
2312
- "eval_runtime": 95.1073,
2313
- "eval_samples_per_second": 45.002,
2314
- "eval_steps_per_second": 5.625,
2315
- "step": 3745
2316
- },
2317
- {
2318
- "epoch": 7.01,
2319
- "learning_rate": 1.6614745586708205e-05,
2320
- "loss": 0.307,
2321
- "step": 3750
2322
- },
2323
- {
2324
- "epoch": 7.03,
2325
- "learning_rate": 1.6510903426791275e-05,
2326
- "loss": 0.258,
2327
- "step": 3760
2328
- },
2329
- {
2330
- "epoch": 7.05,
2331
- "learning_rate": 1.6407061266874353e-05,
2332
- "loss": 0.2322,
2333
- "step": 3770
2334
- },
2335
- {
2336
- "epoch": 7.07,
2337
- "learning_rate": 1.6303219106957427e-05,
2338
- "loss": 0.187,
2339
- "step": 3780
2340
- },
2341
- {
2342
- "epoch": 7.08,
2343
- "learning_rate": 1.6199376947040498e-05,
2344
- "loss": 0.2838,
2345
- "step": 3790
2346
- },
2347
- {
2348
- "epoch": 7.1,
2349
- "learning_rate": 1.6095534787123572e-05,
2350
- "loss": 0.2353,
2351
- "step": 3800
2352
- },
2353
- {
2354
- "epoch": 7.12,
2355
- "learning_rate": 1.5991692627206647e-05,
2356
- "loss": 0.2661,
2357
- "step": 3810
2358
- },
2359
- {
2360
- "epoch": 7.14,
2361
- "learning_rate": 1.588785046728972e-05,
2362
- "loss": 0.1917,
2363
- "step": 3820
2364
- },
2365
- {
2366
- "epoch": 7.16,
2367
- "learning_rate": 1.5784008307372795e-05,
2368
- "loss": 0.1978,
2369
- "step": 3830
2370
- },
2371
- {
2372
- "epoch": 7.18,
2373
- "learning_rate": 1.568016614745587e-05,
2374
- "loss": 0.2688,
2375
- "step": 3840
2376
- },
2377
- {
2378
- "epoch": 7.2,
2379
- "learning_rate": 1.557632398753894e-05,
2380
- "loss": 0.2383,
2381
- "step": 3850
2382
- },
2383
- {
2384
- "epoch": 7.21,
2385
- "learning_rate": 1.5472481827622014e-05,
2386
- "loss": 0.2874,
2387
- "step": 3860
2388
- },
2389
- {
2390
- "epoch": 7.23,
2391
- "learning_rate": 1.536863966770509e-05,
2392
- "loss": 0.265,
2393
- "step": 3870
2394
- },
2395
- {
2396
- "epoch": 7.25,
2397
- "learning_rate": 1.5264797507788163e-05,
2398
- "loss": 0.1821,
2399
- "step": 3880
2400
- },
2401
- {
2402
- "epoch": 7.27,
2403
- "learning_rate": 1.5160955347871237e-05,
2404
- "loss": 0.193,
2405
- "step": 3890
2406
- },
2407
- {
2408
- "epoch": 7.29,
2409
- "learning_rate": 1.5057113187954311e-05,
2410
- "loss": 0.2681,
2411
- "step": 3900
2412
- },
2413
- {
2414
- "epoch": 7.31,
2415
- "learning_rate": 1.4953271028037382e-05,
2416
- "loss": 0.2406,
2417
- "step": 3910
2418
- },
2419
- {
2420
- "epoch": 7.33,
2421
- "learning_rate": 1.4849428868120458e-05,
2422
- "loss": 0.3214,
2423
- "step": 3920
2424
- },
2425
- {
2426
- "epoch": 7.35,
2427
- "learning_rate": 1.4745586708203532e-05,
2428
- "loss": 0.2521,
2429
- "step": 3930
2430
- },
2431
- {
2432
- "epoch": 7.36,
2433
- "learning_rate": 1.4641744548286603e-05,
2434
- "loss": 0.3427,
2435
- "step": 3940
2436
- },
2437
- {
2438
- "epoch": 7.38,
2439
- "learning_rate": 1.4537902388369679e-05,
2440
- "loss": 0.3272,
2441
- "step": 3950
2442
- },
2443
- {
2444
- "epoch": 7.4,
2445
- "learning_rate": 1.4434060228452753e-05,
2446
- "loss": 0.2336,
2447
- "step": 3960
2448
- },
2449
- {
2450
- "epoch": 7.42,
2451
- "learning_rate": 1.4330218068535826e-05,
2452
- "loss": 0.2209,
2453
- "step": 3970
2454
- },
2455
- {
2456
- "epoch": 7.44,
2457
- "learning_rate": 1.42263759086189e-05,
2458
- "loss": 0.222,
2459
- "step": 3980
2460
- },
2461
- {
2462
- "epoch": 7.46,
2463
- "learning_rate": 1.4122533748701974e-05,
2464
- "loss": 0.236,
2465
- "step": 3990
2466
- },
2467
- {
2468
- "epoch": 7.48,
2469
- "learning_rate": 1.4018691588785047e-05,
2470
- "loss": 0.2875,
2471
- "step": 4000
2472
- },
2473
- {
2474
- "epoch": 7.5,
2475
- "learning_rate": 1.3914849428868121e-05,
2476
- "loss": 0.2098,
2477
- "step": 4010
2478
- },
2479
- {
2480
- "epoch": 7.51,
2481
- "learning_rate": 1.3811007268951195e-05,
2482
- "loss": 0.324,
2483
- "step": 4020
2484
- },
2485
- {
2486
- "epoch": 7.53,
2487
- "learning_rate": 1.3707165109034268e-05,
2488
- "loss": 0.2038,
2489
- "step": 4030
2490
- },
2491
- {
2492
- "epoch": 7.55,
2493
- "learning_rate": 1.3603322949117342e-05,
2494
- "loss": 0.271,
2495
- "step": 4040
2496
- },
2497
- {
2498
- "epoch": 7.57,
2499
- "learning_rate": 1.3499480789200416e-05,
2500
- "loss": 0.296,
2501
- "step": 4050
2502
- },
2503
- {
2504
- "epoch": 7.59,
2505
- "learning_rate": 1.3395638629283489e-05,
2506
- "loss": 0.2557,
2507
- "step": 4060
2508
- },
2509
- {
2510
- "epoch": 7.61,
2511
- "learning_rate": 1.3291796469366563e-05,
2512
- "loss": 0.2788,
2513
- "step": 4070
2514
- },
2515
- {
2516
- "epoch": 7.63,
2517
- "learning_rate": 1.3187954309449637e-05,
2518
- "loss": 0.2684,
2519
- "step": 4080
2520
- },
2521
- {
2522
- "epoch": 7.64,
2523
- "learning_rate": 1.308411214953271e-05,
2524
- "loss": 0.2447,
2525
- "step": 4090
2526
- },
2527
- {
2528
- "epoch": 7.66,
2529
- "learning_rate": 1.2980269989615784e-05,
2530
- "loss": 0.287,
2531
- "step": 4100
2532
- },
2533
- {
2534
- "epoch": 7.68,
2535
- "learning_rate": 1.2876427829698858e-05,
2536
- "loss": 0.2186,
2537
- "step": 4110
2538
- },
2539
- {
2540
- "epoch": 7.7,
2541
- "learning_rate": 1.277258566978193e-05,
2542
- "loss": 0.1978,
2543
- "step": 4120
2544
- },
2545
- {
2546
- "epoch": 7.72,
2547
- "learning_rate": 1.2668743509865005e-05,
2548
- "loss": 0.2698,
2549
- "step": 4130
2550
- },
2551
- {
2552
- "epoch": 7.74,
2553
- "learning_rate": 1.256490134994808e-05,
2554
- "loss": 0.1624,
2555
- "step": 4140
2556
- },
2557
- {
2558
- "epoch": 7.76,
2559
- "learning_rate": 1.2461059190031153e-05,
2560
- "loss": 0.2591,
2561
- "step": 4150
2562
- },
2563
- {
2564
- "epoch": 7.78,
2565
- "learning_rate": 1.2357217030114226e-05,
2566
- "loss": 0.2035,
2567
- "step": 4160
2568
- },
2569
- {
2570
- "epoch": 7.79,
2571
- "learning_rate": 1.2253374870197302e-05,
2572
- "loss": 0.2285,
2573
- "step": 4170
2574
- },
2575
- {
2576
- "epoch": 7.81,
2577
- "learning_rate": 1.2149532710280374e-05,
2578
- "loss": 0.2257,
2579
- "step": 4180
2580
- },
2581
- {
2582
- "epoch": 7.83,
2583
- "learning_rate": 1.2045690550363447e-05,
2584
- "loss": 0.2555,
2585
- "step": 4190
2586
- },
2587
- {
2588
- "epoch": 7.85,
2589
- "learning_rate": 1.1941848390446523e-05,
2590
- "loss": 0.2192,
2591
- "step": 4200
2592
- },
2593
- {
2594
- "epoch": 7.87,
2595
- "learning_rate": 1.1838006230529595e-05,
2596
- "loss": 0.2169,
2597
- "step": 4210
2598
- },
2599
- {
2600
- "epoch": 7.89,
2601
- "learning_rate": 1.1734164070612668e-05,
2602
- "loss": 0.2914,
2603
- "step": 4220
2604
- },
2605
- {
2606
- "epoch": 7.91,
2607
- "learning_rate": 1.1630321910695744e-05,
2608
- "loss": 0.253,
2609
- "step": 4230
2610
- },
2611
- {
2612
- "epoch": 7.93,
2613
- "learning_rate": 1.1526479750778816e-05,
2614
- "loss": 0.2067,
2615
- "step": 4240
2616
- },
2617
- {
2618
- "epoch": 7.94,
2619
- "learning_rate": 1.142263759086189e-05,
2620
- "loss": 0.1995,
2621
- "step": 4250
2622
- },
2623
- {
2624
- "epoch": 7.96,
2625
- "learning_rate": 1.1318795430944965e-05,
2626
- "loss": 0.2902,
2627
- "step": 4260
2628
- },
2629
- {
2630
- "epoch": 7.98,
2631
- "learning_rate": 1.1214953271028037e-05,
2632
- "loss": 0.2051,
2633
- "step": 4270
2634
- },
2635
- {
2636
- "epoch": 8.0,
2637
- "learning_rate": 1.1111111111111112e-05,
2638
- "loss": 0.2409,
2639
- "step": 4280
2640
- },
2641
- {
2642
- "epoch": 8.0,
2643
- "eval_accuracy": 0.8761682242990654,
2644
- "eval_loss": 0.38753223419189453,
2645
- "eval_runtime": 91.8924,
2646
- "eval_samples_per_second": 46.576,
2647
- "eval_steps_per_second": 5.822,
2648
- "step": 4280
2649
- },
2650
- {
2651
- "epoch": 8.02,
2652
- "learning_rate": 1.1007268951194186e-05,
2653
- "loss": 0.2436,
2654
- "step": 4290
2655
- },
2656
- {
2657
- "epoch": 8.04,
2658
- "learning_rate": 1.0903426791277258e-05,
2659
- "loss": 0.2468,
2660
- "step": 4300
2661
- },
2662
- {
2663
- "epoch": 8.06,
2664
- "learning_rate": 1.0799584631360333e-05,
2665
- "loss": 0.168,
2666
- "step": 4310
2667
- },
2668
- {
2669
- "epoch": 8.07,
2670
- "learning_rate": 1.0695742471443407e-05,
2671
- "loss": 0.2093,
2672
- "step": 4320
2673
- },
2674
- {
2675
- "epoch": 8.09,
2676
- "learning_rate": 1.059190031152648e-05,
2677
- "loss": 0.1616,
2678
- "step": 4330
2679
- },
2680
- {
2681
- "epoch": 8.11,
2682
- "learning_rate": 1.0488058151609554e-05,
2683
- "loss": 0.1912,
2684
- "step": 4340
2685
- },
2686
- {
2687
- "epoch": 8.13,
2688
- "learning_rate": 1.0384215991692628e-05,
2689
- "loss": 0.3157,
2690
- "step": 4350
2691
- },
2692
- {
2693
- "epoch": 8.15,
2694
- "learning_rate": 1.02803738317757e-05,
2695
- "loss": 0.2213,
2696
- "step": 4360
2697
- },
2698
- {
2699
- "epoch": 8.17,
2700
- "learning_rate": 1.0176531671858776e-05,
2701
- "loss": 0.2473,
2702
- "step": 4370
2703
- },
2704
- {
2705
- "epoch": 8.19,
2706
- "learning_rate": 1.0072689511941849e-05,
2707
- "loss": 0.2155,
2708
- "step": 4380
2709
- },
2710
- {
2711
- "epoch": 8.21,
2712
- "learning_rate": 9.968847352024923e-06,
2713
- "loss": 0.209,
2714
- "step": 4390
2715
- },
2716
- {
2717
- "epoch": 8.22,
2718
- "learning_rate": 9.865005192107997e-06,
2719
- "loss": 0.2552,
2720
- "step": 4400
2721
- },
2722
- {
2723
- "epoch": 8.24,
2724
- "learning_rate": 9.76116303219107e-06,
2725
- "loss": 0.2469,
2726
- "step": 4410
2727
- },
2728
- {
2729
- "epoch": 8.26,
2730
- "learning_rate": 9.657320872274144e-06,
2731
- "loss": 0.2195,
2732
- "step": 4420
2733
- },
2734
- {
2735
- "epoch": 8.28,
2736
- "learning_rate": 9.553478712357218e-06,
2737
- "loss": 0.2094,
2738
- "step": 4430
2739
- },
2740
- {
2741
- "epoch": 8.3,
2742
- "learning_rate": 9.44963655244029e-06,
2743
- "loss": 0.2593,
2744
- "step": 4440
2745
- },
2746
- {
2747
- "epoch": 8.32,
2748
- "learning_rate": 9.345794392523365e-06,
2749
- "loss": 0.254,
2750
- "step": 4450
2751
- },
2752
- {
2753
- "epoch": 8.34,
2754
- "learning_rate": 9.24195223260644e-06,
2755
- "loss": 0.1353,
2756
- "step": 4460
2757
- },
2758
- {
2759
- "epoch": 8.36,
2760
- "learning_rate": 9.138110072689512e-06,
2761
- "loss": 0.2152,
2762
- "step": 4470
2763
- },
2764
- {
2765
- "epoch": 8.37,
2766
- "learning_rate": 9.034267912772586e-06,
2767
- "loss": 0.2258,
2768
- "step": 4480
2769
- },
2770
- {
2771
- "epoch": 8.39,
2772
- "learning_rate": 8.93042575285566e-06,
2773
- "loss": 0.1593,
2774
- "step": 4490
2775
- },
2776
- {
2777
- "epoch": 8.41,
2778
- "learning_rate": 8.826583592938733e-06,
2779
- "loss": 0.1939,
2780
- "step": 4500
2781
- },
2782
- {
2783
- "epoch": 8.43,
2784
- "learning_rate": 8.722741433021807e-06,
2785
- "loss": 0.2717,
2786
- "step": 4510
2787
- },
2788
- {
2789
- "epoch": 8.45,
2790
- "learning_rate": 8.618899273104881e-06,
2791
- "loss": 0.2263,
2792
- "step": 4520
2793
- },
2794
- {
2795
- "epoch": 8.47,
2796
- "learning_rate": 8.515057113187956e-06,
2797
- "loss": 0.281,
2798
- "step": 4530
2799
- },
2800
- {
2801
- "epoch": 8.49,
2802
- "learning_rate": 8.411214953271028e-06,
2803
- "loss": 0.2438,
2804
- "step": 4540
2805
- },
2806
- {
2807
- "epoch": 8.5,
2808
- "learning_rate": 8.307372793354102e-06,
2809
- "loss": 0.1487,
2810
- "step": 4550
2811
- },
2812
- {
2813
- "epoch": 8.52,
2814
- "learning_rate": 8.203530633437177e-06,
2815
- "loss": 0.1811,
2816
- "step": 4560
2817
- },
2818
- {
2819
- "epoch": 8.54,
2820
- "learning_rate": 8.099688473520249e-06,
2821
- "loss": 0.2803,
2822
- "step": 4570
2823
- },
2824
- {
2825
- "epoch": 8.56,
2826
- "learning_rate": 7.995846313603323e-06,
2827
- "loss": 0.2339,
2828
- "step": 4580
2829
- },
2830
- {
2831
- "epoch": 8.58,
2832
- "learning_rate": 7.892004153686398e-06,
2833
- "loss": 0.2617,
2834
- "step": 4590
2835
- },
2836
- {
2837
- "epoch": 8.6,
2838
- "learning_rate": 7.78816199376947e-06,
2839
- "loss": 0.3019,
2840
- "step": 4600
2841
- },
2842
- {
2843
- "epoch": 8.62,
2844
- "learning_rate": 7.684319833852544e-06,
2845
- "loss": 0.2083,
2846
- "step": 4610
2847
- },
2848
- {
2849
- "epoch": 8.64,
2850
- "learning_rate": 7.5804776739356185e-06,
2851
- "loss": 0.24,
2852
- "step": 4620
2853
- },
2854
- {
2855
- "epoch": 8.65,
2856
- "learning_rate": 7.476635514018691e-06,
2857
- "loss": 0.2378,
2858
- "step": 4630
2859
- },
2860
- {
2861
- "epoch": 8.67,
2862
- "learning_rate": 7.372793354101766e-06,
2863
- "loss": 0.2475,
2864
- "step": 4640
2865
- },
2866
- {
2867
- "epoch": 8.69,
2868
- "learning_rate": 7.2689511941848395e-06,
2869
- "loss": 0.197,
2870
- "step": 4650
2871
- },
2872
- {
2873
- "epoch": 8.71,
2874
- "learning_rate": 7.165109034267913e-06,
2875
- "loss": 0.2302,
2876
- "step": 4660
2877
- },
2878
- {
2879
- "epoch": 8.73,
2880
- "learning_rate": 7.061266874350987e-06,
2881
- "loss": 0.2186,
2882
- "step": 4670
2883
- },
2884
- {
2885
- "epoch": 8.75,
2886
- "learning_rate": 6.9574247144340605e-06,
2887
- "loss": 0.2172,
2888
- "step": 4680
2889
- },
2890
- {
2891
- "epoch": 8.77,
2892
- "learning_rate": 6.853582554517134e-06,
2893
- "loss": 0.2189,
2894
- "step": 4690
2895
- },
2896
- {
2897
- "epoch": 8.79,
2898
- "learning_rate": 6.749740394600208e-06,
2899
- "loss": 0.2137,
2900
- "step": 4700
2901
- },
2902
- {
2903
- "epoch": 8.8,
2904
- "learning_rate": 6.6458982346832815e-06,
2905
- "loss": 0.2259,
2906
- "step": 4710
2907
- },
2908
- {
2909
- "epoch": 8.82,
2910
- "learning_rate": 6.542056074766355e-06,
2911
- "loss": 0.15,
2912
- "step": 4720
2913
- },
2914
- {
2915
- "epoch": 8.84,
2916
- "learning_rate": 6.438213914849429e-06,
2917
- "loss": 0.2622,
2918
- "step": 4730
2919
- },
2920
- {
2921
- "epoch": 8.86,
2922
- "learning_rate": 6.3343717549325025e-06,
2923
- "loss": 0.2009,
2924
- "step": 4740
2925
- },
2926
- {
2927
- "epoch": 8.88,
2928
- "learning_rate": 6.230529595015577e-06,
2929
- "loss": 0.2095,
2930
- "step": 4750
2931
- },
2932
- {
2933
- "epoch": 8.9,
2934
- "learning_rate": 6.126687435098651e-06,
2935
- "loss": 0.1766,
2936
- "step": 4760
2937
- },
2938
- {
2939
- "epoch": 8.92,
2940
- "learning_rate": 6.0228452751817235e-06,
2941
- "loss": 0.1982,
2942
- "step": 4770
2943
- },
2944
- {
2945
- "epoch": 8.93,
2946
- "learning_rate": 5.919003115264798e-06,
2947
- "loss": 0.2287,
2948
- "step": 4780
2949
- },
2950
- {
2951
- "epoch": 8.95,
2952
- "learning_rate": 5.815160955347872e-06,
2953
- "loss": 0.266,
2954
- "step": 4790
2955
- },
2956
- {
2957
- "epoch": 8.97,
2958
- "learning_rate": 5.711318795430945e-06,
2959
- "loss": 0.2196,
2960
- "step": 4800
2961
- },
2962
- {
2963
- "epoch": 8.99,
2964
- "learning_rate": 5.607476635514019e-06,
2965
- "loss": 0.1929,
2966
- "step": 4810
2967
- },
2968
- {
2969
- "epoch": 9.0,
2970
- "eval_accuracy": 0.8707943925233644,
2971
- "eval_loss": 0.3915020227432251,
2972
- "eval_runtime": 93.367,
2973
- "eval_samples_per_second": 45.841,
2974
- "eval_steps_per_second": 5.73,
2975
- "step": 4815
2976
- },
2977
- {
2978
- "epoch": 9.01,
2979
- "learning_rate": 5.503634475597093e-06,
2980
- "loss": 0.2137,
2981
- "step": 4820
2982
- },
2983
- {
2984
- "epoch": 9.03,
2985
- "learning_rate": 5.399792315680166e-06,
2986
- "loss": 0.1994,
2987
- "step": 4830
2988
- },
2989
- {
2990
- "epoch": 9.05,
2991
- "learning_rate": 5.29595015576324e-06,
2992
- "loss": 0.2243,
2993
- "step": 4840
2994
- },
2995
- {
2996
- "epoch": 9.07,
2997
- "learning_rate": 5.192107995846314e-06,
2998
- "loss": 0.282,
2999
- "step": 4850
3000
- },
3001
- {
3002
- "epoch": 9.08,
3003
- "learning_rate": 5.088265835929388e-06,
3004
- "loss": 0.1425,
3005
- "step": 4860
3006
- },
3007
- {
3008
- "epoch": 9.1,
3009
- "learning_rate": 4.9844236760124615e-06,
3010
- "loss": 0.2213,
3011
- "step": 4870
3012
- },
3013
- {
3014
- "epoch": 9.12,
3015
- "learning_rate": 4.880581516095535e-06,
3016
- "loss": 0.1939,
3017
- "step": 4880
3018
- },
3019
- {
3020
- "epoch": 9.14,
3021
- "learning_rate": 4.776739356178609e-06,
3022
- "loss": 0.1753,
3023
- "step": 4890
3024
- },
3025
- {
3026
- "epoch": 9.16,
3027
- "learning_rate": 4.6728971962616825e-06,
3028
- "loss": 0.1758,
3029
- "step": 4900
3030
- },
3031
- {
3032
- "epoch": 9.18,
3033
- "learning_rate": 4.569055036344756e-06,
3034
- "loss": 0.1597,
3035
- "step": 4910
3036
- },
3037
- {
3038
- "epoch": 9.2,
3039
- "learning_rate": 4.46521287642783e-06,
3040
- "loss": 0.2269,
3041
- "step": 4920
3042
- },
3043
- {
3044
- "epoch": 9.21,
3045
- "learning_rate": 4.3613707165109035e-06,
3046
- "loss": 0.2723,
3047
- "step": 4930
3048
- },
3049
- {
3050
- "epoch": 9.23,
3051
- "learning_rate": 4.257528556593978e-06,
3052
- "loss": 0.2012,
3053
- "step": 4940
3054
- },
3055
- {
3056
- "epoch": 9.25,
3057
- "learning_rate": 4.153686396677051e-06,
3058
- "loss": 0.1863,
3059
- "step": 4950
3060
- },
3061
- {
3062
- "epoch": 9.27,
3063
- "learning_rate": 4.0498442367601245e-06,
3064
- "loss": 0.2029,
3065
- "step": 4960
3066
- },
3067
- {
3068
- "epoch": 9.29,
3069
- "learning_rate": 3.946002076843199e-06,
3070
- "loss": 0.2067,
3071
- "step": 4970
3072
- },
3073
- {
3074
- "epoch": 9.31,
3075
- "learning_rate": 3.842159916926272e-06,
3076
- "loss": 0.2359,
3077
- "step": 4980
3078
- },
3079
- {
3080
- "epoch": 9.33,
3081
- "learning_rate": 3.7383177570093455e-06,
3082
- "loss": 0.2398,
3083
- "step": 4990
3084
- },
3085
- {
3086
- "epoch": 9.35,
3087
- "learning_rate": 3.6344755970924198e-06,
3088
- "loss": 0.2233,
3089
- "step": 5000
3090
- },
3091
- {
3092
- "epoch": 9.36,
3093
- "learning_rate": 3.5306334371754936e-06,
3094
- "loss": 0.1934,
3095
- "step": 5010
3096
- },
3097
- {
3098
- "epoch": 9.38,
3099
- "learning_rate": 3.426791277258567e-06,
3100
- "loss": 0.2625,
3101
- "step": 5020
3102
- },
3103
- {
3104
- "epoch": 9.4,
3105
- "learning_rate": 3.3229491173416407e-06,
3106
- "loss": 0.2548,
3107
- "step": 5030
3108
- },
3109
- {
3110
- "epoch": 9.42,
3111
- "learning_rate": 3.2191069574247146e-06,
3112
- "loss": 0.1718,
3113
- "step": 5040
3114
- },
3115
- {
3116
- "epoch": 9.44,
3117
- "learning_rate": 3.1152647975077884e-06,
3118
- "loss": 0.1668,
3119
- "step": 5050
3120
- },
3121
- {
3122
- "epoch": 9.46,
3123
- "learning_rate": 3.0114226375908617e-06,
3124
- "loss": 0.2202,
3125
- "step": 5060
3126
- },
3127
- {
3128
- "epoch": 9.48,
3129
- "learning_rate": 2.907580477673936e-06,
3130
- "loss": 0.2091,
3131
- "step": 5070
3132
- },
3133
- {
3134
- "epoch": 9.5,
3135
- "learning_rate": 2.8037383177570094e-06,
3136
- "loss": 0.2213,
3137
- "step": 5080
3138
- },
3139
- {
3140
- "epoch": 9.51,
3141
- "learning_rate": 2.699896157840083e-06,
3142
- "loss": 0.2857,
3143
- "step": 5090
3144
- },
3145
- {
3146
- "epoch": 9.53,
3147
- "learning_rate": 2.596053997923157e-06,
3148
- "loss": 0.2209,
3149
- "step": 5100
3150
- },
3151
- {
3152
- "epoch": 9.55,
3153
- "learning_rate": 2.4922118380062308e-06,
3154
- "loss": 0.2341,
3155
- "step": 5110
3156
- },
3157
- {
3158
- "epoch": 9.57,
3159
- "learning_rate": 2.3883696780893046e-06,
3160
- "loss": 0.2111,
3161
- "step": 5120
3162
- },
3163
- {
3164
- "epoch": 9.59,
3165
- "learning_rate": 2.284527518172378e-06,
3166
- "loss": 0.1869,
3167
- "step": 5130
3168
- },
3169
- {
3170
- "epoch": 9.61,
3171
- "learning_rate": 2.1806853582554518e-06,
3172
- "loss": 0.2229,
3173
- "step": 5140
3174
- },
3175
- {
3176
- "epoch": 9.63,
3177
- "learning_rate": 2.0768431983385256e-06,
3178
- "loss": 0.2251,
3179
- "step": 5150
3180
- },
3181
- {
3182
- "epoch": 9.64,
3183
- "learning_rate": 1.9730010384215994e-06,
3184
- "loss": 0.183,
3185
- "step": 5160
3186
- },
3187
- {
3188
- "epoch": 9.66,
3189
- "learning_rate": 1.8691588785046728e-06,
3190
- "loss": 0.1606,
3191
- "step": 5170
3192
- },
3193
- {
3194
- "epoch": 9.68,
3195
- "learning_rate": 1.7653167185877468e-06,
3196
- "loss": 0.2067,
3197
- "step": 5180
3198
- },
3199
- {
3200
- "epoch": 9.7,
3201
- "learning_rate": 1.6614745586708204e-06,
3202
- "loss": 0.2346,
3203
- "step": 5190
3204
- },
3205
- {
3206
- "epoch": 9.72,
3207
- "learning_rate": 1.5576323987538942e-06,
3208
- "loss": 0.2161,
3209
- "step": 5200
3210
- },
3211
- {
3212
- "epoch": 9.74,
3213
- "learning_rate": 1.453790238836968e-06,
3214
- "loss": 0.2893,
3215
- "step": 5210
3216
- },
3217
- {
3218
- "epoch": 9.76,
3219
- "learning_rate": 1.3499480789200416e-06,
3220
- "loss": 0.2196,
3221
- "step": 5220
3222
- },
3223
- {
3224
- "epoch": 9.78,
3225
- "learning_rate": 1.2461059190031154e-06,
3226
- "loss": 0.2035,
3227
- "step": 5230
3228
- },
3229
- {
3230
- "epoch": 9.79,
3231
- "learning_rate": 1.142263759086189e-06,
3232
- "loss": 0.1541,
3233
- "step": 5240
3234
- },
3235
- {
3236
- "epoch": 9.81,
3237
- "learning_rate": 1.0384215991692628e-06,
3238
- "loss": 0.2105,
3239
- "step": 5250
3240
- },
3241
- {
3242
- "epoch": 9.83,
3243
- "learning_rate": 9.345794392523364e-07,
3244
- "loss": 0.2065,
3245
- "step": 5260
3246
- },
3247
- {
3248
- "epoch": 9.85,
3249
- "learning_rate": 8.307372793354102e-07,
3250
- "loss": 0.161,
3251
- "step": 5270
3252
- },
3253
- {
3254
- "epoch": 9.87,
3255
- "learning_rate": 7.26895119418484e-07,
3256
- "loss": 0.2285,
3257
- "step": 5280
3258
- },
3259
- {
3260
- "epoch": 9.89,
3261
- "learning_rate": 6.230529595015577e-07,
3262
- "loss": 0.2051,
3263
- "step": 5290
3264
- },
3265
- {
3266
- "epoch": 9.91,
3267
- "learning_rate": 5.192107995846314e-07,
3268
- "loss": 0.2291,
3269
- "step": 5300
3270
- },
3271
- {
3272
- "epoch": 9.93,
3273
- "learning_rate": 4.153686396677051e-07,
3274
- "loss": 0.2071,
3275
- "step": 5310
3276
- },
3277
- {
3278
- "epoch": 9.94,
3279
- "learning_rate": 3.1152647975077885e-07,
3280
- "loss": 0.2055,
3281
- "step": 5320
3282
- },
3283
- {
3284
- "epoch": 9.96,
3285
- "learning_rate": 2.0768431983385255e-07,
3286
- "loss": 0.1433,
3287
- "step": 5330
3288
- },
3289
- {
3290
- "epoch": 9.98,
3291
- "learning_rate": 1.0384215991692627e-07,
3292
- "loss": 0.2493,
3293
- "step": 5340
3294
- },
3295
  {
3296
  "epoch": 10.0,
3297
- "learning_rate": 0.0,
3298
- "loss": 0.1877,
3299
- "step": 5350
3300
- },
3301
- {
3302
- "epoch": 10.0,
3303
- "eval_accuracy": 0.8714953271028038,
3304
- "eval_loss": 0.38807550072669983,
3305
- "eval_runtime": 92.2965,
3306
- "eval_samples_per_second": 46.372,
3307
- "eval_steps_per_second": 5.797,
3308
- "step": 5350
3309
- },
3310
- {
3311
- "epoch": 10.0,
3312
- "step": 5350,
3313
- "total_flos": 1.3264660513609667e+19,
3314
- "train_loss": 0.36025063641717503,
3315
- "train_runtime": 7147.2488,
3316
- "train_samples_per_second": 23.949,
3317
- "train_steps_per_second": 0.749
3318
  },
3319
  {
3320
  "epoch": 10.0,
3321
- "eval_accuracy": 0.8761682242990654,
3322
- "eval_loss": 0.38753223419189453,
3323
- "eval_runtime": 92.7247,
3324
- "eval_samples_per_second": 46.158,
3325
- "eval_steps_per_second": 5.77,
3326
- "step": 5350
3327
  }
3328
  ],
3329
- "max_steps": 5350,
3330
  "num_train_epochs": 10,
3331
- "total_flos": 1.3264660513609667e+19,
3332
  "trial_name": null,
3333
  "trial_params": null
3334
  }
 
1
  {
2
+ "best_metric": 0.8705607476635514,
3
+ "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned-cassava/checkpoint-1200",
4
+ "epoch": 9.996677740863788,
5
+ "global_step": 1500,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 0.07,
12
+ "learning_rate": 3.3333333333333333e-06,
13
+ "loss": 1.5923,
14
  "step": 10
15
  },
16
  {
17
+ "epoch": 0.13,
18
+ "learning_rate": 6.666666666666667e-06,
19
+ "loss": 1.546,
20
  "step": 20
21
  },
22
  {
23
+ "epoch": 0.2,
24
+ "learning_rate": 1e-05,
25
+ "loss": 1.425,
26
  "step": 30
27
  },
28
  {
29
+ "epoch": 0.27,
30
+ "learning_rate": 1.3333333333333333e-05,
31
+ "loss": 1.2868,
32
  "step": 40
33
  },
34
  {
35
+ "epoch": 0.33,
36
+ "learning_rate": 1.6666666666666667e-05,
37
+ "loss": 1.0968,
38
  "step": 50
39
  },
40
  {
41
+ "epoch": 0.4,
42
+ "learning_rate": 2e-05,
43
+ "loss": 1.0634,
44
  "step": 60
45
  },
46
  {
47
+ "epoch": 0.47,
48
+ "learning_rate": 2.3333333333333336e-05,
49
+ "loss": 0.9365,
50
  "step": 70
51
  },
52
  {
53
+ "epoch": 0.53,
54
+ "learning_rate": 2.6666666666666667e-05,
55
+ "loss": 0.854,
56
  "step": 80
57
  },
58
  {
59
+ "epoch": 0.6,
60
+ "learning_rate": 3e-05,
61
+ "loss": 0.7884,
62
  "step": 90
63
  },
64
  {
65
+ "epoch": 0.66,
66
+ "learning_rate": 3.3333333333333335e-05,
67
+ "loss": 0.7399,
68
  "step": 100
69
  },
70
  {
71
+ "epoch": 0.73,
72
+ "learning_rate": 3.6666666666666666e-05,
73
+ "loss": 0.6745,
74
  "step": 110
75
  },
76
  {
77
+ "epoch": 0.8,
78
+ "learning_rate": 4e-05,
79
+ "loss": 0.6399,
80
  "step": 120
81
  },
82
  {
83
+ "epoch": 0.86,
84
+ "learning_rate": 4.3333333333333334e-05,
85
+ "loss": 0.6273,
86
  "step": 130
87
  },
88
  {
89
+ "epoch": 0.93,
90
+ "learning_rate": 4.666666666666667e-05,
91
+ "loss": 0.5644,
92
  "step": 140
93
  },
94
  {
95
+ "epoch": 1.0,
96
+ "learning_rate": 5e-05,
97
+ "loss": 0.5628,
98
+ "step": 150
99
+ },
100
+ {
101
+ "epoch": 1.0,
102
+ "eval_accuracy": 0.8308411214953271,
103
+ "eval_loss": 0.5357394218444824,
104
+ "eval_runtime": 44.4006,
105
+ "eval_samples_per_second": 48.198,
106
+ "eval_steps_per_second": 1.509,
107
  "step": 150
108
  },
109
  {
110
+ "epoch": 1.07,
111
+ "learning_rate": 4.962962962962963e-05,
112
+ "loss": 0.5665,
113
  "step": 160
114
  },
115
  {
116
+ "epoch": 1.13,
117
+ "learning_rate": 4.925925925925926e-05,
118
+ "loss": 0.5119,
119
  "step": 170
120
  },
121
  {
122
+ "epoch": 1.2,
123
+ "learning_rate": 4.888888888888889e-05,
124
+ "loss": 0.4606,
125
  "step": 180
126
  },
127
  {
128
+ "epoch": 1.27,
129
+ "learning_rate": 4.851851851851852e-05,
130
+ "loss": 0.5033,
131
  "step": 190
132
  },
133
  {
134
+ "epoch": 1.33,
135
+ "learning_rate": 4.814814814814815e-05,
136
+ "loss": 0.4933,
137
  "step": 200
138
  },
139
  {
140
+ "epoch": 1.4,
141
+ "learning_rate": 4.7777777777777784e-05,
142
+ "loss": 0.4688,
143
  "step": 210
144
  },
145
  {
146
+ "epoch": 1.47,
147
+ "learning_rate": 4.740740740740741e-05,
148
+ "loss": 0.514,
149
  "step": 220
150
  },
151
  {
152
+ "epoch": 1.53,
153
+ "learning_rate": 4.703703703703704e-05,
154
+ "loss": 0.4439,
155
  "step": 230
156
  },
157
  {
158
+ "epoch": 1.6,
159
+ "learning_rate": 4.666666666666667e-05,
160
+ "loss": 0.5142,
161
  "step": 240
162
  },
163
  {
164
+ "epoch": 1.66,
165
+ "learning_rate": 4.62962962962963e-05,
166
+ "loss": 0.4206,
167
  "step": 250
168
  },
169
  {
170
+ "epoch": 1.73,
171
+ "learning_rate": 4.592592592592593e-05,
172
+ "loss": 0.4254,
173
  "step": 260
174
  },
175
  {
176
+ "epoch": 1.8,
177
+ "learning_rate": 4.555555555555556e-05,
178
+ "loss": 0.4284,
179
  "step": 270
180
  },
181
  {
182
+ "epoch": 1.86,
183
+ "learning_rate": 4.518518518518519e-05,
184
+ "loss": 0.4446,
185
  "step": 280
186
  },
187
  {
188
+ "epoch": 1.93,
189
+ "learning_rate": 4.481481481481482e-05,
190
+ "loss": 0.422,
191
  "step": 290
192
  },
193
  {
194
+ "epoch": 2.0,
195
+ "learning_rate": 4.4444444444444447e-05,
196
+ "loss": 0.4398,
197
+ "step": 300
198
+ },
199
+ {
200
+ "epoch": 2.0,
201
+ "eval_accuracy": 0.8598130841121495,
202
+ "eval_loss": 0.43114611506462097,
203
+ "eval_runtime": 46.3574,
204
+ "eval_samples_per_second": 46.163,
205
+ "eval_steps_per_second": 1.445,
206
  "step": 300
207
  },
208
  {
209
+ "epoch": 2.07,
210
+ "learning_rate": 4.4074074074074076e-05,
211
+ "loss": 0.4463,
212
  "step": 310
213
  },
214
  {
215
+ "epoch": 2.13,
216
+ "learning_rate": 4.3703703703703705e-05,
217
+ "loss": 0.3933,
218
  "step": 320
219
  },
220
  {
221
+ "epoch": 2.2,
222
+ "learning_rate": 4.3333333333333334e-05,
223
+ "loss": 0.4323,
224
  "step": 330
225
  },
226
  {
227
+ "epoch": 2.27,
228
+ "learning_rate": 4.296296296296296e-05,
229
+ "loss": 0.4332,
230
  "step": 340
231
  },
232
  {
233
+ "epoch": 2.33,
234
+ "learning_rate": 4.259259259259259e-05,
235
+ "loss": 0.4394,
236
  "step": 350
237
  },
238
  {
239
+ "epoch": 2.4,
240
+ "learning_rate": 4.222222222222222e-05,
241
+ "loss": 0.4084,
242
  "step": 360
243
  },
244
  {
245
+ "epoch": 2.47,
246
+ "learning_rate": 4.185185185185185e-05,
247
+ "loss": 0.3772,
248
  "step": 370
249
  },
250
  {
251
+ "epoch": 2.53,
252
+ "learning_rate": 4.148148148148148e-05,
253
+ "loss": 0.4324,
254
  "step": 380
255
  },
256
  {
257
+ "epoch": 2.6,
258
+ "learning_rate": 4.111111111111111e-05,
259
+ "loss": 0.3999,
260
  "step": 390
261
  },
262
  {
263
+ "epoch": 2.66,
264
+ "learning_rate": 4.074074074074074e-05,
265
+ "loss": 0.425,
266
  "step": 400
267
  },
268
  {
269
+ "epoch": 2.73,
270
+ "learning_rate": 4.0370370370370374e-05,
271
+ "loss": 0.42,
272
  "step": 410
273
  },
274
  {
275
+ "epoch": 2.8,
276
+ "learning_rate": 4e-05,
277
+ "loss": 0.431,
278
  "step": 420
279
  },
280
  {
281
+ "epoch": 2.86,
282
+ "learning_rate": 3.962962962962963e-05,
283
+ "loss": 0.3784,
284
  "step": 430
285
  },
286
  {
287
+ "epoch": 2.93,
288
+ "learning_rate": 3.925925925925926e-05,
289
+ "loss": 0.3684,
290
  "step": 440
291
  },
292
  {
293
+ "epoch": 3.0,
294
+ "learning_rate": 3.888888888888889e-05,
295
+ "loss": 0.4022,
296
  "step": 450
297
  },
298
  {
299
+ "epoch": 3.0,
300
+ "eval_accuracy": 0.866822429906542,
301
+ "eval_loss": 0.39579352736473083,
302
+ "eval_runtime": 44.3798,
303
+ "eval_samples_per_second": 48.22,
304
+ "eval_steps_per_second": 1.51,
305
+ "step": 450
306
+ },
307
+ {
308
+ "epoch": 3.07,
309
+ "learning_rate": 3.851851851851852e-05,
310
+ "loss": 0.4017,
311
  "step": 460
312
  },
313
  {
314
+ "epoch": 3.13,
315
+ "learning_rate": 3.814814814814815e-05,
316
+ "loss": 0.4008,
317
  "step": 470
318
  },
319
  {
320
+ "epoch": 3.2,
321
+ "learning_rate": 3.777777777777778e-05,
322
+ "loss": 0.3794,
323
  "step": 480
324
  },
325
  {
326
+ "epoch": 3.27,
327
+ "learning_rate": 3.740740740740741e-05,
328
+ "loss": 0.3789,
329
  "step": 490
330
  },
331
  {
332
+ "epoch": 3.33,
333
+ "learning_rate": 3.7037037037037037e-05,
334
+ "loss": 0.3759,
335
  "step": 500
336
  },
337
  {
338
+ "epoch": 3.4,
339
+ "learning_rate": 3.6666666666666666e-05,
340
+ "loss": 0.3652,
341
  "step": 510
342
  },
343
  {
344
+ "epoch": 3.47,
345
+ "learning_rate": 3.62962962962963e-05,
346
+ "loss": 0.3496,
347
  "step": 520
348
  },
349
  {
350
+ "epoch": 3.53,
351
+ "learning_rate": 3.592592592592593e-05,
352
+ "loss": 0.3565,
353
  "step": 530
354
  },
355
  {
356
+ "epoch": 3.6,
357
+ "learning_rate": 3.555555555555556e-05,
358
+ "loss": 0.3943,
 
 
 
 
 
 
 
 
 
359
  "step": 540
360
  },
361
  {
362
+ "epoch": 3.66,
363
+ "learning_rate": 3.518518518518519e-05,
364
+ "loss": 0.3959,
365
  "step": 550
366
  },
367
  {
368
+ "epoch": 3.73,
369
+ "learning_rate": 3.481481481481482e-05,
370
+ "loss": 0.3871,
371
  "step": 560
372
  },
373
  {
374
+ "epoch": 3.8,
375
+ "learning_rate": 3.444444444444445e-05,
376
+ "loss": 0.3636,
377
  "step": 570
378
  },
379
  {
380
+ "epoch": 3.86,
381
+ "learning_rate": 3.4074074074074077e-05,
382
+ "loss": 0.3487,
383
  "step": 580
384
  },
385
  {
386
+ "epoch": 3.93,
387
+ "learning_rate": 3.3703703703703706e-05,
388
+ "loss": 0.4183,
389
  "step": 590
390
  },
391
  {
392
+ "epoch": 4.0,
393
+ "learning_rate": 3.3333333333333335e-05,
394
+ "loss": 0.3855,
395
+ "step": 600
396
+ },
397
+ {
398
+ "epoch": 4.0,
399
+ "eval_accuracy": 0.8598130841121495,
400
+ "eval_loss": 0.4029967784881592,
401
+ "eval_runtime": 42.4817,
402
+ "eval_samples_per_second": 50.375,
403
+ "eval_steps_per_second": 1.577,
404
  "step": 600
405
  },
406
  {
407
+ "epoch": 4.07,
408
+ "learning_rate": 3.2962962962962964e-05,
409
+ "loss": 0.3929,
410
  "step": 610
411
  },
412
  {
413
+ "epoch": 4.13,
414
+ "learning_rate": 3.25925925925926e-05,
415
+ "loss": 0.3455,
416
  "step": 620
417
  },
418
  {
419
+ "epoch": 4.2,
420
+ "learning_rate": 3.222222222222223e-05,
421
+ "loss": 0.351,
422
  "step": 630
423
  },
424
  {
425
+ "epoch": 4.27,
426
+ "learning_rate": 3.185185185185185e-05,
427
+ "loss": 0.3597,
428
  "step": 640
429
  },
430
  {
431
+ "epoch": 4.33,
432
+ "learning_rate": 3.148148148148148e-05,
433
+ "loss": 0.3524,
434
  "step": 650
435
  },
436
  {
437
+ "epoch": 4.4,
438
+ "learning_rate": 3.111111111111111e-05,
439
+ "loss": 0.3514,
440
  "step": 660
441
  },
442
  {
443
+ "epoch": 4.47,
444
+ "learning_rate": 3.074074074074074e-05,
445
+ "loss": 0.3362,
446
  "step": 670
447
  },
448
  {
449
+ "epoch": 4.53,
450
+ "learning_rate": 3.037037037037037e-05,
451
+ "loss": 0.3549,
452
  "step": 680
453
  },
454
  {
455
+ "epoch": 4.6,
456
+ "learning_rate": 3e-05,
457
+ "loss": 0.3175,
458
  "step": 690
459
  },
460
  {
461
+ "epoch": 4.66,
462
+ "learning_rate": 2.962962962962963e-05,
463
+ "loss": 0.3253,
464
  "step": 700
465
  },
466
  {
467
+ "epoch": 4.73,
468
+ "learning_rate": 2.925925925925926e-05,
469
+ "loss": 0.3367,
470
  "step": 710
471
  },
472
  {
473
+ "epoch": 4.8,
474
+ "learning_rate": 2.8888888888888888e-05,
475
+ "loss": 0.3571,
476
  "step": 720
477
  },
478
  {
479
+ "epoch": 4.86,
480
+ "learning_rate": 2.851851851851852e-05,
481
+ "loss": 0.3256,
482
  "step": 730
483
  },
484
  {
485
+ "epoch": 4.93,
486
+ "learning_rate": 2.814814814814815e-05,
487
+ "loss": 0.3747,
488
  "step": 740
489
  },
490
  {
491
+ "epoch": 5.0,
492
+ "learning_rate": 2.777777777777778e-05,
493
+ "loss": 0.3659,
494
+ "step": 750
495
+ },
496
+ {
497
+ "epoch": 5.0,
498
+ "eval_accuracy": 0.8616822429906542,
499
+ "eval_loss": 0.4124689996242523,
500
+ "eval_runtime": 43.3097,
501
+ "eval_samples_per_second": 49.412,
502
+ "eval_steps_per_second": 1.547,
503
  "step": 750
504
  },
505
  {
506
+ "epoch": 5.07,
507
+ "learning_rate": 2.7407407407407408e-05,
508
+ "loss": 0.3325,
509
  "step": 760
510
  },
511
  {
512
+ "epoch": 5.13,
513
+ "learning_rate": 2.7037037037037037e-05,
514
+ "loss": 0.3088,
515
  "step": 770
516
  },
517
  {
518
+ "epoch": 5.2,
519
+ "learning_rate": 2.6666666666666667e-05,
520
+ "loss": 0.3233,
521
  "step": 780
522
  },
523
  {
524
+ "epoch": 5.27,
525
+ "learning_rate": 2.6296296296296296e-05,
526
+ "loss": 0.2802,
527
  "step": 790
528
  },
529
  {
530
+ "epoch": 5.33,
531
+ "learning_rate": 2.5925925925925925e-05,
532
+ "loss": 0.3253,
533
  "step": 800
534
  },
535
  {
536
+ "epoch": 5.4,
537
+ "learning_rate": 2.5555555555555554e-05,
538
+ "loss": 0.3135,
539
  "step": 810
540
  },
541
  {
542
+ "epoch": 5.47,
543
+ "learning_rate": 2.5185185185185183e-05,
544
+ "loss": 0.3415,
545
  "step": 820
546
  },
547
  {
548
+ "epoch": 5.53,
549
+ "learning_rate": 2.4814814814814816e-05,
550
+ "loss": 0.2925,
551
  "step": 830
552
  },
553
  {
554
+ "epoch": 5.6,
555
+ "learning_rate": 2.4444444444444445e-05,
556
+ "loss": 0.3586,
557
  "step": 840
558
  },
559
  {
560
+ "epoch": 5.66,
561
+ "learning_rate": 2.4074074074074074e-05,
562
+ "loss": 0.3392,
563
  "step": 850
564
  },
565
  {
566
+ "epoch": 5.73,
567
+ "learning_rate": 2.3703703703703707e-05,
568
+ "loss": 0.3423,
569
  "step": 860
570
  },
571
  {
572
+ "epoch": 5.8,
573
+ "learning_rate": 2.3333333333333336e-05,
574
+ "loss": 0.3371,
575
  "step": 870
576
  },
577
  {
578
+ "epoch": 5.86,
579
+ "learning_rate": 2.2962962962962965e-05,
580
+ "loss": 0.3295,
581
  "step": 880
582
  },
583
  {
584
+ "epoch": 5.93,
585
+ "learning_rate": 2.2592592592592594e-05,
586
+ "loss": 0.345,
587
  "step": 890
588
  },
589
  {
590
+ "epoch": 6.0,
591
+ "learning_rate": 2.2222222222222223e-05,
592
+ "loss": 0.3393,
593
+ "step": 900
594
+ },
595
+ {
596
+ "epoch": 6.0,
597
+ "eval_accuracy": 0.8672897196261682,
598
+ "eval_loss": 0.3839728534221649,
599
+ "eval_runtime": 43.7964,
600
+ "eval_samples_per_second": 48.862,
601
+ "eval_steps_per_second": 1.53,
602
  "step": 900
603
  },
604
  {
605
+ "epoch": 6.07,
606
+ "learning_rate": 2.1851851851851852e-05,
607
+ "loss": 0.3399,
608
  "step": 910
609
  },
610
  {
611
+ "epoch": 6.13,
612
+ "learning_rate": 2.148148148148148e-05,
613
+ "loss": 0.3209,
614
  "step": 920
615
  },
616
  {
617
+ "epoch": 6.2,
618
+ "learning_rate": 2.111111111111111e-05,
619
+ "loss": 0.3055,
620
  "step": 930
621
  },
622
  {
623
+ "epoch": 6.27,
624
+ "learning_rate": 2.074074074074074e-05,
625
+ "loss": 0.2946,
626
  "step": 940
627
  },
628
  {
629
+ "epoch": 6.33,
630
+ "learning_rate": 2.037037037037037e-05,
631
+ "loss": 0.3265,
632
  "step": 950
633
  },
634
  {
635
+ "epoch": 6.4,
636
+ "learning_rate": 2e-05,
637
+ "loss": 0.3245,
638
  "step": 960
639
  },
640
  {
641
+ "epoch": 6.47,
642
+ "learning_rate": 1.962962962962963e-05,
643
+ "loss": 0.2877,
644
  "step": 970
645
  },
646
  {
647
+ "epoch": 6.53,
648
+ "learning_rate": 1.925925925925926e-05,
649
+ "loss": 0.3212,
650
  "step": 980
651
  },
652
  {
653
+ "epoch": 6.6,
654
+ "learning_rate": 1.888888888888889e-05,
655
+ "loss": 0.2927,
656
  "step": 990
657
  },
658
  {
659
+ "epoch": 6.66,
660
+ "learning_rate": 1.8518518518518518e-05,
661
+ "loss": 0.3031,
662
  "step": 1000
663
  },
664
  {
665
+ "epoch": 6.73,
666
+ "learning_rate": 1.814814814814815e-05,
667
+ "loss": 0.2875,
668
  "step": 1010
669
  },
670
  {
671
+ "epoch": 6.8,
672
+ "learning_rate": 1.777777777777778e-05,
673
+ "loss": 0.3567,
674
  "step": 1020
675
  },
676
  {
677
+ "epoch": 6.86,
678
+ "learning_rate": 1.740740740740741e-05,
679
+ "loss": 0.3145,
680
  "step": 1030
681
  },
682
  {
683
+ "epoch": 6.93,
684
+ "learning_rate": 1.7037037037037038e-05,
685
+ "loss": 0.3167,
686
  "step": 1040
687
  },
688
  {
689
+ "epoch": 7.0,
690
+ "learning_rate": 1.6666666666666667e-05,
691
+ "loss": 0.3022,
692
  "step": 1050
693
  },
694
  {
695
+ "epoch": 7.0,
696
+ "eval_accuracy": 0.8672897196261682,
697
+ "eval_loss": 0.37750476598739624,
698
+ "eval_runtime": 42.1228,
699
+ "eval_samples_per_second": 50.804,
700
+ "eval_steps_per_second": 1.591,
701
+ "step": 1050
702
  },
703
  {
704
+ "epoch": 7.07,
705
+ "learning_rate": 1.62962962962963e-05,
706
+ "loss": 0.286,
707
+ "step": 1060
708
  },
709
  {
710
+ "epoch": 7.13,
711
+ "learning_rate": 1.5925925925925926e-05,
712
+ "loss": 0.292,
 
 
 
713
  "step": 1070
714
  },
715
  {
716
+ "epoch": 7.2,
717
+ "learning_rate": 1.5555555555555555e-05,
718
+ "loss": 0.2974,
719
  "step": 1080
720
  },
721
  {
722
+ "epoch": 7.27,
723
+ "learning_rate": 1.5185185185185186e-05,
724
+ "loss": 0.3276,
725
  "step": 1090
726
  },
727
  {
728
+ "epoch": 7.33,
729
+ "learning_rate": 1.4814814814814815e-05,
730
+ "loss": 0.3145,
731
  "step": 1100
732
  },
733
  {
734
+ "epoch": 7.4,
735
+ "learning_rate": 1.4444444444444444e-05,
736
+ "loss": 0.2889,
737
  "step": 1110
738
  },
739
  {
740
+ "epoch": 7.47,
741
+ "learning_rate": 1.4074074074074075e-05,
742
+ "loss": 0.2937,
743
  "step": 1120
744
  },
745
  {
746
+ "epoch": 7.53,
747
+ "learning_rate": 1.3703703703703704e-05,
748
+ "loss": 0.2652,
749
  "step": 1130
750
  },
751
  {
752
+ "epoch": 7.6,
753
+ "learning_rate": 1.3333333333333333e-05,
754
+ "loss": 0.2759,
755
  "step": 1140
756
  },
757
  {
758
+ "epoch": 7.66,
759
+ "learning_rate": 1.2962962962962962e-05,
760
+ "loss": 0.3055,
761
  "step": 1150
762
  },
763
  {
764
+ "epoch": 7.73,
765
+ "learning_rate": 1.2592592592592592e-05,
766
+ "loss": 0.3161,
767
  "step": 1160
768
  },
769
  {
770
+ "epoch": 7.8,
771
+ "learning_rate": 1.2222222222222222e-05,
772
+ "loss": 0.3121,
773
  "step": 1170
774
  },
775
  {
776
+ "epoch": 7.86,
777
+ "learning_rate": 1.1851851851851853e-05,
778
+ "loss": 0.2599,
779
  "step": 1180
780
  },
781
  {
782
+ "epoch": 7.93,
783
+ "learning_rate": 1.1481481481481482e-05,
784
+ "loss": 0.2658,
785
  "step": 1190
786
  },
787
  {
788
+ "epoch": 8.0,
789
+ "learning_rate": 1.1111111111111112e-05,
790
+ "loss": 0.2941,
791
+ "step": 1200
792
+ },
793
+ {
794
+ "epoch": 8.0,
795
+ "eval_accuracy": 0.8705607476635514,
796
+ "eval_loss": 0.3742366135120392,
797
+ "eval_runtime": 42.2299,
798
+ "eval_samples_per_second": 50.675,
799
+ "eval_steps_per_second": 1.587,
800
  "step": 1200
801
  },
802
  {
803
+ "epoch": 8.07,
804
+ "learning_rate": 1.074074074074074e-05,
805
+ "loss": 0.2998,
806
  "step": 1210
807
  },
808
  {
809
+ "epoch": 8.13,
810
+ "learning_rate": 1.037037037037037e-05,
811
+ "loss": 0.3095,
812
  "step": 1220
813
  },
814
  {
815
+ "epoch": 8.2,
816
+ "learning_rate": 1e-05,
817
+ "loss": 0.2833,
818
  "step": 1230
819
  },
820
  {
821
+ "epoch": 8.27,
822
+ "learning_rate": 9.62962962962963e-06,
823
+ "loss": 0.2792,
824
  "step": 1240
825
  },
826
  {
827
+ "epoch": 8.33,
828
+ "learning_rate": 9.259259259259259e-06,
829
+ "loss": 0.2776,
830
  "step": 1250
831
  },
832
  {
833
+ "epoch": 8.4,
834
+ "learning_rate": 8.88888888888889e-06,
835
+ "loss": 0.3238,
836
  "step": 1260
837
  },
838
  {
839
+ "epoch": 8.47,
840
+ "learning_rate": 8.518518518518519e-06,
841
+ "loss": 0.2764,
842
  "step": 1270
843
  },
844
  {
845
+ "epoch": 8.53,
846
+ "learning_rate": 8.14814814814815e-06,
847
+ "loss": 0.2875,
848
  "step": 1280
849
  },
850
  {
851
+ "epoch": 8.6,
852
+ "learning_rate": 7.777777777777777e-06,
853
+ "loss": 0.2545,
854
  "step": 1290
855
  },
856
  {
857
+ "epoch": 8.66,
858
+ "learning_rate": 7.4074074074074075e-06,
859
+ "loss": 0.2551,
860
  "step": 1300
861
  },
862
  {
863
+ "epoch": 8.73,
864
+ "learning_rate": 7.0370370370370375e-06,
865
+ "loss": 0.2611,
866
  "step": 1310
867
  },
868
  {
869
+ "epoch": 8.8,
870
+ "learning_rate": 6.666666666666667e-06,
871
+ "loss": 0.251,
872
  "step": 1320
873
  },
874
  {
875
+ "epoch": 8.86,
876
+ "learning_rate": 6.296296296296296e-06,
877
+ "loss": 0.2864,
878
  "step": 1330
879
  },
880
  {
881
+ "epoch": 8.93,
882
+ "learning_rate": 5.925925925925927e-06,
883
+ "loss": 0.2921,
884
  "step": 1340
885
  },
886
  {
887
+ "epoch": 9.0,
888
+ "learning_rate": 5.555555555555556e-06,
889
+ "loss": 0.2903,
890
  "step": 1350
891
  },
892
  {
893
+ "epoch": 9.0,
894
+ "eval_accuracy": 0.8696261682242991,
895
+ "eval_loss": 0.3808652460575104,
896
+ "eval_runtime": 42.311,
897
+ "eval_samples_per_second": 50.578,
898
+ "eval_steps_per_second": 1.584,
899
+ "step": 1350
900
+ },
901
+ {
902
+ "epoch": 9.07,
903
+ "learning_rate": 5.185185185185185e-06,
904
+ "loss": 0.3301,
905
  "step": 1360
906
  },
907
  {
908
+ "epoch": 9.13,
909
+ "learning_rate": 4.814814814814815e-06,
910
+ "loss": 0.3025,
911
  "step": 1370
912
  },
913
  {
914
+ "epoch": 9.2,
915
+ "learning_rate": 4.444444444444445e-06,
916
+ "loss": 0.2471,
917
  "step": 1380
918
  },
919
  {
920
+ "epoch": 9.27,
921
+ "learning_rate": 4.074074074074075e-06,
922
+ "loss": 0.2805,
923
  "step": 1390
924
  },
925
  {
926
+ "epoch": 9.33,
927
+ "learning_rate": 3.7037037037037037e-06,
928
+ "loss": 0.2679,
929
  "step": 1400
930
  },
931
  {
932
+ "epoch": 9.4,
933
+ "learning_rate": 3.3333333333333333e-06,
934
+ "loss": 0.2784,
935
  "step": 1410
936
  },
937
  {
938
+ "epoch": 9.47,
939
+ "learning_rate": 2.9629629629629633e-06,
940
+ "loss": 0.2785,
941
  "step": 1420
942
  },
943
  {
944
+ "epoch": 9.53,
945
+ "learning_rate": 2.5925925925925925e-06,
946
+ "loss": 0.2662,
947
  "step": 1430
948
  },
949
  {
950
+ "epoch": 9.6,
951
+ "learning_rate": 2.2222222222222225e-06,
952
+ "loss": 0.2679,
953
  "step": 1440
954
  },
955
  {
956
+ "epoch": 9.66,
957
+ "learning_rate": 1.8518518518518519e-06,
958
+ "loss": 0.2702,
959
  "step": 1450
960
  },
961
  {
962
+ "epoch": 9.73,
963
+ "learning_rate": 1.4814814814814817e-06,
964
+ "loss": 0.2737,
965
  "step": 1460
966
  },
967
  {
968
+ "epoch": 9.8,
969
+ "learning_rate": 1.1111111111111112e-06,
970
+ "loss": 0.2747,
971
  "step": 1470
972
  },
973
  {
974
+ "epoch": 9.86,
975
+ "learning_rate": 7.407407407407408e-07,
976
+ "loss": 0.2418,
977
  "step": 1480
978
  },
979
  {
980
+ "epoch": 9.93,
981
+ "learning_rate": 3.703703703703704e-07,
982
+ "loss": 0.2568,
983
  "step": 1490
984
  },
985
  {
986
+ "epoch": 10.0,
987
+ "learning_rate": 0.0,
988
+ "loss": 0.2584,
989
  "step": 1500
990
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
991
  {
992
  "epoch": 10.0,
993
+ "eval_accuracy": 0.8696261682242991,
994
+ "eval_loss": 0.37556955218315125,
995
+ "eval_runtime": 42.3603,
996
+ "eval_samples_per_second": 50.519,
997
+ "eval_steps_per_second": 1.582,
998
+ "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
999
  },
1000
  {
1001
  "epoch": 10.0,
1002
+ "step": 1500,
1003
+ "total_flos": 1.4918616518411741e+19,
1004
+ "train_loss": 0.4060697093009949,
1005
+ "train_runtime": 6764.8563,
1006
+ "train_samples_per_second": 28.466,
1007
+ "train_steps_per_second": 0.222
1008
  }
1009
  ],
1010
+ "max_steps": 1500,
1011
  "num_train_epochs": 10,
1012
+ "total_flos": 1.4918616518411741e+19,
1013
  "trial_name": null,
1014
  "trial_params": null
1015
  }