matthieulel committed on
Commit
0e9a7b6
1 Parent(s): c0e2beb

End of training

README.md CHANGED
@@ -2,6 +2,8 @@
 license: apache-2.0
 base_model: facebook/convnextv2-tiny-1k-224
 tags:
+- image-classification
+- vision
 - generated_from_trainer
 metrics:
 - accuracy
@@ -15,7 +17,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # convnextv2-tiny-1k-224-finetuned-galaxy10-decals
 
-This model is a fine-tuned version of [facebook/convnextv2-tiny-1k-224](https://huggingface.co/facebook/convnextv2-tiny-1k-224) on an unknown dataset.
+This model is a fine-tuned version of [facebook/convnextv2-tiny-1k-224](https://huggingface.co/facebook/convnextv2-tiny-1k-224) on the matthieulel/galaxy10_decals dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.4261
 - Accuracy: 0.8703
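For context, a minimal sketch of how the fine-tuned checkpoint described in the README could be loaded for inference; the repository id and the local image path below are illustrative assumptions, not part of this commit.

```python
# Minimal sketch, assuming the checkpoint is published under the repo id below
# (the repo id and the image path are illustrative, not taken from this commit).
from transformers import pipeline

classifier = pipeline(
    "image-classification",
    model="matthieulel/convnextv2-tiny-1k-224-finetuned-galaxy10-decals",  # assumed repo id
)

# Prints the top predicted Galaxy10 DECaLS classes with confidence scores.
for prediction in classifier("galaxy.jpg"):  # hypothetical local image
    print(f"{prediction['label']}: {prediction['score']:.3f}")
```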
all_results.json CHANGED
@@ -1,13 +1,13 @@
 {
-    "epoch": 9.977728285077951,
-    "eval_accuracy": 0.8672510958046337,
-    "eval_loss": 0.41685062646865845,
-    "eval_runtime": 12.3483,
-    "eval_samples_per_second": 129.329,
-    "eval_steps_per_second": 4.049,
-    "total_flos": 3.6084187126879396e+18,
-    "train_loss": 0.7042633635657174,
-    "train_runtime": 2366.3213,
-    "train_samples_per_second": 60.706,
-    "train_steps_per_second": 0.473
+    "epoch": 19.879759519038075,
+    "eval_accuracy": 0.8703494926719278,
+    "eval_loss": 0.4261245131492615,
+    "eval_runtime": 25.4134,
+    "eval_samples_per_second": 69.806,
+    "eval_steps_per_second": 2.204,
+    "total_flos": 7.988705158075343e+18,
+    "train_loss": 0.5653726263392356,
+    "train_runtime": 5886.8979,
+    "train_samples_per_second": 54.229,
+    "train_steps_per_second": 0.421
 }
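As a side note, a small sketch of how the updated metrics file could be read locally; it assumes a clone of the repository with all_results.json in the working directory.

```python
# Minimal sketch, assuming all_results.json from this commit is in the working directory.
import json

with open("all_results.json") as f:
    results = json.load(f)

# Final aggregate metrics logged by the Trainer for this run.
print(f"epochs trained: {results['epoch']:.2f}")
print(f"eval accuracy : {results['eval_accuracy']:.4f}")
print(f"eval loss     : {results['eval_loss']:.4f}")
print(f"train loss    : {results['train_loss']:.4f}")
```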
eval_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-    "epoch": 9.977728285077951,
-    "eval_accuracy": 0.8672510958046337,
-    "eval_loss": 0.41685062646865845,
-    "eval_runtime": 12.3483,
-    "eval_samples_per_second": 129.329,
-    "eval_steps_per_second": 4.049
+    "epoch": 19.879759519038075,
+    "eval_accuracy": 0.8703494926719278,
+    "eval_loss": 0.4261245131492615,
+    "eval_runtime": 25.4134,
+    "eval_samples_per_second": 69.806,
+    "eval_steps_per_second": 2.204
 }
runs/May06_10-03-49_nmjti7f45r/events.out.tfevents.1714995748.nmjti7f45r.226.1 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d906b679414f6f7b01893920792ceeed5c78b6db17d357bece5b695a5dc7498
+size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-    "epoch": 9.977728285077951,
-    "total_flos": 3.6084187126879396e+18,
-    "train_loss": 0.7042633635657174,
-    "train_runtime": 2366.3213,
-    "train_samples_per_second": 60.706,
-    "train_steps_per_second": 0.473
+    "epoch": 19.879759519038075,
+    "total_flos": 7.988705158075343e+18,
+    "train_loss": 0.5653726263392356,
+    "train_runtime": 5886.8979,
+    "train_samples_per_second": 54.229,
+    "train_steps_per_second": 0.421
 }
trainer_state.json CHANGED
@@ -1,903 +1,1945 @@
1
  {
2
- "best_metric": 0.8672510958046337,
3
- "best_model_checkpoint": "convnextv2-tiny-1k-224-finetuned-galaxy10-decals/checkpoint-785",
4
- "epoch": 9.977728285077951,
5
  "eval_steps": 500,
6
- "global_step": 1120,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.08908685968819599,
13
- "grad_norm": 2.9069173336029053,
14
- "learning_rate": 4.464285714285715e-06,
15
- "loss": 2.3159,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.17817371937639198,
20
- "grad_norm": 4.669096946716309,
21
- "learning_rate": 8.92857142857143e-06,
22
- "loss": 2.2657,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.267260579064588,
27
- "grad_norm": 5.892533302307129,
28
- "learning_rate": 1.3392857142857144e-05,
29
- "loss": 2.1852,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.35634743875278396,
34
- "grad_norm": 4.657855033874512,
35
- "learning_rate": 1.785714285714286e-05,
36
- "loss": 2.0843,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.44543429844098,
41
- "grad_norm": 4.414278030395508,
42
- "learning_rate": 2.2321428571428575e-05,
43
- "loss": 1.9644,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.534521158129176,
48
- "grad_norm": 6.150153636932373,
49
- "learning_rate": 2.6785714285714288e-05,
50
- "loss": 1.7921,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.623608017817372,
55
- "grad_norm": 7.54302978515625,
56
- "learning_rate": 3.125e-05,
57
- "loss": 1.5743,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.7126948775055679,
62
- "grad_norm": 11.624669075012207,
63
- "learning_rate": 3.571428571428572e-05,
64
- "loss": 1.42,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.8017817371937639,
69
- "grad_norm": 11.175118446350098,
70
- "learning_rate": 4.017857142857143e-05,
71
- "loss": 1.301,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.89086859688196,
76
- "grad_norm": 13.484392166137695,
77
- "learning_rate": 4.464285714285715e-05,
78
- "loss": 1.2246,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.9799554565701559,
83
- "grad_norm": 22.607799530029297,
84
- "learning_rate": 4.910714285714286e-05,
85
- "loss": 1.0664,
86
  "step": 110
87
  },
88
  {
89
- "epoch": 0.9977728285077951,
90
- "eval_accuracy": 0.6725109580463369,
91
- "eval_loss": 0.9818494915962219,
92
- "eval_runtime": 12.2808,
93
- "eval_samples_per_second": 130.041,
94
- "eval_steps_per_second": 4.071,
95
- "step": 112
96
  },
97
  {
98
- "epoch": 1.069042316258352,
99
- "grad_norm": 11.895453453063965,
100
- "learning_rate": 4.960317460317461e-05,
101
- "loss": 1.0415,
102
- "step": 120
 
 
103
  },
104
  {
105
- "epoch": 1.158129175946548,
106
- "grad_norm": 15.956673622131348,
107
- "learning_rate": 4.910714285714286e-05,
108
- "loss": 1.0021,
109
  "step": 130
110
  },
111
  {
112
- "epoch": 1.247216035634744,
113
- "grad_norm": 12.564276695251465,
114
- "learning_rate": 4.8611111111111115e-05,
115
- "loss": 0.9518,
116
  "step": 140
117
  },
118
  {
119
- "epoch": 1.3363028953229399,
120
- "grad_norm": 15.347941398620605,
121
- "learning_rate": 4.811507936507937e-05,
122
- "loss": 0.9316,
123
  "step": 150
124
  },
125
  {
126
- "epoch": 1.4253897550111359,
127
- "grad_norm": 13.298521041870117,
128
- "learning_rate": 4.761904761904762e-05,
129
- "loss": 0.8316,
130
  "step": 160
131
  },
132
  {
133
- "epoch": 1.5144766146993318,
134
- "grad_norm": 12.444368362426758,
135
- "learning_rate": 4.7123015873015876e-05,
136
- "loss": 0.8863,
137
  "step": 170
138
  },
139
  {
140
- "epoch": 1.6035634743875278,
141
- "grad_norm": 10.18455982208252,
142
- "learning_rate": 4.662698412698413e-05,
143
- "loss": 0.8304,
144
  "step": 180
145
  },
146
  {
147
- "epoch": 1.692650334075724,
148
- "grad_norm": 13.635807991027832,
149
- "learning_rate": 4.613095238095239e-05,
150
- "loss": 0.7748,
151
  "step": 190
152
  },
153
  {
154
- "epoch": 1.7817371937639197,
155
- "grad_norm": 9.778557777404785,
156
- "learning_rate": 4.563492063492064e-05,
157
- "loss": 0.7561,
158
  "step": 200
159
  },
160
  {
161
- "epoch": 1.8708240534521159,
162
- "grad_norm": 16.08139419555664,
163
- "learning_rate": 4.5138888888888894e-05,
164
- "loss": 0.8051,
165
  "step": 210
166
  },
167
  {
168
- "epoch": 1.9599109131403119,
169
- "grad_norm": 12.761980056762695,
170
- "learning_rate": 4.464285714285715e-05,
171
- "loss": 0.8019,
172
  "step": 220
173
  },
174
  {
175
- "epoch": 1.9955456570155903,
176
- "eval_accuracy": 0.7989981214777708,
177
- "eval_loss": 0.6332681775093079,
178
- "eval_runtime": 12.3035,
179
- "eval_samples_per_second": 129.8,
180
- "eval_steps_per_second": 4.064,
181
- "step": 224
182
- },
183
- {
184
- "epoch": 2.048997772828508,
185
- "grad_norm": 10.136615753173828,
186
- "learning_rate": 4.41468253968254e-05,
187
- "loss": 0.7221,
188
  "step": 230
189
  },
190
  {
191
- "epoch": 2.138084632516704,
192
- "grad_norm": 13.145553588867188,
193
- "learning_rate": 4.3650793650793655e-05,
194
- "loss": 0.6877,
195
  "step": 240
196
  },
197
  {
198
- "epoch": 2.2271714922048997,
199
- "grad_norm": 14.941648483276367,
200
- "learning_rate": 4.315476190476191e-05,
201
- "loss": 0.7462,
 
 
 
 
 
 
 
 
 
202
  "step": 250
203
  },
204
  {
205
- "epoch": 2.316258351893096,
206
- "grad_norm": 9.842031478881836,
207
- "learning_rate": 4.265873015873016e-05,
208
- "loss": 0.6886,
209
  "step": 260
210
  },
211
  {
212
- "epoch": 2.4053452115812917,
213
- "grad_norm": 11.978471755981445,
214
- "learning_rate": 4.2162698412698416e-05,
215
- "loss": 0.6843,
216
  "step": 270
217
  },
218
  {
219
- "epoch": 2.494432071269488,
220
- "grad_norm": 12.24606990814209,
221
- "learning_rate": 4.166666666666667e-05,
222
- "loss": 0.7038,
223
  "step": 280
224
  },
225
  {
226
- "epoch": 2.5835189309576836,
227
- "grad_norm": 12.357331275939941,
228
- "learning_rate": 4.117063492063492e-05,
229
- "loss": 0.696,
230
  "step": 290
231
  },
232
  {
233
- "epoch": 2.6726057906458798,
234
- "grad_norm": 16.765913009643555,
235
- "learning_rate": 4.067460317460318e-05,
236
- "loss": 0.7174,
237
  "step": 300
238
  },
239
  {
240
- "epoch": 2.7616926503340755,
241
- "grad_norm": 13.57442569732666,
242
- "learning_rate": 4.017857142857143e-05,
243
- "loss": 0.7305,
244
  "step": 310
245
  },
246
  {
247
- "epoch": 2.8507795100222717,
248
- "grad_norm": 13.48181438446045,
249
- "learning_rate": 3.968253968253968e-05,
250
- "loss": 0.6957,
251
  "step": 320
252
  },
253
  {
254
- "epoch": 2.939866369710468,
255
- "grad_norm": 9.809232711791992,
256
- "learning_rate": 3.918650793650794e-05,
257
- "loss": 0.6524,
258
  "step": 330
259
  },
260
  {
261
- "epoch": 2.9933184855233854,
262
- "eval_accuracy": 0.8340638697557922,
263
- "eval_loss": 0.5247990489006042,
264
- "eval_runtime": 12.2883,
265
- "eval_samples_per_second": 129.961,
266
- "eval_steps_per_second": 4.069,
267
- "step": 336
268
- },
269
- {
270
- "epoch": 3.0289532293986636,
271
- "grad_norm": 16.003482818603516,
272
- "learning_rate": 3.8690476190476195e-05,
273
- "loss": 0.6928,
274
  "step": 340
275
  },
276
  {
277
- "epoch": 3.11804008908686,
278
- "grad_norm": 23.448598861694336,
279
- "learning_rate": 3.8194444444444444e-05,
280
- "loss": 0.6696,
281
  "step": 350
282
  },
283
  {
284
- "epoch": 3.2071269487750556,
285
- "grad_norm": 12.516254425048828,
286
- "learning_rate": 3.76984126984127e-05,
287
- "loss": 0.6665,
288
  "step": 360
289
  },
290
  {
291
- "epoch": 3.2962138084632517,
292
- "grad_norm": 13.503238677978516,
293
- "learning_rate": 3.7202380952380956e-05,
294
- "loss": 0.6189,
295
  "step": 370
296
  },
297
  {
298
- "epoch": 3.3853006681514475,
299
- "grad_norm": 14.721902847290039,
300
- "learning_rate": 3.6706349206349205e-05,
301
- "loss": 0.6405,
 
 
 
 
 
 
 
 
 
302
  "step": 380
303
  },
304
  {
305
- "epoch": 3.4743875278396437,
306
- "grad_norm": 11.428637504577637,
307
- "learning_rate": 3.621031746031746e-05,
308
- "loss": 0.6502,
309
  "step": 390
310
  },
311
  {
312
- "epoch": 3.5634743875278394,
313
- "grad_norm": 8.628026008605957,
314
- "learning_rate": 3.571428571428572e-05,
315
- "loss": 0.6335,
316
  "step": 400
317
  },
318
  {
319
- "epoch": 3.6525612472160356,
320
- "grad_norm": 12.637211799621582,
321
- "learning_rate": 3.521825396825397e-05,
322
- "loss": 0.6457,
323
  "step": 410
324
  },
325
  {
326
- "epoch": 3.7416481069042318,
327
- "grad_norm": 13.72917652130127,
328
- "learning_rate": 3.472222222222222e-05,
329
- "loss": 0.6338,
330
  "step": 420
331
  },
332
  {
333
- "epoch": 3.8307349665924275,
334
- "grad_norm": 14.159635543823242,
335
- "learning_rate": 3.422619047619048e-05,
336
- "loss": 0.6318,
337
  "step": 430
338
  },
339
  {
340
- "epoch": 3.9198218262806237,
341
- "grad_norm": 12.676724433898926,
342
- "learning_rate": 3.3730158730158734e-05,
343
- "loss": 0.6339,
344
  "step": 440
345
  },
346
  {
347
- "epoch": 4.0,
348
- "eval_accuracy": 0.8447088290544772,
349
- "eval_loss": 0.4730662703514099,
350
- "eval_runtime": 12.4926,
351
- "eval_samples_per_second": 127.835,
352
- "eval_steps_per_second": 4.002,
353
- "step": 449
354
- },
355
- {
356
- "epoch": 4.008908685968819,
357
- "grad_norm": 11.317361831665039,
358
- "learning_rate": 3.3234126984126983e-05,
359
- "loss": 0.613,
360
  "step": 450
361
  },
362
  {
363
- "epoch": 4.097995545657016,
364
- "grad_norm": 14.402922630310059,
365
- "learning_rate": 3.273809523809524e-05,
366
- "loss": 0.6124,
367
  "step": 460
368
  },
369
  {
370
- "epoch": 4.187082405345212,
371
- "grad_norm": 9.939033508300781,
372
- "learning_rate": 3.2242063492063495e-05,
373
- "loss": 0.5868,
374
  "step": 470
375
  },
376
  {
377
- "epoch": 4.276169265033408,
378
- "grad_norm": 10.611005783081055,
379
- "learning_rate": 3.1746031746031745e-05,
380
- "loss": 0.5786,
381
  "step": 480
382
  },
383
  {
384
- "epoch": 4.365256124721603,
385
- "grad_norm": 11.104296684265137,
386
- "learning_rate": 3.125e-05,
387
- "loss": 0.544,
388
  "step": 490
389
  },
390
  {
391
- "epoch": 4.4543429844097995,
392
- "grad_norm": 14.008048057556152,
393
- "learning_rate": 3.075396825396826e-05,
394
- "loss": 0.6175,
 
 
 
 
 
 
 
 
 
395
  "step": 500
396
  },
397
  {
398
- "epoch": 4.543429844097996,
399
- "grad_norm": 9.320144653320312,
400
- "learning_rate": 3.0257936507936506e-05,
401
- "loss": 0.5999,
402
  "step": 510
403
  },
404
  {
405
- "epoch": 4.632516703786192,
406
- "grad_norm": 9.274946212768555,
407
- "learning_rate": 2.9761904761904762e-05,
408
- "loss": 0.5709,
409
  "step": 520
410
  },
411
  {
412
- "epoch": 4.721603563474387,
413
- "grad_norm": 12.640064239501953,
414
- "learning_rate": 2.9265873015873018e-05,
415
- "loss": 0.6231,
416
  "step": 530
417
  },
418
  {
419
- "epoch": 4.810690423162583,
420
- "grad_norm": 11.968724250793457,
421
- "learning_rate": 2.876984126984127e-05,
422
- "loss": 0.6206,
423
  "step": 540
424
  },
425
  {
426
- "epoch": 4.8997772828507795,
427
- "grad_norm": 11.681157112121582,
428
- "learning_rate": 2.8273809523809523e-05,
429
- "loss": 0.6031,
430
  "step": 550
431
  },
432
  {
433
- "epoch": 4.988864142538976,
434
- "grad_norm": 13.320256233215332,
435
- "learning_rate": 2.777777777777778e-05,
436
- "loss": 0.5178,
437
  "step": 560
438
  },
439
  {
440
- "epoch": 4.997772828507795,
441
- "eval_accuracy": 0.8503443957420163,
442
- "eval_loss": 0.4537416100502014,
443
- "eval_runtime": 12.2913,
444
- "eval_samples_per_second": 129.93,
445
- "eval_steps_per_second": 4.068,
446
- "step": 561
447
- },
448
- {
449
- "epoch": 5.077951002227172,
450
- "grad_norm": 15.332996368408203,
451
- "learning_rate": 2.7281746031746032e-05,
452
- "loss": 0.5617,
453
  "step": 570
454
  },
455
  {
456
- "epoch": 5.167037861915367,
457
- "grad_norm": 14.994087219238281,
458
- "learning_rate": 2.6785714285714288e-05,
459
- "loss": 0.5797,
460
  "step": 580
461
  },
462
  {
463
- "epoch": 5.256124721603563,
464
- "grad_norm": 13.461969375610352,
465
- "learning_rate": 2.628968253968254e-05,
466
- "loss": 0.5524,
467
  "step": 590
468
  },
469
  {
470
- "epoch": 5.3452115812917596,
471
- "grad_norm": 12.29080581665039,
472
- "learning_rate": 2.5793650793650796e-05,
473
- "loss": 0.5824,
474
  "step": 600
475
  },
476
  {
477
- "epoch": 5.434298440979956,
478
- "grad_norm": 11.07197380065918,
479
- "learning_rate": 2.529761904761905e-05,
480
- "loss": 0.554,
481
  "step": 610
482
  },
483
  {
484
- "epoch": 5.523385300668151,
485
- "grad_norm": 9.797560691833496,
486
- "learning_rate": 2.4801587301587305e-05,
487
- "loss": 0.5108,
488
  "step": 620
489
  },
490
  {
491
- "epoch": 5.612472160356347,
492
- "grad_norm": 10.469209671020508,
493
- "learning_rate": 2.4305555555555558e-05,
494
- "loss": 0.5586,
 
 
 
 
 
 
 
 
 
495
  "step": 630
496
  },
497
  {
498
- "epoch": 5.701559020044543,
499
- "grad_norm": 13.22735595703125,
500
- "learning_rate": 2.380952380952381e-05,
501
- "loss": 0.5358,
502
  "step": 640
503
  },
504
  {
505
- "epoch": 5.79064587973274,
506
- "grad_norm": 8.305063247680664,
507
- "learning_rate": 2.3313492063492066e-05,
508
- "loss": 0.5295,
509
  "step": 650
510
  },
511
  {
512
- "epoch": 5.879732739420936,
513
- "grad_norm": 18.399051666259766,
514
- "learning_rate": 2.281746031746032e-05,
515
- "loss": 0.5442,
516
  "step": 660
517
  },
518
  {
519
- "epoch": 5.968819599109131,
520
- "grad_norm": 8.103595733642578,
521
- "learning_rate": 2.2321428571428575e-05,
522
- "loss": 0.5907,
523
  "step": 670
524
  },
525
  {
526
- "epoch": 5.99554565701559,
527
- "eval_accuracy": 0.8472135253600501,
528
- "eval_loss": 0.4555535316467285,
529
- "eval_runtime": 12.2927,
530
- "eval_samples_per_second": 129.914,
531
- "eval_steps_per_second": 4.067,
532
- "step": 673
533
- },
534
- {
535
- "epoch": 6.057906458797327,
536
- "grad_norm": 10.681763648986816,
537
- "learning_rate": 2.1825396825396827e-05,
538
- "loss": 0.5332,
539
  "step": 680
540
  },
541
  {
542
- "epoch": 6.146993318485523,
543
- "grad_norm": 10.129424095153809,
544
- "learning_rate": 2.132936507936508e-05,
545
- "loss": 0.4747,
546
  "step": 690
547
  },
548
  {
549
- "epoch": 6.23608017817372,
550
- "grad_norm": 16.834814071655273,
551
- "learning_rate": 2.0833333333333336e-05,
552
- "loss": 0.5576,
553
  "step": 700
554
  },
555
  {
556
- "epoch": 6.325167037861915,
557
- "grad_norm": 11.258397102355957,
558
- "learning_rate": 2.033730158730159e-05,
559
- "loss": 0.5063,
560
  "step": 710
561
  },
562
  {
563
- "epoch": 6.414253897550111,
564
- "grad_norm": 15.159914016723633,
565
- "learning_rate": 1.984126984126984e-05,
566
- "loss": 0.5385,
567
  "step": 720
568
  },
569
  {
570
- "epoch": 6.503340757238307,
571
- "grad_norm": 10.242027282714844,
572
- "learning_rate": 1.9345238095238097e-05,
573
- "loss": 0.5046,
574
  "step": 730
575
  },
576
  {
577
- "epoch": 6.5924276169265035,
578
- "grad_norm": 10.377813339233398,
579
- "learning_rate": 1.884920634920635e-05,
580
- "loss": 0.5247,
581
  "step": 740
582
  },
583
  {
584
- "epoch": 6.6815144766147,
585
- "grad_norm": 12.55459213256836,
586
- "learning_rate": 1.8353174603174602e-05,
587
- "loss": 0.529,
 
 
 
 
 
 
 
 
 
588
  "step": 750
589
  },
590
  {
591
- "epoch": 6.770601336302895,
592
- "grad_norm": 16.02656364440918,
593
- "learning_rate": 1.785714285714286e-05,
594
- "loss": 0.5073,
595
  "step": 760
596
  },
597
  {
598
- "epoch": 6.859688195991091,
599
- "grad_norm": 16.140487670898438,
600
- "learning_rate": 1.736111111111111e-05,
601
- "loss": 0.5414,
602
  "step": 770
603
  },
604
  {
605
- "epoch": 6.948775055679287,
606
- "grad_norm": 17.321931838989258,
607
- "learning_rate": 1.6865079365079367e-05,
608
- "loss": 0.5292,
609
  "step": 780
610
  },
611
  {
612
- "epoch": 6.993318485523385,
613
- "eval_accuracy": 0.8672510958046337,
614
- "eval_loss": 0.41685062646865845,
615
- "eval_runtime": 12.3673,
616
- "eval_samples_per_second": 129.131,
617
- "eval_steps_per_second": 4.043,
618
- "step": 785
619
- },
620
- {
621
- "epoch": 7.0378619153674835,
622
- "grad_norm": 9.479693412780762,
623
- "learning_rate": 1.636904761904762e-05,
624
- "loss": 0.4586,
625
  "step": 790
626
  },
627
  {
628
- "epoch": 7.12694877505568,
629
- "grad_norm": 11.711000442504883,
630
- "learning_rate": 1.5873015873015872e-05,
631
- "loss": 0.5188,
632
  "step": 800
633
  },
634
  {
635
- "epoch": 7.216035634743875,
636
- "grad_norm": 11.616864204406738,
637
- "learning_rate": 1.537698412698413e-05,
638
- "loss": 0.5024,
639
  "step": 810
640
  },
641
  {
642
- "epoch": 7.305122494432071,
643
- "grad_norm": 10.370725631713867,
644
- "learning_rate": 1.4880952380952381e-05,
645
- "loss": 0.4902,
646
  "step": 820
647
  },
648
  {
649
- "epoch": 7.394209354120267,
650
- "grad_norm": 14.04218864440918,
651
- "learning_rate": 1.4384920634920635e-05,
652
- "loss": 0.5149,
653
  "step": 830
654
  },
655
  {
656
- "epoch": 7.4832962138084635,
657
- "grad_norm": 13.194646835327148,
658
- "learning_rate": 1.388888888888889e-05,
659
- "loss": 0.5562,
660
  "step": 840
661
  },
662
  {
663
- "epoch": 7.57238307349666,
664
- "grad_norm": 9.960190773010254,
665
- "learning_rate": 1.3392857142857144e-05,
666
- "loss": 0.4921,
667
  "step": 850
668
  },
669
  {
670
- "epoch": 7.661469933184855,
671
- "grad_norm": 15.14493465423584,
672
- "learning_rate": 1.2896825396825398e-05,
673
- "loss": 0.471,
674
  "step": 860
675
  },
676
  {
677
- "epoch": 7.750556792873051,
678
- "grad_norm": 11.185235977172852,
679
- "learning_rate": 1.2400793650793652e-05,
680
- "loss": 0.4963,
681
  "step": 870
682
  },
683
  {
684
- "epoch": 7.839643652561247,
685
- "grad_norm": 12.782095909118652,
686
- "learning_rate": 1.1904761904761905e-05,
687
- "loss": 0.4915,
 
 
 
 
 
 
 
 
 
688
  "step": 880
689
  },
690
  {
691
- "epoch": 7.928730512249444,
692
- "grad_norm": 11.89919376373291,
693
- "learning_rate": 1.140873015873016e-05,
694
- "loss": 0.5017,
695
  "step": 890
696
  },
697
  {
698
- "epoch": 8.0,
699
- "eval_accuracy": 0.8597370068879149,
700
- "eval_loss": 0.4106651544570923,
701
- "eval_runtime": 12.3902,
702
- "eval_samples_per_second": 128.892,
703
- "eval_steps_per_second": 4.035,
704
- "step": 898
705
- },
706
- {
707
- "epoch": 8.017817371937639,
708
- "grad_norm": 12.601805686950684,
709
- "learning_rate": 1.0912698412698414e-05,
710
- "loss": 0.5064,
711
  "step": 900
712
  },
713
  {
714
- "epoch": 8.106904231625835,
715
- "grad_norm": 8.723831176757812,
716
- "learning_rate": 1.0416666666666668e-05,
717
- "loss": 0.4181,
718
  "step": 910
719
  },
720
  {
721
- "epoch": 8.195991091314031,
722
- "grad_norm": 12.781538963317871,
723
- "learning_rate": 9.92063492063492e-06,
724
- "loss": 0.4427,
725
  "step": 920
726
  },
727
  {
728
- "epoch": 8.285077951002227,
729
- "grad_norm": 12.263012886047363,
730
- "learning_rate": 9.424603174603175e-06,
731
- "loss": 0.5087,
732
  "step": 930
733
  },
734
  {
735
- "epoch": 8.374164810690424,
736
- "grad_norm": 17.41984748840332,
737
- "learning_rate": 8.92857142857143e-06,
738
- "loss": 0.5301,
739
  "step": 940
740
  },
741
  {
742
- "epoch": 8.46325167037862,
743
- "grad_norm": 10.731024742126465,
744
- "learning_rate": 8.432539682539684e-06,
745
- "loss": 0.4987,
746
  "step": 950
747
  },
748
  {
749
- "epoch": 8.552338530066816,
750
- "grad_norm": 15.722013473510742,
751
- "learning_rate": 7.936507936507936e-06,
752
- "loss": 0.4613,
753
  "step": 960
754
  },
755
  {
756
- "epoch": 8.64142538975501,
757
- "grad_norm": 11.301126480102539,
758
- "learning_rate": 7.4404761904761905e-06,
759
- "loss": 0.5136,
760
  "step": 970
761
  },
762
  {
763
- "epoch": 8.730512249443207,
764
- "grad_norm": 21.23493194580078,
765
- "learning_rate": 6.944444444444445e-06,
766
- "loss": 0.494,
767
  "step": 980
768
  },
769
  {
770
- "epoch": 8.819599109131403,
771
- "grad_norm": 10.211363792419434,
772
- "learning_rate": 6.448412698412699e-06,
773
- "loss": 0.4619,
774
  "step": 990
775
  },
776
  {
777
- "epoch": 8.908685968819599,
778
- "grad_norm": 12.277856826782227,
779
- "learning_rate": 5.9523809523809525e-06,
780
- "loss": 0.4361,
 
 
 
 
 
 
 
 
 
781
  "step": 1000
782
  },
783
  {
784
- "epoch": 8.997772828507795,
785
- "grad_norm": 12.28085708618164,
786
- "learning_rate": 5.456349206349207e-06,
787
- "loss": 0.4605,
788
- "step": 1010
789
- },
790
- {
791
- "epoch": 8.997772828507795,
792
- "eval_accuracy": 0.8634940513462742,
793
- "eval_loss": 0.40621063113212585,
794
- "eval_runtime": 12.3051,
795
- "eval_samples_per_second": 129.783,
796
- "eval_steps_per_second": 4.063,
797
  "step": 1010
798
  },
799
  {
800
- "epoch": 9.086859688195991,
801
- "grad_norm": 8.358485221862793,
802
- "learning_rate": 4.96031746031746e-06,
803
- "loss": 0.4403,
804
  "step": 1020
805
  },
806
  {
807
- "epoch": 9.175946547884188,
808
- "grad_norm": 8.35409164428711,
809
- "learning_rate": 4.464285714285715e-06,
810
- "loss": 0.4514,
811
  "step": 1030
812
  },
813
  {
814
- "epoch": 9.265033407572384,
815
- "grad_norm": 10.057600021362305,
816
- "learning_rate": 3.968253968253968e-06,
817
- "loss": 0.427,
818
  "step": 1040
819
  },
820
  {
821
- "epoch": 9.35412026726058,
822
- "grad_norm": 7.57137393951416,
823
- "learning_rate": 3.4722222222222224e-06,
824
- "loss": 0.4619,
825
  "step": 1050
826
  },
827
  {
828
- "epoch": 9.443207126948774,
829
- "grad_norm": 9.249728202819824,
830
- "learning_rate": 2.9761904761904763e-06,
831
- "loss": 0.4606,
832
  "step": 1060
833
  },
834
  {
835
- "epoch": 9.53229398663697,
836
- "grad_norm": 10.303194046020508,
837
- "learning_rate": 2.48015873015873e-06,
838
- "loss": 0.508,
839
  "step": 1070
840
  },
841
  {
842
- "epoch": 9.621380846325167,
843
- "grad_norm": 11.307740211486816,
844
- "learning_rate": 1.984126984126984e-06,
845
- "loss": 0.4667,
846
  "step": 1080
847
  },
848
  {
849
- "epoch": 9.710467706013363,
850
- "grad_norm": 11.34073543548584,
851
- "learning_rate": 1.4880952380952381e-06,
852
- "loss": 0.4803,
853
  "step": 1090
854
  },
855
  {
856
- "epoch": 9.799554565701559,
857
- "grad_norm": 12.684574127197266,
858
- "learning_rate": 9.92063492063492e-07,
859
- "loss": 0.4613,
860
  "step": 1100
861
  },
862
  {
863
- "epoch": 9.888641425389755,
864
- "grad_norm": 12.430156707763672,
865
- "learning_rate": 4.96031746031746e-07,
866
- "loss": 0.434,
867
  "step": 1110
868
  },
869
  {
870
- "epoch": 9.977728285077951,
871
- "grad_norm": 19.223121643066406,
872
- "learning_rate": 0.0,
873
- "loss": 0.4765,
874
  "step": 1120
875
  },
876
  {
877
- "epoch": 9.977728285077951,
878
- "eval_accuracy": 0.8647463994990607,
879
- "eval_loss": 0.3980247676372528,
880
- "eval_runtime": 12.312,
881
- "eval_samples_per_second": 129.711,
882
- "eval_steps_per_second": 4.061,
883
- "step": 1120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
884
  },
885
  {
886
- "epoch": 9.977728285077951,
887
- "step": 1120,
888
- "total_flos": 3.6084187126879396e+18,
889
- "train_loss": 0.7042633635657174,
890
- "train_runtime": 2366.3213,
891
- "train_samples_per_second": 60.706,
892
- "train_steps_per_second": 0.473
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
893
  }
894
  ],
895
  "logging_steps": 10,
896
- "max_steps": 1120,
897
  "num_input_tokens_seen": 0,
898
- "num_train_epochs": 10,
899
  "save_steps": 500,
900
- "total_flos": 3.6084187126879396e+18,
901
  "train_batch_size": 32,
902
  "trial_name": null,
903
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.8703494926719278,
3
+ "best_model_checkpoint": "convnextv2-tiny-1k-224-finetuned-galaxy10-decals/checkpoint-2480",
4
+ "epoch": 19.879759519038075,
5
  "eval_steps": 500,
6
+ "global_step": 2480,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.08016032064128256,
13
+ "grad_norm": 5.443685531616211,
14
+ "learning_rate": 2.0161290322580646e-06,
15
+ "loss": 2.3476,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.16032064128256512,
20
+ "grad_norm": 5.888722896575928,
21
+ "learning_rate": 4.032258064516129e-06,
22
+ "loss": 2.3134,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.24048096192384769,
27
+ "grad_norm": 3.9038140773773193,
28
+ "learning_rate": 6.048387096774194e-06,
29
+ "loss": 2.2707,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.32064128256513025,
34
+ "grad_norm": 2.5614984035491943,
35
+ "learning_rate": 8.064516129032258e-06,
36
+ "loss": 2.2183,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.40080160320641284,
41
+ "grad_norm": 5.785397052764893,
42
+ "learning_rate": 1.0080645161290323e-05,
43
+ "loss": 2.1604,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.48096192384769537,
48
+ "grad_norm": 5.09072208404541,
49
+ "learning_rate": 1.2096774193548388e-05,
50
+ "loss": 2.0969,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.561122244488978,
55
+ "grad_norm": 2.9109528064727783,
56
+ "learning_rate": 1.4112903225806454e-05,
57
+ "loss": 2.0283,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.6412825651302605,
62
+ "grad_norm": 7.426329135894775,
63
+ "learning_rate": 1.6129032258064517e-05,
64
+ "loss": 1.9125,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.7214428857715431,
69
+ "grad_norm": 8.441859245300293,
70
+ "learning_rate": 1.8145161290322583e-05,
71
+ "loss": 1.7735,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.8016032064128257,
76
+ "grad_norm": 5.191440582275391,
77
+ "learning_rate": 2.0161290322580645e-05,
78
+ "loss": 1.6402,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.8817635270541082,
83
+ "grad_norm": 6.325778007507324,
84
+ "learning_rate": 2.217741935483871e-05,
85
+ "loss": 1.5225,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 0.9619238476953907,
90
+ "grad_norm": 7.641424655914307,
91
+ "learning_rate": 2.4193548387096777e-05,
92
+ "loss": 1.4287,
93
+ "step": 120
 
 
94
  },
95
  {
96
+ "epoch": 0.9939879759519038,
97
+ "eval_accuracy": 0.5851183765501691,
98
+ "eval_loss": 1.2978211641311646,
99
+ "eval_runtime": 23.4495,
100
+ "eval_samples_per_second": 75.652,
101
+ "eval_steps_per_second": 2.388,
102
+ "step": 124
103
  },
104
  {
105
+ "epoch": 1.0420841683366733,
106
+ "grad_norm": 9.333320617675781,
107
+ "learning_rate": 2.620967741935484e-05,
108
+ "loss": 1.3617,
109
  "step": 130
110
  },
111
  {
112
+ "epoch": 1.122244488977956,
113
+ "grad_norm": 10.036263465881348,
114
+ "learning_rate": 2.822580645161291e-05,
115
+ "loss": 1.3084,
116
  "step": 140
117
  },
118
  {
119
+ "epoch": 1.2024048096192386,
120
+ "grad_norm": 11.795063018798828,
121
+ "learning_rate": 3.024193548387097e-05,
122
+ "loss": 1.2472,
123
  "step": 150
124
  },
125
  {
126
+ "epoch": 1.282565130260521,
127
+ "grad_norm": 11.583420753479004,
128
+ "learning_rate": 3.2258064516129034e-05,
129
+ "loss": 1.1335,
130
  "step": 160
131
  },
132
  {
133
+ "epoch": 1.3627254509018036,
134
+ "grad_norm": 14.882524490356445,
135
+ "learning_rate": 3.427419354838709e-05,
136
+ "loss": 1.102,
137
  "step": 170
138
  },
139
  {
140
+ "epoch": 1.4428857715430863,
141
+ "grad_norm": 14.157336235046387,
142
+ "learning_rate": 3.6290322580645165e-05,
143
+ "loss": 1.049,
144
  "step": 180
145
  },
146
  {
147
+ "epoch": 1.5230460921843687,
148
+ "grad_norm": 10.484189987182617,
149
+ "learning_rate": 3.8306451612903224e-05,
150
+ "loss": 1.0445,
151
  "step": 190
152
  },
153
  {
154
+ "epoch": 1.6032064128256514,
155
+ "grad_norm": 14.128747940063477,
156
+ "learning_rate": 4.032258064516129e-05,
157
+ "loss": 0.995,
158
  "step": 200
159
  },
160
  {
161
+ "epoch": 1.6833667334669338,
162
+ "grad_norm": 9.768001556396484,
163
+ "learning_rate": 4.2338709677419356e-05,
164
+ "loss": 0.9573,
165
  "step": 210
166
  },
167
  {
168
+ "epoch": 1.7635270541082164,
169
+ "grad_norm": 13.823319435119629,
170
+ "learning_rate": 4.435483870967742e-05,
171
+ "loss": 0.9072,
172
  "step": 220
173
  },
174
  {
175
+ "epoch": 1.843687374749499,
176
+ "grad_norm": 9.132129669189453,
177
+ "learning_rate": 4.637096774193548e-05,
178
+ "loss": 0.8156,
 
 
 
 
 
 
 
 
 
179
  "step": 230
180
  },
181
  {
182
+ "epoch": 1.9238476953907817,
183
+ "grad_norm": 18.744123458862305,
184
+ "learning_rate": 4.8387096774193554e-05,
185
+ "loss": 0.8329,
186
  "step": 240
187
  },
188
  {
189
+ "epoch": 1.9959919839679359,
190
+ "eval_accuracy": 0.7728297632468997,
191
+ "eval_loss": 0.6986980438232422,
192
+ "eval_runtime": 11.7756,
193
+ "eval_samples_per_second": 150.65,
194
+ "eval_steps_per_second": 4.756,
195
+ "step": 249
196
+ },
197
+ {
198
+ "epoch": 2.004008016032064,
199
+ "grad_norm": 8.23229694366455,
200
+ "learning_rate": 4.995519713261649e-05,
201
+ "loss": 0.8108,
202
  "step": 250
203
  },
204
  {
205
+ "epoch": 2.0841683366733466,
206
+ "grad_norm": 13.277573585510254,
207
+ "learning_rate": 4.973118279569893e-05,
208
+ "loss": 0.7733,
209
  "step": 260
210
  },
211
  {
212
+ "epoch": 2.164328657314629,
213
+ "grad_norm": 13.641548156738281,
214
+ "learning_rate": 4.950716845878137e-05,
215
+ "loss": 0.7904,
216
  "step": 270
217
  },
218
  {
219
+ "epoch": 2.244488977955912,
220
+ "grad_norm": 18.69782829284668,
221
+ "learning_rate": 4.92831541218638e-05,
222
+ "loss": 0.7419,
223
  "step": 280
224
  },
225
  {
226
+ "epoch": 2.3246492985971945,
227
+ "grad_norm": 15.437073707580566,
228
+ "learning_rate": 4.905913978494624e-05,
229
+ "loss": 0.7647,
230
  "step": 290
231
  },
232
  {
233
+ "epoch": 2.404809619238477,
234
+ "grad_norm": 21.065357208251953,
235
+ "learning_rate": 4.8835125448028677e-05,
236
+ "loss": 0.7303,
237
  "step": 300
238
  },
239
  {
240
+ "epoch": 2.4849699398797593,
241
+ "grad_norm": 16.6332950592041,
242
+ "learning_rate": 4.8611111111111115e-05,
243
+ "loss": 0.764,
244
  "step": 310
245
  },
246
  {
247
+ "epoch": 2.565130260521042,
248
+ "grad_norm": 13.331892967224121,
249
+ "learning_rate": 4.8387096774193554e-05,
250
+ "loss": 0.7586,
251
  "step": 320
252
  },
253
  {
254
+ "epoch": 2.6452905811623246,
255
+ "grad_norm": 18.221023559570312,
256
+ "learning_rate": 4.8163082437275986e-05,
257
+ "loss": 0.7078,
258
  "step": 330
259
  },
260
  {
261
+ "epoch": 2.7254509018036073,
262
+ "grad_norm": 16.339580535888672,
263
+ "learning_rate": 4.7939068100358424e-05,
264
+ "loss": 0.7008,
 
 
 
 
 
 
 
 
 
265
  "step": 340
266
  },
267
  {
268
+ "epoch": 2.80561122244489,
269
+ "grad_norm": 12.270729064941406,
270
+ "learning_rate": 4.771505376344086e-05,
271
+ "loss": 0.7941,
272
  "step": 350
273
  },
274
  {
275
+ "epoch": 2.8857715430861726,
276
+ "grad_norm": 10.448567390441895,
277
+ "learning_rate": 4.74910394265233e-05,
278
+ "loss": 0.6575,
279
  "step": 360
280
  },
281
  {
282
+ "epoch": 2.9659318637274548,
283
+ "grad_norm": 12.076117515563965,
284
+ "learning_rate": 4.726702508960574e-05,
285
+ "loss": 0.7348,
286
  "step": 370
287
  },
288
  {
289
+ "epoch": 2.997995991983968,
290
+ "eval_accuracy": 0.8179255918827508,
291
+ "eval_loss": 0.5658715963363647,
292
+ "eval_runtime": 12.6995,
293
+ "eval_samples_per_second": 139.691,
294
+ "eval_steps_per_second": 4.41,
295
+ "step": 374
296
+ },
297
+ {
298
+ "epoch": 3.0460921843687374,
299
+ "grad_norm": 10.130627632141113,
300
+ "learning_rate": 4.704301075268818e-05,
301
+ "loss": 0.6752,
302
  "step": 380
303
  },
304
  {
305
+ "epoch": 3.12625250501002,
306
+ "grad_norm": 15.763223648071289,
307
+ "learning_rate": 4.681899641577061e-05,
308
+ "loss": 0.6367,
309
  "step": 390
310
  },
311
  {
312
+ "epoch": 3.2064128256513027,
313
+ "grad_norm": 13.580248832702637,
314
+ "learning_rate": 4.659498207885305e-05,
315
+ "loss": 0.6239,
316
  "step": 400
317
  },
318
  {
319
+ "epoch": 3.2865731462925853,
320
+ "grad_norm": 11.084284782409668,
321
+ "learning_rate": 4.637096774193548e-05,
322
+ "loss": 0.6857,
323
  "step": 410
324
  },
325
  {
326
+ "epoch": 3.3667334669338675,
327
+ "grad_norm": 12.55604362487793,
328
+ "learning_rate": 4.614695340501792e-05,
329
+ "loss": 0.6603,
330
  "step": 420
331
  },
332
  {
333
+ "epoch": 3.44689378757515,
334
+ "grad_norm": 9.201930046081543,
335
+ "learning_rate": 4.5922939068100365e-05,
336
+ "loss": 0.6514,
337
  "step": 430
338
  },
339
  {
340
+ "epoch": 3.527054108216433,
341
+ "grad_norm": 12.60409927368164,
342
+ "learning_rate": 4.56989247311828e-05,
343
+ "loss": 0.6237,
344
  "step": 440
345
  },
346
  {
347
+ "epoch": 3.6072144288577155,
348
+ "grad_norm": 21.13867950439453,
349
+ "learning_rate": 4.5474910394265236e-05,
350
+ "loss": 0.6698,
 
 
 
 
 
 
 
 
 
351
  "step": 450
352
  },
353
  {
354
+ "epoch": 3.687374749498998,
355
+ "grad_norm": 15.415425300598145,
356
+ "learning_rate": 4.5250896057347674e-05,
357
+ "loss": 0.6016,
358
  "step": 460
359
  },
360
  {
361
+ "epoch": 3.7675350701402808,
362
+ "grad_norm": 10.865119934082031,
363
+ "learning_rate": 4.5026881720430106e-05,
364
+ "loss": 0.633,
365
  "step": 470
366
  },
367
  {
368
+ "epoch": 3.847695390781563,
369
+ "grad_norm": 12.04747200012207,
370
+ "learning_rate": 4.4802867383512545e-05,
371
+ "loss": 0.686,
372
  "step": 480
373
  },
374
  {
375
+ "epoch": 3.9278557114228456,
376
+ "grad_norm": 15.560898780822754,
377
+ "learning_rate": 4.4578853046594983e-05,
378
+ "loss": 0.611,
379
  "step": 490
380
  },
381
  {
382
+ "epoch": 4.0,
383
+ "eval_accuracy": 0.8297632468996617,
384
+ "eval_loss": 0.5378695726394653,
385
+ "eval_runtime": 12.4744,
386
+ "eval_samples_per_second": 142.212,
387
+ "eval_steps_per_second": 4.489,
388
+ "step": 499
389
+ },
390
+ {
391
+ "epoch": 4.008016032064128,
392
+ "grad_norm": 12.36653995513916,
393
+ "learning_rate": 4.435483870967742e-05,
394
+ "loss": 0.6783,
395
  "step": 500
396
  },
397
  {
398
+ "epoch": 4.0881763527054105,
399
+ "grad_norm": 12.612045288085938,
400
+ "learning_rate": 4.413082437275986e-05,
401
+ "loss": 0.6591,
402
  "step": 510
403
  },
404
  {
405
+ "epoch": 4.168336673346693,
406
+ "grad_norm": 9.516498565673828,
407
+ "learning_rate": 4.390681003584229e-05,
408
+ "loss": 0.6409,
409
  "step": 520
410
  },
411
  {
412
+ "epoch": 4.248496993987976,
413
+ "grad_norm": 10.227922439575195,
414
+ "learning_rate": 4.368279569892473e-05,
415
+ "loss": 0.6113,
416
  "step": 530
417
  },
418
  {
419
+ "epoch": 4.328657314629258,
420
+ "grad_norm": 9.941215515136719,
421
+ "learning_rate": 4.345878136200717e-05,
422
+ "loss": 0.5545,
423
  "step": 540
424
  },
425
  {
426
+ "epoch": 4.408817635270541,
427
+ "grad_norm": 11.132833480834961,
428
+ "learning_rate": 4.323476702508961e-05,
429
+ "loss": 0.5876,
430
  "step": 550
431
  },
432
  {
433
+ "epoch": 4.488977955911824,
434
+ "grad_norm": 17.30998992919922,
435
+ "learning_rate": 4.301075268817205e-05,
436
+ "loss": 0.6059,
437
  "step": 560
438
  },
439
  {
440
+ "epoch": 4.569138276553106,
441
+ "grad_norm": 12.370113372802734,
442
+ "learning_rate": 4.2786738351254486e-05,
443
+ "loss": 0.572,
 
 
 
 
 
 
 
 
 
444
  "step": 570
445
  },
446
  {
447
+ "epoch": 4.649298597194389,
448
+ "grad_norm": 8.4649019241333,
449
+ "learning_rate": 4.256272401433692e-05,
450
+ "loss": 0.5474,
451
  "step": 580
452
  },
453
  {
454
+ "epoch": 4.729458917835672,
455
+ "grad_norm": 13.911017417907715,
456
+ "learning_rate": 4.2338709677419356e-05,
457
+ "loss": 0.5745,
458
  "step": 590
459
  },
460
  {
461
+ "epoch": 4.809619238476954,
462
+ "grad_norm": 13.061511993408203,
463
+ "learning_rate": 4.2114695340501795e-05,
464
+ "loss": 0.6151,
465
  "step": 600
466
  },
467
  {
468
+ "epoch": 4.889779559118237,
469
+ "grad_norm": 17.543981552124023,
470
+ "learning_rate": 4.1890681003584233e-05,
471
+ "loss": 0.5702,
472
  "step": 610
473
  },
474
  {
475
+ "epoch": 4.969939879759519,
476
+ "grad_norm": 14.352049827575684,
477
+ "learning_rate": 4.166666666666667e-05,
478
+ "loss": 0.5929,
479
  "step": 620
480
  },
481
  {
482
+ "epoch": 4.993987975951904,
483
+ "eval_accuracy": 0.8376550169109357,
484
+ "eval_loss": 0.4972485899925232,
485
+ "eval_runtime": 30.5452,
486
+ "eval_samples_per_second": 58.078,
487
+ "eval_steps_per_second": 1.833,
488
+ "step": 623
489
+ },
490
+ {
491
+ "epoch": 5.050100200400801,
492
+ "grad_norm": 12.319650650024414,
493
+ "learning_rate": 4.1442652329749104e-05,
494
+ "loss": 0.6066,
495
  "step": 630
496
  },
497
  {
498
+ "epoch": 5.130260521042084,
499
+ "grad_norm": 13.590569496154785,
500
+ "learning_rate": 4.121863799283154e-05,
501
+ "loss": 0.561,
502
  "step": 640
503
  },
504
  {
505
+ "epoch": 5.210420841683367,
506
+ "grad_norm": 6.548098087310791,
507
+ "learning_rate": 4.099462365591398e-05,
508
+ "loss": 0.5747,
509
  "step": 650
510
  },
511
  {
512
+ "epoch": 5.290581162324649,
513
+ "grad_norm": 11.081100463867188,
514
+ "learning_rate": 4.077060931899642e-05,
515
+ "loss": 0.5469,
516
  "step": 660
517
  },
518
  {
519
+ "epoch": 5.370741482965932,
520
+ "grad_norm": 11.028233528137207,
521
+ "learning_rate": 4.054659498207886e-05,
522
+ "loss": 0.5913,
523
  "step": 670
524
  },
525
  {
526
+ "epoch": 5.4509018036072145,
527
+ "grad_norm": 16.77172088623047,
528
+ "learning_rate": 4.032258064516129e-05,
529
+ "loss": 0.5454,
 
 
 
 
 
 
 
 
 
530
  "step": 680
531
  },
532
  {
533
+ "epoch": 5.531062124248497,
534
+ "grad_norm": 12.664057731628418,
535
+ "learning_rate": 4.009856630824373e-05,
536
+ "loss": 0.5376,
537
  "step": 690
538
  },
539
  {
540
+ "epoch": 5.61122244488978,
541
+ "grad_norm": 16.970354080200195,
542
+ "learning_rate": 3.987455197132617e-05,
543
+ "loss": 0.6195,
544
  "step": 700
545
  },
546
  {
547
+ "epoch": 5.6913827655310625,
548
+ "grad_norm": 20.813093185424805,
549
+ "learning_rate": 3.96505376344086e-05,
550
+ "loss": 0.5759,
551
  "step": 710
552
  },
553
  {
554
+ "epoch": 5.771543086172345,
555
+ "grad_norm": 12.778873443603516,
556
+ "learning_rate": 3.9426523297491045e-05,
557
+ "loss": 0.5636,
558
  "step": 720
559
  },
560
  {
561
+ "epoch": 5.851703406813627,
562
+ "grad_norm": 9.49085521697998,
563
+ "learning_rate": 3.9202508960573483e-05,
564
+ "loss": 0.5213,
565
  "step": 730
566
  },
567
  {
568
+ "epoch": 5.9318637274549095,
569
+ "grad_norm": 16.53606605529785,
570
+ "learning_rate": 3.8978494623655915e-05,
571
+ "loss": 0.5227,
572
  "step": 740
573
  },
574
  {
575
+ "epoch": 5.995991983967936,
576
+ "eval_accuracy": 0.8478015783540023,
577
+ "eval_loss": 0.4714604616165161,
578
+ "eval_runtime": 11.6058,
579
+ "eval_samples_per_second": 152.855,
580
+ "eval_steps_per_second": 4.825,
581
+ "step": 748
582
+ },
583
+ {
584
+ "epoch": 6.012024048096192,
585
+ "grad_norm": 15.601022720336914,
586
+ "learning_rate": 3.8754480286738354e-05,
587
+ "loss": 0.5287,
588
  "step": 750
589
  },
590
  {
591
+ "epoch": 6.092184368737475,
592
+ "grad_norm": 10.821127891540527,
593
+ "learning_rate": 3.8530465949820786e-05,
594
+ "loss": 0.5278,
595
  "step": 760
596
  },
597
  {
598
+ "epoch": 6.1723446893787575,
599
+ "grad_norm": 15.051136016845703,
600
+ "learning_rate": 3.8306451612903224e-05,
601
+ "loss": 0.5491,
602
  "step": 770
603
  },
604
  {
605
+ "epoch": 6.25250501002004,
606
+ "grad_norm": 13.785345077514648,
607
+ "learning_rate": 3.808243727598566e-05,
608
+ "loss": 0.4941,
609
  "step": 780
610
  },
611
  {
612
+ "epoch": 6.332665330661323,
613
+ "grad_norm": 12.571328163146973,
614
+ "learning_rate": 3.78584229390681e-05,
615
+ "loss": 0.4668,
 
 
 
 
 
 
 
 
 
616
  "step": 790
617
  },
618
  {
619
+ "epoch": 6.412825651302605,
620
+ "grad_norm": 14.443199157714844,
621
+ "learning_rate": 3.763440860215054e-05,
622
+ "loss": 0.5502,
623
  "step": 800
624
  },
625
  {
626
+ "epoch": 6.492985971943888,
627
+ "grad_norm": 12.781950950622559,
628
+ "learning_rate": 3.741039426523298e-05,
629
+ "loss": 0.5049,
630
  "step": 810
631
  },
632
  {
633
+ "epoch": 6.573146292585171,
634
+ "grad_norm": 8.832810401916504,
635
+ "learning_rate": 3.718637992831541e-05,
636
+ "loss": 0.5654,
637
  "step": 820
638
  },
639
  {
640
+ "epoch": 6.653306613226453,
641
+ "grad_norm": 13.026018142700195,
642
+ "learning_rate": 3.696236559139785e-05,
643
+ "loss": 0.5289,
644
  "step": 830
645
  },
646
  {
647
+ "epoch": 6.733466933867735,
648
+ "grad_norm": 11.173068046569824,
649
+ "learning_rate": 3.673835125448029e-05,
650
+ "loss": 0.5262,
651
  "step": 840
652
  },
653
  {
654
+ "epoch": 6.813627254509018,
655
+ "grad_norm": 15.73713207244873,
656
+ "learning_rate": 3.651433691756273e-05,
657
+ "loss": 0.5239,
658
  "step": 850
659
  },
660
  {
661
+ "epoch": 6.8937875751503,
662
+ "grad_norm": 11.182281494140625,
663
+ "learning_rate": 3.6290322580645165e-05,
664
+ "loss": 0.5269,
665
  "step": 860
666
  },
667
  {
668
+ "epoch": 6.973947895791583,
669
+ "grad_norm": 11.750397682189941,
670
+ "learning_rate": 3.60663082437276e-05,
671
+ "loss": 0.5166,
672
  "step": 870
673
  },
674
  {
675
+ "epoch": 6.997995991983968,
676
+ "eval_accuracy": 0.8494926719278467,
677
+ "eval_loss": 0.47609812021255493,
678
+ "eval_runtime": 22.6677,
679
+ "eval_samples_per_second": 78.261,
680
+ "eval_steps_per_second": 2.47,
681
+ "step": 873
682
+ },
683
+ {
684
+ "epoch": 7.054108216432866,
685
+ "grad_norm": 13.769631385803223,
686
+ "learning_rate": 3.5842293906810036e-05,
687
+ "loss": 0.5207,
688
  "step": 880
689
  },
690
  {
691
+ "epoch": 7.134268537074148,
692
+ "grad_norm": 13.10180377960205,
693
+ "learning_rate": 3.5618279569892474e-05,
694
+ "loss": 0.5231,
695
  "step": 890
696
  },
697
  {
698
+ "epoch": 7.214428857715431,
699
+ "grad_norm": 9.971457481384277,
700
+ "learning_rate": 3.539426523297491e-05,
701
+ "loss": 0.518,
 
 
 
 
 
 
 
 
 
702
  "step": 900
703
  },
704
  {
705
+ "epoch": 7.294589178356714,
706
+ "grad_norm": 12.092657089233398,
707
+ "learning_rate": 3.517025089605735e-05,
708
+ "loss": 0.5034,
709
  "step": 910
710
  },
711
  {
712
+ "epoch": 7.374749498997996,
713
+ "grad_norm": 19.348663330078125,
714
+ "learning_rate": 3.494623655913979e-05,
715
+ "loss": 0.4927,
716
  "step": 920
717
  },
718
  {
719
+ "epoch": 7.454909819639279,
720
+ "grad_norm": 10.206799507141113,
721
+ "learning_rate": 3.472222222222222e-05,
722
+ "loss": 0.5062,
723
  "step": 930
724
  },
725
  {
726
+ "epoch": 7.5350701402805615,
727
+ "grad_norm": 9.899465560913086,
728
+ "learning_rate": 3.449820788530466e-05,
729
+ "loss": 0.5037,
730
  "step": 940
731
  },
732
  {
733
+ "epoch": 7.615230460921843,
734
+ "grad_norm": 13.186443328857422,
735
+ "learning_rate": 3.427419354838709e-05,
736
+ "loss": 0.5159,
737
  "step": 950
738
  },
739
  {
740
+ "epoch": 7.695390781563126,
741
+ "grad_norm": 9.82767391204834,
742
+ "learning_rate": 3.405017921146954e-05,
743
+ "loss": 0.479,
744
  "step": 960
745
  },
746
  {
747
+ "epoch": 7.775551102204409,
748
+ "grad_norm": 9.09422492980957,
749
+ "learning_rate": 3.382616487455198e-05,
750
+ "loss": 0.5146,
751
  "step": 970
752
  },
753
  {
754
+ "epoch": 7.855711422845691,
755
+ "grad_norm": 11.1051025390625,
756
+ "learning_rate": 3.360215053763441e-05,
757
+ "loss": 0.4908,
758
  "step": 980
759
  },
760
  {
761
+ "epoch": 7.935871743486974,
762
+ "grad_norm": 9.16980266571045,
763
+ "learning_rate": 3.337813620071685e-05,
764
+ "loss": 0.4992,
765
  "step": 990
766
  },
767
  {
768
+ "epoch": 8.0,
769
+ "eval_accuracy": 0.8562570462232244,
770
+ "eval_loss": 0.432047575712204,
771
+ "eval_runtime": 17.8346,
772
+ "eval_samples_per_second": 99.469,
773
+ "eval_steps_per_second": 3.14,
774
+ "step": 998
775
+ },
776
+ {
777
+ "epoch": 8.016032064128256,
778
+ "grad_norm": 8.514852523803711,
779
+ "learning_rate": 3.3154121863799286e-05,
780
+ "loss": 0.4702,
781
  "step": 1000
782
  },
783
  {
784
+ "epoch": 8.09619238476954,
785
+ "grad_norm": 10.90427017211914,
786
+ "learning_rate": 3.293010752688172e-05,
787
+ "loss": 0.4809,
 
 
 
 
 
 
 
 
 
788
  "step": 1010
789
  },
790
  {
791
+ "epoch": 8.176352705410821,
792
+ "grad_norm": 13.75596809387207,
793
+ "learning_rate": 3.270609318996416e-05,
794
+ "loss": 0.4769,
795
  "step": 1020
796
  },
797
  {
798
+ "epoch": 8.256513026052104,
799
+ "grad_norm": 14.506204605102539,
800
+ "learning_rate": 3.24820788530466e-05,
801
+ "loss": 0.5157,
802
  "step": 1030
803
  },
804
  {
805
+ "epoch": 8.336673346693386,
806
+ "grad_norm": 10.074383735656738,
807
+ "learning_rate": 3.2258064516129034e-05,
808
+ "loss": 0.4916,
809
  "step": 1040
810
  },
811
  {
812
+ "epoch": 8.41683366733467,
813
+ "grad_norm": 13.06877326965332,
814
+ "learning_rate": 3.203405017921147e-05,
815
+ "loss": 0.4953,
816
  "step": 1050
817
  },
818
  {
819
+ "epoch": 8.496993987975952,
820
+ "grad_norm": 7.9596171379089355,
821
+ "learning_rate": 3.1810035842293904e-05,
822
+ "loss": 0.4879,
823
  "step": 1060
824
  },
825
  {
826
+ "epoch": 8.577154308617235,
827
+ "grad_norm": 11.05156135559082,
828
+ "learning_rate": 3.158602150537634e-05,
829
+ "loss": 0.4397,
830
  "step": 1070
831
  },
832
  {
833
+ "epoch": 8.657314629258517,
834
+ "grad_norm": 9.935453414916992,
835
+ "learning_rate": 3.136200716845878e-05,
836
+ "loss": 0.5141,
837
  "step": 1080
838
  },
839
  {
840
+ "epoch": 8.7374749498998,
841
+ "grad_norm": 9.928804397583008,
842
+ "learning_rate": 3.113799283154122e-05,
843
+ "loss": 0.4781,
844
  "step": 1090
845
  },
846
  {
847
+ "epoch": 8.817635270541082,
848
+ "grad_norm": 7.301691055297852,
849
+ "learning_rate": 3.091397849462366e-05,
850
+ "loss": 0.4484,
851
  "step": 1100
852
  },
853
  {
854
+ "epoch": 8.897795591182366,
855
+ "grad_norm": 13.609901428222656,
856
+ "learning_rate": 3.06899641577061e-05,
857
+ "loss": 0.4656,
858
  "step": 1110
859
  },
860
  {
861
+ "epoch": 8.977955911823647,
862
+ "grad_norm": 10.269015312194824,
863
+ "learning_rate": 3.046594982078853e-05,
864
+ "loss": 0.4528,
865
  "step": 1120
866
  },
867
  {
868
+ "epoch": 8.993987975951903,
869
+ "eval_accuracy": 0.8641488162344984,
870
+ "eval_loss": 0.4410019814968109,
871
+ "eval_runtime": 27.5679,
872
+ "eval_samples_per_second": 64.35,
873
+ "eval_steps_per_second": 2.031,
874
+ "step": 1122
875
+ },
876
+ {
877
+ "epoch": 9.05811623246493,
878
+ "grad_norm": 8.7766695022583,
879
+ "learning_rate": 3.024193548387097e-05,
880
+ "loss": 0.4843,
881
+ "step": 1130
882
+ },
883
+ {
884
+ "epoch": 9.138276553106213,
885
+ "grad_norm": 10.279314994812012,
886
+ "learning_rate": 3.0017921146953403e-05,
887
+ "loss": 0.5017,
888
+ "step": 1140
889
+ },
890
+ {
891
+ "epoch": 9.218436873747494,
892
+ "grad_norm": 13.840995788574219,
893
+ "learning_rate": 2.979390681003584e-05,
894
+ "loss": 0.4443,
895
+ "step": 1150
896
+ },
897
+ {
898
+ "epoch": 9.298597194388778,
899
+ "grad_norm": 14.21786117553711,
900
+ "learning_rate": 2.9569892473118284e-05,
901
+ "loss": 0.4319,
902
+ "step": 1160
903
+ },
904
+ {
905
+ "epoch": 9.37875751503006,
906
+ "grad_norm": 9.682762145996094,
907
+ "learning_rate": 2.9345878136200715e-05,
908
+ "loss": 0.4692,
909
+ "step": 1170
910
+ },
911
+ {
912
+ "epoch": 9.458917835671343,
913
+ "grad_norm": 12.985733985900879,
914
+ "learning_rate": 2.9121863799283154e-05,
915
+ "loss": 0.4221,
916
+ "step": 1180
917
+ },
918
+ {
919
+ "epoch": 9.539078156312625,
920
+ "grad_norm": 12.35405445098877,
921
+ "learning_rate": 2.8897849462365596e-05,
922
+ "loss": 0.4835,
923
+ "step": 1190
924
+ },
925
+ {
926
+ "epoch": 9.619238476953909,
927
+ "grad_norm": 7.067807197570801,
928
+ "learning_rate": 2.8673835125448028e-05,
929
+ "loss": 0.4548,
930
+ "step": 1200
931
+ },
932
+ {
933
+ "epoch": 9.69939879759519,
934
+ "grad_norm": 10.279123306274414,
935
+ "learning_rate": 2.8449820788530467e-05,
936
+ "loss": 0.4791,
937
+ "step": 1210
938
+ },
939
+ {
940
+ "epoch": 9.779559118236474,
941
+ "grad_norm": 12.814294815063477,
942
+ "learning_rate": 2.822580645161291e-05,
943
+ "loss": 0.4551,
944
+ "step": 1220
945
+ },
946
+ {
947
+ "epoch": 9.859719438877756,
948
+ "grad_norm": 15.132489204406738,
949
+ "learning_rate": 2.800179211469534e-05,
950
+ "loss": 0.3957,
951
+ "step": 1230
952
+ },
953
+ {
954
+ "epoch": 9.939879759519037,
955
+ "grad_norm": 11.959942817687988,
956
+ "learning_rate": 2.777777777777778e-05,
957
+ "loss": 0.4566,
958
+ "step": 1240
959
+ },
960
+ {
961
+ "epoch": 9.995991983967937,
962
+ "eval_accuracy": 0.8641488162344984,
963
+ "eval_loss": 0.42970511317253113,
964
+ "eval_runtime": 16.9001,
965
+ "eval_samples_per_second": 104.97,
966
+ "eval_steps_per_second": 3.314,
967
+ "step": 1247
968
+ },
969
+ {
970
+ "epoch": 10.02004008016032,
971
+ "grad_norm": 12.989178657531738,
972
+ "learning_rate": 2.7553763440860214e-05,
973
+ "loss": 0.4402,
974
+ "step": 1250
975
+ },
976
+ {
977
+ "epoch": 10.100200400801603,
978
+ "grad_norm": 11.985700607299805,
979
+ "learning_rate": 2.7329749103942653e-05,
980
+ "loss": 0.4544,
981
+ "step": 1260
982
+ },
983
+ {
984
+ "epoch": 10.180360721442886,
985
+ "grad_norm": 17.515514373779297,
986
+ "learning_rate": 2.710573476702509e-05,
987
+ "loss": 0.3676,
988
+ "step": 1270
989
+ },
990
+ {
991
+ "epoch": 10.260521042084168,
992
+ "grad_norm": 11.017400741577148,
993
+ "learning_rate": 2.6881720430107527e-05,
994
+ "loss": 0.4534,
995
+ "step": 1280
996
+ },
997
+ {
998
+ "epoch": 10.340681362725451,
999
+ "grad_norm": 7.939273357391357,
1000
+ "learning_rate": 2.6657706093189965e-05,
1001
+ "loss": 0.4719,
1002
+ "step": 1290
1003
+ },
1004
+ {
1005
+ "epoch": 10.420841683366733,
1006
+ "grad_norm": 13.53430461883545,
1007
+ "learning_rate": 2.6433691756272404e-05,
1008
+ "loss": 0.4162,
1009
+ "step": 1300
1010
+ },
1011
+ {
1012
+ "epoch": 10.501002004008017,
1013
+ "grad_norm": 9.599760055541992,
1014
+ "learning_rate": 2.620967741935484e-05,
1015
+ "loss": 0.46,
1016
+ "step": 1310
1017
+ },
1018
+ {
1019
+ "epoch": 10.581162324649299,
1020
+ "grad_norm": 7.481749057769775,
1021
+ "learning_rate": 2.5985663082437278e-05,
1022
+ "loss": 0.405,
1023
+ "step": 1320
1024
+ },
1025
+ {
1026
+ "epoch": 10.661322645290582,
1027
+ "grad_norm": 17.151025772094727,
1028
+ "learning_rate": 2.5761648745519713e-05,
1029
+ "loss": 0.4484,
1030
+ "step": 1330
1031
+ },
1032
+ {
1033
+ "epoch": 10.741482965931864,
1034
+ "grad_norm": 11.18791389465332,
1035
+ "learning_rate": 2.5537634408602152e-05,
1036
+ "loss": 0.4461,
1037
+ "step": 1340
1038
+ },
1039
+ {
1040
+ "epoch": 10.821643286573146,
1041
+ "grad_norm": 9.898661613464355,
1042
+ "learning_rate": 2.531362007168459e-05,
1043
+ "loss": 0.3958,
1044
+ "step": 1350
1045
+ },
1046
+ {
1047
+ "epoch": 10.901803607214429,
1048
+ "grad_norm": 9.442924499511719,
1049
+ "learning_rate": 2.5089605734767026e-05,
1050
+ "loss": 0.4279,
1051
+ "step": 1360
1052
+ },
1053
+ {
1054
+ "epoch": 10.98196392785571,
1055
+ "grad_norm": 11.045487403869629,
1056
+ "learning_rate": 2.4865591397849464e-05,
1057
+ "loss": 0.4294,
1058
+ "step": 1370
1059
+ },
1060
+ {
1061
+ "epoch": 10.997995991983968,
1062
+ "eval_accuracy": 0.8607666290868095,
1063
+ "eval_loss": 0.42823219299316406,
1064
+ "eval_runtime": 26.2766,
1065
+ "eval_samples_per_second": 67.512,
1066
+ "eval_steps_per_second": 2.131,
1067
+ "step": 1372
1068
+ },
1069
+ {
1070
+ "epoch": 11.062124248496994,
1071
+ "grad_norm": 9.318482398986816,
1072
+ "learning_rate": 2.46415770609319e-05,
1073
+ "loss": 0.4314,
1074
+ "step": 1380
1075
+ },
1076
+ {
1077
+ "epoch": 11.142284569138276,
1078
+ "grad_norm": 16.068525314331055,
1079
+ "learning_rate": 2.4417562724014338e-05,
1080
+ "loss": 0.3944,
1081
+ "step": 1390
1082
+ },
1083
+ {
1084
+ "epoch": 11.22244488977956,
1085
+ "grad_norm": 6.959997177124023,
1086
+ "learning_rate": 2.4193548387096777e-05,
1087
+ "loss": 0.4251,
1088
+ "step": 1400
1089
+ },
1090
+ {
1091
+ "epoch": 11.302605210420841,
1092
+ "grad_norm": 11.282358169555664,
1093
+ "learning_rate": 2.3969534050179212e-05,
1094
+ "loss": 0.4074,
1095
+ "step": 1410
1096
+ },
1097
+ {
1098
+ "epoch": 11.382765531062125,
1099
+ "grad_norm": 8.684910774230957,
1100
+ "learning_rate": 2.374551971326165e-05,
1101
+ "loss": 0.4426,
1102
+ "step": 1420
1103
+ },
1104
+ {
1105
+ "epoch": 11.462925851703407,
1106
+ "grad_norm": 11.480581283569336,
1107
+ "learning_rate": 2.352150537634409e-05,
1108
+ "loss": 0.413,
1109
+ "step": 1430
1110
+ },
1111
+ {
1112
+ "epoch": 11.54308617234469,
1113
+ "grad_norm": 10.927531242370605,
1114
+ "learning_rate": 2.3297491039426525e-05,
1115
+ "loss": 0.4338,
1116
+ "step": 1440
1117
+ },
1118
+ {
1119
+ "epoch": 11.623246492985972,
1120
+ "grad_norm": 10.118310928344727,
1121
+ "learning_rate": 2.307347670250896e-05,
1122
+ "loss": 0.4536,
1123
+ "step": 1450
1124
+ },
1125
+ {
1126
+ "epoch": 11.703406813627254,
1127
+ "grad_norm": 10.131954193115234,
1128
+ "learning_rate": 2.28494623655914e-05,
1129
+ "loss": 0.4164,
1130
+ "step": 1460
1131
+ },
1132
+ {
1133
+ "epoch": 11.783567134268537,
1134
+ "grad_norm": 14.58598804473877,
1135
+ "learning_rate": 2.2625448028673837e-05,
1136
+ "loss": 0.4342,
1137
+ "step": 1470
1138
+ },
1139
+ {
1140
+ "epoch": 11.863727454909819,
1141
+ "grad_norm": 12.672148704528809,
1142
+ "learning_rate": 2.2401433691756272e-05,
1143
+ "loss": 0.4393,
1144
+ "step": 1480
1145
+ },
1146
+ {
1147
+ "epoch": 11.943887775551103,
1148
+ "grad_norm": 11.871652603149414,
1149
+ "learning_rate": 2.217741935483871e-05,
1150
+ "loss": 0.3771,
1151
+ "step": 1490
1152
+ },
1153
+ {
1154
+ "epoch": 12.0,
1155
+ "eval_accuracy": 0.85456595264938,
1156
+ "eval_loss": 0.4546312391757965,
1157
+ "eval_runtime": 21.2281,
1158
+ "eval_samples_per_second": 83.569,
1159
+ "eval_steps_per_second": 2.638,
1160
+ "step": 1497
1161
+ },
1162
+ {
1163
+ "epoch": 12.024048096192384,
1164
+ "grad_norm": 12.87192440032959,
1165
+ "learning_rate": 2.1953405017921146e-05,
1166
+ "loss": 0.404,
1167
+ "step": 1500
1168
+ },
1169
+ {
1170
+ "epoch": 12.104208416833668,
1171
+ "grad_norm": 11.67623519897461,
1172
+ "learning_rate": 2.1729390681003585e-05,
1173
+ "loss": 0.3968,
1174
+ "step": 1510
1175
+ },
1176
+ {
1177
+ "epoch": 12.18436873747495,
1178
+ "grad_norm": 11.608409881591797,
1179
+ "learning_rate": 2.1505376344086024e-05,
1180
+ "loss": 0.3809,
1181
+ "step": 1520
1182
+ },
1183
+ {
1184
+ "epoch": 12.264529058116233,
1185
+ "grad_norm": 9.568375587463379,
1186
+ "learning_rate": 2.128136200716846e-05,
1187
+ "loss": 0.4135,
1188
+ "step": 1530
1189
+ },
1190
+ {
1191
+ "epoch": 12.344689378757515,
1192
+ "grad_norm": 10.64120864868164,
1193
+ "learning_rate": 2.1057347670250897e-05,
1194
+ "loss": 0.411,
1195
+ "step": 1540
1196
+ },
1197
+ {
1198
+ "epoch": 12.424849699398798,
1199
+ "grad_norm": 9.730778694152832,
1200
+ "learning_rate": 2.0833333333333336e-05,
1201
+ "loss": 0.3948,
1202
+ "step": 1550
1203
+ },
1204
+ {
1205
+ "epoch": 12.50501002004008,
1206
+ "grad_norm": 11.325265884399414,
1207
+ "learning_rate": 2.060931899641577e-05,
1208
+ "loss": 0.403,
1209
+ "step": 1560
1210
+ },
1211
+ {
1212
+ "epoch": 12.585170340681362,
1213
+ "grad_norm": 13.892471313476562,
1214
+ "learning_rate": 2.038530465949821e-05,
1215
+ "loss": 0.4393,
1216
+ "step": 1570
1217
+ },
1218
+ {
1219
+ "epoch": 12.665330661322646,
1220
+ "grad_norm": 14.78463363647461,
1221
+ "learning_rate": 2.0161290322580645e-05,
1222
+ "loss": 0.3784,
1223
+ "step": 1580
1224
+ },
1225
+ {
1226
+ "epoch": 12.745490981963927,
1227
+ "grad_norm": 14.130833625793457,
1228
+ "learning_rate": 1.9937275985663084e-05,
1229
+ "loss": 0.425,
1230
+ "step": 1590
1231
+ },
1232
+ {
1233
+ "epoch": 12.82565130260521,
1234
+ "grad_norm": 8.856616973876953,
1235
+ "learning_rate": 1.9713261648745522e-05,
1236
+ "loss": 0.3804,
1237
+ "step": 1600
1238
+ },
1239
+ {
1240
+ "epoch": 12.905811623246493,
1241
+ "grad_norm": 10.982331275939941,
1242
+ "learning_rate": 1.9489247311827958e-05,
1243
+ "loss": 0.4252,
1244
+ "step": 1610
1245
+ },
1246
+ {
1247
+ "epoch": 12.985971943887776,
1248
+ "grad_norm": 9.618106842041016,
1249
+ "learning_rate": 1.9265232974910393e-05,
1250
+ "loss": 0.4224,
1251
+ "step": 1620
1252
+ },
1253
+ {
1254
+ "epoch": 12.993987975951903,
1255
+ "eval_accuracy": 0.8624577226606539,
1256
+ "eval_loss": 0.448898583650589,
1257
+ "eval_runtime": 17.755,
1258
+ "eval_samples_per_second": 99.916,
1259
+ "eval_steps_per_second": 3.154,
1260
+ "step": 1621
1261
+ },
1262
+ {
1263
+ "epoch": 13.066132264529058,
1264
+ "grad_norm": 18.403636932373047,
1265
+ "learning_rate": 1.904121863799283e-05,
1266
+ "loss": 0.4142,
1267
+ "step": 1630
1268
+ },
1269
+ {
1270
+ "epoch": 13.146292585170341,
1271
+ "grad_norm": 12.012832641601562,
1272
+ "learning_rate": 1.881720430107527e-05,
1273
+ "loss": 0.3976,
1274
+ "step": 1640
1275
+ },
1276
+ {
1277
+ "epoch": 13.226452905811623,
1278
+ "grad_norm": 10.503453254699707,
1279
+ "learning_rate": 1.8593189964157705e-05,
1280
+ "loss": 0.3707,
1281
+ "step": 1650
1282
+ },
1283
+ {
1284
+ "epoch": 13.306613226452907,
1285
+ "grad_norm": 12.286235809326172,
1286
+ "learning_rate": 1.8369175627240144e-05,
1287
+ "loss": 0.4056,
1288
+ "step": 1660
1289
+ },
1290
+ {
1291
+ "epoch": 13.386773547094188,
1292
+ "grad_norm": 9.312376976013184,
1293
+ "learning_rate": 1.8145161290322583e-05,
1294
+ "loss": 0.4226,
1295
+ "step": 1670
1296
+ },
1297
+ {
1298
+ "epoch": 13.46693386773547,
1299
+ "grad_norm": 9.602310180664062,
1300
+ "learning_rate": 1.7921146953405018e-05,
1301
+ "loss": 0.3638,
1302
+ "step": 1680
1303
+ },
1304
+ {
1305
+ "epoch": 13.547094188376754,
1306
+ "grad_norm": 10.584216117858887,
1307
+ "learning_rate": 1.7697132616487457e-05,
1308
+ "loss": 0.3955,
1309
+ "step": 1690
1310
+ },
1311
+ {
1312
+ "epoch": 13.627254509018035,
1313
+ "grad_norm": 9.666451454162598,
1314
+ "learning_rate": 1.7473118279569895e-05,
1315
+ "loss": 0.3895,
1316
+ "step": 1700
1317
  },
1318
  {
1319
+ "epoch": 13.707414829659319,
1320
+ "grad_norm": 14.383480072021484,
1321
+ "learning_rate": 1.724910394265233e-05,
1322
+ "loss": 0.4076,
1323
+ "step": 1710
1324
+ },
1325
+ {
1326
+ "epoch": 13.7875751503006,
1327
+ "grad_norm": 9.302132606506348,
1328
+ "learning_rate": 1.702508960573477e-05,
1329
+ "loss": 0.4342,
1330
+ "step": 1720
1331
+ },
1332
+ {
1333
+ "epoch": 13.867735470941884,
1334
+ "grad_norm": 24.193918228149414,
1335
+ "learning_rate": 1.6801075268817204e-05,
1336
+ "loss": 0.4005,
1337
+ "step": 1730
1338
+ },
1339
+ {
1340
+ "epoch": 13.947895791583166,
1341
+ "grad_norm": 14.272506713867188,
1342
+ "learning_rate": 1.6577060931899643e-05,
1343
+ "loss": 0.4099,
1344
+ "step": 1740
1345
+ },
1346
+ {
1347
+ "epoch": 13.995991983967937,
1348
+ "eval_accuracy": 0.8624577226606539,
1349
+ "eval_loss": 0.4411380887031555,
1350
+ "eval_runtime": 11.1197,
1351
+ "eval_samples_per_second": 159.537,
1352
+ "eval_steps_per_second": 5.036,
1353
+ "step": 1746
1354
+ },
1355
+ {
1356
+ "epoch": 14.02805611222445,
1357
+ "grad_norm": 10.529751777648926,
1358
+ "learning_rate": 1.635304659498208e-05,
1359
+ "loss": 0.3849,
1360
+ "step": 1750
1361
+ },
1362
+ {
1363
+ "epoch": 14.108216432865731,
1364
+ "grad_norm": 9.820696830749512,
1365
+ "learning_rate": 1.6129032258064517e-05,
1366
+ "loss": 0.4455,
1367
+ "step": 1760
1368
+ },
1369
+ {
1370
+ "epoch": 14.188376753507015,
1371
+ "grad_norm": 8.576085090637207,
1372
+ "learning_rate": 1.5905017921146952e-05,
1373
+ "loss": 0.3852,
1374
+ "step": 1770
1375
+ },
1376
+ {
1377
+ "epoch": 14.268537074148297,
1378
+ "grad_norm": 11.380485534667969,
1379
+ "learning_rate": 1.568100358422939e-05,
1380
+ "loss": 0.367,
1381
+ "step": 1780
1382
+ },
1383
+ {
1384
+ "epoch": 14.348697394789578,
1385
+ "grad_norm": 12.20594310760498,
1386
+ "learning_rate": 1.545698924731183e-05,
1387
+ "loss": 0.3974,
1388
+ "step": 1790
1389
+ },
1390
+ {
1391
+ "epoch": 14.428857715430862,
1392
+ "grad_norm": 11.483406066894531,
1393
+ "learning_rate": 1.5232974910394265e-05,
1394
+ "loss": 0.3949,
1395
+ "step": 1800
1396
+ },
1397
+ {
1398
+ "epoch": 14.509018036072144,
1399
+ "grad_norm": 9.642448425292969,
1400
+ "learning_rate": 1.5008960573476701e-05,
1401
+ "loss": 0.4097,
1402
+ "step": 1810
1403
+ },
1404
+ {
1405
+ "epoch": 14.589178356713427,
1406
+ "grad_norm": 10.316274642944336,
1407
+ "learning_rate": 1.4784946236559142e-05,
1408
+ "loss": 0.3393,
1409
+ "step": 1820
1410
+ },
1411
+ {
1412
+ "epoch": 14.669338677354709,
1413
+ "grad_norm": 10.923069953918457,
1414
+ "learning_rate": 1.4560931899641577e-05,
1415
+ "loss": 0.3562,
1416
+ "step": 1830
1417
+ },
1418
+ {
1419
+ "epoch": 14.749498997995993,
1420
+ "grad_norm": 9.884988784790039,
1421
+ "learning_rate": 1.4336917562724014e-05,
1422
+ "loss": 0.3581,
1423
+ "step": 1840
1424
+ },
1425
+ {
1426
+ "epoch": 14.829659318637274,
1427
+ "grad_norm": 8.461724281311035,
1428
+ "learning_rate": 1.4112903225806454e-05,
1429
+ "loss": 0.3775,
1430
+ "step": 1850
1431
+ },
1432
+ {
1433
+ "epoch": 14.909819639278558,
1434
+ "grad_norm": 8.028816223144531,
1435
+ "learning_rate": 1.388888888888889e-05,
1436
+ "loss": 0.3968,
1437
+ "step": 1860
1438
+ },
1439
+ {
1440
+ "epoch": 14.98997995991984,
1441
+ "grad_norm": 13.816071510314941,
1442
+ "learning_rate": 1.3664874551971326e-05,
1443
+ "loss": 0.3759,
1444
+ "step": 1870
1445
+ },
1446
+ {
1447
+ "epoch": 14.997995991983968,
1448
+ "eval_accuracy": 0.8652762119503946,
1449
+ "eval_loss": 0.43173447251319885,
1450
+ "eval_runtime": 13.0155,
1451
+ "eval_samples_per_second": 136.299,
1452
+ "eval_steps_per_second": 4.303,
1453
+ "step": 1871
1454
+ },
1455
+ {
1456
+ "epoch": 15.070140280561123,
1457
+ "grad_norm": 10.104082107543945,
1458
+ "learning_rate": 1.3440860215053763e-05,
1459
+ "loss": 0.3643,
1460
+ "step": 1880
1461
+ },
1462
+ {
1463
+ "epoch": 15.150300601202405,
1464
+ "grad_norm": 9.357542991638184,
1465
+ "learning_rate": 1.3216845878136202e-05,
1466
+ "loss": 0.3912,
1467
+ "step": 1890
1468
+ },
1469
+ {
1470
+ "epoch": 15.230460921843687,
1471
+ "grad_norm": 9.616016387939453,
1472
+ "learning_rate": 1.2992831541218639e-05,
1473
+ "loss": 0.3554,
1474
+ "step": 1900
1475
+ },
1476
+ {
1477
+ "epoch": 15.31062124248497,
1478
+ "grad_norm": 11.787483215332031,
1479
+ "learning_rate": 1.2768817204301076e-05,
1480
+ "loss": 0.3695,
1481
+ "step": 1910
1482
+ },
1483
+ {
1484
+ "epoch": 15.390781563126252,
1485
+ "grad_norm": 12.30813980102539,
1486
+ "learning_rate": 1.2544802867383513e-05,
1487
+ "loss": 0.4177,
1488
+ "step": 1920
1489
+ },
1490
+ {
1491
+ "epoch": 15.470941883767535,
1492
+ "grad_norm": 12.10972785949707,
1493
+ "learning_rate": 1.232078853046595e-05,
1494
+ "loss": 0.3606,
1495
+ "step": 1930
1496
+ },
1497
+ {
1498
+ "epoch": 15.551102204408817,
1499
+ "grad_norm": 9.871501922607422,
1500
+ "learning_rate": 1.2096774193548388e-05,
1501
+ "loss": 0.3627,
1502
+ "step": 1940
1503
+ },
1504
+ {
1505
+ "epoch": 15.6312625250501,
1506
+ "grad_norm": 9.91009521484375,
1507
+ "learning_rate": 1.1872759856630825e-05,
1508
+ "loss": 0.363,
1509
+ "step": 1950
1510
+ },
1511
+ {
1512
+ "epoch": 15.711422845691382,
1513
+ "grad_norm": 9.586908340454102,
1514
+ "learning_rate": 1.1648745519713262e-05,
1515
+ "loss": 0.3761,
1516
+ "step": 1960
1517
+ },
1518
+ {
1519
+ "epoch": 15.791583166332666,
1520
+ "grad_norm": 13.530887603759766,
1521
+ "learning_rate": 1.14247311827957e-05,
1522
+ "loss": 0.3437,
1523
+ "step": 1970
1524
+ },
1525
+ {
1526
+ "epoch": 15.871743486973948,
1527
+ "grad_norm": 10.21368408203125,
1528
+ "learning_rate": 1.1200716845878136e-05,
1529
+ "loss": 0.3575,
1530
+ "step": 1980
1531
+ },
1532
+ {
1533
+ "epoch": 15.951903807615231,
1534
+ "grad_norm": 10.05925178527832,
1535
+ "learning_rate": 1.0976702508960573e-05,
1536
+ "loss": 0.3692,
1537
+ "step": 1990
1538
+ },
1539
+ {
1540
+ "epoch": 16.0,
1541
+ "eval_accuracy": 0.863021420518602,
1542
+ "eval_loss": 0.43041756749153137,
1543
+ "eval_runtime": 13.2621,
1544
+ "eval_samples_per_second": 133.765,
1545
+ "eval_steps_per_second": 4.223,
1546
+ "step": 1996
1547
+ },
1548
+ {
1549
+ "epoch": 16.03206412825651,
1550
+ "grad_norm": 8.98168659210205,
1551
+ "learning_rate": 1.0752688172043012e-05,
1552
+ "loss": 0.3292,
1553
+ "step": 2000
1554
+ },
1555
+ {
1556
+ "epoch": 16.112224448897795,
1557
+ "grad_norm": 13.636373519897461,
1558
+ "learning_rate": 1.0528673835125449e-05,
1559
+ "loss": 0.3546,
1560
+ "step": 2010
1561
+ },
1562
+ {
1563
+ "epoch": 16.19238476953908,
1564
+ "grad_norm": 8.504558563232422,
1565
+ "learning_rate": 1.0304659498207886e-05,
1566
+ "loss": 0.3346,
1567
+ "step": 2020
1568
+ },
1569
+ {
1570
+ "epoch": 16.272545090180362,
1571
+ "grad_norm": 12.934548377990723,
1572
+ "learning_rate": 1.0080645161290323e-05,
1573
+ "loss": 0.3453,
1574
+ "step": 2030
1575
+ },
1576
+ {
1577
+ "epoch": 16.352705410821642,
1578
+ "grad_norm": 10.473663330078125,
1579
+ "learning_rate": 9.856630824372761e-06,
1580
+ "loss": 0.3545,
1581
+ "step": 2040
1582
+ },
1583
+ {
1584
+ "epoch": 16.432865731462925,
1585
+ "grad_norm": 12.263561248779297,
1586
+ "learning_rate": 9.632616487455196e-06,
1587
+ "loss": 0.3487,
1588
+ "step": 2050
1589
+ },
1590
+ {
1591
+ "epoch": 16.51302605210421,
1592
+ "grad_norm": 8.574410438537598,
1593
+ "learning_rate": 9.408602150537635e-06,
1594
+ "loss": 0.4023,
1595
+ "step": 2060
1596
+ },
1597
+ {
1598
+ "epoch": 16.593186372745492,
1599
+ "grad_norm": 20.604251861572266,
1600
+ "learning_rate": 9.184587813620072e-06,
1601
+ "loss": 0.3941,
1602
+ "step": 2070
1603
+ },
1604
+ {
1605
+ "epoch": 16.673346693386772,
1606
+ "grad_norm": 15.173372268676758,
1607
+ "learning_rate": 8.960573476702509e-06,
1608
+ "loss": 0.3716,
1609
+ "step": 2080
1610
+ },
1611
+ {
1612
+ "epoch": 16.753507014028056,
1613
+ "grad_norm": 10.56511116027832,
1614
+ "learning_rate": 8.736559139784948e-06,
1615
+ "loss": 0.3873,
1616
+ "step": 2090
1617
+ },
1618
+ {
1619
+ "epoch": 16.83366733466934,
1620
+ "grad_norm": 12.560215950012207,
1621
+ "learning_rate": 8.512544802867385e-06,
1622
+ "loss": 0.332,
1623
+ "step": 2100
1624
+ },
1625
+ {
1626
+ "epoch": 16.91382765531062,
1627
+ "grad_norm": 12.049774169921875,
1628
+ "learning_rate": 8.288530465949821e-06,
1629
+ "loss": 0.3394,
1630
+ "step": 2110
1631
+ },
1632
+ {
1633
+ "epoch": 16.993987975951903,
1634
+ "grad_norm": 12.533961296081543,
1635
+ "learning_rate": 8.064516129032258e-06,
1636
+ "loss": 0.364,
1637
+ "step": 2120
1638
+ },
1639
+ {
1640
+ "epoch": 16.993987975951903,
1641
+ "eval_accuracy": 0.8664036076662909,
1642
+ "eval_loss": 0.4329654276371002,
1643
+ "eval_runtime": 11.1171,
1644
+ "eval_samples_per_second": 159.574,
1645
+ "eval_steps_per_second": 5.037,
1646
+ "step": 2120
1647
+ },
1648
+ {
1649
+ "epoch": 17.074148296593187,
1650
+ "grad_norm": 11.729970932006836,
1651
+ "learning_rate": 7.840501792114695e-06,
1652
+ "loss": 0.3719,
1653
+ "step": 2130
1654
+ },
1655
+ {
1656
+ "epoch": 17.15430861723447,
1657
+ "grad_norm": 8.87394905090332,
1658
+ "learning_rate": 7.616487455197132e-06,
1659
+ "loss": 0.3165,
1660
+ "step": 2140
1661
+ },
1662
+ {
1663
+ "epoch": 17.23446893787575,
1664
+ "grad_norm": 11.172150611877441,
1665
+ "learning_rate": 7.392473118279571e-06,
1666
+ "loss": 0.371,
1667
+ "step": 2150
1668
+ },
1669
+ {
1670
+ "epoch": 17.314629258517034,
1671
+ "grad_norm": 17.362049102783203,
1672
+ "learning_rate": 7.168458781362007e-06,
1673
+ "loss": 0.3607,
1674
+ "step": 2160
1675
+ },
1676
+ {
1677
+ "epoch": 17.394789579158317,
1678
+ "grad_norm": 9.348087310791016,
1679
+ "learning_rate": 6.944444444444445e-06,
1680
+ "loss": 0.3837,
1681
+ "step": 2170
1682
+ },
1683
+ {
1684
+ "epoch": 17.4749498997996,
1685
+ "grad_norm": 11.758851051330566,
1686
+ "learning_rate": 6.720430107526882e-06,
1687
+ "loss": 0.3847,
1688
+ "step": 2180
1689
+ },
1690
+ {
1691
+ "epoch": 17.55511022044088,
1692
+ "grad_norm": 12.436318397521973,
1693
+ "learning_rate": 6.4964157706093195e-06,
1694
+ "loss": 0.3541,
1695
+ "step": 2190
1696
+ },
1697
+ {
1698
+ "epoch": 17.635270541082164,
1699
+ "grad_norm": 8.334653854370117,
1700
+ "learning_rate": 6.2724014336917564e-06,
1701
+ "loss": 0.3638,
1702
+ "step": 2200
1703
+ },
1704
+ {
1705
+ "epoch": 17.715430861723448,
1706
+ "grad_norm": 13.402957916259766,
1707
+ "learning_rate": 6.048387096774194e-06,
1708
+ "loss": 0.3178,
1709
+ "step": 2210
1710
+ },
1711
+ {
1712
+ "epoch": 17.79559118236473,
1713
+ "grad_norm": 7.792669296264648,
1714
+ "learning_rate": 5.824372759856631e-06,
1715
+ "loss": 0.3553,
1716
+ "step": 2220
1717
+ },
1718
+ {
1719
+ "epoch": 17.87575150300601,
1720
+ "grad_norm": 16.891786575317383,
1721
+ "learning_rate": 5.600358422939068e-06,
1722
+ "loss": 0.357,
1723
+ "step": 2230
1724
+ },
1725
+ {
1726
+ "epoch": 17.955911823647295,
1727
+ "grad_norm": 8.470565795898438,
1728
+ "learning_rate": 5.376344086021506e-06,
1729
+ "loss": 0.3636,
1730
+ "step": 2240
1731
+ },
1732
+ {
1733
+ "epoch": 17.995991983967937,
1734
+ "eval_accuracy": 0.8680947012401353,
1735
+ "eval_loss": 0.4249646067619324,
1736
+ "eval_runtime": 23.3665,
1737
+ "eval_samples_per_second": 75.921,
1738
+ "eval_steps_per_second": 2.397,
1739
+ "step": 2245
1740
+ },
1741
+ {
1742
+ "epoch": 18.03607214428858,
1743
+ "grad_norm": 8.499645233154297,
1744
+ "learning_rate": 5.152329749103943e-06,
1745
+ "loss": 0.3596,
1746
+ "step": 2250
1747
+ },
1748
+ {
1749
+ "epoch": 18.11623246492986,
1750
+ "grad_norm": 12.444930076599121,
1751
+ "learning_rate": 4.928315412186381e-06,
1752
+ "loss": 0.2935,
1753
+ "step": 2260
1754
+ },
1755
+ {
1756
+ "epoch": 18.196392785571142,
1757
+ "grad_norm": 11.180887222290039,
1758
+ "learning_rate": 4.7043010752688175e-06,
1759
+ "loss": 0.3253,
1760
+ "step": 2270
1761
+ },
1762
+ {
1763
+ "epoch": 18.276553106212425,
1764
+ "grad_norm": 8.559268951416016,
1765
+ "learning_rate": 4.4802867383512545e-06,
1766
+ "loss": 0.3427,
1767
+ "step": 2280
1768
+ },
1769
+ {
1770
+ "epoch": 18.35671342685371,
1771
+ "grad_norm": 15.954803466796875,
1772
+ "learning_rate": 4.256272401433692e-06,
1773
+ "loss": 0.3657,
1774
+ "step": 2290
1775
+ },
1776
+ {
1777
+ "epoch": 18.43687374749499,
1778
+ "grad_norm": 12.122089385986328,
1779
+ "learning_rate": 4.032258064516129e-06,
1780
+ "loss": 0.3101,
1781
+ "step": 2300
1782
+ },
1783
+ {
1784
+ "epoch": 18.517034068136272,
1785
+ "grad_norm": 7.729813575744629,
1786
+ "learning_rate": 3.808243727598566e-06,
1787
+ "loss": 0.3266,
1788
+ "step": 2310
1789
+ },
1790
+ {
1791
+ "epoch": 18.597194388777556,
1792
+ "grad_norm": 8.754435539245605,
1793
+ "learning_rate": 3.5842293906810035e-06,
1794
+ "loss": 0.3337,
1795
+ "step": 2320
1796
+ },
1797
+ {
1798
+ "epoch": 18.677354709418836,
1799
+ "grad_norm": 12.22318172454834,
1800
+ "learning_rate": 3.360215053763441e-06,
1801
+ "loss": 0.3262,
1802
+ "step": 2330
1803
+ },
1804
+ {
1805
+ "epoch": 18.75751503006012,
1806
+ "grad_norm": 11.190741539001465,
1807
+ "learning_rate": 3.1362007168458782e-06,
1808
+ "loss": 0.3537,
1809
+ "step": 2340
1810
+ },
1811
+ {
1812
+ "epoch": 18.837675350701403,
1813
+ "grad_norm": 12.088068008422852,
1814
+ "learning_rate": 2.9121863799283156e-06,
1815
+ "loss": 0.3034,
1816
+ "step": 2350
1817
+ },
1818
+ {
1819
+ "epoch": 18.917835671342687,
1820
+ "grad_norm": 8.4877347946167,
1821
+ "learning_rate": 2.688172043010753e-06,
1822
+ "loss": 0.3537,
1823
+ "step": 2360
1824
+ },
1825
+ {
1826
+ "epoch": 18.997995991983966,
1827
+ "grad_norm": 12.145883560180664,
1828
+ "learning_rate": 2.4641577060931903e-06,
1829
+ "loss": 0.3396,
1830
+ "step": 2370
1831
+ },
1832
+ {
1833
+ "epoch": 18.997995991983966,
1834
+ "eval_accuracy": 0.8675310033821871,
1835
+ "eval_loss": 0.4275255799293518,
1836
+ "eval_runtime": 20.9813,
1837
+ "eval_samples_per_second": 84.551,
1838
+ "eval_steps_per_second": 2.669,
1839
+ "step": 2370
1840
+ },
1841
+ {
1842
+ "epoch": 19.07815631262525,
1843
+ "grad_norm": 12.138285636901855,
1844
+ "learning_rate": 2.2401433691756272e-06,
1845
+ "loss": 0.3464,
1846
+ "step": 2380
1847
+ },
1848
+ {
1849
+ "epoch": 19.158316633266534,
1850
+ "grad_norm": 15.26742172241211,
1851
+ "learning_rate": 2.0161290322580646e-06,
1852
+ "loss": 0.342,
1853
+ "step": 2390
1854
+ },
1855
+ {
1856
+ "epoch": 19.238476953907817,
1857
+ "grad_norm": 11.72075080871582,
1858
+ "learning_rate": 1.7921146953405017e-06,
1859
+ "loss": 0.3159,
1860
+ "step": 2400
1861
+ },
1862
+ {
1863
+ "epoch": 19.318637274549097,
1864
+ "grad_norm": 8.406167984008789,
1865
+ "learning_rate": 1.5681003584229391e-06,
1866
+ "loss": 0.2976,
1867
+ "step": 2410
1868
+ },
1869
+ {
1870
+ "epoch": 19.39879759519038,
1871
+ "grad_norm": 9.558813095092773,
1872
+ "learning_rate": 1.3440860215053765e-06,
1873
+ "loss": 0.3713,
1874
+ "step": 2420
1875
+ },
1876
+ {
1877
+ "epoch": 19.478957915831664,
1878
+ "grad_norm": 12.249772071838379,
1879
+ "learning_rate": 1.1200716845878136e-06,
1880
+ "loss": 0.3353,
1881
+ "step": 2430
1882
+ },
1883
+ {
1884
+ "epoch": 19.559118236472948,
1885
+ "grad_norm": 14.776098251342773,
1886
+ "learning_rate": 8.960573476702509e-07,
1887
+ "loss": 0.3507,
1888
+ "step": 2440
1889
+ },
1890
+ {
1891
+ "epoch": 19.639278557114228,
1892
+ "grad_norm": 12.802459716796875,
1893
+ "learning_rate": 6.720430107526882e-07,
1894
+ "loss": 0.3144,
1895
+ "step": 2450
1896
+ },
1897
+ {
1898
+ "epoch": 19.71943887775551,
1899
+ "grad_norm": 11.719756126403809,
1900
+ "learning_rate": 4.4802867383512544e-07,
1901
+ "loss": 0.3283,
1902
+ "step": 2460
1903
+ },
1904
+ {
1905
+ "epoch": 19.799599198396795,
1906
+ "grad_norm": 11.714187622070312,
1907
+ "learning_rate": 2.2401433691756272e-07,
1908
+ "loss": 0.3198,
1909
+ "step": 2470
1910
+ },
1911
+ {
1912
+ "epoch": 19.879759519038075,
1913
+ "grad_norm": 12.341629028320312,
1914
+ "learning_rate": 0.0,
1915
+ "loss": 0.3057,
1916
+ "step": 2480
1917
+ },
1918
+ {
1919
+ "epoch": 19.879759519038075,
1920
+ "eval_accuracy": 0.8703494926719278,
1921
+ "eval_loss": 0.4261245131492615,
1922
+ "eval_runtime": 20.7326,
1923
+ "eval_samples_per_second": 85.566,
1924
+ "eval_steps_per_second": 2.701,
1925
+ "step": 2480
1926
+ },
1927
+ {
1928
+ "epoch": 19.879759519038075,
1929
+ "step": 2480,
1930
+ "total_flos": 7.988705158075343e+18,
1931
+ "train_loss": 0.5653726263392356,
1932
+ "train_runtime": 5886.8979,
1933
+ "train_samples_per_second": 54.229,
1934
+ "train_steps_per_second": 0.421
1935
  }
1936
  ],
1937
  "logging_steps": 10,
1938
+ "max_steps": 2480,
1939
  "num_input_tokens_seen": 0,
1940
+ "num_train_epochs": 20,
1941
  "save_steps": 500,
1942
+ "total_flos": 7.988705158075343e+18,
1943
  "train_batch_size": 32,
1944
  "trial_name": null,
1945
  "trial_params": null