Uploaded checkpoint-22500

Browse files

Files changed (7) hide show

adapter_config.json +2 -2
adapter_model.safetensors +1 -1
optimizer.pt +2 -2
rng_state.pth +2 -2
scheduler.pt +1 -1
trainer_state.json +1366 -1040
training_args.bin +2 -2

adapter_config.json CHANGED Viewed

@@ -20,11 +20,11 @@
   "revision": null,
   "target_modules": [
     "down_proj",
-    "up_proj",
     "gate_proj",
     "v_proj",
-    "k_proj",
     "o_proj",
     "q_proj"
   ],
   "task_type": "CAUSAL_LM",

   "revision": null,
   "target_modules": [
     "down_proj",
     "gate_proj",
+    "up_proj",
     "v_proj",
     "o_proj",
+    "k_proj",
     "q_proj"
   ],
   "task_type": "CAUSAL_LM",

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1094cefddb8a4c25c681c6cde66e2e7b24fd394103df2badf5c69d6900ada43b
 size 119975656

 version https://git-lfs.github.com/spec/v1
+oid sha256:242bd1c8837db6c74ea117245bfc1b46592098a098bf72cb4a75b6fa1c50ea96
 size 119975656

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:645d5b94ee5359b4733aca4181803ae6254706a9713eb85a854d8057e3a67182
-size 60477396

 version https://git-lfs.github.com/spec/v1
+oid sha256:39f5976b7e007e60478770ca750c77a010d3dbba2afa30dc3c72a0856e2cd01d
+size 240145026

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4e33dda9942df9cbad9cd46793f638f52f82780e545c7592c3d1cbe682087eb0
-size 14180

 version https://git-lfs.github.com/spec/v1
+oid sha256:763d4f397fbd8e2128612f32f5c273b211bf68ec4372f02c7c91ca944e405a2f
+size 14244

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:29c7a79b53a589de48d3b7a21df9c0d024be4dea79f68869f72fdc01ae3b212a
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:d71c00e9bfbac2252002b6eca4a38910300bb6c14e6c56273842dfbc024260d9
 size 1064

trainer_state.json CHANGED Viewed

@@ -1,1484 +1,1810 @@
 {
-  "best_metric": null,
-  "best_model_checkpoint": null,
-  "epoch": 0.5,
-  "eval_steps": 2500,
-  "global_step": 20000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.0,
-      "grad_norm": 4.094185829162598,
       "learning_rate": 4.000000000000001e-06,
-      "loss": 1.8542,
       "step": 100
     },
     {
       "epoch": 0.01,
-      "grad_norm": 8.345755577087402,
       "learning_rate": 8.000000000000001e-06,
-      "loss": 1.4774,
       "step": 200
     },
     {
       "epoch": 0.01,
-      "grad_norm": 3.6847422122955322,
       "learning_rate": 1.2e-05,
-      "loss": 1.3027,
       "step": 300
     },
     {
       "epoch": 0.01,
-      "grad_norm": 15.149823188781738,
       "learning_rate": 1.6000000000000003e-05,
-      "loss": 1.2168,
       "step": 400
     },
     {
       "epoch": 0.01,
-      "grad_norm": 9.95534896850586,
       "learning_rate": 2e-05,
-      "loss": 1.1544,
       "step": 500
     },
     {
       "epoch": 0.01,
-      "grad_norm": 3.96409273147583,
-      "learning_rate": 1.98974358974359e-05,
-      "loss": 1.134,
-      "step": 600
     },
     {
-      "epoch": 0.02,
-      "grad_norm": 2.587010383605957,
-      "learning_rate": 1.9794871794871798e-05,
-      "loss": 1.1294,
-      "step": 700
     },
     {
-      "epoch": 0.02,
-      "grad_norm": 5.926353454589844,
-      "learning_rate": 1.9692307692307696e-05,
-      "loss": 1.0886,
-      "step": 800
     },
     {
-      "epoch": 0.02,
-      "grad_norm": 4.175276756286621,
-      "learning_rate": 1.958974358974359e-05,
-      "loss": 1.1227,
-      "step": 900
     },
     {
-      "epoch": 0.03,
-      "grad_norm": 2.2265052795410156,
-      "learning_rate": 1.9487179487179488e-05,
-      "loss": 1.0694,
-      "step": 1000
     },
     {
-      "epoch": 0.03,
-      "grad_norm": 6.808347702026367,
-      "learning_rate": 1.9384615384615386e-05,
-      "loss": 1.1084,
-      "step": 1100
     },
     {
-      "epoch": 0.03,
-      "grad_norm": 2.2117719650268555,
-      "learning_rate": 1.9282051282051284e-05,
-      "loss": 1.0758,
-      "step": 1200
     },
     {
-      "epoch": 0.03,
-      "grad_norm": 2.893665075302124,
-      "learning_rate": 1.9179487179487182e-05,
-      "loss": 1.0732,
-      "step": 1300
     },
     {
-      "epoch": 0.04,
-      "grad_norm": 4.583731174468994,
-      "learning_rate": 1.907692307692308e-05,
-      "loss": 1.0345,
-      "step": 1400
     },
     {
-      "epoch": 0.04,
-      "grad_norm": 2.2239737510681152,
-      "learning_rate": 1.8974358974358975e-05,
-      "loss": 1.0151,
-      "step": 1500
     },
     {
-      "epoch": 0.04,
-      "grad_norm": 6.440332412719727,
-      "learning_rate": 1.8871794871794873e-05,
-      "loss": 1.0249,
-      "step": 1600
     },
     {
-      "epoch": 0.04,
-      "grad_norm": 3.9038124084472656,
-      "learning_rate": 1.876923076923077e-05,
-      "loss": 1.0481,
-      "step": 1700
     },
     {
-      "epoch": 0.04,
-      "grad_norm": 4.901433944702148,
-      "learning_rate": 1.866666666666667e-05,
-      "loss": 1.0383,
-      "step": 1800
     },
     {
-      "epoch": 0.05,
-      "grad_norm": 2.6100122928619385,
-      "learning_rate": 1.8564102564102567e-05,
-      "loss": 0.9715,
-      "step": 1900
     },
     {
-      "epoch": 0.05,
-      "grad_norm": 4.283998012542725,
-      "learning_rate": 1.8461538461538465e-05,
-      "loss": 0.9946,
-      "step": 2000
     },
     {
-      "epoch": 0.05,
-      "grad_norm": 5.045602798461914,
-      "learning_rate": 1.835897435897436e-05,
-      "loss": 1.0233,
-      "step": 2100
     },
     {
-      "epoch": 0.06,
-      "grad_norm": 3.054832935333252,
-      "learning_rate": 1.8256410256410257e-05,
-      "loss": 1.0177,
-      "step": 2200
     },
     {
-      "epoch": 0.06,
-      "grad_norm": 4.251312732696533,
-      "learning_rate": 1.8153846153846155e-05,
-      "loss": 0.9562,
-      "step": 2300
     },
     {
-      "epoch": 0.06,
-      "grad_norm": 2.6943576335906982,
-      "learning_rate": 1.8051282051282053e-05,
-      "loss": 1.0076,
-      "step": 2400
     },
     {
-      "epoch": 0.06,
-      "grad_norm": 3.307131290435791,
-      "learning_rate": 1.794871794871795e-05,
-      "loss": 0.9687,
-      "step": 2500
     },
     {
-      "epoch": 0.06,
-      "eval_loss": 0.9706119894981384,
-      "eval_runtime": 104.0832,
-      "eval_samples_per_second": 9.608,
-      "eval_steps_per_second": 9.608,
-      "step": 2500
     },
     {
-      "epoch": 0.07,
-      "grad_norm": 1.8508224487304688,
-      "learning_rate": 1.784615384615385e-05,
-      "loss": 0.9693,
-      "step": 2600
     },
     {
-      "epoch": 0.07,
-      "grad_norm": 2.838670253753662,
-      "learning_rate": 1.7743589743589744e-05,
-      "loss": 0.955,
-      "step": 2700
     },
     {
-      "epoch": 0.07,
-      "grad_norm": 2.9186294078826904,
-      "learning_rate": 1.7641025641025642e-05,
-      "loss": 0.9504,
-      "step": 2800
     },
     {
-      "epoch": 0.07,
-      "grad_norm": 4.183789253234863,
-      "learning_rate": 1.753846153846154e-05,
-      "loss": 0.9579,
-      "step": 2900
     },
     {
-      "epoch": 0.07,
-      "grad_norm": 8.950007438659668,
-      "learning_rate": 1.7435897435897438e-05,
-      "loss": 0.9493,
-      "step": 3000
     },
     {
-      "epoch": 0.08,
-      "grad_norm": 2.4844536781311035,
-      "learning_rate": 1.7333333333333336e-05,
-      "loss": 0.9271,
-      "step": 3100
     },
     {
-      "epoch": 0.08,
-      "grad_norm": 2.786226272583008,
-      "learning_rate": 1.7230769230769234e-05,
-      "loss": 0.955,
-      "step": 3200
     },
     {
-      "epoch": 0.08,
-      "grad_norm": 3.8355979919433594,
-      "learning_rate": 1.7128205128205128e-05,
-      "loss": 0.9351,
-      "step": 3300
     },
     {
-      "epoch": 0.09,
-      "grad_norm": 11.382020950317383,
-      "learning_rate": 1.7025641025641026e-05,
-      "loss": 0.9472,
-      "step": 3400
     },
     {
-      "epoch": 0.09,
-      "grad_norm": 5.132159233093262,
-      "learning_rate": 1.6923076923076924e-05,
-      "loss": 0.9172,
-      "step": 3500
     },
     {
-      "epoch": 0.09,
-      "grad_norm": 7.389036178588867,
-      "learning_rate": 1.6820512820512822e-05,
-      "loss": 0.9273,
-      "step": 3600
     },
     {
-      "epoch": 0.09,
-      "grad_norm": 3.8746137619018555,
-      "learning_rate": 1.671794871794872e-05,
-      "loss": 0.9497,
-      "step": 3700
     },
     {
-      "epoch": 0.1,
-      "grad_norm": 2.999476194381714,
-      "learning_rate": 1.6615384615384618e-05,
-      "loss": 0.9321,
-      "step": 3800
     },
     {
-      "epoch": 0.1,
-      "grad_norm": 5.325080394744873,
-      "learning_rate": 1.6512820512820513e-05,
-      "loss": 0.9375,
-      "step": 3900
     },
     {
-      "epoch": 0.1,
-      "grad_norm": 2.8472201824188232,
-      "learning_rate": 1.641025641025641e-05,
-      "loss": 0.9285,
-      "step": 4000
     },
     {
-      "epoch": 0.1,
-      "grad_norm": 4.391159534454346,
-      "learning_rate": 1.630769230769231e-05,
-      "loss": 0.9285,
-      "step": 4100
     },
     {
-      "epoch": 0.1,
-      "grad_norm": 2.1041903495788574,
-      "learning_rate": 1.6205128205128207e-05,
-      "loss": 0.9262,
-      "step": 4200
     },
     {
-      "epoch": 0.11,
-      "grad_norm": 2.626622200012207,
-      "learning_rate": 1.6102564102564105e-05,
-      "loss": 0.9557,
-      "step": 4300
     },
     {
-      "epoch": 0.11,
-      "grad_norm": 3.374565362930298,
-      "learning_rate": 1.6000000000000003e-05,
-      "loss": 0.9518,
-      "step": 4400
     },
     {
-      "epoch": 0.11,
-      "grad_norm": 3.344024658203125,
-      "learning_rate": 1.5897435897435897e-05,
-      "loss": 0.9141,
-      "step": 4500
     },
     {
-      "epoch": 0.12,
-      "grad_norm": 4.982439994812012,
-      "learning_rate": 1.5794871794871795e-05,
-      "loss": 0.9209,
-      "step": 4600
     },
     {
-      "epoch": 0.12,
-      "grad_norm": 3.430849075317383,
-      "learning_rate": 1.5692307692307693e-05,
-      "loss": 0.8916,
-      "step": 4700
     },
     {
-      "epoch": 0.12,
-      "grad_norm": 3.118523597717285,
-      "learning_rate": 1.558974358974359e-05,
-      "loss": 0.9536,
-      "step": 4800
     },
     {
-      "epoch": 0.12,
-      "grad_norm": 1.9410160779953003,
-      "learning_rate": 1.548717948717949e-05,
-      "loss": 0.9154,
-      "step": 4900
     },
     {
-      "epoch": 0.12,
-      "grad_norm": 3.892230749130249,
-      "learning_rate": 1.5384615384615387e-05,
-      "loss": 0.9289,
-      "step": 5000
     },
     {
-      "epoch": 0.12,
-      "eval_loss": 0.9065942168235779,
-      "eval_runtime": 104.6944,
-      "eval_samples_per_second": 9.552,
-      "eval_steps_per_second": 9.552,
-      "step": 5000
     },
     {
-      "epoch": 0.13,
-      "grad_norm": 3.4030818939208984,
-      "learning_rate": 1.5282051282051282e-05,
-      "loss": 0.8939,
-      "step": 5100
     },
     {
-      "epoch": 0.13,
-      "grad_norm": 5.378746509552002,
-      "learning_rate": 1.517948717948718e-05,
-      "loss": 0.9134,
-      "step": 5200
     },
     {
-      "epoch": 0.13,
-      "grad_norm": 6.0667009353637695,
-      "learning_rate": 1.5076923076923078e-05,
-      "loss": 0.884,
-      "step": 5300
     },
     {
-      "epoch": 0.14,
-      "grad_norm": 10.8038969039917,
-      "learning_rate": 1.4974358974358976e-05,
-      "loss": 0.9061,
-      "step": 5400
     },
     {
-      "epoch": 0.14,
-      "grad_norm": 4.017248630523682,
-      "learning_rate": 1.4871794871794874e-05,
-      "loss": 0.8847,
-      "step": 5500
     },
     {
-      "epoch": 0.14,
-      "grad_norm": 3.3564505577087402,
-      "learning_rate": 1.4769230769230772e-05,
-      "loss": 0.8768,
-      "step": 5600
     },
     {
-      "epoch": 0.14,
-      "grad_norm": 9.728605270385742,
-      "learning_rate": 1.4666666666666666e-05,
-      "loss": 0.8867,
-      "step": 5700
     },
     {
-      "epoch": 0.14,
-      "grad_norm": 2.472195863723755,
-      "learning_rate": 1.4564102564102564e-05,
-      "loss": 0.898,
-      "step": 5800
     },
     {
-      "epoch": 0.15,
-      "grad_norm": 6.605821132659912,
-      "learning_rate": 1.4461538461538462e-05,
-      "loss": 0.9115,
-      "step": 5900
     },
     {
-      "epoch": 0.15,
-      "grad_norm": 4.562441825866699,
-      "learning_rate": 1.435897435897436e-05,
-      "loss": 0.8726,
-      "step": 6000
     },
     {
-      "epoch": 0.15,
-      "grad_norm": 2.6972761154174805,
-      "learning_rate": 1.4256410256410258e-05,
-      "loss": 0.8702,
-      "step": 6100
     },
     {
-      "epoch": 0.15,
-      "grad_norm": 4.478190898895264,
-      "learning_rate": 1.4153846153846156e-05,
-      "loss": 0.8905,
-      "step": 6200
     },
     {
-      "epoch": 0.16,
-      "grad_norm": 4.112303733825684,
-      "learning_rate": 1.405128205128205e-05,
-      "loss": 0.9162,
-      "step": 6300
     },
     {
-      "epoch": 0.16,
-      "grad_norm": 4.536581993103027,
-      "learning_rate": 1.3948717948717949e-05,
-      "loss": 0.8941,
-      "step": 6400
     },
     {
-      "epoch": 0.16,
-      "grad_norm": 4.727623462677002,
-      "learning_rate": 1.3846153846153847e-05,
-      "loss": 0.8582,
-      "step": 6500
     },
     {
-      "epoch": 0.17,
-      "grad_norm": 3.3152599334716797,
-      "learning_rate": 1.3743589743589745e-05,
-      "loss": 0.903,
-      "step": 6600
     },
     {
-      "epoch": 0.17,
-      "grad_norm": 2.5421881675720215,
-      "learning_rate": 1.3641025641025643e-05,
-      "loss": 0.8967,
-      "step": 6700
     },
     {
-      "epoch": 0.17,
-      "grad_norm": 2.4081993103027344,
-      "learning_rate": 1.353846153846154e-05,
-      "loss": 0.8801,
-      "step": 6800
     },
     {
-      "epoch": 0.17,
-      "grad_norm": 12.206317901611328,
-      "learning_rate": 1.3435897435897435e-05,
-      "loss": 0.8594,
-      "step": 6900
     },
     {
-      "epoch": 0.17,
-      "grad_norm": 4.930089473724365,
-      "learning_rate": 1.3333333333333333e-05,
-      "loss": 0.853,
-      "step": 7000
     },
     {
-      "epoch": 0.18,
-      "grad_norm": 3.7278289794921875,
-      "learning_rate": 1.3230769230769231e-05,
-      "loss": 0.8462,
-      "step": 7100
     },
     {
-      "epoch": 0.18,
-      "grad_norm": 2.5738131999969482,
-      "learning_rate": 1.312820512820513e-05,
-      "loss": 0.8466,
-      "step": 7200
     },
     {
-      "epoch": 0.18,
-      "grad_norm": 3.267303466796875,
-      "learning_rate": 1.3025641025641027e-05,
-      "loss": 0.8616,
-      "step": 7300
     },
     {
-      "epoch": 0.18,
-      "grad_norm": 2.1865787506103516,
-      "learning_rate": 1.2923076923076925e-05,
-      "loss": 0.8802,
-      "step": 7400
     },
     {
-      "epoch": 0.19,
-      "grad_norm": 2.0264055728912354,
-      "learning_rate": 1.2820512820512823e-05,
-      "loss": 0.8841,
-      "step": 7500
     },
     {
-      "epoch": 0.19,
-      "eval_loss": 0.8533282279968262,
-      "eval_runtime": 106.0657,
-      "eval_samples_per_second": 9.428,
-      "eval_steps_per_second": 9.428,
-      "step": 7500
     },
     {
-      "epoch": 0.19,
-      "grad_norm": 3.4310085773468018,
-      "learning_rate": 1.2717948717948718e-05,
-      "loss": 0.8767,
-      "step": 7600
     },
     {
-      "epoch": 0.19,
-      "grad_norm": 2.533520460128784,
-      "learning_rate": 1.2615384615384616e-05,
-      "loss": 0.8784,
-      "step": 7700
     },
     {
-      "epoch": 0.2,
-      "grad_norm": 4.673364639282227,
-      "learning_rate": 1.2512820512820514e-05,
-      "loss": 0.8504,
-      "step": 7800
     },
     {
-      "epoch": 0.2,
-      "grad_norm": 2.4026598930358887,
-      "learning_rate": 1.2410256410256412e-05,
-      "loss": 0.8647,
-      "step": 7900
     },
     {
-      "epoch": 0.2,
-      "grad_norm": 6.66796875,
-      "learning_rate": 1.230769230769231e-05,
-      "loss": 0.8634,
-      "step": 8000
     },
     {
-      "epoch": 0.2,
-      "grad_norm": 1.8087568283081055,
-      "learning_rate": 1.2205128205128208e-05,
-      "loss": 0.8277,
-      "step": 8100
     },
     {
-      "epoch": 0.2,
-      "grad_norm": 3.196040630340576,
-      "learning_rate": 1.2102564102564102e-05,
-      "loss": 0.8739,
-      "step": 8200
     },
     {
-      "epoch": 0.21,
-      "grad_norm": 1.6817710399627686,
-      "learning_rate": 1.2e-05,
-      "loss": 0.8367,
-      "step": 8300
     },
     {
-      "epoch": 0.21,
-      "grad_norm": 5.548306941986084,
-      "learning_rate": 1.1897435897435898e-05,
-      "loss": 0.8247,
-      "step": 8400
     },
     {
-      "epoch": 0.21,
-      "grad_norm": 6.069587707519531,
-      "learning_rate": 1.1794871794871796e-05,
-      "loss": 0.8248,
-      "step": 8500
     },
     {
-      "epoch": 0.21,
-      "grad_norm": 3.085785150527954,
-      "learning_rate": 1.1692307692307694e-05,
-      "loss": 0.8618,
-      "step": 8600
     },
     {
-      "epoch": 0.22,
-      "grad_norm": 1.7855651378631592,
-      "learning_rate": 1.1589743589743592e-05,
-      "loss": 0.8601,
-      "step": 8700
     },
     {
-      "epoch": 0.22,
-      "grad_norm": 3.378775119781494,
-      "learning_rate": 1.1487179487179487e-05,
-      "loss": 0.8712,
-      "step": 8800
     },
     {
-      "epoch": 0.22,
-      "grad_norm": 2.7686617374420166,
-      "learning_rate": 1.1384615384615385e-05,
-      "loss": 0.852,
-      "step": 8900
     },
     {
-      "epoch": 0.23,
-      "grad_norm": 5.424912452697754,
-      "learning_rate": 1.1282051282051283e-05,
-      "loss": 0.8796,
-      "step": 9000
     },
     {
-      "epoch": 0.23,
-      "grad_norm": 6.806646347045898,
-      "learning_rate": 1.117948717948718e-05,
-      "loss": 0.8457,
-      "step": 9100
     },
     {
-      "epoch": 0.23,
-      "grad_norm": 4.3275837898254395,
-      "learning_rate": 1.1076923076923079e-05,
-      "loss": 0.8346,
-      "step": 9200
     },
     {
-      "epoch": 0.23,
-      "grad_norm": 5.901556015014648,
-      "learning_rate": 1.0974358974358977e-05,
-      "loss": 0.8489,
-      "step": 9300
     },
     {
-      "epoch": 0.23,
-      "grad_norm": 6.287178993225098,
-      "learning_rate": 1.0871794871794871e-05,
-      "loss": 0.8463,
-      "step": 9400
     },
     {
-      "epoch": 0.24,
-      "grad_norm": 2.2666897773742676,
-      "learning_rate": 1.076923076923077e-05,
-      "loss": 0.8399,
-      "step": 9500
     },
     {
-      "epoch": 0.24,
-      "grad_norm": 2.2565557956695557,
-      "learning_rate": 1.0666666666666667e-05,
-      "loss": 0.8452,
-      "step": 9600
     },
     {
-      "epoch": 0.24,
-      "grad_norm": 3.512251615524292,
-      "learning_rate": 1.0564102564102565e-05,
-      "loss": 0.8665,
-      "step": 9700
     },
     {
-      "epoch": 0.24,
-      "grad_norm": 5.637045860290527,
-      "learning_rate": 1.0461538461538463e-05,
-      "loss": 0.829,
-      "step": 9800
     },
     {
-      "epoch": 0.25,
-      "grad_norm": 2.9041316509246826,
-      "learning_rate": 1.0358974358974361e-05,
-      "loss": 0.8273,
-      "step": 9900
     },
     {
-      "epoch": 0.25,
-      "grad_norm": 2.120234727859497,
-      "learning_rate": 1.0256410256410256e-05,
-      "loss": 0.7933,
-      "step": 10000
     },
     {
-      "epoch": 0.25,
-      "eval_loss": 0.8178455829620361,
-      "eval_runtime": 103.8104,
-      "eval_samples_per_second": 9.633,
-      "eval_steps_per_second": 9.633,
-      "step": 10000
     },
     {
-      "epoch": 0.25,
-      "grad_norm": 5.0967559814453125,
-      "learning_rate": 1.0153846153846154e-05,
-      "loss": 0.833,
-      "step": 10100
     },
     {
-      "epoch": 0.26,
-      "grad_norm": 9.097169876098633,
-      "learning_rate": 1.0051282051282052e-05,
-      "loss": 0.8363,
-      "step": 10200
     },
     {
-      "epoch": 0.26,
-      "grad_norm": 3.159578561782837,
-      "learning_rate": 9.94871794871795e-06,
-      "loss": 0.8154,
-      "step": 10300
     },
     {
-      "epoch": 0.26,
-      "grad_norm": 5.177265644073486,
-      "learning_rate": 9.846153846153848e-06,
-      "loss": 0.8391,
-      "step": 10400
     },
     {
-      "epoch": 0.26,
-      "grad_norm": 4.336682319641113,
-      "learning_rate": 9.743589743589744e-06,
-      "loss": 0.8393,
-      "step": 10500
     },
     {
-      "epoch": 0.27,
-      "grad_norm": 4.355902194976807,
-      "learning_rate": 9.641025641025642e-06,
-      "loss": 0.8491,
-      "step": 10600
     },
     {
-      "epoch": 0.27,
-      "grad_norm": 6.8868279457092285,
-      "learning_rate": 9.53846153846154e-06,
-      "loss": 0.8026,
-      "step": 10700
     },
     {
-      "epoch": 0.27,
-      "grad_norm": 2.8234918117523193,
-      "learning_rate": 9.435897435897436e-06,
-      "loss": 0.8412,
-      "step": 10800
     },
     {
-      "epoch": 0.27,
-      "grad_norm": 4.60006046295166,
-      "learning_rate": 9.333333333333334e-06,
-      "loss": 0.8022,
-      "step": 10900
     },
     {
-      "epoch": 0.28,
-      "grad_norm": 4.048822402954102,
-      "learning_rate": 9.230769230769232e-06,
-      "loss": 0.8117,
-      "step": 11000
     },
     {
-      "epoch": 0.28,
-      "grad_norm": 3.5352272987365723,
-      "learning_rate": 9.128205128205129e-06,
-      "loss": 0.8621,
-      "step": 11100
     },
     {
-      "epoch": 0.28,
-      "grad_norm": 3.284557819366455,
-      "learning_rate": 9.025641025641027e-06,
-      "loss": 0.7945,
-      "step": 11200
     },
     {
-      "epoch": 0.28,
-      "grad_norm": 6.281557559967041,
-      "learning_rate": 8.923076923076925e-06,
-      "loss": 0.8398,
-      "step": 11300
     },
     {
-      "epoch": 0.28,
-      "grad_norm": 4.4348297119140625,
-      "learning_rate": 8.820512820512821e-06,
-      "loss": 0.8151,
-      "step": 11400
     },
     {
-      "epoch": 0.29,
-      "grad_norm": 4.739795684814453,
-      "learning_rate": 8.717948717948719e-06,
-      "loss": 0.8223,
-      "step": 11500
     },
     {
-      "epoch": 0.29,
-      "grad_norm": 5.187675476074219,
-      "learning_rate": 8.615384615384617e-06,
-      "loss": 0.7946,
-      "step": 11600
     },
     {
-      "epoch": 0.29,
-      "grad_norm": 6.4138360023498535,
-      "learning_rate": 8.512820512820513e-06,
-      "loss": 0.8118,
-      "step": 11700
     },
     {
-      "epoch": 0.29,
-      "grad_norm": 3.3624444007873535,
-      "learning_rate": 8.410256410256411e-06,
-      "loss": 0.8234,
-      "step": 11800
     },
     {
-      "epoch": 0.3,
-      "grad_norm": 1.7718826532363892,
-      "learning_rate": 8.307692307692309e-06,
-      "loss": 0.816,
-      "step": 11900
     },
     {
-      "epoch": 0.3,
-      "grad_norm": 5.3870158195495605,
-      "learning_rate": 8.205128205128205e-06,
-      "loss": 0.83,
-      "step": 12000
     },
     {
-      "epoch": 0.3,
-      "grad_norm": 7.233886241912842,
-      "learning_rate": 8.102564102564103e-06,
-      "loss": 0.7626,
-      "step": 12100
     },
     {
-      "epoch": 0.3,
-      "grad_norm": 1.8522437810897827,
-      "learning_rate": 8.000000000000001e-06,
-      "loss": 0.7786,
-      "step": 12200
     },
     {
-      "epoch": 0.31,
-      "grad_norm": 3.0882771015167236,
-      "learning_rate": 7.897435897435898e-06,
-      "loss": 0.8297,
-      "step": 12300
     },
     {
-      "epoch": 0.31,
-      "grad_norm": 5.807680606842041,
-      "learning_rate": 7.794871794871796e-06,
-      "loss": 0.8571,
-      "step": 12400
     },
     {
-      "epoch": 0.31,
-      "grad_norm": 3.875642776489258,
-      "learning_rate": 7.692307692307694e-06,
-      "loss": 0.8101,
-      "step": 12500
     },
     {
-      "epoch": 0.31,
-      "eval_loss": 0.8394359946250916,
-      "eval_runtime": 104.1474,
-      "eval_samples_per_second": 9.602,
-      "eval_steps_per_second": 9.602,
-      "step": 12500
     },
     {
-      "epoch": 0.32,
-      "grad_norm": 10.450545310974121,
-      "learning_rate": 7.58974358974359e-06,
-      "loss": 0.7913,
-      "step": 12600
     },
     {
-      "epoch": 0.32,
-      "grad_norm": 4.064128398895264,
-      "learning_rate": 7.487179487179488e-06,
-      "loss": 0.8523,
-      "step": 12700
     },
     {
-      "epoch": 0.32,
-      "grad_norm": 2.707719564437866,
-      "learning_rate": 7.384615384615386e-06,
-      "loss": 0.8403,
-      "step": 12800
     },
     {
-      "epoch": 0.32,
-      "grad_norm": 4.44093132019043,
-      "learning_rate": 7.282051282051282e-06,
-      "loss": 0.8243,
-      "step": 12900
     },
     {
-      "epoch": 0.33,
-      "grad_norm": 4.285432815551758,
-      "learning_rate": 7.17948717948718e-06,
-      "loss": 0.8011,
-      "step": 13000
     },
     {
-      "epoch": 0.33,
-      "grad_norm": 3.158308744430542,
-      "learning_rate": 7.076923076923078e-06,
-      "loss": 0.8062,
-      "step": 13100
     },
     {
-      "epoch": 0.33,
-      "grad_norm": 5.444665431976318,
-      "learning_rate": 6.974358974358974e-06,
-      "loss": 0.8229,
-      "step": 13200
     },
     {
-      "epoch": 0.33,
-      "grad_norm": 11.230988502502441,
-      "learning_rate": 6.871794871794872e-06,
-      "loss": 0.8169,
-      "step": 13300
     },
     {
-      "epoch": 0.34,
-      "grad_norm": 16.40984344482422,
-      "learning_rate": 6.76923076923077e-06,
-      "loss": 0.8354,
-      "step": 13400
     },
     {
-      "epoch": 0.34,
-      "grad_norm": 5.534363269805908,
-      "learning_rate": 6.666666666666667e-06,
-      "loss": 0.7963,
-      "step": 13500
     },
     {
-      "epoch": 0.34,
-      "grad_norm": 5.745026588439941,
-      "learning_rate": 6.564102564102565e-06,
-      "loss": 0.7899,
-      "step": 13600
     },
     {
-      "epoch": 0.34,
-      "grad_norm": 3.449707508087158,
-      "learning_rate": 6.461538461538463e-06,
-      "loss": 0.7783,
-      "step": 13700
     },
     {
-      "epoch": 0.34,
-      "grad_norm": 2.5562901496887207,
-      "learning_rate": 6.358974358974359e-06,
-      "loss": 0.8222,
-      "step": 13800
     },
     {
-      "epoch": 0.35,
-      "grad_norm": 4.387004375457764,
-      "learning_rate": 6.256410256410257e-06,
-      "loss": 0.8033,
-      "step": 13900
     },
     {
-      "epoch": 0.35,
-      "grad_norm": 3.628570318222046,
-      "learning_rate": 6.153846153846155e-06,
-      "loss": 0.7791,
-      "step": 14000
     },
     {
-      "epoch": 0.35,
-      "grad_norm": 4.86137580871582,
-      "learning_rate": 6.051282051282051e-06,
-      "loss": 0.7974,
-      "step": 14100
     },
     {
-      "epoch": 0.35,
-      "grad_norm": 3.2952165603637695,
-      "learning_rate": 5.948717948717949e-06,
-      "loss": 0.7768,
-      "step": 14200
     },
     {
-      "epoch": 0.36,
-      "grad_norm": 3.655470848083496,
-      "learning_rate": 5.846153846153847e-06,
-      "loss": 0.7969,
-      "step": 14300
     },
     {
-      "epoch": 0.36,
-      "grad_norm": 5.8347086906433105,
-      "learning_rate": 5.743589743589743e-06,
-      "loss": 0.801,
-      "step": 14400
     },
     {
-      "epoch": 0.36,
-      "grad_norm": 4.130991458892822,
-      "learning_rate": 5.641025641025641e-06,
-      "loss": 0.7876,
-      "step": 14500
     },
     {
-      "epoch": 0.36,
-      "grad_norm": 6.501937389373779,
-      "learning_rate": 5.538461538461539e-06,
-      "loss": 0.8172,
-      "step": 14600
     },
     {
-      "epoch": 0.37,
-      "grad_norm": 5.493655204772949,
-      "learning_rate": 5.435897435897436e-06,
-      "loss": 0.8158,
-      "step": 14700
     },
     {
-      "epoch": 0.37,
-      "grad_norm": 5.281980037689209,
-      "learning_rate": 5.333333333333334e-06,
-      "loss": 0.7831,
-      "step": 14800
     },
     {
-      "epoch": 0.37,
-      "grad_norm": 4.665294647216797,
-      "learning_rate": 5.230769230769232e-06,
-      "loss": 0.7772,
-      "step": 14900
     },
     {
-      "epoch": 0.38,
-      "grad_norm": 3.9457015991210938,
-      "learning_rate": 5.128205128205128e-06,
-      "loss": 0.777,
-      "step": 15000
     },
     {
-      "epoch": 0.38,
-      "eval_loss": 0.8082045316696167,
-      "eval_runtime": 104.2332,
-      "eval_samples_per_second": 9.594,
-      "eval_steps_per_second": 9.594,
-      "step": 15000
     },
     {
-      "epoch": 0.38,
-      "grad_norm": 9.883415222167969,
-      "learning_rate": 5.025641025641026e-06,
-      "loss": 0.793,
-      "step": 15100
     },
     {
-      "epoch": 0.38,
-      "grad_norm": 8.593897819519043,
-      "learning_rate": 4.923076923076924e-06,
-      "loss": 0.7879,
-      "step": 15200
     },
     {
-      "epoch": 0.38,
-      "grad_norm": 19.679561614990234,
-      "learning_rate": 4.820512820512821e-06,
-      "loss": 0.836,
-      "step": 15300
     },
     {
-      "epoch": 0.39,
-      "grad_norm": 4.36007833480835,
-      "learning_rate": 4.717948717948718e-06,
-      "loss": 0.81,
-      "step": 15400
     },
     {
-      "epoch": 0.39,
-      "grad_norm": 4.863149166107178,
-      "learning_rate": 4.615384615384616e-06,
-      "loss": 0.8149,
-      "step": 15500
     },
     {
-      "epoch": 0.39,
-      "grad_norm": 9.058311462402344,
-      "learning_rate": 4.512820512820513e-06,
-      "loss": 0.7914,
-      "step": 15600
     },
     {
-      "epoch": 0.39,
-      "grad_norm": 13.729168891906738,
-      "learning_rate": 4.4102564102564104e-06,
-      "loss": 0.7978,
-      "step": 15700
     },
     {
-      "epoch": 0.4,
-      "grad_norm": 2.711949586868286,
-      "learning_rate": 4.307692307692308e-06,
-      "loss": 0.7575,
-      "step": 15800
     },
     {
-      "epoch": 0.4,
-      "grad_norm": 5.580270767211914,
-      "learning_rate": 4.2051282051282055e-06,
-      "loss": 0.7934,
-      "step": 15900
     },
     {
-      "epoch": 0.4,
-      "grad_norm": 21.650022506713867,
-      "learning_rate": 4.102564102564103e-06,
-      "loss": 0.8141,
-      "step": 16000
     },
     {
-      "epoch": 0.4,
-      "grad_norm": 7.138460636138916,
-      "learning_rate": 4.000000000000001e-06,
-      "loss": 0.7433,
-      "step": 16100
     },
     {
-      "epoch": 0.41,
-      "grad_norm": 3.7532575130462646,
-      "learning_rate": 3.897435897435898e-06,
-      "loss": 0.7704,
-      "step": 16200
     },
     {
-      "epoch": 0.41,
-      "grad_norm": 2.153252601623535,
-      "learning_rate": 3.794871794871795e-06,
-      "loss": 0.7647,
-      "step": 16300
     },
     {
-      "epoch": 0.41,
-      "grad_norm": 4.485107898712158,
-      "learning_rate": 3.692307692307693e-06,
-      "loss": 0.8182,
-      "step": 16400
     },
     {
-      "epoch": 0.41,
-      "grad_norm": 5.237086772918701,
-      "learning_rate": 3.58974358974359e-06,
-      "loss": 0.7965,
-      "step": 16500
     },
     {
-      "epoch": 0.41,
-      "grad_norm": 2.242441177368164,
-      "learning_rate": 3.487179487179487e-06,
-      "loss": 0.8619,
-      "step": 16600
     },
     {
-      "epoch": 0.42,
-      "grad_norm": 3.2443642616271973,
-      "learning_rate": 3.384615384615385e-06,
-      "loss": 0.7702,
-      "step": 16700
     },
     {
-      "epoch": 0.42,
-      "grad_norm": 6.27290678024292,
-      "learning_rate": 3.2820512820512823e-06,
-      "loss": 0.7802,
-      "step": 16800
     },
     {
-      "epoch": 0.42,
-      "grad_norm": 5.323145866394043,
-      "learning_rate": 3.1794871794871795e-06,
-      "loss": 0.8103,
-      "step": 16900
     },
     {
-      "epoch": 0.42,
-      "grad_norm": 11.099617004394531,
-      "learning_rate": 3.0769230769230774e-06,
-      "loss": 0.7787,
-      "step": 17000
     },
     {
-      "epoch": 0.43,
-      "grad_norm": 3.3490378856658936,
-      "learning_rate": 2.9743589743589746e-06,
-      "loss": 0.7739,
-      "step": 17100
     },
     {
-      "epoch": 0.43,
-      "grad_norm": 5.076713562011719,
-      "learning_rate": 2.8717948717948717e-06,
-      "loss": 0.7401,
-      "step": 17200
     },
     {
-      "epoch": 0.43,
-      "grad_norm": 4.410634517669678,
-      "learning_rate": 2.7692307692307697e-06,
-      "loss": 0.7738,
-      "step": 17300
     },
     {
-      "epoch": 0.43,
-      "grad_norm": 3.249955177307129,
-      "learning_rate": 2.666666666666667e-06,
-      "loss": 0.7749,
-      "step": 17400
     },
     {
-      "epoch": 0.44,
-      "grad_norm": 4.0387349128723145,
-      "learning_rate": 2.564102564102564e-06,
-      "loss": 0.7704,
-      "step": 17500
     },
     {
-      "epoch": 0.44,
-      "eval_loss": 0.7883001565933228,
-      "eval_runtime": 104.2311,
-      "eval_samples_per_second": 9.594,
-      "eval_steps_per_second": 9.594,
-      "step": 17500
     },
     {
-      "epoch": 0.44,
-      "grad_norm": 7.914300918579102,
-      "learning_rate": 2.461538461538462e-06,
-      "loss": 0.7651,
-      "step": 17600
     },
     {
-      "epoch": 0.44,
-      "grad_norm": 4.809656620025635,
-      "learning_rate": 2.358974358974359e-06,
-      "loss": 0.7631,
-      "step": 17700
     },
     {
-      "epoch": 0.45,
-      "grad_norm": 6.220585823059082,
-      "learning_rate": 2.2564102564102566e-06,
-      "loss": 0.7925,
-      "step": 17800
     },
     {
-      "epoch": 0.45,
-      "grad_norm": 3.666391611099243,
-      "learning_rate": 2.153846153846154e-06,
-      "loss": 0.7857,
-      "step": 17900
     },
     {
-      "epoch": 0.45,
-      "grad_norm": 5.744978427886963,
-      "learning_rate": 2.0512820512820513e-06,
-      "loss": 0.8025,
-      "step": 18000
     },
     {
-      "epoch": 0.45,
-      "grad_norm": 5.490359783172607,
-      "learning_rate": 1.948717948717949e-06,
-      "loss": 0.8005,
-      "step": 18100
     },
     {
-      "epoch": 0.46,
-      "grad_norm": 3.3625869750976562,
-      "learning_rate": 1.8461538461538465e-06,
-      "loss": 0.7753,
-      "step": 18200
     },
     {
-      "epoch": 0.46,
-      "grad_norm": 13.186784744262695,
-      "learning_rate": 1.7435897435897436e-06,
-      "loss": 0.7705,
-      "step": 18300
     },
     {
-      "epoch": 0.46,
-      "grad_norm": 2.9938299655914307,
-      "learning_rate": 1.6410256410256412e-06,
-      "loss": 0.7838,
-      "step": 18400
     },
     {
-      "epoch": 0.46,
-      "grad_norm": 3.876194477081299,
-      "learning_rate": 1.5384615384615387e-06,
-      "loss": 0.7963,
-      "step": 18500
     },
     {
-      "epoch": 0.47,
-      "grad_norm": 8.027066230773926,
-      "learning_rate": 1.4358974358974359e-06,
-      "loss": 0.7841,
-      "step": 18600
     },
     {
-      "epoch": 0.47,
-      "grad_norm": 6.673095226287842,
-      "learning_rate": 1.3333333333333334e-06,
-      "loss": 0.7676,
-      "step": 18700
     },
     {
-      "epoch": 0.47,
-      "grad_norm": 6.047390460968018,
-      "learning_rate": 1.230769230769231e-06,
-      "loss": 0.7792,
-      "step": 18800
     },
     {
-      "epoch": 0.47,
-      "grad_norm": 3.341261625289917,
-      "learning_rate": 1.1282051282051283e-06,
-      "loss": 0.7712,
-      "step": 18900
     },
     {
-      "epoch": 0.47,
-      "grad_norm": 9.690947532653809,
-      "learning_rate": 1.0256410256410257e-06,
-      "loss": 0.768,
-      "step": 19000
     },
     {
-      "epoch": 0.48,
-      "grad_norm": 2.3877036571502686,
-      "learning_rate": 9.230769230769232e-07,
-      "loss": 0.786,
-      "step": 19100
     },
     {
-      "epoch": 0.48,
-      "grad_norm": 5.060111045837402,
-      "learning_rate": 8.205128205128206e-07,
-      "loss": 0.7492,
-      "step": 19200
     },
     {
-      "epoch": 0.48,
-      "grad_norm": 4.0241570472717285,
-      "learning_rate": 7.179487179487179e-07,
-      "loss": 0.7638,
-      "step": 19300
     },
     {
-      "epoch": 0.48,
-      "grad_norm": 6.047507286071777,
-      "learning_rate": 6.153846153846155e-07,
-      "loss": 0.7702,
-      "step": 19400
     },
     {
-      "epoch": 0.49,
-      "grad_norm": 4.642309665679932,
-      "learning_rate": 5.128205128205128e-07,
-      "loss": 0.7541,
-      "step": 19500
     },
     {
-      "epoch": 0.49,
-      "grad_norm": 10.096720695495605,
-      "learning_rate": 4.102564102564103e-07,
-      "loss": 0.7686,
-      "step": 19600
     },
     {
-      "epoch": 0.49,
-      "grad_norm": 11.970602035522461,
-      "learning_rate": 3.0769230769230774e-07,
-      "loss": 0.7619,
-      "step": 19700
     },
     {
-      "epoch": 0.49,
-      "grad_norm": 6.973097801208496,
-      "learning_rate": 2.0512820512820514e-07,
-      "loss": 0.7798,
-      "step": 19800
     },
     {
-      "epoch": 0.5,
-      "grad_norm": 4.512222766876221,
-      "learning_rate": 1.0256410256410257e-07,
-      "loss": 0.7444,
-      "step": 19900
     },
     {
-      "epoch": 0.5,
-      "grad_norm": 3.21940541267395,
-      "learning_rate": 0.0,
-      "loss": 0.7902,
-      "step": 20000
     },
     {
-      "epoch": 0.5,
-      "eval_loss": 0.7663924694061279,
-      "eval_runtime": 104.2963,
-      "eval_samples_per_second": 9.588,
-      "eval_steps_per_second": 9.588,
-      "step": 20000
     }
   ],
-  "logging_steps": 100,
-  "max_steps": 20000,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 2500,
-  "total_flos": 3.2204251987968e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

 {
+  "best_metric": 0.8163847923278809,
+  "best_model_checkpoint": "runs/deepseek_lora_20240423-223943/checkpoint-2500",
+  "epoch": 0.0625,
+  "eval_steps": 500,
+  "global_step": 2500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.0,
+      "grad_norm": 3.086414337158203,
+      "learning_rate": 4.0000000000000003e-07,
+      "loss": 0.7892,
+      "step": 10
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 8.478134155273438,
+      "learning_rate": 8.000000000000001e-07,
+      "loss": 0.7746,
+      "step": 20
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 5.574502468109131,
+      "learning_rate": 1.2000000000000002e-06,
+      "loss": 0.8222,
+      "step": 30
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 2.6497371196746826,
+      "learning_rate": 1.6000000000000001e-06,
+      "loss": 0.7423,
+      "step": 40
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 3.116753339767456,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 0.7622,
+      "step": 50
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 3.179832696914673,
+      "learning_rate": 2.4000000000000003e-06,
+      "loss": 0.8183,
+      "step": 60
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 3.9869463443756104,
+      "learning_rate": 2.8000000000000003e-06,
+      "loss": 0.822,
+      "step": 70
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 5.093494415283203,
+      "learning_rate": 3.2000000000000003e-06,
+      "loss": 0.7966,
+      "step": 80
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 5.230633735656738,
+      "learning_rate": 3.6000000000000003e-06,
+      "loss": 0.8113,
+      "step": 90
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 9.374403953552246,
       "learning_rate": 4.000000000000001e-06,
+      "loss": 0.7582,
       "step": 100
     },
+    {
+      "epoch": 0.0,
+      "grad_norm": 6.465492248535156,
+      "learning_rate": 4.4e-06,
+      "loss": 0.7662,
+      "step": 110
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 6.279934883117676,
+      "learning_rate": 4.800000000000001e-06,
+      "loss": 0.8376,
+      "step": 120
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 5.799221992492676,
+      "learning_rate": 5.2e-06,
+      "loss": 0.7965,
+      "step": 130
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 3.222240686416626,
+      "learning_rate": 5.600000000000001e-06,
+      "loss": 0.8855,
+      "step": 140
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 9.009174346923828,
+      "learning_rate": 6e-06,
+      "loss": 0.8394,
+      "step": 150
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 8.040350914001465,
+      "learning_rate": 6.4000000000000006e-06,
+      "loss": 0.8426,
+      "step": 160
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 4.131030559539795,
+      "learning_rate": 6.800000000000001e-06,
+      "loss": 0.7747,
+      "step": 170
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 3.31986927986145,
+      "learning_rate": 7.2000000000000005e-06,
+      "loss": 0.7125,
+      "step": 180
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 5.7623395919799805,
+      "learning_rate": 7.600000000000001e-06,
+      "loss": 0.7854,
+      "step": 190
+    },
     {
       "epoch": 0.01,
+      "grad_norm": 10.848206520080566,
       "learning_rate": 8.000000000000001e-06,
+      "loss": 0.7756,
       "step": 200
     },
     {
       "epoch": 0.01,
+      "grad_norm": 13.455166816711426,
+      "learning_rate": 8.400000000000001e-06,
+      "loss": 0.7894,
+      "step": 210
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 12.759767532348633,
+      "learning_rate": 8.8e-06,
+      "loss": 0.7454,
+      "step": 220
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.262899875640869,
+      "learning_rate": 9.200000000000002e-06,
+      "loss": 0.8555,
+      "step": 230
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.28985071182251,
+      "learning_rate": 9.600000000000001e-06,
+      "loss": 0.6845,
+      "step": 240
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.174241542816162,
+      "learning_rate": 1e-05,
+      "loss": 0.7983,
+      "step": 250
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 12.931599617004395,
+      "learning_rate": 1.04e-05,
+      "loss": 0.9041,
+      "step": 260
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 7.004627227783203,
+      "learning_rate": 1.0800000000000002e-05,
+      "loss": 0.817,
+      "step": 270
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 3.6102757453918457,
+      "learning_rate": 1.1200000000000001e-05,
+      "loss": 0.7292,
+      "step": 280
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 2.764902353286743,
+      "learning_rate": 1.16e-05,
+      "loss": 0.9042,
+      "step": 290
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 3.958317995071411,
       "learning_rate": 1.2e-05,
+      "loss": 0.7539,
       "step": 300
     },
     {
       "epoch": 0.01,
+      "grad_norm": 7.098923683166504,
+      "learning_rate": 1.2400000000000002e-05,
+      "loss": 0.7955,
+      "step": 310
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 12.129098892211914,
+      "learning_rate": 1.2800000000000001e-05,
+      "loss": 0.849,
+      "step": 320
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 2.054119825363159,
+      "learning_rate": 1.3200000000000002e-05,
+      "loss": 0.8645,
+      "step": 330
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 5.205028057098389,
+      "learning_rate": 1.3600000000000002e-05,
+      "loss": 0.8175,
+      "step": 340
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 2.614790439605713,
+      "learning_rate": 1.4e-05,
+      "loss": 0.8998,
+      "step": 350
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 2.9891204833984375,
+      "learning_rate": 1.4400000000000001e-05,
+      "loss": 0.8108,
+      "step": 360
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.152099609375,
+      "learning_rate": 1.48e-05,
+      "loss": 0.7855,
+      "step": 370
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 9.833850860595703,
+      "learning_rate": 1.5200000000000002e-05,
+      "loss": 0.7736,
+      "step": 380
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 3.849621295928955,
+      "learning_rate": 1.5600000000000003e-05,
+      "loss": 0.7668,
+      "step": 390
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 5.4542975425720215,
       "learning_rate": 1.6000000000000003e-05,
+      "loss": 0.7781,
       "step": 400
     },
     {
       "epoch": 0.01,
+      "grad_norm": 6.197661876678467,
+      "learning_rate": 1.64e-05,
+      "loss": 0.8654,
+      "step": 410
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 3.2606770992279053,
+      "learning_rate": 1.6800000000000002e-05,
+      "loss": 0.7565,
+      "step": 420
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 3.9680209159851074,
+      "learning_rate": 1.72e-05,
+      "loss": 0.7886,
+      "step": 430
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 18.749984741210938,
+      "learning_rate": 1.76e-05,
+      "loss": 0.7305,
+      "step": 440
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 5.822000503540039,
+      "learning_rate": 1.8e-05,
+      "loss": 0.7833,
+      "step": 450
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 12.999715805053711,
+      "learning_rate": 1.8400000000000003e-05,
+      "loss": 0.8483,
+      "step": 460
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 7.193736553192139,
+      "learning_rate": 1.88e-05,
+      "loss": 0.84,
+      "step": 470
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 12.573124885559082,
+      "learning_rate": 1.9200000000000003e-05,
+      "loss": 0.8437,
+      "step": 480
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 4.4221601486206055,
+      "learning_rate": 1.9600000000000002e-05,
+      "loss": 0.6836,
+      "step": 490
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 3.0399410724639893,
       "learning_rate": 2e-05,
+      "loss": 0.8264,
       "step": 500
     },
     {
       "epoch": 0.01,
+      "eval_loss": 0.8175864219665527,
+      "eval_runtime": 67.7802,
+      "eval_samples_per_second": 14.754,
+      "eval_steps_per_second": 14.754,
+      "step": 500
     },
     {
+      "epoch": 0.01,
+      "grad_norm": 3.971303701400757,
+      "learning_rate": 1.9978947368421054e-05,
+      "loss": 0.7385,
+      "step": 510
     },
     {
+      "epoch": 0.01,
+      "grad_norm": 3.8043839931488037,
+      "learning_rate": 1.9957894736842107e-05,
+      "loss": 0.7826,
+      "step": 520
     },
     {
+      "epoch": 0.01,
+      "grad_norm": 11.702253341674805,
+      "learning_rate": 1.993684210526316e-05,
+      "loss": 0.7971,
+      "step": 530
     },
     {
+      "epoch": 0.01,
+      "grad_norm": 5.176826000213623,
+      "learning_rate": 1.9915789473684212e-05,
+      "loss": 0.748,
+      "step": 540
     },
     {
+      "epoch": 0.01,
+      "grad_norm": 7.120133876800537,
+      "learning_rate": 1.9894736842105265e-05,
+      "loss": 0.8461,
+      "step": 550
     },
     {
+      "epoch": 0.01,
+      "grad_norm": 12.286151885986328,
+      "learning_rate": 1.9873684210526318e-05,
+      "loss": 0.8335,
+      "step": 560
     },
     {
+      "epoch": 0.01,
+      "grad_norm": 7.857172966003418,
+      "learning_rate": 1.985263157894737e-05,
+      "loss": 0.7231,
+      "step": 570
     },
     {
+      "epoch": 0.01,
+      "grad_norm": 5.327859401702881,
+      "learning_rate": 1.9831578947368423e-05,
+      "loss": 0.877,
+      "step": 580
     },
     {
+      "epoch": 0.01,
+      "grad_norm": 6.9340362548828125,
+      "learning_rate": 1.9810526315789476e-05,
+      "loss": 0.8984,
+      "step": 590
     },
     {
+      "epoch": 0.01,
+      "grad_norm": 2.1034326553344727,
+      "learning_rate": 1.9789473684210528e-05,
+      "loss": 0.7045,
+      "step": 600
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 3.853721857070923,
+      "learning_rate": 1.976842105263158e-05,
+      "loss": 0.761,
+      "step": 610
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 7.6926398277282715,
+      "learning_rate": 1.9747368421052633e-05,
+      "loss": 0.9493,
+      "step": 620
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 6.261799335479736,
+      "learning_rate": 1.9726315789473686e-05,
+      "loss": 0.7719,
+      "step": 630
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 3.864114284515381,
+      "learning_rate": 1.970526315789474e-05,
+      "loss": 0.9406,
+      "step": 640
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 7.093533515930176,
+      "learning_rate": 1.968421052631579e-05,
+      "loss": 0.7951,
+      "step": 650
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 2.3724496364593506,
+      "learning_rate": 1.9663157894736844e-05,
+      "loss": 0.8648,
+      "step": 660
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 10.12341022491455,
+      "learning_rate": 1.9642105263157897e-05,
+      "loss": 0.7823,
+      "step": 670
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 2.80940842628479,
+      "learning_rate": 1.962105263157895e-05,
+      "loss": 0.706,
+      "step": 680
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 8.243487358093262,
+      "learning_rate": 1.9600000000000002e-05,
+      "loss": 0.8244,
+      "step": 690
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 11.420123100280762,
+      "learning_rate": 1.9578947368421055e-05,
+      "loss": 0.6753,
+      "step": 700
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 63.8618278503418,
+      "learning_rate": 1.9557894736842107e-05,
+      "loss": 0.8309,
+      "step": 710
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 4.521258354187012,
+      "learning_rate": 1.953684210526316e-05,
+      "loss": 0.8101,
+      "step": 720
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 2.9532318115234375,
+      "learning_rate": 1.9515789473684213e-05,
+      "loss": 0.8533,
+      "step": 730
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 3.792180061340332,
+      "learning_rate": 1.9494736842105265e-05,
+      "loss": 0.7573,
+      "step": 740
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 5.155513286590576,
+      "learning_rate": 1.9473684210526318e-05,
+      "loss": 0.8961,
+      "step": 750
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 9.195950508117676,
+      "learning_rate": 1.945263157894737e-05,
+      "loss": 0.8398,
+      "step": 760
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 6.699478626251221,
+      "learning_rate": 1.9431578947368423e-05,
+      "loss": 0.8018,
+      "step": 770
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 5.254507541656494,
+      "learning_rate": 1.9410526315789476e-05,
+      "loss": 0.8408,
+      "step": 780
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 4.351966857910156,
+      "learning_rate": 1.9389473684210525e-05,
+      "loss": 0.7323,
+      "step": 790
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 2.361276626586914,
+      "learning_rate": 1.936842105263158e-05,
+      "loss": 0.8401,
+      "step": 800
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 5.449990272521973,
+      "learning_rate": 1.9347368421052634e-05,
+      "loss": 0.726,
+      "step": 810
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 5.375738143920898,
+      "learning_rate": 1.9326315789473687e-05,
+      "loss": 0.8305,
+      "step": 820
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 2.601025342941284,
+      "learning_rate": 1.930526315789474e-05,
+      "loss": 0.9152,
+      "step": 830
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 12.153268814086914,
+      "learning_rate": 1.9284210526315792e-05,
+      "loss": 0.8423,
+      "step": 840
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 3.785663604736328,
+      "learning_rate": 1.9263157894736845e-05,
+      "loss": 0.7733,
+      "step": 850
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 10.162787437438965,
+      "learning_rate": 1.9242105263157894e-05,
+      "loss": 0.893,
+      "step": 860
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 3.871621608734131,
+      "learning_rate": 1.922105263157895e-05,
+      "loss": 0.798,
+      "step": 870
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 2.9919800758361816,
+      "learning_rate": 1.9200000000000003e-05,
+      "loss": 0.8484,
+      "step": 880
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 5.40109920501709,
+      "learning_rate": 1.9178947368421055e-05,
+      "loss": 0.9129,
+      "step": 890
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 6.794926643371582,
+      "learning_rate": 1.9157894736842108e-05,
+      "loss": 0.8687,
+      "step": 900
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 5.942440986633301,
+      "learning_rate": 1.913684210526316e-05,
+      "loss": 0.8564,
+      "step": 910
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 5.968307018280029,
+      "learning_rate": 1.9115789473684213e-05,
+      "loss": 0.8495,
+      "step": 920
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 8.425616264343262,
+      "learning_rate": 1.9094736842105262e-05,
+      "loss": 0.7242,
+      "step": 930
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 2.819301128387451,
+      "learning_rate": 1.907368421052632e-05,
+      "loss": 0.8381,
+      "step": 940
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 6.81688117980957,
+      "learning_rate": 1.9052631578947368e-05,
+      "loss": 0.8817,
+      "step": 950
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 5.102423191070557,
+      "learning_rate": 1.9031578947368424e-05,
+      "loss": 0.8274,
+      "step": 960
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 4.12994909286499,
+      "learning_rate": 1.9010526315789476e-05,
+      "loss": 0.7052,
+      "step": 970
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 5.15468692779541,
+      "learning_rate": 1.898947368421053e-05,
+      "loss": 0.772,
+      "step": 980
     },
     {
+      "epoch": 0.02,
+      "grad_norm": 1.62323796749115,
+      "learning_rate": 1.8968421052631582e-05,
+      "loss": 0.7764,
+      "step": 990
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 2.546677589416504,
+      "learning_rate": 1.894736842105263e-05,
+      "loss": 0.8365,
+      "step": 1000
     },
     {
+      "epoch": 0.03,
+      "eval_loss": 0.7952949404716492,
+      "eval_runtime": 67.7544,
+      "eval_samples_per_second": 14.759,
+      "eval_steps_per_second": 14.759,
+      "step": 1000
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 9.28386402130127,
+      "learning_rate": 1.8926315789473687e-05,
+      "loss": 0.8765,
+      "step": 1010
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 7.3430304527282715,
+      "learning_rate": 1.8905263157894736e-05,
+      "loss": 0.8763,
+      "step": 1020
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 4.0531206130981445,
+      "learning_rate": 1.8884210526315792e-05,
+      "loss": 0.7943,
+      "step": 1030
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 3.028320074081421,
+      "learning_rate": 1.886315789473684e-05,
+      "loss": 0.836,
+      "step": 1040
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 3.3861188888549805,
+      "learning_rate": 1.8842105263157898e-05,
+      "loss": 0.7336,
+      "step": 1050
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 3.7832908630371094,
+      "learning_rate": 1.882105263157895e-05,
+      "loss": 0.9283,
+      "step": 1060
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 3.8170342445373535,
+      "learning_rate": 1.88e-05,
+      "loss": 0.7655,
+      "step": 1070
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 6.15322732925415,
+      "learning_rate": 1.8778947368421056e-05,
+      "loss": 0.9341,
+      "step": 1080
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 7.066686153411865,
+      "learning_rate": 1.8757894736842105e-05,
+      "loss": 0.85,
+      "step": 1090
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 2.986961603164673,
+      "learning_rate": 1.873684210526316e-05,
+      "loss": 0.8943,
+      "step": 1100
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 2.8456902503967285,
+      "learning_rate": 1.871578947368421e-05,
+      "loss": 0.8279,
+      "step": 1110
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 3.6177377700805664,
+      "learning_rate": 1.8694736842105266e-05,
+      "loss": 0.8192,
+      "step": 1120
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 14.768010139465332,
+      "learning_rate": 1.8673684210526316e-05,
+      "loss": 0.8005,
+      "step": 1130
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 11.347342491149902,
+      "learning_rate": 1.8652631578947368e-05,
+      "loss": 0.8081,
+      "step": 1140
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 4.0560150146484375,
+      "learning_rate": 1.8631578947368424e-05,
+      "loss": 0.9389,
+      "step": 1150
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 3.3164710998535156,
+      "learning_rate": 1.8610526315789473e-05,
+      "loss": 0.8501,
+      "step": 1160
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 11.112225532531738,
+      "learning_rate": 1.858947368421053e-05,
+      "loss": 0.7162,
+      "step": 1170
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 6.200588703155518,
+      "learning_rate": 1.856842105263158e-05,
+      "loss": 0.7448,
+      "step": 1180
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 6.573482513427734,
+      "learning_rate": 1.8547368421052635e-05,
+      "loss": 0.8071,
+      "step": 1190
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 5.153548717498779,
+      "learning_rate": 1.8526315789473684e-05,
+      "loss": 0.7957,
+      "step": 1200
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 5.3308305740356445,
+      "learning_rate": 1.8505263157894737e-05,
+      "loss": 0.7301,
+      "step": 1210
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 5.269808769226074,
+      "learning_rate": 1.8484210526315793e-05,
+      "loss": 0.8072,
+      "step": 1220
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 5.588324546813965,
+      "learning_rate": 1.8463157894736842e-05,
+      "loss": 0.8587,
+      "step": 1230
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 4.593557357788086,
+      "learning_rate": 1.8442105263157898e-05,
+      "loss": 0.856,
+      "step": 1240
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 5.2591094970703125,
+      "learning_rate": 1.8421052631578947e-05,
+      "loss": 0.7717,
+      "step": 1250
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 4.052567958831787,
+      "learning_rate": 1.8400000000000003e-05,
+      "loss": 0.7823,
+      "step": 1260
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 4.447838306427002,
+      "learning_rate": 1.8378947368421053e-05,
+      "loss": 0.83,
+      "step": 1270
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 4.029257774353027,
+      "learning_rate": 1.8357894736842105e-05,
+      "loss": 0.7504,
+      "step": 1280
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 9.053960800170898,
+      "learning_rate": 1.8336842105263158e-05,
+      "loss": 0.9074,
+      "step": 1290
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 2.2877705097198486,
+      "learning_rate": 1.831578947368421e-05,
+      "loss": 0.772,
+      "step": 1300
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 3.4482290744781494,
+      "learning_rate": 1.8294736842105267e-05,
+      "loss": 0.8658,
+      "step": 1310
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 6.684794902801514,
+      "learning_rate": 1.8273684210526316e-05,
+      "loss": 0.7848,
+      "step": 1320
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 3.553828716278076,
+      "learning_rate": 1.8252631578947372e-05,
+      "loss": 0.8219,
+      "step": 1330
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 2.5203397274017334,
+      "learning_rate": 1.823157894736842e-05,
+      "loss": 0.9071,
+      "step": 1340
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 4.961795806884766,
+      "learning_rate": 1.8210526315789477e-05,
+      "loss": 0.6542,
+      "step": 1350
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 3.663081645965576,
+      "learning_rate": 1.8189473684210527e-05,
+      "loss": 0.7402,
+      "step": 1360
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 8.785040855407715,
+      "learning_rate": 1.816842105263158e-05,
+      "loss": 0.7462,
+      "step": 1370
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 4.659074783325195,
+      "learning_rate": 1.8147368421052632e-05,
+      "loss": 0.6951,
+      "step": 1380
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 3.5885703563690186,
+      "learning_rate": 1.8126315789473685e-05,
+      "loss": 0.7008,
+      "step": 1390
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 3.1295347213745117,
+      "learning_rate": 1.810526315789474e-05,
+      "loss": 0.9103,
+      "step": 1400
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 2.4699888229370117,
+      "learning_rate": 1.808421052631579e-05,
+      "loss": 0.841,
+      "step": 1410
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 5.3273444175720215,
+      "learning_rate": 1.8063157894736846e-05,
+      "loss": 0.9041,
+      "step": 1420
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 5.149638652801514,
+      "learning_rate": 1.8042105263157895e-05,
+      "loss": 0.7784,
+      "step": 1430
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 3.4124910831451416,
+      "learning_rate": 1.8021052631578948e-05,
+      "loss": 0.8208,
+      "step": 1440
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 2.9231085777282715,
+      "learning_rate": 1.8e-05,
+      "loss": 0.7173,
+      "step": 1450
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 4.008113384246826,
+      "learning_rate": 1.7978947368421053e-05,
+      "loss": 0.7383,
+      "step": 1460
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 5.1748046875,
+      "learning_rate": 1.795789473684211e-05,
+      "loss": 0.8399,
+      "step": 1470
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 3.4990293979644775,
+      "learning_rate": 1.793684210526316e-05,
+      "loss": 0.6721,
+      "step": 1480
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 3.1186299324035645,
+      "learning_rate": 1.7915789473684214e-05,
+      "loss": 0.782,
+      "step": 1490
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 5.12732458114624,
+      "learning_rate": 1.7894736842105264e-05,
+      "loss": 0.7211,
+      "step": 1500
     },
     {
+      "epoch": 0.04,
+      "eval_loss": 0.811568021774292,
+      "eval_runtime": 67.7961,
+      "eval_samples_per_second": 14.75,
+      "eval_steps_per_second": 14.75,
+      "step": 1500
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 3.631096124649048,
+      "learning_rate": 1.7873684210526316e-05,
+      "loss": 0.7557,
+      "step": 1510
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 8.850045204162598,
+      "learning_rate": 1.785263157894737e-05,
+      "loss": 0.8757,
+      "step": 1520
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 3.1114978790283203,
+      "learning_rate": 1.7831578947368422e-05,
+      "loss": 0.7613,
+      "step": 1530
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 4.5038743019104,
+      "learning_rate": 1.7810526315789474e-05,
+      "loss": 0.8049,
+      "step": 1540
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 4.2331156730651855,
+      "learning_rate": 1.7789473684210527e-05,
+      "loss": 0.8277,
+      "step": 1550
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 5.05696964263916,
+      "learning_rate": 1.7768421052631583e-05,
+      "loss": 0.7973,
+      "step": 1560
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 2.1331920623779297,
+      "learning_rate": 1.7747368421052632e-05,
+      "loss": 0.7688,
+      "step": 1570
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 4.984541416168213,
+      "learning_rate": 1.7726315789473685e-05,
+      "loss": 0.7865,
+      "step": 1580
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 7.149406433105469,
+      "learning_rate": 1.7705263157894738e-05,
+      "loss": 0.7728,
+      "step": 1590
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 8.092243194580078,
+      "learning_rate": 1.768421052631579e-05,
+      "loss": 0.935,
+      "step": 1600
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 13.16551399230957,
+      "learning_rate": 1.7663157894736843e-05,
+      "loss": 0.8286,
+      "step": 1610
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 2.131350517272949,
+      "learning_rate": 1.7642105263157896e-05,
+      "loss": 0.7864,
+      "step": 1620
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 7.870023727416992,
+      "learning_rate": 1.7621052631578948e-05,
+      "loss": 0.8645,
+      "step": 1630
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 10.631692886352539,
+      "learning_rate": 1.76e-05,
+      "loss": 0.8473,
+      "step": 1640
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 6.421032905578613,
+      "learning_rate": 1.7578947368421054e-05,
+      "loss": 0.7868,
+      "step": 1650
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 4.57529878616333,
+      "learning_rate": 1.7557894736842106e-05,
+      "loss": 0.7882,
+      "step": 1660
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 3.8785624504089355,
+      "learning_rate": 1.753684210526316e-05,
+      "loss": 0.7543,
+      "step": 1670
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 5.722006320953369,
+      "learning_rate": 1.751578947368421e-05,
+      "loss": 0.9626,
+      "step": 1680
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 2.466771364212036,
+      "learning_rate": 1.7494736842105264e-05,
+      "loss": 0.783,
+      "step": 1690
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 3.072049856185913,
+      "learning_rate": 1.7473684210526317e-05,
+      "loss": 0.7503,
+      "step": 1700
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 5.768575668334961,
+      "learning_rate": 1.745263157894737e-05,
+      "loss": 0.8193,
+      "step": 1710
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 2.585022211074829,
+      "learning_rate": 1.7431578947368422e-05,
+      "loss": 0.8808,
+      "step": 1720
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 3.0711567401885986,
+      "learning_rate": 1.7410526315789475e-05,
+      "loss": 0.8098,
+      "step": 1730
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 3.3020272254943848,
+      "learning_rate": 1.7389473684210527e-05,
+      "loss": 0.7196,
+      "step": 1740
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 3.645238161087036,
+      "learning_rate": 1.736842105263158e-05,
+      "loss": 0.8904,
+      "step": 1750
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 6.018638610839844,
+      "learning_rate": 1.7347368421052633e-05,
+      "loss": 0.7937,
+      "step": 1760
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 3.629096746444702,
+      "learning_rate": 1.7326315789473685e-05,
+      "loss": 0.9171,
+      "step": 1770
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 2.5619189739227295,
+      "learning_rate": 1.7305263157894738e-05,
+      "loss": 0.9488,
+      "step": 1780
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 9.464752197265625,
+      "learning_rate": 1.728421052631579e-05,
+      "loss": 0.8459,
+      "step": 1790
     },
     {
+      "epoch": 0.04,
+      "grad_norm": 3.9856364727020264,
+      "learning_rate": 1.7263157894736843e-05,
+      "loss": 0.8378,
+      "step": 1800
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 3.753553867340088,
+      "learning_rate": 1.7242105263157896e-05,
+      "loss": 0.8093,
+      "step": 1810
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 3.4593358039855957,
+      "learning_rate": 1.722105263157895e-05,
+      "loss": 0.7896,
+      "step": 1820
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 2.7163546085357666,
+      "learning_rate": 1.72e-05,
+      "loss": 0.7188,
+      "step": 1830
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 3.105628728866577,
+      "learning_rate": 1.7178947368421054e-05,
+      "loss": 0.7643,
+      "step": 1840
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 2.387368679046631,
+      "learning_rate": 1.7157894736842107e-05,
+      "loss": 0.8465,
+      "step": 1850
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 6.020385265350342,
+      "learning_rate": 1.713684210526316e-05,
+      "loss": 0.7798,
+      "step": 1860
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 4.560520172119141,
+      "learning_rate": 1.7115789473684212e-05,
+      "loss": 0.7704,
+      "step": 1870
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 15.739727973937988,
+      "learning_rate": 1.7094736842105265e-05,
+      "loss": 0.7148,
+      "step": 1880
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 5.79690408706665,
+      "learning_rate": 1.7073684210526317e-05,
+      "loss": 0.798,
+      "step": 1890
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 2.6939146518707275,
+      "learning_rate": 1.705263157894737e-05,
+      "loss": 0.7641,
+      "step": 1900
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 5.193384170532227,
+      "learning_rate": 1.7031578947368423e-05,
+      "loss": 0.7866,
+      "step": 1910
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 4.940731525421143,
+      "learning_rate": 1.7010526315789475e-05,
+      "loss": 0.8261,
+      "step": 1920
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 2.1812446117401123,
+      "learning_rate": 1.6989473684210528e-05,
+      "loss": 0.7973,
+      "step": 1930
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 3.7413289546966553,
+      "learning_rate": 1.696842105263158e-05,
+      "loss": 0.7818,
+      "step": 1940
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 4.024014472961426,
+      "learning_rate": 1.6947368421052633e-05,
+      "loss": 0.7237,
+      "step": 1950
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 3.0871291160583496,
+      "learning_rate": 1.6926315789473686e-05,
+      "loss": 0.772,
+      "step": 1960
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 3.28814435005188,
+      "learning_rate": 1.690526315789474e-05,
+      "loss": 0.7067,
+      "step": 1970
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 2.8241286277770996,
+      "learning_rate": 1.688421052631579e-05,
+      "loss": 0.8175,
+      "step": 1980
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 2.5942068099975586,
+      "learning_rate": 1.6863157894736844e-05,
+      "loss": 0.9265,
+      "step": 1990
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 6.6822662353515625,
+      "learning_rate": 1.6842105263157896e-05,
+      "loss": 0.8593,
+      "step": 2000
     },
     {
+      "epoch": 0.05,
+      "eval_loss": 0.8064771890640259,
+      "eval_runtime": 67.7887,
+      "eval_samples_per_second": 14.752,
+      "eval_steps_per_second": 14.752,
+      "step": 2000
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 7.032164573669434,
+      "learning_rate": 1.682105263157895e-05,
+      "loss": 0.8819,
+      "step": 2010
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 4.874982833862305,
+      "learning_rate": 1.6800000000000002e-05,
+      "loss": 0.8021,
+      "step": 2020
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 2.6172547340393066,
+      "learning_rate": 1.6778947368421054e-05,
+      "loss": 0.8017,
+      "step": 2030
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 10.659741401672363,
+      "learning_rate": 1.6757894736842107e-05,
+      "loss": 0.8896,
+      "step": 2040
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 6.189141750335693,
+      "learning_rate": 1.673684210526316e-05,
+      "loss": 0.7997,
+      "step": 2050
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 4.523468971252441,
+      "learning_rate": 1.6715789473684212e-05,
+      "loss": 0.8498,
+      "step": 2060
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 8.533658981323242,
+      "learning_rate": 1.6694736842105265e-05,
+      "loss": 0.8857,
+      "step": 2070
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 3.0041606426239014,
+      "learning_rate": 1.6673684210526318e-05,
+      "loss": 0.8112,
+      "step": 2080
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 5.055651664733887,
+      "learning_rate": 1.665263157894737e-05,
+      "loss": 0.7872,
+      "step": 2090
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 5.761922836303711,
+      "learning_rate": 1.6631578947368423e-05,
+      "loss": 0.7727,
+      "step": 2100
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 2.518223524093628,
+      "learning_rate": 1.6610526315789476e-05,
+      "loss": 0.7997,
+      "step": 2110
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 4.975761890411377,
+      "learning_rate": 1.658947368421053e-05,
+      "loss": 0.7457,
+      "step": 2120
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 3.2227561473846436,
+      "learning_rate": 1.656842105263158e-05,
+      "loss": 0.816,
+      "step": 2130
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 4.705923080444336,
+      "learning_rate": 1.6547368421052634e-05,
+      "loss": 0.8113,
+      "step": 2140
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 2.655057430267334,
+      "learning_rate": 1.6526315789473686e-05,
+      "loss": 0.7912,
+      "step": 2150
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 3.0186755657196045,
+      "learning_rate": 1.650526315789474e-05,
+      "loss": 0.8608,
+      "step": 2160
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 1.232386827468872,
+      "learning_rate": 1.648421052631579e-05,
+      "loss": 0.8549,
+      "step": 2170
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 11.968620300292969,
+      "learning_rate": 1.6463157894736844e-05,
+      "loss": 0.868,
+      "step": 2180
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 3.5853216648101807,
+      "learning_rate": 1.6442105263157897e-05,
+      "loss": 0.8388,
+      "step": 2190
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 2.375610589981079,
+      "learning_rate": 1.642105263157895e-05,
+      "loss": 0.9111,
+      "step": 2200
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 1.9734487533569336,
+      "learning_rate": 1.64e-05,
+      "loss": 0.7288,
+      "step": 2210
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 10.517192840576172,
+      "learning_rate": 1.6378947368421055e-05,
+      "loss": 0.698,
+      "step": 2220
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 4.183718204498291,
+      "learning_rate": 1.6357894736842108e-05,
+      "loss": 0.7759,
+      "step": 2230
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 3.9075675010681152,
+      "learning_rate": 1.633684210526316e-05,
+      "loss": 0.7829,
+      "step": 2240
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 5.287744998931885,
+      "learning_rate": 1.6315789473684213e-05,
+      "loss": 0.7057,
+      "step": 2250
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 4.977657318115234,
+      "learning_rate": 1.6294736842105265e-05,
+      "loss": 0.8346,
+      "step": 2260
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 7.196689128875732,
+      "learning_rate": 1.6273684210526318e-05,
+      "loss": 0.8508,
+      "step": 2270
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 2.467477798461914,
+      "learning_rate": 1.6252631578947367e-05,
+      "loss": 0.7179,
+      "step": 2280
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 7.059762954711914,
+      "learning_rate": 1.6231578947368423e-05,
+      "loss": 0.7549,
+      "step": 2290
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 3.980865955352783,
+      "learning_rate": 1.6210526315789473e-05,
+      "loss": 0.814,
+      "step": 2300
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 7.675939559936523,
+      "learning_rate": 1.618947368421053e-05,
+      "loss": 0.8227,
+      "step": 2310
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 3.530073642730713,
+      "learning_rate": 1.616842105263158e-05,
+      "loss": 0.8517,
+      "step": 2320
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 3.6851344108581543,
+      "learning_rate": 1.6147368421052634e-05,
+      "loss": 0.7684,
+      "step": 2330
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 5.206923961639404,
+      "learning_rate": 1.6126315789473687e-05,
+      "loss": 0.8199,
+      "step": 2340
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 5.220828056335449,
+      "learning_rate": 1.6105263157894736e-05,
+      "loss": 0.8871,
+      "step": 2350
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 3.5062482357025146,
+      "learning_rate": 1.6084210526315792e-05,
+      "loss": 0.8281,
+      "step": 2360
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 1.9830796718597412,
+      "learning_rate": 1.606315789473684e-05,
+      "loss": 0.8678,
+      "step": 2370
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 3.3255491256713867,
+      "learning_rate": 1.6042105263157897e-05,
+      "loss": 0.8337,
+      "step": 2380
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 5.259572505950928,
+      "learning_rate": 1.6021052631578947e-05,
+      "loss": 0.7954,
+      "step": 2390
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 3.6201376914978027,
+      "learning_rate": 1.6000000000000003e-05,
+      "loss": 0.818,
+      "step": 2400
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 3.3598544597625732,
+      "learning_rate": 1.5978947368421055e-05,
+      "loss": 0.7697,
+      "step": 2410
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 6.34808349609375,
+      "learning_rate": 1.5957894736842105e-05,
+      "loss": 0.6347,
+      "step": 2420
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 3.967682361602783,
+      "learning_rate": 1.593684210526316e-05,
+      "loss": 0.7178,
+      "step": 2430
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 10.222978591918945,
+      "learning_rate": 1.591578947368421e-05,
+      "loss": 0.7642,
+      "step": 2440
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 3.9339826107025146,
+      "learning_rate": 1.5894736842105266e-05,
+      "loss": 0.8197,
+      "step": 2450
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 2.3337771892547607,
+      "learning_rate": 1.5873684210526315e-05,
+      "loss": 0.9375,
+      "step": 2460
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 2.8479838371276855,
+      "learning_rate": 1.585263157894737e-05,
+      "loss": 0.9196,
+      "step": 2470
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 9.294541358947754,
+      "learning_rate": 1.5831578947368424e-05,
+      "loss": 0.7144,
+      "step": 2480
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 5.325323104858398,
+      "learning_rate": 1.5810526315789473e-05,
+      "loss": 0.7897,
+      "step": 2490
     },
     {
+      "epoch": 0.06,
+      "grad_norm": 4.377369403839111,
+      "learning_rate": 1.578947368421053e-05,
+      "loss": 0.9008,
+      "step": 2500
+    },
+    {
+      "epoch": 0.06,
+      "eval_loss": 0.8163847923278809,
+      "eval_runtime": 67.7994,
+      "eval_samples_per_second": 14.749,
+      "eval_steps_per_second": 14.749,
+      "step": 2500
     }
   ],
+  "logging_steps": 10,
+  "max_steps": 10000,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 2500,
+  "total_flos": 4.025531498496e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ae8864807404348e5714abdca7ecd3f7b499a2f8b4bff1a613654ec5edf69101
-size 4984

 version https://git-lfs.github.com/spec/v1
+oid sha256:51ac7424107c168679594d767b2ffefa42eac9e349caa7916abcb7990d9f453e
+size 4920