diff --git "a/trainer_state.json" "b/trainer_state.json"
--- "a/trainer_state.json"
+++ "b/trainer_state.json"
@@ -1,5437 +1,2749 @@
 {
-  "best_metric": 1.4534417390823364,
-  "best_model_checkpoint": "./results/models/checkpoint-307580",
-  "epoch": 5.0,
+  "best_metric": 1.3305245637893677,
+  "best_model_checkpoint": "./results/models/checkpoint-182628",
+  "epoch": 19.0,
   "eval_steps": 500,
-  "global_step": 384475,
+  "global_step": 182628,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.006502373366278692,
-      "grad_norm": 1.359375,
-      "learning_rate": 0.001999739905065349,
-      "loss": 2.651,
+      "epoch": 0.05201831044527674,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.001997919267582189,
+      "loss": 2.3433,
       "step": 500
     },
     {
-      "epoch": 0.013004746732557384,
-      "grad_norm": 4.40625,
-      "learning_rate": 0.0019994798101306975,
-      "loss": 2.3581,
+      "epoch": 0.10403662089055347,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.001995838535164378,
+      "loss": 1.9524,
       "step": 1000
     },
     {
-      "epoch": 0.019507120098836074,
-      "grad_norm": 0.87890625,
-      "learning_rate": 0.0019992197151960465,
-      "loss": 2.2735,
+      "epoch": 0.1560549313358302,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.001993757802746567,
+      "loss": 1.8833,
       "step": 1500
     },
     {
-      "epoch": 0.026009493465114768,
-      "grad_norm": 0.60546875,
-      "learning_rate": 0.0019989596202613954,
-      "loss": 2.2235,
+      "epoch": 0.20807324178110695,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0019916770703287557,
+      "loss": 1.8518,
       "step": 2000
     },
     {
-      "epoch": 0.03251186683139346,
-      "grad_norm": 1.75,
-      "learning_rate": 0.0019986995253267444,
-      "loss": 2.1651,
+      "epoch": 0.2600915522263837,
+      "grad_norm": 0.494140625,
+      "learning_rate": 0.0019895963379109446,
+      "loss": 1.8064,
       "step": 2500
     },
     {
-      "epoch": 0.03901424019767215,
-      "grad_norm": 0.68359375,
-      "learning_rate": 0.0019984394303920934,
-      "loss": 2.125,
+      "epoch": 0.3121098626716604,
+      "grad_norm": 0.61328125,
+      "learning_rate": 0.0019875156054931335,
+      "loss": 1.7881,
       "step": 3000
     },
     {
-      "epoch": 0.04551661356395084,
-      "grad_norm": 0.5703125,
-      "learning_rate": 0.001998179335457442,
-      "loss": 2.0895,
+      "epoch": 0.3641281731169372,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.0019854348730753224,
+      "loss": 1.7552,
       "step": 3500
     },
     {
-      "epoch": 0.052018986930229535,
-      "grad_norm": 0.6796875,
-      "learning_rate": 0.001997919240522791,
-      "loss": 2.0827,
+      "epoch": 0.4161464835622139,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0019833541406575114,
+      "loss": 1.7362,
       "step": 4000
     },
     {
-      "epoch": 0.05852136029650822,
-      "grad_norm": 0.78515625,
-      "learning_rate": 0.00199765914558814,
-      "loss": 2.0646,
+      "epoch": 0.4681647940074906,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0019812734082397003,
+      "loss": 1.7302,
       "step": 4500
     },
     {
-      "epoch": 0.06502373366278692,
-      "grad_norm": 0.6953125,
-      "learning_rate": 0.0019973990506534883,
-      "loss": 2.039,
+      "epoch": 0.5201831044527674,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0019791926758218896,
+      "loss": 1.6997,
       "step": 5000
     },
     {
-      "epoch": 0.0715261070290656,
-      "grad_norm": 0.83984375,
-      "learning_rate": 0.0019971389557188377,
-      "loss": 2.0101,
+      "epoch": 0.5722014148980441,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.001977111943404078,
+      "loss": 1.6694,
       "step": 5500
     },
     {
-      "epoch": 0.0780284803953443,
-      "grad_norm": 0.95703125,
-      "learning_rate": 0.0019968788607841862,
-      "loss": 2.0275,
+      "epoch": 0.6242197253433208,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.001975031210986267,
+      "loss": 1.6882,
       "step": 6000
     },
     {
-      "epoch": 0.08453085376162299,
-      "grad_norm": 0.68359375,
-      "learning_rate": 0.001996618765849535,
-      "loss": 1.9981,
+      "epoch": 0.6762380357885975,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0019729504785684564,
+      "loss": 1.6712,
       "step": 6500
     },
     {
-      "epoch": 0.09103322712790168,
-      "grad_norm": 0.69140625,
-      "learning_rate": 0.001996358670914884,
-      "loss": 1.9754,
+      "epoch": 0.7282563462338744,
+      "grad_norm": 1.21875,
+      "learning_rate": 0.0019708697461506453,
+      "loss": 1.6519,
       "step": 7000
     },
     {
-      "epoch": 0.09753560049418038,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.0019960985759802327,
-      "loss": 1.9366,
+      "epoch": 0.7802746566791511,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0019687890137328337,
+      "loss": 1.6525,
       "step": 7500
     },
     {
-      "epoch": 0.10403797386045907,
-      "grad_norm": 0.8671875,
-      "learning_rate": 0.0019958384810455816,
-      "loss": 1.921,
+      "epoch": 0.8322929671244278,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.001966708281315023,
+      "loss": 1.6239,
       "step": 8000
     },
     {
-      "epoch": 0.11054034722673776,
-      "grad_norm": 0.455078125,
-      "learning_rate": 0.0019955783861109306,
-      "loss": 1.9106,
+      "epoch": 0.8843112775697045,
+      "grad_norm": 0.546875,
+      "learning_rate": 0.001964627548897212,
+      "loss": 1.6152,
       "step": 8500
     },
     {
-      "epoch": 0.11704272059301644,
-      "grad_norm": 0.57421875,
-      "learning_rate": 0.001995318291176279,
-      "loss": 1.9082,
+      "epoch": 0.9363295880149812,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.0019625468164794005,
+      "loss": 1.6053,
       "step": 9000
     },
     {
-      "epoch": 0.12354509395929514,
-      "grad_norm": 0.58984375,
-      "learning_rate": 0.0019950581962416285,
-      "loss": 1.8952,
+      "epoch": 0.9883478984602581,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.00196046608406159,
+      "loss": 1.6078,
       "step": 9500
     },
     {
-      "epoch": 0.13004746732557385,
-      "grad_norm": 0.5234375,
-      "learning_rate": 0.001994798101306977,
-      "loss": 1.9024,
+      "epoch": 1.0,
+      "eval_loss": 1.5721051692962646,
+      "eval_runtime": 1.4853,
+      "eval_samples_per_second": 673.283,
+      "eval_steps_per_second": 0.673,
+      "step": 9612
+    },
+    {
+      "epoch": 1.0403662089055348,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0019583853516437788,
+      "loss": 1.5953,
       "step": 10000
     },
     {
-      "epoch": 0.13654984069185253,
-      "grad_norm": 1.0859375,
-      "learning_rate": 0.001994538006372326,
-      "loss": 1.9417,
+      "epoch": 1.0923845193508115,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.0019563046192259677,
+      "loss": 1.587,
       "step": 10500
     },
     {
-      "epoch": 0.1430522140581312,
-      "grad_norm": 0.609375,
-      "learning_rate": 0.001994277911437675,
-      "loss": 1.9231,
+      "epoch": 1.1444028297960882,
+      "grad_norm": 0.5625,
+      "learning_rate": 0.0019542238868081566,
+      "loss": 1.5759,
       "step": 11000
     },
     {
-      "epoch": 0.1495545874244099,
-      "grad_norm": 1.09375,
-      "learning_rate": 0.0019940178165030235,
-      "loss": 1.9009,
+      "epoch": 1.196421140241365,
+      "grad_norm": 0.6484375,
+      "learning_rate": 0.0019521431543903455,
+      "loss": 1.5785,
       "step": 11500
     },
     {
-      "epoch": 0.1560569607906886,
-      "grad_norm": 0.62109375,
-      "learning_rate": 0.0019937577215683724,
-      "loss": 1.9132,
+      "epoch": 1.2484394506866416,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.0019500624219725344,
+      "loss": 1.5734,
       "step": 12000
     },
     {
-      "epoch": 0.1625593341569673,
-      "grad_norm": 0.59765625,
-      "learning_rate": 0.0019934976266337214,
-      "loss": 1.8845,
+      "epoch": 1.3004577611319184,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0019479816895547233,
+      "loss": 1.5593,
       "step": 12500
     },
     {
-      "epoch": 0.16906170752324598,
-      "grad_norm": 1.578125,
-      "learning_rate": 0.0019932375316990703,
-      "loss": 1.8613,
+      "epoch": 1.352476071577195,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.0019459009571369122,
+      "loss": 1.5589,
       "step": 13000
     },
     {
-      "epoch": 0.1755640808895247,
-      "grad_norm": 4.78125,
-      "learning_rate": 0.0019929774367644193,
-      "loss": 1.8528,
+      "epoch": 1.404494382022472,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.0019438202247191011,
+      "loss": 1.555,
       "step": 13500
     },
     {
-      "epoch": 0.18206645425580337,
-      "grad_norm": 0.56640625,
-      "learning_rate": 0.001992717341829768,
-      "loss": 1.8416,
+      "epoch": 1.4565126924677487,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.00194173949230129,
+      "loss": 1.548,
       "step": 14000
     },
     {
-      "epoch": 0.18856882762208205,
-      "grad_norm": 0.73828125,
-      "learning_rate": 0.0019924572468951168,
-      "loss": 1.8443,
+      "epoch": 1.5085310029130254,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.001939658759883479,
+      "loss": 1.5519,
       "step": 14500
     },
     {
-      "epoch": 0.19507120098836075,
-      "grad_norm": 0.53125,
-      "learning_rate": 0.0019921971519604657,
-      "loss": 1.8301,
+      "epoch": 1.5605493133583022,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.001937578027465668,
+      "loss": 1.5534,
       "step": 15000
     },
     {
-      "epoch": 0.20157357435463943,
-      "grad_norm": 0.98828125,
-      "learning_rate": 0.0019919370570258142,
-      "loss": 1.8077,
+      "epoch": 1.6125676238035789,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0019354972950478568,
+      "loss": 1.5408,
       "step": 15500
     },
     {
-      "epoch": 0.20807594772091814,
-      "grad_norm": 1.53125,
-      "learning_rate": 0.001991676962091163,
-      "loss": 1.7963,
+      "epoch": 1.6645859342488556,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0019334165626300457,
+      "loss": 1.5339,
       "step": 16000
     },
     {
-      "epoch": 0.21457832108719682,
-      "grad_norm": 0.671875,
-      "learning_rate": 0.001991416867156512,
-      "loss": 1.7879,
+      "epoch": 1.7166042446941323,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0019313358302122348,
+      "loss": 1.5238,
       "step": 16500
     },
     {
-      "epoch": 0.22108069445347553,
-      "grad_norm": 5.25,
-      "learning_rate": 0.001991156772221861,
-      "loss": 1.793,
+      "epoch": 1.768622555139409,
+      "grad_norm": 0.474609375,
+      "learning_rate": 0.0019292550977944235,
+      "loss": 1.527,
       "step": 17000
     },
     {
-      "epoch": 0.2275830678197542,
-      "grad_norm": 0.6171875,
-      "learning_rate": 0.00199089667728721,
-      "loss": 1.7897,
+      "epoch": 1.8206408655846857,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0019271743653766125,
+      "loss": 1.5253,
       "step": 17500
     },
     {
-      "epoch": 0.2340854411860329,
-      "grad_norm": 0.8359375,
-      "learning_rate": 0.0019906365823525586,
-      "loss": 1.7831,
+      "epoch": 1.8726591760299627,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.0019250936329588016,
+      "loss": 1.5217,
       "step": 18000
     },
     {
-      "epoch": 0.2405878145523116,
-      "grad_norm": 1.28125,
-      "learning_rate": 0.0019903764874179075,
-      "loss": 1.7912,
+      "epoch": 1.9246774864752392,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.0019230129005409905,
+      "loss": 1.5201,
       "step": 18500
     },
     {
-      "epoch": 0.24709018791859028,
-      "grad_norm": 0.79296875,
-      "learning_rate": 0.0019901163924832565,
-      "loss": 1.7988,
+      "epoch": 1.9766957969205161,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0019209321681231794,
+      "loss": 1.5157,
       "step": 19000
     },
     {
-      "epoch": 0.25359256128486896,
-      "grad_norm": 0.5234375,
-      "learning_rate": 0.001989856297548605,
-      "loss": 1.8031,
+      "epoch": 2.0,
+      "eval_loss": 1.4967154264450073,
+      "eval_runtime": 1.4155,
+      "eval_samples_per_second": 706.48,
+      "eval_steps_per_second": 0.706,
+      "step": 19224
+    },
+    {
+      "epoch": 2.0287141073657926,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.0019188514357053683,
+      "loss": 1.5093,
       "step": 19500
     },
     {
-      "epoch": 0.2600949346511477,
-      "grad_norm": 0.71484375,
-      "learning_rate": 0.0019895962026139544,
-      "loss": 1.8197,
+      "epoch": 2.0807324178110695,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0019167707032875572,
+      "loss": 1.5081,
       "step": 20000
     },
     {
-      "epoch": 0.26659730801742637,
-      "grad_norm": 0.78515625,
-      "learning_rate": 0.001989336107679303,
-      "loss": 1.8048,
+      "epoch": 2.132750728256346,
+      "grad_norm": 0.447265625,
+      "learning_rate": 0.0019146899708697464,
+      "loss": 1.5137,
       "step": 20500
     },
     {
-      "epoch": 0.27309968138370505,
-      "grad_norm": 0.546875,
-      "learning_rate": 0.001989076012744652,
-      "loss": 1.7964,
+      "epoch": 2.184769038701623,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.001912609238451935,
+      "loss": 1.5052,
       "step": 21000
     },
     {
-      "epoch": 0.27960205474998373,
-      "grad_norm": 0.64453125,
-      "learning_rate": 0.001988815917810001,
-      "loss": 1.7898,
+      "epoch": 2.2367873491468995,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.001910528506034124,
+      "loss": 1.4989,
       "step": 21500
     },
     {
-      "epoch": 0.2861044281162624,
-      "grad_norm": 5.25,
-      "learning_rate": 0.0019885558228753494,
-      "loss": 1.7911,
+      "epoch": 2.2888056595921764,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0019084477736163131,
+      "loss": 1.4933,
       "step": 22000
     },
     {
-      "epoch": 0.29260680148254115,
-      "grad_norm": 0.494140625,
-      "learning_rate": 0.0019882957279406983,
-      "loss": 1.7739,
+      "epoch": 2.3408239700374533,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0019063670411985018,
+      "loss": 1.4908,
       "step": 22500
     },
     {
-      "epoch": 0.2991091748488198,
-      "grad_norm": 0.6015625,
-      "learning_rate": 0.0019880356330060473,
-      "loss": 1.7675,
+      "epoch": 2.39284228048273,
+      "grad_norm": 0.60546875,
+      "learning_rate": 0.0019042863087806907,
+      "loss": 1.483,
       "step": 23000
     },
     {
-      "epoch": 0.3056115482150985,
-      "grad_norm": 0.431640625,
-      "learning_rate": 0.001987775538071396,
-      "loss": 1.767,
+      "epoch": 2.444860590928007,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0019022055763628799,
+      "loss": 1.4808,
       "step": 23500
     },
     {
-      "epoch": 0.3121139215813772,
-      "grad_norm": 0.75390625,
-      "learning_rate": 0.001987515443136745,
-      "loss": 1.7782,
+      "epoch": 2.4968789013732833,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0019001248439450688,
+      "loss": 1.4751,
       "step": 24000
     },
     {
-      "epoch": 0.3186162949476559,
-      "grad_norm": 0.51171875,
-      "learning_rate": 0.0019872553482020937,
-      "loss": 1.7878,
+      "epoch": 2.54889721181856,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0018980441115272575,
+      "loss": 1.4713,
       "step": 24500
     },
     {
-      "epoch": 0.3251186683139346,
-      "grad_norm": 0.625,
-      "learning_rate": 0.0019869952532674427,
-      "loss": 1.7762,
+      "epoch": 2.6009155222638367,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0018959633791094466,
+      "loss": 1.4743,
       "step": 25000
     },
     {
-      "epoch": 0.3316210416802133,
-      "grad_norm": 6.15625,
-      "learning_rate": 0.0019867351583327916,
-      "loss": 1.744,
+      "epoch": 2.6529338327091136,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0018938826466916355,
+      "loss": 1.4703,
       "step": 25500
     },
     {
-      "epoch": 0.33812341504649196,
-      "grad_norm": 0.82421875,
-      "learning_rate": 0.00198647506339814,
-      "loss": 1.7591,
+      "epoch": 2.70495214315439,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0018918019142738244,
+      "loss": 1.4722,
       "step": 26000
     },
     {
-      "epoch": 0.34462578841277064,
+      "epoch": 2.756970453599667,
       "grad_norm": 0.87890625,
-      "learning_rate": 0.001986214968463489,
-      "loss": 1.7565,
+      "learning_rate": 0.0018897211818560133,
+      "loss": 1.4728,
       "step": 26500
     },
     {
-      "epoch": 0.3511281617790494,
-      "grad_norm": 0.5078125,
-      "learning_rate": 0.001985954873528838,
-      "loss": 1.7639,
+      "epoch": 2.808988764044944,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.0018876404494382023,
+      "loss": 1.4738,
       "step": 27000
     },
     {
-      "epoch": 0.35763053514532805,
-      "grad_norm": 0.609375,
-      "learning_rate": 0.001985694778594187,
-      "loss": 1.7547,
+      "epoch": 2.8610070744902205,
+      "grad_norm": 1.7265625,
+      "learning_rate": 0.0018855597170203914,
+      "loss": 1.4717,
       "step": 27500
     },
     {
-      "epoch": 0.36413290851160673,
-      "grad_norm": 0.84375,
-      "learning_rate": 0.001985434683659536,
-      "loss": 1.7356,
+      "epoch": 2.9130253849354975,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.00188347898460258,
+      "loss": 1.473,
       "step": 28000
     },
     {
-      "epoch": 0.3706352818778854,
-      "grad_norm": 0.66796875,
-      "learning_rate": 0.0019851745887248845,
-      "loss": 1.7366,
+      "epoch": 2.965043695380774,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.001881398252184769,
+      "loss": 1.472,
       "step": 28500
     },
     {
-      "epoch": 0.3771376552441641,
-      "grad_norm": 1.2265625,
-      "learning_rate": 0.0019849144937902335,
-      "loss": 1.7293,
+      "epoch": 3.0,
+      "eval_loss": 1.4684182405471802,
+      "eval_runtime": 1.4391,
+      "eval_samples_per_second": 694.9,
+      "eval_steps_per_second": 0.695,
+      "step": 28836
+    },
+    {
+      "epoch": 3.017062005826051,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0018793175197669581,
+      "loss": 1.4727,
       "step": 29000
     },
     {
-      "epoch": 0.38364002861044283,
-      "grad_norm": 0.498046875,
-      "learning_rate": 0.0019846543988555824,
-      "loss": 1.7272,
+      "epoch": 3.0690803162713274,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.001877236787349147,
+      "loss": 1.4677,
       "step": 29500
     },
     {
-      "epoch": 0.3901424019767215,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.001984394303920931,
-      "loss": 1.7196,
+      "epoch": 3.1210986267166043,
+      "grad_norm": 0.470703125,
+      "learning_rate": 0.0018751560549313357,
+      "loss": 1.4668,
       "step": 30000
     },
     {
-      "epoch": 0.3966447753430002,
-      "grad_norm": 0.5078125,
-      "learning_rate": 0.00198413420898628,
-      "loss": 1.7293,
+      "epoch": 3.173116937161881,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.0018730753225135249,
+      "loss": 1.4653,
       "step": 30500
     },
     {
-      "epoch": 0.40314714870927887,
-      "grad_norm": 0.875,
-      "learning_rate": 0.001983874114051629,
-      "loss": 1.7171,
+      "epoch": 3.2251352476071578,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0018709945900957138,
+      "loss": 1.4667,
       "step": 31000
     },
     {
-      "epoch": 0.4096495220755576,
-      "grad_norm": 1.296875,
-      "learning_rate": 0.001983614019116978,
-      "loss": 1.7131,
+      "epoch": 3.2771535580524347,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0018689138576779025,
+      "loss": 1.4613,
       "step": 31500
     },
     {
-      "epoch": 0.4161518954418363,
-      "grad_norm": 2.546875,
-      "learning_rate": 0.001983353924182327,
-      "loss": 1.7128,
+      "epoch": 3.329171868497711,
+      "grad_norm": 0.462890625,
+      "learning_rate": 0.0018668331252600916,
+      "loss": 1.4604,
       "step": 32000
     },
     {
-      "epoch": 0.42265426880811496,
-      "grad_norm": 0.71484375,
-      "learning_rate": 0.0019830938292476753,
-      "loss": 1.7127,
+      "epoch": 3.381190178942988,
+      "grad_norm": 0.5546875,
+      "learning_rate": 0.0018647523928422805,
+      "loss": 1.4672,
       "step": 32500
     },
     {
-      "epoch": 0.42915664217439364,
-      "grad_norm": 0.546875,
-      "learning_rate": 0.0019828337343130243,
-      "loss": 1.7052,
+      "epoch": 3.4332084893882646,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.0018626716604244697,
+      "loss": 1.465,
       "step": 33000
     },
     {
-      "epoch": 0.4356590155406723,
-      "grad_norm": 0.5625,
-      "learning_rate": 0.0019825736393783732,
-      "loss": 1.701,
+      "epoch": 3.4852267998335416,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0018605909280066584,
+      "loss": 1.4604,
       "step": 33500
     },
     {
-      "epoch": 0.44216138890695106,
-      "grad_norm": 0.4609375,
-      "learning_rate": 0.0019823135444437217,
-      "loss": 1.6965,
+      "epoch": 3.537245110278818,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0018585101955888473,
+      "loss": 1.4539,
       "step": 34000
     },
     {
-      "epoch": 0.44866376227322974,
-      "grad_norm": 0.5625,
-      "learning_rate": 0.001982053449509071,
-      "loss": 1.6987,
+      "epoch": 3.589263420724095,
+      "grad_norm": 0.59375,
+      "learning_rate": 0.0018564294631710364,
+      "loss": 1.4536,
       "step": 34500
     },
     {
-      "epoch": 0.4551661356395084,
-      "grad_norm": 1.4296875,
-      "learning_rate": 0.0019817933545744197,
-      "loss": 1.699,
+      "epoch": 3.6412817311693715,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.001854348730753225,
+      "loss": 1.4549,
       "step": 35000
     },
     {
-      "epoch": 0.4616685090057871,
-      "grad_norm": 0.490234375,
-      "learning_rate": 0.0019815332596397686,
-      "loss": 1.6868,
+      "epoch": 3.6933000416146484,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.001852267998335414,
+      "loss": 1.4571,
       "step": 35500
     },
     {
-      "epoch": 0.4681708823720658,
-      "grad_norm": 0.73828125,
-      "learning_rate": 0.0019812731647051176,
-      "loss": 1.6836,
+      "epoch": 3.7453183520599254,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0018501872659176031,
+      "loss": 1.4584,
       "step": 36000
     },
     {
-      "epoch": 0.4746732557383445,
-      "grad_norm": 0.5234375,
-      "learning_rate": 0.001981013069770466,
-      "loss": 1.6805,
+      "epoch": 3.797336662505202,
+      "grad_norm": 0.53515625,
+      "learning_rate": 0.001848106533499792,
+      "loss": 1.4542,
       "step": 36500
     },
     {
-      "epoch": 0.4811756291046232,
-      "grad_norm": 0.49609375,
-      "learning_rate": 0.001980752974835815,
-      "loss": 1.6832,
+      "epoch": 3.8493549729504783,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0018460258010819808,
+      "loss": 1.4588,
       "step": 37000
     },
     {
-      "epoch": 0.4876780024709019,
-      "grad_norm": 0.71484375,
-      "learning_rate": 0.001980492879901164,
-      "loss": 1.6827,
+      "epoch": 3.9013732833957553,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0018439450686641699,
+      "loss": 1.456,
       "step": 37500
     },
     {
-      "epoch": 0.49418037583718055,
-      "grad_norm": 0.58984375,
-      "learning_rate": 0.0019802327849665125,
-      "loss": 1.6739,
+      "epoch": 3.9533915938410322,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0018418643362463588,
+      "loss": 1.4596,
       "step": 38000
     },
     {
-      "epoch": 0.5006827492034592,
-      "grad_norm": 0.71875,
-      "learning_rate": 0.001979972690031862,
-      "loss": 1.676,
+      "epoch": 4.0,
+      "eval_loss": 1.4403541088104248,
+      "eval_runtime": 1.4124,
+      "eval_samples_per_second": 708.019,
+      "eval_steps_per_second": 0.708,
+      "step": 38448
+    },
+    {
+      "epoch": 4.005409904286309,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0018397836038285475,
+      "loss": 1.449,
       "step": 38500
     },
     {
-      "epoch": 0.5071851225697379,
-      "grad_norm": 0.76171875,
-      "learning_rate": 0.0019797125950972105,
-      "loss": 1.6762,
+      "epoch": 4.057428214731585,
+      "grad_norm": 0.349609375,
+      "learning_rate": 0.0018377028714107366,
+      "loss": 1.445,
       "step": 39000
     },
     {
-      "epoch": 0.5136874959360166,
-      "grad_norm": 0.69140625,
-      "learning_rate": 0.0019794525001625594,
-      "loss": 1.6743,
+      "epoch": 4.109446525176862,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0018356221389929255,
+      "loss": 1.4441,
       "step": 39500
     },
     {
-      "epoch": 0.5201898693022954,
-      "grad_norm": 0.69921875,
-      "learning_rate": 0.0019791924052279084,
-      "loss": 1.6713,
+      "epoch": 4.161464835622139,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0018335414065751145,
+      "loss": 1.4423,
       "step": 40000
     },
     {
-      "epoch": 0.5266922426685741,
-      "grad_norm": 0.609375,
-      "learning_rate": 0.001978932310293257,
-      "loss": 1.668,
+      "epoch": 4.213483146067416,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.0018314606741573034,
+      "loss": 1.442,
       "step": 40500
     },
     {
-      "epoch": 0.5331946160348527,
-      "grad_norm": 0.55078125,
-      "learning_rate": 0.001978672215358606,
-      "loss": 1.665,
+      "epoch": 4.265501456512692,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0018293799417394923,
+      "loss": 1.4348,
       "step": 41000
     },
     {
-      "epoch": 0.5396969894011314,
-      "grad_norm": 0.6171875,
-      "learning_rate": 0.001978412120423955,
-      "loss": 1.6673,
+      "epoch": 4.317519766957969,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.0018272992093216814,
+      "loss": 1.4352,
       "step": 41500
     },
     {
-      "epoch": 0.5461993627674101,
-      "grad_norm": 2.578125,
-      "learning_rate": 0.0019781520254893038,
-      "loss": 1.6608,
+      "epoch": 4.369538077403246,
+      "grad_norm": 0.4765625,
+      "learning_rate": 0.0018252184769038703,
+      "loss": 1.4344,
       "step": 42000
     },
     {
-      "epoch": 0.5527017361336888,
-      "grad_norm": 0.46484375,
-      "learning_rate": 0.0019778919305546527,
-      "loss": 1.6578,
+      "epoch": 4.421556387848523,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.001823137744486059,
+      "loss": 1.438,
       "step": 42500
     },
     {
-      "epoch": 0.5592041094999675,
-      "grad_norm": 0.57421875,
-      "learning_rate": 0.0019776318356200012,
-      "loss": 1.6561,
+      "epoch": 4.473574698293799,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.0018210570120682482,
+      "loss": 1.4417,
       "step": 43000
     },
     {
-      "epoch": 0.5657064828662461,
-      "grad_norm": 0.57421875,
-      "learning_rate": 0.00197737174068535,
-      "loss": 1.6527,
+      "epoch": 4.525593008739076,
+      "grad_norm": 1.2265625,
+      "learning_rate": 0.001818976279650437,
+      "loss": 1.436,
       "step": 43500
     },
     {
-      "epoch": 0.5722088562325248,
-      "grad_norm": 0.640625,
-      "learning_rate": 0.001977111645750699,
-      "loss": 1.6467,
+      "epoch": 4.577611319184353,
+      "grad_norm": 0.48828125,
+      "learning_rate": 0.0018168955472326258,
+      "loss": 1.4367,
       "step": 44000
     },
     {
-      "epoch": 0.5787112295988036,
-      "grad_norm": 0.6015625,
-      "learning_rate": 0.0019768515508160477,
-      "loss": 1.6501,
+      "epoch": 4.62962962962963,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.001814814814814815,
+      "loss": 1.4299,
       "step": 44500
     },
     {
-      "epoch": 0.5852136029650823,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.0019765914558813966,
-      "loss": 1.6533,
+      "epoch": 4.681647940074907,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0018127340823970038,
+      "loss": 1.4332,
       "step": 45000
     },
     {
-      "epoch": 0.591715976331361,
-      "grad_norm": 0.78515625,
-      "learning_rate": 0.0019763313609467456,
-      "loss": 1.6543,
+      "epoch": 4.733666250520183,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.0018106533499791927,
+      "loss": 1.4269,
       "step": 45500
     },
     {
-      "epoch": 0.5982183496976397,
-      "grad_norm": 0.53125,
-      "learning_rate": 0.0019760712660120945,
-      "loss": 1.651,
+      "epoch": 4.78568456096546,
+      "grad_norm": 0.671875,
+      "learning_rate": 0.0018085726175613816,
+      "loss": 1.4276,
       "step": 46000
     },
     {
-      "epoch": 0.6047207230639183,
-      "grad_norm": 0.5703125,
-      "learning_rate": 0.0019758111710774435,
-      "loss": 1.6474,
+      "epoch": 4.837702871410737,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0018064918851435705,
+      "loss": 1.4288,
       "step": 46500
     },
     {
-      "epoch": 0.611223096430197,
-      "grad_norm": 0.5,
-      "learning_rate": 0.001975551076142792,
-      "loss": 1.6363,
+      "epoch": 4.889721181856014,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0018044111527257595,
+      "loss": 1.4351,
       "step": 47000
     },
     {
-      "epoch": 0.6177254697964757,
-      "grad_norm": 0.48828125,
-      "learning_rate": 0.001975290981208141,
-      "loss": 1.6395,
+      "epoch": 4.94173949230129,
+      "grad_norm": 0.470703125,
+      "learning_rate": 0.0018023304203079484,
+      "loss": 1.4307,
       "step": 47500
     },
     {
-      "epoch": 0.6242278431627544,
-      "grad_norm": 1.0078125,
-      "learning_rate": 0.00197503088627349,
-      "loss": 1.6419,
+      "epoch": 4.9937578027465666,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0018002496878901373,
+      "loss": 1.4257,
       "step": 48000
     },
     {
-      "epoch": 0.630730216529033,
-      "grad_norm": 0.53515625,
-      "learning_rate": 0.0019747707913388385,
-      "loss": 1.6377,
+      "epoch": 5.0,
+      "eval_loss": 1.4106667041778564,
+      "eval_runtime": 1.4218,
+      "eval_samples_per_second": 703.325,
+      "eval_steps_per_second": 0.703,
+      "step": 48060
+    },
+    {
+      "epoch": 5.0457761131918435,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0017981689554723264,
+      "loss": 1.4193,
       "step": 48500
     },
     {
-      "epoch": 0.6372325898953118,
-      "grad_norm": 0.70703125,
-      "learning_rate": 0.001974510696404188,
-      "loss": 1.6428,
+      "epoch": 5.09779442363712,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0017960882230545153,
+      "loss": 1.4176,
       "step": 49000
     },
     {
-      "epoch": 0.6437349632615905,
-      "grad_norm": 0.5859375,
-      "learning_rate": 0.0019742506014695364,
-      "loss": 1.6507,
+      "epoch": 5.149812734082397,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.001794007490636704,
+      "loss": 1.4157,
       "step": 49500
     },
     {
-      "epoch": 0.6502373366278692,
-      "grad_norm": 0.76953125,
-      "learning_rate": 0.0019739905065348853,
-      "loss": 1.6552,
+      "epoch": 5.201831044527673,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0017919267582188932,
+      "loss": 1.4186,
       "step": 50000
     },
     {
-      "epoch": 0.6567397099941479,
-      "grad_norm": 2.09375,
-      "learning_rate": 0.0019737304116002343,
-      "loss": 1.6509,
+      "epoch": 5.25384935497295,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.001789846025801082,
+      "loss": 1.4155,
       "step": 50500
     },
     {
-      "epoch": 0.6632420833604266,
-      "grad_norm": 0.5390625,
-      "learning_rate": 0.001973470316665583,
-      "loss": 1.6545,
+      "epoch": 5.305867665418227,
+      "grad_norm": 2.640625,
+      "learning_rate": 0.0017877652933832708,
+      "loss": 1.4142,
       "step": 51000
     },
     {
-      "epoch": 0.6697444567267052,
-      "grad_norm": 0.51953125,
-      "learning_rate": 0.0019732102217309318,
-      "loss": 1.6451,
+      "epoch": 5.357885975863504,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00178568456096546,
+      "loss": 1.4155,
       "step": 51500
     },
     {
-      "epoch": 0.6762468300929839,
-      "grad_norm": 0.6953125,
-      "learning_rate": 0.0019729501267962807,
-      "loss": 1.6408,
+      "epoch": 5.40990428630878,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0017836038285476488,
+      "loss": 1.4113,
       "step": 52000
     },
     {
-      "epoch": 0.6827492034592626,
-      "grad_norm": 0.5546875,
-      "learning_rate": 0.0019726900318616293,
-      "loss": 1.6378,
+      "epoch": 5.461922596754057,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.0017815230961298377,
+      "loss": 1.4145,
       "step": 52500
     },
     {
-      "epoch": 0.6892515768255413,
-      "grad_norm": 0.56640625,
-      "learning_rate": 0.0019724299369269786,
-      "loss": 1.637,
+      "epoch": 5.513940907199334,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.0017794423637120266,
+      "loss": 1.4132,
       "step": 53000
     },
     {
-      "epoch": 0.6957539501918201,
-      "grad_norm": 0.5625,
-      "learning_rate": 0.001972169841992327,
-      "loss": 1.6317,
+      "epoch": 5.565959217644611,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0017773616312942156,
+      "loss": 1.4141,
       "step": 53500
     },
     {
-      "epoch": 0.7022563235580987,
-      "grad_norm": 0.83984375,
-      "learning_rate": 0.001971909747057676,
-      "loss": 1.6326,
+      "epoch": 5.617977528089888,
+      "grad_norm": 0.51171875,
+      "learning_rate": 0.0017752808988764045,
+      "loss": 1.4118,
       "step": 54000
     },
     {
-      "epoch": 0.7087586969243774,
-      "grad_norm": 0.609375,
-      "learning_rate": 0.001971649652123025,
-      "loss": 1.6285,
+      "epoch": 5.669995838535164,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.0017732001664585936,
+      "loss": 1.4094,
       "step": 54500
     },
     {
-      "epoch": 0.7152610702906561,
-      "grad_norm": 0.51953125,
-      "learning_rate": 0.0019713895571883736,
-      "loss": 1.6336,
+      "epoch": 5.722014148980441,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0017711194340407823,
+      "loss": 1.4088,
       "step": 55000
     },
     {
-      "epoch": 0.7217634436569348,
-      "grad_norm": 0.6015625,
-      "learning_rate": 0.0019711294622537226,
-      "loss": 1.636,
+      "epoch": 5.774032459425718,
+      "grad_norm": 2.34375,
+      "learning_rate": 0.0017690387016229714,
+      "loss": 1.4068,
       "step": 55500
     },
     {
-      "epoch": 0.7282658170232135,
-      "grad_norm": 0.5,
-      "learning_rate": 0.0019708693673190715,
-      "loss": 1.6426,
+      "epoch": 5.826050769870995,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0017669579692051603,
+      "loss": 1.4059,
       "step": 56000
     },
     {
-      "epoch": 0.7347681903894921,
-      "grad_norm": 0.69140625,
-      "learning_rate": 0.0019706092723844205,
-      "loss": 1.6444,
+      "epoch": 5.878069080316271,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.001764877236787349,
+      "loss": 1.4041,
       "step": 56500
     },
     {
-      "epoch": 0.7412705637557708,
-      "grad_norm": 2.140625,
-      "learning_rate": 0.0019703491774497694,
-      "loss": 1.6306,
+      "epoch": 5.930087390761548,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0017627965043695382,
+      "loss": 1.4024,
       "step": 57000
     },
     {
-      "epoch": 0.7477729371220495,
-      "grad_norm": 0.5390625,
-      "learning_rate": 0.001970089082515118,
-      "loss": 1.6341,
+      "epoch": 5.982105701206825,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.001760715771951727,
+      "loss": 1.4006,
       "step": 57500
     },
     {
-      "epoch": 0.7542753104883282,
-      "grad_norm": 0.435546875,
-      "learning_rate": 0.001969828987580467,
-      "loss": 1.6356,
+      "epoch": 6.0,
+      "eval_loss": 1.3921489715576172,
+      "eval_runtime": 1.4219,
+      "eval_samples_per_second": 703.273,
+      "eval_steps_per_second": 0.703,
+      "step": 57672
+    },
+    {
+      "epoch": 6.034124011652102,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.001758635039533916,
+      "loss": 1.4001,
       "step": 58000
     },
     {
-      "epoch": 0.760777683854607,
-      "grad_norm": 0.546875,
-      "learning_rate": 0.001969568892645816,
-      "loss": 1.6326,
+      "epoch": 6.086142322097379,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.001756554307116105,
+      "loss": 1.3981,
       "step": 58500
     },
     {
-      "epoch": 0.7672800572208857,
-      "grad_norm": 0.640625,
-      "learning_rate": 0.0019693087977111644,
-      "loss": 1.6272,
+      "epoch": 6.138160632542655,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.0017544735746982938,
+      "loss": 1.4018,
       "step": 59000
     },
     {
-      "epoch": 0.7737824305871643,
-      "grad_norm": 1.09375,
-      "learning_rate": 0.0019690487027765134,
-      "loss": 1.6223,
+      "epoch": 6.190178942987932,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0017523928422804827,
+      "loss": 1.397,
       "step": 59500
     },
     {
-      "epoch": 0.780284803953443,
-      "grad_norm": 0.6015625,
-      "learning_rate": 0.0019687886078418623,
-      "loss": 1.6297,
+      "epoch": 6.242197253433209,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0017503121098626717,
+      "loss": 1.3974,
       "step": 60000
     },
     {
-      "epoch": 0.7867871773197217,
-      "grad_norm": 0.427734375,
-      "learning_rate": 0.0019685285129072113,
-      "loss": 1.6278,
+      "epoch": 6.294215563878486,
+      "grad_norm": 3.578125,
+      "learning_rate": 0.0017482313774448606,
+      "loss": 1.397,
       "step": 60500
     },
     {
-      "epoch": 0.7932895506860004,
-      "grad_norm": 0.88671875,
-      "learning_rate": 0.0019682684179725602,
-      "loss": 1.632,
+      "epoch": 6.346233874323762,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0017461506450270495,
+      "loss": 1.3971,
       "step": 61000
     },
     {
-      "epoch": 0.7997919240522791,
-      "grad_norm": 0.65625,
-      "learning_rate": 0.0019680083230379087,
-      "loss": 1.6304,
+      "epoch": 6.398252184769039,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.0017440699126092386,
+      "loss": 1.3955,
       "step": 61500
     },
     {
-      "epoch": 0.8062942974185577,
-      "grad_norm": 0.6171875,
-      "learning_rate": 0.0019677482281032577,
-      "loss": 1.6262,
+      "epoch": 6.4502704952143155,
+      "grad_norm": 0.24609375,
+      "learning_rate": 0.0017419891801914273,
+      "loss": 1.3959,
       "step": 62000
     },
     {
-      "epoch": 0.8127966707848364,
-      "grad_norm": 0.5859375,
-      "learning_rate": 0.0019674881331686067,
-      "loss": 1.6304,
+      "epoch": 6.502288805659592,
+      "grad_norm": 0.390625,
+      "learning_rate": 0.0017399084477736164,
+      "loss": 1.3951,
       "step": 62500
     },
     {
-      "epoch": 0.8192990441511152,
-      "grad_norm": 1.1640625,
-      "learning_rate": 0.001967228038233955,
-      "loss": 1.6413,
+      "epoch": 6.554307116104869,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0017378277153558054,
+      "loss": 1.3977,
       "step": 63000
     },
     {
-      "epoch": 0.8258014175173939,
-      "grad_norm": 1.4921875,
-      "learning_rate": 0.0019669679432993046,
-      "loss": 1.6264,
+      "epoch": 6.606325426550145,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.001735746982937994,
+      "loss": 1.3976,
       "step": 63500
     },
     {
-      "epoch": 0.8323037908836726,
-      "grad_norm": 0.625,
-      "learning_rate": 0.001966707848364653,
-      "loss": 1.6243,
+      "epoch": 6.658343736995422,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0017336662505201832,
+      "loss": 1.3964,
       "step": 64000
     },
     {
-      "epoch": 0.8388061642499512,
-      "grad_norm": 0.75,
-      "learning_rate": 0.001966447753430002,
-      "loss": 1.6298,
+      "epoch": 6.710362047440699,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.001731585518102372,
+      "loss": 1.396,
       "step": 64500
     },
     {
-      "epoch": 0.8453085376162299,
-      "grad_norm": 0.7265625,
-      "learning_rate": 0.001966187658495351,
-      "loss": 1.6207,
+      "epoch": 6.762380357885976,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.001729504785684561,
+      "loss": 1.3931,
       "step": 65000
     },
     {
-      "epoch": 0.8518109109825086,
-      "grad_norm": 0.515625,
-      "learning_rate": 0.0019659275635606995,
-      "loss": 1.6121,
+      "epoch": 6.814398668331252,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.00172742405326675,
+      "loss": 1.3931,
       "step": 65500
     },
     {
-      "epoch": 0.8583132843487873,
-      "grad_norm": 0.48046875,
-      "learning_rate": 0.0019656674686260485,
-      "loss": 1.6175,
+      "epoch": 6.866416978776529,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0017253433208489388,
+      "loss": 1.388,
       "step": 66000
     },
     {
-      "epoch": 0.864815657715066,
-      "grad_norm": 0.578125,
-      "learning_rate": 0.0019654073736913974,
-      "loss": 1.6142,
+      "epoch": 6.918435289221806,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0017232625884311278,
+      "loss": 1.3862,
       "step": 66500
     },
     {
-      "epoch": 0.8713180310813446,
-      "grad_norm": 1.03125,
-      "learning_rate": 0.001965147278756746,
-      "loss": 1.614,
+      "epoch": 6.970453599667083,
+      "grad_norm": 0.3984375,
+      "learning_rate": 0.0017211818560133169,
+      "loss": 1.3865,
       "step": 67000
     },
     {
-      "epoch": 0.8778204044476234,
-      "grad_norm": 0.6640625,
-      "learning_rate": 0.0019648871838220954,
-      "loss": 1.6112,
+      "epoch": 7.0,
+      "eval_loss": 1.3738893270492554,
+      "eval_runtime": 1.4206,
+      "eval_samples_per_second": 703.936,
+      "eval_steps_per_second": 0.704,
+      "step": 67284
+    },
+    {
+      "epoch": 7.022471910112359,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0017191011235955056,
+      "loss": 1.385,
       "step": 67500
     },
     {
-      "epoch": 0.8843227778139021,
-      "grad_norm": 0.62109375,
-      "learning_rate": 0.001964627088887444,
-      "loss": 1.6082,
+      "epoch": 7.074490220557636,
+      "grad_norm": 0.39453125,
+      "learning_rate": 0.0017170203911776945,
+      "loss": 1.3827,
       "step": 68000
     },
     {
-      "epoch": 0.8908251511801808,
-      "grad_norm": 0.93359375,
-      "learning_rate": 0.001964366993952793,
-      "loss": 1.6121,
+      "epoch": 7.126508531002913,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.0017149396587598836,
+      "loss": 1.3864,
       "step": 68500
     },
     {
-      "epoch": 0.8973275245464595,
-      "grad_norm": 0.84375,
-      "learning_rate": 0.001964106899018142,
-      "loss": 1.6084,
+      "epoch": 7.17852684144819,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0017128589263420723,
+      "loss": 1.3874,
       "step": 69000
     },
     {
-      "epoch": 0.9038298979127382,
-      "grad_norm": 0.50390625,
-      "learning_rate": 0.0019638468040834903,
-      "loss": 1.603,
+      "epoch": 7.230545151893467,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0017107781939242615,
+      "loss": 1.3913,
       "step": 69500
     },
     {
-      "epoch": 0.9103322712790168,
-      "grad_norm": 0.486328125,
-      "learning_rate": 0.0019635867091488393,
-      "loss": 1.6033,
+      "epoch": 7.282563462338743,
+      "grad_norm": 0.69921875,
+      "learning_rate": 0.0017086974615064504,
+      "loss": 1.3892,
       "step": 70000
     },
     {
-      "epoch": 0.9168346446452955,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.0019633266142141882,
-      "loss": 1.5994,
+      "epoch": 7.33458177278402,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0017066167290886393,
+      "loss": 1.3888,
       "step": 70500
     },
     {
-      "epoch": 0.9233370180115742,
-      "grad_norm": 0.66796875,
-      "learning_rate": 0.001963066519279537,
-      "loss": 1.5958,
+      "epoch": 7.386600083229297,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0017045359966708282,
+      "loss": 1.3908,
       "step": 71000
     },
     {
-      "epoch": 0.9298393913778529,
-      "grad_norm": 0.5625,
-      "learning_rate": 0.001962806424344886,
-      "loss": 1.5963,
+      "epoch": 7.438618393674574,
+      "grad_norm": 0.419921875,
+      "learning_rate": 0.001702455264253017,
+      "loss": 1.3879,
       "step": 71500
     },
     {
-      "epoch": 0.9363417647441316,
-      "grad_norm": 0.5,
-      "learning_rate": 0.0019625463294102347,
-      "loss": 1.5962,
+      "epoch": 7.49063670411985,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.001700374531835206,
+      "loss": 1.3873,
       "step": 72000
     },
     {
-      "epoch": 0.9428441381104103,
-      "grad_norm": 0.8046875,
-      "learning_rate": 0.0019622862344755836,
-      "loss": 1.5861,
+      "epoch": 7.542655014565127,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.001698293799417395,
+      "loss": 1.3845,
       "step": 72500
     },
     {
-      "epoch": 0.949346511476689,
-      "grad_norm": 1.140625,
-      "learning_rate": 0.0019620261395409326,
-      "loss": 1.5928,
+      "epoch": 7.594673325010404,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0016962130669995838,
+      "loss": 1.3832,
       "step": 73000
     },
     {
-      "epoch": 0.9558488848429677,
-      "grad_norm": 0.5546875,
-      "learning_rate": 0.001961766044606281,
-      "loss": 1.592,
+      "epoch": 7.646691635455681,
+      "grad_norm": 0.55078125,
+      "learning_rate": 0.0016941323345817728,
+      "loss": 1.38,
       "step": 73500
     },
     {
-      "epoch": 0.9623512582092464,
-      "grad_norm": 0.46875,
-      "learning_rate": 0.00196150594967163,
-      "loss": 1.5859,
+      "epoch": 7.698709945900957,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.001692051602163962,
+      "loss": 1.3798,
       "step": 74000
     },
     {
-      "epoch": 0.9688536315755251,
-      "grad_norm": 0.59765625,
-      "learning_rate": 0.001961245854736979,
-      "loss": 1.5834,
+      "epoch": 7.750728256346234,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0016899708697461506,
+      "loss": 1.3775,
       "step": 74500
     },
     {
-      "epoch": 0.9753560049418037,
+      "epoch": 7.802746566791511,
       "grad_norm": 1.015625,
-      "learning_rate": 0.001960985759802328,
-      "loss": 1.5893,
+      "learning_rate": 0.0016878901373283395,
+      "loss": 1.3797,
       "step": 75000
     },
     {
-      "epoch": 0.9818583783080824,
-      "grad_norm": 0.9609375,
-      "learning_rate": 0.001960725664867677,
-      "loss": 1.5846,
+      "epoch": 7.8547648772367875,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0016858094049105286,
+      "loss": 1.3813,
       "step": 75500
     },
     {
-      "epoch": 0.9883607516743611,
-      "grad_norm": 0.45703125,
-      "learning_rate": 0.0019604655699330255,
-      "loss": 1.5838,
+      "epoch": 7.9067831876820645,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0016837286724927173,
+      "loss": 1.3779,
       "step": 76000
     },
     {
-      "epoch": 0.9948631250406398,
-      "grad_norm": 0.5390625,
-      "learning_rate": 0.0019602054749983744,
-      "loss": 1.5809,
+      "epoch": 7.9588014981273405,
+      "grad_norm": 0.6875,
+      "learning_rate": 0.0016816479400749065,
+      "loss": 1.3771,
       "step": 76500
     },
     {
-      "epoch": 1.0,
-      "eval_loss": 1.5596588850021362,
-      "eval_runtime": 0.8816,
-      "eval_samples_per_second": 1134.308,
-      "eval_steps_per_second": 9.074,
-      "step": 76895
+      "epoch": 8.0,
+      "eval_loss": 1.3700777292251587,
+      "eval_runtime": 1.4197,
+      "eval_samples_per_second": 704.38,
+      "eval_steps_per_second": 0.704,
+      "step": 76896
     },
     {
-      "epoch": 1.0013654984069185,
-      "grad_norm": 0.50390625,
-      "learning_rate": 0.0019599453800637234,
-      "loss": 1.5838,
+      "epoch": 8.010819808572618,
+      "grad_norm": 0.384765625,
+      "learning_rate": 0.0016795672076570954,
+      "loss": 1.3753,
       "step": 77000
     },
     {
-      "epoch": 1.0078678717731973,
-      "grad_norm": 0.47265625,
-      "learning_rate": 0.001959685285129072,
-      "loss": 1.5829,
+      "epoch": 8.062838119017893,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0016774864752392843,
+      "loss": 1.3763,
       "step": 77500
     },
     {
-      "epoch": 1.0143702451394758,
-      "grad_norm": 0.5078125,
-      "learning_rate": 0.0019594251901944213,
-      "loss": 1.5812,
+      "epoch": 8.11485642946317,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0016754057428214732,
+      "loss": 1.3761,
       "step": 78000
     },
     {
-      "epoch": 1.0208726185057546,
-      "grad_norm": 0.51171875,
-      "learning_rate": 0.00195916509525977,
-      "loss": 1.5797,
+      "epoch": 8.166874739908447,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0016733250104036621,
+      "loss": 1.3786,
       "step": 78500
     },
     {
-      "epoch": 1.0273749918720332,
-      "grad_norm": 0.66015625,
-      "learning_rate": 0.0019589050003251188,
-      "loss": 1.5785,
+      "epoch": 8.218893050353724,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.001671244277985851,
+      "loss": 1.3786,
       "step": 79000
     },
     {
-      "epoch": 1.033877365238312,
-      "grad_norm": 0.515625,
-      "learning_rate": 0.0019586449053904677,
-      "loss": 1.5752,
+      "epoch": 8.270911360799001,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0016691635455680402,
+      "loss": 1.3797,
       "step": 79500
     },
     {
-      "epoch": 1.0403797386045908,
-      "grad_norm": 0.90625,
-      "learning_rate": 0.0019583848104558163,
-      "loss": 1.582,
+      "epoch": 8.322929671244278,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0016670828131502289,
+      "loss": 1.3791,
       "step": 80000
     },
     {
-      "epoch": 1.0468821119708693,
-      "grad_norm": 0.60546875,
-      "learning_rate": 0.001958124715521165,
-      "loss": 1.5762,
+      "epoch": 8.374947981689555,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0016650020807324178,
+      "loss": 1.3783,
       "step": 80500
     },
     {
-      "epoch": 1.0533844853371481,
-      "grad_norm": 0.8046875,
-      "learning_rate": 0.001957864620586514,
-      "loss": 1.577,
+      "epoch": 8.426966292134832,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.001662921348314607,
+      "loss": 1.376,
       "step": 81000
     },
     {
-      "epoch": 1.0598868587034267,
-      "grad_norm": 0.76171875,
-      "learning_rate": 0.0019576045256518627,
-      "loss": 1.5741,
+      "epoch": 8.478984602580109,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.0016608406158967956,
+      "loss": 1.3763,
       "step": 81500
     },
     {
-      "epoch": 1.0663892320697055,
-      "grad_norm": 0.59375,
-      "learning_rate": 0.001957344430717212,
-      "loss": 1.574,
+      "epoch": 8.531002913025384,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.0016587598834789845,
+      "loss": 1.3757,
       "step": 82000
     },
     {
-      "epoch": 1.072891605435984,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.0019570843357825606,
-      "loss": 1.5744,
+      "epoch": 8.583021223470661,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.0016566791510611736,
+      "loss": 1.377,
       "step": 82500
     },
     {
-      "epoch": 1.0793939788022628,
-      "grad_norm": 0.5703125,
-      "learning_rate": 0.0019568242408479096,
-      "loss": 1.5745,
+      "epoch": 8.635039533915938,
+      "grad_norm": 0.55859375,
+      "learning_rate": 0.0016545984186433626,
+      "loss": 1.3755,
       "step": 83000
     },
     {
-      "epoch": 1.0858963521685414,
-      "grad_norm": 0.5,
-      "learning_rate": 0.0019565641459132585,
-      "loss": 1.5758,
+      "epoch": 8.687057844361215,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0016525176862255513,
+      "loss": 1.3739,
       "step": 83500
     },
     {
-      "epoch": 1.0923987255348202,
-      "grad_norm": 0.75,
-      "learning_rate": 0.001956304050978607,
-      "loss": 1.5698,
+      "epoch": 8.739076154806492,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0016504369538077404,
+      "loss": 1.3744,
       "step": 84000
     },
     {
-      "epoch": 1.098901098901099,
-      "grad_norm": 0.447265625,
-      "learning_rate": 0.001956043956043956,
-      "loss": 1.576,
+      "epoch": 8.791094465251769,
+      "grad_norm": 0.36328125,
+      "learning_rate": 0.0016483562213899293,
+      "loss": 1.3733,
       "step": 84500
     },
     {
-      "epoch": 1.1054034722673776,
-      "grad_norm": 0.5703125,
-      "learning_rate": 0.001955783861109305,
-      "loss": 1.5778,
+      "epoch": 8.843112775697046,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0016462754889721182,
+      "loss": 1.3735,
       "step": 85000
     },
     {
-      "epoch": 1.1119058456336564,
+      "epoch": 8.895131086142323,
       "grad_norm": 0.478515625,
-      "learning_rate": 0.001955523766174654,
-      "loss": 1.5739,
+      "learning_rate": 0.0016441947565543071,
+      "loss": 1.3754,
       "step": 85500
     },
     {
-      "epoch": 1.118408218999935,
-      "grad_norm": 0.61328125,
-      "learning_rate": 0.001955263671240003,
-      "loss": 1.5778,
+      "epoch": 8.947149396587598,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.001642114024136496,
+      "loss": 1.3749,
       "step": 86000
     },
     {
-      "epoch": 1.1249105923662137,
-      "grad_norm": 0.5859375,
-      "learning_rate": 0.0019550035763053514,
-      "loss": 1.5713,
+      "epoch": 8.999167707032875,
+      "grad_norm": 0.5390625,
+      "learning_rate": 0.0016400332917186852,
+      "loss": 1.3739,
       "step": 86500
     },
     {
-      "epoch": 1.1314129657324923,
-      "grad_norm": 0.640625,
-      "learning_rate": 0.0019547434813707004,
-      "loss": 1.5767,
+      "epoch": 9.0,
+      "eval_loss": 1.3678644895553589,
+      "eval_runtime": 1.4359,
+      "eval_samples_per_second": 696.406,
+      "eval_steps_per_second": 0.696,
+      "step": 86508
+    },
+    {
+      "epoch": 9.051186017478152,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0016379525593008739,
+      "loss": 1.3717,
       "step": 87000
     },
     {
-      "epoch": 1.137915339098771,
-      "grad_norm": 0.9296875,
-      "learning_rate": 0.0019544833864360493,
-      "loss": 1.5732,
+      "epoch": 9.103204327923429,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0016358718268830628,
+      "loss": 1.3705,
       "step": 87500
     },
     {
-      "epoch": 1.1444177124650499,
-      "grad_norm": 0.6953125,
-      "learning_rate": 0.001954223291501398,
-      "loss": 1.5688,
+      "epoch": 9.155222638368706,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.001633791094465252,
+      "loss": 1.3719,
       "step": 88000
     },
     {
-      "epoch": 1.1509200858313284,
-      "grad_norm": 0.69921875,
-      "learning_rate": 0.001953963196566747,
-      "loss": 1.5739,
+      "epoch": 9.207240948813983,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0016317103620474408,
+      "loss": 1.3727,
       "step": 88500
     },
     {
-      "epoch": 1.1574224591976072,
-      "grad_norm": 0.625,
-      "learning_rate": 0.0019537031016320957,
-      "loss": 1.5733,
+      "epoch": 9.25925925925926,
+      "grad_norm": 5.125,
+      "learning_rate": 0.0016296296296296295,
+      "loss": 1.3737,
       "step": 89000
     },
     {
-      "epoch": 1.1639248325638858,
-      "grad_norm": 0.73828125,
-      "learning_rate": 0.0019534430066974447,
-      "loss": 1.576,
+      "epoch": 9.311277569704536,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0016275488972118187,
+      "loss": 1.3707,
       "step": 89500
     },
     {
-      "epoch": 1.1704272059301646,
-      "grad_norm": 0.61328125,
-      "learning_rate": 0.0019531829117627937,
-      "loss": 1.5668,
+      "epoch": 9.363295880149813,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0016254681647940076,
+      "loss": 1.3694,
       "step": 90000
     },
     {
-      "epoch": 1.1769295792964432,
-      "grad_norm": 1.109375,
-      "learning_rate": 0.0019529228168281424,
-      "loss": 1.5677,
+      "epoch": 9.41531419059509,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0016233874323761963,
+      "loss": 1.3687,
       "step": 90500
     },
     {
-      "epoch": 1.183431952662722,
-      "grad_norm": 2.03125,
-      "learning_rate": 0.0019526627218934911,
-      "loss": 1.5671,
+      "epoch": 9.467332501040365,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.0016213066999583854,
+      "loss": 1.368,
       "step": 91000
     },
     {
-      "epoch": 1.1899343260290005,
-      "grad_norm": 0.5234375,
-      "learning_rate": 0.0019524026269588399,
-      "loss": 1.5649,
+      "epoch": 9.519350811485642,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0016192259675405743,
+      "loss": 1.3716,
       "step": 91500
     },
     {
-      "epoch": 1.1964366993952793,
-      "grad_norm": 0.56640625,
-      "learning_rate": 0.0019521425320241888,
-      "loss": 1.5645,
+      "epoch": 9.57136912193092,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0016171452351227634,
+      "loss": 1.3697,
       "step": 92000
     },
     {
-      "epoch": 1.2029390727615579,
-      "grad_norm": 0.5546875,
-      "learning_rate": 0.0019518824370895378,
-      "loss": 1.5646,
+      "epoch": 9.623387432376196,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0016150645027049521,
+      "loss": 1.3692,
       "step": 92500
     },
     {
-      "epoch": 1.2094414461278367,
-      "grad_norm": 2.421875,
-      "learning_rate": 0.0019516223421548868,
-      "loss": 1.5711,
+      "epoch": 9.675405742821473,
+      "grad_norm": 0.470703125,
+      "learning_rate": 0.001612983770287141,
+      "loss": 1.3682,
       "step": 93000
     },
     {
-      "epoch": 1.2159438194941155,
-      "grad_norm": 10.25,
-      "learning_rate": 0.0019513622472202355,
-      "loss": 1.5674,
+      "epoch": 9.72742405326675,
+      "grad_norm": 0.41015625,
+      "learning_rate": 0.0016109030378693302,
+      "loss": 1.3673,
       "step": 93500
     },
     {
-      "epoch": 1.222446192860394,
-      "grad_norm": 1.2421875,
-      "learning_rate": 0.0019511021522855842,
-      "loss": 1.5693,
+      "epoch": 9.779442363712027,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0016088223054515189,
+      "loss": 1.3663,
       "step": 94000
     },
     {
-      "epoch": 1.2289485662266728,
-      "grad_norm": 0.50390625,
-      "learning_rate": 0.0019508420573509332,
-      "loss": 1.5683,
+      "epoch": 9.831460674157304,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0016067415730337078,
+      "loss": 1.3662,
       "step": 94500
     },
     {
-      "epoch": 1.2354509395929514,
-      "grad_norm": 0.9765625,
-      "learning_rate": 0.001950581962416282,
-      "loss": 1.5786,
+      "epoch": 9.88347898460258,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.001604660840615897,
+      "loss": 1.37,
       "step": 95000
     },
     {
-      "epoch": 1.2419533129592302,
-      "grad_norm": 0.6796875,
-      "learning_rate": 0.0019503218674816307,
-      "loss": 1.5745,
+      "epoch": 9.935497295047856,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0016025801081980858,
+      "loss": 1.372,
       "step": 95500
     },
     {
-      "epoch": 1.2484556863255087,
-      "grad_norm": 0.49609375,
-      "learning_rate": 0.0019500617725469796,
-      "loss": 1.5704,
+      "epoch": 9.987515605493133,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0016004993757802745,
+      "loss": 1.3699,
       "step": 96000
     },
     {
-      "epoch": 1.2549580596917875,
-      "grad_norm": 0.56640625,
-      "learning_rate": 0.0019498016776123286,
-      "loss": 1.5753,
+      "epoch": 10.0,
+      "eval_loss": 1.3671537637710571,
+      "eval_runtime": 2.0428,
+      "eval_samples_per_second": 489.53,
+      "eval_steps_per_second": 0.49,
+      "step": 96120
+    },
+    {
+      "epoch": 10.03953391593841,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0015984186433624637,
+      "loss": 1.3686,
       "step": 96500
     },
     {
-      "epoch": 1.2614604330580663,
-      "grad_norm": 1.2265625,
-      "learning_rate": 0.0019495415826776775,
-      "loss": 1.5765,
+      "epoch": 10.091552226383687,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0015963379109446526,
+      "loss": 1.3682,
       "step": 97000
     },
     {
-      "epoch": 1.267962806424345,
-      "grad_norm": 0.63671875,
-      "learning_rate": 0.0019492814877430263,
-      "loss": 1.5718,
+      "epoch": 10.143570536828964,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0015942571785268413,
+      "loss": 1.3659,
       "step": 97500
     },
     {
-      "epoch": 1.2744651797906235,
-      "grad_norm": 0.8515625,
-      "learning_rate": 0.001949021392808375,
-      "loss": 1.5716,
+      "epoch": 10.19558884727424,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0015921764461090304,
+      "loss": 1.3653,
       "step": 98000
     },
     {
-      "epoch": 1.2809675531569022,
-      "grad_norm": 0.67578125,
-      "learning_rate": 0.001948761297873724,
-      "loss": 1.5655,
+      "epoch": 10.247607157719518,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0015900957136912193,
+      "loss": 1.3668,
       "step": 98500
     },
     {
-      "epoch": 1.287469926523181,
-      "grad_norm": 1.0390625,
-      "learning_rate": 0.0019485012029390727,
-      "loss": 1.5646,
+      "epoch": 10.299625468164795,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0015880149812734085,
+      "loss": 1.3695,
       "step": 99000
     },
     {
-      "epoch": 1.2939722998894596,
-      "grad_norm": 0.52734375,
-      "learning_rate": 0.0019482411080044215,
-      "loss": 1.57,
+      "epoch": 10.35164377861007,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0015859342488555972,
+      "loss": 1.3673,
       "step": 99500
     },
     {
-      "epoch": 1.3004746732557384,
-      "grad_norm": 0.62109375,
-      "learning_rate": 0.0019479810130697706,
-      "loss": 1.5607,
+      "epoch": 10.403662089055347,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.001583853516437786,
+      "loss": 1.3669,
       "step": 100000
     },
     {
-      "epoch": 1.306977046622017,
-      "grad_norm": 0.85546875,
-      "learning_rate": 0.0019477209181351194,
-      "loss": 1.5576,
+      "epoch": 10.455680399500624,
+      "grad_norm": 0.3359375,
+      "learning_rate": 0.0015817727840199752,
+      "loss": 1.367,
       "step": 100500
     },
     {
-      "epoch": 1.3134794199882958,
-      "grad_norm": 0.63671875,
-      "learning_rate": 0.0019474608232004683,
-      "loss": 1.5718,
+      "epoch": 10.5076987099459,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0015796920516021641,
+      "loss": 1.3668,
       "step": 101000
     },
     {
-      "epoch": 1.3199817933545743,
-      "grad_norm": 0.72265625,
-      "learning_rate": 0.001947200728265817,
-      "loss": 1.562,
+      "epoch": 10.559717020391178,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0015776113191843528,
+      "loss": 1.3654,
       "step": 101500
     },
     {
-      "epoch": 1.3264841667208531,
-      "grad_norm": 0.55078125,
-      "learning_rate": 0.0019469406333311658,
-      "loss": 1.5603,
+      "epoch": 10.611735330836455,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.001575530586766542,
+      "loss": 1.3655,
       "step": 102000
     },
     {
-      "epoch": 1.332986540087132,
-      "grad_norm": 0.546875,
-      "learning_rate": 0.0019466805383965148,
-      "loss": 1.5631,
+      "epoch": 10.663753641281732,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0015734498543487309,
+      "loss": 1.3673,
       "step": 102500
     },
     {
-      "epoch": 1.3394889134534105,
-      "grad_norm": 0.51953125,
-      "learning_rate": 0.0019464204434618635,
-      "loss": 1.5616,
+      "epoch": 10.715771951727008,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0015713691219309195,
+      "loss": 1.3651,
       "step": 103000
     },
     {
-      "epoch": 1.345991286819689,
-      "grad_norm": 0.69921875,
-      "learning_rate": 0.0019461603485272127,
-      "loss": 1.5611,
+      "epoch": 10.767790262172285,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0015692883895131087,
+      "loss": 1.3652,
       "step": 103500
     },
     {
-      "epoch": 1.3524936601859678,
-      "grad_norm": 0.5,
-      "learning_rate": 0.0019459002535925614,
-      "loss": 1.5634,
+      "epoch": 10.81980857261756,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0015672076570952976,
+      "loss": 1.366,
       "step": 104000
     },
     {
-      "epoch": 1.3589960335522466,
-      "grad_norm": 0.609375,
-      "learning_rate": 0.0019456401586579102,
-      "loss": 1.5648,
+      "epoch": 10.871826883062838,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.0015651269246774865,
+      "loss": 1.3624,
       "step": 104500
     },
     {
-      "epoch": 1.3654984069185252,
-      "grad_norm": 0.5078125,
-      "learning_rate": 0.0019453800637232591,
-      "loss": 1.5611,
+      "epoch": 10.923845193508114,
+      "grad_norm": 0.94921875,
+      "learning_rate": 0.0015630461922596754,
+      "loss": 1.3605,
       "step": 105000
     },
     {
-      "epoch": 1.372000780284804,
-      "grad_norm": 0.72265625,
-      "learning_rate": 0.0019451199687886079,
-      "loss": 1.5615,
+      "epoch": 10.975863503953391,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0015609654598418643,
+      "loss": 1.36,
       "step": 105500
     },
     {
-      "epoch": 1.3785031536510828,
-      "grad_norm": 0.78515625,
-      "learning_rate": 0.0019448598738539566,
-      "loss": 1.5628,
+      "epoch": 11.0,
+      "eval_loss": 1.3506468534469604,
+      "eval_runtime": 1.4359,
+      "eval_samples_per_second": 696.43,
+      "eval_steps_per_second": 0.696,
+      "step": 105732
+    },
+    {
+      "epoch": 11.027881814398668,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0015588847274240535,
+      "loss": 1.3598,
       "step": 106000
     },
     {
-      "epoch": 1.3850055270173613,
-      "grad_norm": 0.63671875,
-      "learning_rate": 0.0019445997789193056,
-      "loss": 1.5608,
+      "epoch": 11.079900124843945,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0015568039950062422,
+      "loss": 1.3614,
       "step": 106500
     },
     {
-      "epoch": 1.39150790038364,
-      "grad_norm": 0.65625,
-      "learning_rate": 0.0019443396839846543,
-      "loss": 1.5609,
+      "epoch": 11.131918435289222,
+      "grad_norm": 0.35546875,
+      "learning_rate": 0.001554723262588431,
+      "loss": 1.3587,
       "step": 107000
     },
     {
-      "epoch": 1.3980102737499187,
-      "grad_norm": 0.6640625,
-      "learning_rate": 0.0019440795890500035,
-      "loss": 1.5579,
+      "epoch": 11.1839367457345,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0015526425301706202,
+      "loss": 1.3585,
       "step": 107500
     },
     {
-      "epoch": 1.4045126471161975,
-      "grad_norm": 0.46484375,
-      "learning_rate": 0.0019438194941153522,
-      "loss": 1.5526,
+      "epoch": 11.235955056179776,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0015505617977528091,
+      "loss": 1.3577,
       "step": 108000
     },
     {
-      "epoch": 1.411015020482476,
-      "grad_norm": 0.62890625,
-      "learning_rate": 0.001943559399180701,
-      "loss": 1.554,
+      "epoch": 11.287973366625051,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0015484810653349978,
+      "loss": 1.3581,
       "step": 108500
     },
     {
-      "epoch": 1.4175173938487549,
-      "grad_norm": 1.4453125,
-      "learning_rate": 0.00194329930424605,
-      "loss": 1.5554,
+      "epoch": 11.339991677070328,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.001546400332917187,
+      "loss": 1.3574,
       "step": 109000
     },
     {
-      "epoch": 1.4240197672150334,
-      "grad_norm": 0.9375,
-      "learning_rate": 0.0019430392093113986,
-      "loss": 1.5518,
+      "epoch": 11.392009987515605,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0015443196004993759,
+      "loss": 1.3586,
       "step": 109500
     },
     {
-      "epoch": 1.4305221405813122,
-      "grad_norm": 0.53125,
-      "learning_rate": 0.0019427791143767474,
-      "loss": 1.5562,
+      "epoch": 11.444028297960882,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0015422388680815646,
+      "loss": 1.3582,
       "step": 110000
     },
     {
-      "epoch": 1.4370245139475908,
-      "grad_norm": 0.46875,
-      "learning_rate": 0.0019425190194420963,
-      "loss": 1.5568,
+      "epoch": 11.496046608406159,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0015401581356637537,
+      "loss": 1.3573,
       "step": 110500
     },
     {
-      "epoch": 1.4435268873138696,
-      "grad_norm": 0.53125,
-      "learning_rate": 0.0019422589245074453,
-      "loss": 1.5476,
+      "epoch": 11.548064918851436,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0015380774032459426,
+      "loss": 1.3592,
       "step": 111000
     },
     {
-      "epoch": 1.4500292606801484,
-      "grad_norm": 0.51171875,
-      "learning_rate": 0.0019419988295727943,
-      "loss": 1.558,
+      "epoch": 11.600083229296713,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0015359966708281315,
+      "loss": 1.3558,
       "step": 111500
     },
     {
-      "epoch": 1.456531634046427,
-      "grad_norm": 0.455078125,
-      "learning_rate": 0.001941738734638143,
-      "loss": 1.5478,
+      "epoch": 11.65210153974199,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0015339159384103204,
+      "loss": 1.3578,
       "step": 112000
     },
     {
-      "epoch": 1.4630340074127055,
-      "grad_norm": 0.66796875,
-      "learning_rate": 0.0019414786397034917,
-      "loss": 1.553,
+      "epoch": 11.704119850187267,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0015318352059925093,
+      "loss": 1.3562,
       "step": 112500
     },
     {
-      "epoch": 1.4695363807789843,
-      "grad_norm": 1.5390625,
-      "learning_rate": 0.0019412185447688407,
-      "loss": 1.5621,
+      "epoch": 11.756138160632542,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0015297544735746985,
+      "loss": 1.3575,
       "step": 113000
     },
     {
-      "epoch": 1.476038754145263,
-      "grad_norm": 0.76953125,
-      "learning_rate": 0.0019409584498341894,
-      "loss": 1.5583,
+      "epoch": 11.808156471077819,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0015276737411568874,
+      "loss": 1.3568,
       "step": 113500
     },
     {
-      "epoch": 1.4825411275115417,
-      "grad_norm": 0.6328125,
-      "learning_rate": 0.0019406983548995382,
-      "loss": 1.5567,
+      "epoch": 11.860174781523096,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.001525593008739076,
+      "loss": 1.3611,
       "step": 114000
     },
     {
-      "epoch": 1.4890435008778204,
-      "grad_norm": 0.8828125,
-      "learning_rate": 0.0019404382599648873,
-      "loss": 1.5509,
+      "epoch": 11.912193091968373,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0015235122763212652,
+      "loss": 1.3598,
       "step": 114500
     },
     {
-      "epoch": 1.495545874244099,
-      "grad_norm": 0.8671875,
-      "learning_rate": 0.001940178165030236,
-      "loss": 1.5542,
+      "epoch": 11.96421140241365,
+      "grad_norm": 4.28125,
+      "learning_rate": 0.0015214315439034541,
+      "loss": 1.3598,
       "step": 115000
     },
     {
-      "epoch": 1.5020482476103778,
-      "grad_norm": 0.6796875,
-      "learning_rate": 0.001939918070095585,
-      "loss": 1.554,
+      "epoch": 12.0,
+      "eval_loss": 1.3485276699066162,
+      "eval_runtime": 1.4101,
+      "eval_samples_per_second": 709.157,
+      "eval_steps_per_second": 0.709,
+      "step": 115344
+    },
+    {
+      "epoch": 12.016229712858927,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0015193508114856428,
+      "loss": 1.3563,
       "step": 115500
     },
     {
-      "epoch": 1.5085506209766564,
-      "grad_norm": 0.875,
-      "learning_rate": 0.0019396579751609338,
-      "loss": 1.5505,
+      "epoch": 12.068248023304204,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.001517270079067832,
+      "loss": 1.3552,
       "step": 116000
     },
     {
-      "epoch": 1.5150529943429352,
-      "grad_norm": 0.50390625,
-      "learning_rate": 0.0019393978802262825,
-      "loss": 1.5542,
+      "epoch": 12.12026633374948,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0015151893466500209,
+      "loss": 1.3552,
       "step": 116500
     },
     {
-      "epoch": 1.521555367709214,
-      "grad_norm": 0.53125,
-      "learning_rate": 0.0019391377852916315,
-      "loss": 1.5487,
+      "epoch": 12.172284644194757,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0015131086142322098,
+      "loss": 1.3543,
       "step": 117000
     },
     {
-      "epoch": 1.5280577410754925,
-      "grad_norm": 1.171875,
-      "learning_rate": 0.0019388776903569802,
-      "loss": 1.5535,
+      "epoch": 12.224302954640033,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.0015110278818143987,
+      "loss": 1.3534,
       "step": 117500
     },
     {
-      "epoch": 1.534560114441771,
-      "grad_norm": 0.66015625,
-      "learning_rate": 0.0019386175954223294,
-      "loss": 1.5524,
+      "epoch": 12.27632126508531,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0015089471493965876,
+      "loss": 1.3541,
       "step": 118000
     },
     {
-      "epoch": 1.5410624878080499,
-      "grad_norm": 0.54296875,
-      "learning_rate": 0.0019383575004876781,
-      "loss": 1.548,
+      "epoch": 12.328339575530586,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0015068664169787765,
+      "loss": 1.3542,
       "step": 118500
     },
     {
-      "epoch": 1.5475648611743287,
-      "grad_norm": 0.50390625,
-      "learning_rate": 0.0019380974055530269,
-      "loss": 1.5434,
+      "epoch": 12.380357885975863,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0015047856845609654,
+      "loss": 1.3561,
       "step": 119000
     },
     {
-      "epoch": 1.5540672345406072,
-      "grad_norm": 0.88671875,
-      "learning_rate": 0.0019378373106183758,
-      "loss": 1.5461,
+      "epoch": 12.43237619642114,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0015027049521431544,
+      "loss": 1.3544,
       "step": 119500
     },
     {
-      "epoch": 1.560569607906886,
-      "grad_norm": 0.5,
-      "learning_rate": 0.0019375772156837246,
-      "loss": 1.5469,
+      "epoch": 12.484394506866417,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0015006242197253433,
+      "loss": 1.3561,
       "step": 120000
     },
     {
-      "epoch": 1.5670719812731648,
-      "grad_norm": 0.455078125,
-      "learning_rate": 0.0019373171207490733,
-      "loss": 1.5484,
+      "epoch": 12.536412817311694,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0014985434873075324,
+      "loss": 1.3533,
       "step": 120500
     },
     {
-      "epoch": 1.5735743546394434,
-      "grad_norm": 0.95703125,
-      "learning_rate": 0.0019370570258144223,
-      "loss": 1.5412,
+      "epoch": 12.588431127756971,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.001496462754889721,
+      "loss": 1.3546,
       "step": 121000
     },
     {
-      "epoch": 1.580076728005722,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.001936796930879771,
-      "loss": 1.5426,
+      "epoch": 12.640449438202246,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0014943820224719102,
+      "loss": 1.3548,
       "step": 121500
     },
     {
-      "epoch": 1.5865791013720008,
-      "grad_norm": 0.89453125,
-      "learning_rate": 0.0019365368359451202,
-      "loss": 1.5482,
+      "epoch": 12.692467748647523,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0014923012900540991,
+      "loss": 1.3531,
       "step": 122000
     },
     {
-      "epoch": 1.5930814747382795,
-      "grad_norm": 0.83203125,
-      "learning_rate": 0.001936276741010469,
-      "loss": 1.5487,
+      "epoch": 12.7444860590928,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0014902205576362878,
+      "loss": 1.3518,
       "step": 122500
     },
     {
-      "epoch": 1.5995838481045581,
-      "grad_norm": 0.68359375,
-      "learning_rate": 0.0019360166460758177,
-      "loss": 1.5529,
+      "epoch": 12.796504369538077,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.001488139825218477,
+      "loss": 1.3512,
       "step": 123000
     },
     {
-      "epoch": 1.606086221470837,
-      "grad_norm": 0.73046875,
-      "learning_rate": 0.0019357565511411666,
-      "loss": 1.5546,
+      "epoch": 12.848522679983354,
+      "grad_norm": 0.546875,
+      "learning_rate": 0.0014860590928006659,
+      "loss": 1.3506,
       "step": 123500
     },
     {
-      "epoch": 1.6125885948371157,
-      "grad_norm": 0.54296875,
-      "learning_rate": 0.0019354964562065154,
-      "loss": 1.544,
+      "epoch": 12.900540990428631,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0014839783603828548,
+      "loss": 1.3539,
       "step": 124000
     },
     {
-      "epoch": 1.6190909682033943,
-      "grad_norm": 0.68359375,
-      "learning_rate": 0.001935236361271864,
-      "loss": 1.5507,
+      "epoch": 12.952559300873908,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0014818976279650437,
+      "loss": 1.3581,
       "step": 124500
     },
     {
-      "epoch": 1.6255933415696728,
-      "grad_norm": 0.55078125,
-      "learning_rate": 0.001934976266337213,
-      "loss": 1.5515,
+      "epoch": 13.0,
+      "eval_loss": 1.3496302366256714,
+      "eval_runtime": 1.4282,
+      "eval_samples_per_second": 700.187,
+      "eval_steps_per_second": 0.7,
+      "step": 124956
+    },
+    {
+      "epoch": 13.004577611319185,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0014798168955472326,
+      "loss": 1.3558,
       "step": 125000
     },
     {
-      "epoch": 1.6320957149359516,
-      "grad_norm": 0.98828125,
-      "learning_rate": 0.001934716171402562,
-      "loss": 1.5502,
+      "epoch": 13.056595921764462,
+      "grad_norm": 0.5390625,
+      "learning_rate": 0.0014777361631294215,
+      "loss": 1.3526,
       "step": 125500
     },
     {
-      "epoch": 1.6385980883022304,
-      "grad_norm": 1.3046875,
-      "learning_rate": 0.001934456076467911,
-      "loss": 1.5427,
+      "epoch": 13.108614232209737,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0014756554307116107,
+      "loss": 1.3508,
       "step": 126000
     },
     {
-      "epoch": 1.645100461668509,
-      "grad_norm": 0.515625,
-      "learning_rate": 0.0019341959815332597,
-      "loss": 1.5471,
+      "epoch": 13.160632542655014,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0014735746982937994,
+      "loss": 1.3507,
       "step": 126500
     },
     {
-      "epoch": 1.6516028350347876,
-      "grad_norm": 0.466796875,
-      "learning_rate": 0.0019339358865986085,
-      "loss": 1.5405,
+      "epoch": 13.21265085310029,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.0014714939658759883,
+      "loss": 1.3525,
       "step": 127000
     },
     {
-      "epoch": 1.6581052084010663,
-      "grad_norm": 0.49609375,
-      "learning_rate": 0.0019336757916639574,
-      "loss": 1.5431,
+      "epoch": 13.264669163545568,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0014694132334581774,
+      "loss": 1.3525,
       "step": 127500
     },
     {
-      "epoch": 1.6646075817673451,
-      "grad_norm": 1.7421875,
-      "learning_rate": 0.0019334156967293062,
-      "loss": 1.5484,
+      "epoch": 13.316687473990845,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0014673325010403661,
+      "loss": 1.3529,
       "step": 128000
     },
     {
-      "epoch": 1.6711099551336237,
-      "grad_norm": 0.71875,
-      "learning_rate": 0.001933155601794655,
-      "loss": 1.5474,
+      "epoch": 13.368705784436122,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0014652517686225552,
+      "loss": 1.3518,
       "step": 128500
     },
     {
-      "epoch": 1.6776123284999025,
-      "grad_norm": 0.65625,
-      "learning_rate": 0.001932895506860004,
-      "loss": 1.544,
+      "epoch": 13.420724094881399,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0014631710362047442,
+      "loss": 1.3534,
       "step": 129000
     },
     {
-      "epoch": 1.6841147018661813,
-      "grad_norm": 0.62890625,
-      "learning_rate": 0.0019326354119253528,
-      "loss": 1.5443,
+      "epoch": 13.472742405326676,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.001461090303786933,
+      "loss": 1.3513,
       "step": 129500
     },
     {
-      "epoch": 1.6906170752324599,
-      "grad_norm": 0.5078125,
-      "learning_rate": 0.0019323753169907018,
-      "loss": 1.5468,
+      "epoch": 13.524760715771952,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.001459009571369122,
+      "loss": 1.3516,
       "step": 130000
     },
     {
-      "epoch": 1.6971194485987384,
-      "grad_norm": 0.9453125,
-      "learning_rate": 0.0019321152220560505,
-      "loss": 1.5416,
+      "epoch": 13.576779026217228,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.001456928838951311,
+      "loss": 1.3521,
       "step": 130500
     },
     {
-      "epoch": 1.7036218219650172,
-      "grad_norm": 0.515625,
-      "learning_rate": 0.0019318551271213992,
-      "loss": 1.5398,
+      "epoch": 13.628797336662505,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0014548481065334998,
+      "loss": 1.353,
       "step": 131000
     },
     {
-      "epoch": 1.710124195331296,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.0019315950321867482,
-      "loss": 1.5375,
+      "epoch": 13.680815647107782,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0014527673741156887,
+      "loss": 1.3525,
       "step": 131500
     },
     {
-      "epoch": 1.7166265686975746,
-      "grad_norm": 0.48046875,
-      "learning_rate": 0.001931334937252097,
-      "loss": 1.5381,
+      "epoch": 13.732833957553058,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0014506866416978776,
+      "loss": 1.3501,
       "step": 132000
     },
     {
-      "epoch": 1.7231289420638531,
-      "grad_norm": 0.58984375,
-      "learning_rate": 0.0019310748423174461,
-      "loss": 1.5443,
+      "epoch": 13.784852267998335,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0014486059092800666,
+      "loss": 1.3495,
       "step": 132500
     },
     {
-      "epoch": 1.7296313154301322,
-      "grad_norm": 0.58984375,
-      "learning_rate": 0.0019308147473827949,
-      "loss": 1.5485,
+      "epoch": 13.836870578443612,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.0014465251768622557,
+      "loss": 1.3513,
       "step": 133000
     },
     {
-      "epoch": 1.7361336887964107,
-      "grad_norm": 0.56640625,
-      "learning_rate": 0.0019305546524481436,
-      "loss": 1.5439,
+      "epoch": 13.88888888888889,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0014444444444444444,
+      "loss": 1.3516,
       "step": 133500
     },
     {
-      "epoch": 1.7426360621626893,
-      "grad_norm": 0.60546875,
-      "learning_rate": 0.0019302945575134926,
-      "loss": 1.5427,
+      "epoch": 13.940907199334166,
+      "grad_norm": 0.4921875,
+      "learning_rate": 0.0014423637120266333,
+      "loss": 1.3512,
       "step": 134000
     },
     {
-      "epoch": 1.749138435528968,
-      "grad_norm": 1.890625,
-      "learning_rate": 0.0019300344625788413,
-      "loss": 1.5403,
+      "epoch": 13.992925509779443,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0014402829796088224,
+      "loss": 1.3493,
       "step": 134500
     },
     {
-      "epoch": 1.7556408088952469,
-      "grad_norm": 0.490234375,
-      "learning_rate": 0.00192977436764419,
-      "loss": 1.5394,
+      "epoch": 14.0,
+      "eval_loss": 1.3465324640274048,
+      "eval_runtime": 1.419,
+      "eval_samples_per_second": 704.714,
+      "eval_steps_per_second": 0.705,
+      "step": 134568
+    },
+    {
+      "epoch": 14.044943820224718,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0014382022471910111,
+      "loss": 1.3478,
       "step": 135000
     },
     {
-      "epoch": 1.7621431822615254,
-      "grad_norm": 0.59765625,
-      "learning_rate": 0.001929514272709539,
-      "loss": 1.5378,
+      "epoch": 14.096962130669995,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0014361215147732003,
+      "loss": 1.3485,
       "step": 135500
     },
     {
-      "epoch": 1.768645555627804,
-      "grad_norm": 0.578125,
-      "learning_rate": 0.0019292541777748877,
-      "loss": 1.5362,
+      "epoch": 14.148980441115272,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0014340407823553892,
+      "loss": 1.3473,
       "step": 136000
     },
     {
-      "epoch": 1.7751479289940828,
-      "grad_norm": 1.46875,
-      "learning_rate": 0.001928994082840237,
-      "loss": 1.5378,
+      "epoch": 14.20099875156055,
+      "grad_norm": 0.578125,
+      "learning_rate": 0.001431960049937578,
+      "loss": 1.3469,
       "step": 136500
     },
     {
-      "epoch": 1.7816503023603616,
-      "grad_norm": 1.1953125,
-      "learning_rate": 0.0019287339879055856,
-      "loss": 1.5368,
+      "epoch": 14.253017062005826,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.001429879317519767,
+      "loss": 1.3488,
       "step": 137000
     },
     {
-      "epoch": 1.7881526757266402,
-      "grad_norm": 0.5859375,
-      "learning_rate": 0.0019284738929709344,
-      "loss": 1.5322,
+      "epoch": 14.305035372451103,
+      "grad_norm": 0.28125,
+      "learning_rate": 0.001427798585101956,
+      "loss": 1.3482,
       "step": 137500
     },
     {
-      "epoch": 1.794655049092919,
-      "grad_norm": 0.6328125,
-      "learning_rate": 0.0019282137980362833,
-      "loss": 1.5335,
+      "epoch": 14.35705368289638,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0014257178526841448,
+      "loss": 1.3485,
       "step": 138000
     },
     {
-      "epoch": 1.8011574224591977,
-      "grad_norm": 0.515625,
-      "learning_rate": 0.001927953703101632,
-      "loss": 1.5356,
+      "epoch": 14.409071993341657,
+      "grad_norm": 0.48828125,
+      "learning_rate": 0.001423637120266334,
+      "loss": 1.3479,
       "step": 138500
     },
     {
-      "epoch": 1.8076597958254763,
-      "grad_norm": 1.5234375,
-      "learning_rate": 0.0019276936081669808,
-      "loss": 1.5369,
+      "epoch": 14.461090303786934,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0014215563878485226,
+      "loss": 1.3483,
       "step": 139000
     },
     {
-      "epoch": 1.8141621691917549,
-      "grad_norm": 0.64453125,
-      "learning_rate": 0.0019274335132323298,
-      "loss": 1.5375,
+      "epoch": 14.513108614232209,
+      "grad_norm": 0.4140625,
+      "learning_rate": 0.0014194756554307116,
+      "loss": 1.3471,
       "step": 139500
     },
     {
-      "epoch": 1.8206645425580337,
-      "grad_norm": 0.6640625,
-      "learning_rate": 0.0019271734182976787,
-      "loss": 1.5313,
+      "epoch": 14.565126924677486,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0014173949230129007,
+      "loss": 1.3475,
       "step": 140000
     },
     {
-      "epoch": 1.8271669159243125,
-      "grad_norm": 0.55078125,
-      "learning_rate": 0.0019269133233630277,
-      "loss": 1.5302,
+      "epoch": 14.617145235122763,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.0014153141905950894,
+      "loss": 1.3467,
       "step": 140500
     },
     {
-      "epoch": 1.833669289290591,
-      "grad_norm": 0.5078125,
-      "learning_rate": 0.0019266532284283764,
-      "loss": 1.5262,
+      "epoch": 14.66916354556804,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0014132334581772783,
+      "loss": 1.3462,
       "step": 141000
     },
     {
-      "epoch": 1.8401716626568696,
-      "grad_norm": 0.443359375,
-      "learning_rate": 0.0019263931334937252,
-      "loss": 1.5323,
+      "epoch": 14.721181856013317,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0014111527257594674,
+      "loss": 1.3456,
       "step": 141500
     },
     {
-      "epoch": 1.8466740360231486,
-      "grad_norm": 0.765625,
-      "learning_rate": 0.0019261330385590741,
-      "loss": 1.5322,
+      "epoch": 14.773200166458594,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0014090719933416563,
+      "loss": 1.3455,
       "step": 142000
     },
     {
-      "epoch": 1.8531764093894272,
-      "grad_norm": 0.4921875,
-      "learning_rate": 0.0019258729436244229,
-      "loss": 1.5286,
+      "epoch": 14.82521847690387,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0014069912609238453,
+      "loss": 1.3447,
       "step": 142500
     },
     {
-      "epoch": 1.8596787827557058,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.0019256128486897716,
-      "loss": 1.5282,
+      "epoch": 14.877236787349148,
+      "grad_norm": 2.359375,
+      "learning_rate": 0.0014049105285060342,
+      "loss": 1.3454,
       "step": 143000
     },
     {
-      "epoch": 1.8661811561219845,
-      "grad_norm": 0.5234375,
-      "learning_rate": 0.0019253527537551208,
-      "loss": 1.5302,
+      "epoch": 14.929255097794425,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.001402829796088223,
+      "loss": 1.3461,
       "step": 143500
     },
     {
-      "epoch": 1.8726835294882633,
-      "grad_norm": 1.15625,
-      "learning_rate": 0.0019250926588204695,
-      "loss": 1.5259,
+      "epoch": 14.9812734082397,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.001400749063670412,
+      "loss": 1.3454,
       "step": 144000
     },
     {
-      "epoch": 1.879185902854542,
-      "grad_norm": 2.078125,
-      "learning_rate": 0.0019248325638858185,
-      "loss": 1.5295,
+      "epoch": 15.0,
+      "eval_loss": 1.3407135009765625,
+      "eval_runtime": 1.4239,
+      "eval_samples_per_second": 702.319,
+      "eval_steps_per_second": 0.702,
+      "step": 144180
+    },
+    {
+      "epoch": 15.033291718684977,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.001398668331252601,
+      "loss": 1.3444,
       "step": 144500
     },
     {
-      "epoch": 1.8856882762208205,
-      "grad_norm": 0.75,
-      "learning_rate": 0.0019245724689511672,
-      "loss": 1.5289,
+      "epoch": 15.085310029130254,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0013965875988347898,
+      "loss": 1.3444,
       "step": 145000
     },
     {
-      "epoch": 1.8921906495870993,
-      "grad_norm": 0.5234375,
-      "learning_rate": 0.001924312374016516,
-      "loss": 1.5319,
+      "epoch": 15.13732833957553,
+      "grad_norm": 0.56640625,
+      "learning_rate": 0.001394506866416979,
+      "loss": 1.3437,
       "step": 145500
     },
     {
-      "epoch": 1.898693022953378,
-      "grad_norm": 0.462890625,
-      "learning_rate": 0.001924052279081865,
-      "loss": 1.5349,
+      "epoch": 15.189346650020807,
+      "grad_norm": 0.56640625,
+      "learning_rate": 0.0013924261339991677,
+      "loss": 1.3473,
       "step": 146000
     },
     {
-      "epoch": 1.9051953963196566,
-      "grad_norm": 3.109375,
-      "learning_rate": 0.0019237921841472137,
-      "loss": 1.5369,
+      "epoch": 15.241364960466084,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0013903454015813566,
+      "loss": 1.3474,
       "step": 146500
     },
     {
-      "epoch": 1.9116977696859354,
-      "grad_norm": 0.58203125,
-      "learning_rate": 0.0019235320892125628,
-      "loss": 1.5371,
+      "epoch": 15.293383270911361,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0013882646691635457,
+      "loss": 1.3472,
       "step": 147000
     },
     {
-      "epoch": 1.9182001430522142,
-      "grad_norm": 0.68359375,
-      "learning_rate": 0.0019232719942779116,
-      "loss": 1.5371,
+      "epoch": 15.345401581356638,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0013861839367457346,
+      "loss": 1.3457,
       "step": 147500
     },
     {
-      "epoch": 1.9247025164184928,
-      "grad_norm": 1.265625,
-      "learning_rate": 0.0019230118993432603,
-      "loss": 1.531,
+      "epoch": 15.397419891801913,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0013841032043279233,
+      "loss": 1.3438,
       "step": 148000
     },
     {
-      "epoch": 1.9312048897847713,
-      "grad_norm": 0.56640625,
-      "learning_rate": 0.0019227518044086093,
-      "loss": 1.5355,
+      "epoch": 15.44943820224719,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0013820224719101124,
+      "loss": 1.3452,
       "step": 148500
     },
     {
-      "epoch": 1.9377072631510501,
-      "grad_norm": 0.796875,
-      "learning_rate": 0.001922491709473958,
-      "loss": 1.5308,
+      "epoch": 15.501456512692467,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.0013799417394923014,
+      "loss": 1.3461,
       "step": 149000
     },
     {
-      "epoch": 1.944209636517329,
-      "grad_norm": 0.52734375,
-      "learning_rate": 0.0019222316145393068,
-      "loss": 1.533,
+      "epoch": 15.553474823137744,
+      "grad_norm": 0.6640625,
+      "learning_rate": 0.0013778610070744903,
+      "loss": 1.344,
       "step": 149500
     },
     {
-      "epoch": 1.9507120098836075,
-      "grad_norm": 0.6953125,
-      "learning_rate": 0.0019219715196046557,
-      "loss": 1.5318,
+      "epoch": 15.605493133583021,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0013757802746566792,
+      "loss": 1.3437,
       "step": 150000
     },
     {
-      "epoch": 1.957214383249886,
-      "grad_norm": 0.6953125,
-      "learning_rate": 0.0019217114246700044,
-      "loss": 1.5307,
+      "epoch": 15.657511444028298,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.001373699542238868,
+      "loss": 1.3468,
       "step": 150500
     },
     {
-      "epoch": 1.9637167566161648,
-      "grad_norm": 0.66015625,
-      "learning_rate": 0.0019214513297353536,
-      "loss": 1.5297,
+      "epoch": 15.709529754473575,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0013716188098210572,
+      "loss": 1.3439,
       "step": 151000
     },
     {
-      "epoch": 1.9702191299824436,
-      "grad_norm": 1.046875,
-      "learning_rate": 0.0019211912348007024,
-      "loss": 1.5264,
+      "epoch": 15.761548064918852,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.001369538077403246,
+      "loss": 1.3435,
       "step": 151500
     },
     {
-      "epoch": 1.9767215033487222,
-      "grad_norm": 0.59375,
-      "learning_rate": 0.001920931139866051,
-      "loss": 1.5289,
+      "epoch": 15.813566375364129,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0013674573449854348,
+      "loss": 1.3463,
       "step": 152000
     },
     {
-      "epoch": 1.983223876715001,
-      "grad_norm": 0.58984375,
-      "learning_rate": 0.0019206710449314,
-      "loss": 1.5292,
+      "epoch": 15.865584685809406,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.001365376612567624,
+      "loss": 1.3467,
       "step": 152500
     },
     {
-      "epoch": 1.9897262500812798,
-      "grad_norm": 0.6015625,
-      "learning_rate": 0.0019204109499967488,
-      "loss": 1.5252,
+      "epoch": 15.917602996254681,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0013632958801498127,
+      "loss": 1.3462,
       "step": 153000
     },
     {
-      "epoch": 1.9962286234475584,
-      "grad_norm": 0.5546875,
-      "learning_rate": 0.0019201508550620975,
-      "loss": 1.5272,
+      "epoch": 15.969621306699958,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0013612151477320016,
+      "loss": 1.3461,
       "step": 153500
     },
     {
-      "epoch": 2.0,
-      "eval_loss": 1.502743124961853,
-      "eval_runtime": 7.2048,
-      "eval_samples_per_second": 138.797,
-      "eval_steps_per_second": 1.11,
-      "step": 153790
+      "epoch": 16.0,
+      "eval_loss": 1.3369859457015991,
+      "eval_runtime": 1.4272,
+      "eval_samples_per_second": 700.674,
+      "eval_steps_per_second": 0.701,
+      "step": 153792
     },
     {
-      "epoch": 2.002730996813837,
-      "grad_norm": 0.6953125,
-      "learning_rate": 0.0019198907601274465,
-      "loss": 1.523,
+      "epoch": 16.021639617145237,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0013591344153141907,
+      "loss": 1.3446,
       "step": 154000
     },
     {
-      "epoch": 2.009233370180116,
-      "grad_norm": 0.4375,
-      "learning_rate": 0.0019196306651927955,
-      "loss": 1.5192,
+      "epoch": 16.073657927590514,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0013570536828963796,
+      "loss": 1.3422,
       "step": 154500
     },
     {
-      "epoch": 2.0157357435463945,
-      "grad_norm": 0.515625,
-      "learning_rate": 0.0019193705702581444,
-      "loss": 1.5178,
+      "epoch": 16.125676238035787,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0013549729504785683,
+      "loss": 1.3409,
       "step": 155000
     },
     {
-      "epoch": 2.022238116912673,
-      "grad_norm": 0.73828125,
-      "learning_rate": 0.0019191104753234932,
-      "loss": 1.5219,
+      "epoch": 16.177694548481064,
+      "grad_norm": 0.6796875,
+      "learning_rate": 0.0013528922180607575,
+      "loss": 1.3428,
       "step": 155500
     },
     {
-      "epoch": 2.0287404902789516,
-      "grad_norm": 0.58203125,
-      "learning_rate": 0.001918850380388842,
-      "loss": 1.5232,
+      "epoch": 16.22971285892634,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.0013508114856429464,
+      "loss": 1.3443,
       "step": 156000
     },
     {
-      "epoch": 2.0352428636452307,
-      "grad_norm": 0.439453125,
-      "learning_rate": 0.0019185902854541908,
-      "loss": 1.5235,
+      "epoch": 16.281731169371618,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.001348730753225135,
+      "loss": 1.3423,
       "step": 156500
     },
     {
-      "epoch": 2.0417452370115092,
-      "grad_norm": 0.478515625,
-      "learning_rate": 0.0019183301905195396,
-      "loss": 1.5259,
+      "epoch": 16.333749479816895,
+      "grad_norm": 0.486328125,
+      "learning_rate": 0.0013466500208073242,
+      "loss": 1.3408,
       "step": 157000
     },
     {
-      "epoch": 2.048247610377788,
-      "grad_norm": 0.458984375,
-      "learning_rate": 0.0019180700955848883,
-      "loss": 1.5214,
+      "epoch": 16.38576779026217,
+      "grad_norm": 0.55859375,
+      "learning_rate": 0.0013445692883895131,
+      "loss": 1.3421,
       "step": 157500
     },
     {
-      "epoch": 2.0547499837440664,
-      "grad_norm": 0.5390625,
-      "learning_rate": 0.0019178100006502375,
-      "loss": 1.5198,
+      "epoch": 16.43778610070745,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.0013424885559717022,
+      "loss": 1.3424,
       "step": 158000
     },
     {
-      "epoch": 2.0612523571103454,
-      "grad_norm": 0.5625,
-      "learning_rate": 0.0019175499057155862,
-      "loss": 1.52,
+      "epoch": 16.489804411152726,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.001340407823553891,
+      "loss": 1.3412,
       "step": 158500
     },
     {
-      "epoch": 2.067754730476624,
-      "grad_norm": 0.48828125,
-      "learning_rate": 0.0019172898107809352,
-      "loss": 1.5229,
+      "epoch": 16.541822721598002,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0013383270911360799,
+      "loss": 1.3419,
       "step": 159000
     },
     {
-      "epoch": 2.0742571038429025,
-      "grad_norm": 0.5703125,
-      "learning_rate": 0.001917029715846284,
-      "loss": 1.5198,
+      "epoch": 16.59384103204328,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.001336246358718269,
+      "loss": 1.3415,
       "step": 159500
     },
     {
-      "epoch": 2.0807594772091815,
-      "grad_norm": 0.4765625,
-      "learning_rate": 0.0019167696209116327,
-      "loss": 1.525,
+      "epoch": 16.645859342488556,
+      "grad_norm": 5.65625,
+      "learning_rate": 0.001334165626300458,
+      "loss": 1.3417,
       "step": 160000
     },
     {
-      "epoch": 2.08726185057546,
-      "grad_norm": 0.51171875,
-      "learning_rate": 0.0019165095259769816,
-      "loss": 1.5199,
+      "epoch": 16.697877652933833,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0013320848938826466,
+      "loss": 1.3405,
       "step": 160500
     },
     {
-      "epoch": 2.0937642239417387,
-      "grad_norm": 0.65625,
-      "learning_rate": 0.0019162494310423304,
-      "loss": 1.5194,
+      "epoch": 16.74989596337911,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0013300041614648357,
+      "loss": 1.339,
       "step": 161000
     },
     {
-      "epoch": 2.1002665973080172,
-      "grad_norm": 0.625,
-      "learning_rate": 0.0019159893361076796,
-      "loss": 1.5146,
+      "epoch": 16.801914273824387,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.0013279234290470246,
+      "loss": 1.3415,
       "step": 161500
     },
     {
-      "epoch": 2.1067689706742962,
-      "grad_norm": 0.6796875,
-      "learning_rate": 0.0019157292411730283,
-      "loss": 1.5195,
+      "epoch": 16.853932584269664,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0013258426966292133,
+      "loss": 1.341,
       "step": 162000
     },
     {
-      "epoch": 2.113271344040575,
-      "grad_norm": 0.6484375,
-      "learning_rate": 0.001915469146238377,
-      "loss": 1.5237,
+      "epoch": 16.90595089471494,
+      "grad_norm": 0.330078125,
+      "learning_rate": 0.0013237619642114025,
+      "loss": 1.3424,
       "step": 162500
     },
     {
-      "epoch": 2.1197737174068534,
-      "grad_norm": 0.5,
-      "learning_rate": 0.001915209051303726,
-      "loss": 1.5214,
+      "epoch": 16.957969205160218,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0013216812317935914,
+      "loss": 1.3479,
       "step": 163000
     },
     {
-      "epoch": 2.1262760907731324,
-      "grad_norm": 2.1875,
-      "learning_rate": 0.0019149489563690747,
-      "loss": 1.517,
+      "epoch": 17.0,
+      "eval_loss": 1.339566707611084,
+      "eval_runtime": 1.4145,
+      "eval_samples_per_second": 706.982,
+      "eval_steps_per_second": 0.707,
+      "step": 163404
+    },
+    {
+      "epoch": 17.00998751560549,
+      "grad_norm": 0.43359375,
+      "learning_rate": 0.0013196004993757803,
+      "loss": 1.3445,
       "step": 163500
     },
     {
-      "epoch": 2.132778464139411,
-      "grad_norm": 0.83203125,
-      "learning_rate": 0.0019146888614344235,
-      "loss": 1.5163,
+      "epoch": 17.06200582605077,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0013175197669579692,
+      "loss": 1.3435,
       "step": 164000
     },
     {
-      "epoch": 2.1392808375056895,
-      "grad_norm": 0.515625,
-      "learning_rate": 0.0019144287664997724,
-      "loss": 1.5162,
+      "epoch": 17.114024136496045,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0013154390345401581,
+      "loss": 1.3448,
       "step": 164500
     },
     {
-      "epoch": 2.145783210871968,
-      "grad_norm": 0.75390625,
-      "learning_rate": 0.0019141686715651212,
-      "loss": 1.5184,
+      "epoch": 17.166042446941322,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0013133583021223473,
+      "loss": 1.3436,
       "step": 165000
     },
     {
-      "epoch": 2.152285584238247,
-      "grad_norm": 0.609375,
-      "learning_rate": 0.0019139085766304703,
-      "loss": 1.5204,
+      "epoch": 17.2180607573866,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.001311277569704536,
+      "loss": 1.3445,
       "step": 165500
     },
     {
-      "epoch": 2.1587879576045257,
-      "grad_norm": 0.77734375,
-      "learning_rate": 0.001913648481695819,
-      "loss": 1.5225,
+      "epoch": 17.270079067831876,
+      "grad_norm": 1.7890625,
+      "learning_rate": 0.0013091968372867249,
+      "loss": 1.343,
       "step": 166000
     },
     {
-      "epoch": 2.1652903309708043,
-      "grad_norm": 0.84375,
-      "learning_rate": 0.0019133883867611678,
-      "loss": 1.5205,
+      "epoch": 17.322097378277153,
+      "grad_norm": 0.37109375,
+      "learning_rate": 0.001307116104868914,
+      "loss": 1.3429,
       "step": 166500
     },
     {
-      "epoch": 2.171792704337083,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.0019131282918265168,
-      "loss": 1.5217,
+      "epoch": 17.37411568872243,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.001305035372451103,
+      "loss": 1.342,
       "step": 167000
     },
     {
-      "epoch": 2.178295077703362,
-      "grad_norm": 0.609375,
-      "learning_rate": 0.0019128681968918655,
-      "loss": 1.5178,
+      "epoch": 17.426133999167707,
+      "grad_norm": 2.875,
+      "learning_rate": 0.0013029546400332916,
+      "loss": 1.3429,
       "step": 167500
     },
     {
-      "epoch": 2.1847974510696404,
-      "grad_norm": 0.515625,
-      "learning_rate": 0.0019126081019572143,
-      "loss": 1.5149,
+      "epoch": 17.478152309612984,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0013008739076154807,
+      "loss": 1.3424,
       "step": 168000
     },
     {
-      "epoch": 2.191299824435919,
-      "grad_norm": 0.71484375,
-      "learning_rate": 0.0019123480070225632,
-      "loss": 1.516,
+      "epoch": 17.53017062005826,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.0012987931751976696,
+      "loss": 1.3422,
       "step": 168500
     },
     {
-      "epoch": 2.197802197802198,
-      "grad_norm": 0.5625,
-      "learning_rate": 0.0019120879120879122,
-      "loss": 1.5138,
+      "epoch": 17.582188930503538,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0012967124427798583,
+      "loss": 1.3419,
       "step": 169000
     },
     {
-      "epoch": 2.2043045711684766,
-      "grad_norm": 0.58203125,
-      "learning_rate": 0.0019118278171532611,
-      "loss": 1.5141,
+      "epoch": 17.634207240948815,
+      "grad_norm": 0.3515625,
+      "learning_rate": 0.0012946317103620475,
+      "loss": 1.3419,
       "step": 169500
     },
     {
-      "epoch": 2.210806944534755,
-      "grad_norm": 0.63671875,
-      "learning_rate": 0.0019115677222186099,
-      "loss": 1.5111,
+      "epoch": 17.68622555139409,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0012925509779442364,
+      "loss": 1.3411,
       "step": 170000
     },
     {
-      "epoch": 2.2173093179010337,
-      "grad_norm": 0.875,
-      "learning_rate": 0.0019113076272839586,
-      "loss": 1.5136,
+      "epoch": 17.73824386183937,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0012904702455264253,
+      "loss": 1.3409,
       "step": 170500
     },
     {
-      "epoch": 2.2238116912673127,
-      "grad_norm": 0.53515625,
-      "learning_rate": 0.0019110475323493076,
-      "loss": 1.5115,
+      "epoch": 17.790262172284645,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0012883895131086142,
+      "loss": 1.3429,
       "step": 171000
     },
     {
-      "epoch": 2.2303140646335913,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.0019107874374146563,
-      "loss": 1.5159,
+      "epoch": 17.842280482729922,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0012863087806908031,
+      "loss": 1.3426,
       "step": 171500
     },
     {
-      "epoch": 2.23681643799987,
-      "grad_norm": 0.57421875,
-      "learning_rate": 0.001910527342480005,
-      "loss": 1.514,
+      "epoch": 17.8942987931752,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.0012842280482729923,
+      "loss": 1.342,
       "step": 172000
     },
     {
-      "epoch": 2.243318811366149,
-      "grad_norm": 0.48046875,
-      "learning_rate": 0.0019102672475453542,
-      "loss": 1.5094,
+      "epoch": 17.946317103620473,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0012821473158551812,
+      "loss": 1.3418,
       "step": 172500
     },
     {
-      "epoch": 2.2498211847324274,
-      "grad_norm": 0.498046875,
-      "learning_rate": 0.001910007152610703,
-      "loss": 1.5128,
+      "epoch": 17.99833541406575,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0012800665834373699,
+      "loss": 1.3429,
       "step": 173000
     },
     {
-      "epoch": 2.256323558098706,
-      "grad_norm": 1.265625,
-      "learning_rate": 0.001909747057676052,
-      "loss": 1.5091,
+      "epoch": 18.0,
+      "eval_loss": 1.3371888399124146,
+      "eval_runtime": 1.4206,
+      "eval_samples_per_second": 703.942,
+      "eval_steps_per_second": 0.704,
+      "step": 173016
+    },
+    {
+      "epoch": 18.050353724511027,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.001277985851019559,
+      "loss": 1.3399,
       "step": 173500
     },
     {
-      "epoch": 2.2628259314649846,
-      "grad_norm": 0.86328125,
-      "learning_rate": 0.0019094869627414007,
-      "loss": 1.5094,
+      "epoch": 18.102372034956304,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.001275905118601748,
+      "loss": 1.3414,
       "step": 174000
     },
     {
-      "epoch": 2.2693283048312636,
-      "grad_norm": 0.51953125,
-      "learning_rate": 0.0019092268678067494,
-      "loss": 1.5072,
+      "epoch": 18.15439034540158,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0012738243861839366,
+      "loss": 1.3425,
       "step": 174500
     },
     {
-      "epoch": 2.275830678197542,
-      "grad_norm": 0.625,
-      "learning_rate": 0.0019089667728720984,
-      "loss": 1.5126,
+      "epoch": 18.206408655846857,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0012717436537661257,
+      "loss": 1.339,
       "step": 175000
     },
     {
-      "epoch": 2.2823330515638207,
-      "grad_norm": 0.53125,
-      "learning_rate": 0.001908706677937447,
-      "loss": 1.5113,
+      "epoch": 18.258426966292134,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0012696629213483147,
+      "loss": 1.3403,
       "step": 175500
     },
     {
-      "epoch": 2.2888354249300997,
-      "grad_norm": 0.64453125,
-      "learning_rate": 0.0019084465830027963,
-      "loss": 1.5066,
+      "epoch": 18.31044527673741,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0012675821889305036,
+      "loss": 1.3388,
       "step": 176000
     },
     {
-      "epoch": 2.2953377982963783,
-      "grad_norm": 0.46484375,
-      "learning_rate": 0.001908186488068145,
-      "loss": 1.5094,
+      "epoch": 18.36246358718269,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0012655014565126925,
+      "loss": 1.3385,
       "step": 176500
     },
     {
-      "epoch": 2.301840171662657,
-      "grad_norm": 0.50390625,
-      "learning_rate": 0.0019079263931334938,
-      "loss": 1.5059,
+      "epoch": 18.414481897627965,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0012634207240948814,
+      "loss": 1.3372,
       "step": 177000
     },
     {
-      "epoch": 2.3083425450289354,
-      "grad_norm": 0.66796875,
-      "learning_rate": 0.0019076662981988427,
-      "loss": 1.5127,
+      "epoch": 18.466500208073242,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0012613399916770703,
+      "loss": 1.3384,
       "step": 177500
     },
     {
-      "epoch": 2.3148449183952144,
-      "grad_norm": 0.57421875,
-      "learning_rate": 0.0019074062032641914,
-      "loss": 1.5114,
+      "epoch": 18.51851851851852,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0012592592592592592,
+      "loss": 1.3377,
       "step": 178000
     },
     {
-      "epoch": 2.321347291761493,
-      "grad_norm": 0.56640625,
-      "learning_rate": 0.0019071461083295402,
-      "loss": 1.5088,
+      "epoch": 18.570536828963796,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0012571785268414481,
+      "loss": 1.3372,
       "step": 178500
     },
     {
-      "epoch": 2.3278496651277716,
-      "grad_norm": 0.5,
-      "learning_rate": 0.0019068860133948891,
-      "loss": 1.509,
+      "epoch": 18.622555139409073,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0012550977944236373,
+      "loss": 1.3368,
       "step": 179000
     },
     {
-      "epoch": 2.33435203849405,
-      "grad_norm": 3.625,
-      "learning_rate": 0.0019066259184602379,
-      "loss": 1.5016,
+      "epoch": 18.67457344985435,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0012530170620058262,
+      "loss": 1.3384,
       "step": 179500
     },
     {
-      "epoch": 2.340854411860329,
-      "grad_norm": 0.58203125,
-      "learning_rate": 0.001906365823525587,
-      "loss": 1.5006,
+      "epoch": 18.726591760299627,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0012509363295880149,
+      "loss": 1.3374,
       "step": 180000
     },
     {
-      "epoch": 2.3473567852266077,
-      "grad_norm": 0.90625,
-      "learning_rate": 0.0019061057285909358,
-      "loss": 1.5052,
+      "epoch": 18.778610070744904,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.001248855597170204,
+      "loss": 1.3378,
       "step": 180500
     },
     {
-      "epoch": 2.3538591585928863,
-      "grad_norm": 0.6015625,
-      "learning_rate": 0.0019058456336562845,
-      "loss": 1.502,
+      "epoch": 18.83062838119018,
+      "grad_norm": 0.4765625,
+      "learning_rate": 0.001246774864752393,
+      "loss": 1.3397,
       "step": 181000
     },
     {
-      "epoch": 2.360361531959165,
-      "grad_norm": 0.4765625,
-      "learning_rate": 0.0019055855387216335,
-      "loss": 1.5067,
+      "epoch": 18.882646691635454,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0012446941323345816,
+      "loss": 1.3379,
       "step": 181500
     },
     {
-      "epoch": 2.366863905325444,
-      "grad_norm": 0.73828125,
-      "learning_rate": 0.0019053254437869822,
-      "loss": 1.5013,
+      "epoch": 18.93466500208073,
+      "grad_norm": 0.55078125,
+      "learning_rate": 0.0012426133999167708,
+      "loss": 1.3377,
       "step": 182000
     },
     {
-      "epoch": 2.3733662786917225,
-      "grad_norm": 0.81640625,
-      "learning_rate": 0.001905065348852331,
-      "loss": 1.5033,
+      "epoch": 18.986683312526008,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0012405326674989597,
+      "loss": 1.3379,
       "step": 182500
     },
     {
-      "epoch": 2.379868652058001,
-      "grad_norm": 0.51171875,
-      "learning_rate": 0.00190480525391768,
-      "loss": 1.5061,
-      "step": 183000
-    },
-    {
-      "epoch": 2.38637102542428,
-      "grad_norm": 2.90625,
-      "learning_rate": 0.0019045451589830289,
-      "loss": 1.508,
-      "step": 183500
-    },
-    {
-      "epoch": 2.3928733987905586,
-      "grad_norm": 0.640625,
-      "learning_rate": 0.0019042850640483778,
-      "loss": 1.5048,
-      "step": 184000
-    },
-    {
-      "epoch": 2.399375772156837,
-      "grad_norm": 0.58984375,
-      "learning_rate": 0.0019040249691137266,
-      "loss": 1.5025,
-      "step": 184500
-    },
-    {
-      "epoch": 2.4058781455231157,
-      "grad_norm": 0.59375,
-      "learning_rate": 0.0019037648741790753,
-      "loss": 1.4983,
-      "step": 185000
-    },
-    {
-      "epoch": 2.4123805188893948,
-      "grad_norm": 0.6484375,
-      "learning_rate": 0.0019035047792444243,
-      "loss": 1.5018,
-      "step": 185500
-    },
-    {
-      "epoch": 2.4188828922556733,
-      "grad_norm": 0.458984375,
-      "learning_rate": 0.001903244684309773,
-      "loss": 1.5034,
-      "step": 186000
-    },
-    {
-      "epoch": 2.425385265621952,
-      "grad_norm": 0.66015625,
-      "learning_rate": 0.0019029845893751218,
-      "loss": 1.5026,
-      "step": 186500
-    },
-    {
-      "epoch": 2.431887638988231,
-      "grad_norm": 0.86328125,
-      "learning_rate": 0.001902724494440471,
-      "loss": 1.5019,
-      "step": 187000
-    },
-    {
-      "epoch": 2.4383900123545095,
-      "grad_norm": 0.77734375,
-      "learning_rate": 0.0019024643995058197,
-      "loss": 1.5043,
-      "step": 187500
-    },
-    {
-      "epoch": 2.444892385720788,
-      "grad_norm": 0.65625,
-      "learning_rate": 0.0019022043045711686,
-      "loss": 1.5081,
-      "step": 188000
-    },
-    {
-      "epoch": 2.4513947590870666,
-      "grad_norm": 0.51953125,
-      "learning_rate": 0.0019019442096365174,
-      "loss": 1.5045,
-      "step": 188500
-    },
-    {
-      "epoch": 2.4578971324533456,
-      "grad_norm": 0.49609375,
-      "learning_rate": 0.0019016841147018661,
-      "loss": 1.5091,
-      "step": 189000
-    },
-    {
-      "epoch": 2.464399505819624,
-      "grad_norm": 0.51953125,
-      "learning_rate": 0.001901424019767215,
-      "loss": 1.5121,
-      "step": 189500
-    },
-    {
-      "epoch": 2.4709018791859028,
-      "grad_norm": 0.5078125,
-      "learning_rate": 0.0019011639248325638,
-      "loss": 1.5124,
-      "step": 190000
-    },
-    {
-      "epoch": 2.4774042525521818,
-      "grad_norm": 0.49609375,
-      "learning_rate": 0.001900903829897913,
-      "loss": 1.5159,
-      "step": 190500
-    },
-    {
-      "epoch": 2.4839066259184603,
-      "grad_norm": 0.4765625,
-      "learning_rate": 0.0019006437349632617,
-      "loss": 1.5078,
-      "step": 191000
-    },
-    {
-      "epoch": 2.490408999284739,
-      "grad_norm": 0.640625,
-      "learning_rate": 0.0019003836400286105,
-      "loss": 1.5112,
-      "step": 191500
-    },
-    {
-      "epoch": 2.4969113726510175,
-      "grad_norm": 0.5625,
-      "learning_rate": 0.0019001235450939594,
-      "loss": 1.5106,
-      "step": 192000
-    },
-    {
-      "epoch": 2.503413746017296,
-      "grad_norm": 0.478515625,
-      "learning_rate": 0.0018998634501593082,
-      "loss": 1.5032,
-      "step": 192500
-    },
-    {
-      "epoch": 2.509916119383575,
-      "grad_norm": 0.8203125,
-      "learning_rate": 0.001899603355224657,
-      "loss": 1.5047,
-      "step": 193000
-    },
-    {
-      "epoch": 2.5164184927498536,
-      "grad_norm": 0.57421875,
-      "learning_rate": 0.0018993432602900059,
-      "loss": 1.5036,
-      "step": 193500
-    },
-    {
-      "epoch": 2.5229208661161326,
-      "grad_norm": 0.66015625,
-      "learning_rate": 0.0018990831653553546,
-      "loss": 1.5028,
-      "step": 194000
-    },
-    {
-      "epoch": 2.529423239482411,
-      "grad_norm": 0.5546875,
-      "learning_rate": 0.0018988230704207038,
-      "loss": 1.5045,
-      "step": 194500
-    },
-    {
-      "epoch": 2.53592561284869,
-      "grad_norm": 0.52734375,
-      "learning_rate": 0.0018985629754860525,
-      "loss": 1.5016,
-      "step": 195000
-    },
-    {
-      "epoch": 2.5424279862149683,
-      "grad_norm": 0.54296875,
-      "learning_rate": 0.0018983028805514013,
-      "loss": 1.5057,
-      "step": 195500
-    },
-    {
-      "epoch": 2.548930359581247,
-      "grad_norm": 0.51171875,
-      "learning_rate": 0.0018980427856167502,
-      "loss": 1.5069,
-      "step": 196000
-    },
-    {
-      "epoch": 2.555432732947526,
-      "grad_norm": 0.58203125,
-      "learning_rate": 0.001897782690682099,
-      "loss": 1.5057,
-      "step": 196500
-    },
-    {
-      "epoch": 2.5619351063138045,
-      "grad_norm": 0.490234375,
-      "learning_rate": 0.0018975225957474477,
-      "loss": 1.5039,
-      "step": 197000
-    },
-    {
-      "epoch": 2.568437479680083,
-      "grad_norm": 0.625,
-      "learning_rate": 0.0018972625008127967,
-      "loss": 1.5041,
-      "step": 197500
-    },
-    {
-      "epoch": 2.574939853046362,
-      "grad_norm": 0.486328125,
-      "learning_rate": 0.0018970024058781456,
-      "loss": 1.5048,
-      "step": 198000
-    },
-    {
-      "epoch": 2.5814422264126407,
-      "grad_norm": 3.59375,
-      "learning_rate": 0.0018967423109434946,
-      "loss": 1.5011,
-      "step": 198500
-    },
-    {
-      "epoch": 2.587944599778919,
-      "grad_norm": 1.546875,
-      "learning_rate": 0.0018964822160088433,
-      "loss": 1.5014,
-      "step": 199000
-    },
-    {
-      "epoch": 2.594446973145198,
-      "grad_norm": 0.453125,
-      "learning_rate": 0.001896222121074192,
-      "loss": 1.4983,
-      "step": 199500
-    },
-    {
-      "epoch": 2.600949346511477,
-      "grad_norm": 0.5,
-      "learning_rate": 0.001895962026139541,
-      "loss": 1.5062,
-      "step": 200000
-    },
-    {
-      "epoch": 2.6074517198777554,
-      "grad_norm": 0.67578125,
-      "learning_rate": 0.0018957019312048897,
-      "loss": 1.5062,
-      "step": 200500
-    },
-    {
-      "epoch": 2.613954093244034,
-      "grad_norm": 0.56640625,
-      "learning_rate": 0.0018954418362702385,
-      "loss": 1.5011,
-      "step": 201000
-    },
-    {
-      "epoch": 2.620456466610313,
-      "grad_norm": 0.5625,
-      "learning_rate": 0.0018951817413355877,
-      "loss": 1.5029,
-      "step": 201500
-    },
-    {
-      "epoch": 2.6269588399765915,
-      "grad_norm": 1.0703125,
-      "learning_rate": 0.0018949216464009364,
-      "loss": 1.5017,
-      "step": 202000
-    },
-    {
-      "epoch": 2.63346121334287,
-      "grad_norm": 0.470703125,
-      "learning_rate": 0.0018946615514662854,
-      "loss": 1.5012,
-      "step": 202500
-    },
-    {
-      "epoch": 2.6399635867091487,
-      "grad_norm": 0.515625,
-      "learning_rate": 0.001894401456531634,
-      "loss": 1.5036,
-      "step": 203000
-    },
-    {
-      "epoch": 2.6464659600754277,
-      "grad_norm": 0.5859375,
-      "learning_rate": 0.0018941413615969828,
-      "loss": 1.5042,
-      "step": 203500
-    },
-    {
-      "epoch": 2.6529683334417062,
-      "grad_norm": 0.474609375,
-      "learning_rate": 0.0018938812666623318,
-      "loss": 1.5042,
-      "step": 204000
-    },
-    {
-      "epoch": 2.659470706807985,
-      "grad_norm": 1.046875,
-      "learning_rate": 0.0018936211717276805,
-      "loss": 1.5079,
-      "step": 204500
-    },
-    {
-      "epoch": 2.665973080174264,
-      "grad_norm": 0.51171875,
-      "learning_rate": 0.0018933610767930297,
-      "loss": 1.5093,
-      "step": 205000
-    },
-    {
-      "epoch": 2.6724754535405424,
-      "grad_norm": 0.50390625,
-      "learning_rate": 0.0018931009818583784,
-      "loss": 1.5066,
-      "step": 205500
-    },
-    {
-      "epoch": 2.678977826906821,
-      "grad_norm": 0.69140625,
-      "learning_rate": 0.0018928408869237272,
-      "loss": 1.5034,
-      "step": 206000
-    },
-    {
-      "epoch": 2.6854802002730995,
-      "grad_norm": 0.96875,
-      "learning_rate": 0.0018925807919890761,
-      "loss": 1.5045,
-      "step": 206500
-    },
-    {
-      "epoch": 2.691982573639378,
-      "grad_norm": 0.57421875,
-      "learning_rate": 0.0018923206970544249,
-      "loss": 1.5112,
-      "step": 207000
-    },
-    {
-      "epoch": 2.698484947005657,
-      "grad_norm": 0.6953125,
-      "learning_rate": 0.0018920606021197736,
-      "loss": 1.5055,
-      "step": 207500
-    },
-    {
-      "epoch": 2.7049873203719357,
-      "grad_norm": 2.5,
-      "learning_rate": 0.0018918005071851226,
-      "loss": 1.5023,
-      "step": 208000
-    },
-    {
-      "epoch": 2.7114896937382147,
-      "grad_norm": 0.423828125,
-      "learning_rate": 0.0018915404122504713,
-      "loss": 1.5052,
-      "step": 208500
-    },
-    {
-      "epoch": 2.7179920671044933,
-      "grad_norm": 1.5234375,
-      "learning_rate": 0.0018912803173158205,
-      "loss": 1.4958,
-      "step": 209000
-    },
-    {
-      "epoch": 2.724494440470772,
-      "grad_norm": 0.5859375,
-      "learning_rate": 0.0018910202223811692,
-      "loss": 1.5016,
-      "step": 209500
-    },
-    {
-      "epoch": 2.7309968138370504,
-      "grad_norm": 1.4375,
-      "learning_rate": 0.001890760127446518,
-      "loss": 1.4964,
-      "step": 210000
-    },
-    {
-      "epoch": 2.737499187203329,
-      "grad_norm": 1.3203125,
-      "learning_rate": 0.001890500032511867,
-      "loss": 1.5011,
-      "step": 210500
-    },
-    {
-      "epoch": 2.744001560569608,
-      "grad_norm": 0.423828125,
-      "learning_rate": 0.0018902399375772157,
-      "loss": 1.5008,
-      "step": 211000
-    },
-    {
-      "epoch": 2.7505039339358865,
-      "grad_norm": 0.478515625,
-      "learning_rate": 0.0018899798426425644,
-      "loss": 1.4991,
-      "step": 211500
-    },
-    {
-      "epoch": 2.7570063073021656,
-      "grad_norm": 1.09375,
-      "learning_rate": 0.0018897197477079134,
-      "loss": 1.4968,
-      "step": 212000
-    },
-    {
-      "epoch": 2.763508680668444,
-      "grad_norm": 1.453125,
-      "learning_rate": 0.0018894596527732623,
-      "loss": 1.4974,
-      "step": 212500
-    },
-    {
-      "epoch": 2.7700110540347227,
-      "grad_norm": 0.59765625,
-      "learning_rate": 0.0018891995578386113,
-      "loss": 1.4991,
-      "step": 213000
-    },
-    {
-      "epoch": 2.7765134274010013,
-      "grad_norm": 0.6796875,
-      "learning_rate": 0.00188893946290396,
-      "loss": 1.4964,
-      "step": 213500
-    },
-    {
-      "epoch": 2.78301580076728,
-      "grad_norm": 0.86328125,
-      "learning_rate": 0.0018886793679693088,
-      "loss": 1.4988,
-      "step": 214000
-    },
-    {
-      "epoch": 2.789518174133559,
-      "grad_norm": 0.703125,
-      "learning_rate": 0.0018884192730346577,
-      "loss": 1.4971,
-      "step": 214500
-    },
-    {
-      "epoch": 2.7960205474998374,
-      "grad_norm": 0.6328125,
-      "learning_rate": 0.0018881591781000065,
-      "loss": 1.4988,
-      "step": 215000
-    },
-    {
-      "epoch": 2.802522920866116,
-      "grad_norm": 0.578125,
-      "learning_rate": 0.0018878990831653552,
-      "loss": 1.5002,
-      "step": 215500
-    },
-    {
-      "epoch": 2.809025294232395,
-      "grad_norm": 0.51953125,
-      "learning_rate": 0.0018876389882307044,
-      "loss": 1.4966,
-      "step": 216000
-    },
-    {
-      "epoch": 2.8155276675986736,
-      "grad_norm": 0.62890625,
-      "learning_rate": 0.0018873788932960531,
-      "loss": 1.4988,
-      "step": 216500
-    },
-    {
-      "epoch": 2.822030040964952,
-      "grad_norm": 2.15625,
-      "learning_rate": 0.001887118798361402,
-      "loss": 1.5013,
-      "step": 217000
-    },
-    {
-      "epoch": 2.8285324143312307,
-      "grad_norm": 0.6640625,
-      "learning_rate": 0.0018868587034267508,
-      "loss": 1.4979,
-      "step": 217500
-    },
-    {
-      "epoch": 2.8350347876975097,
-      "grad_norm": 0.59765625,
-      "learning_rate": 0.0018865986084920996,
-      "loss": 1.4991,
-      "step": 218000
-    },
-    {
-      "epoch": 2.8415371610637883,
-      "grad_norm": 0.56640625,
-      "learning_rate": 0.0018863385135574485,
-      "loss": 1.4934,
-      "step": 218500
-    },
-    {
-      "epoch": 2.848039534430067,
-      "grad_norm": 0.53515625,
-      "learning_rate": 0.0018860784186227973,
-      "loss": 1.4923,
-      "step": 219000
-    },
-    {
-      "epoch": 2.854541907796346,
-      "grad_norm": 0.65625,
-      "learning_rate": 0.0018858183236881464,
-      "loss": 1.4957,
-      "step": 219500
-    },
-    {
-      "epoch": 2.8610442811626244,
-      "grad_norm": 0.49609375,
-      "learning_rate": 0.0018855582287534952,
-      "loss": 1.4958,
-      "step": 220000
-    },
-    {
-      "epoch": 2.867546654528903,
-      "grad_norm": 1.5546875,
-      "learning_rate": 0.001885298133818844,
-      "loss": 1.4962,
-      "step": 220500
-    },
-    {
-      "epoch": 2.8740490278951816,
-      "grad_norm": 0.64453125,
-      "learning_rate": 0.0018850380388841929,
-      "loss": 1.4942,
-      "step": 221000
-    },
-    {
-      "epoch": 2.8805514012614606,
-      "grad_norm": 0.66796875,
-      "learning_rate": 0.0018847779439495416,
-      "loss": 1.4962,
-      "step": 221500
-    },
-    {
-      "epoch": 2.887053774627739,
-      "grad_norm": 0.51953125,
-      "learning_rate": 0.0018845178490148903,
-      "loss": 1.4908,
-      "step": 222000
-    },
-    {
-      "epoch": 2.8935561479940177,
-      "grad_norm": 1.1328125,
-      "learning_rate": 0.0018842577540802393,
-      "loss": 1.4947,
-      "step": 222500
-    },
-    {
-      "epoch": 2.9000585213602967,
-      "grad_norm": 0.55078125,
-      "learning_rate": 0.001883997659145588,
-      "loss": 1.497,
-      "step": 223000
-    },
-    {
-      "epoch": 2.9065608947265753,
-      "grad_norm": 0.427734375,
-      "learning_rate": 0.0018837375642109372,
-      "loss": 1.4954,
-      "step": 223500
-    },
-    {
-      "epoch": 2.913063268092854,
-      "grad_norm": 0.6953125,
-      "learning_rate": 0.001883477469276286,
-      "loss": 1.4954,
-      "step": 224000
-    },
-    {
-      "epoch": 2.9195656414591324,
-      "grad_norm": 1.0859375,
-      "learning_rate": 0.0018832173743416347,
-      "loss": 1.4906,
-      "step": 224500
-    },
-    {
-      "epoch": 2.926068014825411,
-      "grad_norm": 1.8984375,
-      "learning_rate": 0.0018829572794069837,
-      "loss": 1.4947,
-      "step": 225000
-    },
-    {
-      "epoch": 2.93257038819169,
-      "grad_norm": 0.52734375,
-      "learning_rate": 0.0018826971844723324,
-      "loss": 1.5016,
-      "step": 225500
-    },
-    {
-      "epoch": 2.9390727615579686,
-      "grad_norm": 0.74609375,
-      "learning_rate": 0.0018824370895376811,
-      "loss": 1.4978,
-      "step": 226000
-    },
-    {
-      "epoch": 2.9455751349242476,
-      "grad_norm": 1.5,
-      "learning_rate": 0.00188217699460303,
-      "loss": 1.495,
-      "step": 226500
-    },
-    {
-      "epoch": 2.952077508290526,
-      "grad_norm": 0.8359375,
-      "learning_rate": 0.001881916899668379,
-      "loss": 1.4984,
-      "step": 227000
-    },
-    {
-      "epoch": 2.9585798816568047,
-      "grad_norm": 0.54296875,
-      "learning_rate": 0.001881656804733728,
-      "loss": 1.4918,
-      "step": 227500
-    },
-    {
-      "epoch": 2.9650822550230833,
-      "grad_norm": 0.44921875,
-      "learning_rate": 0.0018813967097990767,
-      "loss": 1.4907,
-      "step": 228000
-    },
-    {
-      "epoch": 2.971584628389362,
-      "grad_norm": 0.609375,
-      "learning_rate": 0.0018811366148644255,
-      "loss": 1.4954,
-      "step": 228500
-    },
-    {
-      "epoch": 2.978087001755641,
-      "grad_norm": 0.53515625,
-      "learning_rate": 0.0018808765199297744,
-      "loss": 1.4943,
-      "step": 229000
-    },
-    {
-      "epoch": 2.9845893751219195,
-      "grad_norm": 0.640625,
-      "learning_rate": 0.0018806164249951232,
-      "loss": 1.493,
-      "step": 229500
-    },
-    {
-      "epoch": 2.991091748488198,
-      "grad_norm": 0.734375,
-      "learning_rate": 0.001880356330060472,
-      "loss": 1.4926,
-      "step": 230000
-    },
-    {
-      "epoch": 2.997594121854477,
-      "grad_norm": 0.5,
-      "learning_rate": 0.001880096235125821,
-      "loss": 1.4914,
-      "step": 230500
-    },
-    {
-      "epoch": 3.0,
-      "eval_loss": 1.4690916538238525,
-      "eval_runtime": 0.9958,
-      "eval_samples_per_second": 1004.258,
-      "eval_steps_per_second": 8.034,
-      "step": 230685
-    },
-    {
-      "epoch": 3.0040964952207556,
-      "grad_norm": 0.6484375,
-      "learning_rate": 0.0018798361401911698,
-      "loss": 1.4899,
-      "step": 231000
-    },
-    {
-      "epoch": 3.010598868587034,
-      "grad_norm": 0.515625,
-      "learning_rate": 0.0018795760452565188,
-      "loss": 1.4889,
-      "step": 231500
-    },
-    {
-      "epoch": 3.0171012419533128,
-      "grad_norm": 0.9296875,
-      "learning_rate": 0.0018793159503218675,
-      "loss": 1.4908,
-      "step": 232000
-    },
-    {
-      "epoch": 3.0236036153195918,
-      "grad_norm": 1.75,
-      "learning_rate": 0.0018790558553872163,
-      "loss": 1.4906,
-      "step": 232500
-    },
-    {
-      "epoch": 3.0301059886858703,
-      "grad_norm": 0.5390625,
-      "learning_rate": 0.0018787957604525652,
-      "loss": 1.4915,
-      "step": 233000
-    },
-    {
-      "epoch": 3.036608362052149,
-      "grad_norm": 0.62890625,
-      "learning_rate": 0.001878535665517914,
-      "loss": 1.4917,
-      "step": 233500
-    },
-    {
-      "epoch": 3.043110735418428,
-      "grad_norm": 0.90625,
-      "learning_rate": 0.0018782755705832631,
-      "loss": 1.4915,
-      "step": 234000
-    },
-    {
-      "epoch": 3.0496131087847065,
-      "grad_norm": 0.5546875,
-      "learning_rate": 0.0018780154756486119,
-      "loss": 1.4875,
-      "step": 234500
-    },
-    {
-      "epoch": 3.056115482150985,
-      "grad_norm": 1.0703125,
-      "learning_rate": 0.0018777553807139606,
-      "loss": 1.4922,
-      "step": 235000
-    },
-    {
-      "epoch": 3.0626178555172636,
-      "grad_norm": 0.6328125,
-      "learning_rate": 0.0018774952857793096,
-      "loss": 1.4917,
-      "step": 235500
-    },
-    {
-      "epoch": 3.0691202288835426,
-      "grad_norm": 0.734375,
-      "learning_rate": 0.0018772351908446583,
-      "loss": 1.4925,
-      "step": 236000
-    },
-    {
-      "epoch": 3.075622602249821,
-      "grad_norm": 0.451171875,
-      "learning_rate": 0.001876975095910007,
-      "loss": 1.4906,
-      "step": 236500
-    },
-    {
-      "epoch": 3.0821249756160998,
-      "grad_norm": 1.2734375,
-      "learning_rate": 0.001876715000975356,
-      "loss": 1.4913,
-      "step": 237000
-    },
-    {
-      "epoch": 3.088627348982379,
-      "grad_norm": 0.8359375,
-      "learning_rate": 0.0018764549060407048,
-      "loss": 1.4908,
-      "step": 237500
-    },
-    {
-      "epoch": 3.0951297223486574,
-      "grad_norm": 1.625,
-      "learning_rate": 0.001876194811106054,
-      "loss": 1.4913,
-      "step": 238000
-    },
-    {
-      "epoch": 3.101632095714936,
-      "grad_norm": 0.984375,
-      "learning_rate": 0.0018759347161714027,
-      "loss": 1.4925,
-      "step": 238500
-    },
-    {
-      "epoch": 3.1081344690812145,
-      "grad_norm": 0.6015625,
-      "learning_rate": 0.0018756746212367514,
-      "loss": 1.4905,
-      "step": 239000
-    },
-    {
-      "epoch": 3.1146368424474935,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.0018754145263021004,
-      "loss": 1.4899,
-      "step": 239500
-    },
-    {
-      "epoch": 3.121139215813772,
-      "grad_norm": 0.640625,
-      "learning_rate": 0.0018751544313674491,
-      "loss": 1.4862,
-      "step": 240000
-    },
-    {
-      "epoch": 3.1276415891800506,
-      "grad_norm": 0.482421875,
-      "learning_rate": 0.0018748943364327978,
-      "loss": 1.4903,
-      "step": 240500
-    },
-    {
-      "epoch": 3.134143962546329,
-      "grad_norm": 0.4921875,
-      "learning_rate": 0.0018746342414981468,
-      "loss": 1.4926,
-      "step": 241000
-    },
-    {
-      "epoch": 3.1406463359126082,
-      "grad_norm": 0.48046875,
-      "learning_rate": 0.0018743741465634958,
-      "loss": 1.4893,
-      "step": 241500
-    },
-    {
-      "epoch": 3.147148709278887,
-      "grad_norm": 0.50390625,
-      "learning_rate": 0.0018741140516288447,
-      "loss": 1.4858,
-      "step": 242000
-    },
-    {
-      "epoch": 3.1536510826451654,
-      "grad_norm": 0.578125,
-      "learning_rate": 0.0018738539566941935,
-      "loss": 1.4875,
-      "step": 242500
-    },
-    {
-      "epoch": 3.1601534560114444,
-      "grad_norm": 0.5859375,
-      "learning_rate": 0.0018735938617595422,
-      "loss": 1.4881,
-      "step": 243000
-    },
-    {
-      "epoch": 3.166655829377723,
-      "grad_norm": 0.494140625,
-      "learning_rate": 0.0018733337668248912,
-      "loss": 1.4878,
-      "step": 243500
-    },
-    {
-      "epoch": 3.1731582027440015,
-      "grad_norm": 0.59375,
-      "learning_rate": 0.00187307367189024,
-      "loss": 1.4921,
-      "step": 244000
-    },
-    {
-      "epoch": 3.17966057611028,
-      "grad_norm": 0.76171875,
-      "learning_rate": 0.0018728135769555886,
-      "loss": 1.4891,
-      "step": 244500
-    },
-    {
-      "epoch": 3.186162949476559,
-      "grad_norm": 0.59375,
-      "learning_rate": 0.0018725534820209378,
-      "loss": 1.4898,
-      "step": 245000
-    },
-    {
-      "epoch": 3.1926653228428377,
-      "grad_norm": 1.1953125,
-      "learning_rate": 0.0018722933870862866,
-      "loss": 1.4902,
-      "step": 245500
-    },
-    {
-      "epoch": 3.1991676962091162,
-      "grad_norm": 1.546875,
-      "learning_rate": 0.0018720332921516355,
-      "loss": 1.4911,
-      "step": 246000
-    },
-    {
-      "epoch": 3.205670069575395,
-      "grad_norm": 1.015625,
-      "learning_rate": 0.0018717731972169842,
-      "loss": 1.4868,
-      "step": 246500
-    },
-    {
-      "epoch": 3.212172442941674,
-      "grad_norm": 0.60546875,
-      "learning_rate": 0.001871513102282333,
-      "loss": 1.4901,
-      "step": 247000
-    },
-    {
-      "epoch": 3.2186748163079524,
-      "grad_norm": 0.51171875,
-      "learning_rate": 0.001871253007347682,
-      "loss": 1.4866,
-      "step": 247500
-    },
-    {
-      "epoch": 3.225177189674231,
-      "grad_norm": 0.55078125,
-      "learning_rate": 0.0018709929124130307,
-      "loss": 1.489,
-      "step": 248000
-    },
-    {
-      "epoch": 3.23167956304051,
-      "grad_norm": 0.73828125,
-      "learning_rate": 0.0018707328174783799,
-      "loss": 1.4874,
-      "step": 248500
-    },
-    {
-      "epoch": 3.2381819364067885,
-      "grad_norm": 1.453125,
-      "learning_rate": 0.0018704727225437286,
-      "loss": 1.4909,
-      "step": 249000
-    },
-    {
-      "epoch": 3.244684309773067,
-      "grad_norm": 0.51171875,
-      "learning_rate": 0.0018702126276090773,
-      "loss": 1.487,
-      "step": 249500
-    },
-    {
-      "epoch": 3.2511866831393457,
-      "grad_norm": 0.54296875,
-      "learning_rate": 0.0018699525326744263,
-      "loss": 1.4912,
-      "step": 250000
-    },
-    {
-      "epoch": 3.2576890565056247,
-      "grad_norm": 0.86328125,
-      "learning_rate": 0.001869692437739775,
-      "loss": 1.4872,
-      "step": 250500
-    },
-    {
-      "epoch": 3.2641914298719032,
-      "grad_norm": 0.4609375,
-      "learning_rate": 0.0018694323428051238,
-      "loss": 1.4817,
-      "step": 251000
-    },
-    {
-      "epoch": 3.270693803238182,
-      "grad_norm": 0.49609375,
-      "learning_rate": 0.0018691722478704727,
-      "loss": 1.4821,
-      "step": 251500
-    },
-    {
-      "epoch": 3.277196176604461,
-      "grad_norm": 0.51171875,
-      "learning_rate": 0.0018689121529358215,
-      "loss": 1.4884,
-      "step": 252000
-    },
-    {
-      "epoch": 3.2836985499707394,
-      "grad_norm": 0.87109375,
-      "learning_rate": 0.0018686520580011706,
-      "loss": 1.4884,
-      "step": 252500
-    },
-    {
-      "epoch": 3.290200923337018,
-      "grad_norm": 0.71875,
-      "learning_rate": 0.0018683919630665194,
-      "loss": 1.4864,
-      "step": 253000
-    },
-    {
-      "epoch": 3.2967032967032965,
-      "grad_norm": 0.62109375,
-      "learning_rate": 0.0018681318681318681,
-      "loss": 1.4894,
-      "step": 253500
-    },
-    {
-      "epoch": 3.3032056700695756,
-      "grad_norm": 0.6484375,
-      "learning_rate": 0.001867871773197217,
-      "loss": 1.482,
-      "step": 254000
-    },
-    {
-      "epoch": 3.309708043435854,
-      "grad_norm": 0.50390625,
-      "learning_rate": 0.0018676116782625658,
-      "loss": 1.4864,
-      "step": 254500
-    },
-    {
-      "epoch": 3.3162104168021327,
-      "grad_norm": 0.490234375,
-      "learning_rate": 0.0018673515833279146,
-      "loss": 1.488,
-      "step": 255000
-    },
-    {
-      "epoch": 3.3227127901684117,
-      "grad_norm": 0.78125,
-      "learning_rate": 0.0018670914883932635,
-      "loss": 1.4858,
-      "step": 255500
-    },
-    {
-      "epoch": 3.3292151635346903,
-      "grad_norm": 1.0078125,
-      "learning_rate": 0.0018668313934586125,
-      "loss": 1.4894,
-      "step": 256000
-    },
-    {
-      "epoch": 3.335717536900969,
-      "grad_norm": 0.55078125,
-      "learning_rate": 0.0018665712985239614,
-      "loss": 1.4929,
-      "step": 256500
-    },
-    {
-      "epoch": 3.3422199102672474,
-      "grad_norm": 0.7734375,
-      "learning_rate": 0.0018663112035893102,
-      "loss": 1.4937,
-      "step": 257000
-    },
-    {
-      "epoch": 3.3487222836335264,
-      "grad_norm": 0.58203125,
-      "learning_rate": 0.001866051108654659,
-      "loss": 1.4913,
-      "step": 257500
-    },
-    {
-      "epoch": 3.355224656999805,
-      "grad_norm": 0.91015625,
-      "learning_rate": 0.0018657910137200079,
-      "loss": 1.4903,
-      "step": 258000
-    },
-    {
-      "epoch": 3.3617270303660836,
-      "grad_norm": 0.546875,
-      "learning_rate": 0.0018655309187853566,
-      "loss": 1.4862,
-      "step": 258500
-    },
-    {
-      "epoch": 3.368229403732362,
-      "grad_norm": 0.4453125,
-      "learning_rate": 0.0018652708238507054,
-      "loss": 1.4836,
-      "step": 259000
-    },
-    {
-      "epoch": 3.374731777098641,
-      "grad_norm": 0.490234375,
-      "learning_rate": 0.0018650107289160545,
-      "loss": 1.4841,
-      "step": 259500
-    },
-    {
-      "epoch": 3.3812341504649197,
-      "grad_norm": 0.4609375,
-      "learning_rate": 0.0018647506339814033,
-      "loss": 1.4857,
-      "step": 260000
-    },
-    {
-      "epoch": 3.3877365238311983,
-      "grad_norm": 0.71875,
-      "learning_rate": 0.0018644905390467522,
-      "loss": 1.4833,
-      "step": 260500
-    },
-    {
-      "epoch": 3.394238897197477,
-      "grad_norm": 1.375,
-      "learning_rate": 0.001864230444112101,
-      "loss": 1.487,
-      "step": 261000
-    },
-    {
-      "epoch": 3.400741270563756,
-      "grad_norm": 0.5078125,
-      "learning_rate": 0.0018639703491774497,
-      "loss": 1.4886,
-      "step": 261500
-    },
-    {
-      "epoch": 3.4072436439300344,
-      "grad_norm": 0.859375,
-      "learning_rate": 0.0018637102542427987,
-      "loss": 1.4843,
-      "step": 262000
-    },
-    {
-      "epoch": 3.413746017296313,
-      "grad_norm": 0.5859375,
-      "learning_rate": 0.0018634501593081474,
-      "loss": 1.4825,
-      "step": 262500
-    },
-    {
-      "epoch": 3.420248390662592,
-      "grad_norm": 0.59765625,
-      "learning_rate": 0.0018631900643734966,
-      "loss": 1.4808,
-      "step": 263000
-    },
-    {
-      "epoch": 3.4267507640288706,
-      "grad_norm": 0.66015625,
-      "learning_rate": 0.0018629299694388453,
-      "loss": 1.4814,
-      "step": 263500
-    },
-    {
-      "epoch": 3.433253137395149,
-      "grad_norm": 3.3125,
-      "learning_rate": 0.001862669874504194,
-      "loss": 1.4807,
-      "step": 264000
-    },
-    {
-      "epoch": 3.4397555107614277,
-      "grad_norm": 0.76171875,
-      "learning_rate": 0.001862409779569543,
-      "loss": 1.4817,
-      "step": 264500
-    },
-    {
-      "epoch": 3.4462578841277067,
-      "grad_norm": 0.609375,
-      "learning_rate": 0.0018621496846348918,
-      "loss": 1.4824,
-      "step": 265000
-    },
-    {
-      "epoch": 3.4527602574939853,
-      "grad_norm": 1.296875,
-      "learning_rate": 0.0018618895897002405,
-      "loss": 1.4813,
-      "step": 265500
-    },
-    {
-      "epoch": 3.459262630860264,
-      "grad_norm": 25.0,
-      "learning_rate": 0.0018616294947655895,
-      "loss": 1.4827,
-      "step": 266000
-    },
-    {
-      "epoch": 3.465765004226543,
-      "grad_norm": 0.5390625,
-      "learning_rate": 0.0018613693998309382,
-      "loss": 1.4771,
-      "step": 266500
-    },
-    {
-      "epoch": 3.4722673775928214,
-      "grad_norm": 0.54296875,
-      "learning_rate": 0.0018611093048962874,
-      "loss": 1.4795,
-      "step": 267000
-    },
-    {
-      "epoch": 3.4787697509591,
-      "grad_norm": 1.1953125,
-      "learning_rate": 0.001860849209961636,
-      "loss": 1.4776,
-      "step": 267500
-    },
-    {
-      "epoch": 3.4852721243253786,
-      "grad_norm": 0.5078125,
-      "learning_rate": 0.0018605891150269848,
-      "loss": 1.4828,
-      "step": 268000
-    },
-    {
-      "epoch": 3.4917744976916576,
-      "grad_norm": 0.474609375,
-      "learning_rate": 0.0018603290200923338,
-      "loss": 1.4827,
-      "step": 268500
-    },
-    {
-      "epoch": 3.498276871057936,
-      "grad_norm": 0.90625,
-      "learning_rate": 0.0018600689251576825,
-      "loss": 1.4794,
-      "step": 269000
-    },
-    {
-      "epoch": 3.5047792444242147,
-      "grad_norm": 2.8125,
-      "learning_rate": 0.0018598088302230313,
-      "loss": 1.4791,
-      "step": 269500
-    },
-    {
-      "epoch": 3.5112816177904937,
-      "grad_norm": 0.53515625,
-      "learning_rate": 0.0018595487352883802,
-      "loss": 1.4805,
-      "step": 270000
-    },
-    {
-      "epoch": 3.5177839911567723,
-      "grad_norm": 0.53515625,
-      "learning_rate": 0.0018592886403537292,
-      "loss": 1.4781,
-      "step": 270500
-    },
-    {
-      "epoch": 3.524286364523051,
-      "grad_norm": 0.53515625,
-      "learning_rate": 0.0018590285454190782,
-      "loss": 1.4817,
-      "step": 271000
-    },
-    {
-      "epoch": 3.5307887378893295,
-      "grad_norm": 0.4375,
-      "learning_rate": 0.001858768450484427,
-      "loss": 1.4873,
-      "step": 271500
-    },
-    {
-      "epoch": 3.537291111255608,
-      "grad_norm": 0.486328125,
-      "learning_rate": 0.0018585083555497756,
-      "loss": 1.4794,
-      "step": 272000
-    },
-    {
-      "epoch": 3.543793484621887,
-      "grad_norm": 0.453125,
-      "learning_rate": 0.0018582482606151246,
-      "loss": 1.4802,
-      "step": 272500
-    },
-    {
-      "epoch": 3.5502958579881656,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.0018579881656804733,
-      "loss": 1.4799,
-      "step": 273000
-    },
-    {
-      "epoch": 3.5567982313544446,
-      "grad_norm": 0.6640625,
-      "learning_rate": 0.001857728070745822,
-      "loss": 1.4772,
-      "step": 273500
-    },
-    {
-      "epoch": 3.563300604720723,
-      "grad_norm": 0.56640625,
-      "learning_rate": 0.0018574679758111712,
-      "loss": 1.4766,
-      "step": 274000
-    },
-    {
-      "epoch": 3.5698029780870018,
-      "grad_norm": 0.53515625,
-      "learning_rate": 0.00185720788087652,
-      "loss": 1.4788,
-      "step": 274500
-    },
-    {
-      "epoch": 3.5763053514532803,
-      "grad_norm": 0.5859375,
-      "learning_rate": 0.001856947785941869,
-      "loss": 1.4778,
-      "step": 275000
-    },
-    {
-      "epoch": 3.582807724819559,
-      "grad_norm": 0.6796875,
-      "learning_rate": 0.0018566876910072177,
-      "loss": 1.4819,
-      "step": 275500
-    },
-    {
-      "epoch": 3.589310098185838,
-      "grad_norm": 0.6328125,
-      "learning_rate": 0.0018564275960725664,
-      "loss": 1.4799,
-      "step": 276000
-    },
-    {
-      "epoch": 3.5958124715521165,
-      "grad_norm": 0.515625,
-      "learning_rate": 0.0018561675011379154,
-      "loss": 1.4789,
-      "step": 276500
-    },
-    {
-      "epoch": 3.6023148449183955,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.0018559074062032641,
-      "loss": 1.4805,
-      "step": 277000
-    },
-    {
-      "epoch": 3.608817218284674,
-      "grad_norm": 0.5078125,
-      "learning_rate": 0.0018556473112686133,
-      "loss": 1.4783,
-      "step": 277500
-    },
-    {
-      "epoch": 3.6153195916509526,
-      "grad_norm": 0.546875,
-      "learning_rate": 0.001855387216333962,
-      "loss": 1.4763,
-      "step": 278000
-    },
-    {
-      "epoch": 3.621821965017231,
-      "grad_norm": 0.65625,
-      "learning_rate": 0.0018551271213993108,
-      "loss": 1.4771,
-      "step": 278500
-    },
-    {
-      "epoch": 3.6283243383835098,
-      "grad_norm": 0.859375,
-      "learning_rate": 0.0018548670264646597,
-      "loss": 1.4786,
-      "step": 279000
-    },
-    {
-      "epoch": 3.6348267117497888,
-      "grad_norm": 3.390625,
-      "learning_rate": 0.0018546069315300085,
-      "loss": 1.4806,
-      "step": 279500
-    },
-    {
-      "epoch": 3.6413290851160673,
-      "grad_norm": 0.51953125,
-      "learning_rate": 0.0018543468365953572,
-      "loss": 1.4846,
-      "step": 280000
-    },
-    {
-      "epoch": 3.647831458482346,
-      "grad_norm": 0.455078125,
-      "learning_rate": 0.0018540867416607062,
-      "loss": 1.487,
-      "step": 280500
-    },
-    {
-      "epoch": 3.654333831848625,
-      "grad_norm": 0.55078125,
-      "learning_rate": 0.001853826646726055,
-      "loss": 1.4828,
-      "step": 281000
-    },
-    {
-      "epoch": 3.6608362052149035,
-      "grad_norm": 0.5234375,
-      "learning_rate": 0.001853566551791404,
-      "loss": 1.4859,
-      "step": 281500
-    },
-    {
-      "epoch": 3.667338578581182,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.0018533064568567528,
-      "loss": 1.4862,
-      "step": 282000
-    },
-    {
-      "epoch": 3.6738409519474606,
-      "grad_norm": 0.443359375,
-      "learning_rate": 0.0018530463619221016,
-      "loss": 1.4859,
-      "step": 282500
-    },
-    {
-      "epoch": 3.6803433253137396,
-      "grad_norm": 2.15625,
-      "learning_rate": 0.0018527862669874505,
-      "loss": 1.4826,
-      "step": 283000
-    },
-    {
-      "epoch": 3.686845698680018,
-      "grad_norm": 0.455078125,
-      "learning_rate": 0.0018525261720527993,
-      "loss": 1.4795,
-      "step": 283500
-    },
-    {
-      "epoch": 3.693348072046297,
-      "grad_norm": 0.62109375,
-      "learning_rate": 0.001852266077118148,
-      "loss": 1.4778,
-      "step": 284000
-    },
-    {
-      "epoch": 3.699850445412576,
-      "grad_norm": 0.451171875,
-      "learning_rate": 0.001852005982183497,
-      "loss": 1.4837,
-      "step": 284500
-    },
-    {
-      "epoch": 3.7063528187788544,
-      "grad_norm": 0.50390625,
-      "learning_rate": 0.001851745887248846,
-      "loss": 1.4808,
-      "step": 285000
-    },
-    {
-      "epoch": 3.712855192145133,
-      "grad_norm": 0.80859375,
-      "learning_rate": 0.0018514857923141949,
-      "loss": 1.4796,
-      "step": 285500
-    },
-    {
-      "epoch": 3.7193575655114115,
-      "grad_norm": 0.78515625,
-      "learning_rate": 0.0018512256973795436,
-      "loss": 1.477,
-      "step": 286000
-    },
-    {
-      "epoch": 3.7258599388776905,
-      "grad_norm": 0.52734375,
-      "learning_rate": 0.0018509656024448924,
-      "loss": 1.475,
-      "step": 286500
-    },
-    {
-      "epoch": 3.732362312243969,
-      "grad_norm": 0.6796875,
-      "learning_rate": 0.0018507055075102413,
-      "loss": 1.4817,
-      "step": 287000
-    },
-    {
-      "epoch": 3.7388646856102477,
-      "grad_norm": 0.54296875,
-      "learning_rate": 0.00185044541257559,
-      "loss": 1.4804,
-      "step": 287500
-    },
-    {
-      "epoch": 3.7453670589765267,
-      "grad_norm": 0.75,
-      "learning_rate": 0.0018501853176409388,
-      "loss": 1.4767,
-      "step": 288000
-    },
-    {
-      "epoch": 3.7518694323428052,
-      "grad_norm": 0.6328125,
-      "learning_rate": 0.001849925222706288,
-      "loss": 1.4779,
-      "step": 288500
-    },
-    {
-      "epoch": 3.758371805709084,
-      "grad_norm": 0.61328125,
-      "learning_rate": 0.0018496651277716367,
-      "loss": 1.4767,
-      "step": 289000
-    },
-    {
-      "epoch": 3.7648741790753624,
-      "grad_norm": 0.91015625,
-      "learning_rate": 0.0018494050328369857,
-      "loss": 1.4749,
-      "step": 289500
-    },
-    {
-      "epoch": 3.771376552441641,
-      "grad_norm": 0.4609375,
-      "learning_rate": 0.0018491449379023344,
-      "loss": 1.4803,
-      "step": 290000
-    },
-    {
-      "epoch": 3.77787892580792,
-      "grad_norm": 0.609375,
-      "learning_rate": 0.0018488848429676831,
-      "loss": 1.483,
-      "step": 290500
-    },
-    {
-      "epoch": 3.7843812991741985,
-      "grad_norm": 0.61328125,
-      "learning_rate": 0.001848624748033032,
-      "loss": 1.4785,
-      "step": 291000
-    },
-    {
-      "epoch": 3.7908836725404775,
-      "grad_norm": 0.59765625,
-      "learning_rate": 0.0018483646530983808,
-      "loss": 1.4722,
-      "step": 291500
-    },
-    {
-      "epoch": 3.797386045906756,
-      "grad_norm": 0.5390625,
-      "learning_rate": 0.00184810455816373,
-      "loss": 1.4768,
-      "step": 292000
-    },
-    {
-      "epoch": 3.8038884192730347,
-      "grad_norm": 1.1953125,
-      "learning_rate": 0.0018478444632290788,
-      "loss": 1.4764,
-      "step": 292500
-    },
-    {
-      "epoch": 3.8103907926393132,
-      "grad_norm": 0.484375,
-      "learning_rate": 0.0018475843682944275,
-      "loss": 1.4804,
-      "step": 293000
-    },
-    {
-      "epoch": 3.816893166005592,
-      "grad_norm": 0.625,
-      "learning_rate": 0.0018473242733597765,
-      "loss": 1.4773,
-      "step": 293500
-    },
-    {
-      "epoch": 3.823395539371871,
-      "grad_norm": 0.6484375,
-      "learning_rate": 0.0018470641784251252,
-      "loss": 1.4777,
-      "step": 294000
-    },
-    {
-      "epoch": 3.8298979127381494,
-      "grad_norm": 0.5078125,
-      "learning_rate": 0.001846804083490474,
-      "loss": 1.4838,
-      "step": 294500
-    },
-    {
-      "epoch": 3.836400286104428,
-      "grad_norm": 0.48046875,
-      "learning_rate": 0.0018465439885558229,
-      "loss": 1.4781,
-      "step": 295000
-    },
-    {
-      "epoch": 3.842902659470707,
-      "grad_norm": 0.51171875,
-      "learning_rate": 0.0018462838936211716,
-      "loss": 1.4859,
-      "step": 295500
-    },
-    {
-      "epoch": 3.8494050328369855,
-      "grad_norm": 1.25,
-      "learning_rate": 0.0018460237986865208,
-      "loss": 1.4856,
-      "step": 296000
-    },
-    {
-      "epoch": 3.855907406203264,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.0018457637037518695,
-      "loss": 1.4853,
-      "step": 296500
-    },
-    {
-      "epoch": 3.8624097795695427,
-      "grad_norm": 0.5546875,
-      "learning_rate": 0.0018455036088172183,
-      "loss": 1.4766,
-      "step": 297000
-    },
-    {
-      "epoch": 3.8689121529358217,
-      "grad_norm": 1.4765625,
-      "learning_rate": 0.0018452435138825672,
-      "loss": 1.4831,
-      "step": 297500
-    },
-    {
-      "epoch": 3.8754145263021003,
-      "grad_norm": 0.57421875,
-      "learning_rate": 0.001844983418947916,
-      "loss": 1.4798,
-      "step": 298000
-    },
-    {
-      "epoch": 3.881916899668379,
-      "grad_norm": 0.75,
-      "learning_rate": 0.0018447233240132647,
-      "loss": 1.4808,
-      "step": 298500
-    },
-    {
-      "epoch": 3.888419273034658,
-      "grad_norm": 0.53515625,
-      "learning_rate": 0.0018444632290786137,
-      "loss": 1.4808,
-      "step": 299000
-    },
-    {
-      "epoch": 3.8949216464009364,
-      "grad_norm": 0.61328125,
-      "learning_rate": 0.0018442031341439626,
-      "loss": 1.4744,
-      "step": 299500
-    },
-    {
-      "epoch": 3.901424019767215,
-      "grad_norm": 0.58984375,
-      "learning_rate": 0.0018439430392093116,
-      "loss": 1.4792,
-      "step": 300000
-    },
-    {
-      "epoch": 3.9079263931334935,
-      "grad_norm": 0.486328125,
-      "learning_rate": 0.0018436829442746603,
-      "loss": 1.4776,
-      "step": 300500
-    },
-    {
-      "epoch": 3.9144287664997726,
-      "grad_norm": 1.0859375,
-      "learning_rate": 0.001843422849340009,
-      "loss": 1.4733,
-      "step": 301000
-    },
-    {
-      "epoch": 3.920931139866051,
-      "grad_norm": 0.515625,
-      "learning_rate": 0.001843162754405358,
-      "loss": 1.4766,
-      "step": 301500
-    },
-    {
-      "epoch": 3.9274335132323297,
-      "grad_norm": 0.53515625,
-      "learning_rate": 0.0018429026594707068,
-      "loss": 1.4743,
-      "step": 302000
-    },
-    {
-      "epoch": 3.9339358865986087,
-      "grad_norm": 0.49609375,
-      "learning_rate": 0.0018426425645360555,
-      "loss": 1.4735,
-      "step": 302500
-    },
-    {
-      "epoch": 3.9404382599648873,
-      "grad_norm": 0.51953125,
-      "learning_rate": 0.0018423824696014047,
-      "loss": 1.4729,
-      "step": 303000
-    },
-    {
-      "epoch": 3.946940633331166,
-      "grad_norm": 0.4921875,
-      "learning_rate": 0.0018421223746667534,
-      "loss": 1.4776,
-      "step": 303500
-    },
-    {
-      "epoch": 3.9534430066974444,
-      "grad_norm": 0.625,
-      "learning_rate": 0.0018418622797321024,
-      "loss": 1.4805,
-      "step": 304000
-    },
-    {
-      "epoch": 3.959945380063723,
-      "grad_norm": 0.59375,
-      "learning_rate": 0.0018416021847974511,
-      "loss": 1.4744,
-      "step": 304500
-    },
-    {
-      "epoch": 3.966447753430002,
-      "grad_norm": 0.46875,
-      "learning_rate": 0.0018413420898627999,
-      "loss": 1.4781,
-      "step": 305000
-    },
-    {
-      "epoch": 3.9729501267962806,
-      "grad_norm": 0.53515625,
-      "learning_rate": 0.0018410819949281488,
-      "loss": 1.4763,
-      "step": 305500
-    },
-    {
-      "epoch": 3.9794525001625596,
-      "grad_norm": 0.734375,
-      "learning_rate": 0.0018408218999934976,
-      "loss": 1.4752,
-      "step": 306000
-    },
-    {
-      "epoch": 3.985954873528838,
-      "grad_norm": 1.4921875,
-      "learning_rate": 0.0018405618050588467,
-      "loss": 1.4749,
-      "step": 306500
-    },
-    {
-      "epoch": 3.9924572468951167,
-      "grad_norm": 0.66015625,
-      "learning_rate": 0.0018403017101241955,
-      "loss": 1.4747,
-      "step": 307000
-    },
-    {
-      "epoch": 3.9989596202613953,
-      "grad_norm": 1.0703125,
-      "learning_rate": 0.0018400416151895442,
-      "loss": 1.4753,
-      "step": 307500
-    },
-    {
-      "epoch": 4.0,
-      "eval_loss": 1.4534417390823364,
-      "eval_runtime": 0.9363,
-      "eval_samples_per_second": 1068.081,
-      "eval_steps_per_second": 8.545,
-      "step": 307580
-    },
-    {
-      "epoch": 4.005461993627674,
-      "grad_norm": 0.4609375,
-      "learning_rate": 0.0018397815202548932,
-      "loss": 1.4734,
-      "step": 308000
-    },
-    {
-      "epoch": 4.011964366993952,
-      "grad_norm": 0.5234375,
-      "learning_rate": 0.001839521425320242,
-      "loss": 1.4706,
-      "step": 308500
-    },
-    {
-      "epoch": 4.018466740360232,
-      "grad_norm": 0.96484375,
-      "learning_rate": 0.0018392613303855907,
-      "loss": 1.4735,
-      "step": 309000
-    },
-    {
-      "epoch": 4.0249691137265105,
-      "grad_norm": 0.5234375,
-      "learning_rate": 0.0018390012354509396,
-      "loss": 1.4777,
-      "step": 309500
-    },
-    {
-      "epoch": 4.031471487092789,
-      "grad_norm": 1.9765625,
-      "learning_rate": 0.0018387411405162883,
-      "loss": 1.4732,
-      "step": 310000
-    },
-    {
-      "epoch": 4.037973860459068,
-      "grad_norm": 0.498046875,
-      "learning_rate": 0.0018384810455816375,
-      "loss": 1.4726,
-      "step": 310500
-    },
-    {
-      "epoch": 4.044476233825346,
-      "grad_norm": 0.54296875,
-      "learning_rate": 0.0018382209506469863,
-      "loss": 1.4752,
-      "step": 311000
-    },
-    {
-      "epoch": 4.050978607191625,
-      "grad_norm": 0.90234375,
-      "learning_rate": 0.001837960855712335,
-      "loss": 1.478,
-      "step": 311500
-    },
-    {
-      "epoch": 4.057480980557903,
-      "grad_norm": 0.5078125,
-      "learning_rate": 0.001837700760777684,
-      "loss": 1.4736,
-      "step": 312000
-    },
-    {
-      "epoch": 4.063983353924183,
-      "grad_norm": 0.50390625,
-      "learning_rate": 0.0018374406658430327,
-      "loss": 1.4773,
-      "step": 312500
-    },
-    {
-      "epoch": 4.070485727290461,
-      "grad_norm": 0.79296875,
-      "learning_rate": 0.0018371805709083814,
-      "loss": 1.4747,
-      "step": 313000
-    },
-    {
-      "epoch": 4.07698810065674,
-      "grad_norm": 0.90234375,
-      "learning_rate": 0.0018369204759737304,
-      "loss": 1.4762,
-      "step": 313500
-    },
-    {
-      "epoch": 4.0834904740230185,
-      "grad_norm": 0.62109375,
-      "learning_rate": 0.0018366603810390794,
-      "loss": 1.4793,
-      "step": 314000
-    },
-    {
-      "epoch": 4.089992847389297,
-      "grad_norm": 3.640625,
-      "learning_rate": 0.0018364002861044283,
-      "loss": 1.4763,
-      "step": 314500
-    },
-    {
-      "epoch": 4.096495220755576,
-      "grad_norm": 0.5,
-      "learning_rate": 0.001836140191169777,
-      "loss": 1.4728,
-      "step": 315000
-    },
-    {
-      "epoch": 4.102997594121854,
-      "grad_norm": 1.4765625,
-      "learning_rate": 0.0018358800962351258,
-      "loss": 1.4715,
-      "step": 315500
-    },
-    {
-      "epoch": 4.109499967488133,
-      "grad_norm": 0.453125,
-      "learning_rate": 0.0018356200013004747,
-      "loss": 1.4714,
-      "step": 316000
-    },
-    {
-      "epoch": 4.116002340854412,
-      "grad_norm": 0.416015625,
-      "learning_rate": 0.0018353599063658235,
-      "loss": 1.4756,
-      "step": 316500
-    },
-    {
-      "epoch": 4.122504714220691,
-      "grad_norm": 0.54296875,
-      "learning_rate": 0.0018350998114311722,
-      "loss": 1.4792,
-      "step": 317000
-    },
-    {
-      "epoch": 4.129007087586969,
-      "grad_norm": 0.478515625,
-      "learning_rate": 0.0018348397164965214,
-      "loss": 1.4831,
-      "step": 317500
-    },
-    {
-      "epoch": 4.135509460953248,
-      "grad_norm": 0.546875,
-      "learning_rate": 0.0018345796215618701,
-      "loss": 1.4805,
-      "step": 318000
-    },
-    {
-      "epoch": 4.1420118343195265,
-      "grad_norm": 0.6484375,
-      "learning_rate": 0.001834319526627219,
-      "loss": 1.4802,
-      "step": 318500
-    },
-    {
-      "epoch": 4.148514207685805,
-      "grad_norm": 1.203125,
-      "learning_rate": 0.0018340594316925678,
-      "loss": 1.4795,
-      "step": 319000
-    },
-    {
-      "epoch": 4.155016581052084,
-      "grad_norm": 1.6484375,
-      "learning_rate": 0.0018337993367579166,
-      "loss": 1.481,
-      "step": 319500
-    },
-    {
-      "epoch": 4.161518954418363,
-      "grad_norm": 0.490234375,
-      "learning_rate": 0.0018335392418232655,
-      "loss": 1.4766,
-      "step": 320000
-    },
-    {
-      "epoch": 4.168021327784642,
-      "grad_norm": 0.6484375,
-      "learning_rate": 0.0018332791468886143,
-      "loss": 1.4774,
-      "step": 320500
-    },
-    {
-      "epoch": 4.17452370115092,
-      "grad_norm": 52.5,
-      "learning_rate": 0.001833019051953963,
-      "loss": 1.4821,
-      "step": 321000
-    },
-    {
-      "epoch": 4.181026074517199,
-      "grad_norm": 1.2421875,
-      "learning_rate": 0.0018327589570193122,
-      "loss": 1.4914,
-      "step": 321500
-    },
-    {
-      "epoch": 4.187528447883477,
-      "grad_norm": 0.50390625,
-      "learning_rate": 0.001832498862084661,
-      "loss": 1.4781,
-      "step": 322000
-    },
-    {
-      "epoch": 4.194030821249756,
-      "grad_norm": 7.96875,
-      "learning_rate": 0.0018322387671500099,
-      "loss": 1.4784,
-      "step": 322500
-    },
-    {
-      "epoch": 4.2005331946160345,
-      "grad_norm": 0.75390625,
-      "learning_rate": 0.0018319786722153586,
-      "loss": 1.4806,
-      "step": 323000
-    },
-    {
-      "epoch": 4.207035567982314,
-      "grad_norm": 0.55078125,
-      "learning_rate": 0.0018317185772807074,
-      "loss": 1.4829,
-      "step": 323500
-    },
-    {
-      "epoch": 4.2135379413485925,
-      "grad_norm": 0.7890625,
-      "learning_rate": 0.0018314584823460563,
-      "loss": 1.4921,
-      "step": 324000
-    },
-    {
-      "epoch": 4.220040314714871,
-      "grad_norm": 0.9609375,
-      "learning_rate": 0.001831198387411405,
-      "loss": 1.4937,
-      "step": 324500
-    },
-    {
-      "epoch": 4.22654268808115,
-      "grad_norm": 0.60546875,
-      "learning_rate": 0.0018309382924767542,
-      "loss": 1.495,
-      "step": 325000
-    },
-    {
-      "epoch": 4.233045061447428,
-      "grad_norm": 0.59765625,
-      "learning_rate": 0.001830678197542103,
-      "loss": 1.4985,
-      "step": 325500
-    },
-    {
-      "epoch": 4.239547434813707,
-      "grad_norm": 0.5078125,
-      "learning_rate": 0.0018304181026074517,
-      "loss": 1.5151,
-      "step": 326000
-    },
-    {
-      "epoch": 4.246049808179985,
-      "grad_norm": 1.0078125,
-      "learning_rate": 0.0018301580076728007,
-      "loss": 1.515,
-      "step": 326500
-    },
-    {
-      "epoch": 4.252552181546265,
-      "grad_norm": 0.5234375,
-      "learning_rate": 0.0018298979127381494,
-      "loss": 1.5118,
-      "step": 327000
-    },
-    {
-      "epoch": 4.259054554912543,
-      "grad_norm": 0.66796875,
-      "learning_rate": 0.0018296378178034982,
-      "loss": 1.5253,
-      "step": 327500
-    },
-    {
-      "epoch": 4.265556928278822,
-      "grad_norm": 0.69140625,
-      "learning_rate": 0.0018293777228688471,
-      "loss": 1.5161,
-      "step": 328000
-    },
-    {
-      "epoch": 4.2720593016451005,
-      "grad_norm": 0.53515625,
-      "learning_rate": 0.001829117627934196,
-      "loss": 1.4984,
-      "step": 328500
-    },
-    {
-      "epoch": 4.278561675011379,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.001828857532999545,
-      "loss": 1.4888,
-      "step": 329000
-    },
-    {
-      "epoch": 4.285064048377658,
-      "grad_norm": 3.078125,
-      "learning_rate": 0.0018285974380648938,
-      "loss": 1.4896,
-      "step": 329500
-    },
-    {
-      "epoch": 4.291566421743936,
-      "grad_norm": 0.462890625,
-      "learning_rate": 0.0018283373431302425,
-      "loss": 1.4805,
-      "step": 330000
-    },
-    {
-      "epoch": 4.298068795110215,
-      "grad_norm": 0.546875,
-      "learning_rate": 0.0018280772481955915,
-      "loss": 1.4812,
-      "step": 330500
-    },
-    {
-      "epoch": 4.304571168476494,
-      "grad_norm": 0.6015625,
-      "learning_rate": 0.0018278171532609402,
-      "loss": 1.4903,
-      "step": 331000
-    },
-    {
-      "epoch": 4.311073541842773,
-      "grad_norm": 1.34375,
-      "learning_rate": 0.001827557058326289,
-      "loss": 1.4923,
-      "step": 331500
-    },
-    {
-      "epoch": 4.317575915209051,
-      "grad_norm": 0.494140625,
-      "learning_rate": 0.0018272969633916381,
-      "loss": 1.4971,
-      "step": 332000
-    },
-    {
-      "epoch": 4.32407828857533,
-      "grad_norm": 0.44140625,
-      "learning_rate": 0.0018270368684569869,
-      "loss": 1.4985,
-      "step": 332500
-    },
-    {
-      "epoch": 4.3305806619416085,
-      "grad_norm": 1.0546875,
-      "learning_rate": 0.0018267767735223358,
-      "loss": 1.4864,
-      "step": 333000
-    },
-    {
-      "epoch": 4.337083035307887,
-      "grad_norm": 0.49609375,
-      "learning_rate": 0.0018265166785876846,
-      "loss": 1.4898,
-      "step": 333500
-    },
-    {
-      "epoch": 4.343585408674166,
-      "grad_norm": 1.640625,
-      "learning_rate": 0.0018262565836530333,
-      "loss": 1.4832,
-      "step": 334000
-    },
-    {
-      "epoch": 4.350087782040445,
-      "grad_norm": 0.51953125,
-      "learning_rate": 0.0018259964887183823,
-      "loss": 1.4832,
-      "step": 334500
-    },
-    {
-      "epoch": 4.356590155406724,
-      "grad_norm": 3.0,
-      "learning_rate": 0.001825736393783731,
-      "loss": 1.4868,
-      "step": 335000
-    },
-    {
-      "epoch": 4.363092528773002,
-      "grad_norm": 1.59375,
-      "learning_rate": 0.0018254762988490797,
-      "loss": 1.4895,
-      "step": 335500
-    },
-    {
-      "epoch": 4.369594902139281,
-      "grad_norm": 0.67578125,
-      "learning_rate": 0.001825216203914429,
-      "loss": 1.4896,
-      "step": 336000
-    },
-    {
-      "epoch": 4.376097275505559,
-      "grad_norm": 3.984375,
-      "learning_rate": 0.0018249561089797777,
-      "loss": 1.4895,
-      "step": 336500
-    },
-    {
-      "epoch": 4.382599648871838,
-      "grad_norm": 0.8828125,
-      "learning_rate": 0.0018246960140451266,
-      "loss": 1.4857,
-      "step": 337000
-    },
-    {
-      "epoch": 4.3891020222381165,
-      "grad_norm": 1.3515625,
-      "learning_rate": 0.0018244359191104753,
-      "loss": 1.4922,
-      "step": 337500
-    },
-    {
-      "epoch": 4.395604395604396,
-      "grad_norm": 0.52734375,
-      "learning_rate": 0.001824175824175824,
-      "loss": 1.492,
-      "step": 338000
-    },
-    {
-      "epoch": 4.4021067689706745,
-      "grad_norm": 0.53515625,
-      "learning_rate": 0.001823915729241173,
-      "loss": 1.4948,
-      "step": 338500
-    },
-    {
-      "epoch": 4.408609142336953,
-      "grad_norm": 0.458984375,
-      "learning_rate": 0.0018236556343065218,
-      "loss": 1.4994,
-      "step": 339000
-    },
-    {
-      "epoch": 4.415111515703232,
-      "grad_norm": 0.5859375,
-      "learning_rate": 0.001823395539371871,
-      "loss": 1.4972,
-      "step": 339500
-    },
-    {
-      "epoch": 4.42161388906951,
-      "grad_norm": 0.5234375,
-      "learning_rate": 0.0018231354444372197,
-      "loss": 1.4951,
-      "step": 340000
-    },
-    {
-      "epoch": 4.428116262435789,
-      "grad_norm": 0.55078125,
-      "learning_rate": 0.0018228753495025684,
-      "loss": 1.4894,
-      "step": 340500
-    },
-    {
-      "epoch": 4.434618635802067,
-      "grad_norm": 0.58203125,
-      "learning_rate": 0.0018226152545679174,
-      "loss": 1.4951,
-      "step": 341000
-    },
-    {
-      "epoch": 4.441121009168347,
-      "grad_norm": 0.50390625,
-      "learning_rate": 0.0018223551596332661,
-      "loss": 1.4847,
-      "step": 341500
-    },
-    {
-      "epoch": 4.447623382534625,
-      "grad_norm": 0.80859375,
-      "learning_rate": 0.0018220950646986149,
-      "loss": 1.4824,
-      "step": 342000
-    },
-    {
-      "epoch": 4.454125755900904,
-      "grad_norm": 0.5,
-      "learning_rate": 0.0018218349697639638,
-      "loss": 1.4808,
-      "step": 342500
-    },
-    {
-      "epoch": 4.4606281292671826,
-      "grad_norm": 0.5234375,
-      "learning_rate": 0.0018215748748293128,
-      "loss": 1.4828,
-      "step": 343000
-    },
-    {
-      "epoch": 4.467130502633461,
-      "grad_norm": 0.5234375,
-      "learning_rate": 0.0018213147798946617,
-      "loss": 1.4803,
-      "step": 343500
-    },
-    {
-      "epoch": 4.47363287599974,
-      "grad_norm": 0.5,
-      "learning_rate": 0.0018210546849600105,
-      "loss": 1.4776,
-      "step": 344000
-    },
-    {
-      "epoch": 4.480135249366018,
-      "grad_norm": 0.60546875,
-      "learning_rate": 0.0018207945900253592,
-      "loss": 1.4754,
-      "step": 344500
-    },
-    {
-      "epoch": 4.486637622732298,
-      "grad_norm": 0.69921875,
-      "learning_rate": 0.0018205344950907082,
-      "loss": 1.4711,
-      "step": 345000
-    },
-    {
-      "epoch": 4.493139996098576,
-      "grad_norm": 0.5,
-      "learning_rate": 0.001820274400156057,
-      "loss": 1.4715,
-      "step": 345500
-    },
-    {
-      "epoch": 4.499642369464855,
-      "grad_norm": 0.54296875,
-      "learning_rate": 0.0018200143052214057,
-      "loss": 1.4745,
-      "step": 346000
-    },
-    {
-      "epoch": 4.506144742831133,
-      "grad_norm": 0.423828125,
-      "learning_rate": 0.0018197542102867548,
-      "loss": 1.4725,
-      "step": 346500
-    },
-    {
-      "epoch": 4.512647116197412,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.0018194941153521036,
-      "loss": 1.4725,
-      "step": 347000
-    },
-    {
-      "epoch": 4.519149489563691,
-      "grad_norm": 0.53125,
-      "learning_rate": 0.0018192340204174525,
-      "loss": 1.4669,
-      "step": 347500
-    },
-    {
-      "epoch": 4.525651862929969,
-      "grad_norm": 0.5078125,
-      "learning_rate": 0.0018189739254828013,
-      "loss": 1.4681,
-      "step": 348000
-    },
-    {
-      "epoch": 4.532154236296249,
-      "grad_norm": 0.87109375,
-      "learning_rate": 0.00181871383054815,
-      "loss": 1.4667,
-      "step": 348500
-    },
-    {
-      "epoch": 4.538656609662527,
-      "grad_norm": 0.515625,
-      "learning_rate": 0.001818453735613499,
-      "loss": 1.468,
-      "step": 349000
-    },
-    {
-      "epoch": 4.545158983028806,
-      "grad_norm": 0.5703125,
-      "learning_rate": 0.0018181936406788477,
-      "loss": 1.4745,
-      "step": 349500
-    },
-    {
-      "epoch": 4.551661356395084,
-      "grad_norm": 0.46875,
-      "learning_rate": 0.0018179335457441965,
-      "loss": 1.4762,
-      "step": 350000
-    },
-    {
-      "epoch": 4.558163729761363,
-      "grad_norm": 0.72265625,
-      "learning_rate": 0.0018176734508095456,
-      "loss": 1.4789,
-      "step": 350500
-    },
-    {
-      "epoch": 4.564666103127641,
-      "grad_norm": 0.70703125,
-      "learning_rate": 0.0018174133558748944,
-      "loss": 1.475,
-      "step": 351000
-    },
-    {
-      "epoch": 4.57116847649392,
-      "grad_norm": 0.447265625,
-      "learning_rate": 0.0018171532609402433,
-      "loss": 1.4692,
-      "step": 351500
-    },
-    {
-      "epoch": 4.5776708498601995,
-      "grad_norm": 0.69140625,
-      "learning_rate": 0.001816893166005592,
-      "loss": 1.475,
-      "step": 352000
-    },
-    {
-      "epoch": 4.584173223226478,
-      "grad_norm": 0.67578125,
-      "learning_rate": 0.0018166330710709408,
-      "loss": 1.477,
-      "step": 352500
-    },
-    {
-      "epoch": 4.590675596592757,
-      "grad_norm": 0.462890625,
-      "learning_rate": 0.0018163729761362898,
-      "loss": 1.4727,
-      "step": 353000
-    },
-    {
-      "epoch": 4.597177969959035,
-      "grad_norm": 2.53125,
-      "learning_rate": 0.0018161128812016385,
-      "loss": 1.4759,
-      "step": 353500
-    },
-    {
-      "epoch": 4.603680343325314,
-      "grad_norm": 0.671875,
-      "learning_rate": 0.0018158527862669877,
-      "loss": 1.4758,
-      "step": 354000
-    },
-    {
-      "epoch": 4.610182716691592,
-      "grad_norm": 0.76171875,
-      "learning_rate": 0.0018155926913323364,
-      "loss": 1.4732,
-      "step": 354500
-    },
-    {
-      "epoch": 4.616685090057871,
-      "grad_norm": 0.4375,
-      "learning_rate": 0.0018153325963976852,
-      "loss": 1.4689,
-      "step": 355000
-    },
-    {
-      "epoch": 4.623187463424149,
-      "grad_norm": 0.765625,
-      "learning_rate": 0.0018150725014630341,
-      "loss": 1.4703,
-      "step": 355500
-    },
-    {
-      "epoch": 4.629689836790429,
-      "grad_norm": 0.51953125,
-      "learning_rate": 0.0018148124065283829,
-      "loss": 1.4674,
-      "step": 356000
-    },
-    {
-      "epoch": 4.6361922101567075,
-      "grad_norm": 1.1171875,
-      "learning_rate": 0.0018145523115937316,
-      "loss": 1.4734,
-      "step": 356500
-    },
-    {
-      "epoch": 4.642694583522986,
-      "grad_norm": 0.59375,
-      "learning_rate": 0.0018142922166590806,
-      "loss": 1.4702,
-      "step": 357000
-    },
-    {
-      "epoch": 4.649196956889265,
-      "grad_norm": 0.578125,
-      "learning_rate": 0.0018140321217244295,
-      "loss": 1.4727,
-      "step": 357500
-    },
-    {
-      "epoch": 4.655699330255543,
-      "grad_norm": 0.478515625,
-      "learning_rate": 0.0018137720267897785,
-      "loss": 1.4695,
-      "step": 358000
-    },
-    {
-      "epoch": 4.662201703621822,
-      "grad_norm": 0.51171875,
-      "learning_rate": 0.0018135119318551272,
-      "loss": 1.4736,
-      "step": 358500
-    },
-    {
-      "epoch": 4.6687040769881,
-      "grad_norm": 0.498046875,
-      "learning_rate": 0.001813251836920476,
-      "loss": 1.4751,
-      "step": 359000
-    },
-    {
-      "epoch": 4.675206450354379,
-      "grad_norm": 0.55078125,
-      "learning_rate": 0.001812991741985825,
-      "loss": 1.4731,
-      "step": 359500
-    },
-    {
-      "epoch": 4.681708823720658,
-      "grad_norm": 0.56640625,
-      "learning_rate": 0.0018127316470511736,
-      "loss": 1.4729,
-      "step": 360000
-    },
-    {
-      "epoch": 4.688211197086937,
-      "grad_norm": 0.67578125,
-      "learning_rate": 0.0018124715521165224,
-      "loss": 1.4717,
-      "step": 360500
-    },
-    {
-      "epoch": 4.6947135704532155,
-      "grad_norm": 0.48046875,
-      "learning_rate": 0.0018122114571818716,
-      "loss": 1.4695,
-      "step": 361000
-    },
-    {
-      "epoch": 4.701215943819494,
-      "grad_norm": 0.494140625,
-      "learning_rate": 0.0018119513622472203,
-      "loss": 1.4721,
-      "step": 361500
-    },
-    {
-      "epoch": 4.707718317185773,
-      "grad_norm": 0.62109375,
-      "learning_rate": 0.0018116912673125693,
-      "loss": 1.4716,
-      "step": 362000
-    },
-    {
-      "epoch": 4.714220690552051,
-      "grad_norm": 0.5703125,
-      "learning_rate": 0.001811431172377918,
-      "loss": 1.466,
-      "step": 362500
-    },
-    {
-      "epoch": 4.72072306391833,
-      "grad_norm": 3.59375,
-      "learning_rate": 0.0018111710774432667,
-      "loss": 1.4712,
-      "step": 363000
-    },
-    {
-      "epoch": 4.727225437284609,
-      "grad_norm": 0.62109375,
-      "learning_rate": 0.0018109109825086157,
-      "loss": 1.4683,
-      "step": 363500
-    },
-    {
-      "epoch": 4.733727810650888,
-      "grad_norm": 1.390625,
-      "learning_rate": 0.0018106508875739644,
-      "loss": 1.4724,
-      "step": 364000
-    },
-    {
-      "epoch": 4.740230184017166,
-      "grad_norm": 0.51953125,
-      "learning_rate": 0.0018103907926393132,
-      "loss": 1.4704,
-      "step": 364500
-    },
-    {
-      "epoch": 4.746732557383445,
-      "grad_norm": 0.578125,
-      "learning_rate": 0.0018101306977046623,
-      "loss": 1.4699,
-      "step": 365000
-    },
-    {
-      "epoch": 4.7532349307497235,
-      "grad_norm": 0.58203125,
-      "learning_rate": 0.001809870602770011,
-      "loss": 1.4741,
-      "step": 365500
-    },
-    {
-      "epoch": 4.759737304116002,
-      "grad_norm": 0.5546875,
-      "learning_rate": 0.00180961050783536,
-      "loss": 1.4728,
-      "step": 366000
-    },
-    {
-      "epoch": 4.766239677482281,
-      "grad_norm": 0.59765625,
-      "learning_rate": 0.0018093504129007088,
-      "loss": 1.4674,
-      "step": 366500
-    },
-    {
-      "epoch": 4.77274205084856,
-      "grad_norm": 1.03125,
-      "learning_rate": 0.0018090903179660575,
-      "loss": 1.4757,
-      "step": 367000
-    },
-    {
-      "epoch": 4.779244424214839,
-      "grad_norm": 0.65625,
-      "learning_rate": 0.0018088302230314065,
-      "loss": 1.4755,
-      "step": 367500
-    },
-    {
-      "epoch": 4.785746797581117,
-      "grad_norm": 0.6953125,
-      "learning_rate": 0.0018085701280967552,
-      "loss": 1.4721,
-      "step": 368000
-    },
-    {
-      "epoch": 4.792249170947396,
-      "grad_norm": 2.375,
-      "learning_rate": 0.0018083100331621044,
-      "loss": 1.473,
-      "step": 368500
-    },
-    {
-      "epoch": 4.798751544313674,
-      "grad_norm": 0.515625,
-      "learning_rate": 0.0018080499382274531,
-      "loss": 1.472,
-      "step": 369000
-    },
-    {
-      "epoch": 4.805253917679953,
-      "grad_norm": 0.443359375,
-      "learning_rate": 0.0018077898432928019,
-      "loss": 1.4735,
-      "step": 369500
-    },
-    {
-      "epoch": 4.8117562910462315,
-      "grad_norm": 0.515625,
-      "learning_rate": 0.0018075297483581508,
-      "loss": 1.4696,
-      "step": 370000
-    },
-    {
-      "epoch": 4.818258664412511,
-      "grad_norm": 0.466796875,
-      "learning_rate": 0.0018072696534234996,
-      "loss": 1.4682,
-      "step": 370500
-    },
-    {
-      "epoch": 4.8247610377787895,
-      "grad_norm": 0.53515625,
-      "learning_rate": 0.0018070095584888483,
-      "loss": 1.4683,
-      "step": 371000
-    },
-    {
-      "epoch": 4.831263411145068,
-      "grad_norm": 0.453125,
-      "learning_rate": 0.0018067494635541973,
-      "loss": 1.4678,
-      "step": 371500
-    },
-    {
-      "epoch": 4.837765784511347,
-      "grad_norm": 1.1171875,
-      "learning_rate": 0.0018064893686195462,
-      "loss": 1.4719,
-      "step": 372000
-    },
-    {
-      "epoch": 4.844268157877625,
-      "grad_norm": 0.55078125,
-      "learning_rate": 0.0018062292736848952,
-      "loss": 1.4704,
-      "step": 372500
-    },
-    {
-      "epoch": 4.850770531243904,
-      "grad_norm": 0.65234375,
-      "learning_rate": 0.001805969178750244,
-      "loss": 1.4744,
-      "step": 373000
-    },
-    {
-      "epoch": 4.857272904610182,
-      "grad_norm": 0.99609375,
-      "learning_rate": 0.0018057090838155927,
-      "loss": 1.4697,
-      "step": 373500
-    },
-    {
-      "epoch": 4.863775277976462,
-      "grad_norm": 0.52734375,
-      "learning_rate": 0.0018054489888809416,
-      "loss": 1.4715,
-      "step": 374000
-    },
-    {
-      "epoch": 4.87027765134274,
-      "grad_norm": 0.5078125,
-      "learning_rate": 0.0018051888939462904,
-      "loss": 1.4689,
-      "step": 374500
-    },
-    {
-      "epoch": 4.876780024709019,
-      "grad_norm": 0.80078125,
-      "learning_rate": 0.001804928799011639,
-      "loss": 1.4676,
-      "step": 375000
-    },
-    {
-      "epoch": 4.8832823980752975,
-      "grad_norm": 0.51171875,
-      "learning_rate": 0.0018046687040769883,
-      "loss": 1.4726,
-      "step": 375500
-    },
-    {
-      "epoch": 4.889784771441576,
-      "grad_norm": 0.87109375,
-      "learning_rate": 0.001804408609142337,
-      "loss": 1.4689,
-      "step": 376000
-    },
-    {
-      "epoch": 4.896287144807855,
-      "grad_norm": 0.55859375,
-      "learning_rate": 0.001804148514207686,
-      "loss": 1.4697,
-      "step": 376500
-    },
-    {
-      "epoch": 4.902789518174133,
-      "grad_norm": 0.4921875,
-      "learning_rate": 0.0018038884192730347,
-      "loss": 1.4706,
-      "step": 377000
-    },
-    {
-      "epoch": 4.909291891540413,
-      "grad_norm": 0.5859375,
-      "learning_rate": 0.0018036283243383835,
-      "loss": 1.4714,
-      "step": 377500
-    },
-    {
-      "epoch": 4.915794264906691,
-      "grad_norm": 0.46875,
-      "learning_rate": 0.0018033682294037324,
-      "loss": 1.466,
-      "step": 378000
-    },
-    {
-      "epoch": 4.92229663827297,
-      "grad_norm": 1.1875,
-      "learning_rate": 0.0018031081344690812,
-      "loss": 1.4719,
-      "step": 378500
-    },
-    {
-      "epoch": 4.928799011639248,
-      "grad_norm": 0.58984375,
-      "learning_rate": 0.0018028480395344299,
-      "loss": 1.4701,
-      "step": 379000
-    },
-    {
-      "epoch": 4.935301385005527,
-      "grad_norm": 5.46875,
-      "learning_rate": 0.001802587944599779,
-      "loss": 1.473,
-      "step": 379500
-    },
-    {
-      "epoch": 4.9418037583718055,
-      "grad_norm": 0.6796875,
-      "learning_rate": 0.0018023278496651278,
-      "loss": 1.4721,
-      "step": 380000
-    },
-    {
-      "epoch": 4.948306131738084,
-      "grad_norm": 0.59375,
-      "learning_rate": 0.0018020677547304768,
-      "loss": 1.474,
-      "step": 380500
-    },
-    {
-      "epoch": 4.9548085051043635,
-      "grad_norm": 0.546875,
-      "learning_rate": 0.0018018076597958255,
-      "loss": 1.473,
-      "step": 381000
-    },
-    {
-      "epoch": 4.961310878470642,
-      "grad_norm": 0.54296875,
-      "learning_rate": 0.0018015475648611742,
-      "loss": 1.4764,
-      "step": 381500
-    },
-    {
-      "epoch": 4.967813251836921,
-      "grad_norm": 1.9140625,
-      "learning_rate": 0.0018012874699265232,
-      "loss": 1.4717,
-      "step": 382000
-    },
-    {
-      "epoch": 4.974315625203199,
-      "grad_norm": 0.58203125,
-      "learning_rate": 0.001801027374991872,
-      "loss": 1.4759,
-      "step": 382500
-    },
-    {
-      "epoch": 4.980817998569478,
-      "grad_norm": 0.462890625,
-      "learning_rate": 0.0018007672800572211,
-      "loss": 1.4792,
-      "step": 383000
-    },
-    {
-      "epoch": 4.987320371935756,
-      "grad_norm": 0.84765625,
-      "learning_rate": 0.0018005071851225699,
-      "loss": 1.4764,
-      "step": 383500
-    },
-    {
-      "epoch": 4.993822745302035,
-      "grad_norm": 0.53515625,
-      "learning_rate": 0.0018002470901879186,
-      "loss": 1.4758,
-      "step": 384000
-    },
-    {
-      "epoch": 5.0,
-      "eval_loss": 1.4539484977722168,
-      "eval_runtime": 0.9039,
-      "eval_samples_per_second": 1106.3,
-      "eval_steps_per_second": 8.85,
-      "step": 384475
+      "epoch": 19.0,
+      "eval_loss": 1.3305245637893677,
+      "eval_runtime": 1.4316,
+      "eval_samples_per_second": 698.511,
+      "eval_steps_per_second": 0.699,
+      "step": 182628
     }
   ],
   "logging_steps": 500,
-  "max_steps": 3844750,
+  "max_steps": 480600,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 50,
   "save_steps": 500,
-  "total_flos": 3.3089171938476826e+18,
-  "train_batch_size": 128,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 3,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.3420668838410615e+19,
+  "train_batch_size": 1024,
   "trial_name": null,
   "trial_params": null
 }