diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,8277 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 9.991416309012875,
+  "eval_steps": 500,
+  "global_step": 5820,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0017167381974248926,
+      "grad_norm": 6.089517593383789,
+      "learning_rate": 3.436426116838488e-07,
+      "loss": 2.0708,
+      "step": 1
+    },
+    {
+      "epoch": 0.008583690987124463,
+      "grad_norm": 6.525425434112549,
+      "learning_rate": 1.7182130584192443e-06,
+      "loss": 2.0164,
+      "step": 5
+    },
+    {
+      "epoch": 0.017167381974248927,
+      "grad_norm": 5.859015464782715,
+      "learning_rate": 3.4364261168384886e-06,
+      "loss": 2.0159,
+      "step": 10
+    },
+    {
+      "epoch": 0.02575107296137339,
+      "grad_norm": 4.282891750335693,
+      "learning_rate": 5.154639175257732e-06,
+      "loss": 1.9357,
+      "step": 15
+    },
+    {
+      "epoch": 0.034334763948497854,
+      "grad_norm": 2.575316905975342,
+      "learning_rate": 6.872852233676977e-06,
+      "loss": 1.8673,
+      "step": 20
+    },
+    {
+      "epoch": 0.04291845493562232,
+      "grad_norm": 1.4356714487075806,
+      "learning_rate": 8.591065292096221e-06,
+      "loss": 1.7557,
+      "step": 25
+    },
+    {
+      "epoch": 0.05150214592274678,
+      "grad_norm": 0.756867527961731,
+      "learning_rate": 1.0309278350515464e-05,
+      "loss": 1.6729,
+      "step": 30
+    },
+    {
+      "epoch": 0.060085836909871244,
+      "grad_norm": 0.5709408521652222,
+      "learning_rate": 1.2027491408934708e-05,
+      "loss": 1.6029,
+      "step": 35
+    },
+    {
+      "epoch": 0.06866952789699571,
+      "grad_norm": 0.6336522698402405,
+      "learning_rate": 1.3745704467353954e-05,
+      "loss": 1.5371,
+      "step": 40
+    },
+    {
+      "epoch": 0.07725321888412018,
+      "grad_norm": 0.48585963249206543,
+      "learning_rate": 1.5463917525773197e-05,
+      "loss": 1.4468,
+      "step": 45
+    },
+    {
+      "epoch": 0.08583690987124463,
+      "grad_norm": 0.45667070150375366,
+      "learning_rate": 1.7182130584192442e-05,
+      "loss": 1.3687,
+      "step": 50
+    },
+    {
+      "epoch": 0.0944206008583691,
+      "grad_norm": 0.4194663465023041,
+      "learning_rate": 1.8900343642611683e-05,
+      "loss": 1.3212,
+      "step": 55
+    },
+    {
+      "epoch": 0.10300429184549356,
+      "grad_norm": 0.3849167227745056,
+      "learning_rate": 2.0618556701030927e-05,
+      "loss": 1.2499,
+      "step": 60
+    },
+    {
+      "epoch": 0.11158798283261803,
+      "grad_norm": 0.32962682843208313,
+      "learning_rate": 2.2336769759450175e-05,
+      "loss": 1.2345,
+      "step": 65
+    },
+    {
+      "epoch": 0.12017167381974249,
+      "grad_norm": 0.32320886850357056,
+      "learning_rate": 2.4054982817869417e-05,
+      "loss": 1.1963,
+      "step": 70
+    },
+    {
+      "epoch": 0.12875536480686695,
+      "grad_norm": 0.3570818603038788,
+      "learning_rate": 2.5773195876288658e-05,
+      "loss": 1.1658,
+      "step": 75
+    },
+    {
+      "epoch": 0.13733905579399142,
+      "grad_norm": 0.3045337498188019,
+      "learning_rate": 2.749140893470791e-05,
+      "loss": 1.1628,
+      "step": 80
+    },
+    {
+      "epoch": 0.1459227467811159,
+      "grad_norm": 0.2675187289714813,
+      "learning_rate": 2.920962199312715e-05,
+      "loss": 1.153,
+      "step": 85
+    },
+    {
+      "epoch": 0.15450643776824036,
+      "grad_norm": 0.2941209077835083,
+      "learning_rate": 3.0927835051546395e-05,
+      "loss": 1.1233,
+      "step": 90
+    },
+    {
+      "epoch": 0.1630901287553648,
+      "grad_norm": 0.30070310831069946,
+      "learning_rate": 3.2646048109965636e-05,
+      "loss": 1.1199,
+      "step": 95
+    },
+    {
+      "epoch": 0.17167381974248927,
+      "grad_norm": 0.2994473874568939,
+      "learning_rate": 3.4364261168384884e-05,
+      "loss": 1.1063,
+      "step": 100
+    },
+    {
+      "epoch": 0.18025751072961374,
+      "grad_norm": 0.301921010017395,
+      "learning_rate": 3.6082474226804125e-05,
+      "loss": 1.0991,
+      "step": 105
+    },
+    {
+      "epoch": 0.1888412017167382,
+      "grad_norm": 0.31418925523757935,
+      "learning_rate": 3.7800687285223366e-05,
+      "loss": 1.0939,
+      "step": 110
+    },
+    {
+      "epoch": 0.19742489270386265,
+      "grad_norm": 0.31536900997161865,
+      "learning_rate": 3.9518900343642614e-05,
+      "loss": 1.0961,
+      "step": 115
+    },
+    {
+      "epoch": 0.20600858369098712,
+      "grad_norm": 0.2873052954673767,
+      "learning_rate": 4.1237113402061855e-05,
+      "loss": 1.0807,
+      "step": 120
+    },
+    {
+      "epoch": 0.2145922746781116,
+      "grad_norm": 0.34555503726005554,
+      "learning_rate": 4.2955326460481096e-05,
+      "loss": 1.0645,
+      "step": 125
+    },
+    {
+      "epoch": 0.22317596566523606,
+      "grad_norm": 0.313761442899704,
+      "learning_rate": 4.467353951890035e-05,
+      "loss": 1.0513,
+      "step": 130
+    },
+    {
+      "epoch": 0.2317596566523605,
+      "grad_norm": 0.32121869921684265,
+      "learning_rate": 4.639175257731959e-05,
+      "loss": 1.0735,
+      "step": 135
+    },
+    {
+      "epoch": 0.24034334763948498,
+      "grad_norm": 0.3088555335998535,
+      "learning_rate": 4.810996563573883e-05,
+      "loss": 1.0552,
+      "step": 140
+    },
+    {
+      "epoch": 0.24892703862660945,
+      "grad_norm": 0.35492023825645447,
+      "learning_rate": 4.982817869415808e-05,
+      "loss": 1.0546,
+      "step": 145
+    },
+    {
+      "epoch": 0.2575107296137339,
+      "grad_norm": 0.30434370040893555,
+      "learning_rate": 5.1546391752577315e-05,
+      "loss": 1.0422,
+      "step": 150
+    },
+    {
+      "epoch": 0.26609442060085836,
+      "grad_norm": 0.30399689078330994,
+      "learning_rate": 5.326460481099656e-05,
+      "loss": 1.0465,
+      "step": 155
+    },
+    {
+      "epoch": 0.27467811158798283,
+      "grad_norm": 0.30653682351112366,
+      "learning_rate": 5.498281786941582e-05,
+      "loss": 1.0404,
+      "step": 160
+    },
+    {
+      "epoch": 0.2832618025751073,
+      "grad_norm": 0.3112322986125946,
+      "learning_rate": 5.670103092783505e-05,
+      "loss": 1.0369,
+      "step": 165
+    },
+    {
+      "epoch": 0.2918454935622318,
+      "grad_norm": 0.3165334165096283,
+      "learning_rate": 5.84192439862543e-05,
+      "loss": 1.0433,
+      "step": 170
+    },
+    {
+      "epoch": 0.30042918454935624,
+      "grad_norm": 0.34382325410842896,
+      "learning_rate": 6.013745704467354e-05,
+      "loss": 1.042,
+      "step": 175
+    },
+    {
+      "epoch": 0.3090128755364807,
+      "grad_norm": 0.3302488625049591,
+      "learning_rate": 6.185567010309279e-05,
+      "loss": 1.0166,
+      "step": 180
+    },
+    {
+      "epoch": 0.31759656652360513,
+      "grad_norm": 0.3078051209449768,
+      "learning_rate": 6.357388316151203e-05,
+      "loss": 1.0138,
+      "step": 185
+    },
+    {
+      "epoch": 0.3261802575107296,
+      "grad_norm": 0.30805814266204834,
+      "learning_rate": 6.529209621993127e-05,
+      "loss": 0.9951,
+      "step": 190
+    },
+    {
+      "epoch": 0.33476394849785407,
+      "grad_norm": 0.4036194980144501,
+      "learning_rate": 6.701030927835051e-05,
+      "loss": 1.013,
+      "step": 195
+    },
+    {
+      "epoch": 0.34334763948497854,
+      "grad_norm": 0.4557146430015564,
+      "learning_rate": 6.872852233676977e-05,
+      "loss": 0.9959,
+      "step": 200
+    },
+    {
+      "epoch": 0.351931330472103,
+      "grad_norm": 0.36137068271636963,
+      "learning_rate": 7.044673539518901e-05,
+      "loss": 1.0068,
+      "step": 205
+    },
+    {
+      "epoch": 0.3605150214592275,
+      "grad_norm": 0.3014100193977356,
+      "learning_rate": 7.216494845360825e-05,
+      "loss": 0.9923,
+      "step": 210
+    },
+    {
+      "epoch": 0.36909871244635195,
+      "grad_norm": 0.290464848279953,
+      "learning_rate": 7.38831615120275e-05,
+      "loss": 0.9969,
+      "step": 215
+    },
+    {
+      "epoch": 0.3776824034334764,
+      "grad_norm": 0.31709638237953186,
+      "learning_rate": 7.560137457044673e-05,
+      "loss": 1.0221,
+      "step": 220
+    },
+    {
+      "epoch": 0.38626609442060084,
+      "grad_norm": 0.2746679484844208,
+      "learning_rate": 7.731958762886599e-05,
+      "loss": 0.9829,
+      "step": 225
+    },
+    {
+      "epoch": 0.3948497854077253,
+      "grad_norm": 0.28260111808776855,
+      "learning_rate": 7.903780068728523e-05,
+      "loss": 0.9868,
+      "step": 230
+    },
+    {
+      "epoch": 0.4034334763948498,
+      "grad_norm": 0.3063802421092987,
+      "learning_rate": 8.075601374570447e-05,
+      "loss": 0.9789,
+      "step": 235
+    },
+    {
+      "epoch": 0.41201716738197425,
+      "grad_norm": 0.29451537132263184,
+      "learning_rate": 8.247422680412371e-05,
+      "loss": 1.0036,
+      "step": 240
+    },
+    {
+      "epoch": 0.4206008583690987,
+      "grad_norm": 0.29433488845825195,
+      "learning_rate": 8.419243986254296e-05,
+      "loss": 0.9936,
+      "step": 245
+    },
+    {
+      "epoch": 0.4291845493562232,
+      "grad_norm": 0.2725401520729065,
+      "learning_rate": 8.591065292096219e-05,
+      "loss": 0.9786,
+      "step": 250
+    },
+    {
+      "epoch": 0.43776824034334766,
+      "grad_norm": 0.2759503722190857,
+      "learning_rate": 8.762886597938145e-05,
+      "loss": 0.9656,
+      "step": 255
+    },
+    {
+      "epoch": 0.44635193133047213,
+      "grad_norm": 0.2762455642223358,
+      "learning_rate": 8.93470790378007e-05,
+      "loss": 0.9837,
+      "step": 260
+    },
+    {
+      "epoch": 0.45493562231759654,
+      "grad_norm": 0.2803398668766022,
+      "learning_rate": 9.106529209621993e-05,
+      "loss": 0.9757,
+      "step": 265
+    },
+    {
+      "epoch": 0.463519313304721,
+      "grad_norm": 0.29107633233070374,
+      "learning_rate": 9.278350515463918e-05,
+      "loss": 0.9692,
+      "step": 270
+    },
+    {
+      "epoch": 0.4721030042918455,
+      "grad_norm": 0.32915282249450684,
+      "learning_rate": 9.450171821305843e-05,
+      "loss": 0.9911,
+      "step": 275
+    },
+    {
+      "epoch": 0.48068669527896996,
+      "grad_norm": 0.2902511954307556,
+      "learning_rate": 9.621993127147767e-05,
+      "loss": 0.9635,
+      "step": 280
+    },
+    {
+      "epoch": 0.4892703862660944,
+      "grad_norm": 0.28177133202552795,
+      "learning_rate": 9.793814432989691e-05,
+      "loss": 0.9704,
+      "step": 285
+    },
+    {
+      "epoch": 0.4978540772532189,
+      "grad_norm": 0.26041337847709656,
+      "learning_rate": 9.965635738831616e-05,
+      "loss": 0.9668,
+      "step": 290
+    },
+    {
+      "epoch": 0.5064377682403434,
+      "grad_norm": 0.27237018942832947,
+      "learning_rate": 0.00010137457044673539,
+      "loss": 0.9751,
+      "step": 295
+    },
+    {
+      "epoch": 0.5150214592274678,
+      "grad_norm": 0.2928680181503296,
+      "learning_rate": 0.00010309278350515463,
+      "loss": 0.9828,
+      "step": 300
+    },
+    {
+      "epoch": 0.5236051502145923,
+      "grad_norm": 0.3540053367614746,
+      "learning_rate": 0.0001048109965635739,
+      "loss": 0.9725,
+      "step": 305
+    },
+    {
+      "epoch": 0.5321888412017167,
+      "grad_norm": 0.34892937541007996,
+      "learning_rate": 0.00010652920962199313,
+      "loss": 0.9688,
+      "step": 310
+    },
+    {
+      "epoch": 0.5407725321888412,
+      "grad_norm": 0.2727091908454895,
+      "learning_rate": 0.00010824742268041237,
+      "loss": 0.9797,
+      "step": 315
+    },
+    {
+      "epoch": 0.5493562231759657,
+      "grad_norm": 0.2613857090473175,
+      "learning_rate": 0.00010996563573883164,
+      "loss": 0.9751,
+      "step": 320
+    },
+    {
+      "epoch": 0.5579399141630901,
+      "grad_norm": 0.24695193767547607,
+      "learning_rate": 0.00011168384879725086,
+      "loss": 0.9661,
+      "step": 325
+    },
+    {
+      "epoch": 0.5665236051502146,
+      "grad_norm": 0.25507402420043945,
+      "learning_rate": 0.0001134020618556701,
+      "loss": 0.9703,
+      "step": 330
+    },
+    {
+      "epoch": 0.575107296137339,
+      "grad_norm": 0.2496771663427353,
+      "learning_rate": 0.00011512027491408935,
+      "loss": 0.9766,
+      "step": 335
+    },
+    {
+      "epoch": 0.5836909871244635,
+      "grad_norm": 0.2503701448440552,
+      "learning_rate": 0.0001168384879725086,
+      "loss": 0.9506,
+      "step": 340
+    },
+    {
+      "epoch": 0.592274678111588,
+      "grad_norm": 0.26991888880729675,
+      "learning_rate": 0.00011855670103092784,
+      "loss": 0.9587,
+      "step": 345
+    },
+    {
+      "epoch": 0.6008583690987125,
+      "grad_norm": 0.24292829632759094,
+      "learning_rate": 0.00012027491408934708,
+      "loss": 0.951,
+      "step": 350
+    },
+    {
+      "epoch": 0.6094420600858369,
+      "grad_norm": 0.26761141419410706,
+      "learning_rate": 0.00012199312714776634,
+      "loss": 0.9525,
+      "step": 355
+    },
+    {
+      "epoch": 0.6180257510729614,
+      "grad_norm": 0.24770408868789673,
+      "learning_rate": 0.00012371134020618558,
+      "loss": 0.9606,
+      "step": 360
+    },
+    {
+      "epoch": 0.6266094420600858,
+      "grad_norm": 0.24938061833381653,
+      "learning_rate": 0.00012542955326460482,
+      "loss": 0.9577,
+      "step": 365
+    },
+    {
+      "epoch": 0.6351931330472103,
+      "grad_norm": 0.22758124768733978,
+      "learning_rate": 0.00012714776632302406,
+      "loss": 0.9693,
+      "step": 370
+    },
+    {
+      "epoch": 0.6437768240343348,
+      "grad_norm": 0.24254348874092102,
+      "learning_rate": 0.0001288659793814433,
+      "loss": 0.9492,
+      "step": 375
+    },
+    {
+      "epoch": 0.6523605150214592,
+      "grad_norm": 0.37186160683631897,
+      "learning_rate": 0.00013058419243986254,
+      "loss": 0.9722,
+      "step": 380
+    },
+    {
+      "epoch": 0.6609442060085837,
+      "grad_norm": 0.23567767441272736,
+      "learning_rate": 0.00013230240549828178,
+      "loss": 0.9567,
+      "step": 385
+    },
+    {
+      "epoch": 0.6695278969957081,
+      "grad_norm": 0.2288963794708252,
+      "learning_rate": 0.00013402061855670103,
+      "loss": 0.9564,
+      "step": 390
+    },
+    {
+      "epoch": 0.6781115879828327,
+      "grad_norm": 0.24152550101280212,
+      "learning_rate": 0.0001357388316151203,
+      "loss": 0.9532,
+      "step": 395
+    },
+    {
+      "epoch": 0.6866952789699571,
+      "grad_norm": 0.261593759059906,
+      "learning_rate": 0.00013745704467353953,
+      "loss": 0.9471,
+      "step": 400
+    },
+    {
+      "epoch": 0.6952789699570815,
+      "grad_norm": 0.27105116844177246,
+      "learning_rate": 0.00013917525773195878,
+      "loss": 0.9639,
+      "step": 405
+    },
+    {
+      "epoch": 0.703862660944206,
+      "grad_norm": 0.361182302236557,
+      "learning_rate": 0.00014089347079037802,
+      "loss": 0.9516,
+      "step": 410
+    },
+    {
+      "epoch": 0.7124463519313304,
+      "grad_norm": 0.2614869475364685,
+      "learning_rate": 0.00014261168384879726,
+      "loss": 0.9615,
+      "step": 415
+    },
+    {
+      "epoch": 0.721030042918455,
+      "grad_norm": 0.2598520517349243,
+      "learning_rate": 0.0001443298969072165,
+      "loss": 0.9587,
+      "step": 420
+    },
+    {
+      "epoch": 0.7296137339055794,
+      "grad_norm": 0.26025694608688354,
+      "learning_rate": 0.00014604810996563574,
+      "loss": 0.9483,
+      "step": 425
+    },
+    {
+      "epoch": 0.7381974248927039,
+      "grad_norm": 0.27435532212257385,
+      "learning_rate": 0.000147766323024055,
+      "loss": 0.9555,
+      "step": 430
+    },
+    {
+      "epoch": 0.7467811158798283,
+      "grad_norm": 0.2179042547941208,
+      "learning_rate": 0.00014948453608247422,
+      "loss": 0.9719,
+      "step": 435
+    },
+    {
+      "epoch": 0.7553648068669528,
+      "grad_norm": 0.25120726227760315,
+      "learning_rate": 0.00015120274914089346,
+      "loss": 0.9601,
+      "step": 440
+    },
+    {
+      "epoch": 0.7639484978540773,
+      "grad_norm": 0.21970221400260925,
+      "learning_rate": 0.0001529209621993127,
+      "loss": 0.9545,
+      "step": 445
+    },
+    {
+      "epoch": 0.7725321888412017,
+      "grad_norm": 0.2047254890203476,
+      "learning_rate": 0.00015463917525773197,
+      "loss": 0.9448,
+      "step": 450
+    },
+    {
+      "epoch": 0.7811158798283262,
+      "grad_norm": 0.21874375641345978,
+      "learning_rate": 0.00015635738831615121,
+      "loss": 0.9426,
+      "step": 455
+    },
+    {
+      "epoch": 0.7896995708154506,
+      "grad_norm": 0.21705736219882965,
+      "learning_rate": 0.00015807560137457046,
+      "loss": 0.941,
+      "step": 460
+    },
+    {
+      "epoch": 0.7982832618025751,
+      "grad_norm": 0.20229893922805786,
+      "learning_rate": 0.0001597938144329897,
+      "loss": 0.9495,
+      "step": 465
+    },
+    {
+      "epoch": 0.8068669527896996,
+      "grad_norm": 0.21079690754413605,
+      "learning_rate": 0.00016151202749140894,
+      "loss": 0.9488,
+      "step": 470
+    },
+    {
+      "epoch": 0.8154506437768241,
+      "grad_norm": 0.21350346505641937,
+      "learning_rate": 0.00016323024054982818,
+      "loss": 0.941,
+      "step": 475
+    },
+    {
+      "epoch": 0.8240343347639485,
+      "grad_norm": 0.22696025669574738,
+      "learning_rate": 0.00016494845360824742,
+      "loss": 0.9468,
+      "step": 480
+    },
+    {
+      "epoch": 0.8326180257510729,
+      "grad_norm": 0.2032315880060196,
+      "learning_rate": 0.0001666666666666667,
+      "loss": 0.9649,
+      "step": 485
+    },
+    {
+      "epoch": 0.8412017167381974,
+      "grad_norm": 0.24577978253364563,
+      "learning_rate": 0.00016838487972508593,
+      "loss": 0.9457,
+      "step": 490
+    },
+    {
+      "epoch": 0.8497854077253219,
+      "grad_norm": 0.2154797613620758,
+      "learning_rate": 0.00017010309278350517,
+      "loss": 0.9322,
+      "step": 495
+    },
+    {
+      "epoch": 0.8583690987124464,
+      "grad_norm": 0.20580855011940002,
+      "learning_rate": 0.00017182130584192438,
+      "loss": 0.9417,
+      "step": 500
+    },
+    {
+      "epoch": 0.8669527896995708,
+      "grad_norm": 0.2095131129026413,
+      "learning_rate": 0.00017353951890034365,
+      "loss": 0.9447,
+      "step": 505
+    },
+    {
+      "epoch": 0.8755364806866953,
+      "grad_norm": 0.20167525112628937,
+      "learning_rate": 0.0001752577319587629,
+      "loss": 0.9504,
+      "step": 510
+    },
+    {
+      "epoch": 0.8841201716738197,
+      "grad_norm": 0.21665619313716888,
+      "learning_rate": 0.00017697594501718214,
+      "loss": 0.9386,
+      "step": 515
+    },
+    {
+      "epoch": 0.8927038626609443,
+      "grad_norm": 0.2125951647758484,
+      "learning_rate": 0.0001786941580756014,
+      "loss": 0.9373,
+      "step": 520
+    },
+    {
+      "epoch": 0.9012875536480687,
+      "grad_norm": 0.19751432538032532,
+      "learning_rate": 0.00018041237113402062,
+      "loss": 0.935,
+      "step": 525
+    },
+    {
+      "epoch": 0.9098712446351931,
+      "grad_norm": 0.20792262256145477,
+      "learning_rate": 0.00018213058419243986,
+      "loss": 0.9624,
+      "step": 530
+    },
+    {
+      "epoch": 0.9184549356223176,
+      "grad_norm": 0.2099096179008484,
+      "learning_rate": 0.0001838487972508591,
+      "loss": 0.9392,
+      "step": 535
+    },
+    {
+      "epoch": 0.927038626609442,
+      "grad_norm": 0.21597731113433838,
+      "learning_rate": 0.00018556701030927837,
+      "loss": 0.942,
+      "step": 540
+    },
+    {
+      "epoch": 0.9356223175965666,
+      "grad_norm": 0.2026844620704651,
+      "learning_rate": 0.0001872852233676976,
+      "loss": 0.9579,
+      "step": 545
+    },
+    {
+      "epoch": 0.944206008583691,
+      "grad_norm": 0.20321713387966156,
+      "learning_rate": 0.00018900343642611685,
+      "loss": 0.9519,
+      "step": 550
+    },
+    {
+      "epoch": 0.9527896995708155,
+      "grad_norm": 0.20575563609600067,
+      "learning_rate": 0.0001907216494845361,
+      "loss": 0.9363,
+      "step": 555
+    },
+    {
+      "epoch": 0.9613733905579399,
+      "grad_norm": 0.21118000149726868,
+      "learning_rate": 0.00019243986254295533,
+      "loss": 0.9493,
+      "step": 560
+    },
+    {
+      "epoch": 0.9699570815450643,
+      "grad_norm": 0.2216077297925949,
+      "learning_rate": 0.00019415807560137457,
+      "loss": 0.9499,
+      "step": 565
+    },
+    {
+      "epoch": 0.9785407725321889,
+      "grad_norm": 0.20766399800777435,
+      "learning_rate": 0.00019587628865979381,
+      "loss": 0.9359,
+      "step": 570
+    },
+    {
+      "epoch": 0.9871244635193133,
+      "grad_norm": 0.30319133400917053,
+      "learning_rate": 0.00019759450171821308,
+      "loss": 0.9483,
+      "step": 575
+    },
+    {
+      "epoch": 0.9957081545064378,
+      "grad_norm": 0.21939022839069366,
+      "learning_rate": 0.00019931271477663232,
+      "loss": 0.9422,
+      "step": 580
+    },
+    {
+      "epoch": 0.9991416309012876,
+      "eval_loss": 1.976241946220398,
+      "eval_runtime": 0.3946,
+      "eval_samples_per_second": 15.206,
+      "eval_steps_per_second": 2.534,
+      "step": 582
+    },
+    {
+      "epoch": 1.0042918454935623,
+      "grad_norm": 0.21883882582187653,
+      "learning_rate": 0.00019999983812448848,
+      "loss": 0.915,
+      "step": 585
+    },
+    {
+      "epoch": 1.0128755364806867,
+      "grad_norm": 0.21552623808383942,
+      "learning_rate": 0.0001999988488871492,
+      "loss": 0.9101,
+      "step": 590
+    },
+    {
+      "epoch": 1.0214592274678111,
+      "grad_norm": 2.067782402038574,
+      "learning_rate": 0.00019999696035219593,
+      "loss": 0.9213,
+      "step": 595
+    },
+    {
+      "epoch": 1.0300429184549356,
+      "grad_norm": 0.22093655169010162,
+      "learning_rate": 0.00019999417253661235,
+      "loss": 0.9244,
+      "step": 600
+    },
+    {
+      "epoch": 1.0386266094420602,
+      "grad_norm": 0.2211354523897171,
+      "learning_rate": 0.00019999048546546954,
+      "loss": 0.8949,
+      "step": 605
+    },
+    {
+      "epoch": 1.0472103004291846,
+      "grad_norm": 0.21562980115413666,
+      "learning_rate": 0.00019998589917192568,
+      "loss": 0.9248,
+      "step": 610
+    },
+    {
+      "epoch": 1.055793991416309,
+      "grad_norm": 0.22060342133045197,
+      "learning_rate": 0.00019998041369722556,
+      "loss": 0.907,
+      "step": 615
+    },
+    {
+      "epoch": 1.0643776824034334,
+      "grad_norm": 0.20447732508182526,
+      "learning_rate": 0.00019997402909070059,
+      "loss": 0.9155,
+      "step": 620
+    },
+    {
+      "epoch": 1.0729613733905579,
+      "grad_norm": 0.21326489746570587,
+      "learning_rate": 0.000199966745409768,
+      "loss": 0.9148,
+      "step": 625
+    },
+    {
+      "epoch": 1.0815450643776825,
+      "grad_norm": 0.21152488887310028,
+      "learning_rate": 0.0001999585627199305,
+      "loss": 0.8954,
+      "step": 630
+    },
+    {
+      "epoch": 1.090128755364807,
+      "grad_norm": 0.21141602098941803,
+      "learning_rate": 0.0001999494810947757,
+      "loss": 0.9091,
+      "step": 635
+    },
+    {
+      "epoch": 1.0987124463519313,
+      "grad_norm": 0.21263065934181213,
+      "learning_rate": 0.00019993950061597535,
+      "loss": 0.9065,
+      "step": 640
+    },
+    {
+      "epoch": 1.1072961373390557,
+      "grad_norm": 0.21794655919075012,
+      "learning_rate": 0.00019992862137328474,
+      "loss": 0.9029,
+      "step": 645
+    },
+    {
+      "epoch": 1.1158798283261802,
+      "grad_norm": 0.21535712480545044,
+      "learning_rate": 0.00019991684346454172,
+      "loss": 0.9139,
+      "step": 650
+    },
+    {
+      "epoch": 1.1244635193133048,
+      "grad_norm": 0.1971653550863266,
+      "learning_rate": 0.00019990416699566598,
+      "loss": 0.8918,
+      "step": 655
+    },
+    {
+      "epoch": 1.1330472103004292,
+      "grad_norm": 0.2062826007604599,
+      "learning_rate": 0.000199890592080658,
+      "loss": 0.9188,
+      "step": 660
+    },
+    {
+      "epoch": 1.1416309012875536,
+      "grad_norm": 0.2263791412115097,
+      "learning_rate": 0.0001998761188415981,
+      "loss": 0.904,
+      "step": 665
+    },
+    {
+      "epoch": 1.150214592274678,
+      "grad_norm": 0.19670893251895905,
+      "learning_rate": 0.00019986074740864526,
+      "loss": 0.9165,
+      "step": 670
+    },
+    {
+      "epoch": 1.1587982832618025,
+      "grad_norm": 0.21204271912574768,
+      "learning_rate": 0.000199844477920036,
+      "loss": 0.8874,
+      "step": 675
+    },
+    {
+      "epoch": 1.167381974248927,
+      "grad_norm": 0.19298429787158966,
+      "learning_rate": 0.00019982731052208309,
+      "loss": 0.9102,
+      "step": 680
+    },
+    {
+      "epoch": 1.1759656652360515,
+      "grad_norm": 0.21324272453784943,
+      "learning_rate": 0.00019980924536917437,
+      "loss": 0.9119,
+      "step": 685
+    },
+    {
+      "epoch": 1.184549356223176,
+      "grad_norm": 0.192140594124794,
+      "learning_rate": 0.00019979028262377118,
+      "loss": 0.8957,
+      "step": 690
+    },
+    {
+      "epoch": 1.1931330472103003,
+      "grad_norm": 0.20233942568302155,
+      "learning_rate": 0.00019977042245640698,
+      "loss": 0.8969,
+      "step": 695
+    },
+    {
+      "epoch": 1.201716738197425,
+      "grad_norm": 0.21660216152668,
+      "learning_rate": 0.00019974966504568583,
+      "loss": 0.9064,
+      "step": 700
+    },
+    {
+      "epoch": 1.2103004291845494,
+      "grad_norm": 0.2243824154138565,
+      "learning_rate": 0.0001997280105782808,
+      "loss": 0.9166,
+      "step": 705
+    },
+    {
+      "epoch": 1.2188841201716738,
+      "grad_norm": 0.20581458508968353,
+      "learning_rate": 0.00019970545924893226,
+      "loss": 0.9064,
+      "step": 710
+    },
+    {
+      "epoch": 1.2274678111587982,
+      "grad_norm": 0.19686444103717804,
+      "learning_rate": 0.00019968201126044604,
+      "loss": 0.9126,
+      "step": 715
+    },
+    {
+      "epoch": 1.2360515021459229,
+      "grad_norm": 0.21029411256313324,
+      "learning_rate": 0.00019965766682369186,
+      "loss": 0.892,
+      "step": 720
+    },
+    {
+      "epoch": 1.2446351931330473,
+      "grad_norm": 0.21291205286979675,
+      "learning_rate": 0.0001996324261576011,
+      "loss": 0.8936,
+      "step": 725
+    },
+    {
+      "epoch": 1.2532188841201717,
+      "grad_norm": 0.2174995094537735,
+      "learning_rate": 0.00019960628948916518,
+      "loss": 0.9157,
+      "step": 730
+    },
+    {
+      "epoch": 1.261802575107296,
+      "grad_norm": 0.2011156976222992,
+      "learning_rate": 0.0001995792570534331,
+      "loss": 0.9161,
+      "step": 735
+    },
+    {
+      "epoch": 1.2703862660944205,
+      "grad_norm": 0.20497067272663116,
+      "learning_rate": 0.00019955132909350984,
+      "loss": 0.8999,
+      "step": 740
+    },
+    {
+      "epoch": 1.2789699570815452,
+      "grad_norm": 0.19909746944904327,
+      "learning_rate": 0.0001995225058605537,
+      "loss": 0.9142,
+      "step": 745
+    },
+    {
+      "epoch": 1.2875536480686696,
+      "grad_norm": 0.22116069495677948,
+      "learning_rate": 0.0001994927876137743,
+      "loss": 0.8992,
+      "step": 750
+    },
+    {
+      "epoch": 1.296137339055794,
+      "grad_norm": 0.22861087322235107,
+      "learning_rate": 0.00019946217462043025,
+      "loss": 0.898,
+      "step": 755
+    },
+    {
+      "epoch": 1.3047210300429184,
+      "grad_norm": 0.20132282376289368,
+      "learning_rate": 0.0001994306671558266,
+      "loss": 0.9147,
+      "step": 760
+    },
+    {
+      "epoch": 1.3133047210300428,
+      "grad_norm": 0.21803739666938782,
+      "learning_rate": 0.00019939826550331252,
+      "loss": 0.9,
+      "step": 765
+    },
+    {
+      "epoch": 1.3218884120171674,
+      "grad_norm": 0.19455976784229279,
+      "learning_rate": 0.0001993649699542786,
+      "loss": 0.9126,
+      "step": 770
+    },
+    {
+      "epoch": 1.3304721030042919,
+      "grad_norm": 0.18571655452251434,
+      "learning_rate": 0.0001993307808081544,
+      "loss": 0.9006,
+      "step": 775
+    },
+    {
+      "epoch": 1.3390557939914163,
+      "grad_norm": 0.20103998482227325,
+      "learning_rate": 0.00019929569837240564,
+      "loss": 0.8881,
+      "step": 780
+    },
+    {
+      "epoch": 1.3476394849785407,
+      "grad_norm": 0.19315999746322632,
+      "learning_rate": 0.00019925972296253145,
+      "loss": 0.901,
+      "step": 785
+    },
+    {
+      "epoch": 1.356223175965665,
+      "grad_norm": 0.2066372036933899,
+      "learning_rate": 0.00019922285490206156,
+      "loss": 0.888,
+      "step": 790
+    },
+    {
+      "epoch": 1.3648068669527897,
+      "grad_norm": 0.20879539847373962,
+      "learning_rate": 0.00019918509452255338,
+      "loss": 0.901,
+      "step": 795
+    },
+    {
+      "epoch": 1.3733905579399142,
+      "grad_norm": 0.20333191752433777,
+      "learning_rate": 0.000199146442163589,
+      "loss": 0.9099,
+      "step": 800
+    },
+    {
+      "epoch": 1.3819742489270386,
+      "grad_norm": 0.1949775516986847,
+      "learning_rate": 0.00019910689817277216,
+      "loss": 0.904,
+      "step": 805
+    },
+    {
+      "epoch": 1.3905579399141632,
+      "grad_norm": 0.20540495216846466,
+      "learning_rate": 0.00019906646290572514,
+      "loss": 0.8965,
+      "step": 810
+    },
+    {
+      "epoch": 1.3991416309012876,
+      "grad_norm": 0.19921506941318512,
+      "learning_rate": 0.00019902513672608553,
+      "loss": 0.8991,
+      "step": 815
+    },
+    {
+      "epoch": 1.407725321888412,
+      "grad_norm": 0.21238817274570465,
+      "learning_rate": 0.0001989829200055029,
+      "loss": 0.9026,
+      "step": 820
+    },
+    {
+      "epoch": 1.4163090128755365,
+      "grad_norm": 0.2081788033246994,
+      "learning_rate": 0.00019893981312363562,
+      "loss": 0.9052,
+      "step": 825
+    },
+    {
+      "epoch": 1.4248927038626609,
+      "grad_norm": 0.20578624308109283,
+      "learning_rate": 0.00019889581646814728,
+      "loss": 0.9038,
+      "step": 830
+    },
+    {
+      "epoch": 1.4334763948497855,
+      "grad_norm": 0.2119644731283188,
+      "learning_rate": 0.00019885093043470336,
+      "loss": 0.8936,
+      "step": 835
+    },
+    {
+      "epoch": 1.44206008583691,
+      "grad_norm": 0.19631995260715485,
+      "learning_rate": 0.0001988051554269675,
+      "loss": 0.9059,
+      "step": 840
+    },
+    {
+      "epoch": 1.4506437768240343,
+      "grad_norm": 0.22262215614318848,
+      "learning_rate": 0.00019875849185659798,
+      "loss": 0.9172,
+      "step": 845
+    },
+    {
+      "epoch": 1.4592274678111588,
+      "grad_norm": 0.19081105291843414,
+      "learning_rate": 0.00019871094014324404,
+      "loss": 0.9025,
+      "step": 850
+    },
+    {
+      "epoch": 1.4678111587982832,
+      "grad_norm": 0.18824172019958496,
+      "learning_rate": 0.000198662500714542,
+      "loss": 0.9141,
+      "step": 855
+    },
+    {
+      "epoch": 1.4763948497854078,
+      "grad_norm": 0.20280902087688446,
+      "learning_rate": 0.0001986131740061115,
+      "loss": 0.8889,
+      "step": 860
+    },
+    {
+      "epoch": 1.4849785407725322,
+      "grad_norm": 0.19314704835414886,
+      "learning_rate": 0.00019856296046155157,
+      "loss": 0.8919,
+      "step": 865
+    },
+    {
+      "epoch": 1.4935622317596566,
+      "grad_norm": 0.1936980038881302,
+      "learning_rate": 0.00019851186053243666,
+      "loss": 0.9015,
+      "step": 870
+    },
+    {
+      "epoch": 1.5021459227467813,
+      "grad_norm": 0.21349290013313293,
+      "learning_rate": 0.00019845987467831242,
+      "loss": 0.9068,
+      "step": 875
+    },
+    {
+      "epoch": 1.5107296137339055,
+      "grad_norm": 0.1915241926908493,
+      "learning_rate": 0.00019840700336669183,
+      "loss": 0.9148,
+      "step": 880
+    },
+    {
+      "epoch": 1.51931330472103,
+      "grad_norm": 0.1982114166021347,
+      "learning_rate": 0.00019835324707305076,
+      "loss": 0.9043,
+      "step": 885
+    },
+    {
+      "epoch": 1.5278969957081545,
+      "grad_norm": 0.18504977226257324,
+      "learning_rate": 0.0001982986062808239,
+      "loss": 0.8926,
+      "step": 890
+    },
+    {
+      "epoch": 1.536480686695279,
+      "grad_norm": 0.22229517996311188,
+      "learning_rate": 0.0001982430814814002,
+      "loss": 0.8849,
+      "step": 895
+    },
+    {
+      "epoch": 1.5450643776824036,
+      "grad_norm": 0.21088634431362152,
+      "learning_rate": 0.00019818667317411865,
+      "loss": 0.9075,
+      "step": 900
+    },
+    {
+      "epoch": 1.5536480686695278,
+      "grad_norm": 0.20124419033527374,
+      "learning_rate": 0.0001981293818662636,
+      "loss": 0.8914,
+      "step": 905
+    },
+    {
+      "epoch": 1.5622317596566524,
+      "grad_norm": 0.19154104590415955,
+      "learning_rate": 0.0001980712080730604,
+      "loss": 0.8816,
+      "step": 910
+    },
+    {
+      "epoch": 1.5708154506437768,
+      "grad_norm": 0.1901169866323471,
+      "learning_rate": 0.00019801215231767056,
+      "loss": 0.9051,
+      "step": 915
+    },
+    {
+      "epoch": 1.5793991416309012,
+      "grad_norm": 0.19122549891471863,
+      "learning_rate": 0.00019795221513118722,
+      "loss": 0.8965,
+      "step": 920
+    },
+    {
+      "epoch": 1.5879828326180259,
+      "grad_norm": 0.192024827003479,
+      "learning_rate": 0.00019789139705263026,
+      "loss": 0.8958,
+      "step": 925
+    },
+    {
+      "epoch": 1.59656652360515,
+      "grad_norm": 0.19915080070495605,
+      "learning_rate": 0.0001978296986289415,
+      "loss": 0.8924,
+      "step": 930
+    },
+    {
+      "epoch": 1.6051502145922747,
+      "grad_norm": 0.19441018998622894,
+      "learning_rate": 0.0001977671204149798,
+      "loss": 0.8997,
+      "step": 935
+    },
+    {
+      "epoch": 1.613733905579399,
+      "grad_norm": 0.20060202479362488,
+      "learning_rate": 0.000197703662973516,
+      "loss": 0.8808,
+      "step": 940
+    },
+    {
+      "epoch": 1.6223175965665235,
+      "grad_norm": 0.20653241872787476,
+      "learning_rate": 0.00019763932687522794,
+      "loss": 0.9032,
+      "step": 945
+    },
+    {
+      "epoch": 1.6309012875536482,
+      "grad_norm": 0.1972658932209015,
+      "learning_rate": 0.00019757411269869527,
+      "loss": 0.9093,
+      "step": 950
+    },
+    {
+      "epoch": 1.6394849785407726,
+      "grad_norm": 0.18751849234104156,
+      "learning_rate": 0.0001975080210303943,
+      "loss": 0.8842,
+      "step": 955
+    },
+    {
+      "epoch": 1.648068669527897,
+      "grad_norm": 0.1898711621761322,
+      "learning_rate": 0.00019744105246469263,
+      "loss": 0.8914,
+      "step": 960
+    },
+    {
+      "epoch": 1.6566523605150214,
+      "grad_norm": 0.1935146301984787,
+      "learning_rate": 0.0001973732076038439,
+      "loss": 0.881,
+      "step": 965
+    },
+    {
+      "epoch": 1.6652360515021458,
+      "grad_norm": 0.19017855823040009,
+      "learning_rate": 0.00019730448705798239,
+      "loss": 0.9005,
+      "step": 970
+    },
+    {
+      "epoch": 1.6738197424892705,
+      "grad_norm": 0.1929733008146286,
+      "learning_rate": 0.00019723489144511742,
+      "loss": 0.8898,
+      "step": 975
+    },
+    {
+      "epoch": 1.6824034334763949,
+      "grad_norm": 0.1935940533876419,
+      "learning_rate": 0.0001971644213911279,
+      "loss": 0.8889,
+      "step": 980
+    },
+    {
+      "epoch": 1.6909871244635193,
+      "grad_norm": 0.19845978915691376,
+      "learning_rate": 0.0001970930775297566,
+      "loss": 0.8891,
+      "step": 985
+    },
+    {
+      "epoch": 1.699570815450644,
+      "grad_norm": 0.18910686671733856,
+      "learning_rate": 0.00019702086050260456,
+      "loss": 0.8909,
+      "step": 990
+    },
+    {
+      "epoch": 1.7081545064377681,
+      "grad_norm": 0.20300810039043427,
+      "learning_rate": 0.00019694777095912534,
+      "loss": 0.9012,
+      "step": 995
+    },
+    {
+      "epoch": 1.7167381974248928,
+      "grad_norm": 0.21887531876564026,
+      "learning_rate": 0.0001968738095566189,
+      "loss": 0.9116,
+      "step": 1000
+    },
+    {
+      "epoch": 1.7253218884120172,
+      "grad_norm": 0.21735869348049164,
+      "learning_rate": 0.00019679897696022608,
+      "loss": 0.8873,
+      "step": 1005
+    },
+    {
+      "epoch": 1.7339055793991416,
+      "grad_norm": 0.1856721192598343,
+      "learning_rate": 0.0001967232738429224,
+      "loss": 0.8889,
+      "step": 1010
+    },
+    {
+      "epoch": 1.7424892703862662,
+      "grad_norm": 0.2046109437942505,
+      "learning_rate": 0.000196646700885512,
+      "loss": 0.89,
+      "step": 1015
+    },
+    {
+      "epoch": 1.7510729613733904,
+      "grad_norm": 0.19593974947929382,
+      "learning_rate": 0.0001965692587766216,
+      "loss": 0.9065,
+      "step": 1020
+    },
+    {
+      "epoch": 1.759656652360515,
+      "grad_norm": 0.18540222942829132,
+      "learning_rate": 0.00019649094821269425,
+      "loss": 0.8877,
+      "step": 1025
+    },
+    {
+      "epoch": 1.7682403433476395,
+      "grad_norm": 0.19899272918701172,
+      "learning_rate": 0.00019641176989798305,
+      "loss": 0.8965,
+      "step": 1030
+    },
+    {
+      "epoch": 1.7768240343347639,
+      "grad_norm": 0.18957588076591492,
+      "learning_rate": 0.00019633172454454497,
+      "loss": 0.8876,
+      "step": 1035
+    },
+    {
+      "epoch": 1.7854077253218885,
+      "grad_norm": 0.20278845727443695,
+      "learning_rate": 0.0001962508128722342,
+      "loss": 0.8982,
+      "step": 1040
+    },
+    {
+      "epoch": 1.7939914163090127,
+      "grad_norm": 0.1874280869960785,
+      "learning_rate": 0.00019616903560869584,
+      "loss": 0.9049,
+      "step": 1045
+    },
+    {
+      "epoch": 1.8025751072961373,
+      "grad_norm": 0.19025950133800507,
+      "learning_rate": 0.0001960863934893594,
+      "loss": 0.8901,
+      "step": 1050
+    },
+    {
+      "epoch": 1.8111587982832618,
+      "grad_norm": 0.20806211233139038,
+      "learning_rate": 0.00019600288725743194,
+      "loss": 0.9015,
+      "step": 1055
+    },
+    {
+      "epoch": 1.8197424892703862,
+      "grad_norm": 0.2008458375930786,
+      "learning_rate": 0.00019591851766389176,
+      "loss": 0.9029,
+      "step": 1060
+    },
+    {
+      "epoch": 1.8283261802575108,
+      "grad_norm": 0.1986788511276245,
+      "learning_rate": 0.00019583328546748127,
+      "loss": 0.8942,
+      "step": 1065
+    },
+    {
+      "epoch": 1.8369098712446352,
+      "grad_norm": 0.18976636230945587,
+      "learning_rate": 0.00019574719143470044,
+      "loss": 0.8962,
+      "step": 1070
+    },
+    {
+      "epoch": 1.8454935622317596,
+      "grad_norm": 0.19317425787448883,
+      "learning_rate": 0.00019566023633979976,
+      "loss": 0.8918,
+      "step": 1075
+    },
+    {
+      "epoch": 1.8540772532188843,
+      "grad_norm": 0.1889304369688034,
+      "learning_rate": 0.00019557242096477327,
+      "loss": 0.8934,
+      "step": 1080
+    },
+    {
+      "epoch": 1.8626609442060085,
+      "grad_norm": 0.18771173059940338,
+      "learning_rate": 0.00019548374609935172,
+      "loss": 0.8782,
+      "step": 1085
+    },
+    {
+      "epoch": 1.871244635193133,
+      "grad_norm": 0.18727517127990723,
+      "learning_rate": 0.00019539421254099519,
+      "loss": 0.9014,
+      "step": 1090
+    },
+    {
+      "epoch": 1.8798283261802575,
+      "grad_norm": 0.19307033717632294,
+      "learning_rate": 0.0001953038210948861,
+      "loss": 0.896,
+      "step": 1095
+    },
+    {
+      "epoch": 1.888412017167382,
+      "grad_norm": 0.1863000988960266,
+      "learning_rate": 0.00019521257257392192,
+      "loss": 0.8855,
+      "step": 1100
+    },
+    {
+      "epoch": 1.8969957081545066,
+      "grad_norm": 0.1884726732969284,
+      "learning_rate": 0.0001951204677987079,
+      "loss": 0.8902,
+      "step": 1105
+    },
+    {
+      "epoch": 1.9055793991416308,
+      "grad_norm": 0.20304642617702484,
+      "learning_rate": 0.00019502750759754962,
+      "loss": 0.8892,
+      "step": 1110
+    },
+    {
+      "epoch": 1.9141630901287554,
+      "grad_norm": 0.1887015998363495,
+      "learning_rate": 0.00019493369280644554,
+      "loss": 0.8946,
+      "step": 1115
+    },
+    {
+      "epoch": 1.9227467811158798,
+      "grad_norm": 0.18979288637638092,
+      "learning_rate": 0.00019483902426907954,
+      "loss": 0.8825,
+      "step": 1120
+    },
+    {
+      "epoch": 1.9313304721030042,
+      "grad_norm": 0.18896907567977905,
+      "learning_rate": 0.00019474350283681338,
+      "loss": 0.887,
+      "step": 1125
+    },
+    {
+      "epoch": 1.9399141630901289,
+      "grad_norm": 0.17926710844039917,
+      "learning_rate": 0.00019464712936867885,
+      "loss": 0.8832,
+      "step": 1130
+    },
+    {
+      "epoch": 1.948497854077253,
+      "grad_norm": 0.19314360618591309,
+      "learning_rate": 0.00019454990473137028,
+      "loss": 0.89,
+      "step": 1135
+    },
+    {
+      "epoch": 1.9570815450643777,
+      "grad_norm": 0.19700467586517334,
+      "learning_rate": 0.00019445182979923654,
+      "loss": 0.8844,
+      "step": 1140
+    },
+    {
+      "epoch": 1.9656652360515021,
+      "grad_norm": 0.20681554079055786,
+      "learning_rate": 0.00019435290545427328,
+      "loss": 0.896,
+      "step": 1145
+    },
+    {
+      "epoch": 1.9742489270386265,
+      "grad_norm": 0.1876552402973175,
+      "learning_rate": 0.0001942531325861151,
+      "loss": 0.886,
+      "step": 1150
+    },
+    {
+      "epoch": 1.9828326180257512,
+      "grad_norm": 0.18256564438343048,
+      "learning_rate": 0.0001941525120920273,
+      "loss": 0.9008,
+      "step": 1155
+    },
+    {
+      "epoch": 1.9914163090128756,
+      "grad_norm": 0.19153741002082825,
+      "learning_rate": 0.00019405104487689798,
+      "loss": 0.8804,
+      "step": 1160
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.19448307156562805,
+      "learning_rate": 0.0001939487318532299,
+      "loss": 0.8939,
+      "step": 1165
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 2.0231504440307617,
+      "eval_runtime": 0.3938,
+      "eval_samples_per_second": 15.238,
+      "eval_steps_per_second": 2.54,
+      "step": 1165
+    },
+    {
+      "epoch": 2.0085836909871246,
+      "grad_norm": 0.2327311784029007,
+      "learning_rate": 0.00019384557394113228,
+      "loss": 0.8256,
+      "step": 1170
+    },
+    {
+      "epoch": 2.017167381974249,
+      "grad_norm": 0.23671980202198029,
+      "learning_rate": 0.00019374157206831236,
+      "loss": 0.8223,
+      "step": 1175
+    },
+    {
+      "epoch": 2.0257510729613735,
+      "grad_norm": 0.23530033230781555,
+      "learning_rate": 0.00019363672717006734,
+      "loss": 0.8109,
+      "step": 1180
+    },
+    {
+      "epoch": 2.0343347639484977,
+      "grad_norm": 0.21592716872692108,
+      "learning_rate": 0.00019353104018927567,
+      "loss": 0.8071,
+      "step": 1185
+    },
+    {
+      "epoch": 2.0429184549356223,
+      "grad_norm": 0.22698843479156494,
+      "learning_rate": 0.0001934245120763889,
+      "loss": 0.8047,
+      "step": 1190
+    },
+    {
+      "epoch": 2.051502145922747,
+      "grad_norm": 0.20773455500602722,
+      "learning_rate": 0.0001933171437894227,
+      "loss": 0.8081,
+      "step": 1195
+    },
+    {
+      "epoch": 2.060085836909871,
+      "grad_norm": 0.21392963826656342,
+      "learning_rate": 0.00019320893629394873,
+      "loss": 0.8267,
+      "step": 1200
+    },
+    {
+      "epoch": 2.0686695278969958,
+      "grad_norm": 0.1993769258260727,
+      "learning_rate": 0.00019309989056308556,
+      "loss": 0.8122,
+      "step": 1205
+    },
+    {
+      "epoch": 2.0772532188841204,
+      "grad_norm": 0.21093840897083282,
+      "learning_rate": 0.00019299000757749016,
+      "loss": 0.8135,
+      "step": 1210
+    },
+    {
+      "epoch": 2.0858369098712446,
+      "grad_norm": 0.21615874767303467,
+      "learning_rate": 0.00019287928832534897,
+      "loss": 0.8098,
+      "step": 1215
+    },
+    {
+      "epoch": 2.0944206008583692,
+      "grad_norm": 0.21068502962589264,
+      "learning_rate": 0.00019276773380236904,
+      "loss": 0.813,
+      "step": 1220
+    },
+    {
+      "epoch": 2.1030042918454934,
+      "grad_norm": 0.21265622973442078,
+      "learning_rate": 0.00019265534501176906,
+      "loss": 0.8034,
+      "step": 1225
+    },
+    {
+      "epoch": 2.111587982832618,
+      "grad_norm": 0.22396954894065857,
+      "learning_rate": 0.00019254212296427044,
+      "loss": 0.8221,
+      "step": 1230
+    },
+    {
+      "epoch": 2.1201716738197427,
+      "grad_norm": 0.2072274535894394,
+      "learning_rate": 0.00019242806867808798,
+      "loss": 0.8235,
+      "step": 1235
+    },
+    {
+      "epoch": 2.128755364806867,
+      "grad_norm": 0.20798753201961517,
+      "learning_rate": 0.00019231318317892106,
+      "loss": 0.8137,
+      "step": 1240
+    },
+    {
+      "epoch": 2.1373390557939915,
+      "grad_norm": 0.2030133157968521,
+      "learning_rate": 0.00019219746749994405,
+      "loss": 0.8154,
+      "step": 1245
+    },
+    {
+      "epoch": 2.1459227467811157,
+      "grad_norm": 0.22628700733184814,
+      "learning_rate": 0.0001920809226817973,
+      "loss": 0.8261,
+      "step": 1250
+    },
+    {
+      "epoch": 2.1545064377682404,
+      "grad_norm": 0.21634644269943237,
+      "learning_rate": 0.00019196354977257766,
+      "loss": 0.8221,
+      "step": 1255
+    },
+    {
+      "epoch": 2.163090128755365,
+      "grad_norm": 0.2259581983089447,
+      "learning_rate": 0.00019184534982782904,
+      "loss": 0.8287,
+      "step": 1260
+    },
+    {
+      "epoch": 2.171673819742489,
+      "grad_norm": 0.23607933521270752,
+      "learning_rate": 0.00019172632391053294,
+      "loss": 0.8218,
+      "step": 1265
+    },
+    {
+      "epoch": 2.180257510729614,
+      "grad_norm": 0.20960725843906403,
+      "learning_rate": 0.0001916064730910989,
+      "loss": 0.8233,
+      "step": 1270
+    },
+    {
+      "epoch": 2.188841201716738,
+      "grad_norm": 0.19818070530891418,
+      "learning_rate": 0.00019148579844735497,
+      "loss": 0.8253,
+      "step": 1275
+    },
+    {
+      "epoch": 2.1974248927038627,
+      "grad_norm": 0.2142871767282486,
+      "learning_rate": 0.00019136430106453777,
+      "loss": 0.8289,
+      "step": 1280
+    },
+    {
+      "epoch": 2.2060085836909873,
+      "grad_norm": 0.21934735774993896,
+      "learning_rate": 0.0001912419820352829,
+      "loss": 0.8191,
+      "step": 1285
+    },
+    {
+      "epoch": 2.2145922746781115,
+      "grad_norm": 0.21653762459754944,
+      "learning_rate": 0.00019111884245961522,
+      "loss": 0.8194,
+      "step": 1290
+    },
+    {
+      "epoch": 2.223175965665236,
+      "grad_norm": 0.21233248710632324,
+      "learning_rate": 0.00019099488344493873,
+      "loss": 0.8247,
+      "step": 1295
+    },
+    {
+      "epoch": 2.2317596566523603,
+      "grad_norm": 0.23292584717273712,
+      "learning_rate": 0.00019087010610602668,
+      "loss": 0.8197,
+      "step": 1300
+    },
+    {
+      "epoch": 2.240343347639485,
+      "grad_norm": 0.20501044392585754,
+      "learning_rate": 0.00019074451156501164,
+      "loss": 0.8152,
+      "step": 1305
+    },
+    {
+      "epoch": 2.2489270386266096,
+      "grad_norm": 0.23035867512226105,
+      "learning_rate": 0.00019061810095137533,
+      "loss": 0.8168,
+      "step": 1310
+    },
+    {
+      "epoch": 2.257510729613734,
+      "grad_norm": 0.21323524415493011,
+      "learning_rate": 0.00019049087540193847,
+      "loss": 0.8118,
+      "step": 1315
+    },
+    {
+      "epoch": 2.2660944206008584,
+      "grad_norm": 0.20477545261383057,
+      "learning_rate": 0.00019036283606085053,
+      "loss": 0.8164,
+      "step": 1320
+    },
+    {
+      "epoch": 2.274678111587983,
+      "grad_norm": 0.21431773900985718,
+      "learning_rate": 0.00019023398407957956,
+      "loss": 0.8258,
+      "step": 1325
+    },
+    {
+      "epoch": 2.2832618025751072,
+      "grad_norm": 0.21260547637939453,
+      "learning_rate": 0.00019010432061690165,
+      "loss": 0.8166,
+      "step": 1330
+    },
+    {
+      "epoch": 2.291845493562232,
+      "grad_norm": 0.20846493542194366,
+      "learning_rate": 0.00018997384683889067,
+      "loss": 0.8066,
+      "step": 1335
+    },
+    {
+      "epoch": 2.300429184549356,
+      "grad_norm": 0.20466403663158417,
+      "learning_rate": 0.00018984256391890765,
+      "loss": 0.8251,
+      "step": 1340
+    },
+    {
+      "epoch": 2.3090128755364807,
+      "grad_norm": 0.20920304954051971,
+      "learning_rate": 0.0001897104730375904,
+      "loss": 0.8164,
+      "step": 1345
+    },
+    {
+      "epoch": 2.317596566523605,
+      "grad_norm": 0.22407568991184235,
+      "learning_rate": 0.00018957757538284273,
+      "loss": 0.8156,
+      "step": 1350
+    },
+    {
+      "epoch": 2.3261802575107295,
+      "grad_norm": 0.22706876695156097,
+      "learning_rate": 0.00018944387214982382,
+      "loss": 0.8348,
+      "step": 1355
+    },
+    {
+      "epoch": 2.334763948497854,
+      "grad_norm": 0.20964165031909943,
+      "learning_rate": 0.00018930936454093753,
+      "loss": 0.8258,
+      "step": 1360
+    },
+    {
+      "epoch": 2.3433476394849784,
+      "grad_norm": 0.23025156557559967,
+      "learning_rate": 0.00018917405376582145,
+      "loss": 0.8205,
+      "step": 1365
+    },
+    {
+      "epoch": 2.351931330472103,
+      "grad_norm": 0.23171818256378174,
+      "learning_rate": 0.0001890379410413362,
+      "loss": 0.8224,
+      "step": 1370
+    },
+    {
+      "epoch": 2.3605150214592276,
+      "grad_norm": 0.2035280019044876,
+      "learning_rate": 0.0001889010275915543,
+      "loss": 0.8363,
+      "step": 1375
+    },
+    {
+      "epoch": 2.369098712446352,
+      "grad_norm": 0.2576422691345215,
+      "learning_rate": 0.00018876331464774945,
+      "loss": 0.8216,
+      "step": 1380
+    },
+    {
+      "epoch": 2.3776824034334765,
+      "grad_norm": 0.21184222400188446,
+      "learning_rate": 0.00018862480344838495,
+      "loss": 0.8161,
+      "step": 1385
+    },
+    {
+      "epoch": 2.3862660944206007,
+      "grad_norm": 0.22491346299648285,
+      "learning_rate": 0.00018848549523910313,
+      "loss": 0.8261,
+      "step": 1390
+    },
+    {
+      "epoch": 2.3948497854077253,
+      "grad_norm": 0.21227188408374786,
+      "learning_rate": 0.0001883453912727138,
+      "loss": 0.8377,
+      "step": 1395
+    },
+    {
+      "epoch": 2.40343347639485,
+      "grad_norm": 0.21044416725635529,
+      "learning_rate": 0.0001882044928091831,
+      "loss": 0.819,
+      "step": 1400
+    },
+    {
+      "epoch": 2.412017167381974,
+      "grad_norm": 0.20745404064655304,
+      "learning_rate": 0.00018806280111562215,
+      "loss": 0.8265,
+      "step": 1405
+    },
+    {
+      "epoch": 2.4206008583690988,
+      "grad_norm": 0.2179802805185318,
+      "learning_rate": 0.00018792031746627563,
+      "loss": 0.8382,
+      "step": 1410
+    },
+    {
+      "epoch": 2.429184549356223,
+      "grad_norm": 0.20480507612228394,
+      "learning_rate": 0.00018777704314251032,
+      "loss": 0.8312,
+      "step": 1415
+    },
+    {
+      "epoch": 2.4377682403433476,
+      "grad_norm": 0.23306381702423096,
+      "learning_rate": 0.00018763297943280368,
+      "loss": 0.8161,
+      "step": 1420
+    },
+    {
+      "epoch": 2.4463519313304722,
+      "grad_norm": 0.21607355773448944,
+      "learning_rate": 0.00018748812763273208,
+      "loss": 0.8197,
+      "step": 1425
+    },
+    {
+      "epoch": 2.4549356223175964,
+      "grad_norm": 0.21942569315433502,
+      "learning_rate": 0.0001873424890449593,
+      "loss": 0.8291,
+      "step": 1430
+    },
+    {
+      "epoch": 2.463519313304721,
+      "grad_norm": 0.2144131362438202,
+      "learning_rate": 0.00018719606497922476,
+      "loss": 0.8203,
+      "step": 1435
+    },
+    {
+      "epoch": 2.4721030042918457,
+      "grad_norm": 0.21602974832057953,
+      "learning_rate": 0.0001870488567523318,
+      "loss": 0.8154,
+      "step": 1440
+    },
+    {
+      "epoch": 2.48068669527897,
+      "grad_norm": 0.2094966620206833,
+      "learning_rate": 0.0001869008656881357,
+      "loss": 0.8197,
+      "step": 1445
+    },
+    {
+      "epoch": 2.4892703862660945,
+      "grad_norm": 0.21330519020557404,
+      "learning_rate": 0.00018675209311753185,
+      "loss": 0.8325,
+      "step": 1450
+    },
+    {
+      "epoch": 2.4978540772532187,
+      "grad_norm": 0.22934697568416595,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 0.8238,
+      "step": 1455
+    },
+    {
+      "epoch": 2.5064377682403434,
+      "grad_norm": 0.23202557861804962,
+      "learning_rate": 0.00018645220881581144,
+      "loss": 0.8277,
+      "step": 1460
+    },
+    {
+      "epoch": 2.5150214592274676,
+      "grad_norm": 0.22423741221427917,
+      "learning_rate": 0.0001863010997815783,
+      "loss": 0.8205,
+      "step": 1465
+    },
+    {
+      "epoch": 2.523605150214592,
+      "grad_norm": 0.2139664888381958,
+      "learning_rate": 0.00018614921463468002,
+      "loss": 0.833,
+      "step": 1470
+    },
+    {
+      "epoch": 2.532188841201717,
+      "grad_norm": 0.22042877972126007,
+      "learning_rate": 0.00018599655474103182,
+      "loss": 0.8281,
+      "step": 1475
+    },
+    {
+      "epoch": 2.540772532188841,
+      "grad_norm": 0.21639470756053925,
+      "learning_rate": 0.0001858431214735163,
+      "loss": 0.8353,
+      "step": 1480
+    },
+    {
+      "epoch": 2.5493562231759657,
+      "grad_norm": 0.21406595408916473,
+      "learning_rate": 0.00018568891621197103,
+      "loss": 0.8351,
+      "step": 1485
+    },
+    {
+      "epoch": 2.5579399141630903,
+      "grad_norm": 0.20837725698947906,
+      "learning_rate": 0.00018553394034317622,
+      "loss": 0.8251,
+      "step": 1490
+    },
+    {
+      "epoch": 2.5665236051502145,
+      "grad_norm": 0.21612149477005005,
+      "learning_rate": 0.0001853781952608422,
+      "loss": 0.846,
+      "step": 1495
+    },
+    {
+      "epoch": 2.575107296137339,
+      "grad_norm": 0.21887291967868805,
+      "learning_rate": 0.00018522168236559695,
+      "loss": 0.8388,
+      "step": 1500
+    },
+    {
+      "epoch": 2.5836909871244638,
+      "grad_norm": 0.20973001420497894,
+      "learning_rate": 0.00018506440306497335,
+      "loss": 0.839,
+      "step": 1505
+    },
+    {
+      "epoch": 2.592274678111588,
+      "grad_norm": 0.21462783217430115,
+      "learning_rate": 0.00018490635877339666,
+      "loss": 0.8276,
+      "step": 1510
+    },
+    {
+      "epoch": 2.6008583690987126,
+      "grad_norm": 0.210985004901886,
+      "learning_rate": 0.00018474755091217186,
+      "loss": 0.8221,
+      "step": 1515
+    },
+    {
+      "epoch": 2.609442060085837,
+      "grad_norm": 0.20986580848693848,
+      "learning_rate": 0.00018458798090947065,
+      "loss": 0.8234,
+      "step": 1520
+    },
+    {
+      "epoch": 2.6180257510729614,
+      "grad_norm": 0.22892533242702484,
+      "learning_rate": 0.00018442765020031877,
+      "loss": 0.8242,
+      "step": 1525
+    },
+    {
+      "epoch": 2.6266094420600856,
+      "grad_norm": 0.2284938395023346,
+      "learning_rate": 0.0001842665602265831,
+      "loss": 0.8161,
+      "step": 1530
+    },
+    {
+      "epoch": 2.6351931330472103,
+      "grad_norm": 0.22317782044410706,
+      "learning_rate": 0.00018410471243695856,
+      "loss": 0.8284,
+      "step": 1535
+    },
+    {
+      "epoch": 2.643776824034335,
+      "grad_norm": 0.21049915254116058,
+      "learning_rate": 0.00018394210828695523,
+      "loss": 0.8183,
+      "step": 1540
+    },
+    {
+      "epoch": 2.652360515021459,
+      "grad_norm": 0.21207213401794434,
+      "learning_rate": 0.0001837787492388852,
+      "loss": 0.8287,
+      "step": 1545
+    },
+    {
+      "epoch": 2.6609442060085837,
+      "grad_norm": 0.2118200659751892,
+      "learning_rate": 0.0001836146367618494,
+      "loss": 0.8204,
+      "step": 1550
+    },
+    {
+      "epoch": 2.6695278969957084,
+      "grad_norm": 0.22095955908298492,
+      "learning_rate": 0.00018344977233172437,
+      "loss": 0.8335,
+      "step": 1555
+    },
+    {
+      "epoch": 2.6781115879828326,
+      "grad_norm": 0.21252469718456268,
+      "learning_rate": 0.00018328415743114912,
+      "loss": 0.8191,
+      "step": 1560
+    },
+    {
+      "epoch": 2.686695278969957,
+      "grad_norm": 0.20323017239570618,
+      "learning_rate": 0.0001831177935495116,
+      "loss": 0.8231,
+      "step": 1565
+    },
+    {
+      "epoch": 2.6952789699570814,
+      "grad_norm": 0.21805858612060547,
+      "learning_rate": 0.00018295068218293547,
+      "loss": 0.8341,
+      "step": 1570
+    },
+    {
+      "epoch": 2.703862660944206,
+      "grad_norm": 0.21013419330120087,
+      "learning_rate": 0.00018278282483426658,
+      "loss": 0.839,
+      "step": 1575
+    },
+    {
+      "epoch": 2.71244635193133,
+      "grad_norm": 0.21768461167812347,
+      "learning_rate": 0.0001826142230130594,
+      "loss": 0.8356,
+      "step": 1580
+    },
+    {
+      "epoch": 2.721030042918455,
+      "grad_norm": 0.21069899201393127,
+      "learning_rate": 0.00018244487823556357,
+      "loss": 0.8188,
+      "step": 1585
+    },
+    {
+      "epoch": 2.7296137339055795,
+      "grad_norm": 0.21788835525512695,
+      "learning_rate": 0.00018227479202471015,
+      "loss": 0.8408,
+      "step": 1590
+    },
+    {
+      "epoch": 2.7381974248927037,
+      "grad_norm": 0.21380050480365753,
+      "learning_rate": 0.00018210396591009795,
+      "loss": 0.8358,
+      "step": 1595
+    },
+    {
+      "epoch": 2.7467811158798283,
+      "grad_norm": 0.21521276235580444,
+      "learning_rate": 0.00018193240142797988,
+      "loss": 0.8328,
+      "step": 1600
+    },
+    {
+      "epoch": 2.755364806866953,
+      "grad_norm": 0.20885252952575684,
+      "learning_rate": 0.000181760100121249,
+      "loss": 0.8238,
+      "step": 1605
+    },
+    {
+      "epoch": 2.763948497854077,
+      "grad_norm": 0.21117731928825378,
+      "learning_rate": 0.00018158706353942463,
+      "loss": 0.8301,
+      "step": 1610
+    },
+    {
+      "epoch": 2.772532188841202,
+      "grad_norm": 0.22012095153331757,
+      "learning_rate": 0.0001814132932386386,
+      "loss": 0.8357,
+      "step": 1615
+    },
+    {
+      "epoch": 2.7811158798283264,
+      "grad_norm": 0.22017072141170502,
+      "learning_rate": 0.00018123879078162097,
+      "loss": 0.8323,
+      "step": 1620
+    },
+    {
+      "epoch": 2.7896995708154506,
+      "grad_norm": 0.2259422242641449,
+      "learning_rate": 0.00018106355773768638,
+      "loss": 0.848,
+      "step": 1625
+    },
+    {
+      "epoch": 2.7982832618025753,
+      "grad_norm": 0.21191255748271942,
+      "learning_rate": 0.0001808875956827194,
+      "loss": 0.823,
+      "step": 1630
+    },
+    {
+      "epoch": 2.8068669527896994,
+      "grad_norm": 0.21371833980083466,
+      "learning_rate": 0.00018071090619916093,
+      "loss": 0.8194,
+      "step": 1635
+    },
+    {
+      "epoch": 2.815450643776824,
+      "grad_norm": 0.22189456224441528,
+      "learning_rate": 0.00018053349087599353,
+      "loss": 0.8329,
+      "step": 1640
+    },
+    {
+      "epoch": 2.8240343347639483,
+      "grad_norm": 0.20956319570541382,
+      "learning_rate": 0.00018035535130872732,
+      "loss": 0.8293,
+      "step": 1645
+    },
+    {
+      "epoch": 2.832618025751073,
+      "grad_norm": 0.21734033524990082,
+      "learning_rate": 0.0001801764890993856,
+      "loss": 0.8334,
+      "step": 1650
+    },
+    {
+      "epoch": 2.8412017167381975,
+      "grad_norm": 0.2138412892818451,
+      "learning_rate": 0.00017999690585649052,
+      "loss": 0.8354,
+      "step": 1655
+    },
+    {
+      "epoch": 2.8497854077253217,
+      "grad_norm": 0.21562372148036957,
+      "learning_rate": 0.00017981660319504845,
+      "loss": 0.8384,
+      "step": 1660
+    },
+    {
+      "epoch": 2.8583690987124464,
+      "grad_norm": 0.21281686425209045,
+      "learning_rate": 0.0001796355827365356,
+      "loss": 0.8312,
+      "step": 1665
+    },
+    {
+      "epoch": 2.866952789699571,
+      "grad_norm": 0.21461673080921173,
+      "learning_rate": 0.00017945384610888341,
+      "loss": 0.8344,
+      "step": 1670
+    },
+    {
+      "epoch": 2.875536480686695,
+      "grad_norm": 0.20743022859096527,
+      "learning_rate": 0.00017927139494646377,
+      "loss": 0.8215,
+      "step": 1675
+    },
+    {
+      "epoch": 2.88412017167382,
+      "grad_norm": 0.21129368245601654,
+      "learning_rate": 0.00017908823089007457,
+      "loss": 0.8274,
+      "step": 1680
+    },
+    {
+      "epoch": 2.8927038626609445,
+      "grad_norm": 0.2333795428276062,
+      "learning_rate": 0.00017890435558692475,
+      "loss": 0.8307,
+      "step": 1685
+    },
+    {
+      "epoch": 2.9012875536480687,
+      "grad_norm": 0.21824228763580322,
+      "learning_rate": 0.0001787197706906196,
+      "loss": 0.8498,
+      "step": 1690
+    },
+    {
+      "epoch": 2.909871244635193,
+      "grad_norm": 0.21459732949733734,
+      "learning_rate": 0.0001785344778611457,
+      "loss": 0.8265,
+      "step": 1695
+    },
+    {
+      "epoch": 2.9184549356223175,
+      "grad_norm": 0.20637935400009155,
+      "learning_rate": 0.00017834847876485629,
+      "loss": 0.8309,
+      "step": 1700
+    },
+    {
+      "epoch": 2.927038626609442,
+      "grad_norm": 0.2137777954339981,
+      "learning_rate": 0.0001781617750744561,
+      "loss": 0.8345,
+      "step": 1705
+    },
+    {
+      "epoch": 2.9356223175965663,
+      "grad_norm": 0.23476457595825195,
+      "learning_rate": 0.00017797436846898619,
+      "loss": 0.8335,
+      "step": 1710
+    },
+    {
+      "epoch": 2.944206008583691,
+      "grad_norm": 0.20995980501174927,
+      "learning_rate": 0.00017778626063380917,
+      "loss": 0.8209,
+      "step": 1715
+    },
+    {
+      "epoch": 2.9527896995708156,
+      "grad_norm": 0.2296920269727707,
+      "learning_rate": 0.00017759745326059379,
+      "loss": 0.8426,
+      "step": 1720
+    },
+    {
+      "epoch": 2.96137339055794,
+      "grad_norm": 0.20545101165771484,
+      "learning_rate": 0.00017740794804729969,
+      "loss": 0.8324,
+      "step": 1725
+    },
+    {
+      "epoch": 2.9699570815450644,
+      "grad_norm": 0.21105705201625824,
+      "learning_rate": 0.00017721774669816252,
+      "loss": 0.8212,
+      "step": 1730
+    },
+    {
+      "epoch": 2.978540772532189,
+      "grad_norm": 0.21741057932376862,
+      "learning_rate": 0.000177026850923678,
+      "loss": 0.8333,
+      "step": 1735
+    },
+    {
+      "epoch": 2.9871244635193133,
+      "grad_norm": 0.22390629351139069,
+      "learning_rate": 0.00017683526244058716,
+      "loss": 0.8364,
+      "step": 1740
+    },
+    {
+      "epoch": 2.995708154506438,
+      "grad_norm": 0.21623565256595612,
+      "learning_rate": 0.00017664298297186042,
+      "loss": 0.8255,
+      "step": 1745
+    },
+    {
+      "epoch": 2.9991416309012875,
+      "eval_loss": 2.1085665225982666,
+      "eval_runtime": 0.3945,
+      "eval_samples_per_second": 15.208,
+      "eval_steps_per_second": 2.535,
+      "step": 1747
+    },
+    {
+      "epoch": 3.004291845493562,
+      "grad_norm": 0.21203316748142242,
+      "learning_rate": 0.00017645001424668237,
+      "loss": 0.7739,
+      "step": 1750
+    },
+    {
+      "epoch": 3.0128755364806867,
+      "grad_norm": 0.23032112419605255,
+      "learning_rate": 0.00017625635800043617,
+      "loss": 0.741,
+      "step": 1755
+    },
+    {
+      "epoch": 3.0214592274678114,
+      "grad_norm": 0.24847178161144257,
+      "learning_rate": 0.00017606201597468782,
+      "loss": 0.7348,
+      "step": 1760
+    },
+    {
+      "epoch": 3.0300429184549356,
+      "grad_norm": 0.24480335414409637,
+      "learning_rate": 0.00017586698991717064,
+      "loss": 0.7212,
+      "step": 1765
+    },
+    {
+      "epoch": 3.03862660944206,
+      "grad_norm": 0.24489726126194,
+      "learning_rate": 0.00017567128158176953,
+      "loss": 0.7312,
+      "step": 1770
+    },
+    {
+      "epoch": 3.0472103004291844,
+      "grad_norm": 0.24028155207633972,
+      "learning_rate": 0.00017547489272850511,
+      "loss": 0.7271,
+      "step": 1775
+    },
+    {
+      "epoch": 3.055793991416309,
+      "grad_norm": 0.24730311334133148,
+      "learning_rate": 0.00017527782512351804,
+      "loss": 0.7344,
+      "step": 1780
+    },
+    {
+      "epoch": 3.0643776824034337,
+      "grad_norm": 0.23651528358459473,
+      "learning_rate": 0.00017508008053905295,
+      "loss": 0.7297,
+      "step": 1785
+    },
+    {
+      "epoch": 3.072961373390558,
+      "grad_norm": 0.23505684733390808,
+      "learning_rate": 0.0001748816607534426,
+      "loss": 0.7214,
+      "step": 1790
+    },
+    {
+      "epoch": 3.0815450643776825,
+      "grad_norm": 0.2424248605966568,
+      "learning_rate": 0.00017468256755109199,
+      "loss": 0.721,
+      "step": 1795
+    },
+    {
+      "epoch": 3.0901287553648067,
+      "grad_norm": 0.243468776345253,
+      "learning_rate": 0.00017448280272246212,
+      "loss": 0.7203,
+      "step": 1800
+    },
+    {
+      "epoch": 3.0987124463519313,
+      "grad_norm": 0.2453926056623459,
+      "learning_rate": 0.000174282368064054,
+      "loss": 0.7326,
+      "step": 1805
+    },
+    {
+      "epoch": 3.107296137339056,
+      "grad_norm": 0.24548988044261932,
+      "learning_rate": 0.00017408126537839252,
+      "loss": 0.7345,
+      "step": 1810
+    },
+    {
+      "epoch": 3.11587982832618,
+      "grad_norm": 0.2359829694032669,
+      "learning_rate": 0.00017387949647401012,
+      "loss": 0.748,
+      "step": 1815
+    },
+    {
+      "epoch": 3.124463519313305,
+      "grad_norm": 0.25862741470336914,
+      "learning_rate": 0.00017367706316543063,
+      "loss": 0.7338,
+      "step": 1820
+    },
+    {
+      "epoch": 3.133047210300429,
+      "grad_norm": 0.25267720222473145,
+      "learning_rate": 0.00017347396727315296,
+      "loss": 0.7403,
+      "step": 1825
+    },
+    {
+      "epoch": 3.1416309012875536,
+      "grad_norm": 0.2458384782075882,
+      "learning_rate": 0.00017327021062363458,
+      "loss": 0.7432,
+      "step": 1830
+    },
+    {
+      "epoch": 3.1502145922746783,
+      "grad_norm": 0.2578388750553131,
+      "learning_rate": 0.0001730657950492753,
+      "loss": 0.7447,
+      "step": 1835
+    },
+    {
+      "epoch": 3.1587982832618025,
+      "grad_norm": 0.23755429685115814,
+      "learning_rate": 0.00017286072238840067,
+      "loss": 0.7389,
+      "step": 1840
+    },
+    {
+      "epoch": 3.167381974248927,
+      "grad_norm": 0.24692735075950623,
+      "learning_rate": 0.0001726549944852455,
+      "loss": 0.7584,
+      "step": 1845
+    },
+    {
+      "epoch": 3.1759656652360517,
+      "grad_norm": 0.24396221339702606,
+      "learning_rate": 0.00017244861318993713,
+      "loss": 0.7386,
+      "step": 1850
+    },
+    {
+      "epoch": 3.184549356223176,
+      "grad_norm": 0.25548049807548523,
+      "learning_rate": 0.00017224158035847905,
+      "loss": 0.738,
+      "step": 1855
+    },
+    {
+      "epoch": 3.1931330472103006,
+      "grad_norm": 0.2472919523715973,
+      "learning_rate": 0.000172033897852734,
+      "loss": 0.7519,
+      "step": 1860
+    },
+    {
+      "epoch": 3.2017167381974247,
+      "grad_norm": 0.245948925614357,
+      "learning_rate": 0.0001718255675404073,
+      "loss": 0.7461,
+      "step": 1865
+    },
+    {
+      "epoch": 3.2103004291845494,
+      "grad_norm": 0.2513918876647949,
+      "learning_rate": 0.00017161659129503003,
+      "loss": 0.7458,
+      "step": 1870
+    },
+    {
+      "epoch": 3.218884120171674,
+      "grad_norm": 0.24049414694309235,
+      "learning_rate": 0.0001714069709959422,
+      "loss": 0.7344,
+      "step": 1875
+    },
+    {
+      "epoch": 3.227467811158798,
+      "grad_norm": 0.25180676579475403,
+      "learning_rate": 0.00017119670852827588,
+      "loss": 0.7378,
+      "step": 1880
+    },
+    {
+      "epoch": 3.236051502145923,
+      "grad_norm": 0.2704819440841675,
+      "learning_rate": 0.0001709858057829382,
+      "loss": 0.7491,
+      "step": 1885
+    },
+    {
+      "epoch": 3.244635193133047,
+      "grad_norm": 0.2382296621799469,
+      "learning_rate": 0.00017077426465659433,
+      "loss": 0.7433,
+      "step": 1890
+    },
+    {
+      "epoch": 3.2532188841201717,
+      "grad_norm": 0.25334346294403076,
+      "learning_rate": 0.00017056208705165045,
+      "loss": 0.7505,
+      "step": 1895
+    },
+    {
+      "epoch": 3.2618025751072963,
+      "grad_norm": 0.2550380527973175,
+      "learning_rate": 0.0001703492748762367,
+      "loss": 0.7531,
+      "step": 1900
+    },
+    {
+      "epoch": 3.2703862660944205,
+      "grad_norm": 0.257135808467865,
+      "learning_rate": 0.00017013583004418993,
+      "loss": 0.7453,
+      "step": 1905
+    },
+    {
+      "epoch": 3.278969957081545,
+      "grad_norm": 0.27000248432159424,
+      "learning_rate": 0.0001699217544750365,
+      "loss": 0.7512,
+      "step": 1910
+    },
+    {
+      "epoch": 3.2875536480686693,
+      "grad_norm": 0.25268518924713135,
+      "learning_rate": 0.00016970705009397504,
+      "loss": 0.7397,
+      "step": 1915
+    },
+    {
+      "epoch": 3.296137339055794,
+      "grad_norm": 0.26630303263664246,
+      "learning_rate": 0.00016949171883185918,
+      "loss": 0.7436,
+      "step": 1920
+    },
+    {
+      "epoch": 3.3047210300429186,
+      "grad_norm": 0.24609267711639404,
+      "learning_rate": 0.0001692757626251801,
+      "loss": 0.7402,
+      "step": 1925
+    },
+    {
+      "epoch": 3.313304721030043,
+      "grad_norm": 0.26151612401008606,
+      "learning_rate": 0.00016905918341604922,
+      "loss": 0.7424,
+      "step": 1930
+    },
+    {
+      "epoch": 3.3218884120171674,
+      "grad_norm": 0.2529394030570984,
+      "learning_rate": 0.00016884198315218055,
+      "loss": 0.7566,
+      "step": 1935
+    },
+    {
+      "epoch": 3.3304721030042916,
+      "grad_norm": 0.2545251250267029,
+      "learning_rate": 0.0001686241637868734,
+      "loss": 0.762,
+      "step": 1940
+    },
+    {
+      "epoch": 3.3390557939914163,
+      "grad_norm": 0.249998539686203,
+      "learning_rate": 0.00016840572727899462,
+      "loss": 0.7399,
+      "step": 1945
+    },
+    {
+      "epoch": 3.347639484978541,
+      "grad_norm": 0.24969734251499176,
+      "learning_rate": 0.0001681866755929612,
+      "loss": 0.748,
+      "step": 1950
+    },
+    {
+      "epoch": 3.356223175965665,
+      "grad_norm": 0.262955904006958,
+      "learning_rate": 0.00016796701069872238,
+      "loss": 0.754,
+      "step": 1955
+    },
+    {
+      "epoch": 3.3648068669527897,
+      "grad_norm": 0.2457767277956009,
+      "learning_rate": 0.00016774673457174206,
+      "loss": 0.7443,
+      "step": 1960
+    },
+    {
+      "epoch": 3.3733905579399144,
+      "grad_norm": 0.2644675374031067,
+      "learning_rate": 0.00016752584919298093,
+      "loss": 0.7519,
+      "step": 1965
+    },
+    {
+      "epoch": 3.3819742489270386,
+      "grad_norm": 0.2620808482170105,
+      "learning_rate": 0.0001673043565488789,
+      "loss": 0.7556,
+      "step": 1970
+    },
+    {
+      "epoch": 3.390557939914163,
+      "grad_norm": 0.2510511577129364,
+      "learning_rate": 0.00016708225863133693,
+      "loss": 0.7556,
+      "step": 1975
+    },
+    {
+      "epoch": 3.3991416309012874,
+      "grad_norm": 0.2542615830898285,
+      "learning_rate": 0.0001668595574376992,
+      "loss": 0.7502,
+      "step": 1980
+    },
+    {
+      "epoch": 3.407725321888412,
+      "grad_norm": 0.25436341762542725,
+      "learning_rate": 0.0001666362549707354,
+      "loss": 0.7505,
+      "step": 1985
+    },
+    {
+      "epoch": 3.4163090128755362,
+      "grad_norm": 0.24043235182762146,
+      "learning_rate": 0.00016641235323862236,
+      "loss": 0.7433,
+      "step": 1990
+    },
+    {
+      "epoch": 3.424892703862661,
+      "grad_norm": 0.25933003425598145,
+      "learning_rate": 0.00016618785425492617,
+      "loss": 0.7595,
+      "step": 1995
+    },
+    {
+      "epoch": 3.4334763948497855,
+      "grad_norm": 0.24922600388526917,
+      "learning_rate": 0.00016596276003858412,
+      "loss": 0.7489,
+      "step": 2000
+    },
+    {
+      "epoch": 3.4420600858369097,
+      "grad_norm": 0.23722489178180695,
+      "learning_rate": 0.0001657370726138864,
+      "loss": 0.7447,
+      "step": 2005
+    },
+    {
+      "epoch": 3.4506437768240343,
+      "grad_norm": 0.273787260055542,
+      "learning_rate": 0.000165510794010458,
+      "loss": 0.75,
+      "step": 2010
+    },
+    {
+      "epoch": 3.459227467811159,
+      "grad_norm": 0.23869618773460388,
+      "learning_rate": 0.0001652839262632404,
+      "loss": 0.7463,
+      "step": 2015
+    },
+    {
+      "epoch": 3.467811158798283,
+      "grad_norm": 0.264568030834198,
+      "learning_rate": 0.0001650564714124734,
+      "loss": 0.7566,
+      "step": 2020
+    },
+    {
+      "epoch": 3.476394849785408,
+      "grad_norm": 0.2636789083480835,
+      "learning_rate": 0.0001648284315036765,
+      "loss": 0.749,
+      "step": 2025
+    },
+    {
+      "epoch": 3.484978540772532,
+      "grad_norm": 0.24988381564617157,
+      "learning_rate": 0.0001645998085876308,
+      "loss": 0.7523,
+      "step": 2030
+    },
+    {
+      "epoch": 3.4935622317596566,
+      "grad_norm": 0.26084083318710327,
+      "learning_rate": 0.00016437060472036046,
+      "loss": 0.7541,
+      "step": 2035
+    },
+    {
+      "epoch": 3.5021459227467813,
+      "grad_norm": 0.2548128664493561,
+      "learning_rate": 0.000164140821963114,
+      "loss": 0.7593,
+      "step": 2040
+    },
+    {
+      "epoch": 3.5107296137339055,
+      "grad_norm": 0.2459845244884491,
+      "learning_rate": 0.00016391046238234616,
+      "loss": 0.7485,
+      "step": 2045
+    },
+    {
+      "epoch": 3.51931330472103,
+      "grad_norm": 0.26049911975860596,
+      "learning_rate": 0.00016367952804969895,
+      "loss": 0.7492,
+      "step": 2050
+    },
+    {
+      "epoch": 3.5278969957081543,
+      "grad_norm": 0.2775178551673889,
+      "learning_rate": 0.00016344802104198324,
+      "loss": 0.7534,
+      "step": 2055
+    },
+    {
+      "epoch": 3.536480686695279,
+      "grad_norm": 0.26411354541778564,
+      "learning_rate": 0.00016321594344115997,
+      "loss": 0.7597,
+      "step": 2060
+    },
+    {
+      "epoch": 3.5450643776824036,
+      "grad_norm": 0.26565033197402954,
+      "learning_rate": 0.00016298329733432153,
+      "loss": 0.7659,
+      "step": 2065
+    },
+    {
+      "epoch": 3.5536480686695278,
+      "grad_norm": 0.2576376795768738,
+      "learning_rate": 0.00016275008481367287,
+      "loss": 0.7632,
+      "step": 2070
+    },
+    {
+      "epoch": 3.5622317596566524,
+      "grad_norm": 0.25273096561431885,
+      "learning_rate": 0.00016251630797651276,
+      "loss": 0.7604,
+      "step": 2075
+    },
+    {
+      "epoch": 3.570815450643777,
+      "grad_norm": 0.25641995668411255,
+      "learning_rate": 0.0001622819689252149,
+      "loss": 0.7575,
+      "step": 2080
+    },
+    {
+      "epoch": 3.5793991416309012,
+      "grad_norm": 0.24130167067050934,
+      "learning_rate": 0.0001620470697672091,
+      "loss": 0.7512,
+      "step": 2085
+    },
+    {
+      "epoch": 3.587982832618026,
+      "grad_norm": 0.2561969757080078,
+      "learning_rate": 0.00016181161261496216,
+      "loss": 0.7555,
+      "step": 2090
+    },
+    {
+      "epoch": 3.59656652360515,
+      "grad_norm": 0.2512848675251007,
+      "learning_rate": 0.000161575599585959,
+      "loss": 0.7556,
+      "step": 2095
+    },
+    {
+      "epoch": 3.6051502145922747,
+      "grad_norm": 0.2550983130931854,
+      "learning_rate": 0.00016133903280268362,
+      "loss": 0.7673,
+      "step": 2100
+    },
+    {
+      "epoch": 3.613733905579399,
+      "grad_norm": 0.2565702795982361,
+      "learning_rate": 0.00016110191439259997,
+      "loss": 0.7662,
+      "step": 2105
+    },
+    {
+      "epoch": 3.6223175965665235,
+      "grad_norm": 0.24696961045265198,
+      "learning_rate": 0.00016086424648813273,
+      "loss": 0.742,
+      "step": 2110
+    },
+    {
+      "epoch": 3.630901287553648,
+      "grad_norm": 0.2504982054233551,
+      "learning_rate": 0.00016062603122664833,
+      "loss": 0.7514,
+      "step": 2115
+    },
+    {
+      "epoch": 3.6394849785407724,
+      "grad_norm": 0.2710771858692169,
+      "learning_rate": 0.00016038727075043562,
+      "loss": 0.7681,
+      "step": 2120
+    },
+    {
+      "epoch": 3.648068669527897,
+      "grad_norm": 0.26008063554763794,
+      "learning_rate": 0.0001601479672066865,
+      "loss": 0.7655,
+      "step": 2125
+    },
+    {
+      "epoch": 3.6566523605150216,
+      "grad_norm": 0.25275853276252747,
+      "learning_rate": 0.00015990812274747692,
+      "loss": 0.761,
+      "step": 2130
+    },
+    {
+      "epoch": 3.665236051502146,
+      "grad_norm": 0.26592886447906494,
+      "learning_rate": 0.00015966773952974715,
+      "loss": 0.7529,
+      "step": 2135
+    },
+    {
+      "epoch": 3.6738197424892705,
+      "grad_norm": 0.24650226533412933,
+      "learning_rate": 0.0001594268197152826,
+      "loss": 0.7538,
+      "step": 2140
+    },
+    {
+      "epoch": 3.682403433476395,
+      "grad_norm": 0.25551459193229675,
+      "learning_rate": 0.00015918536547069435,
+      "loss": 0.7719,
+      "step": 2145
+    },
+    {
+      "epoch": 3.6909871244635193,
+      "grad_norm": 0.24978633224964142,
+      "learning_rate": 0.0001589433789673997,
+      "loss": 0.7545,
+      "step": 2150
+    },
+    {
+      "epoch": 3.699570815450644,
+      "grad_norm": 0.2471482902765274,
+      "learning_rate": 0.0001587008623816025,
+      "loss": 0.749,
+      "step": 2155
+    },
+    {
+      "epoch": 3.708154506437768,
+      "grad_norm": 0.24815160036087036,
+      "learning_rate": 0.00015845781789427377,
+      "loss": 0.7506,
+      "step": 2160
+    },
+    {
+      "epoch": 3.7167381974248928,
+      "grad_norm": 0.26376664638519287,
+      "learning_rate": 0.00015821424769113193,
+      "loss": 0.7569,
+      "step": 2165
+    },
+    {
+      "epoch": 3.725321888412017,
+      "grad_norm": 0.25356897711753845,
+      "learning_rate": 0.0001579701539626232,
+      "loss": 0.7707,
+      "step": 2170
+    },
+    {
+      "epoch": 3.7339055793991416,
+      "grad_norm": 0.25035008788108826,
+      "learning_rate": 0.00015772553890390197,
+      "loss": 0.76,
+      "step": 2175
+    },
+    {
+      "epoch": 3.742489270386266,
+      "grad_norm": 0.2481870949268341,
+      "learning_rate": 0.0001574804047148109,
+      "loss": 0.7521,
+      "step": 2180
+    },
+    {
+      "epoch": 3.7510729613733904,
+      "grad_norm": 0.25502651929855347,
+      "learning_rate": 0.00015723475359986127,
+      "loss": 0.7713,
+      "step": 2185
+    },
+    {
+      "epoch": 3.759656652360515,
+      "grad_norm": 0.25871410965919495,
+      "learning_rate": 0.0001569885877682132,
+      "loss": 0.7668,
+      "step": 2190
+    },
+    {
+      "epoch": 3.7682403433476397,
+      "grad_norm": 0.25364378094673157,
+      "learning_rate": 0.00015674190943365556,
+      "loss": 0.754,
+      "step": 2195
+    },
+    {
+      "epoch": 3.776824034334764,
+      "grad_norm": 0.2515285909175873,
+      "learning_rate": 0.0001564947208145863,
+      "loss": 0.7689,
+      "step": 2200
+    },
+    {
+      "epoch": 3.7854077253218885,
+      "grad_norm": 0.24017582833766937,
+      "learning_rate": 0.00015624702413399231,
+      "loss": 0.7718,
+      "step": 2205
+    },
+    {
+      "epoch": 3.7939914163090127,
+      "grad_norm": 0.26583361625671387,
+      "learning_rate": 0.00015599882161942966,
+      "loss": 0.7668,
+      "step": 2210
+    },
+    {
+      "epoch": 3.8025751072961373,
+      "grad_norm": 0.2555334270000458,
+      "learning_rate": 0.00015575011550300323,
+      "loss": 0.7507,
+      "step": 2215
+    },
+    {
+      "epoch": 3.8111587982832615,
+      "grad_norm": 0.2571168839931488,
+      "learning_rate": 0.000155500908021347,
+      "loss": 0.752,
+      "step": 2220
+    },
+    {
+      "epoch": 3.819742489270386,
+      "grad_norm": 0.26001662015914917,
+      "learning_rate": 0.0001552512014156037,
+      "loss": 0.7633,
+      "step": 2225
+    },
+    {
+      "epoch": 3.828326180257511,
+      "grad_norm": 0.25432994961738586,
+      "learning_rate": 0.00015500099793140475,
+      "loss": 0.758,
+      "step": 2230
+    },
+    {
+      "epoch": 3.836909871244635,
+      "grad_norm": 0.24695011973381042,
+      "learning_rate": 0.00015475029981884992,
+      "loss": 0.7674,
+      "step": 2235
+    },
+    {
+      "epoch": 3.8454935622317596,
+      "grad_norm": 0.24964170157909393,
+      "learning_rate": 0.00015449910933248743,
+      "loss": 0.7611,
+      "step": 2240
+    },
+    {
+      "epoch": 3.8540772532188843,
+      "grad_norm": 0.2555537223815918,
+      "learning_rate": 0.00015424742873129324,
+      "loss": 0.7657,
+      "step": 2245
+    },
+    {
+      "epoch": 3.8626609442060085,
+      "grad_norm": 0.27087053656578064,
+      "learning_rate": 0.00015399526027865107,
+      "loss": 0.7645,
+      "step": 2250
+    },
+    {
+      "epoch": 3.871244635193133,
+      "grad_norm": 0.2602386176586151,
+      "learning_rate": 0.00015374260624233195,
+      "loss": 0.7604,
+      "step": 2255
+    },
+    {
+      "epoch": 3.8798283261802577,
+      "grad_norm": 0.24609869718551636,
+      "learning_rate": 0.00015348946889447368,
+      "loss": 0.7596,
+      "step": 2260
+    },
+    {
+      "epoch": 3.888412017167382,
+      "grad_norm": 0.2546113133430481,
+      "learning_rate": 0.0001532358505115607,
+      "loss": 0.7594,
+      "step": 2265
+    },
+    {
+      "epoch": 3.8969957081545066,
+      "grad_norm": 0.25102177262306213,
+      "learning_rate": 0.0001529817533744032,
+      "loss": 0.7606,
+      "step": 2270
+    },
+    {
+      "epoch": 3.9055793991416308,
+      "grad_norm": 0.2544861435890198,
+      "learning_rate": 0.00015272717976811708,
+      "loss": 0.7535,
+      "step": 2275
+    },
+    {
+      "epoch": 3.9141630901287554,
+      "grad_norm": 0.2670022249221802,
+      "learning_rate": 0.000152472131982103,
+      "loss": 0.7609,
+      "step": 2280
+    },
+    {
+      "epoch": 3.9227467811158796,
+      "grad_norm": 0.2539633810520172,
+      "learning_rate": 0.00015221661231002605,
+      "loss": 0.7556,
+      "step": 2285
+    },
+    {
+      "epoch": 3.9313304721030042,
+      "grad_norm": 0.2580619156360626,
+      "learning_rate": 0.00015196062304979497,
+      "loss": 0.7717,
+      "step": 2290
+    },
+    {
+      "epoch": 3.939914163090129,
+      "grad_norm": 0.24921616911888123,
+      "learning_rate": 0.00015170416650354157,
+      "loss": 0.7642,
+      "step": 2295
+    },
+    {
+      "epoch": 3.948497854077253,
+      "grad_norm": 0.2630159556865692,
+      "learning_rate": 0.00015144724497760003,
+      "loss": 0.7522,
+      "step": 2300
+    },
+    {
+      "epoch": 3.9570815450643777,
+      "grad_norm": 0.2687895894050598,
+      "learning_rate": 0.00015118986078248612,
+      "loss": 0.7623,
+      "step": 2305
+    },
+    {
+      "epoch": 3.9656652360515023,
+      "grad_norm": 0.27267009019851685,
+      "learning_rate": 0.00015093201623287631,
+      "loss": 0.7755,
+      "step": 2310
+    },
+    {
+      "epoch": 3.9742489270386265,
+      "grad_norm": 0.2584875524044037,
+      "learning_rate": 0.00015067371364758727,
+      "loss": 0.7582,
+      "step": 2315
+    },
+    {
+      "epoch": 3.982832618025751,
+      "grad_norm": 0.25810128450393677,
+      "learning_rate": 0.00015041495534955467,
+      "loss": 0.7738,
+      "step": 2320
+    },
+    {
+      "epoch": 3.991416309012876,
+      "grad_norm": 0.42070698738098145,
+      "learning_rate": 0.00015015574366581257,
+      "loss": 0.7609,
+      "step": 2325
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 0.2550402581691742,
+      "learning_rate": 0.0001498960809274722,
+      "loss": 0.7584,
+      "step": 2330
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 2.2541472911834717,
+      "eval_runtime": 0.3945,
+      "eval_samples_per_second": 15.209,
+      "eval_steps_per_second": 2.535,
+      "step": 2330
+    },
+    {
+      "epoch": 4.008583690987124,
+      "grad_norm": 0.34891819953918457,
+      "learning_rate": 0.00014963596946970128,
+      "loss": 0.6641,
+      "step": 2335
+    },
+    {
+      "epoch": 4.017167381974249,
+      "grad_norm": 0.26585909724235535,
+      "learning_rate": 0.0001493754116317029,
+      "loss": 0.657,
+      "step": 2340
+    },
+    {
+      "epoch": 4.0257510729613735,
+      "grad_norm": 0.3037340044975281,
+      "learning_rate": 0.00014911440975669447,
+      "loss": 0.6571,
+      "step": 2345
+    },
+    {
+      "epoch": 4.034334763948498,
+      "grad_norm": 0.3071967363357544,
+      "learning_rate": 0.00014885296619188658,
+      "loss": 0.6607,
+      "step": 2350
+    },
+    {
+      "epoch": 4.042918454935623,
+      "grad_norm": 0.28058749437332153,
+      "learning_rate": 0.00014859108328846204,
+      "loss": 0.6558,
+      "step": 2355
+    },
+    {
+      "epoch": 4.051502145922747,
+      "grad_norm": 0.27246275544166565,
+      "learning_rate": 0.00014832876340155476,
+      "loss": 0.6464,
+      "step": 2360
+    },
+    {
+      "epoch": 4.060085836909871,
+      "grad_norm": 0.30583012104034424,
+      "learning_rate": 0.00014806600889022824,
+      "loss": 0.6602,
+      "step": 2365
+    },
+    {
+      "epoch": 4.068669527896995,
+      "grad_norm": 0.2768241763114929,
+      "learning_rate": 0.0001478028221174548,
+      "loss": 0.6601,
+      "step": 2370
+    },
+    {
+      "epoch": 4.07725321888412,
+      "grad_norm": 0.2900557219982147,
+      "learning_rate": 0.00014753920545009408,
+      "loss": 0.6612,
+      "step": 2375
+    },
+    {
+      "epoch": 4.085836909871245,
+      "grad_norm": 0.28369641304016113,
+      "learning_rate": 0.00014727516125887175,
+      "loss": 0.6497,
+      "step": 2380
+    },
+    {
+      "epoch": 4.094420600858369,
+      "grad_norm": 0.2893315553665161,
+      "learning_rate": 0.0001470106919183582,
+      "loss": 0.6615,
+      "step": 2385
+    },
+    {
+      "epoch": 4.103004291845494,
+      "grad_norm": 0.2940422296524048,
+      "learning_rate": 0.00014674579980694736,
+      "loss": 0.6583,
+      "step": 2390
+    },
+    {
+      "epoch": 4.111587982832618,
+      "grad_norm": 0.28732019662857056,
+      "learning_rate": 0.00014648048730683507,
+      "loss": 0.6717,
+      "step": 2395
+    },
+    {
+      "epoch": 4.120171673819742,
+      "grad_norm": 0.31053388118743896,
+      "learning_rate": 0.0001462147568039977,
+      "loss": 0.654,
+      "step": 2400
+    },
+    {
+      "epoch": 4.128755364806867,
+      "grad_norm": 0.29770082235336304,
+      "learning_rate": 0.00014594861068817095,
+      "loss": 0.6641,
+      "step": 2405
+    },
+    {
+      "epoch": 4.1373390557939915,
+      "grad_norm": 0.28309738636016846,
+      "learning_rate": 0.00014568205135282795,
+      "loss": 0.6715,
+      "step": 2410
+    },
+    {
+      "epoch": 4.145922746781116,
+      "grad_norm": 0.29497766494750977,
+      "learning_rate": 0.00014541508119515808,
+      "loss": 0.6622,
+      "step": 2415
+    },
+    {
+      "epoch": 4.154506437768241,
+      "grad_norm": 0.3100745975971222,
+      "learning_rate": 0.00014514770261604522,
+      "loss": 0.6714,
+      "step": 2420
+    },
+    {
+      "epoch": 4.163090128755365,
+      "grad_norm": 0.29295915365219116,
+      "learning_rate": 0.00014487991802004623,
+      "loss": 0.6617,
+      "step": 2425
+    },
+    {
+      "epoch": 4.171673819742489,
+      "grad_norm": 0.2942890524864197,
+      "learning_rate": 0.0001446117298153693,
+      "loss": 0.6627,
+      "step": 2430
+    },
+    {
+      "epoch": 4.180257510729613,
+      "grad_norm": 0.2925000786781311,
+      "learning_rate": 0.0001443431404138524,
+      "loss": 0.6747,
+      "step": 2435
+    },
+    {
+      "epoch": 4.1888412017167385,
+      "grad_norm": 0.31883692741394043,
+      "learning_rate": 0.00014407415223094132,
+      "loss": 0.6645,
+      "step": 2440
+    },
+    {
+      "epoch": 4.197424892703863,
+      "grad_norm": 0.29616445302963257,
+      "learning_rate": 0.00014380476768566824,
+      "loss": 0.6653,
+      "step": 2445
+    },
+    {
+      "epoch": 4.206008583690987,
+      "grad_norm": 0.2958747148513794,
+      "learning_rate": 0.00014353498920062987,
+      "loss": 0.6663,
+      "step": 2450
+    },
+    {
+      "epoch": 4.214592274678112,
+      "grad_norm": 0.2944903075695038,
+      "learning_rate": 0.00014326481920196556,
+      "loss": 0.6584,
+      "step": 2455
+    },
+    {
+      "epoch": 4.223175965665236,
+      "grad_norm": 0.30497610569000244,
+      "learning_rate": 0.00014299426011933568,
+      "loss": 0.6734,
+      "step": 2460
+    },
+    {
+      "epoch": 4.23175965665236,
+      "grad_norm": 0.2795341908931732,
+      "learning_rate": 0.0001427233143858996,
+      "loss": 0.6664,
+      "step": 2465
+    },
+    {
+      "epoch": 4.240343347639485,
+      "grad_norm": 0.2952185273170471,
+      "learning_rate": 0.00014245198443829383,
+      "loss": 0.675,
+      "step": 2470
+    },
+    {
+      "epoch": 4.24892703862661,
+      "grad_norm": 0.29675596952438354,
+      "learning_rate": 0.0001421802727166103,
+      "loss": 0.6726,
+      "step": 2475
+    },
+    {
+      "epoch": 4.257510729613734,
+      "grad_norm": 0.2960766553878784,
+      "learning_rate": 0.0001419081816643741,
+      "loss": 0.6657,
+      "step": 2480
+    },
+    {
+      "epoch": 4.266094420600858,
+      "grad_norm": 0.2973078191280365,
+      "learning_rate": 0.00014163571372852177,
+      "loss": 0.6781,
+      "step": 2485
+    },
+    {
+      "epoch": 4.274678111587983,
+      "grad_norm": 0.293087363243103,
+      "learning_rate": 0.00014136287135937915,
+      "loss": 0.6715,
+      "step": 2490
+    },
+    {
+      "epoch": 4.283261802575107,
+      "grad_norm": 0.30738070607185364,
+      "learning_rate": 0.00014108965701063942,
+      "loss": 0.6692,
+      "step": 2495
+    },
+    {
+      "epoch": 4.291845493562231,
+      "grad_norm": 0.29339906573295593,
+      "learning_rate": 0.0001408160731393409,
+      "loss": 0.6632,
+      "step": 2500
+    },
+    {
+      "epoch": 4.3004291845493565,
+      "grad_norm": 0.3105657696723938,
+      "learning_rate": 0.00014054212220584525,
+      "loss": 0.6768,
+      "step": 2505
+    },
+    {
+      "epoch": 4.309012875536481,
+      "grad_norm": 0.29471009969711304,
+      "learning_rate": 0.00014026780667381498,
+      "loss": 0.6703,
+      "step": 2510
+    },
+    {
+      "epoch": 4.317596566523605,
+      "grad_norm": 0.30494722723960876,
+      "learning_rate": 0.0001399931290101915,
+      "loss": 0.6725,
+      "step": 2515
+    },
+    {
+      "epoch": 4.32618025751073,
+      "grad_norm": 0.2980051040649414,
+      "learning_rate": 0.00013971809168517298,
+      "loss": 0.6652,
+      "step": 2520
+    },
+    {
+      "epoch": 4.334763948497854,
+      "grad_norm": 0.2986336350440979,
+      "learning_rate": 0.00013944269717219198,
+      "loss": 0.6579,
+      "step": 2525
+    },
+    {
+      "epoch": 4.343347639484978,
+      "grad_norm": 0.2928684949874878,
+      "learning_rate": 0.00013916694794789325,
+      "loss": 0.6797,
+      "step": 2530
+    },
+    {
+      "epoch": 4.3519313304721035,
+      "grad_norm": 0.30945298075675964,
+      "learning_rate": 0.00013889084649211156,
+      "loss": 0.6781,
+      "step": 2535
+    },
+    {
+      "epoch": 4.360515021459228,
+      "grad_norm": 0.29915961623191833,
+      "learning_rate": 0.0001386143952878493,
+      "loss": 0.6802,
+      "step": 2540
+    },
+    {
+      "epoch": 4.369098712446352,
+      "grad_norm": 0.309627503156662,
+      "learning_rate": 0.0001383375968212542,
+      "loss": 0.6728,
+      "step": 2545
+    },
+    {
+      "epoch": 4.377682403433476,
+      "grad_norm": 0.292521595954895,
+      "learning_rate": 0.00013806045358159683,
+      "loss": 0.6739,
+      "step": 2550
+    },
+    {
+      "epoch": 4.386266094420601,
+      "grad_norm": 0.30214038491249084,
+      "learning_rate": 0.00013778296806124852,
+      "loss": 0.6821,
+      "step": 2555
+    },
+    {
+      "epoch": 4.394849785407725,
+      "grad_norm": 0.30407920479774475,
+      "learning_rate": 0.0001375051427556586,
+      "loss": 0.6723,
+      "step": 2560
+    },
+    {
+      "epoch": 4.4034334763948495,
+      "grad_norm": 0.3059447109699249,
+      "learning_rate": 0.00013722698016333218,
+      "loss": 0.6787,
+      "step": 2565
+    },
+    {
+      "epoch": 4.412017167381975,
+      "grad_norm": 0.2976439893245697,
+      "learning_rate": 0.00013694848278580763,
+      "loss": 0.6675,
+      "step": 2570
+    },
+    {
+      "epoch": 4.420600858369099,
+      "grad_norm": 0.30373451113700867,
+      "learning_rate": 0.00013666965312763405,
+      "loss": 0.6743,
+      "step": 2575
+    },
+    {
+      "epoch": 4.429184549356223,
+      "grad_norm": 0.2918217182159424,
+      "learning_rate": 0.00013639049369634876,
+      "loss": 0.6753,
+      "step": 2580
+    },
+    {
+      "epoch": 4.437768240343348,
+      "grad_norm": 0.2900952696800232,
+      "learning_rate": 0.00013611100700245478,
+      "loss": 0.6732,
+      "step": 2585
+    },
+    {
+      "epoch": 4.446351931330472,
+      "grad_norm": 0.307317852973938,
+      "learning_rate": 0.00013583119555939815,
+      "loss": 0.6825,
+      "step": 2590
+    },
+    {
+      "epoch": 4.454935622317596,
+      "grad_norm": 0.298483282327652,
+      "learning_rate": 0.00013555106188354557,
+      "loss": 0.6744,
+      "step": 2595
+    },
+    {
+      "epoch": 4.463519313304721,
+      "grad_norm": 0.3024675250053406,
+      "learning_rate": 0.00013527060849416144,
+      "loss": 0.6786,
+      "step": 2600
+    },
+    {
+      "epoch": 4.472103004291846,
+      "grad_norm": 0.30732661485671997,
+      "learning_rate": 0.00013498983791338545,
+      "loss": 0.6745,
+      "step": 2605
+    },
+    {
+      "epoch": 4.48068669527897,
+      "grad_norm": 0.29272985458374023,
+      "learning_rate": 0.00013470875266620978,
+      "loss": 0.6782,
+      "step": 2610
+    },
+    {
+      "epoch": 4.489270386266094,
+      "grad_norm": 0.3057718873023987,
+      "learning_rate": 0.00013442735528045643,
+      "loss": 0.6844,
+      "step": 2615
+    },
+    {
+      "epoch": 4.497854077253219,
+      "grad_norm": 0.31071603298187256,
+      "learning_rate": 0.00013414564828675456,
+      "loss": 0.6827,
+      "step": 2620
+    },
+    {
+      "epoch": 4.506437768240343,
+      "grad_norm": 0.32284069061279297,
+      "learning_rate": 0.00013386363421851756,
+      "loss": 0.6809,
+      "step": 2625
+    },
+    {
+      "epoch": 4.515021459227468,
+      "grad_norm": 0.3142790198326111,
+      "learning_rate": 0.00013358131561192046,
+      "loss": 0.6859,
+      "step": 2630
+    },
+    {
+      "epoch": 4.523605150214593,
+      "grad_norm": 0.30643147230148315,
+      "learning_rate": 0.00013329869500587694,
+      "loss": 0.6742,
+      "step": 2635
+    },
+    {
+      "epoch": 4.532188841201717,
+      "grad_norm": 0.2984931468963623,
+      "learning_rate": 0.00013301577494201664,
+      "loss": 0.677,
+      "step": 2640
+    },
+    {
+      "epoch": 4.540772532188841,
+      "grad_norm": 0.30939051508903503,
+      "learning_rate": 0.00013273255796466223,
+      "loss": 0.6742,
+      "step": 2645
+    },
+    {
+      "epoch": 4.549356223175966,
+      "grad_norm": 0.3024001717567444,
+      "learning_rate": 0.00013244904662080653,
+      "loss": 0.6718,
+      "step": 2650
+    },
+    {
+      "epoch": 4.55793991416309,
+      "grad_norm": 0.294514536857605,
+      "learning_rate": 0.0001321652434600897,
+      "loss": 0.6787,
+      "step": 2655
+    },
+    {
+      "epoch": 4.5665236051502145,
+      "grad_norm": 0.2984825372695923,
+      "learning_rate": 0.00013188115103477606,
+      "loss": 0.6905,
+      "step": 2660
+    },
+    {
+      "epoch": 4.575107296137339,
+      "grad_norm": 0.298879474401474,
+      "learning_rate": 0.00013159677189973152,
+      "loss": 0.6882,
+      "step": 2665
+    },
+    {
+      "epoch": 4.583690987124464,
+      "grad_norm": 0.30825379490852356,
+      "learning_rate": 0.00013131210861240026,
+      "loss": 0.6752,
+      "step": 2670
+    },
+    {
+      "epoch": 4.592274678111588,
+      "grad_norm": 0.3206503093242645,
+      "learning_rate": 0.00013102716373278192,
+      "loss": 0.6827,
+      "step": 2675
+    },
+    {
+      "epoch": 4.600858369098712,
+      "grad_norm": 0.3049548268318176,
+      "learning_rate": 0.00013074193982340847,
+      "loss": 0.6871,
+      "step": 2680
+    },
+    {
+      "epoch": 4.609442060085837,
+      "grad_norm": 0.29572340846061707,
+      "learning_rate": 0.00013045643944932126,
+      "loss": 0.6796,
+      "step": 2685
+    },
+    {
+      "epoch": 4.618025751072961,
+      "grad_norm": 0.2995782792568207,
+      "learning_rate": 0.00013017066517804793,
+      "loss": 0.6827,
+      "step": 2690
+    },
+    {
+      "epoch": 4.626609442060086,
+      "grad_norm": 0.31238484382629395,
+      "learning_rate": 0.00012988461957957922,
+      "loss": 0.6755,
+      "step": 2695
+    },
+    {
+      "epoch": 4.63519313304721,
+      "grad_norm": 0.29801589250564575,
+      "learning_rate": 0.00012959830522634596,
+      "loss": 0.6825,
+      "step": 2700
+    },
+    {
+      "epoch": 4.643776824034335,
+      "grad_norm": 0.318678081035614,
+      "learning_rate": 0.000129311724693196,
+      "loss": 0.6898,
+      "step": 2705
+    },
+    {
+      "epoch": 4.652360515021459,
+      "grad_norm": 0.3065158426761627,
+      "learning_rate": 0.00012902488055737093,
+      "loss": 0.6765,
+      "step": 2710
+    },
+    {
+      "epoch": 4.660944206008583,
+      "grad_norm": 0.30059394240379333,
+      "learning_rate": 0.00012873777539848283,
+      "loss": 0.6938,
+      "step": 2715
+    },
+    {
+      "epoch": 4.669527896995708,
+      "grad_norm": 0.30598220229148865,
+      "learning_rate": 0.00012845041179849128,
+      "loss": 0.686,
+      "step": 2720
+    },
+    {
+      "epoch": 4.6781115879828326,
+      "grad_norm": 0.30764105916023254,
+      "learning_rate": 0.00012816279234168008,
+      "loss": 0.6886,
+      "step": 2725
+    },
+    {
+      "epoch": 4.686695278969957,
+      "grad_norm": 0.302541583776474,
+      "learning_rate": 0.0001278749196146339,
+      "loss": 0.6848,
+      "step": 2730
+    },
+    {
+      "epoch": 4.695278969957082,
+      "grad_norm": 0.2911517322063446,
+      "learning_rate": 0.00012758679620621503,
+      "loss": 0.6859,
+      "step": 2735
+    },
+    {
+      "epoch": 4.703862660944206,
+      "grad_norm": 0.2968668043613434,
+      "learning_rate": 0.00012729842470754032,
+      "loss": 0.68,
+      "step": 2740
+    },
+    {
+      "epoch": 4.71244635193133,
+      "grad_norm": 0.30458763241767883,
+      "learning_rate": 0.00012700980771195762,
+      "loss": 0.6785,
+      "step": 2745
+    },
+    {
+      "epoch": 4.721030042918455,
+      "grad_norm": 0.3035382032394409,
+      "learning_rate": 0.00012672094781502252,
+      "loss": 0.6896,
+      "step": 2750
+    },
+    {
+      "epoch": 4.7296137339055795,
+      "grad_norm": 0.2893989682197571,
+      "learning_rate": 0.00012643184761447512,
+      "loss": 0.6786,
+      "step": 2755
+    },
+    {
+      "epoch": 4.738197424892704,
+      "grad_norm": 0.3313075006008148,
+      "learning_rate": 0.00012614250971021657,
+      "loss": 0.6859,
+      "step": 2760
+    },
+    {
+      "epoch": 4.746781115879829,
+      "grad_norm": 0.3178950250148773,
+      "learning_rate": 0.00012585293670428564,
+      "loss": 0.6855,
+      "step": 2765
+    },
+    {
+      "epoch": 4.755364806866953,
+      "grad_norm": 0.28977376222610474,
+      "learning_rate": 0.00012556313120083546,
+      "loss": 0.6761,
+      "step": 2770
+    },
+    {
+      "epoch": 4.763948497854077,
+      "grad_norm": 0.30826762318611145,
+      "learning_rate": 0.00012527309580611005,
+      "loss": 0.677,
+      "step": 2775
+    },
+    {
+      "epoch": 4.772532188841201,
+      "grad_norm": 0.3121355473995209,
+      "learning_rate": 0.0001249828331284207,
+      "loss": 0.6854,
+      "step": 2780
+    },
+    {
+      "epoch": 4.781115879828326,
+      "grad_norm": 0.30394992232322693,
+      "learning_rate": 0.00012469234577812296,
+      "loss": 0.6927,
+      "step": 2785
+    },
+    {
+      "epoch": 4.789699570815451,
+      "grad_norm": 0.28112301230430603,
+      "learning_rate": 0.0001244016363675926,
+      "loss": 0.6917,
+      "step": 2790
+    },
+    {
+      "epoch": 4.798283261802575,
+      "grad_norm": 0.2926720678806305,
+      "learning_rate": 0.00012411070751120254,
+      "loss": 0.6703,
+      "step": 2795
+    },
+    {
+      "epoch": 4.8068669527897,
+      "grad_norm": 0.296567440032959,
+      "learning_rate": 0.00012381956182529918,
+      "loss": 0.6831,
+      "step": 2800
+    },
+    {
+      "epoch": 4.815450643776824,
+      "grad_norm": 0.29968711733818054,
+      "learning_rate": 0.00012352820192817877,
+      "loss": 0.68,
+      "step": 2805
+    },
+    {
+      "epoch": 4.824034334763948,
+      "grad_norm": 0.31971994042396545,
+      "learning_rate": 0.0001232366304400642,
+      "loss": 0.6936,
+      "step": 2810
+    },
+    {
+      "epoch": 4.8326180257510725,
+      "grad_norm": 0.29575350880622864,
+      "learning_rate": 0.000122944849983081,
+      "loss": 0.689,
+      "step": 2815
+    },
+    {
+      "epoch": 4.8412017167381975,
+      "grad_norm": 0.33436283469200134,
+      "learning_rate": 0.00012265286318123415,
+      "loss": 0.6905,
+      "step": 2820
+    },
+    {
+      "epoch": 4.849785407725322,
+      "grad_norm": 0.27712202072143555,
+      "learning_rate": 0.00012236067266038414,
+      "loss": 0.6856,
+      "step": 2825
+    },
+    {
+      "epoch": 4.858369098712446,
+      "grad_norm": 0.32512983679771423,
+      "learning_rate": 0.00012206828104822363,
+      "loss": 0.6906,
+      "step": 2830
+    },
+    {
+      "epoch": 4.866952789699571,
+      "grad_norm": 0.3128452003002167,
+      "learning_rate": 0.00012177569097425368,
+      "loss": 0.6814,
+      "step": 2835
+    },
+    {
+      "epoch": 4.875536480686695,
+      "grad_norm": 0.28740525245666504,
+      "learning_rate": 0.00012148290506976012,
+      "loss": 0.683,
+      "step": 2840
+    },
+    {
+      "epoch": 4.884120171673819,
+      "grad_norm": 0.3094848394393921,
+      "learning_rate": 0.00012118992596778995,
+      "loss": 0.6751,
+      "step": 2845
+    },
+    {
+      "epoch": 4.8927038626609445,
+      "grad_norm": 0.28997689485549927,
+      "learning_rate": 0.00012089675630312754,
+      "loss": 0.6918,
+      "step": 2850
+    },
+    {
+      "epoch": 4.901287553648069,
+      "grad_norm": 0.29608073830604553,
+      "learning_rate": 0.00012060339871227101,
+      "loss": 0.687,
+      "step": 2855
+    },
+    {
+      "epoch": 4.909871244635193,
+      "grad_norm": 0.2982884347438812,
+      "learning_rate": 0.00012030985583340861,
+      "loss": 0.6886,
+      "step": 2860
+    },
+    {
+      "epoch": 4.918454935622318,
+      "grad_norm": 0.31985583901405334,
+      "learning_rate": 0.00012001613030639478,
+      "loss": 0.6787,
+      "step": 2865
+    },
+    {
+      "epoch": 4.927038626609442,
+      "grad_norm": 0.31187719106674194,
+      "learning_rate": 0.00011972222477272663,
+      "loss": 0.6944,
+      "step": 2870
+    },
+    {
+      "epoch": 4.935622317596566,
+      "grad_norm": 0.28702715039253235,
+      "learning_rate": 0.00011942814187552005,
+      "loss": 0.6849,
+      "step": 2875
+    },
+    {
+      "epoch": 4.944206008583691,
+      "grad_norm": 0.28225070238113403,
+      "learning_rate": 0.00011913388425948584,
+      "loss": 0.6791,
+      "step": 2880
+    },
+    {
+      "epoch": 4.952789699570816,
+      "grad_norm": 0.3197003901004791,
+      "learning_rate": 0.00011883945457090633,
+      "loss": 0.6905,
+      "step": 2885
+    },
+    {
+      "epoch": 4.96137339055794,
+      "grad_norm": 0.30299967527389526,
+      "learning_rate": 0.00011854485545761108,
+      "loss": 0.6794,
+      "step": 2890
+    },
+    {
+      "epoch": 4.969957081545064,
+      "grad_norm": 0.3089299499988556,
+      "learning_rate": 0.00011825008956895338,
+      "loss": 0.6815,
+      "step": 2895
+    },
+    {
+      "epoch": 4.978540772532189,
+      "grad_norm": 0.31284767389297485,
+      "learning_rate": 0.0001179551595557864,
+      "loss": 0.6878,
+      "step": 2900
+    },
+    {
+      "epoch": 4.987124463519313,
+      "grad_norm": 0.2950330078601837,
+      "learning_rate": 0.00011766006807043921,
+      "loss": 0.6765,
+      "step": 2905
+    },
+    {
+      "epoch": 4.9957081545064375,
+      "grad_norm": 0.31097424030303955,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.6928,
+      "step": 2910
+    },
+    {
+      "epoch": 4.9991416309012875,
+      "eval_loss": 2.442364454269409,
+      "eval_runtime": 0.3946,
+      "eval_samples_per_second": 15.205,
+      "eval_steps_per_second": 2.534,
+      "step": 2912
+    },
+    {
+      "epoch": 5.0042918454935625,
+      "grad_norm": 0.24456113576889038,
+      "learning_rate": 0.00011706941129975741,
+      "loss": 0.6479,
+      "step": 2915
+    },
+    {
+      "epoch": 5.012875536480687,
+      "grad_norm": 0.3765704333782196,
+      "learning_rate": 0.00011677385132624621,
+      "loss": 0.5981,
+      "step": 2920
+    },
+    {
+      "epoch": 5.021459227467811,
+      "grad_norm": 0.30039721727371216,
+      "learning_rate": 0.00011647814050415378,
+      "loss": 0.5899,
+      "step": 2925
+    },
+    {
+      "epoch": 5.030042918454936,
+      "grad_norm": 0.3503376245498657,
+      "learning_rate": 0.00011618228149283117,
+      "loss": 0.586,
+      "step": 2930
+    },
+    {
+      "epoch": 5.03862660944206,
+      "grad_norm": 0.315405935049057,
+      "learning_rate": 0.000115886276952962,
+      "loss": 0.5842,
+      "step": 2935
+    },
+    {
+      "epoch": 5.047210300429184,
+      "grad_norm": 0.3637129068374634,
+      "learning_rate": 0.00011559012954653865,
+      "loss": 0.5894,
+      "step": 2940
+    },
+    {
+      "epoch": 5.055793991416309,
+      "grad_norm": 0.3133433759212494,
+      "learning_rate": 0.00011529384193683838,
+      "loss": 0.5889,
+      "step": 2945
+    },
+    {
+      "epoch": 5.064377682403434,
+      "grad_norm": 0.3363387882709503,
+      "learning_rate": 0.00011499741678839928,
+      "loss": 0.5992,
+      "step": 2950
+    },
+    {
+      "epoch": 5.072961373390558,
+      "grad_norm": 0.33112606406211853,
+      "learning_rate": 0.00011470085676699627,
+      "loss": 0.584,
+      "step": 2955
+    },
+    {
+      "epoch": 5.081545064377682,
+      "grad_norm": 0.33626094460487366,
+      "learning_rate": 0.00011440416453961728,
+      "loss": 0.5907,
+      "step": 2960
+    },
+    {
+      "epoch": 5.090128755364807,
+      "grad_norm": 0.32099804282188416,
+      "learning_rate": 0.00011410734277443915,
+      "loss": 0.5875,
+      "step": 2965
+    },
+    {
+      "epoch": 5.098712446351931,
+      "grad_norm": 0.33036282658576965,
+      "learning_rate": 0.00011381039414080365,
+      "loss": 0.5883,
+      "step": 2970
+    },
+    {
+      "epoch": 5.1072961373390555,
+      "grad_norm": 0.33885952830314636,
+      "learning_rate": 0.00011351332130919348,
+      "loss": 0.5857,
+      "step": 2975
+    },
+    {
+      "epoch": 5.115879828326181,
+      "grad_norm": 0.31977617740631104,
+      "learning_rate": 0.00011321612695120832,
+      "loss": 0.5855,
+      "step": 2980
+    },
+    {
+      "epoch": 5.124463519313305,
+      "grad_norm": 0.3337278664112091,
+      "learning_rate": 0.00011291881373954065,
+      "loss": 0.5969,
+      "step": 2985
+    },
+    {
+      "epoch": 5.133047210300429,
+      "grad_norm": 0.33656007051467896,
+      "learning_rate": 0.00011262138434795191,
+      "loss": 0.5811,
+      "step": 2990
+    },
+    {
+      "epoch": 5.141630901287554,
+      "grad_norm": 0.34739845991134644,
+      "learning_rate": 0.00011232384145124831,
+      "loss": 0.5932,
+      "step": 2995
+    },
+    {
+      "epoch": 5.150214592274678,
+      "grad_norm": 0.33286526799201965,
+      "learning_rate": 0.0001120261877252568,
+      "loss": 0.5983,
+      "step": 3000
+    },
+    {
+      "epoch": 5.1587982832618025,
+      "grad_norm": 0.3316696584224701,
+      "learning_rate": 0.00011172842584680107,
+      "loss": 0.5976,
+      "step": 3005
+    },
+    {
+      "epoch": 5.167381974248927,
+      "grad_norm": 0.3135989308357239,
+      "learning_rate": 0.00011143055849367738,
+      "loss": 0.5978,
+      "step": 3010
+    },
+    {
+      "epoch": 5.175965665236052,
+      "grad_norm": 0.3531875014305115,
+      "learning_rate": 0.00011113258834463063,
+      "loss": 0.5965,
+      "step": 3015
+    },
+    {
+      "epoch": 5.184549356223176,
+      "grad_norm": 0.3395566940307617,
+      "learning_rate": 0.00011083451807933008,
+      "loss": 0.5933,
+      "step": 3020
+    },
+    {
+      "epoch": 5.1931330472103,
+      "grad_norm": 0.3231445252895355,
+      "learning_rate": 0.00011053635037834532,
+      "loss": 0.6071,
+      "step": 3025
+    },
+    {
+      "epoch": 5.201716738197425,
+      "grad_norm": 0.33721092343330383,
+      "learning_rate": 0.00011023808792312227,
+      "loss": 0.6049,
+      "step": 3030
+    },
+    {
+      "epoch": 5.210300429184549,
+      "grad_norm": 0.32114890217781067,
+      "learning_rate": 0.00010993973339595896,
+      "loss": 0.6038,
+      "step": 3035
+    },
+    {
+      "epoch": 5.218884120171674,
+      "grad_norm": 0.3454916477203369,
+      "learning_rate": 0.00010964128947998142,
+      "loss": 0.5998,
+      "step": 3040
+    },
+    {
+      "epoch": 5.227467811158799,
+      "grad_norm": 0.31621354818344116,
+      "learning_rate": 0.00010934275885911956,
+      "loss": 0.5977,
+      "step": 3045
+    },
+    {
+      "epoch": 5.236051502145923,
+      "grad_norm": 0.3424486517906189,
+      "learning_rate": 0.00010904414421808303,
+      "loss": 0.5923,
+      "step": 3050
+    },
+    {
+      "epoch": 5.244635193133047,
+      "grad_norm": 0.3184787333011627,
+      "learning_rate": 0.00010874544824233705,
+      "loss": 0.5986,
+      "step": 3055
+    },
+    {
+      "epoch": 5.253218884120171,
+      "grad_norm": 0.3448573052883148,
+      "learning_rate": 0.00010844667361807842,
+      "loss": 0.5931,
+      "step": 3060
+    },
+    {
+      "epoch": 5.261802575107296,
+      "grad_norm": 0.314815878868103,
+      "learning_rate": 0.00010814782303221105,
+      "loss": 0.6008,
+      "step": 3065
+    },
+    {
+      "epoch": 5.2703862660944205,
+      "grad_norm": 0.3566027581691742,
+      "learning_rate": 0.00010784889917232206,
+      "loss": 0.585,
+      "step": 3070
+    },
+    {
+      "epoch": 5.278969957081545,
+      "grad_norm": 0.31585681438446045,
+      "learning_rate": 0.0001075499047266576,
+      "loss": 0.5927,
+      "step": 3075
+    },
+    {
+      "epoch": 5.28755364806867,
+      "grad_norm": 0.33874404430389404,
+      "learning_rate": 0.00010725084238409848,
+      "loss": 0.6047,
+      "step": 3080
+    },
+    {
+      "epoch": 5.296137339055794,
+      "grad_norm": 0.3274739682674408,
+      "learning_rate": 0.00010695171483413619,
+      "loss": 0.5919,
+      "step": 3085
+    },
+    {
+      "epoch": 5.304721030042918,
+      "grad_norm": 0.3326583802700043,
+      "learning_rate": 0.00010665252476684864,
+      "loss": 0.5982,
+      "step": 3090
+    },
+    {
+      "epoch": 5.313304721030043,
+      "grad_norm": 0.34962788224220276,
+      "learning_rate": 0.00010635327487287591,
+      "loss": 0.5999,
+      "step": 3095
+    },
+    {
+      "epoch": 5.3218884120171674,
+      "grad_norm": 0.33242544531822205,
+      "learning_rate": 0.00010605396784339612,
+      "loss": 0.6037,
+      "step": 3100
+    },
+    {
+      "epoch": 5.330472103004292,
+      "grad_norm": 0.38023149967193604,
+      "learning_rate": 0.00010575460637010128,
+      "loss": 0.6068,
+      "step": 3105
+    },
+    {
+      "epoch": 5.339055793991417,
+      "grad_norm": 0.3179317116737366,
+      "learning_rate": 0.00010545519314517291,
+      "loss": 0.5993,
+      "step": 3110
+    },
+    {
+      "epoch": 5.347639484978541,
+      "grad_norm": 0.343841016292572,
+      "learning_rate": 0.00010515573086125805,
+      "loss": 0.6029,
+      "step": 3115
+    },
+    {
+      "epoch": 5.356223175965665,
+      "grad_norm": 0.3552517890930176,
+      "learning_rate": 0.00010485622221144484,
+      "loss": 0.599,
+      "step": 3120
+    },
+    {
+      "epoch": 5.364806866952789,
+      "grad_norm": 0.3441089391708374,
+      "learning_rate": 0.00010455666988923842,
+      "loss": 0.598,
+      "step": 3125
+    },
+    {
+      "epoch": 5.373390557939914,
+      "grad_norm": 0.35372617840766907,
+      "learning_rate": 0.00010425707658853672,
+      "loss": 0.597,
+      "step": 3130
+    },
+    {
+      "epoch": 5.381974248927039,
+      "grad_norm": 0.33165809512138367,
+      "learning_rate": 0.00010395744500360612,
+      "loss": 0.6077,
+      "step": 3135
+    },
+    {
+      "epoch": 5.390557939914163,
+      "grad_norm": 0.3485681414604187,
+      "learning_rate": 0.00010365777782905735,
+      "loss": 0.5956,
+      "step": 3140
+    },
+    {
+      "epoch": 5.399141630901288,
+      "grad_norm": 0.3294559717178345,
+      "learning_rate": 0.00010335807775982116,
+      "loss": 0.6076,
+      "step": 3145
+    },
+    {
+      "epoch": 5.407725321888412,
+      "grad_norm": 0.3176850378513336,
+      "learning_rate": 0.00010305834749112421,
+      "loss": 0.594,
+      "step": 3150
+    },
+    {
+      "epoch": 5.416309012875536,
+      "grad_norm": 0.3580548167228699,
+      "learning_rate": 0.00010275858971846463,
+      "loss": 0.6029,
+      "step": 3155
+    },
+    {
+      "epoch": 5.424892703862661,
+      "grad_norm": 0.32710379362106323,
+      "learning_rate": 0.00010245880713758793,
+      "loss": 0.6063,
+      "step": 3160
+    },
+    {
+      "epoch": 5.4334763948497855,
+      "grad_norm": 0.37140390276908875,
+      "learning_rate": 0.00010215900244446279,
+      "loss": 0.6108,
+      "step": 3165
+    },
+    {
+      "epoch": 5.44206008583691,
+      "grad_norm": 0.3271103501319885,
+      "learning_rate": 0.00010185917833525669,
+      "loss": 0.6086,
+      "step": 3170
+    },
+    {
+      "epoch": 5.450643776824034,
+      "grad_norm": 0.3177226781845093,
+      "learning_rate": 0.00010155933750631172,
+      "loss": 0.5976,
+      "step": 3175
+    },
+    {
+      "epoch": 5.459227467811159,
+      "grad_norm": 0.33881354331970215,
+      "learning_rate": 0.00010125948265412033,
+      "loss": 0.5975,
+      "step": 3180
+    },
+    {
+      "epoch": 5.467811158798283,
+      "grad_norm": 0.35869210958480835,
+      "learning_rate": 0.0001009596164753011,
+      "loss": 0.6032,
+      "step": 3185
+    },
+    {
+      "epoch": 5.476394849785407,
+      "grad_norm": 0.34474968910217285,
+      "learning_rate": 0.00010065974166657448,
+      "loss": 0.6014,
+      "step": 3190
+    },
+    {
+      "epoch": 5.484978540772532,
+      "grad_norm": 0.3329688012599945,
+      "learning_rate": 0.00010035986092473847,
+      "loss": 0.6042,
+      "step": 3195
+    },
+    {
+      "epoch": 5.493562231759657,
+      "grad_norm": 0.34499508142471313,
+      "learning_rate": 0.00010005997694664451,
+      "loss": 0.5998,
+      "step": 3200
+    },
+    {
+      "epoch": 5.502145922746781,
+      "grad_norm": 0.3572762608528137,
+      "learning_rate": 9.976009242917307e-05,
+      "loss": 0.6031,
+      "step": 3205
+    },
+    {
+      "epoch": 5.510729613733906,
+      "grad_norm": 0.33951064944267273,
+      "learning_rate": 9.946021006920959e-05,
+      "loss": 0.6046,
+      "step": 3210
+    },
+    {
+      "epoch": 5.51931330472103,
+      "grad_norm": 0.3542894423007965,
+      "learning_rate": 9.91603325636199e-05,
+      "loss": 0.6063,
+      "step": 3215
+    },
+    {
+      "epoch": 5.527896995708154,
+      "grad_norm": 0.3496350646018982,
+      "learning_rate": 9.886046260922634e-05,
+      "loss": 0.6059,
+      "step": 3220
+    },
+    {
+      "epoch": 5.536480686695279,
+      "grad_norm": 0.36511626839637756,
+      "learning_rate": 9.856060290278337e-05,
+      "loss": 0.6011,
+      "step": 3225
+    },
+    {
+      "epoch": 5.545064377682404,
+      "grad_norm": 0.3340092897415161,
+      "learning_rate": 9.826075614095311e-05,
+      "loss": 0.6098,
+      "step": 3230
+    },
+    {
+      "epoch": 5.553648068669528,
+      "grad_norm": 0.35916590690612793,
+      "learning_rate": 9.796092502028145e-05,
+      "loss": 0.6047,
+      "step": 3235
+    },
+    {
+      "epoch": 5.562231759656653,
+      "grad_norm": 0.3391835689544678,
+      "learning_rate": 9.766111223717352e-05,
+      "loss": 0.61,
+      "step": 3240
+    },
+    {
+      "epoch": 5.570815450643777,
+      "grad_norm": 0.3280404508113861,
+      "learning_rate": 9.736132048786954e-05,
+      "loss": 0.6088,
+      "step": 3245
+    },
+    {
+      "epoch": 5.579399141630901,
+      "grad_norm": 0.31755104660987854,
+      "learning_rate": 9.706155246842062e-05,
+      "loss": 0.6051,
+      "step": 3250
+    },
+    {
+      "epoch": 5.587982832618025,
+      "grad_norm": 0.33683615922927856,
+      "learning_rate": 9.676181087466444e-05,
+      "loss": 0.6071,
+      "step": 3255
+    },
+    {
+      "epoch": 5.5965665236051505,
+      "grad_norm": 0.34338319301605225,
+      "learning_rate": 9.646209840220098e-05,
+      "loss": 0.6083,
+      "step": 3260
+    },
+    {
+      "epoch": 5.605150214592275,
+      "grad_norm": 0.35656723380088806,
+      "learning_rate": 9.616241774636845e-05,
+      "loss": 0.6032,
+      "step": 3265
+    },
+    {
+      "epoch": 5.613733905579399,
+      "grad_norm": 0.33608657121658325,
+      "learning_rate": 9.586277160221884e-05,
+      "loss": 0.6176,
+      "step": 3270
+    },
+    {
+      "epoch": 5.622317596566524,
+      "grad_norm": 0.35035207867622375,
+      "learning_rate": 9.556316266449377e-05,
+      "loss": 0.6037,
+      "step": 3275
+    },
+    {
+      "epoch": 5.630901287553648,
+      "grad_norm": 0.3345491290092468,
+      "learning_rate": 9.526359362760032e-05,
+      "loss": 0.6022,
+      "step": 3280
+    },
+    {
+      "epoch": 5.639484978540772,
+      "grad_norm": 0.3335653841495514,
+      "learning_rate": 9.496406718558665e-05,
+      "loss": 0.6091,
+      "step": 3285
+    },
+    {
+      "epoch": 5.6480686695278965,
+      "grad_norm": 0.3456047773361206,
+      "learning_rate": 9.466458603211796e-05,
+      "loss": 0.615,
+      "step": 3290
+    },
+    {
+      "epoch": 5.656652360515022,
+      "grad_norm": 0.3243827223777771,
+      "learning_rate": 9.436515286045214e-05,
+      "loss": 0.6058,
+      "step": 3295
+    },
+    {
+      "epoch": 5.665236051502146,
+      "grad_norm": 0.329574853181839,
+      "learning_rate": 9.406577036341548e-05,
+      "loss": 0.6054,
+      "step": 3300
+    },
+    {
+      "epoch": 5.67381974248927,
+      "grad_norm": 0.3418329954147339,
+      "learning_rate": 9.376644123337867e-05,
+      "loss": 0.6098,
+      "step": 3305
+    },
+    {
+      "epoch": 5.682403433476395,
+      "grad_norm": 0.36660462617874146,
+      "learning_rate": 9.346716816223245e-05,
+      "loss": 0.6187,
+      "step": 3310
+    },
+    {
+      "epoch": 5.690987124463519,
+      "grad_norm": 0.3241427540779114,
+      "learning_rate": 9.316795384136333e-05,
+      "loss": 0.6121,
+      "step": 3315
+    },
+    {
+      "epoch": 5.6995708154506435,
+      "grad_norm": 0.36070680618286133,
+      "learning_rate": 9.286880096162956e-05,
+      "loss": 0.6095,
+      "step": 3320
+    },
+    {
+      "epoch": 5.708154506437769,
+      "grad_norm": 0.33510082960128784,
+      "learning_rate": 9.256971221333685e-05,
+      "loss": 0.6126,
+      "step": 3325
+    },
+    {
+      "epoch": 5.716738197424893,
+      "grad_norm": 0.3640751838684082,
+      "learning_rate": 9.227069028621406e-05,
+      "loss": 0.6199,
+      "step": 3330
+    },
+    {
+      "epoch": 5.725321888412017,
+      "grad_norm": 0.34373095631599426,
+      "learning_rate": 9.197173786938926e-05,
+      "loss": 0.6091,
+      "step": 3335
+    },
+    {
+      "epoch": 5.733905579399142,
+      "grad_norm": 0.33689820766448975,
+      "learning_rate": 9.167285765136533e-05,
+      "loss": 0.6032,
+      "step": 3340
+    },
+    {
+      "epoch": 5.742489270386266,
+      "grad_norm": 0.33981624245643616,
+      "learning_rate": 9.137405231999593e-05,
+      "loss": 0.6069,
+      "step": 3345
+    },
+    {
+      "epoch": 5.75107296137339,
+      "grad_norm": 0.34156525135040283,
+      "learning_rate": 9.107532456246114e-05,
+      "loss": 0.6006,
+      "step": 3350
+    },
+    {
+      "epoch": 5.7596566523605155,
+      "grad_norm": 0.34921032190322876,
+      "learning_rate": 9.077667706524348e-05,
+      "loss": 0.6165,
+      "step": 3355
+    },
+    {
+      "epoch": 5.76824034334764,
+      "grad_norm": 0.3511927127838135,
+      "learning_rate": 9.047811251410376e-05,
+      "loss": 0.6149,
+      "step": 3360
+    },
+    {
+      "epoch": 5.776824034334764,
+      "grad_norm": 0.3529740571975708,
+      "learning_rate": 9.01796335940567e-05,
+      "loss": 0.6088,
+      "step": 3365
+    },
+    {
+      "epoch": 5.785407725321888,
+      "grad_norm": 0.33018723130226135,
+      "learning_rate": 8.9881242989347e-05,
+      "loss": 0.6089,
+      "step": 3370
+    },
+    {
+      "epoch": 5.793991416309013,
+      "grad_norm": 0.33075249195098877,
+      "learning_rate": 8.95829433834252e-05,
+      "loss": 0.6107,
+      "step": 3375
+    },
+    {
+      "epoch": 5.802575107296137,
+      "grad_norm": 0.35009533166885376,
+      "learning_rate": 8.928473745892339e-05,
+      "loss": 0.6092,
+      "step": 3380
+    },
+    {
+      "epoch": 5.8111587982832615,
+      "grad_norm": 0.337410569190979,
+      "learning_rate": 8.898662789763115e-05,
+      "loss": 0.6049,
+      "step": 3385
+    },
+    {
+      "epoch": 5.819742489270387,
+      "grad_norm": 0.332481324672699,
+      "learning_rate": 8.868861738047158e-05,
+      "loss": 0.625,
+      "step": 3390
+    },
+    {
+      "epoch": 5.828326180257511,
+      "grad_norm": 0.34015268087387085,
+      "learning_rate": 8.839070858747697e-05,
+      "loss": 0.615,
+      "step": 3395
+    },
+    {
+      "epoch": 5.836909871244635,
+      "grad_norm": 0.35073375701904297,
+      "learning_rate": 8.809290419776488e-05,
+      "loss": 0.6038,
+      "step": 3400
+    },
+    {
+      "epoch": 5.845493562231759,
+      "grad_norm": 0.3357756733894348,
+      "learning_rate": 8.779520688951383e-05,
+      "loss": 0.6077,
+      "step": 3405
+    },
+    {
+      "epoch": 5.854077253218884,
+      "grad_norm": 0.33854493498802185,
+      "learning_rate": 8.749761933993945e-05,
+      "loss": 0.6058,
+      "step": 3410
+    },
+    {
+      "epoch": 5.8626609442060085,
+      "grad_norm": 0.3343227505683899,
+      "learning_rate": 8.720014422527034e-05,
+      "loss": 0.6113,
+      "step": 3415
+    },
+    {
+      "epoch": 5.871244635193133,
+      "grad_norm": 0.35862088203430176,
+      "learning_rate": 8.690278422072384e-05,
+      "loss": 0.606,
+      "step": 3420
+    },
+    {
+      "epoch": 5.879828326180258,
+      "grad_norm": 0.37902429699897766,
+      "learning_rate": 8.660554200048215e-05,
+      "loss": 0.6108,
+      "step": 3425
+    },
+    {
+      "epoch": 5.888412017167382,
+      "grad_norm": 0.3685941994190216,
+      "learning_rate": 8.630842023766831e-05,
+      "loss": 0.6138,
+      "step": 3430
+    },
+    {
+      "epoch": 5.896995708154506,
+      "grad_norm": 0.3437183201313019,
+      "learning_rate": 8.601142160432193e-05,
+      "loss": 0.6156,
+      "step": 3435
+    },
+    {
+      "epoch": 5.905579399141631,
+      "grad_norm": 0.34756121039390564,
+      "learning_rate": 8.571454877137539e-05,
+      "loss": 0.6076,
+      "step": 3440
+    },
+    {
+      "epoch": 5.914163090128755,
+      "grad_norm": 0.33176571130752563,
+      "learning_rate": 8.541780440862977e-05,
+      "loss": 0.6065,
+      "step": 3445
+    },
+    {
+      "epoch": 5.92274678111588,
+      "grad_norm": 0.3526177704334259,
+      "learning_rate": 8.512119118473067e-05,
+      "loss": 0.6125,
+      "step": 3450
+    },
+    {
+      "epoch": 5.931330472103005,
+      "grad_norm": 0.3659914433956146,
+      "learning_rate": 8.482471176714454e-05,
+      "loss": 0.6095,
+      "step": 3455
+    },
+    {
+      "epoch": 5.939914163090129,
+      "grad_norm": 0.34757936000823975,
+      "learning_rate": 8.45283688221344e-05,
+      "loss": 0.6134,
+      "step": 3460
+    },
+    {
+      "epoch": 5.948497854077253,
+      "grad_norm": 0.34589987993240356,
+      "learning_rate": 8.423216501473585e-05,
+      "loss": 0.6074,
+      "step": 3465
+    },
+    {
+      "epoch": 5.957081545064378,
+      "grad_norm": 0.34556832909584045,
+      "learning_rate": 8.393610300873345e-05,
+      "loss": 0.609,
+      "step": 3470
+    },
+    {
+      "epoch": 5.965665236051502,
+      "grad_norm": 0.3431447148323059,
+      "learning_rate": 8.364018546663634e-05,
+      "loss": 0.6157,
+      "step": 3475
+    },
+    {
+      "epoch": 5.9742489270386265,
+      "grad_norm": 0.35311102867126465,
+      "learning_rate": 8.334441504965455e-05,
+      "loss": 0.6143,
+      "step": 3480
+    },
+    {
+      "epoch": 5.982832618025751,
+      "grad_norm": 0.3422008156776428,
+      "learning_rate": 8.304879441767504e-05,
+      "loss": 0.6046,
+      "step": 3485
+    },
+    {
+      "epoch": 5.991416309012876,
+      "grad_norm": 0.3697339594364166,
+      "learning_rate": 8.275332622923769e-05,
+      "loss": 0.6141,
+      "step": 3490
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 0.332302987575531,
+      "learning_rate": 8.245801314151139e-05,
+      "loss": 0.6102,
+      "step": 3495
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 2.7089035511016846,
+      "eval_runtime": 0.3939,
+      "eval_samples_per_second": 15.232,
+      "eval_steps_per_second": 2.539,
+      "step": 3495
+    },
+    {
+      "epoch": 6.008583690987124,
+      "grad_norm": 0.3818419575691223,
+      "learning_rate": 8.216285781027036e-05,
+      "loss": 0.5334,
+      "step": 3500
+    },
+    {
+      "epoch": 6.017167381974249,
+      "grad_norm": 0.35173356533050537,
+      "learning_rate": 8.186786288986992e-05,
+      "loss": 0.5409,
+      "step": 3505
+    },
+    {
+      "epoch": 6.0257510729613735,
+      "grad_norm": 0.348001092672348,
+      "learning_rate": 8.157303103322296e-05,
+      "loss": 0.5294,
+      "step": 3510
+    },
+    {
+      "epoch": 6.034334763948498,
+      "grad_norm": 0.38012921810150146,
+      "learning_rate": 8.127836489177584e-05,
+      "loss": 0.5289,
+      "step": 3515
+    },
+    {
+      "epoch": 6.042918454935623,
+      "grad_norm": 0.3508910536766052,
+      "learning_rate": 8.098386711548458e-05,
+      "loss": 0.5375,
+      "step": 3520
+    },
+    {
+      "epoch": 6.051502145922747,
+      "grad_norm": 0.3329296410083771,
+      "learning_rate": 8.068954035279121e-05,
+      "loss": 0.5298,
+      "step": 3525
+    },
+    {
+      "epoch": 6.060085836909871,
+      "grad_norm": 0.3630905747413635,
+      "learning_rate": 8.039538725059976e-05,
+      "loss": 0.5349,
+      "step": 3530
+    },
+    {
+      "epoch": 6.068669527896995,
+      "grad_norm": 0.32074230909347534,
+      "learning_rate": 8.010141045425244e-05,
+      "loss": 0.5376,
+      "step": 3535
+    },
+    {
+      "epoch": 6.07725321888412,
+      "grad_norm": 0.3476736545562744,
+      "learning_rate": 7.980761260750607e-05,
+      "loss": 0.5279,
+      "step": 3540
+    },
+    {
+      "epoch": 6.085836909871245,
+      "grad_norm": 0.340426504611969,
+      "learning_rate": 7.951399635250806e-05,
+      "loss": 0.5323,
+      "step": 3545
+    },
+    {
+      "epoch": 6.094420600858369,
+      "grad_norm": 0.3367501497268677,
+      "learning_rate": 7.922056432977275e-05,
+      "loss": 0.5486,
+      "step": 3550
+    },
+    {
+      "epoch": 6.103004291845494,
+      "grad_norm": 0.3827115595340729,
+      "learning_rate": 7.892731917815774e-05,
+      "loss": 0.528,
+      "step": 3555
+    },
+    {
+      "epoch": 6.111587982832618,
+      "grad_norm": 0.33026212453842163,
+      "learning_rate": 7.863426353484002e-05,
+      "loss": 0.5303,
+      "step": 3560
+    },
+    {
+      "epoch": 6.120171673819742,
+      "grad_norm": 0.3674776256084442,
+      "learning_rate": 7.834140003529238e-05,
+      "loss": 0.5355,
+      "step": 3565
+    },
+    {
+      "epoch": 6.128755364806867,
+      "grad_norm": 0.3238033056259155,
+      "learning_rate": 7.804873131325954e-05,
+      "loss": 0.5382,
+      "step": 3570
+    },
+    {
+      "epoch": 6.1373390557939915,
+      "grad_norm": 0.3594464063644409,
+      "learning_rate": 7.775626000073463e-05,
+      "loss": 0.5319,
+      "step": 3575
+    },
+    {
+      "epoch": 6.145922746781116,
+      "grad_norm": 0.3666881322860718,
+      "learning_rate": 7.74639887279355e-05,
+      "loss": 0.5387,
+      "step": 3580
+    },
+    {
+      "epoch": 6.154506437768241,
+      "grad_norm": 0.34055057168006897,
+      "learning_rate": 7.7171920123281e-05,
+      "loss": 0.5296,
+      "step": 3585
+    },
+    {
+      "epoch": 6.163090128755365,
+      "grad_norm": 0.3717866837978363,
+      "learning_rate": 7.688005681336729e-05,
+      "loss": 0.5371,
+      "step": 3590
+    },
+    {
+      "epoch": 6.171673819742489,
+      "grad_norm": 0.3607926368713379,
+      "learning_rate": 7.658840142294444e-05,
+      "loss": 0.5446,
+      "step": 3595
+    },
+    {
+      "epoch": 6.180257510729613,
+      "grad_norm": 0.3458268940448761,
+      "learning_rate": 7.629695657489257e-05,
+      "loss": 0.5393,
+      "step": 3600
+    },
+    {
+      "epoch": 6.1888412017167385,
+      "grad_norm": 0.3347238600254059,
+      "learning_rate": 7.600572489019842e-05,
+      "loss": 0.5254,
+      "step": 3605
+    },
+    {
+      "epoch": 6.197424892703863,
+      "grad_norm": 0.3612549901008606,
+      "learning_rate": 7.571470898793173e-05,
+      "loss": 0.5275,
+      "step": 3610
+    },
+    {
+      "epoch": 6.206008583690987,
+      "grad_norm": 0.3335767686367035,
+      "learning_rate": 7.542391148522168e-05,
+      "loss": 0.5304,
+      "step": 3615
+    },
+    {
+      "epoch": 6.214592274678112,
+      "grad_norm": 0.3704369068145752,
+      "learning_rate": 7.513333499723343e-05,
+      "loss": 0.5404,
+      "step": 3620
+    },
+    {
+      "epoch": 6.223175965665236,
+      "grad_norm": 0.3659282922744751,
+      "learning_rate": 7.484298213714442e-05,
+      "loss": 0.5357,
+      "step": 3625
+    },
+    {
+      "epoch": 6.23175965665236,
+      "grad_norm": 0.34985628724098206,
+      "learning_rate": 7.455285551612105e-05,
+      "loss": 0.5411,
+      "step": 3630
+    },
+    {
+      "epoch": 6.240343347639485,
+      "grad_norm": 0.3511213958263397,
+      "learning_rate": 7.426295774329524e-05,
+      "loss": 0.5361,
+      "step": 3635
+    },
+    {
+      "epoch": 6.24892703862661,
+      "grad_norm": 0.36643365025520325,
+      "learning_rate": 7.397329142574063e-05,
+      "loss": 0.5256,
+      "step": 3640
+    },
+    {
+      "epoch": 6.257510729613734,
+      "grad_norm": 0.3581530451774597,
+      "learning_rate": 7.36838591684495e-05,
+      "loss": 0.5364,
+      "step": 3645
+    },
+    {
+      "epoch": 6.266094420600858,
+      "grad_norm": 0.3588225245475769,
+      "learning_rate": 7.339466357430928e-05,
+      "loss": 0.5433,
+      "step": 3650
+    },
+    {
+      "epoch": 6.274678111587983,
+      "grad_norm": 0.3423435389995575,
+      "learning_rate": 7.310570724407892e-05,
+      "loss": 0.5481,
+      "step": 3655
+    },
+    {
+      "epoch": 6.283261802575107,
+      "grad_norm": 0.37222719192504883,
+      "learning_rate": 7.281699277636572e-05,
+      "loss": 0.5418,
+      "step": 3660
+    },
+    {
+      "epoch": 6.291845493562231,
+      "grad_norm": 0.37203842401504517,
+      "learning_rate": 7.252852276760193e-05,
+      "loss": 0.5401,
+      "step": 3665
+    },
+    {
+      "epoch": 6.3004291845493565,
+      "grad_norm": 0.3703368306159973,
+      "learning_rate": 7.224029981202122e-05,
+      "loss": 0.5381,
+      "step": 3670
+    },
+    {
+      "epoch": 6.309012875536481,
+      "grad_norm": 0.36878761649131775,
+      "learning_rate": 7.195232650163575e-05,
+      "loss": 0.5257,
+      "step": 3675
+    },
+    {
+      "epoch": 6.317596566523605,
+      "grad_norm": 0.3529357314109802,
+      "learning_rate": 7.16646054262123e-05,
+      "loss": 0.542,
+      "step": 3680
+    },
+    {
+      "epoch": 6.32618025751073,
+      "grad_norm": 0.3673364520072937,
+      "learning_rate": 7.137713917324945e-05,
+      "loss": 0.5351,
+      "step": 3685
+    },
+    {
+      "epoch": 6.334763948497854,
+      "grad_norm": 0.3675953149795532,
+      "learning_rate": 7.108993032795418e-05,
+      "loss": 0.5455,
+      "step": 3690
+    },
+    {
+      "epoch": 6.343347639484978,
+      "grad_norm": 0.3677637279033661,
+      "learning_rate": 7.080298147321844e-05,
+      "loss": 0.5352,
+      "step": 3695
+    },
+    {
+      "epoch": 6.3519313304721035,
+      "grad_norm": 0.36320698261260986,
+      "learning_rate": 7.051629518959614e-05,
+      "loss": 0.5445,
+      "step": 3700
+    },
+    {
+      "epoch": 6.360515021459228,
+      "grad_norm": 0.36490681767463684,
+      "learning_rate": 7.022987405527997e-05,
+      "loss": 0.5379,
+      "step": 3705
+    },
+    {
+      "epoch": 6.369098712446352,
+      "grad_norm": 0.37636715173721313,
+      "learning_rate": 6.994372064607788e-05,
+      "loss": 0.5416,
+      "step": 3710
+    },
+    {
+      "epoch": 6.377682403433476,
+      "grad_norm": 0.3601493239402771,
+      "learning_rate": 6.96578375353903e-05,
+      "loss": 0.544,
+      "step": 3715
+    },
+    {
+      "epoch": 6.386266094420601,
+      "grad_norm": 0.3615502715110779,
+      "learning_rate": 6.93722272941869e-05,
+      "loss": 0.5368,
+      "step": 3720
+    },
+    {
+      "epoch": 6.394849785407725,
+      "grad_norm": 0.3495427668094635,
+      "learning_rate": 6.908689249098321e-05,
+      "loss": 0.5414,
+      "step": 3725
+    },
+    {
+      "epoch": 6.4034334763948495,
+      "grad_norm": 0.36156705021858215,
+      "learning_rate": 6.880183569181795e-05,
+      "loss": 0.5379,
+      "step": 3730
+    },
+    {
+      "epoch": 6.412017167381975,
+      "grad_norm": 0.38569867610931396,
+      "learning_rate": 6.85170594602296e-05,
+      "loss": 0.5445,
+      "step": 3735
+    },
+    {
+      "epoch": 6.420600858369099,
+      "grad_norm": 0.37031883001327515,
+      "learning_rate": 6.823256635723343e-05,
+      "loss": 0.55,
+      "step": 3740
+    },
+    {
+      "epoch": 6.429184549356223,
+      "grad_norm": 0.33991584181785583,
+      "learning_rate": 6.794835894129865e-05,
+      "loss": 0.5427,
+      "step": 3745
+    },
+    {
+      "epoch": 6.437768240343348,
+      "grad_norm": 0.35758277773857117,
+      "learning_rate": 6.766443976832517e-05,
+      "loss": 0.5393,
+      "step": 3750
+    },
+    {
+      "epoch": 6.446351931330472,
+      "grad_norm": 0.37792807817459106,
+      "learning_rate": 6.738081139162072e-05,
+      "loss": 0.5466,
+      "step": 3755
+    },
+    {
+      "epoch": 6.454935622317596,
+      "grad_norm": 0.3743073046207428,
+      "learning_rate": 6.709747636187789e-05,
+      "loss": 0.539,
+      "step": 3760
+    },
+    {
+      "epoch": 6.463519313304721,
+      "grad_norm": 0.397246390581131,
+      "learning_rate": 6.68144372271512e-05,
+      "loss": 0.5417,
+      "step": 3765
+    },
+    {
+      "epoch": 6.472103004291846,
+      "grad_norm": 0.3632808029651642,
+      "learning_rate": 6.653169653283406e-05,
+      "loss": 0.5403,
+      "step": 3770
+    },
+    {
+      "epoch": 6.48068669527897,
+      "grad_norm": 0.3583681583404541,
+      "learning_rate": 6.624925682163614e-05,
+      "loss": 0.5462,
+      "step": 3775
+    },
+    {
+      "epoch": 6.489270386266094,
+      "grad_norm": 0.3529057502746582,
+      "learning_rate": 6.59671206335602e-05,
+      "loss": 0.5425,
+      "step": 3780
+    },
+    {
+      "epoch": 6.497854077253219,
+      "grad_norm": 0.3624202311038971,
+      "learning_rate": 6.568529050587953e-05,
+      "loss": 0.5462,
+      "step": 3785
+    },
+    {
+      "epoch": 6.506437768240343,
+      "grad_norm": 0.3762538433074951,
+      "learning_rate": 6.540376897311489e-05,
+      "loss": 0.538,
+      "step": 3790
+    },
+    {
+      "epoch": 6.515021459227468,
+      "grad_norm": 0.37663355469703674,
+      "learning_rate": 6.512255856701177e-05,
+      "loss": 0.5432,
+      "step": 3795
+    },
+    {
+      "epoch": 6.523605150214593,
+      "grad_norm": 0.37055134773254395,
+      "learning_rate": 6.484166181651785e-05,
+      "loss": 0.5405,
+      "step": 3800
+    },
+    {
+      "epoch": 6.532188841201717,
+      "grad_norm": 0.3634713590145111,
+      "learning_rate": 6.456108124775999e-05,
+      "loss": 0.5442,
+      "step": 3805
+    },
+    {
+      "epoch": 6.540772532188841,
+      "grad_norm": 0.3575882017612457,
+      "learning_rate": 6.428081938402149e-05,
+      "loss": 0.5395,
+      "step": 3810
+    },
+    {
+      "epoch": 6.549356223175966,
+      "grad_norm": 0.3856394290924072,
+      "learning_rate": 6.400087874571973e-05,
+      "loss": 0.5417,
+      "step": 3815
+    },
+    {
+      "epoch": 6.55793991416309,
+      "grad_norm": 0.3542211651802063,
+      "learning_rate": 6.372126185038313e-05,
+      "loss": 0.5337,
+      "step": 3820
+    },
+    {
+      "epoch": 6.5665236051502145,
+      "grad_norm": 0.3997708559036255,
+      "learning_rate": 6.344197121262868e-05,
+      "loss": 0.5431,
+      "step": 3825
+    },
+    {
+      "epoch": 6.575107296137339,
+      "grad_norm": 0.3704608082771301,
+      "learning_rate": 6.316300934413935e-05,
+      "loss": 0.5356,
+      "step": 3830
+    },
+    {
+      "epoch": 6.583690987124464,
+      "grad_norm": 0.3824236989021301,
+      "learning_rate": 6.288437875364141e-05,
+      "loss": 0.5406,
+      "step": 3835
+    },
+    {
+      "epoch": 6.592274678111588,
+      "grad_norm": 0.3561914563179016,
+      "learning_rate": 6.260608194688206e-05,
+      "loss": 0.5405,
+      "step": 3840
+    },
+    {
+      "epoch": 6.600858369098712,
+      "grad_norm": 0.3756065368652344,
+      "learning_rate": 6.232812142660658e-05,
+      "loss": 0.5365,
+      "step": 3845
+    },
+    {
+      "epoch": 6.609442060085837,
+      "grad_norm": 0.3645598292350769,
+      "learning_rate": 6.205049969253605e-05,
+      "loss": 0.5358,
+      "step": 3850
+    },
+    {
+      "epoch": 6.618025751072961,
+      "grad_norm": 0.37949660420417786,
+      "learning_rate": 6.17732192413449e-05,
+      "loss": 0.5452,
+      "step": 3855
+    },
+    {
+      "epoch": 6.626609442060086,
+      "grad_norm": 0.36608970165252686,
+      "learning_rate": 6.149628256663827e-05,
+      "loss": 0.545,
+      "step": 3860
+    },
+    {
+      "epoch": 6.63519313304721,
+      "grad_norm": 0.3485977351665497,
+      "learning_rate": 6.121969215892972e-05,
+      "loss": 0.5445,
+      "step": 3865
+    },
+    {
+      "epoch": 6.643776824034335,
+      "grad_norm": 0.38575315475463867,
+      "learning_rate": 6.0943450505618917e-05,
+      "loss": 0.5448,
+      "step": 3870
+    },
+    {
+      "epoch": 6.652360515021459,
+      "grad_norm": 0.3650740087032318,
+      "learning_rate": 6.066756009096896e-05,
+      "loss": 0.5489,
+      "step": 3875
+    },
+    {
+      "epoch": 6.660944206008583,
+      "grad_norm": 0.3552764058113098,
+      "learning_rate": 6.039202339608432e-05,
+      "loss": 0.5461,
+      "step": 3880
+    },
+    {
+      "epoch": 6.669527896995708,
+      "grad_norm": 0.3777913749217987,
+      "learning_rate": 6.01168428988885e-05,
+      "loss": 0.5437,
+      "step": 3885
+    },
+    {
+      "epoch": 6.6781115879828326,
+      "grad_norm": 0.37254467606544495,
+      "learning_rate": 5.9842021074101605e-05,
+      "loss": 0.5351,
+      "step": 3890
+    },
+    {
+      "epoch": 6.686695278969957,
+      "grad_norm": 0.36322537064552307,
+      "learning_rate": 5.956756039321825e-05,
+      "loss": 0.5503,
+      "step": 3895
+    },
+    {
+      "epoch": 6.695278969957082,
+      "grad_norm": 0.3798597753047943,
+      "learning_rate": 5.929346332448511e-05,
+      "loss": 0.5336,
+      "step": 3900
+    },
+    {
+      "epoch": 6.703862660944206,
+      "grad_norm": 0.3622066080570221,
+      "learning_rate": 5.901973233287901e-05,
+      "loss": 0.5472,
+      "step": 3905
+    },
+    {
+      "epoch": 6.71244635193133,
+      "grad_norm": 0.37123680114746094,
+      "learning_rate": 5.874636988008457e-05,
+      "loss": 0.5376,
+      "step": 3910
+    },
+    {
+      "epoch": 6.721030042918455,
+      "grad_norm": 0.3789604902267456,
+      "learning_rate": 5.847337842447209e-05,
+      "loss": 0.5472,
+      "step": 3915
+    },
+    {
+      "epoch": 6.7296137339055795,
+      "grad_norm": 0.3710649311542511,
+      "learning_rate": 5.820076042107545e-05,
+      "loss": 0.5459,
+      "step": 3920
+    },
+    {
+      "epoch": 6.738197424892704,
+      "grad_norm": 0.41028717160224915,
+      "learning_rate": 5.792851832157014e-05,
+      "loss": 0.5415,
+      "step": 3925
+    },
+    {
+      "epoch": 6.746781115879829,
+      "grad_norm": 0.3710199296474457,
+      "learning_rate": 5.765665457425102e-05,
+      "loss": 0.5376,
+      "step": 3930
+    },
+    {
+      "epoch": 6.755364806866953,
+      "grad_norm": 0.37828171253204346,
+      "learning_rate": 5.7385171624010346e-05,
+      "loss": 0.5474,
+      "step": 3935
+    },
+    {
+      "epoch": 6.763948497854077,
+      "grad_norm": 0.35286852717399597,
+      "learning_rate": 5.711407191231602e-05,
+      "loss": 0.5435,
+      "step": 3940
+    },
+    {
+      "epoch": 6.772532188841201,
+      "grad_norm": 0.39667871594429016,
+      "learning_rate": 5.684335787718932e-05,
+      "loss": 0.5471,
+      "step": 3945
+    },
+    {
+      "epoch": 6.781115879828326,
+      "grad_norm": 0.3569738268852234,
+      "learning_rate": 5.657303195318311e-05,
+      "loss": 0.5362,
+      "step": 3950
+    },
+    {
+      "epoch": 6.789699570815451,
+      "grad_norm": 0.3528185784816742,
+      "learning_rate": 5.630309657135997e-05,
+      "loss": 0.5383,
+      "step": 3955
+    },
+    {
+      "epoch": 6.798283261802575,
+      "grad_norm": 0.3892223834991455,
+      "learning_rate": 5.6033554159270294e-05,
+      "loss": 0.5446,
+      "step": 3960
+    },
+    {
+      "epoch": 6.8068669527897,
+      "grad_norm": 0.3695877492427826,
+      "learning_rate": 5.576440714093046e-05,
+      "loss": 0.5488,
+      "step": 3965
+    },
+    {
+      "epoch": 6.815450643776824,
+      "grad_norm": 0.3762911856174469,
+      "learning_rate": 5.549565793680105e-05,
+      "loss": 0.5398,
+      "step": 3970
+    },
+    {
+      "epoch": 6.824034334763948,
+      "grad_norm": 0.36472398042678833,
+      "learning_rate": 5.522730896376506e-05,
+      "loss": 0.5457,
+      "step": 3975
+    },
+    {
+      "epoch": 6.8326180257510725,
+      "grad_norm": 0.37586814165115356,
+      "learning_rate": 5.495936263510617e-05,
+      "loss": 0.5396,
+      "step": 3980
+    },
+    {
+      "epoch": 6.8412017167381975,
+      "grad_norm": 0.37699511647224426,
+      "learning_rate": 5.4691821360487086e-05,
+      "loss": 0.5394,
+      "step": 3985
+    },
+    {
+      "epoch": 6.849785407725322,
+      "grad_norm": 0.3839593529701233,
+      "learning_rate": 5.4424687545927776e-05,
+      "loss": 0.5429,
+      "step": 3990
+    },
+    {
+      "epoch": 6.858369098712446,
+      "grad_norm": 0.3746870756149292,
+      "learning_rate": 5.415796359378393e-05,
+      "loss": 0.538,
+      "step": 3995
+    },
+    {
+      "epoch": 6.866952789699571,
+      "grad_norm": 0.36862820386886597,
+      "learning_rate": 5.389165190272527e-05,
+      "loss": 0.5433,
+      "step": 4000
+    },
+    {
+      "epoch": 6.875536480686695,
+      "grad_norm": 0.3722948729991913,
+      "learning_rate": 5.362575486771414e-05,
+      "loss": 0.5533,
+      "step": 4005
+    },
+    {
+      "epoch": 6.884120171673819,
+      "grad_norm": 0.3654981255531311,
+      "learning_rate": 5.3360274879983654e-05,
+      "loss": 0.5345,
+      "step": 4010
+    },
+    {
+      "epoch": 6.8927038626609445,
+      "grad_norm": 0.35018405318260193,
+      "learning_rate": 5.3095214327016474e-05,
+      "loss": 0.5484,
+      "step": 4015
+    },
+    {
+      "epoch": 6.901287553648069,
+      "grad_norm": 0.37176111340522766,
+      "learning_rate": 5.283057559252341e-05,
+      "loss": 0.5422,
+      "step": 4020
+    },
+    {
+      "epoch": 6.909871244635193,
+      "grad_norm": 0.37949976325035095,
+      "learning_rate": 5.256636105642154e-05,
+      "loss": 0.5501,
+      "step": 4025
+    },
+    {
+      "epoch": 6.918454935622318,
+      "grad_norm": 0.35853028297424316,
+      "learning_rate": 5.2302573094813266e-05,
+      "loss": 0.5427,
+      "step": 4030
+    },
+    {
+      "epoch": 6.927038626609442,
+      "grad_norm": 0.370491623878479,
+      "learning_rate": 5.2039214079964836e-05,
+      "loss": 0.5426,
+      "step": 4035
+    },
+    {
+      "epoch": 6.935622317596566,
+      "grad_norm": 0.3726717233657837,
+      "learning_rate": 5.177628638028472e-05,
+      "loss": 0.5447,
+      "step": 4040
+    },
+    {
+      "epoch": 6.944206008583691,
+      "grad_norm": 0.37951403856277466,
+      "learning_rate": 5.1513792360302696e-05,
+      "loss": 0.5369,
+      "step": 4045
+    },
+    {
+      "epoch": 6.952789699570816,
+      "grad_norm": 0.3583022654056549,
+      "learning_rate": 5.12517343806485e-05,
+      "loss": 0.5586,
+      "step": 4050
+    },
+    {
+      "epoch": 6.96137339055794,
+      "grad_norm": 0.37766262888908386,
+      "learning_rate": 5.099011479803025e-05,
+      "loss": 0.537,
+      "step": 4055
+    },
+    {
+      "epoch": 6.969957081545064,
+      "grad_norm": 0.36432990431785583,
+      "learning_rate": 5.0728935965213834e-05,
+      "loss": 0.5462,
+      "step": 4060
+    },
+    {
+      "epoch": 6.978540772532189,
+      "grad_norm": 0.37999647855758667,
+      "learning_rate": 5.0468200231001286e-05,
+      "loss": 0.5474,
+      "step": 4065
+    },
+    {
+      "epoch": 6.987124463519313,
+      "grad_norm": 0.3890798091888428,
+      "learning_rate": 5.020790994020972e-05,
+      "loss": 0.5395,
+      "step": 4070
+    },
+    {
+      "epoch": 6.9957081545064375,
+      "grad_norm": 0.3609655797481537,
+      "learning_rate": 4.994806743365057e-05,
+      "loss": 0.5466,
+      "step": 4075
+    },
+    {
+      "epoch": 6.9991416309012875,
+      "eval_loss": 3.0554237365722656,
+      "eval_runtime": 0.3942,
+      "eval_samples_per_second": 15.219,
+      "eval_steps_per_second": 2.537,
+      "step": 4077
+    },
+    {
+      "epoch": 7.0042918454935625,
+      "grad_norm": 0.2814909517765045,
+      "learning_rate": 4.96886750481082e-05,
+      "loss": 0.5172,
+      "step": 4080
+    },
+    {
+      "epoch": 7.012875536480687,
+      "grad_norm": 0.39267781376838684,
+      "learning_rate": 4.942973511631889e-05,
+      "loss": 0.4929,
+      "step": 4085
+    },
+    {
+      "epoch": 7.021459227467811,
+      "grad_norm": 0.35972246527671814,
+      "learning_rate": 4.9171249966950175e-05,
+      "loss": 0.4891,
+      "step": 4090
+    },
+    {
+      "epoch": 7.030042918454936,
+      "grad_norm": 0.31733205914497375,
+      "learning_rate": 4.8913221924579554e-05,
+      "loss": 0.4899,
+      "step": 4095
+    },
+    {
+      "epoch": 7.03862660944206,
+      "grad_norm": 0.37763702869415283,
+      "learning_rate": 4.8655653309673776e-05,
+      "loss": 0.4899,
+      "step": 4100
+    },
+    {
+      "epoch": 7.047210300429184,
+      "grad_norm": 0.39104217290878296,
+      "learning_rate": 4.839854643856792e-05,
+      "loss": 0.4843,
+      "step": 4105
+    },
+    {
+      "epoch": 7.055793991416309,
+      "grad_norm": 0.3271881937980652,
+      "learning_rate": 4.814190362344454e-05,
+      "loss": 0.4903,
+      "step": 4110
+    },
+    {
+      "epoch": 7.064377682403434,
+      "grad_norm": 0.35317346453666687,
+      "learning_rate": 4.788572717231293e-05,
+      "loss": 0.4916,
+      "step": 4115
+    },
+    {
+      "epoch": 7.072961373390558,
+      "grad_norm": 0.37199559807777405,
+      "learning_rate": 4.763001938898832e-05,
+      "loss": 0.4865,
+      "step": 4120
+    },
+    {
+      "epoch": 7.081545064377682,
+      "grad_norm": 0.36147797107696533,
+      "learning_rate": 4.7374782573071176e-05,
+      "loss": 0.4884,
+      "step": 4125
+    },
+    {
+      "epoch": 7.090128755364807,
+      "grad_norm": 0.3491626977920532,
+      "learning_rate": 4.712001901992652e-05,
+      "loss": 0.4926,
+      "step": 4130
+    },
+    {
+      "epoch": 7.098712446351931,
+      "grad_norm": 0.36010846495628357,
+      "learning_rate": 4.686573102066326e-05,
+      "loss": 0.4942,
+      "step": 4135
+    },
+    {
+      "epoch": 7.1072961373390555,
+      "grad_norm": 0.34614065289497375,
+      "learning_rate": 4.661192086211366e-05,
+      "loss": 0.4888,
+      "step": 4140
+    },
+    {
+      "epoch": 7.115879828326181,
+      "grad_norm": 0.37029707431793213,
+      "learning_rate": 4.6358590826812664e-05,
+      "loss": 0.493,
+      "step": 4145
+    },
+    {
+      "epoch": 7.124463519313305,
+      "grad_norm": 0.35328662395477295,
+      "learning_rate": 4.610574319297748e-05,
+      "loss": 0.4949,
+      "step": 4150
+    },
+    {
+      "epoch": 7.133047210300429,
+      "grad_norm": 0.33476021885871887,
+      "learning_rate": 4.585338023448702e-05,
+      "loss": 0.488,
+      "step": 4155
+    },
+    {
+      "epoch": 7.141630901287554,
+      "grad_norm": 0.36379748582839966,
+      "learning_rate": 4.560150422086147e-05,
+      "loss": 0.4927,
+      "step": 4160
+    },
+    {
+      "epoch": 7.150214592274678,
+      "grad_norm": 0.3896268308162689,
+      "learning_rate": 4.535011741724184e-05,
+      "loss": 0.4917,
+      "step": 4165
+    },
+    {
+      "epoch": 7.1587982832618025,
+      "grad_norm": 0.3854301869869232,
+      "learning_rate": 4.5099222084369805e-05,
+      "loss": 0.4853,
+      "step": 4170
+    },
+    {
+      "epoch": 7.167381974248927,
+      "grad_norm": 0.36009612679481506,
+      "learning_rate": 4.4848820478566966e-05,
+      "loss": 0.4905,
+      "step": 4175
+    },
+    {
+      "epoch": 7.175965665236052,
+      "grad_norm": 0.3766346573829651,
+      "learning_rate": 4.45989148517149e-05,
+      "loss": 0.4936,
+      "step": 4180
+    },
+    {
+      "epoch": 7.184549356223176,
+      "grad_norm": 0.3666467070579529,
+      "learning_rate": 4.4349507451234894e-05,
+      "loss": 0.487,
+      "step": 4185
+    },
+    {
+      "epoch": 7.1931330472103,
+      "grad_norm": 0.3508441150188446,
+      "learning_rate": 4.410060052006758e-05,
+      "loss": 0.4916,
+      "step": 4190
+    },
+    {
+      "epoch": 7.201716738197425,
+      "grad_norm": 0.3494192957878113,
+      "learning_rate": 4.3852196296652706e-05,
+      "loss": 0.4901,
+      "step": 4195
+    },
+    {
+      "epoch": 7.210300429184549,
+      "grad_norm": 0.35362881422042847,
+      "learning_rate": 4.360429701490934e-05,
+      "loss": 0.4933,
+      "step": 4200
+    },
+    {
+      "epoch": 7.218884120171674,
+      "grad_norm": 0.35061484575271606,
+      "learning_rate": 4.335690490421548e-05,
+      "loss": 0.4883,
+      "step": 4205
+    },
+    {
+      "epoch": 7.227467811158799,
+      "grad_norm": 0.3576537072658539,
+      "learning_rate": 4.311002218938798e-05,
+      "loss": 0.4896,
+      "step": 4210
+    },
+    {
+      "epoch": 7.236051502145923,
+      "grad_norm": 0.35517919063568115,
+      "learning_rate": 4.286365109066285e-05,
+      "loss": 0.4873,
+      "step": 4215
+    },
+    {
+      "epoch": 7.244635193133047,
+      "grad_norm": 0.3709685206413269,
+      "learning_rate": 4.261779382367499e-05,
+      "loss": 0.495,
+      "step": 4220
+    },
+    {
+      "epoch": 7.253218884120171,
+      "grad_norm": 0.39842909574508667,
+      "learning_rate": 4.237245259943837e-05,
+      "loss": 0.4957,
+      "step": 4225
+    },
+    {
+      "epoch": 7.261802575107296,
+      "grad_norm": 0.3722572922706604,
+      "learning_rate": 4.212762962432619e-05,
+      "loss": 0.4978,
+      "step": 4230
+    },
+    {
+      "epoch": 7.2703862660944205,
+      "grad_norm": 0.3434411287307739,
+      "learning_rate": 4.188332710005094e-05,
+      "loss": 0.4925,
+      "step": 4235
+    },
+    {
+      "epoch": 7.278969957081545,
+      "grad_norm": 0.3870338797569275,
+      "learning_rate": 4.1639547223644706e-05,
+      "loss": 0.4802,
+      "step": 4240
+    },
+    {
+      "epoch": 7.28755364806867,
+      "grad_norm": 0.3743104040622711,
+      "learning_rate": 4.139629218743931e-05,
+      "loss": 0.4847,
+      "step": 4245
+    },
+    {
+      "epoch": 7.296137339055794,
+      "grad_norm": 0.3608282506465912,
+      "learning_rate": 4.11535641790467e-05,
+      "loss": 0.486,
+      "step": 4250
+    },
+    {
+      "epoch": 7.304721030042918,
+      "grad_norm": 0.3679661452770233,
+      "learning_rate": 4.091136538133916e-05,
+      "loss": 0.4942,
+      "step": 4255
+    },
+    {
+      "epoch": 7.313304721030043,
+      "grad_norm": 0.3837164044380188,
+      "learning_rate": 4.06696979724298e-05,
+      "loss": 0.4881,
+      "step": 4260
+    },
+    {
+      "epoch": 7.3218884120171674,
+      "grad_norm": 0.37015727162361145,
+      "learning_rate": 4.042856412565287e-05,
+      "loss": 0.4875,
+      "step": 4265
+    },
+    {
+      "epoch": 7.330472103004292,
+      "grad_norm": 0.3824974596500397,
+      "learning_rate": 4.0187966009544255e-05,
+      "loss": 0.4895,
+      "step": 4270
+    },
+    {
+      "epoch": 7.339055793991417,
+      "grad_norm": 0.356283038854599,
+      "learning_rate": 3.994790578782198e-05,
+      "loss": 0.4961,
+      "step": 4275
+    },
+    {
+      "epoch": 7.347639484978541,
+      "grad_norm": 0.3656464219093323,
+      "learning_rate": 3.970838561936675e-05,
+      "loss": 0.5015,
+      "step": 4280
+    },
+    {
+      "epoch": 7.356223175965665,
+      "grad_norm": 0.3856269419193268,
+      "learning_rate": 3.9469407658202514e-05,
+      "loss": 0.4941,
+      "step": 4285
+    },
+    {
+      "epoch": 7.364806866952789,
+      "grad_norm": 0.4023449122905731,
+      "learning_rate": 3.9230974053477086e-05,
+      "loss": 0.4943,
+      "step": 4290
+    },
+    {
+      "epoch": 7.373390557939914,
+      "grad_norm": 0.34391605854034424,
+      "learning_rate": 3.899308694944298e-05,
+      "loss": 0.5006,
+      "step": 4295
+    },
+    {
+      "epoch": 7.381974248927039,
+      "grad_norm": 0.3965080976486206,
+      "learning_rate": 3.875574848543774e-05,
+      "loss": 0.4925,
+      "step": 4300
+    },
+    {
+      "epoch": 7.390557939914163,
+      "grad_norm": 0.37056249380111694,
+      "learning_rate": 3.85189607958651e-05,
+      "loss": 0.5052,
+      "step": 4305
+    },
+    {
+      "epoch": 7.399141630901288,
+      "grad_norm": 0.3915135860443115,
+      "learning_rate": 3.8282726010175715e-05,
+      "loss": 0.4885,
+      "step": 4310
+    },
+    {
+      "epoch": 7.407725321888412,
+      "grad_norm": 0.3784487247467041,
+      "learning_rate": 3.804704625284774e-05,
+      "loss": 0.4902,
+      "step": 4315
+    },
+    {
+      "epoch": 7.416309012875536,
+      "grad_norm": 0.36971473693847656,
+      "learning_rate": 3.7811923643367974e-05,
+      "loss": 0.4971,
+      "step": 4320
+    },
+    {
+      "epoch": 7.424892703862661,
+      "grad_norm": 0.36764466762542725,
+      "learning_rate": 3.757736029621292e-05,
+      "loss": 0.4873,
+      "step": 4325
+    },
+    {
+      "epoch": 7.4334763948497855,
+      "grad_norm": 0.3773200809955597,
+      "learning_rate": 3.734335832082927e-05,
+      "loss": 0.5019,
+      "step": 4330
+    },
+    {
+      "epoch": 7.44206008583691,
+      "grad_norm": 0.34619271755218506,
+      "learning_rate": 3.710991982161555e-05,
+      "loss": 0.4919,
+      "step": 4335
+    },
+    {
+      "epoch": 7.450643776824034,
+      "grad_norm": 0.33658042550086975,
+      "learning_rate": 3.687704689790277e-05,
+      "loss": 0.4883,
+      "step": 4340
+    },
+    {
+      "epoch": 7.459227467811159,
+      "grad_norm": 0.36298757791519165,
+      "learning_rate": 3.66447416439356e-05,
+      "loss": 0.5003,
+      "step": 4345
+    },
+    {
+      "epoch": 7.467811158798283,
+      "grad_norm": 0.35422852635383606,
+      "learning_rate": 3.641300614885378e-05,
+      "loss": 0.4923,
+      "step": 4350
+    },
+    {
+      "epoch": 7.476394849785407,
+      "grad_norm": 0.3848954737186432,
+      "learning_rate": 3.618184249667308e-05,
+      "loss": 0.4977,
+      "step": 4355
+    },
+    {
+      "epoch": 7.484978540772532,
+      "grad_norm": 0.3532540798187256,
+      "learning_rate": 3.595125276626653e-05,
+      "loss": 0.5032,
+      "step": 4360
+    },
+    {
+      "epoch": 7.493562231759657,
+      "grad_norm": 0.35986649990081787,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 0.4964,
+      "step": 4365
+    },
+    {
+      "epoch": 7.502145922746781,
+      "grad_norm": 0.3666352927684784,
+      "learning_rate": 3.549180336044352e-05,
+      "loss": 0.4992,
+      "step": 4370
+    },
+    {
+      "epoch": 7.510729613733906,
+      "grad_norm": 0.3380297124385834,
+      "learning_rate": 3.526294781689206e-05,
+      "loss": 0.4817,
+      "step": 4375
+    },
+    {
+      "epoch": 7.51931330472103,
+      "grad_norm": 0.36128494143486023,
+      "learning_rate": 3.503467445880789e-05,
+      "loss": 0.483,
+      "step": 4380
+    },
+    {
+      "epoch": 7.527896995708154,
+      "grad_norm": 0.3538447916507721,
+      "learning_rate": 3.480698533907152e-05,
+      "loss": 0.4921,
+      "step": 4385
+    },
+    {
+      "epoch": 7.536480686695279,
+      "grad_norm": 0.36427024006843567,
+      "learning_rate": 3.457988250530931e-05,
+      "loss": 0.4993,
+      "step": 4390
+    },
+    {
+      "epoch": 7.545064377682404,
+      "grad_norm": 0.37009111046791077,
+      "learning_rate": 3.435336799987514e-05,
+      "loss": 0.4961,
+      "step": 4395
+    },
+    {
+      "epoch": 7.553648068669528,
+      "grad_norm": 0.36786022782325745,
+      "learning_rate": 3.412744385983201e-05,
+      "loss": 0.4894,
+      "step": 4400
+    },
+    {
+      "epoch": 7.562231759656653,
+      "grad_norm": 0.3731597363948822,
+      "learning_rate": 3.390211211693369e-05,
+      "loss": 0.4916,
+      "step": 4405
+    },
+    {
+      "epoch": 7.570815450643777,
+      "grad_norm": 0.35336822271347046,
+      "learning_rate": 3.367737479760652e-05,
+      "loss": 0.4891,
+      "step": 4410
+    },
+    {
+      "epoch": 7.579399141630901,
+      "grad_norm": 0.35434237122535706,
+      "learning_rate": 3.3453233922931094e-05,
+      "loss": 0.4937,
+      "step": 4415
+    },
+    {
+      "epoch": 7.587982832618025,
+      "grad_norm": 0.3567320704460144,
+      "learning_rate": 3.322969150862416e-05,
+      "loss": 0.4979,
+      "step": 4420
+    },
+    {
+      "epoch": 7.5965665236051505,
+      "grad_norm": 0.3649292588233948,
+      "learning_rate": 3.300674956502047e-05,
+      "loss": 0.4925,
+      "step": 4425
+    },
+    {
+      "epoch": 7.605150214592275,
+      "grad_norm": 0.39201802015304565,
+      "learning_rate": 3.2784410097054666e-05,
+      "loss": 0.4866,
+      "step": 4430
+    },
+    {
+      "epoch": 7.613733905579399,
+      "grad_norm": 0.3701328933238983,
+      "learning_rate": 3.25626751042433e-05,
+      "loss": 0.4876,
+      "step": 4435
+    },
+    {
+      "epoch": 7.622317596566524,
+      "grad_norm": 0.3631632328033447,
+      "learning_rate": 3.2341546580666796e-05,
+      "loss": 0.4944,
+      "step": 4440
+    },
+    {
+      "epoch": 7.630901287553648,
+      "grad_norm": 0.3858960270881653,
+      "learning_rate": 3.212102651495167e-05,
+      "loss": 0.4971,
+      "step": 4445
+    },
+    {
+      "epoch": 7.639484978540772,
+      "grad_norm": 0.37257277965545654,
+      "learning_rate": 3.1901116890252345e-05,
+      "loss": 0.4971,
+      "step": 4450
+    },
+    {
+      "epoch": 7.6480686695278965,
+      "grad_norm": 0.37199750542640686,
+      "learning_rate": 3.1681819684233605e-05,
+      "loss": 0.4989,
+      "step": 4455
+    },
+    {
+      "epoch": 7.656652360515022,
+      "grad_norm": 0.3896372318267822,
+      "learning_rate": 3.146313686905279e-05,
+      "loss": 0.4939,
+      "step": 4460
+    },
+    {
+      "epoch": 7.665236051502146,
+      "grad_norm": 0.3580029010772705,
+      "learning_rate": 3.124507041134177e-05,
+      "loss": 0.4945,
+      "step": 4465
+    },
+    {
+      "epoch": 7.67381974248927,
+      "grad_norm": 0.36499252915382385,
+      "learning_rate": 3.102762227218957e-05,
+      "loss": 0.4912,
+      "step": 4470
+    },
+    {
+      "epoch": 7.682403433476395,
+      "grad_norm": 0.3598448634147644,
+      "learning_rate": 3.081079440712473e-05,
+      "loss": 0.4994,
+      "step": 4475
+    },
+    {
+      "epoch": 7.690987124463519,
+      "grad_norm": 0.3923290967941284,
+      "learning_rate": 3.059458876609742e-05,
+      "loss": 0.4894,
+      "step": 4480
+    },
+    {
+      "epoch": 7.6995708154506435,
+      "grad_norm": 0.3650890588760376,
+      "learning_rate": 3.0379007293462192e-05,
+      "loss": 0.4905,
+      "step": 4485
+    },
+    {
+      "epoch": 7.708154506437769,
+      "grad_norm": 0.38070616126060486,
+      "learning_rate": 3.0164051927960492e-05,
+      "loss": 0.4996,
+      "step": 4490
+    },
+    {
+      "epoch": 7.716738197424893,
+      "grad_norm": 0.3461267054080963,
+      "learning_rate": 2.994972460270291e-05,
+      "loss": 0.4939,
+      "step": 4495
+    },
+    {
+      "epoch": 7.725321888412017,
+      "grad_norm": 0.36452245712280273,
+      "learning_rate": 2.9736027245152275e-05,
+      "loss": 0.5021,
+      "step": 4500
+    },
+    {
+      "epoch": 7.733905579399142,
+      "grad_norm": 0.4071807861328125,
+      "learning_rate": 2.9522961777105897e-05,
+      "loss": 0.5019,
+      "step": 4505
+    },
+    {
+      "epoch": 7.742489270386266,
+      "grad_norm": 0.36440128087997437,
+      "learning_rate": 2.9310530114678502e-05,
+      "loss": 0.5024,
+      "step": 4510
+    },
+    {
+      "epoch": 7.75107296137339,
+      "grad_norm": 0.3590448796749115,
+      "learning_rate": 2.9098734168284968e-05,
+      "loss": 0.4874,
+      "step": 4515
+    },
+    {
+      "epoch": 7.7596566523605155,
+      "grad_norm": 0.3638148903846741,
+      "learning_rate": 2.8887575842623093e-05,
+      "loss": 0.483,
+      "step": 4520
+    },
+    {
+      "epoch": 7.76824034334764,
+      "grad_norm": 0.36555618047714233,
+      "learning_rate": 2.867705703665654e-05,
+      "loss": 0.4917,
+      "step": 4525
+    },
+    {
+      "epoch": 7.776824034334764,
+      "grad_norm": 0.3763795793056488,
+      "learning_rate": 2.8467179643597697e-05,
+      "loss": 0.4886,
+      "step": 4530
+    },
+    {
+      "epoch": 7.785407725321888,
+      "grad_norm": 0.3643328845500946,
+      "learning_rate": 2.8257945550890665e-05,
+      "loss": 0.4981,
+      "step": 4535
+    },
+    {
+      "epoch": 7.793991416309013,
+      "grad_norm": 0.3772119879722595,
+      "learning_rate": 2.8049356640194314e-05,
+      "loss": 0.4868,
+      "step": 4540
+    },
+    {
+      "epoch": 7.802575107296137,
+      "grad_norm": 0.3641767203807831,
+      "learning_rate": 2.784141478736534e-05,
+      "loss": 0.4928,
+      "step": 4545
+    },
+    {
+      "epoch": 7.8111587982832615,
+      "grad_norm": 0.3673217296600342,
+      "learning_rate": 2.7634121862441386e-05,
+      "loss": 0.4922,
+      "step": 4550
+    },
+    {
+      "epoch": 7.819742489270387,
+      "grad_norm": 0.3594400882720947,
+      "learning_rate": 2.742747972962424e-05,
+      "loss": 0.5024,
+      "step": 4555
+    },
+    {
+      "epoch": 7.828326180257511,
+      "grad_norm": 0.36666861176490784,
+      "learning_rate": 2.722149024726307e-05,
+      "loss": 0.5001,
+      "step": 4560
+    },
+    {
+      "epoch": 7.836909871244635,
+      "grad_norm": 0.3865159749984741,
+      "learning_rate": 2.7016155267837684e-05,
+      "loss": 0.4909,
+      "step": 4565
+    },
+    {
+      "epoch": 7.845493562231759,
+      "grad_norm": 0.3859226107597351,
+      "learning_rate": 2.6811476637941922e-05,
+      "loss": 0.4917,
+      "step": 4570
+    },
+    {
+      "epoch": 7.854077253218884,
+      "grad_norm": 0.37502434849739075,
+      "learning_rate": 2.660745619826701e-05,
+      "loss": 0.4934,
+      "step": 4575
+    },
+    {
+      "epoch": 7.8626609442060085,
+      "grad_norm": 0.3713277280330658,
+      "learning_rate": 2.6404095783585002e-05,
+      "loss": 0.5048,
+      "step": 4580
+    },
+    {
+      "epoch": 7.871244635193133,
+      "grad_norm": 0.39273905754089355,
+      "learning_rate": 2.6201397222732316e-05,
+      "loss": 0.4937,
+      "step": 4585
+    },
+    {
+      "epoch": 7.879828326180258,
+      "grad_norm": 0.377205491065979,
+      "learning_rate": 2.599936233859326e-05,
+      "loss": 0.4989,
+      "step": 4590
+    },
+    {
+      "epoch": 7.888412017167382,
+      "grad_norm": 0.3574148714542389,
+      "learning_rate": 2.5797992948083592e-05,
+      "loss": 0.492,
+      "step": 4595
+    },
+    {
+      "epoch": 7.896995708154506,
+      "grad_norm": 0.3615160286426544,
+      "learning_rate": 2.5597290862134405e-05,
+      "loss": 0.4859,
+      "step": 4600
+    },
+    {
+      "epoch": 7.905579399141631,
+      "grad_norm": 0.37071695923805237,
+      "learning_rate": 2.5397257885675397e-05,
+      "loss": 0.4884,
+      "step": 4605
+    },
+    {
+      "epoch": 7.914163090128755,
+      "grad_norm": 0.36150577664375305,
+      "learning_rate": 2.5197895817619153e-05,
+      "loss": 0.4903,
+      "step": 4610
+    },
+    {
+      "epoch": 7.92274678111588,
+      "grad_norm": 0.3787161409854889,
+      "learning_rate": 2.499920645084465e-05,
+      "loss": 0.498,
+      "step": 4615
+    },
+    {
+      "epoch": 7.931330472103005,
+      "grad_norm": 0.36254122853279114,
+      "learning_rate": 2.480119157218108e-05,
+      "loss": 0.4968,
+      "step": 4620
+    },
+    {
+      "epoch": 7.939914163090129,
+      "grad_norm": 0.3832210302352905,
+      "learning_rate": 2.4603852962392125e-05,
+      "loss": 0.4936,
+      "step": 4625
+    },
+    {
+      "epoch": 7.948497854077253,
+      "grad_norm": 0.39253130555152893,
+      "learning_rate": 2.4407192396159627e-05,
+      "loss": 0.4941,
+      "step": 4630
+    },
+    {
+      "epoch": 7.957081545064378,
+      "grad_norm": 0.3705868422985077,
+      "learning_rate": 2.4211211642067623e-05,
+      "loss": 0.4864,
+      "step": 4635
+    },
+    {
+      "epoch": 7.965665236051502,
+      "grad_norm": 0.38986867666244507,
+      "learning_rate": 2.401591246258673e-05,
+      "loss": 0.4971,
+      "step": 4640
+    },
+    {
+      "epoch": 7.9742489270386265,
+      "grad_norm": 0.3880539536476135,
+      "learning_rate": 2.3821296614058054e-05,
+      "loss": 0.4966,
+      "step": 4645
+    },
+    {
+      "epoch": 7.982832618025751,
+      "grad_norm": 0.3790036141872406,
+      "learning_rate": 2.3627365846677306e-05,
+      "loss": 0.5004,
+      "step": 4650
+    },
+    {
+      "epoch": 7.991416309012876,
+      "grad_norm": 0.3554070293903351,
+      "learning_rate": 2.3434121904479434e-05,
+      "loss": 0.4865,
+      "step": 4655
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 0.3582840859889984,
+      "learning_rate": 2.3241566525322554e-05,
+      "loss": 0.5038,
+      "step": 4660
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 3.4052770137786865,
+      "eval_runtime": 0.394,
+      "eval_samples_per_second": 15.23,
+      "eval_steps_per_second": 2.538,
+      "step": 4660
+    },
+    {
+      "epoch": 8.008583690987125,
+      "grad_norm": 0.29518163204193115,
+      "learning_rate": 2.304970144087255e-05,
+      "loss": 0.4553,
+      "step": 4665
+    },
+    {
+      "epoch": 8.017167381974248,
+      "grad_norm": 0.3456011414527893,
+      "learning_rate": 2.2858528376587407e-05,
+      "loss": 0.4638,
+      "step": 4670
+    },
+    {
+      "epoch": 8.025751072961373,
+      "grad_norm": 0.3549324572086334,
+      "learning_rate": 2.2668049051701713e-05,
+      "loss": 0.461,
+      "step": 4675
+    },
+    {
+      "epoch": 8.034334763948499,
+      "grad_norm": 0.31662818789482117,
+      "learning_rate": 2.247826517921121e-05,
+      "loss": 0.4662,
+      "step": 4680
+    },
+    {
+      "epoch": 8.042918454935622,
+      "grad_norm": 0.3052162230014801,
+      "learning_rate": 2.2289178465857397e-05,
+      "loss": 0.4645,
+      "step": 4685
+    },
+    {
+      "epoch": 8.051502145922747,
+      "grad_norm": 0.34132641553878784,
+      "learning_rate": 2.2100790612112133e-05,
+      "loss": 0.461,
+      "step": 4690
+    },
+    {
+      "epoch": 8.060085836909872,
+      "grad_norm": 0.3659987449645996,
+      "learning_rate": 2.19131033121624e-05,
+      "loss": 0.4602,
+      "step": 4695
+    },
+    {
+      "epoch": 8.068669527896995,
+      "grad_norm": 0.3580094575881958,
+      "learning_rate": 2.1726118253895034e-05,
+      "loss": 0.4593,
+      "step": 4700
+    },
+    {
+      "epoch": 8.07725321888412,
+      "grad_norm": 0.32578280568122864,
+      "learning_rate": 2.1539837118881567e-05,
+      "loss": 0.4593,
+      "step": 4705
+    },
+    {
+      "epoch": 8.085836909871245,
+      "grad_norm": 0.3422725200653076,
+      "learning_rate": 2.135426158236309e-05,
+      "loss": 0.4624,
+      "step": 4710
+    },
+    {
+      "epoch": 8.094420600858369,
+      "grad_norm": 0.33877745270729065,
+      "learning_rate": 2.116939331323514e-05,
+      "loss": 0.465,
+      "step": 4715
+    },
+    {
+      "epoch": 8.103004291845494,
+      "grad_norm": 0.3325134813785553,
+      "learning_rate": 2.098523397403288e-05,
+      "loss": 0.4585,
+      "step": 4720
+    },
+    {
+      "epoch": 8.111587982832617,
+      "grad_norm": 0.3371487259864807,
+      "learning_rate": 2.080178522091585e-05,
+      "loss": 0.4631,
+      "step": 4725
+    },
+    {
+      "epoch": 8.120171673819742,
+      "grad_norm": 0.3465471565723419,
+      "learning_rate": 2.0619048703653266e-05,
+      "loss": 0.469,
+      "step": 4730
+    },
+    {
+      "epoch": 8.128755364806867,
+      "grad_norm": 0.3354833424091339,
+      "learning_rate": 2.04370260656093e-05,
+      "loss": 0.4656,
+      "step": 4735
+    },
+    {
+      "epoch": 8.13733905579399,
+      "grad_norm": 0.335443913936615,
+      "learning_rate": 2.025571894372794e-05,
+      "loss": 0.4591,
+      "step": 4740
+    },
+    {
+      "epoch": 8.145922746781116,
+      "grad_norm": 0.35047757625579834,
+      "learning_rate": 2.0075128968518573e-05,
+      "loss": 0.4656,
+      "step": 4745
+    },
+    {
+      "epoch": 8.15450643776824,
+      "grad_norm": 0.373524934053421,
+      "learning_rate": 1.989525776404132e-05,
+      "loss": 0.4612,
+      "step": 4750
+    },
+    {
+      "epoch": 8.163090128755364,
+      "grad_norm": 0.3468015491962433,
+      "learning_rate": 1.9716106947892164e-05,
+      "loss": 0.4594,
+      "step": 4755
+    },
+    {
+      "epoch": 8.17167381974249,
+      "grad_norm": 0.3522886335849762,
+      "learning_rate": 1.9537678131188674e-05,
+      "loss": 0.4635,
+      "step": 4760
+    },
+    {
+      "epoch": 8.180257510729614,
+      "grad_norm": 0.350538045167923,
+      "learning_rate": 1.9359972918555492e-05,
+      "loss": 0.4615,
+      "step": 4765
+    },
+    {
+      "epoch": 8.188841201716738,
+      "grad_norm": 0.31984084844589233,
+      "learning_rate": 1.9182992908109644e-05,
+      "loss": 0.4623,
+      "step": 4770
+    },
+    {
+      "epoch": 8.197424892703863,
+      "grad_norm": 0.34820571541786194,
+      "learning_rate": 1.900673969144653e-05,
+      "loss": 0.465,
+      "step": 4775
+    },
+    {
+      "epoch": 8.206008583690988,
+      "grad_norm": 0.3526110053062439,
+      "learning_rate": 1.883121485362538e-05,
+      "loss": 0.4608,
+      "step": 4780
+    },
+    {
+      "epoch": 8.214592274678111,
+      "grad_norm": 0.3859311044216156,
+      "learning_rate": 1.865641997315496e-05,
+      "loss": 0.4705,
+      "step": 4785
+    },
+    {
+      "epoch": 8.223175965665236,
+      "grad_norm": 0.3409660756587982,
+      "learning_rate": 1.8482356621979645e-05,
+      "loss": 0.4647,
+      "step": 4790
+    },
+    {
+      "epoch": 8.231759656652361,
+      "grad_norm": 0.34335795044898987,
+      "learning_rate": 1.8309026365464998e-05,
+      "loss": 0.4647,
+      "step": 4795
+    },
+    {
+      "epoch": 8.240343347639485,
+      "grad_norm": 0.33586952090263367,
+      "learning_rate": 1.813643076238375e-05,
+      "loss": 0.4626,
+      "step": 4800
+    },
+    {
+      "epoch": 8.24892703862661,
+      "grad_norm": 0.343476265668869,
+      "learning_rate": 1.7964571364902005e-05,
+      "loss": 0.4604,
+      "step": 4805
+    },
+    {
+      "epoch": 8.257510729613735,
+      "grad_norm": 0.3527016341686249,
+      "learning_rate": 1.779344971856497e-05,
+      "loss": 0.4645,
+      "step": 4810
+    },
+    {
+      "epoch": 8.266094420600858,
+      "grad_norm": 0.34603551030158997,
+      "learning_rate": 1.7623067362283243e-05,
+      "loss": 0.4641,
+      "step": 4815
+    },
+    {
+      "epoch": 8.274678111587983,
+      "grad_norm": 0.3567690849304199,
+      "learning_rate": 1.7453425828318936e-05,
+      "loss": 0.4622,
+      "step": 4820
+    },
+    {
+      "epoch": 8.283261802575108,
+      "grad_norm": 0.3398036062717438,
+      "learning_rate": 1.728452664227187e-05,
+      "loss": 0.457,
+      "step": 4825
+    },
+    {
+      "epoch": 8.291845493562231,
+      "grad_norm": 0.359521746635437,
+      "learning_rate": 1.7116371323065883e-05,
+      "loss": 0.4616,
+      "step": 4830
+    },
+    {
+      "epoch": 8.300429184549357,
+      "grad_norm": 0.3150378465652466,
+      "learning_rate": 1.694896138293516e-05,
+      "loss": 0.4578,
+      "step": 4835
+    },
+    {
+      "epoch": 8.309012875536482,
+      "grad_norm": 0.3591357469558716,
+      "learning_rate": 1.6782298327410616e-05,
+      "loss": 0.4604,
+      "step": 4840
+    },
+    {
+      "epoch": 8.317596566523605,
+      "grad_norm": 0.33606967329978943,
+      "learning_rate": 1.66163836553064e-05,
+      "loss": 0.4521,
+      "step": 4845
+    },
+    {
+      "epoch": 8.32618025751073,
+      "grad_norm": 0.3668070435523987,
+      "learning_rate": 1.6451218858706374e-05,
+      "loss": 0.4737,
+      "step": 4850
+    },
+    {
+      "epoch": 8.334763948497853,
+      "grad_norm": 0.36258599162101746,
+      "learning_rate": 1.628680542295069e-05,
+      "loss": 0.4691,
+      "step": 4855
+    },
+    {
+      "epoch": 8.343347639484978,
+      "grad_norm": 0.3564538061618805,
+      "learning_rate": 1.6123144826622504e-05,
+      "loss": 0.4634,
+      "step": 4860
+    },
+    {
+      "epoch": 8.351931330472103,
+      "grad_norm": 0.36181172728538513,
+      "learning_rate": 1.5960238541534578e-05,
+      "loss": 0.4555,
+      "step": 4865
+    },
+    {
+      "epoch": 8.360515021459227,
+      "grad_norm": 0.36802351474761963,
+      "learning_rate": 1.579808803271612e-05,
+      "loss": 0.4605,
+      "step": 4870
+    },
+    {
+      "epoch": 8.369098712446352,
+      "grad_norm": 0.37159237265586853,
+      "learning_rate": 1.563669475839956e-05,
+      "loss": 0.46,
+      "step": 4875
+    },
+    {
+      "epoch": 8.377682403433477,
+      "grad_norm": 0.36820727586746216,
+      "learning_rate": 1.5476060170007457e-05,
+      "loss": 0.467,
+      "step": 4880
+    },
+    {
+      "epoch": 8.3862660944206,
+      "grad_norm": 0.3330000340938568,
+      "learning_rate": 1.531618571213953e-05,
+      "loss": 0.469,
+      "step": 4885
+    },
+    {
+      "epoch": 8.394849785407725,
+      "grad_norm": 0.38085103034973145,
+      "learning_rate": 1.5157072822559437e-05,
+      "loss": 0.4644,
+      "step": 4890
+    },
+    {
+      "epoch": 8.40343347639485,
+      "grad_norm": 0.35326817631721497,
+      "learning_rate": 1.4998722932182074e-05,
+      "loss": 0.4659,
+      "step": 4895
+    },
+    {
+      "epoch": 8.412017167381974,
+      "grad_norm": 0.3420933187007904,
+      "learning_rate": 1.4841137465060672e-05,
+      "loss": 0.4673,
+      "step": 4900
+    },
+    {
+      "epoch": 8.420600858369099,
+      "grad_norm": 0.3507622480392456,
+      "learning_rate": 1.4684317838373884e-05,
+      "loss": 0.4721,
+      "step": 4905
+    },
+    {
+      "epoch": 8.429184549356224,
+      "grad_norm": 0.35186630487442017,
+      "learning_rate": 1.4528265462413038e-05,
+      "loss": 0.4667,
+      "step": 4910
+    },
+    {
+      "epoch": 8.437768240343347,
+      "grad_norm": 0.3655546009540558,
+      "learning_rate": 1.4372981740569646e-05,
+      "loss": 0.4675,
+      "step": 4915
+    },
+    {
+      "epoch": 8.446351931330472,
+      "grad_norm": 0.3504914343357086,
+      "learning_rate": 1.4218468069322578e-05,
+      "loss": 0.4657,
+      "step": 4920
+    },
+    {
+      "epoch": 8.454935622317597,
+      "grad_norm": 0.3535081446170807,
+      "learning_rate": 1.4064725838225568e-05,
+      "loss": 0.4672,
+      "step": 4925
+    },
+    {
+      "epoch": 8.46351931330472,
+      "grad_norm": 0.38395631313323975,
+      "learning_rate": 1.3911756429894763e-05,
+      "loss": 0.4684,
+      "step": 4930
+    },
+    {
+      "epoch": 8.472103004291846,
+      "grad_norm": 0.3384489417076111,
+      "learning_rate": 1.3759561219996242e-05,
+      "loss": 0.4515,
+      "step": 4935
+    },
+    {
+      "epoch": 8.48068669527897,
+      "grad_norm": 0.3759305477142334,
+      "learning_rate": 1.3608141577233636e-05,
+      "loss": 0.4604,
+      "step": 4940
+    },
+    {
+      "epoch": 8.489270386266094,
+      "grad_norm": 0.3741336464881897,
+      "learning_rate": 1.345749886333586e-05,
+      "loss": 0.4683,
+      "step": 4945
+    },
+    {
+      "epoch": 8.49785407725322,
+      "grad_norm": 0.3483313322067261,
+      "learning_rate": 1.3307634433044846e-05,
+      "loss": 0.4639,
+      "step": 4950
+    },
+    {
+      "epoch": 8.506437768240342,
+      "grad_norm": 0.36218151450157166,
+      "learning_rate": 1.3158549634103357e-05,
+      "loss": 0.466,
+      "step": 4955
+    },
+    {
+      "epoch": 8.515021459227468,
+      "grad_norm": 0.363930344581604,
+      "learning_rate": 1.3010245807242849e-05,
+      "loss": 0.4617,
+      "step": 4960
+    },
+    {
+      "epoch": 8.523605150214593,
+      "grad_norm": 0.35775625705718994,
+      "learning_rate": 1.2862724286171467e-05,
+      "loss": 0.4717,
+      "step": 4965
+    },
+    {
+      "epoch": 8.532188841201716,
+      "grad_norm": 0.3388819098472595,
+      "learning_rate": 1.2715986397561997e-05,
+      "loss": 0.467,
+      "step": 4970
+    },
+    {
+      "epoch": 8.540772532188841,
+      "grad_norm": 0.3473096787929535,
+      "learning_rate": 1.2570033461039954e-05,
+      "loss": 0.4569,
+      "step": 4975
+    },
+    {
+      "epoch": 8.549356223175966,
+      "grad_norm": 0.36242905259132385,
+      "learning_rate": 1.2424866789171729e-05,
+      "loss": 0.4631,
+      "step": 4980
+    },
+    {
+      "epoch": 8.55793991416309,
+      "grad_norm": 0.33919695019721985,
+      "learning_rate": 1.2280487687452768e-05,
+      "loss": 0.4658,
+      "step": 4985
+    },
+    {
+      "epoch": 8.566523605150214,
+      "grad_norm": 0.36114802956581116,
+      "learning_rate": 1.2136897454295837e-05,
+      "loss": 0.4615,
+      "step": 4990
+    },
+    {
+      "epoch": 8.57510729613734,
+      "grad_norm": 0.3717144727706909,
+      "learning_rate": 1.199409738101933e-05,
+      "loss": 0.4604,
+      "step": 4995
+    },
+    {
+      "epoch": 8.583690987124463,
+      "grad_norm": 0.3811343014240265,
+      "learning_rate": 1.1852088751835689e-05,
+      "loss": 0.4623,
+      "step": 5000
+    },
+    {
+      "epoch": 8.592274678111588,
+      "grad_norm": 0.35531142354011536,
+      "learning_rate": 1.1710872843839804e-05,
+      "loss": 0.4609,
+      "step": 5005
+    },
+    {
+      "epoch": 8.600858369098713,
+      "grad_norm": 0.3563953936100006,
+      "learning_rate": 1.1570450926997655e-05,
+      "loss": 0.4699,
+      "step": 5010
+    },
+    {
+      "epoch": 8.609442060085836,
+      "grad_norm": 0.3635469377040863,
+      "learning_rate": 1.1430824264134654e-05,
+      "loss": 0.4632,
+      "step": 5015
+    },
+    {
+      "epoch": 8.618025751072961,
+      "grad_norm": 0.3603283762931824,
+      "learning_rate": 1.1291994110924509e-05,
+      "loss": 0.4671,
+      "step": 5020
+    },
+    {
+      "epoch": 8.626609442060087,
+      "grad_norm": 0.35889148712158203,
+      "learning_rate": 1.1153961715877914e-05,
+      "loss": 0.4586,
+      "step": 5025
+    },
+    {
+      "epoch": 8.63519313304721,
+      "grad_norm": 0.38485071063041687,
+      "learning_rate": 1.1016728320331093e-05,
+      "loss": 0.4698,
+      "step": 5030
+    },
+    {
+      "epoch": 8.643776824034335,
+      "grad_norm": 0.3366287052631378,
+      "learning_rate": 1.0880295158434983e-05,
+      "loss": 0.4598,
+      "step": 5035
+    },
+    {
+      "epoch": 8.65236051502146,
+      "grad_norm": 0.3784838914871216,
+      "learning_rate": 1.0744663457143878e-05,
+      "loss": 0.4637,
+      "step": 5040
+    },
+    {
+      "epoch": 8.660944206008583,
+      "grad_norm": 0.35765987634658813,
+      "learning_rate": 1.0609834436204403e-05,
+      "loss": 0.462,
+      "step": 5045
+    },
+    {
+      "epoch": 8.669527896995708,
+      "grad_norm": 0.37458154559135437,
+      "learning_rate": 1.0475809308144747e-05,
+      "loss": 0.4613,
+      "step": 5050
+    },
+    {
+      "epoch": 8.678111587982833,
+      "grad_norm": 0.374141126871109,
+      "learning_rate": 1.0342589278263559e-05,
+      "loss": 0.4614,
+      "step": 5055
+    },
+    {
+      "epoch": 8.686695278969957,
+      "grad_norm": 0.34101325273513794,
+      "learning_rate": 1.0210175544619116e-05,
+      "loss": 0.4627,
+      "step": 5060
+    },
+    {
+      "epoch": 8.695278969957082,
+      "grad_norm": 0.345047265291214,
+      "learning_rate": 1.0078569298018758e-05,
+      "loss": 0.4708,
+      "step": 5065
+    },
+    {
+      "epoch": 8.703862660944207,
+      "grad_norm": 0.3726472854614258,
+      "learning_rate": 9.947771722007915e-06,
+      "loss": 0.464,
+      "step": 5070
+    },
+    {
+      "epoch": 8.71244635193133,
+      "grad_norm": 0.3675495386123657,
+      "learning_rate": 9.817783992859564e-06,
+      "loss": 0.4633,
+      "step": 5075
+    },
+    {
+      "epoch": 8.721030042918455,
+      "grad_norm": 0.32659244537353516,
+      "learning_rate": 9.688607279563766e-06,
+      "loss": 0.4685,
+      "step": 5080
+    },
+    {
+      "epoch": 8.729613733905579,
+      "grad_norm": 0.3733295798301697,
+      "learning_rate": 9.560242743816972e-06,
+      "loss": 0.4532,
+      "step": 5085
+    },
+    {
+      "epoch": 8.738197424892704,
+      "grad_norm": 0.35878074169158936,
+      "learning_rate": 9.432691540011674e-06,
+      "loss": 0.4678,
+      "step": 5090
+    },
+    {
+      "epoch": 8.746781115879829,
+      "grad_norm": 0.3598923087120056,
+      "learning_rate": 9.305954815226014e-06,
+      "loss": 0.4715,
+      "step": 5095
+    },
+    {
+      "epoch": 8.755364806866952,
+      "grad_norm": 0.34524357318878174,
+      "learning_rate": 9.180033709213454e-06,
+      "loss": 0.463,
+      "step": 5100
+    },
+    {
+      "epoch": 8.763948497854077,
+      "grad_norm": 0.34148141741752625,
+      "learning_rate": 9.054929354392527e-06,
+      "loss": 0.4693,
+      "step": 5105
+    },
+    {
+      "epoch": 8.772532188841202,
+      "grad_norm": 0.35487231612205505,
+      "learning_rate": 8.93064287583667e-06,
+      "loss": 0.4625,
+      "step": 5110
+    },
+    {
+      "epoch": 8.781115879828326,
+      "grad_norm": 0.36163830757141113,
+      "learning_rate": 8.807175391264067e-06,
+      "loss": 0.4619,
+      "step": 5115
+    },
+    {
+      "epoch": 8.78969957081545,
+      "grad_norm": 0.34637895226478577,
+      "learning_rate": 8.684528011027659e-06,
+      "loss": 0.4612,
+      "step": 5120
+    },
+    {
+      "epoch": 8.798283261802576,
+      "grad_norm": 0.3432014584541321,
+      "learning_rate": 8.562701838105115e-06,
+      "loss": 0.4666,
+      "step": 5125
+    },
+    {
+      "epoch": 8.806866952789699,
+      "grad_norm": 0.34569093585014343,
+      "learning_rate": 8.441697968088891e-06,
+      "loss": 0.4659,
+      "step": 5130
+    },
+    {
+      "epoch": 8.815450643776824,
+      "grad_norm": 0.3551480770111084,
+      "learning_rate": 8.321517489176433e-06,
+      "loss": 0.4619,
+      "step": 5135
+    },
+    {
+      "epoch": 8.82403433476395,
+      "grad_norm": 0.35777968168258667,
+      "learning_rate": 8.202161482160353e-06,
+      "loss": 0.4583,
+      "step": 5140
+    },
+    {
+      "epoch": 8.832618025751072,
+      "grad_norm": 0.3783648908138275,
+      "learning_rate": 8.083631020418791e-06,
+      "loss": 0.4596,
+      "step": 5145
+    },
+    {
+      "epoch": 8.841201716738198,
+      "grad_norm": 0.33539873361587524,
+      "learning_rate": 7.965927169905551e-06,
+      "loss": 0.4711,
+      "step": 5150
+    },
+    {
+      "epoch": 8.849785407725323,
+      "grad_norm": 0.36662939190864563,
+      "learning_rate": 7.84905098914076e-06,
+      "loss": 0.4665,
+      "step": 5155
+    },
+    {
+      "epoch": 8.858369098712446,
+      "grad_norm": 0.34115639328956604,
+      "learning_rate": 7.733003529201278e-06,
+      "loss": 0.4581,
+      "step": 5160
+    },
+    {
+      "epoch": 8.866952789699571,
+      "grad_norm": 0.3474951386451721,
+      "learning_rate": 7.617785833711077e-06,
+      "loss": 0.4662,
+      "step": 5165
+    },
+    {
+      "epoch": 8.875536480686696,
+      "grad_norm": 0.34105169773101807,
+      "learning_rate": 7.503398938832107e-06,
+      "loss": 0.4575,
+      "step": 5170
+    },
+    {
+      "epoch": 8.88412017167382,
+      "grad_norm": 0.381610631942749,
+      "learning_rate": 7.389843873254843e-06,
+      "loss": 0.4616,
+      "step": 5175
+    },
+    {
+      "epoch": 8.892703862660944,
+      "grad_norm": 0.3617483079433441,
+      "learning_rate": 7.277121658189001e-06,
+      "loss": 0.4629,
+      "step": 5180
+    },
+    {
+      "epoch": 8.901287553648068,
+      "grad_norm": 0.3416938781738281,
+      "learning_rate": 7.165233307354446e-06,
+      "loss": 0.465,
+      "step": 5185
+    },
+    {
+      "epoch": 8.909871244635193,
+      "grad_norm": 0.35436323285102844,
+      "learning_rate": 7.054179826972074e-06,
+      "loss": 0.4628,
+      "step": 5190
+    },
+    {
+      "epoch": 8.918454935622318,
+      "grad_norm": 0.35174670815467834,
+      "learning_rate": 6.943962215754618e-06,
+      "loss": 0.4704,
+      "step": 5195
+    },
+    {
+      "epoch": 8.927038626609441,
+      "grad_norm": 0.3784787356853485,
+      "learning_rate": 6.834581464897871e-06,
+      "loss": 0.4683,
+      "step": 5200
+    },
+    {
+      "epoch": 8.935622317596566,
+      "grad_norm": 0.34359362721443176,
+      "learning_rate": 6.726038558071656e-06,
+      "loss": 0.4634,
+      "step": 5205
+    },
+    {
+      "epoch": 8.944206008583691,
+      "grad_norm": 0.35282644629478455,
+      "learning_rate": 6.618334471410925e-06,
+      "loss": 0.4608,
+      "step": 5210
+    },
+    {
+      "epoch": 8.952789699570815,
+      "grad_norm": 0.3536522388458252,
+      "learning_rate": 6.511470173507161e-06,
+      "loss": 0.4631,
+      "step": 5215
+    },
+    {
+      "epoch": 8.96137339055794,
+      "grad_norm": 0.34291592240333557,
+      "learning_rate": 6.405446625399481e-06,
+      "loss": 0.4628,
+      "step": 5220
+    },
+    {
+      "epoch": 8.969957081545065,
+      "grad_norm": 0.33180317282676697,
+      "learning_rate": 6.300264780566112e-06,
+      "loss": 0.4615,
+      "step": 5225
+    },
+    {
+      "epoch": 8.978540772532188,
+      "grad_norm": 0.3489115536212921,
+      "learning_rate": 6.195925584915752e-06,
+      "loss": 0.4596,
+      "step": 5230
+    },
+    {
+      "epoch": 8.987124463519313,
+      "grad_norm": 0.34033530950546265,
+      "learning_rate": 6.0924299767791126e-06,
+      "loss": 0.47,
+      "step": 5235
+    },
+    {
+      "epoch": 8.995708154506438,
+      "grad_norm": 0.37230873107910156,
+      "learning_rate": 5.989778886900432e-06,
+      "loss": 0.4624,
+      "step": 5240
+    },
+    {
+      "epoch": 8.999141630901288,
+      "eval_loss": 3.695244073867798,
+      "eval_runtime": 0.3944,
+      "eval_samples_per_second": 15.212,
+      "eval_steps_per_second": 2.535,
+      "step": 5242
+    },
+    {
+      "epoch": 9.004291845493562,
+      "grad_norm": 0.28615859150886536,
+      "learning_rate": 5.887973238429145e-06,
+      "loss": 0.4573,
+      "step": 5245
+    },
+    {
+      "epoch": 9.012875536480687,
+      "grad_norm": 0.3141264319419861,
+      "learning_rate": 5.787013946911546e-06,
+      "loss": 0.4503,
+      "step": 5250
+    },
+    {
+      "epoch": 9.021459227467812,
+      "grad_norm": 0.32362473011016846,
+      "learning_rate": 5.686901920282606e-06,
+      "loss": 0.4558,
+      "step": 5255
+    },
+    {
+      "epoch": 9.030042918454935,
+      "grad_norm": 0.32775941491127014,
+      "learning_rate": 5.587638058857736e-06,
+      "loss": 0.445,
+      "step": 5260
+    },
+    {
+      "epoch": 9.03862660944206,
+      "grad_norm": 0.33696043491363525,
+      "learning_rate": 5.48922325532476e-06,
+      "loss": 0.4521,
+      "step": 5265
+    },
+    {
+      "epoch": 9.047210300429185,
+      "grad_norm": 0.3470819294452667,
+      "learning_rate": 5.391658394735855e-06,
+      "loss": 0.4513,
+      "step": 5270
+    },
+    {
+      "epoch": 9.055793991416309,
+      "grad_norm": 0.3222349286079407,
+      "learning_rate": 5.2949443544995644e-06,
+      "loss": 0.4488,
+      "step": 5275
+    },
+    {
+      "epoch": 9.064377682403434,
+      "grad_norm": 0.33785441517829895,
+      "learning_rate": 5.199082004372957e-06,
+      "loss": 0.4493,
+      "step": 5280
+    },
+    {
+      "epoch": 9.072961373390559,
+      "grad_norm": 0.3577852249145508,
+      "learning_rate": 5.104072206453802e-06,
+      "loss": 0.4615,
+      "step": 5285
+    },
+    {
+      "epoch": 9.081545064377682,
+      "grad_norm": 0.32605546712875366,
+      "learning_rate": 5.009915815172772e-06,
+      "loss": 0.4482,
+      "step": 5290
+    },
+    {
+      "epoch": 9.090128755364807,
+      "grad_norm": 0.320216566324234,
+      "learning_rate": 4.916613677285786e-06,
+      "loss": 0.4518,
+      "step": 5295
+    },
+    {
+      "epoch": 9.098712446351932,
+      "grad_norm": 0.323912650346756,
+      "learning_rate": 4.8241666318664115e-06,
+      "loss": 0.4442,
+      "step": 5300
+    },
+    {
+      "epoch": 9.107296137339056,
+      "grad_norm": 0.342655748128891,
+      "learning_rate": 4.732575510298276e-06,
+      "loss": 0.4437,
+      "step": 5305
+    },
+    {
+      "epoch": 9.11587982832618,
+      "grad_norm": 0.34046629071235657,
+      "learning_rate": 4.641841136267666e-06,
+      "loss": 0.4497,
+      "step": 5310
+    },
+    {
+      "epoch": 9.124463519313304,
+      "grad_norm": 0.3281947374343872,
+      "learning_rate": 4.551964325756031e-06,
+      "loss": 0.4569,
+      "step": 5315
+    },
+    {
+      "epoch": 9.133047210300429,
+      "grad_norm": 0.3604039251804352,
+      "learning_rate": 4.462945887032632e-06,
+      "loss": 0.451,
+      "step": 5320
+    },
+    {
+      "epoch": 9.141630901287554,
+      "grad_norm": 0.3501492738723755,
+      "learning_rate": 4.374786620647442e-06,
+      "loss": 0.448,
+      "step": 5325
+    },
+    {
+      "epoch": 9.150214592274677,
+      "grad_norm": 0.3506092429161072,
+      "learning_rate": 4.287487319423756e-06,
+      "loss": 0.4459,
+      "step": 5330
+    },
+    {
+      "epoch": 9.158798283261802,
+      "grad_norm": 0.3382214307785034,
+      "learning_rate": 4.20104876845111e-06,
+      "loss": 0.452,
+      "step": 5335
+    },
+    {
+      "epoch": 9.167381974248928,
+      "grad_norm": 0.3224546015262604,
+      "learning_rate": 4.115471745078314e-06,
+      "loss": 0.4535,
+      "step": 5340
+    },
+    {
+      "epoch": 9.17596566523605,
+      "grad_norm": 0.3321012854576111,
+      "learning_rate": 4.03075701890635e-06,
+      "loss": 0.4477,
+      "step": 5345
+    },
+    {
+      "epoch": 9.184549356223176,
+      "grad_norm": 0.32435712218284607,
+      "learning_rate": 3.946905351781472e-06,
+      "loss": 0.4494,
+      "step": 5350
+    },
+    {
+      "epoch": 9.193133047210301,
+      "grad_norm": 0.33920931816101074,
+      "learning_rate": 3.863917497788438e-06,
+      "loss": 0.456,
+      "step": 5355
+    },
+    {
+      "epoch": 9.201716738197424,
+      "grad_norm": 0.33260124921798706,
+      "learning_rate": 3.7817942032436048e-06,
+      "loss": 0.4471,
+      "step": 5360
+    },
+    {
+      "epoch": 9.21030042918455,
+      "grad_norm": 0.3275390863418579,
+      "learning_rate": 3.700536206688321e-06,
+      "loss": 0.4493,
+      "step": 5365
+    },
+    {
+      "epoch": 9.218884120171674,
+      "grad_norm": 0.35647067427635193,
+      "learning_rate": 3.620144238882206e-06,
+      "loss": 0.4491,
+      "step": 5370
+    },
+    {
+      "epoch": 9.227467811158798,
+      "grad_norm": 0.3307458162307739,
+      "learning_rate": 3.5406190227966427e-06,
+      "loss": 0.4504,
+      "step": 5375
+    },
+    {
+      "epoch": 9.236051502145923,
+      "grad_norm": 0.35020336508750916,
+      "learning_rate": 3.4619612736082273e-06,
+      "loss": 0.4577,
+      "step": 5380
+    },
+    {
+      "epoch": 9.244635193133048,
+      "grad_norm": 0.33766666054725647,
+      "learning_rate": 3.3841716986923624e-06,
+      "loss": 0.4531,
+      "step": 5385
+    },
+    {
+      "epoch": 9.253218884120171,
+      "grad_norm": 0.33843091130256653,
+      "learning_rate": 3.3072509976169065e-06,
+      "loss": 0.4564,
+      "step": 5390
+    },
+    {
+      "epoch": 9.261802575107296,
+      "grad_norm": 0.3248330056667328,
+      "learning_rate": 3.2311998621358363e-06,
+      "loss": 0.4526,
+      "step": 5395
+    },
+    {
+      "epoch": 9.270386266094421,
+      "grad_norm": 0.3351515829563141,
+      "learning_rate": 3.1560189761830728e-06,
+      "loss": 0.4544,
+      "step": 5400
+    },
+    {
+      "epoch": 9.278969957081545,
+      "grad_norm": 0.3289077877998352,
+      "learning_rate": 3.0817090158663185e-06,
+      "loss": 0.4449,
+      "step": 5405
+    },
+    {
+      "epoch": 9.28755364806867,
+      "grad_norm": 0.32089975476264954,
+      "learning_rate": 3.008270649460965e-06,
+      "loss": 0.4496,
+      "step": 5410
+    },
+    {
+      "epoch": 9.296137339055793,
+      "grad_norm": 0.2968757748603821,
+      "learning_rate": 2.9357045374040825e-06,
+      "loss": 0.4458,
+      "step": 5415
+    },
+    {
+      "epoch": 9.304721030042918,
+      "grad_norm": 0.34240734577178955,
+      "learning_rate": 2.8640113322885185e-06,
+      "loss": 0.4469,
+      "step": 5420
+    },
+    {
+      "epoch": 9.313304721030043,
+      "grad_norm": 0.33385157585144043,
+      "learning_rate": 2.7931916788569545e-06,
+      "loss": 0.4527,
+      "step": 5425
+    },
+    {
+      "epoch": 9.321888412017167,
+      "grad_norm": 0.34486281871795654,
+      "learning_rate": 2.723246213996178e-06,
+      "loss": 0.4542,
+      "step": 5430
+    },
+    {
+      "epoch": 9.330472103004292,
+      "grad_norm": 0.3246801495552063,
+      "learning_rate": 2.654175566731365e-06,
+      "loss": 0.4574,
+      "step": 5435
+    },
+    {
+      "epoch": 9.339055793991417,
+      "grad_norm": 0.33539149165153503,
+      "learning_rate": 2.5859803582202968e-06,
+      "loss": 0.4457,
+      "step": 5440
+    },
+    {
+      "epoch": 9.34763948497854,
+      "grad_norm": 0.33203625679016113,
+      "learning_rate": 2.518661201747918e-06,
+      "loss": 0.4567,
+      "step": 5445
+    },
+    {
+      "epoch": 9.356223175965665,
+      "grad_norm": 0.32282063364982605,
+      "learning_rate": 2.452218702720821e-06,
+      "loss": 0.4427,
+      "step": 5450
+    },
+    {
+      "epoch": 9.36480686695279,
+      "grad_norm": 0.333141028881073,
+      "learning_rate": 2.3866534586616364e-06,
+      "loss": 0.4548,
+      "step": 5455
+    },
+    {
+      "epoch": 9.373390557939913,
+      "grad_norm": 0.3323938250541687,
+      "learning_rate": 2.3219660592038285e-06,
+      "loss": 0.4558,
+      "step": 5460
+    },
+    {
+      "epoch": 9.381974248927039,
+      "grad_norm": 0.33186817169189453,
+      "learning_rate": 2.258157086086388e-06,
+      "loss": 0.4499,
+      "step": 5465
+    },
+    {
+      "epoch": 9.390557939914164,
+      "grad_norm": 0.33666694164276123,
+      "learning_rate": 2.1952271131484236e-06,
+      "loss": 0.4533,
+      "step": 5470
+    },
+    {
+      "epoch": 9.399141630901287,
+      "grad_norm": 0.3561409115791321,
+      "learning_rate": 2.133176706324236e-06,
+      "loss": 0.4574,
+      "step": 5475
+    },
+    {
+      "epoch": 9.407725321888412,
+      "grad_norm": 0.3282804489135742,
+      "learning_rate": 2.0720064236380842e-06,
+      "loss": 0.4511,
+      "step": 5480
+    },
+    {
+      "epoch": 9.416309012875537,
+      "grad_norm": 0.3417915403842926,
+      "learning_rate": 2.0117168151991606e-06,
+      "loss": 0.4517,
+      "step": 5485
+    },
+    {
+      "epoch": 9.42489270386266,
+      "grad_norm": 0.35541415214538574,
+      "learning_rate": 1.9523084231967358e-06,
+      "loss": 0.4498,
+      "step": 5490
+    },
+    {
+      "epoch": 9.433476394849786,
+      "grad_norm": 0.33606576919555664,
+      "learning_rate": 1.893781781895232e-06,
+      "loss": 0.4466,
+      "step": 5495
+    },
+    {
+      "epoch": 9.44206008583691,
+      "grad_norm": 0.333290159702301,
+      "learning_rate": 1.8361374176293467e-06,
+      "loss": 0.4514,
+      "step": 5500
+    },
+    {
+      "epoch": 9.450643776824034,
+      "grad_norm": 0.3518344461917877,
+      "learning_rate": 1.7793758487994694e-06,
+      "loss": 0.4566,
+      "step": 5505
+    },
+    {
+      "epoch": 9.459227467811159,
+      "grad_norm": 0.44788244366645813,
+      "learning_rate": 1.7234975858669178e-06,
+      "loss": 0.4564,
+      "step": 5510
+    },
+    {
+      "epoch": 9.467811158798284,
+      "grad_norm": 0.34256601333618164,
+      "learning_rate": 1.6685031313493416e-06,
+      "loss": 0.4493,
+      "step": 5515
+    },
+    {
+      "epoch": 9.476394849785407,
+      "grad_norm": 0.33245575428009033,
+      "learning_rate": 1.6143929798162704e-06,
+      "loss": 0.4479,
+      "step": 5520
+    },
+    {
+      "epoch": 9.484978540772532,
+      "grad_norm": 0.34188759326934814,
+      "learning_rate": 1.5611676178845958e-06,
+      "loss": 0.4459,
+      "step": 5525
+    },
+    {
+      "epoch": 9.493562231759658,
+      "grad_norm": 0.32970142364501953,
+      "learning_rate": 1.5088275242142402e-06,
+      "loss": 0.45,
+      "step": 5530
+    },
+    {
+      "epoch": 9.50214592274678,
+      "grad_norm": 0.34352561831474304,
+      "learning_rate": 1.4573731695038395e-06,
+      "loss": 0.452,
+      "step": 5535
+    },
+    {
+      "epoch": 9.510729613733906,
+      "grad_norm": 0.35988888144493103,
+      "learning_rate": 1.4068050164864898e-06,
+      "loss": 0.4497,
+      "step": 5540
+    },
+    {
+      "epoch": 9.51931330472103,
+      "grad_norm": 0.32545995712280273,
+      "learning_rate": 1.3571235199256405e-06,
+      "loss": 0.4515,
+      "step": 5545
+    },
+    {
+      "epoch": 9.527896995708154,
+      "grad_norm": 0.3102465569972992,
+      "learning_rate": 1.30832912661093e-06,
+      "loss": 0.4405,
+      "step": 5550
+    },
+    {
+      "epoch": 9.53648068669528,
+      "grad_norm": 0.32020366191864014,
+      "learning_rate": 1.2604222753542339e-06,
+      "loss": 0.4479,
+      "step": 5555
+    },
+    {
+      "epoch": 9.545064377682403,
+      "grad_norm": 0.357705295085907,
+      "learning_rate": 1.2134033969856907e-06,
+      "loss": 0.4435,
+      "step": 5560
+    },
+    {
+      "epoch": 9.553648068669528,
+      "grad_norm": 0.3494960367679596,
+      "learning_rate": 1.1672729143497929e-06,
+      "loss": 0.4502,
+      "step": 5565
+    },
+    {
+      "epoch": 9.562231759656653,
+      "grad_norm": 0.33992525935173035,
+      "learning_rate": 1.1220312423016687e-06,
+      "loss": 0.4597,
+      "step": 5570
+    },
+    {
+      "epoch": 9.570815450643776,
+      "grad_norm": 0.3502410352230072,
+      "learning_rate": 1.0776787877032736e-06,
+      "loss": 0.4532,
+      "step": 5575
+    },
+    {
+      "epoch": 9.579399141630901,
+      "grad_norm": 0.3116472065448761,
+      "learning_rate": 1.034215949419748e-06,
+      "loss": 0.4447,
+      "step": 5580
+    },
+    {
+      "epoch": 9.587982832618026,
+      "grad_norm": 0.31818586587905884,
+      "learning_rate": 9.916431183158881e-07,
+      "loss": 0.449,
+      "step": 5585
+    },
+    {
+      "epoch": 9.59656652360515,
+      "grad_norm": 0.31284070014953613,
+      "learning_rate": 9.499606772525371e-07,
+      "loss": 0.4426,
+      "step": 5590
+    },
+    {
+      "epoch": 9.605150214592275,
+      "grad_norm": 0.35043418407440186,
+      "learning_rate": 9.091690010831988e-07,
+      "loss": 0.4521,
+      "step": 5595
+    },
+    {
+      "epoch": 9.6137339055794,
+      "grad_norm": 0.352905809879303,
+      "learning_rate": 8.692684566506959e-07,
+      "loss": 0.4451,
+      "step": 5600
+    },
+    {
+      "epoch": 9.622317596566523,
+      "grad_norm": 0.31967219710350037,
+      "learning_rate": 8.30259402783784e-07,
+      "loss": 0.4576,
+      "step": 5605
+    },
+    {
+      "epoch": 9.630901287553648,
+      "grad_norm": 0.3698691129684448,
+      "learning_rate": 7.921421902939874e-07,
+      "loss": 0.4494,
+      "step": 5610
+    },
+    {
+      "epoch": 9.639484978540773,
+      "grad_norm": 0.32335957884788513,
+      "learning_rate": 7.54917161972446e-07,
+      "loss": 0.4464,
+      "step": 5615
+    },
+    {
+      "epoch": 9.648068669527897,
+      "grad_norm": 0.3521655201911926,
+      "learning_rate": 7.185846525867956e-07,
+      "loss": 0.4571,
+      "step": 5620
+    },
+    {
+      "epoch": 9.656652360515022,
+      "grad_norm": 0.3388623893260956,
+      "learning_rate": 6.831449888781926e-07,
+      "loss": 0.453,
+      "step": 5625
+    },
+    {
+      "epoch": 9.665236051502147,
+      "grad_norm": 0.3277793228626251,
+      "learning_rate": 6.485984895583608e-07,
+      "loss": 0.4486,
+      "step": 5630
+    },
+    {
+      "epoch": 9.67381974248927,
+      "grad_norm": 0.32895511388778687,
+      "learning_rate": 6.149454653067044e-07,
+      "loss": 0.4509,
+      "step": 5635
+    },
+    {
+      "epoch": 9.682403433476395,
+      "grad_norm": 0.34782108664512634,
+      "learning_rate": 5.821862187675775e-07,
+      "loss": 0.4537,
+      "step": 5640
+    },
+    {
+      "epoch": 9.690987124463518,
+      "grad_norm": 0.33518868684768677,
+      "learning_rate": 5.503210445474638e-07,
+      "loss": 0.4543,
+      "step": 5645
+    },
+    {
+      "epoch": 9.699570815450643,
+      "grad_norm": 0.33902445435523987,
+      "learning_rate": 5.193502292124341e-07,
+      "loss": 0.4487,
+      "step": 5650
+    },
+    {
+      "epoch": 9.708154506437769,
+      "grad_norm": 0.3404318690299988,
+      "learning_rate": 4.892740512854932e-07,
+      "loss": 0.4597,
+      "step": 5655
+    },
+    {
+      "epoch": 9.716738197424892,
+      "grad_norm": 0.33036890625953674,
+      "learning_rate": 4.600927812441036e-07,
+      "loss": 0.4472,
+      "step": 5660
+    },
+    {
+      "epoch": 9.725321888412017,
+      "grad_norm": 0.34414026141166687,
+      "learning_rate": 4.318066815177435e-07,
+      "loss": 0.4452,
+      "step": 5665
+    },
+    {
+      "epoch": 9.733905579399142,
+      "grad_norm": 0.33726632595062256,
+      "learning_rate": 4.044160064855751e-07,
+      "loss": 0.45,
+      "step": 5670
+    },
+    {
+      "epoch": 9.742489270386265,
+      "grad_norm": 0.31507617235183716,
+      "learning_rate": 3.779210024741131e-07,
+      "loss": 0.4436,
+      "step": 5675
+    },
+    {
+      "epoch": 9.75107296137339,
+      "grad_norm": 0.35751578211784363,
+      "learning_rate": 3.523219077550488e-07,
+      "loss": 0.4514,
+      "step": 5680
+    },
+    {
+      "epoch": 9.759656652360515,
+      "grad_norm": 0.3282091021537781,
+      "learning_rate": 3.2761895254306287e-07,
+      "loss": 0.4472,
+      "step": 5685
+    },
+    {
+      "epoch": 9.768240343347639,
+      "grad_norm": 0.3441978693008423,
+      "learning_rate": 3.038123589938047e-07,
+      "loss": 0.4516,
+      "step": 5690
+    },
+    {
+      "epoch": 9.776824034334764,
+      "grad_norm": 0.33709728717803955,
+      "learning_rate": 2.8090234120188295e-07,
+      "loss": 0.4508,
+      "step": 5695
+    },
+    {
+      "epoch": 9.785407725321889,
+      "grad_norm": 0.32524409890174866,
+      "learning_rate": 2.588891051988895e-07,
+      "loss": 0.445,
+      "step": 5700
+    },
+    {
+      "epoch": 9.793991416309012,
+      "grad_norm": 0.3647370934486389,
+      "learning_rate": 2.3777284895162288e-07,
+      "loss": 0.444,
+      "step": 5705
+    },
+    {
+      "epoch": 9.802575107296137,
+      "grad_norm": 0.3221174478530884,
+      "learning_rate": 2.1755376236025637e-07,
+      "loss": 0.4478,
+      "step": 5710
+    },
+    {
+      "epoch": 9.811158798283262,
+      "grad_norm": 0.35065704584121704,
+      "learning_rate": 1.9823202725665068e-07,
+      "loss": 0.4538,
+      "step": 5715
+    },
+    {
+      "epoch": 9.819742489270386,
+      "grad_norm": 0.3380087912082672,
+      "learning_rate": 1.7980781740268848e-07,
+      "loss": 0.4477,
+      "step": 5720
+    },
+    {
+      "epoch": 9.82832618025751,
+      "grad_norm": 0.3177869915962219,
+      "learning_rate": 1.622812984887867e-07,
+      "loss": 0.4496,
+      "step": 5725
+    },
+    {
+      "epoch": 9.836909871244636,
+      "grad_norm": 0.3140113353729248,
+      "learning_rate": 1.4565262813230894e-07,
+      "loss": 0.4485,
+      "step": 5730
+    },
+    {
+      "epoch": 9.84549356223176,
+      "grad_norm": 0.32537147402763367,
+      "learning_rate": 1.2992195587619993e-07,
+      "loss": 0.4433,
+      "step": 5735
+    },
+    {
+      "epoch": 9.854077253218884,
+      "grad_norm": 0.3178805410861969,
+      "learning_rate": 1.1508942318767535e-07,
+      "loss": 0.4465,
+      "step": 5740
+    },
+    {
+      "epoch": 9.86266094420601,
+      "grad_norm": 0.33153483271598816,
+      "learning_rate": 1.0115516345686749e-07,
+      "loss": 0.4567,
+      "step": 5745
+    },
+    {
+      "epoch": 9.871244635193133,
+      "grad_norm": 0.31883201003074646,
+      "learning_rate": 8.811930199568163e-08,
+      "loss": 0.449,
+      "step": 5750
+    },
+    {
+      "epoch": 9.879828326180258,
+      "grad_norm": 0.3389532268047333,
+      "learning_rate": 7.598195603666369e-08,
+      "loss": 0.4533,
+      "step": 5755
+    },
+    {
+      "epoch": 9.888412017167383,
+      "grad_norm": 0.3354385197162628,
+      "learning_rate": 6.474323473194543e-08,
+      "loss": 0.4558,
+      "step": 5760
+    },
+    {
+      "epoch": 9.896995708154506,
+      "grad_norm": 0.3773539364337921,
+      "learning_rate": 5.4403239152212013e-08,
+      "loss": 0.4524,
+      "step": 5765
+    },
+    {
+      "epoch": 9.905579399141631,
+      "grad_norm": 0.30916520953178406,
+      "learning_rate": 4.4962062285902607e-08,
+      "loss": 0.4474,
+      "step": 5770
+    },
+    {
+      "epoch": 9.914163090128756,
+      "grad_norm": 0.3748643100261688,
+      "learning_rate": 3.6419789038244504e-08,
+      "loss": 0.4449,
+      "step": 5775
+    },
+    {
+      "epoch": 9.92274678111588,
+      "grad_norm": 0.33963683247566223,
+      "learning_rate": 2.877649623059808e-08,
+      "loss": 0.4491,
+      "step": 5780
+    },
+    {
+      "epoch": 9.931330472103005,
+      "grad_norm": 0.3399483859539032,
+      "learning_rate": 2.2032252599690773e-08,
+      "loss": 0.4523,
+      "step": 5785
+    },
+    {
+      "epoch": 9.939914163090128,
+      "grad_norm": 0.3306158781051636,
+      "learning_rate": 1.6187118797061917e-08,
+      "loss": 0.4551,
+      "step": 5790
+    },
+    {
+      "epoch": 9.948497854077253,
+      "grad_norm": 0.3417421579360962,
+      "learning_rate": 1.1241147388452167e-08,
+      "loss": 0.4527,
+      "step": 5795
+    },
+    {
+      "epoch": 9.957081545064378,
+      "grad_norm": 0.34274721145629883,
+      "learning_rate": 7.194382853370485e-09,
+      "loss": 0.4442,
+      "step": 5800
+    },
+    {
+      "epoch": 9.965665236051501,
+      "grad_norm": 0.36596229672431946,
+      "learning_rate": 4.046861584705575e-09,
+      "loss": 0.4545,
+      "step": 5805
+    },
+    {
+      "epoch": 9.974248927038627,
+      "grad_norm": 0.3520822823047638,
+      "learning_rate": 1.798611888370605e-09,
+      "loss": 0.439,
+      "step": 5810
+    },
+    {
+      "epoch": 9.982832618025752,
+      "grad_norm": 0.3437098562717438,
+      "learning_rate": 4.4965398303675745e-10,
+      "loss": 0.4463,
+      "step": 5815
+    },
+    {
+      "epoch": 9.991416309012875,
+      "grad_norm": 0.3176514208316803,
+      "learning_rate": 0.0,
+      "loss": 0.454,
+      "step": 5820
+    },
+    {
+      "epoch": 9.991416309012875,
+      "eval_loss": 3.8423588275909424,
+      "eval_runtime": 0.4223,
+      "eval_samples_per_second": 14.208,
+      "eval_steps_per_second": 2.368,
+      "step": 5820
+    },
+    {
+      "epoch": 9.991416309012875,
+      "step": 5820,
+      "total_flos": 8.683561975386472e+18,
+      "train_loss": 0.6765515476977293,
+      "train_runtime": 24420.5896,
+      "train_samples_per_second": 7.63,
+      "train_steps_per_second": 0.238
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 5820,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.683561975386472e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}