diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,15560 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.9995261786306564,
+  "eval_steps": 22,
+  "global_step": 2110,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0009476427386875149,
+      "grad_norm": 20.756122057594123,
+      "learning_rate": 1.5625e-07,
+      "loss": 1.6431,
+      "step": 1
+    },
+    {
+      "epoch": 0.0018952854773750297,
+      "grad_norm": 17.596939119407057,
+      "learning_rate": 3.125e-07,
+      "loss": 1.5603,
+      "step": 2
+    },
+    {
+      "epoch": 0.0028429282160625444,
+      "grad_norm": 24.550176754105873,
+      "learning_rate": 4.6875000000000006e-07,
+      "loss": 1.695,
+      "step": 3
+    },
+    {
+      "epoch": 0.0037905709547500594,
+      "grad_norm": 23.98623237019658,
+      "learning_rate": 6.25e-07,
+      "loss": 1.7187,
+      "step": 4
+    },
+    {
+      "epoch": 0.004738213693437574,
+      "grad_norm": 40.536528584444476,
+      "learning_rate": 7.8125e-07,
+      "loss": 1.6799,
+      "step": 5
+    },
+    {
+      "epoch": 0.005685856432125089,
+      "grad_norm": 30.04208597312355,
+      "learning_rate": 9.375000000000001e-07,
+      "loss": 1.7002,
+      "step": 6
+    },
+    {
+      "epoch": 0.006633499170812604,
+      "grad_norm": 18.07515828827753,
+      "learning_rate": 1.0937500000000001e-06,
+      "loss": 1.5672,
+      "step": 7
+    },
+    {
+      "epoch": 0.007581141909500119,
+      "grad_norm": 33.28181380236923,
+      "learning_rate": 1.25e-06,
+      "loss": 1.7278,
+      "step": 8
+    },
+    {
+      "epoch": 0.008528784648187633,
+      "grad_norm": 16.829151356577462,
+      "learning_rate": 1.40625e-06,
+      "loss": 1.7151,
+      "step": 9
+    },
+    {
+      "epoch": 0.009476427386875147,
+      "grad_norm": 17.773159683100857,
+      "learning_rate": 1.5625e-06,
+      "loss": 1.5353,
+      "step": 10
+    },
+    {
+      "epoch": 0.010424070125562663,
+      "grad_norm": 18.419114948105452,
+      "learning_rate": 1.71875e-06,
+      "loss": 1.6073,
+      "step": 11
+    },
+    {
+      "epoch": 0.011371712864250177,
+      "grad_norm": 13.6273956773613,
+      "learning_rate": 1.8750000000000003e-06,
+      "loss": 1.5124,
+      "step": 12
+    },
+    {
+      "epoch": 0.012319355602937692,
+      "grad_norm": 14.235281012395534,
+      "learning_rate": 2.0312500000000002e-06,
+      "loss": 1.4477,
+      "step": 13
+    },
+    {
+      "epoch": 0.013266998341625208,
+      "grad_norm": 8.20052746448492,
+      "learning_rate": 2.1875000000000002e-06,
+      "loss": 1.4623,
+      "step": 14
+    },
+    {
+      "epoch": 0.014214641080312722,
+      "grad_norm": 11.658169078611403,
+      "learning_rate": 2.3437500000000002e-06,
+      "loss": 1.4695,
+      "step": 15
+    },
+    {
+      "epoch": 0.015162283819000238,
+      "grad_norm": 6.911597435569106,
+      "learning_rate": 2.5e-06,
+      "loss": 1.4164,
+      "step": 16
+    },
+    {
+      "epoch": 0.01610992655768775,
+      "grad_norm": 6.514587154523329,
+      "learning_rate": 2.65625e-06,
+      "loss": 1.3714,
+      "step": 17
+    },
+    {
+      "epoch": 0.017057569296375266,
+      "grad_norm": 7.470333408959338,
+      "learning_rate": 2.8125e-06,
+      "loss": 1.4672,
+      "step": 18
+    },
+    {
+      "epoch": 0.018005212035062782,
+      "grad_norm": 11.59118534986973,
+      "learning_rate": 2.96875e-06,
+      "loss": 1.3828,
+      "step": 19
+    },
+    {
+      "epoch": 0.018952854773750295,
+      "grad_norm": 3.9301638513755988,
+      "learning_rate": 3.125e-06,
+      "loss": 1.4137,
+      "step": 20
+    },
+    {
+      "epoch": 0.01990049751243781,
+      "grad_norm": 4.6357203859292495,
+      "learning_rate": 3.28125e-06,
+      "loss": 1.3915,
+      "step": 21
+    },
+    {
+      "epoch": 0.020848140251125327,
+      "grad_norm": 4.530214142494876,
+      "learning_rate": 3.4375e-06,
+      "loss": 1.3658,
+      "step": 22
+    },
+    {
+      "epoch": 0.020848140251125327,
+      "eval_loss": 1.2158129215240479,
+      "eval_runtime": 60.2576,
+      "eval_samples_per_second": 45.272,
+      "eval_steps_per_second": 0.714,
+      "step": 22
+    },
+    {
+      "epoch": 0.02179578298981284,
+      "grad_norm": 3.7036600794639396,
+      "learning_rate": 3.59375e-06,
+      "loss": 1.3058,
+      "step": 23
+    },
+    {
+      "epoch": 0.022743425728500355,
+      "grad_norm": 4.229128089681862,
+      "learning_rate": 3.7500000000000005e-06,
+      "loss": 1.2303,
+      "step": 24
+    },
+    {
+      "epoch": 0.02369106846718787,
+      "grad_norm": 5.376319236600739,
+      "learning_rate": 3.90625e-06,
+      "loss": 1.2898,
+      "step": 25
+    },
+    {
+      "epoch": 0.024638711205875383,
+      "grad_norm": 3.621037119144236,
+      "learning_rate": 4.0625000000000005e-06,
+      "loss": 1.2614,
+      "step": 26
+    },
+    {
+      "epoch": 0.0255863539445629,
+      "grad_norm": 3.7708968075921865,
+      "learning_rate": 4.21875e-06,
+      "loss": 1.3618,
+      "step": 27
+    },
+    {
+      "epoch": 0.026533996683250415,
+      "grad_norm": 4.5895243093772535,
+      "learning_rate": 4.3750000000000005e-06,
+      "loss": 1.2304,
+      "step": 28
+    },
+    {
+      "epoch": 0.027481639421937928,
+      "grad_norm": 10.299410261479563,
+      "learning_rate": 4.53125e-06,
+      "loss": 1.2345,
+      "step": 29
+    },
+    {
+      "epoch": 0.028429282160625444,
+      "grad_norm": 3.8644074526148184,
+      "learning_rate": 4.6875000000000004e-06,
+      "loss": 1.2523,
+      "step": 30
+    },
+    {
+      "epoch": 0.02937692489931296,
+      "grad_norm": 3.547834365401974,
+      "learning_rate": 4.84375e-06,
+      "loss": 1.2344,
+      "step": 31
+    },
+    {
+      "epoch": 0.030324567638000476,
+      "grad_norm": 3.578798181550234,
+      "learning_rate": 5e-06,
+      "loss": 1.2818,
+      "step": 32
+    },
+    {
+      "epoch": 0.03127221037668799,
+      "grad_norm": 3.711700954572684,
+      "learning_rate": 5.156250000000001e-06,
+      "loss": 1.2483,
+      "step": 33
+    },
+    {
+      "epoch": 0.0322198531153755,
+      "grad_norm": 3.5746276017801537,
+      "learning_rate": 5.3125e-06,
+      "loss": 1.2565,
+      "step": 34
+    },
+    {
+      "epoch": 0.03316749585406302,
+      "grad_norm": 3.7129697591402016,
+      "learning_rate": 5.468750000000001e-06,
+      "loss": 1.2261,
+      "step": 35
+    },
+    {
+      "epoch": 0.03411513859275053,
+      "grad_norm": 3.1701938959510656,
+      "learning_rate": 5.625e-06,
+      "loss": 1.1836,
+      "step": 36
+    },
+    {
+      "epoch": 0.035062781331438045,
+      "grad_norm": 3.081675212149766,
+      "learning_rate": 5.781250000000001e-06,
+      "loss": 1.1683,
+      "step": 37
+    },
+    {
+      "epoch": 0.036010424070125564,
+      "grad_norm": 4.351693679221342,
+      "learning_rate": 5.9375e-06,
+      "loss": 1.2133,
+      "step": 38
+    },
+    {
+      "epoch": 0.03695806680881308,
+      "grad_norm": 3.1336691817253204,
+      "learning_rate": 6.093750000000001e-06,
+      "loss": 1.1948,
+      "step": 39
+    },
+    {
+      "epoch": 0.03790570954750059,
+      "grad_norm": 2.703982626093151,
+      "learning_rate": 6.25e-06,
+      "loss": 1.1376,
+      "step": 40
+    },
+    {
+      "epoch": 0.03885335228618811,
+      "grad_norm": 3.003118804501732,
+      "learning_rate": 6.406250000000001e-06,
+      "loss": 1.2059,
+      "step": 41
+    },
+    {
+      "epoch": 0.03980099502487562,
+      "grad_norm": 3.3721112860961577,
+      "learning_rate": 6.5625e-06,
+      "loss": 1.2294,
+      "step": 42
+    },
+    {
+      "epoch": 0.040748637763563134,
+      "grad_norm": 2.935148387991293,
+      "learning_rate": 6.718750000000001e-06,
+      "loss": 1.2948,
+      "step": 43
+    },
+    {
+      "epoch": 0.04169628050225065,
+      "grad_norm": 2.7546718703597,
+      "learning_rate": 6.875e-06,
+      "loss": 1.1058,
+      "step": 44
+    },
+    {
+      "epoch": 0.04169628050225065,
+      "eval_loss": 1.1239995956420898,
+      "eval_runtime": 62.5263,
+      "eval_samples_per_second": 43.63,
+      "eval_steps_per_second": 0.688,
+      "step": 44
+    },
+    {
+      "epoch": 0.042643923240938165,
+      "grad_norm": 3.2372160639143885,
+      "learning_rate": 7.031250000000001e-06,
+      "loss": 1.187,
+      "step": 45
+    },
+    {
+      "epoch": 0.04359156597962568,
+      "grad_norm": 3.3104832856910233,
+      "learning_rate": 7.1875e-06,
+      "loss": 1.1547,
+      "step": 46
+    },
+    {
+      "epoch": 0.0445392087183132,
+      "grad_norm": 2.9630493187419096,
+      "learning_rate": 7.343750000000001e-06,
+      "loss": 1.205,
+      "step": 47
+    },
+    {
+      "epoch": 0.04548685145700071,
+      "grad_norm": 2.8169766087618537,
+      "learning_rate": 7.500000000000001e-06,
+      "loss": 1.1583,
+      "step": 48
+    },
+    {
+      "epoch": 0.04643449419568822,
+      "grad_norm": 3.0223679686127736,
+      "learning_rate": 7.656250000000001e-06,
+      "loss": 1.1546,
+      "step": 49
+    },
+    {
+      "epoch": 0.04738213693437574,
+      "grad_norm": 2.9245386601496417,
+      "learning_rate": 7.8125e-06,
+      "loss": 1.0963,
+      "step": 50
+    },
+    {
+      "epoch": 0.048329779673063254,
+      "grad_norm": 3.3416755825594207,
+      "learning_rate": 7.96875e-06,
+      "loss": 1.0911,
+      "step": 51
+    },
+    {
+      "epoch": 0.04927742241175077,
+      "grad_norm": 3.2146723217948754,
+      "learning_rate": 8.125000000000001e-06,
+      "loss": 1.1444,
+      "step": 52
+    },
+    {
+      "epoch": 0.050225065150438286,
+      "grad_norm": 3.7591880901694688,
+      "learning_rate": 8.281250000000001e-06,
+      "loss": 1.1644,
+      "step": 53
+    },
+    {
+      "epoch": 0.0511727078891258,
+      "grad_norm": 3.597908493062599,
+      "learning_rate": 8.4375e-06,
+      "loss": 1.1245,
+      "step": 54
+    },
+    {
+      "epoch": 0.05212035062781331,
+      "grad_norm": 3.499015413751106,
+      "learning_rate": 8.59375e-06,
+      "loss": 1.172,
+      "step": 55
+    },
+    {
+      "epoch": 0.05306799336650083,
+      "grad_norm": 3.309932625198402,
+      "learning_rate": 8.750000000000001e-06,
+      "loss": 1.1184,
+      "step": 56
+    },
+    {
+      "epoch": 0.05401563610518834,
+      "grad_norm": 3.2523198848476125,
+      "learning_rate": 8.906250000000001e-06,
+      "loss": 1.1571,
+      "step": 57
+    },
+    {
+      "epoch": 0.054963278843875855,
+      "grad_norm": 3.2980910638210545,
+      "learning_rate": 9.0625e-06,
+      "loss": 1.132,
+      "step": 58
+    },
+    {
+      "epoch": 0.055910921582563375,
+      "grad_norm": 3.1322189199908053,
+      "learning_rate": 9.21875e-06,
+      "loss": 1.0936,
+      "step": 59
+    },
+    {
+      "epoch": 0.05685856432125089,
+      "grad_norm": 3.4181316461149884,
+      "learning_rate": 9.375000000000001e-06,
+      "loss": 1.1518,
+      "step": 60
+    },
+    {
+      "epoch": 0.0578062070599384,
+      "grad_norm": 3.238262923289073,
+      "learning_rate": 9.531250000000001e-06,
+      "loss": 1.0584,
+      "step": 61
+    },
+    {
+      "epoch": 0.05875384979862592,
+      "grad_norm": 3.1080324386613274,
+      "learning_rate": 9.6875e-06,
+      "loss": 1.1286,
+      "step": 62
+    },
+    {
+      "epoch": 0.05970149253731343,
+      "grad_norm": 3.181648226621564,
+      "learning_rate": 9.84375e-06,
+      "loss": 1.0876,
+      "step": 63
+    },
+    {
+      "epoch": 0.06064913527600095,
+      "grad_norm": 2.937644097353525,
+      "learning_rate": 1e-05,
+      "loss": 1.1379,
+      "step": 64
+    },
+    {
+      "epoch": 0.061596778014688464,
+      "grad_norm": 2.958067762883821,
+      "learning_rate": 9.99999410575193e-06,
+      "loss": 1.1133,
+      "step": 65
+    },
+    {
+      "epoch": 0.06254442075337598,
+      "grad_norm": 2.96685444746321,
+      "learning_rate": 9.999976423021617e-06,
+      "loss": 1.1313,
+      "step": 66
+    },
+    {
+      "epoch": 0.06254442075337598,
+      "eval_loss": 1.0586260557174683,
+      "eval_runtime": 62.0231,
+      "eval_samples_per_second": 43.984,
+      "eval_steps_per_second": 0.693,
+      "step": 66
+    },
+    {
+      "epoch": 0.06349206349206349,
+      "grad_norm": 3.566526642424616,
+      "learning_rate": 9.99994695185075e-06,
+      "loss": 1.1156,
+      "step": 67
+    },
+    {
+      "epoch": 0.064439706230751,
+      "grad_norm": 3.220513610820652,
+      "learning_rate": 9.999905692308813e-06,
+      "loss": 1.0942,
+      "step": 68
+    },
+    {
+      "epoch": 0.06538734896943853,
+      "grad_norm": 2.545710751356861,
+      "learning_rate": 9.999852644493086e-06,
+      "loss": 1.0751,
+      "step": 69
+    },
+    {
+      "epoch": 0.06633499170812604,
+      "grad_norm": 3.3933596790334772,
+      "learning_rate": 9.999787808528639e-06,
+      "loss": 1.1213,
+      "step": 70
+    },
+    {
+      "epoch": 0.06728263444681355,
+      "grad_norm": 2.9414176412578303,
+      "learning_rate": 9.999711184568334e-06,
+      "loss": 1.0759,
+      "step": 71
+    },
+    {
+      "epoch": 0.06823027718550106,
+      "grad_norm": 2.8170462064724937,
+      "learning_rate": 9.999622772792829e-06,
+      "loss": 1.0679,
+      "step": 72
+    },
+    {
+      "epoch": 0.06917791992418858,
+      "grad_norm": 2.82044984183835,
+      "learning_rate": 9.99952257341057e-06,
+      "loss": 1.0539,
+      "step": 73
+    },
+    {
+      "epoch": 0.07012556266287609,
+      "grad_norm": 2.6332325002107333,
+      "learning_rate": 9.999410586657801e-06,
+      "loss": 1.0523,
+      "step": 74
+    },
+    {
+      "epoch": 0.07107320540156362,
+      "grad_norm": 3.0223476132960276,
+      "learning_rate": 9.99928681279855e-06,
+      "loss": 1.0355,
+      "step": 75
+    },
+    {
+      "epoch": 0.07202084814025113,
+      "grad_norm": 3.8896388319954753,
+      "learning_rate": 9.999151252124639e-06,
+      "loss": 1.1244,
+      "step": 76
+    },
+    {
+      "epoch": 0.07296849087893864,
+      "grad_norm": 2.542497788954799,
+      "learning_rate": 9.99900390495568e-06,
+      "loss": 0.9883,
+      "step": 77
+    },
+    {
+      "epoch": 0.07391613361762615,
+      "grad_norm": 2.75973540600498,
+      "learning_rate": 9.998844771639073e-06,
+      "loss": 1.0339,
+      "step": 78
+    },
+    {
+      "epoch": 0.07486377635631367,
+      "grad_norm": 12.963000608570283,
+      "learning_rate": 9.998673852550007e-06,
+      "loss": 1.0512,
+      "step": 79
+    },
+    {
+      "epoch": 0.07581141909500118,
+      "grad_norm": 2.3419089920048184,
+      "learning_rate": 9.998491148091457e-06,
+      "loss": 1.0479,
+      "step": 80
+    },
+    {
+      "epoch": 0.0767590618336887,
+      "grad_norm": 2.53974833474546,
+      "learning_rate": 9.998296658694185e-06,
+      "loss": 0.9376,
+      "step": 81
+    },
+    {
+      "epoch": 0.07770670457237622,
+      "grad_norm": 2.455905437125952,
+      "learning_rate": 9.99809038481674e-06,
+      "loss": 0.9294,
+      "step": 82
+    },
+    {
+      "epoch": 0.07865434731106373,
+      "grad_norm": 2.7870267445347343,
+      "learning_rate": 9.997872326945452e-06,
+      "loss": 1.0241,
+      "step": 83
+    },
+    {
+      "epoch": 0.07960199004975124,
+      "grad_norm": 2.3034635295759256,
+      "learning_rate": 9.997642485594436e-06,
+      "loss": 1.0372,
+      "step": 84
+    },
+    {
+      "epoch": 0.08054963278843875,
+      "grad_norm": 2.243933826068351,
+      "learning_rate": 9.99740086130559e-06,
+      "loss": 1.0229,
+      "step": 85
+    },
+    {
+      "epoch": 0.08149727552712627,
+      "grad_norm": 2.1942261875771,
+      "learning_rate": 9.99714745464859e-06,
+      "loss": 1.0116,
+      "step": 86
+    },
+    {
+      "epoch": 0.0824449182658138,
+      "grad_norm": 1.9009368734428258,
+      "learning_rate": 9.996882266220895e-06,
+      "loss": 0.9982,
+      "step": 87
+    },
+    {
+      "epoch": 0.0833925610045013,
+      "grad_norm": 2.197821917224577,
+      "learning_rate": 9.996605296647737e-06,
+      "loss": 1.0379,
+      "step": 88
+    },
+    {
+      "epoch": 0.0833925610045013,
+      "eval_loss": 1.0077329874038696,
+      "eval_runtime": 69.6734,
+      "eval_samples_per_second": 39.154,
+      "eval_steps_per_second": 0.617,
+      "step": 88
+    },
+    {
+      "epoch": 0.08434020374318882,
+      "grad_norm": 1.844506043554688,
+      "learning_rate": 9.99631654658213e-06,
+      "loss": 0.8956,
+      "step": 89
+    },
+    {
+      "epoch": 0.08528784648187633,
+      "grad_norm": 3.4121023273390283,
+      "learning_rate": 9.996016016704854e-06,
+      "loss": 0.9807,
+      "step": 90
+    },
+    {
+      "epoch": 0.08623548922056384,
+      "grad_norm": 1.629161161843959,
+      "learning_rate": 9.995703707724474e-06,
+      "loss": 0.9534,
+      "step": 91
+    },
+    {
+      "epoch": 0.08718313195925136,
+      "grad_norm": 2.1015114519988893,
+      "learning_rate": 9.995379620377319e-06,
+      "loss": 0.9817,
+      "step": 92
+    },
+    {
+      "epoch": 0.08813077469793888,
+      "grad_norm": 1.6484379590895988,
+      "learning_rate": 9.995043755427487e-06,
+      "loss": 0.9181,
+      "step": 93
+    },
+    {
+      "epoch": 0.0890784174366264,
+      "grad_norm": 1.5470369951292255,
+      "learning_rate": 9.99469611366685e-06,
+      "loss": 0.9655,
+      "step": 94
+    },
+    {
+      "epoch": 0.09002606017531391,
+      "grad_norm": 1.7131607780023708,
+      "learning_rate": 9.994336695915041e-06,
+      "loss": 0.9522,
+      "step": 95
+    },
+    {
+      "epoch": 0.09097370291400142,
+      "grad_norm": 3.4039383281352684,
+      "learning_rate": 9.993965503019457e-06,
+      "loss": 0.9977,
+      "step": 96
+    },
+    {
+      "epoch": 0.09192134565268893,
+      "grad_norm": 3.2140501680385536,
+      "learning_rate": 9.993582535855265e-06,
+      "loss": 0.8933,
+      "step": 97
+    },
+    {
+      "epoch": 0.09286898839137644,
+      "grad_norm": 1.9418785282824027,
+      "learning_rate": 9.993187795325381e-06,
+      "loss": 1.0526,
+      "step": 98
+    },
+    {
+      "epoch": 0.09381663113006397,
+      "grad_norm": 1.4531258804082263,
+      "learning_rate": 9.992781282360486e-06,
+      "loss": 0.9921,
+      "step": 99
+    },
+    {
+      "epoch": 0.09476427386875148,
+      "grad_norm": 2.0708416718589273,
+      "learning_rate": 9.992362997919016e-06,
+      "loss": 0.9248,
+      "step": 100
+    },
+    {
+      "epoch": 0.095711916607439,
+      "grad_norm": 1.7208209294560886,
+      "learning_rate": 9.99193294298716e-06,
+      "loss": 0.9778,
+      "step": 101
+    },
+    {
+      "epoch": 0.09665955934612651,
+      "grad_norm": 1.573086346708121,
+      "learning_rate": 9.991491118578856e-06,
+      "loss": 0.9369,
+      "step": 102
+    },
+    {
+      "epoch": 0.09760720208481402,
+      "grad_norm": 1.716766032223459,
+      "learning_rate": 9.991037525735794e-06,
+      "loss": 0.9718,
+      "step": 103
+    },
+    {
+      "epoch": 0.09855484482350153,
+      "grad_norm": 1.4683346581590864,
+      "learning_rate": 9.990572165527413e-06,
+      "loss": 1.0322,
+      "step": 104
+    },
+    {
+      "epoch": 0.09950248756218906,
+      "grad_norm": 1.6854495739199276,
+      "learning_rate": 9.990095039050886e-06,
+      "loss": 1.0259,
+      "step": 105
+    },
+    {
+      "epoch": 0.10045013030087657,
+      "grad_norm": 1.5007691822796942,
+      "learning_rate": 9.98960614743114e-06,
+      "loss": 1.012,
+      "step": 106
+    },
+    {
+      "epoch": 0.10139777303956408,
+      "grad_norm": 1.688237803394784,
+      "learning_rate": 9.98910549182083e-06,
+      "loss": 0.9611,
+      "step": 107
+    },
+    {
+      "epoch": 0.1023454157782516,
+      "grad_norm": 1.2412298690878776,
+      "learning_rate": 9.988593073400354e-06,
+      "loss": 0.9543,
+      "step": 108
+    },
+    {
+      "epoch": 0.10329305851693911,
+      "grad_norm": 1.4280638521919857,
+      "learning_rate": 9.988068893377841e-06,
+      "loss": 1.0555,
+      "step": 109
+    },
+    {
+      "epoch": 0.10424070125562662,
+      "grad_norm": 1.6064972212595918,
+      "learning_rate": 9.987532952989145e-06,
+      "loss": 0.947,
+      "step": 110
+    },
+    {
+      "epoch": 0.10424070125562662,
+      "eval_loss": 0.9927965998649597,
+      "eval_runtime": 64.552,
+      "eval_samples_per_second": 42.261,
+      "eval_steps_per_second": 0.666,
+      "step": 110
+    },
+    {
+      "epoch": 0.10518834399431415,
+      "grad_norm": 1.3706857779346073,
+      "learning_rate": 9.986985253497859e-06,
+      "loss": 0.958,
+      "step": 111
+    },
+    {
+      "epoch": 0.10613598673300166,
+      "grad_norm": 1.1044678274982156,
+      "learning_rate": 9.986425796195287e-06,
+      "loss": 0.9613,
+      "step": 112
+    },
+    {
+      "epoch": 0.10708362947168917,
+      "grad_norm": 1.2204032164258463,
+      "learning_rate": 9.985854582400465e-06,
+      "loss": 0.9637,
+      "step": 113
+    },
+    {
+      "epoch": 0.10803127221037669,
+      "grad_norm": 1.387543526294841,
+      "learning_rate": 9.985271613460144e-06,
+      "loss": 0.988,
+      "step": 114
+    },
+    {
+      "epoch": 0.1089789149490642,
+      "grad_norm": 1.5020093056571016,
+      "learning_rate": 9.984676890748787e-06,
+      "loss": 0.986,
+      "step": 115
+    },
+    {
+      "epoch": 0.10992655768775171,
+      "grad_norm": 1.503672693192212,
+      "learning_rate": 9.984070415668574e-06,
+      "loss": 0.9858,
+      "step": 116
+    },
+    {
+      "epoch": 0.11087420042643924,
+      "grad_norm": 1.4535398323237316,
+      "learning_rate": 9.983452189649388e-06,
+      "loss": 0.9324,
+      "step": 117
+    },
+    {
+      "epoch": 0.11182184316512675,
+      "grad_norm": 1.4452814142099772,
+      "learning_rate": 9.98282221414882e-06,
+      "loss": 0.9353,
+      "step": 118
+    },
+    {
+      "epoch": 0.11276948590381426,
+      "grad_norm": 1.2444175503325663,
+      "learning_rate": 9.982180490652165e-06,
+      "loss": 0.9864,
+      "step": 119
+    },
+    {
+      "epoch": 0.11371712864250177,
+      "grad_norm": 1.484366794735356,
+      "learning_rate": 9.981527020672413e-06,
+      "loss": 0.9683,
+      "step": 120
+    },
+    {
+      "epoch": 0.11466477138118929,
+      "grad_norm": 1.6287273073283466,
+      "learning_rate": 9.98086180575025e-06,
+      "loss": 0.9295,
+      "step": 121
+    },
+    {
+      "epoch": 0.1156124141198768,
+      "grad_norm": 2.1645074117977363,
+      "learning_rate": 9.980184847454052e-06,
+      "loss": 0.9474,
+      "step": 122
+    },
+    {
+      "epoch": 0.11656005685856433,
+      "grad_norm": 1.4428592940450689,
+      "learning_rate": 9.979496147379883e-06,
+      "loss": 1.0116,
+      "step": 123
+    },
+    {
+      "epoch": 0.11750769959725184,
+      "grad_norm": 1.442916857159405,
+      "learning_rate": 9.978795707151492e-06,
+      "loss": 0.9565,
+      "step": 124
+    },
+    {
+      "epoch": 0.11845534233593935,
+      "grad_norm": 1.3024301965632774,
+      "learning_rate": 9.978083528420303e-06,
+      "loss": 0.9471,
+      "step": 125
+    },
+    {
+      "epoch": 0.11940298507462686,
+      "grad_norm": 1.2357348655586111,
+      "learning_rate": 9.977359612865424e-06,
+      "loss": 0.9684,
+      "step": 126
+    },
+    {
+      "epoch": 0.12035062781331438,
+      "grad_norm": 1.4890088752220003,
+      "learning_rate": 9.976623962193627e-06,
+      "loss": 0.9535,
+      "step": 127
+    },
+    {
+      "epoch": 0.1212982705520019,
+      "grad_norm": 1.3469393114862873,
+      "learning_rate": 9.975876578139355e-06,
+      "loss": 0.986,
+      "step": 128
+    },
+    {
+      "epoch": 0.12224591329068941,
+      "grad_norm": 1.5755457587661565,
+      "learning_rate": 9.975117462464716e-06,
+      "loss": 1.019,
+      "step": 129
+    },
+    {
+      "epoch": 0.12319355602937693,
+      "grad_norm": 1.4168899024501311,
+      "learning_rate": 9.974346616959476e-06,
+      "loss": 0.9368,
+      "step": 130
+    },
+    {
+      "epoch": 0.12414119876806444,
+      "grad_norm": 1.4122617861876974,
+      "learning_rate": 9.973564043441057e-06,
+      "loss": 0.9563,
+      "step": 131
+    },
+    {
+      "epoch": 0.12508884150675195,
+      "grad_norm": 1.3777217200154346,
+      "learning_rate": 9.972769743754532e-06,
+      "loss": 1.0045,
+      "step": 132
+    },
+    {
+      "epoch": 0.12508884150675195,
+      "eval_loss": 0.9894475340843201,
+      "eval_runtime": 67.9151,
+      "eval_samples_per_second": 40.168,
+      "eval_steps_per_second": 0.633,
+      "step": 132
+    },
+    {
+      "epoch": 0.12603648424543948,
+      "grad_norm": 1.3796166756668033,
+      "learning_rate": 9.971963719772621e-06,
+      "loss": 0.9492,
+      "step": 133
+    },
+    {
+      "epoch": 0.12698412698412698,
+      "grad_norm": 1.8484880273102298,
+      "learning_rate": 9.971145973395685e-06,
+      "loss": 0.9844,
+      "step": 134
+    },
+    {
+      "epoch": 0.1279317697228145,
+      "grad_norm": 1.3341411571681234,
+      "learning_rate": 9.970316506551726e-06,
+      "loss": 0.9752,
+      "step": 135
+    },
+    {
+      "epoch": 0.128879412461502,
+      "grad_norm": 1.6561053976261508,
+      "learning_rate": 9.969475321196374e-06,
+      "loss": 0.9745,
+      "step": 136
+    },
+    {
+      "epoch": 0.12982705520018953,
+      "grad_norm": 1.4801234453473886,
+      "learning_rate": 9.968622419312895e-06,
+      "loss": 0.983,
+      "step": 137
+    },
+    {
+      "epoch": 0.13077469793887705,
+      "grad_norm": 1.650605001197027,
+      "learning_rate": 9.967757802912172e-06,
+      "loss": 0.9226,
+      "step": 138
+    },
+    {
+      "epoch": 0.13172234067756455,
+      "grad_norm": 1.5926202309459188,
+      "learning_rate": 9.966881474032711e-06,
+      "loss": 0.9754,
+      "step": 139
+    },
+    {
+      "epoch": 0.13266998341625208,
+      "grad_norm": 1.601896363818333,
+      "learning_rate": 9.965993434740634e-06,
+      "loss": 0.9812,
+      "step": 140
+    },
+    {
+      "epoch": 0.13361762615493958,
+      "grad_norm": 1.2173182907186852,
+      "learning_rate": 9.965093687129669e-06,
+      "loss": 0.987,
+      "step": 141
+    },
+    {
+      "epoch": 0.1345652688936271,
+      "grad_norm": 1.1914620826791744,
+      "learning_rate": 9.96418223332115e-06,
+      "loss": 0.9049,
+      "step": 142
+    },
+    {
+      "epoch": 0.13551291163231463,
+      "grad_norm": 1.1718797680017474,
+      "learning_rate": 9.963259075464011e-06,
+      "loss": 1.0314,
+      "step": 143
+    },
+    {
+      "epoch": 0.13646055437100213,
+      "grad_norm": 1.1921400894385041,
+      "learning_rate": 9.962324215734782e-06,
+      "loss": 0.9804,
+      "step": 144
+    },
+    {
+      "epoch": 0.13740819710968966,
+      "grad_norm": 1.483815329510506,
+      "learning_rate": 9.961377656337579e-06,
+      "loss": 0.9371,
+      "step": 145
+    },
+    {
+      "epoch": 0.13835583984837715,
+      "grad_norm": 1.174598142994827,
+      "learning_rate": 9.960419399504107e-06,
+      "loss": 0.9357,
+      "step": 146
+    },
+    {
+      "epoch": 0.13930348258706468,
+      "grad_norm": 1.768580101280523,
+      "learning_rate": 9.959449447493643e-06,
+      "loss": 0.9801,
+      "step": 147
+    },
+    {
+      "epoch": 0.14025112532575218,
+      "grad_norm": 1.8249892638670866,
+      "learning_rate": 9.958467802593046e-06,
+      "loss": 0.9553,
+      "step": 148
+    },
+    {
+      "epoch": 0.1411987680644397,
+      "grad_norm": 1.464158444501908,
+      "learning_rate": 9.957474467116739e-06,
+      "loss": 0.9816,
+      "step": 149
+    },
+    {
+      "epoch": 0.14214641080312723,
+      "grad_norm": 1.4006303093968397,
+      "learning_rate": 9.956469443406707e-06,
+      "loss": 0.959,
+      "step": 150
+    },
+    {
+      "epoch": 0.14309405354181473,
+      "grad_norm": 1.2677409714516314,
+      "learning_rate": 9.955452733832493e-06,
+      "loss": 0.9901,
+      "step": 151
+    },
+    {
+      "epoch": 0.14404169628050226,
+      "grad_norm": 1.616294750537421,
+      "learning_rate": 9.954424340791195e-06,
+      "loss": 0.9611,
+      "step": 152
+    },
+    {
+      "epoch": 0.14498933901918976,
+      "grad_norm": 1.2762321668275929,
+      "learning_rate": 9.953384266707453e-06,
+      "loss": 0.9971,
+      "step": 153
+    },
+    {
+      "epoch": 0.14593698175787728,
+      "grad_norm": 1.243174536133587,
+      "learning_rate": 9.952332514033449e-06,
+      "loss": 0.9545,
+      "step": 154
+    },
+    {
+      "epoch": 0.14593698175787728,
+      "eval_loss": 0.982397735118866,
+      "eval_runtime": 66.0802,
+      "eval_samples_per_second": 41.283,
+      "eval_steps_per_second": 0.651,
+      "step": 154
+    },
+    {
+      "epoch": 0.1468846244965648,
+      "grad_norm": 1.3382329641420807,
+      "learning_rate": 9.951269085248898e-06,
+      "loss": 0.9934,
+      "step": 155
+    },
+    {
+      "epoch": 0.1478322672352523,
+      "grad_norm": 1.2257802109789377,
+      "learning_rate": 9.950193982861048e-06,
+      "loss": 0.9528,
+      "step": 156
+    },
+    {
+      "epoch": 0.14877990997393983,
+      "grad_norm": 1.232838273393549,
+      "learning_rate": 9.949107209404664e-06,
+      "loss": 0.9719,
+      "step": 157
+    },
+    {
+      "epoch": 0.14972755271262733,
+      "grad_norm": 1.6289871091304047,
+      "learning_rate": 9.948008767442034e-06,
+      "loss": 0.9634,
+      "step": 158
+    },
+    {
+      "epoch": 0.15067519545131486,
+      "grad_norm": 1.486191374802309,
+      "learning_rate": 9.94689865956295e-06,
+      "loss": 0.9457,
+      "step": 159
+    },
+    {
+      "epoch": 0.15162283819000236,
+      "grad_norm": 1.2145460306596223,
+      "learning_rate": 9.94577688838472e-06,
+      "loss": 0.9841,
+      "step": 160
+    },
+    {
+      "epoch": 0.15257048092868988,
+      "grad_norm": 1.2094755905610057,
+      "learning_rate": 9.944643456552133e-06,
+      "loss": 0.9577,
+      "step": 161
+    },
+    {
+      "epoch": 0.1535181236673774,
+      "grad_norm": 1.2312169745263408,
+      "learning_rate": 9.943498366737487e-06,
+      "loss": 0.935,
+      "step": 162
+    },
+    {
+      "epoch": 0.1544657664060649,
+      "grad_norm": 1.996437404759428,
+      "learning_rate": 9.942341621640558e-06,
+      "loss": 0.9949,
+      "step": 163
+    },
+    {
+      "epoch": 0.15541340914475243,
+      "grad_norm": 1.434919063522936,
+      "learning_rate": 9.941173223988603e-06,
+      "loss": 0.961,
+      "step": 164
+    },
+    {
+      "epoch": 0.15636105188343993,
+      "grad_norm": 1.5694305163048035,
+      "learning_rate": 9.93999317653635e-06,
+      "loss": 1.0382,
+      "step": 165
+    },
+    {
+      "epoch": 0.15730869462212746,
+      "grad_norm": 1.4810485545937977,
+      "learning_rate": 9.938801482065998e-06,
+      "loss": 0.9782,
+      "step": 166
+    },
+    {
+      "epoch": 0.15825633736081499,
+      "grad_norm": 1.2852835752717688,
+      "learning_rate": 9.937598143387207e-06,
+      "loss": 0.9012,
+      "step": 167
+    },
+    {
+      "epoch": 0.15920398009950248,
+      "grad_norm": 1.3425076199539143,
+      "learning_rate": 9.93638316333708e-06,
+      "loss": 0.92,
+      "step": 168
+    },
+    {
+      "epoch": 0.16015162283819,
+      "grad_norm": 1.1023252456779573,
+      "learning_rate": 9.935156544780183e-06,
+      "loss": 0.9383,
+      "step": 169
+    },
+    {
+      "epoch": 0.1610992655768775,
+      "grad_norm": 1.4060044099112272,
+      "learning_rate": 9.93391829060851e-06,
+      "loss": 0.9764,
+      "step": 170
+    },
+    {
+      "epoch": 0.16204690831556504,
+      "grad_norm": 1.2799421690962227,
+      "learning_rate": 9.932668403741488e-06,
+      "loss": 0.8693,
+      "step": 171
+    },
+    {
+      "epoch": 0.16299455105425253,
+      "grad_norm": 1.139116134527199,
+      "learning_rate": 9.93140688712598e-06,
+      "loss": 0.9494,
+      "step": 172
+    },
+    {
+      "epoch": 0.16394219379294006,
+      "grad_norm": 1.217839709509947,
+      "learning_rate": 9.930133743736261e-06,
+      "loss": 0.8957,
+      "step": 173
+    },
+    {
+      "epoch": 0.1648898365316276,
+      "grad_norm": 1.301115684104673,
+      "learning_rate": 9.92884897657402e-06,
+      "loss": 0.9477,
+      "step": 174
+    },
+    {
+      "epoch": 0.16583747927031509,
+      "grad_norm": 1.2295422076552296,
+      "learning_rate": 9.92755258866835e-06,
+      "loss": 0.9441,
+      "step": 175
+    },
+    {
+      "epoch": 0.1667851220090026,
+      "grad_norm": 1.2198046764803545,
+      "learning_rate": 9.926244583075748e-06,
+      "loss": 0.9556,
+      "step": 176
+    },
+    {
+      "epoch": 0.1667851220090026,
+      "eval_loss": 0.9768843054771423,
+      "eval_runtime": 65.0055,
+      "eval_samples_per_second": 41.966,
+      "eval_steps_per_second": 0.661,
+      "step": 176
+    },
+    {
+      "epoch": 0.1677327647476901,
+      "grad_norm": 1.2348345090307584,
+      "learning_rate": 9.924924962880093e-06,
+      "loss": 0.9633,
+      "step": 177
+    },
+    {
+      "epoch": 0.16868040748637764,
+      "grad_norm": 1.2606209599886706,
+      "learning_rate": 9.923593731192655e-06,
+      "loss": 0.98,
+      "step": 178
+    },
+    {
+      "epoch": 0.16962805022506516,
+      "grad_norm": 2.6831467782092995,
+      "learning_rate": 9.922250891152078e-06,
+      "loss": 0.994,
+      "step": 179
+    },
+    {
+      "epoch": 0.17057569296375266,
+      "grad_norm": 1.2766929465017953,
+      "learning_rate": 9.920896445924372e-06,
+      "loss": 0.9753,
+      "step": 180
+    },
+    {
+      "epoch": 0.1715233357024402,
+      "grad_norm": 1.1262505199746897,
+      "learning_rate": 9.919530398702917e-06,
+      "loss": 0.9641,
+      "step": 181
+    },
+    {
+      "epoch": 0.1724709784411277,
+      "grad_norm": 1.442803425071792,
+      "learning_rate": 9.918152752708437e-06,
+      "loss": 0.9601,
+      "step": 182
+    },
+    {
+      "epoch": 0.1734186211798152,
+      "grad_norm": 1.2137409175608287,
+      "learning_rate": 9.916763511189009e-06,
+      "loss": 0.9747,
+      "step": 183
+    },
+    {
+      "epoch": 0.1743662639185027,
+      "grad_norm": 1.3271533812680127,
+      "learning_rate": 9.915362677420045e-06,
+      "loss": 0.9384,
+      "step": 184
+    },
+    {
+      "epoch": 0.17531390665719024,
+      "grad_norm": 1.2077500566101147,
+      "learning_rate": 9.913950254704291e-06,
+      "loss": 0.9372,
+      "step": 185
+    },
+    {
+      "epoch": 0.17626154939587776,
+      "grad_norm": 1.1297753271104558,
+      "learning_rate": 9.912526246371815e-06,
+      "loss": 0.8775,
+      "step": 186
+    },
+    {
+      "epoch": 0.17720919213456526,
+      "grad_norm": 1.2198507607935039,
+      "learning_rate": 9.911090655779997e-06,
+      "loss": 1.0036,
+      "step": 187
+    },
+    {
+      "epoch": 0.1781568348732528,
+      "grad_norm": 1.305484389615825,
+      "learning_rate": 9.909643486313533e-06,
+      "loss": 0.9687,
+      "step": 188
+    },
+    {
+      "epoch": 0.1791044776119403,
+      "grad_norm": 1.527203085727602,
+      "learning_rate": 9.908184741384412e-06,
+      "loss": 0.9225,
+      "step": 189
+    },
+    {
+      "epoch": 0.18005212035062781,
+      "grad_norm": 1.1568401663216765,
+      "learning_rate": 9.906714424431914e-06,
+      "loss": 0.9112,
+      "step": 190
+    },
+    {
+      "epoch": 0.18099976308931534,
+      "grad_norm": 1.2426671937737235,
+      "learning_rate": 9.905232538922604e-06,
+      "loss": 0.9509,
+      "step": 191
+    },
+    {
+      "epoch": 0.18194740582800284,
+      "grad_norm": 1.535223723588726,
+      "learning_rate": 9.903739088350325e-06,
+      "loss": 0.8984,
+      "step": 192
+    },
+    {
+      "epoch": 0.18289504856669037,
+      "grad_norm": 1.5431131775034228,
+      "learning_rate": 9.902234076236182e-06,
+      "loss": 0.9602,
+      "step": 193
+    },
+    {
+      "epoch": 0.18384269130537786,
+      "grad_norm": 1.182953828246788,
+      "learning_rate": 9.90071750612854e-06,
+      "loss": 0.887,
+      "step": 194
+    },
+    {
+      "epoch": 0.1847903340440654,
+      "grad_norm": 1.4338081609253326,
+      "learning_rate": 9.899189381603018e-06,
+      "loss": 0.9818,
+      "step": 195
+    },
+    {
+      "epoch": 0.1857379767827529,
+      "grad_norm": 1.4971956239027924,
+      "learning_rate": 9.897649706262474e-06,
+      "loss": 0.9455,
+      "step": 196
+    },
+    {
+      "epoch": 0.18668561952144042,
+      "grad_norm": 1.2771616220713862,
+      "learning_rate": 9.896098483736995e-06,
+      "loss": 0.9563,
+      "step": 197
+    },
+    {
+      "epoch": 0.18763326226012794,
+      "grad_norm": 1.1927091790410977,
+      "learning_rate": 9.894535717683902e-06,
+      "loss": 0.9376,
+      "step": 198
+    },
+    {
+      "epoch": 0.18763326226012794,
+      "eval_loss": 0.9750568270683289,
+      "eval_runtime": 68.0671,
+      "eval_samples_per_second": 40.078,
+      "eval_steps_per_second": 0.632,
+      "step": 198
+    },
+    {
+      "epoch": 0.18858090499881544,
+      "grad_norm": 1.447983084882731,
+      "learning_rate": 9.892961411787725e-06,
+      "loss": 0.941,
+      "step": 199
+    },
+    {
+      "epoch": 0.18952854773750297,
+      "grad_norm": 1.179964004851603,
+      "learning_rate": 9.891375569760205e-06,
+      "loss": 1.0044,
+      "step": 200
+    },
+    {
+      "epoch": 0.19047619047619047,
+      "grad_norm": 1.1672903614747536,
+      "learning_rate": 9.88977819534028e-06,
+      "loss": 0.9087,
+      "step": 201
+    },
+    {
+      "epoch": 0.191423833214878,
+      "grad_norm": 1.2604577059340927,
+      "learning_rate": 9.888169292294077e-06,
+      "loss": 0.97,
+      "step": 202
+    },
+    {
+      "epoch": 0.19237147595356552,
+      "grad_norm": 1.4285797440582975,
+      "learning_rate": 9.886548864414906e-06,
+      "loss": 0.9296,
+      "step": 203
+    },
+    {
+      "epoch": 0.19331911869225302,
+      "grad_norm": 1.4094308770717812,
+      "learning_rate": 9.88491691552325e-06,
+      "loss": 1.0148,
+      "step": 204
+    },
+    {
+      "epoch": 0.19426676143094054,
+      "grad_norm": 1.5152496759647966,
+      "learning_rate": 9.883273449466755e-06,
+      "loss": 0.9839,
+      "step": 205
+    },
+    {
+      "epoch": 0.19521440416962804,
+      "grad_norm": 1.4100497762615254,
+      "learning_rate": 9.881618470120216e-06,
+      "loss": 0.9112,
+      "step": 206
+    },
+    {
+      "epoch": 0.19616204690831557,
+      "grad_norm": 1.2060557963303735,
+      "learning_rate": 9.879951981385577e-06,
+      "loss": 1.0107,
+      "step": 207
+    },
+    {
+      "epoch": 0.19710968964700307,
+      "grad_norm": 1.1817847604275118,
+      "learning_rate": 9.87827398719192e-06,
+      "loss": 0.9401,
+      "step": 208
+    },
+    {
+      "epoch": 0.1980573323856906,
+      "grad_norm": 4.640069295683942,
+      "learning_rate": 9.876584491495448e-06,
+      "loss": 0.9453,
+      "step": 209
+    },
+    {
+      "epoch": 0.19900497512437812,
+      "grad_norm": 1.3678287853797575,
+      "learning_rate": 9.874883498279485e-06,
+      "loss": 0.9139,
+      "step": 210
+    },
+    {
+      "epoch": 0.19995261786306562,
+      "grad_norm": 1.2020105753823802,
+      "learning_rate": 9.87317101155446e-06,
+      "loss": 0.8995,
+      "step": 211
+    },
+    {
+      "epoch": 0.20090026060175314,
+      "grad_norm": 1.560649904766898,
+      "learning_rate": 9.871447035357903e-06,
+      "loss": 0.9953,
+      "step": 212
+    },
+    {
+      "epoch": 0.20184790334044064,
+      "grad_norm": 1.5587492681660762,
+      "learning_rate": 9.869711573754433e-06,
+      "loss": 0.9954,
+      "step": 213
+    },
+    {
+      "epoch": 0.20279554607912817,
+      "grad_norm": 1.1589889744586952,
+      "learning_rate": 9.867964630835742e-06,
+      "loss": 0.9664,
+      "step": 214
+    },
+    {
+      "epoch": 0.2037431888178157,
+      "grad_norm": 1.4941711737316694,
+      "learning_rate": 9.8662062107206e-06,
+      "loss": 0.9087,
+      "step": 215
+    },
+    {
+      "epoch": 0.2046908315565032,
+      "grad_norm": 1.1922425845332252,
+      "learning_rate": 9.86443631755483e-06,
+      "loss": 1.0093,
+      "step": 216
+    },
+    {
+      "epoch": 0.20563847429519072,
+      "grad_norm": 1.236697642847563,
+      "learning_rate": 9.862654955511309e-06,
+      "loss": 0.9649,
+      "step": 217
+    },
+    {
+      "epoch": 0.20658611703387822,
+      "grad_norm": 1.2350057563906354,
+      "learning_rate": 9.860862128789954e-06,
+      "loss": 0.9714,
+      "step": 218
+    },
+    {
+      "epoch": 0.20753375977256575,
+      "grad_norm": 1.4642161662286084,
+      "learning_rate": 9.859057841617709e-06,
+      "loss": 0.951,
+      "step": 219
+    },
+    {
+      "epoch": 0.20848140251125324,
+      "grad_norm": 1.1189678628969209,
+      "learning_rate": 9.857242098248543e-06,
+      "loss": 0.9097,
+      "step": 220
+    },
+    {
+      "epoch": 0.20848140251125324,
+      "eval_loss": 0.9686124324798584,
+      "eval_runtime": 68.177,
+      "eval_samples_per_second": 40.013,
+      "eval_steps_per_second": 0.631,
+      "step": 220
+    },
+    {
+      "epoch": 0.20942904524994077,
+      "grad_norm": 1.1409361807030405,
+      "learning_rate": 9.85541490296343e-06,
+      "loss": 0.913,
+      "step": 221
+    },
+    {
+      "epoch": 0.2103766879886283,
+      "grad_norm": 1.4175269201432499,
+      "learning_rate": 9.853576260070348e-06,
+      "loss": 0.956,
+      "step": 222
+    },
+    {
+      "epoch": 0.2113243307273158,
+      "grad_norm": 1.202975487777318,
+      "learning_rate": 9.851726173904264e-06,
+      "loss": 0.9681,
+      "step": 223
+    },
+    {
+      "epoch": 0.21227197346600332,
+      "grad_norm": 1.2528114366347458,
+      "learning_rate": 9.849864648827126e-06,
+      "loss": 0.9339,
+      "step": 224
+    },
+    {
+      "epoch": 0.21321961620469082,
+      "grad_norm": 1.5633193545585717,
+      "learning_rate": 9.847991689227848e-06,
+      "loss": 0.9481,
+      "step": 225
+    },
+    {
+      "epoch": 0.21416725894337835,
+      "grad_norm": 1.3036681318560188,
+      "learning_rate": 9.846107299522305e-06,
+      "loss": 0.9669,
+      "step": 226
+    },
+    {
+      "epoch": 0.21511490168206587,
+      "grad_norm": 1.276332389348374,
+      "learning_rate": 9.844211484153326e-06,
+      "loss": 1.0051,
+      "step": 227
+    },
+    {
+      "epoch": 0.21606254442075337,
+      "grad_norm": 1.3574477388118054,
+      "learning_rate": 9.842304247590668e-06,
+      "loss": 0.9185,
+      "step": 228
+    },
+    {
+      "epoch": 0.2170101871594409,
+      "grad_norm": 1.2290424692902366,
+      "learning_rate": 9.840385594331022e-06,
+      "loss": 0.9402,
+      "step": 229
+    },
+    {
+      "epoch": 0.2179578298981284,
+      "grad_norm": 1.3663071377926381,
+      "learning_rate": 9.838455528897998e-06,
+      "loss": 0.9303,
+      "step": 230
+    },
+    {
+      "epoch": 0.21890547263681592,
+      "grad_norm": 1.1297310850238833,
+      "learning_rate": 9.836514055842109e-06,
+      "loss": 0.8715,
+      "step": 231
+    },
+    {
+      "epoch": 0.21985311537550342,
+      "grad_norm": 1.1981756396394987,
+      "learning_rate": 9.834561179740763e-06,
+      "loss": 0.9603,
+      "step": 232
+    },
+    {
+      "epoch": 0.22080075811419095,
+      "grad_norm": 1.0960664647793084,
+      "learning_rate": 9.832596905198255e-06,
+      "loss": 0.9352,
+      "step": 233
+    },
+    {
+      "epoch": 0.22174840085287847,
+      "grad_norm": 1.2698198526002429,
+      "learning_rate": 9.830621236845755e-06,
+      "loss": 0.9044,
+      "step": 234
+    },
+    {
+      "epoch": 0.22269604359156597,
+      "grad_norm": 1.4209652174245544,
+      "learning_rate": 9.828634179341292e-06,
+      "loss": 0.9839,
+      "step": 235
+    },
+    {
+      "epoch": 0.2236436863302535,
+      "grad_norm": 1.5896834703549265,
+      "learning_rate": 9.826635737369752e-06,
+      "loss": 0.9479,
+      "step": 236
+    },
+    {
+      "epoch": 0.224591329068941,
+      "grad_norm": 1.118663687959167,
+      "learning_rate": 9.82462591564286e-06,
+      "loss": 0.9568,
+      "step": 237
+    },
+    {
+      "epoch": 0.22553897180762852,
+      "grad_norm": 1.081723075754863,
+      "learning_rate": 9.82260471889917e-06,
+      "loss": 1.0009,
+      "step": 238
+    },
+    {
+      "epoch": 0.22648661454631605,
+      "grad_norm": 1.3816847638469698,
+      "learning_rate": 9.82057215190406e-06,
+      "loss": 0.9565,
+      "step": 239
+    },
+    {
+      "epoch": 0.22743425728500355,
+      "grad_norm": 1.3650320361676973,
+      "learning_rate": 9.818528219449705e-06,
+      "loss": 0.9435,
+      "step": 240
+    },
+    {
+      "epoch": 0.22838190002369108,
+      "grad_norm": 1.1163028465916651,
+      "learning_rate": 9.816472926355087e-06,
+      "loss": 0.9926,
+      "step": 241
+    },
+    {
+      "epoch": 0.22932954276237857,
+      "grad_norm": 1.1783321971909724,
+      "learning_rate": 9.814406277465969e-06,
+      "loss": 0.9908,
+      "step": 242
+    },
+    {
+      "epoch": 0.22932954276237857,
+      "eval_loss": 0.9650764465332031,
+      "eval_runtime": 63.7155,
+      "eval_samples_per_second": 42.815,
+      "eval_steps_per_second": 0.675,
+      "step": 242
+    },
+    {
+      "epoch": 0.2302771855010661,
+      "grad_norm": 1.078825580859753,
+      "learning_rate": 9.812328277654889e-06,
+      "loss": 0.9395,
+      "step": 243
+    },
+    {
+      "epoch": 0.2312248282397536,
+      "grad_norm": 1.1093483786757967,
+      "learning_rate": 9.810238931821139e-06,
+      "loss": 0.9178,
+      "step": 244
+    },
+    {
+      "epoch": 0.23217247097844113,
+      "grad_norm": 1.3499071449657545,
+      "learning_rate": 9.808138244890775e-06,
+      "loss": 0.952,
+      "step": 245
+    },
+    {
+      "epoch": 0.23312011371712865,
+      "grad_norm": 1.1761313846911488,
+      "learning_rate": 9.806026221816582e-06,
+      "loss": 0.9497,
+      "step": 246
+    },
+    {
+      "epoch": 0.23406775645581615,
+      "grad_norm": 1.2110375794344939,
+      "learning_rate": 9.803902867578075e-06,
+      "loss": 0.944,
+      "step": 247
+    },
+    {
+      "epoch": 0.23501539919450368,
+      "grad_norm": 1.2034987469557872,
+      "learning_rate": 9.801768187181487e-06,
+      "loss": 0.986,
+      "step": 248
+    },
+    {
+      "epoch": 0.23596304193319118,
+      "grad_norm": 1.3058009296406379,
+      "learning_rate": 9.799622185659748e-06,
+      "loss": 0.967,
+      "step": 249
+    },
+    {
+      "epoch": 0.2369106846718787,
+      "grad_norm": 1.1123429020549715,
+      "learning_rate": 9.797464868072489e-06,
+      "loss": 0.9217,
+      "step": 250
+    },
+    {
+      "epoch": 0.23785832741056623,
+      "grad_norm": 1.089125109757041,
+      "learning_rate": 9.795296239506011e-06,
+      "loss": 0.8866,
+      "step": 251
+    },
+    {
+      "epoch": 0.23880597014925373,
+      "grad_norm": 1.2123667069466009,
+      "learning_rate": 9.793116305073292e-06,
+      "loss": 0.9307,
+      "step": 252
+    },
+    {
+      "epoch": 0.23975361288794125,
+      "grad_norm": 1.4622869606703903,
+      "learning_rate": 9.790925069913962e-06,
+      "loss": 0.9538,
+      "step": 253
+    },
+    {
+      "epoch": 0.24070125562662875,
+      "grad_norm": 1.5523797111635822,
+      "learning_rate": 9.788722539194291e-06,
+      "loss": 0.969,
+      "step": 254
+    },
+    {
+      "epoch": 0.24164889836531628,
+      "grad_norm": 1.1827311652398949,
+      "learning_rate": 9.786508718107184e-06,
+      "loss": 0.9849,
+      "step": 255
+    },
+    {
+      "epoch": 0.2425965411040038,
+      "grad_norm": 1.2881186217827927,
+      "learning_rate": 9.78428361187217e-06,
+      "loss": 0.9295,
+      "step": 256
+    },
+    {
+      "epoch": 0.2435441838426913,
+      "grad_norm": 1.474451652001404,
+      "learning_rate": 9.782047225735376e-06,
+      "loss": 0.9576,
+      "step": 257
+    },
+    {
+      "epoch": 0.24449182658137883,
+      "grad_norm": 1.2287731326656932,
+      "learning_rate": 9.77979956496953e-06,
+      "loss": 0.9485,
+      "step": 258
+    },
+    {
+      "epoch": 0.24543946932006633,
+      "grad_norm": 1.3059618909257746,
+      "learning_rate": 9.777540634873939e-06,
+      "loss": 0.9961,
+      "step": 259
+    },
+    {
+      "epoch": 0.24638711205875385,
+      "grad_norm": 1.25801433279188,
+      "learning_rate": 9.775270440774481e-06,
+      "loss": 0.9374,
+      "step": 260
+    },
+    {
+      "epoch": 0.24733475479744135,
+      "grad_norm": 1.4594944714968974,
+      "learning_rate": 9.772988988023589e-06,
+      "loss": 0.9714,
+      "step": 261
+    },
+    {
+      "epoch": 0.24828239753612888,
+      "grad_norm": 1.1788267508576873,
+      "learning_rate": 9.770696282000245e-06,
+      "loss": 0.9251,
+      "step": 262
+    },
+    {
+      "epoch": 0.2492300402748164,
+      "grad_norm": 1.2489815438864824,
+      "learning_rate": 9.76839232810996e-06,
+      "loss": 0.9126,
+      "step": 263
+    },
+    {
+      "epoch": 0.2501776830135039,
+      "grad_norm": 1.3083502635920439,
+      "learning_rate": 9.766077131784764e-06,
+      "loss": 0.94,
+      "step": 264
+    },
+    {
+      "epoch": 0.2501776830135039,
+      "eval_loss": 0.9628852605819702,
+      "eval_runtime": 65.8683,
+      "eval_samples_per_second": 41.416,
+      "eval_steps_per_second": 0.653,
+      "step": 264
+    },
+    {
+      "epoch": 0.25112532575219143,
+      "grad_norm": 1.2876315572259667,
+      "learning_rate": 9.763750698483192e-06,
+      "loss": 0.9824,
+      "step": 265
+    },
+    {
+      "epoch": 0.25207296849087896,
+      "grad_norm": 1.4509050672400128,
+      "learning_rate": 9.761413033690276e-06,
+      "loss": 1.01,
+      "step": 266
+    },
+    {
+      "epoch": 0.25302061122956643,
+      "grad_norm": 1.2615386437049756,
+      "learning_rate": 9.759064142917526e-06,
+      "loss": 0.9336,
+      "step": 267
+    },
+    {
+      "epoch": 0.25396825396825395,
+      "grad_norm": 1.1835961624076299,
+      "learning_rate": 9.756704031702919e-06,
+      "loss": 0.9462,
+      "step": 268
+    },
+    {
+      "epoch": 0.2549158967069415,
+      "grad_norm": 1.2900537501658034,
+      "learning_rate": 9.75433270561089e-06,
+      "loss": 0.9071,
+      "step": 269
+    },
+    {
+      "epoch": 0.255863539445629,
+      "grad_norm": 1.138429016575903,
+      "learning_rate": 9.75195017023231e-06,
+      "loss": 0.8544,
+      "step": 270
+    },
+    {
+      "epoch": 0.25681118218431653,
+      "grad_norm": 1.1853801438439096,
+      "learning_rate": 9.74955643118448e-06,
+      "loss": 0.92,
+      "step": 271
+    },
+    {
+      "epoch": 0.257758824923004,
+      "grad_norm": 1.2610744856343499,
+      "learning_rate": 9.74715149411112e-06,
+      "loss": 0.9012,
+      "step": 272
+    },
+    {
+      "epoch": 0.25870646766169153,
+      "grad_norm": 1.4706709692456896,
+      "learning_rate": 9.744735364682347e-06,
+      "loss": 0.9476,
+      "step": 273
+    },
+    {
+      "epoch": 0.25965411040037906,
+      "grad_norm": 1.4148479481637295,
+      "learning_rate": 9.742308048594665e-06,
+      "loss": 0.9121,
+      "step": 274
+    },
+    {
+      "epoch": 0.2606017531390666,
+      "grad_norm": 1.236422033348515,
+      "learning_rate": 9.73986955157096e-06,
+      "loss": 0.9135,
+      "step": 275
+    },
+    {
+      "epoch": 0.2615493958777541,
+      "grad_norm": 1.1477317083396126,
+      "learning_rate": 9.737419879360471e-06,
+      "loss": 0.9516,
+      "step": 276
+    },
+    {
+      "epoch": 0.2624970386164416,
+      "grad_norm": 2.5546186723319373,
+      "learning_rate": 9.734959037738788e-06,
+      "loss": 0.9422,
+      "step": 277
+    },
+    {
+      "epoch": 0.2634446813551291,
+      "grad_norm": 1.3564480695771186,
+      "learning_rate": 9.732487032507837e-06,
+      "loss": 0.8961,
+      "step": 278
+    },
+    {
+      "epoch": 0.26439232409381663,
+      "grad_norm": 1.4878738583178996,
+      "learning_rate": 9.730003869495863e-06,
+      "loss": 0.9457,
+      "step": 279
+    },
+    {
+      "epoch": 0.26533996683250416,
+      "grad_norm": 1.1351790275971436,
+      "learning_rate": 9.727509554557416e-06,
+      "loss": 0.8766,
+      "step": 280
+    },
+    {
+      "epoch": 0.2662876095711917,
+      "grad_norm": 1.3900072874584015,
+      "learning_rate": 9.725004093573343e-06,
+      "loss": 0.8972,
+      "step": 281
+    },
+    {
+      "epoch": 0.26723525230987916,
+      "grad_norm": 1.1866023759013848,
+      "learning_rate": 9.722487492450764e-06,
+      "loss": 0.9335,
+      "step": 282
+    },
+    {
+      "epoch": 0.2681828950485667,
+      "grad_norm": 1.2381217486697587,
+      "learning_rate": 9.719959757123073e-06,
+      "loss": 0.9083,
+      "step": 283
+    },
+    {
+      "epoch": 0.2691305377872542,
+      "grad_norm": 1.6107373228302189,
+      "learning_rate": 9.717420893549902e-06,
+      "loss": 0.9913,
+      "step": 284
+    },
+    {
+      "epoch": 0.27007818052594174,
+      "grad_norm": 1.3012559103471528,
+      "learning_rate": 9.714870907717134e-06,
+      "loss": 0.9384,
+      "step": 285
+    },
+    {
+      "epoch": 0.27102582326462926,
+      "grad_norm": 1.3512266977948462,
+      "learning_rate": 9.712309805636863e-06,
+      "loss": 0.9738,
+      "step": 286
+    },
+    {
+      "epoch": 0.27102582326462926,
+      "eval_loss": 0.9620270729064941,
+      "eval_runtime": 59.315,
+      "eval_samples_per_second": 45.992,
+      "eval_steps_per_second": 0.725,
+      "step": 286
+    },
+    {
+      "epoch": 0.27197346600331673,
+      "grad_norm": 1.1737111003693583,
+      "learning_rate": 9.709737593347404e-06,
+      "loss": 0.9669,
+      "step": 287
+    },
+    {
+      "epoch": 0.27292110874200426,
+      "grad_norm": 1.158891062157781,
+      "learning_rate": 9.707154276913255e-06,
+      "loss": 0.9724,
+      "step": 288
+    },
+    {
+      "epoch": 0.2738687514806918,
+      "grad_norm": 1.1818539669598636,
+      "learning_rate": 9.704559862425101e-06,
+      "loss": 0.9411,
+      "step": 289
+    },
+    {
+      "epoch": 0.2748163942193793,
+      "grad_norm": 1.317223158403057,
+      "learning_rate": 9.701954355999791e-06,
+      "loss": 0.8897,
+      "step": 290
+    },
+    {
+      "epoch": 0.2757640369580668,
+      "grad_norm": 1.2827511719089313,
+      "learning_rate": 9.699337763780325e-06,
+      "loss": 0.9062,
+      "step": 291
+    },
+    {
+      "epoch": 0.2767116796967543,
+      "grad_norm": 1.28805108052852,
+      "learning_rate": 9.696710091935842e-06,
+      "loss": 0.9176,
+      "step": 292
+    },
+    {
+      "epoch": 0.27765932243544184,
+      "grad_norm": 1.3367234242878245,
+      "learning_rate": 9.6940713466616e-06,
+      "loss": 0.9009,
+      "step": 293
+    },
+    {
+      "epoch": 0.27860696517412936,
+      "grad_norm": 1.2541386047985268,
+      "learning_rate": 9.691421534178966e-06,
+      "loss": 0.9109,
+      "step": 294
+    },
+    {
+      "epoch": 0.2795546079128169,
+      "grad_norm": 1.5026012491650225,
+      "learning_rate": 9.688760660735403e-06,
+      "loss": 0.9709,
+      "step": 295
+    },
+    {
+      "epoch": 0.28050225065150436,
+      "grad_norm": 1.2922689184697398,
+      "learning_rate": 9.68608873260445e-06,
+      "loss": 0.8457,
+      "step": 296
+    },
+    {
+      "epoch": 0.2814498933901919,
+      "grad_norm": 1.1843338944530994,
+      "learning_rate": 9.683405756085708e-06,
+      "loss": 0.9313,
+      "step": 297
+    },
+    {
+      "epoch": 0.2823975361288794,
+      "grad_norm": 1.315466417029974,
+      "learning_rate": 9.680711737504832e-06,
+      "loss": 1.019,
+      "step": 298
+    },
+    {
+      "epoch": 0.28334517886756694,
+      "grad_norm": 1.0199556490757884,
+      "learning_rate": 9.678006683213503e-06,
+      "loss": 0.8922,
+      "step": 299
+    },
+    {
+      "epoch": 0.28429282160625446,
+      "grad_norm": 1.1400934246384171,
+      "learning_rate": 9.675290599589429e-06,
+      "loss": 0.908,
+      "step": 300
+    },
+    {
+      "epoch": 0.28524046434494194,
+      "grad_norm": 1.8423074242848725,
+      "learning_rate": 9.672563493036318e-06,
+      "loss": 1.0065,
+      "step": 301
+    },
+    {
+      "epoch": 0.28618810708362946,
+      "grad_norm": 1.1796939423622033,
+      "learning_rate": 9.669825369983865e-06,
+      "loss": 0.9303,
+      "step": 302
+    },
+    {
+      "epoch": 0.287135749822317,
+      "grad_norm": 1.2479579843600068,
+      "learning_rate": 9.667076236887743e-06,
+      "loss": 1.0198,
+      "step": 303
+    },
+    {
+      "epoch": 0.2880833925610045,
+      "grad_norm": 1.229386161002158,
+      "learning_rate": 9.664316100229578e-06,
+      "loss": 0.9328,
+      "step": 304
+    },
+    {
+      "epoch": 0.28903103529969204,
+      "grad_norm": 1.354608076441114,
+      "learning_rate": 9.661544966516945e-06,
+      "loss": 0.8865,
+      "step": 305
+    },
+    {
+      "epoch": 0.2899786780383795,
+      "grad_norm": 1.2733991556068809,
+      "learning_rate": 9.658762842283343e-06,
+      "loss": 0.9805,
+      "step": 306
+    },
+    {
+      "epoch": 0.29092632077706704,
+      "grad_norm": 1.2495713583949597,
+      "learning_rate": 9.655969734088184e-06,
+      "loss": 0.9302,
+      "step": 307
+    },
+    {
+      "epoch": 0.29187396351575456,
+      "grad_norm": 1.2103907414095358,
+      "learning_rate": 9.653165648516777e-06,
+      "loss": 0.885,
+      "step": 308
+    },
+    {
+      "epoch": 0.29187396351575456,
+      "eval_loss": 0.9591483473777771,
+      "eval_runtime": 68.3896,
+      "eval_samples_per_second": 39.889,
+      "eval_steps_per_second": 0.629,
+      "step": 308
+    },
+    {
+      "epoch": 0.2928216062544421,
+      "grad_norm": 1.1956016894279018,
+      "learning_rate": 9.650350592180312e-06,
+      "loss": 0.9577,
+      "step": 309
+    },
+    {
+      "epoch": 0.2937692489931296,
+      "grad_norm": 1.140247620602589,
+      "learning_rate": 9.647524571715843e-06,
+      "loss": 0.9264,
+      "step": 310
+    },
+    {
+      "epoch": 0.2947168917318171,
+      "grad_norm": 1.2006266683263125,
+      "learning_rate": 9.644687593786282e-06,
+      "loss": 0.9792,
+      "step": 311
+    },
+    {
+      "epoch": 0.2956645344705046,
+      "grad_norm": 1.2812673838645852,
+      "learning_rate": 9.641839665080363e-06,
+      "loss": 0.954,
+      "step": 312
+    },
+    {
+      "epoch": 0.29661217720919214,
+      "grad_norm": 1.010846565968867,
+      "learning_rate": 9.638980792312651e-06,
+      "loss": 0.9515,
+      "step": 313
+    },
+    {
+      "epoch": 0.29755981994787967,
+      "grad_norm": 1.508846485133625,
+      "learning_rate": 9.636110982223505e-06,
+      "loss": 0.9611,
+      "step": 314
+    },
+    {
+      "epoch": 0.29850746268656714,
+      "grad_norm": 1.2091515162070219,
+      "learning_rate": 9.633230241579075e-06,
+      "loss": 0.8803,
+      "step": 315
+    },
+    {
+      "epoch": 0.29945510542525466,
+      "grad_norm": 1.251566988747115,
+      "learning_rate": 9.630338577171282e-06,
+      "loss": 0.9102,
+      "step": 316
+    },
+    {
+      "epoch": 0.3004027481639422,
+      "grad_norm": 1.4368558329637313,
+      "learning_rate": 9.627435995817799e-06,
+      "loss": 0.9681,
+      "step": 317
+    },
+    {
+      "epoch": 0.3013503909026297,
+      "grad_norm": 1.2724580288581318,
+      "learning_rate": 9.624522504362039e-06,
+      "loss": 0.9714,
+      "step": 318
+    },
+    {
+      "epoch": 0.30229803364131724,
+      "grad_norm": 1.2457801062593066,
+      "learning_rate": 9.621598109673142e-06,
+      "loss": 0.9663,
+      "step": 319
+    },
+    {
+      "epoch": 0.3032456763800047,
+      "grad_norm": 1.5450412575397683,
+      "learning_rate": 9.618662818645949e-06,
+      "loss": 0.973,
+      "step": 320
+    },
+    {
+      "epoch": 0.30419331911869224,
+      "grad_norm": 1.3301347899029445,
+      "learning_rate": 9.615716638200993e-06,
+      "loss": 0.9292,
+      "step": 321
+    },
+    {
+      "epoch": 0.30514096185737977,
+      "grad_norm": 1.5045379413960773,
+      "learning_rate": 9.612759575284483e-06,
+      "loss": 0.9943,
+      "step": 322
+    },
+    {
+      "epoch": 0.3060886045960673,
+      "grad_norm": 1.2146706034284283,
+      "learning_rate": 9.60979163686828e-06,
+      "loss": 0.8828,
+      "step": 323
+    },
+    {
+      "epoch": 0.3070362473347548,
+      "grad_norm": 1.1864956541845377,
+      "learning_rate": 9.606812829949896e-06,
+      "loss": 0.92,
+      "step": 324
+    },
+    {
+      "epoch": 0.3079838900734423,
+      "grad_norm": 1.41143117586689,
+      "learning_rate": 9.603823161552459e-06,
+      "loss": 0.9539,
+      "step": 325
+    },
+    {
+      "epoch": 0.3089315328121298,
+      "grad_norm": 2.5914491078059796,
+      "learning_rate": 9.600822638724704e-06,
+      "loss": 0.9211,
+      "step": 326
+    },
+    {
+      "epoch": 0.30987917555081734,
+      "grad_norm": 1.104156076330228,
+      "learning_rate": 9.597811268540969e-06,
+      "loss": 0.9148,
+      "step": 327
+    },
+    {
+      "epoch": 0.31082681828950487,
+      "grad_norm": 1.1472423105684746,
+      "learning_rate": 9.594789058101154e-06,
+      "loss": 0.9518,
+      "step": 328
+    },
+    {
+      "epoch": 0.3117744610281924,
+      "grad_norm": 1.1393816701130914,
+      "learning_rate": 9.591756014530723e-06,
+      "loss": 1.0076,
+      "step": 329
+    },
+    {
+      "epoch": 0.31272210376687987,
+      "grad_norm": 1.2776861681261165,
+      "learning_rate": 9.588712144980681e-06,
+      "loss": 0.8784,
+      "step": 330
+    },
+    {
+      "epoch": 0.31272210376687987,
+      "eval_loss": 0.9570937156677246,
+      "eval_runtime": 68.898,
+      "eval_samples_per_second": 39.595,
+      "eval_steps_per_second": 0.624,
+      "step": 330
+    },
+    {
+      "epoch": 0.3136697465055674,
+      "grad_norm": 1.192795131650072,
+      "learning_rate": 9.585657456627557e-06,
+      "loss": 0.9045,
+      "step": 331
+    },
+    {
+      "epoch": 0.3146173892442549,
+      "grad_norm": 1.2042562619274322,
+      "learning_rate": 9.582591956673387e-06,
+      "loss": 0.9683,
+      "step": 332
+    },
+    {
+      "epoch": 0.31556503198294245,
+      "grad_norm": 1.1444088880890944,
+      "learning_rate": 9.579515652345699e-06,
+      "loss": 0.8678,
+      "step": 333
+    },
+    {
+      "epoch": 0.31651267472162997,
+      "grad_norm": 1.0769104211549974,
+      "learning_rate": 9.57642855089749e-06,
+      "loss": 0.9175,
+      "step": 334
+    },
+    {
+      "epoch": 0.31746031746031744,
+      "grad_norm": 1.2380307581631063,
+      "learning_rate": 9.57333065960722e-06,
+      "loss": 0.9351,
+      "step": 335
+    },
+    {
+      "epoch": 0.31840796019900497,
+      "grad_norm": 1.071043290520968,
+      "learning_rate": 9.570221985778785e-06,
+      "loss": 0.8855,
+      "step": 336
+    },
+    {
+      "epoch": 0.3193556029376925,
+      "grad_norm": 1.1849521886922723,
+      "learning_rate": 9.567102536741501e-06,
+      "loss": 0.917,
+      "step": 337
+    },
+    {
+      "epoch": 0.32030324567638,
+      "grad_norm": 1.20214216361167,
+      "learning_rate": 9.563972319850092e-06,
+      "loss": 0.9147,
+      "step": 338
+    },
+    {
+      "epoch": 0.3212508884150675,
+      "grad_norm": 1.266949477776236,
+      "learning_rate": 9.560831342484668e-06,
+      "loss": 0.9383,
+      "step": 339
+    },
+    {
+      "epoch": 0.322198531153755,
+      "grad_norm": 1.5670977324953559,
+      "learning_rate": 9.557679612050708e-06,
+      "loss": 1.0023,
+      "step": 340
+    },
+    {
+      "epoch": 0.32314617389244255,
+      "grad_norm": 1.237648169383608,
+      "learning_rate": 9.554517135979044e-06,
+      "loss": 0.9671,
+      "step": 341
+    },
+    {
+      "epoch": 0.32409381663113007,
+      "grad_norm": 1.0260918280422053,
+      "learning_rate": 9.551343921725844e-06,
+      "loss": 0.879,
+      "step": 342
+    },
+    {
+      "epoch": 0.3250414593698176,
+      "grad_norm": 1.155124445578137,
+      "learning_rate": 9.548159976772593e-06,
+      "loss": 0.9416,
+      "step": 343
+    },
+    {
+      "epoch": 0.32598910210850507,
+      "grad_norm": 1.1950689084580686,
+      "learning_rate": 9.544965308626075e-06,
+      "loss": 0.9,
+      "step": 344
+    },
+    {
+      "epoch": 0.3269367448471926,
+      "grad_norm": 1.2849959856276705,
+      "learning_rate": 9.541759924818358e-06,
+      "loss": 0.9332,
+      "step": 345
+    },
+    {
+      "epoch": 0.3278843875858801,
+      "grad_norm": 1.0302992790409418,
+      "learning_rate": 9.538543832906773e-06,
+      "loss": 0.9051,
+      "step": 346
+    },
+    {
+      "epoch": 0.32883203032456765,
+      "grad_norm": 1.2345608543091064,
+      "learning_rate": 9.535317040473895e-06,
+      "loss": 0.9806,
+      "step": 347
+    },
+    {
+      "epoch": 0.3297796730632552,
+      "grad_norm": 1.1665835041880899,
+      "learning_rate": 9.532079555127532e-06,
+      "loss": 0.9433,
+      "step": 348
+    },
+    {
+      "epoch": 0.33072731580194265,
+      "grad_norm": 1.265860203994782,
+      "learning_rate": 9.528831384500699e-06,
+      "loss": 0.9776,
+      "step": 349
+    },
+    {
+      "epoch": 0.33167495854063017,
+      "grad_norm": 1.293238505576827,
+      "learning_rate": 9.525572536251608e-06,
+      "loss": 1.0388,
+      "step": 350
+    },
+    {
+      "epoch": 0.3326226012793177,
+      "grad_norm": 1.2363591052870795,
+      "learning_rate": 9.52230301806364e-06,
+      "loss": 0.9252,
+      "step": 351
+    },
+    {
+      "epoch": 0.3335702440180052,
+      "grad_norm": 1.3748905848676085,
+      "learning_rate": 9.519022837645337e-06,
+      "loss": 0.8923,
+      "step": 352
+    },
+    {
+      "epoch": 0.3335702440180052,
+      "eval_loss": 0.9540281891822815,
+      "eval_runtime": 62.2753,
+      "eval_samples_per_second": 43.805,
+      "eval_steps_per_second": 0.69,
+      "step": 352
+    },
+    {
+      "epoch": 0.33451788675669275,
+      "grad_norm": 1.1703557022342401,
+      "learning_rate": 9.51573200273038e-06,
+      "loss": 0.9791,
+      "step": 353
+    },
+    {
+      "epoch": 0.3354655294953802,
+      "grad_norm": 1.3163659131319334,
+      "learning_rate": 9.512430521077565e-06,
+      "loss": 0.8974,
+      "step": 354
+    },
+    {
+      "epoch": 0.33641317223406775,
+      "grad_norm": 1.1823387827110081,
+      "learning_rate": 9.509118400470792e-06,
+      "loss": 0.8668,
+      "step": 355
+    },
+    {
+      "epoch": 0.3373608149727553,
+      "grad_norm": 1.0471543968324866,
+      "learning_rate": 9.505795648719049e-06,
+      "loss": 0.9248,
+      "step": 356
+    },
+    {
+      "epoch": 0.3383084577114428,
+      "grad_norm": 1.2873543382804975,
+      "learning_rate": 9.502462273656381e-06,
+      "loss": 0.8897,
+      "step": 357
+    },
+    {
+      "epoch": 0.3392561004501303,
+      "grad_norm": 1.2157109813891434,
+      "learning_rate": 9.499118283141887e-06,
+      "loss": 0.9304,
+      "step": 358
+    },
+    {
+      "epoch": 0.3402037431888178,
+      "grad_norm": 1.093181377661525,
+      "learning_rate": 9.495763685059689e-06,
+      "loss": 0.9237,
+      "step": 359
+    },
+    {
+      "epoch": 0.3411513859275053,
+      "grad_norm": 1.095774592001467,
+      "learning_rate": 9.492398487318922e-06,
+      "loss": 0.8669,
+      "step": 360
+    },
+    {
+      "epoch": 0.34209902866619285,
+      "grad_norm": 1.1676179176818222,
+      "learning_rate": 9.48902269785371e-06,
+      "loss": 0.9338,
+      "step": 361
+    },
+    {
+      "epoch": 0.3430466714048804,
+      "grad_norm": 1.082117155119373,
+      "learning_rate": 9.485636324623147e-06,
+      "loss": 0.9301,
+      "step": 362
+    },
+    {
+      "epoch": 0.34399431414356785,
+      "grad_norm": 1.5869790381600608,
+      "learning_rate": 9.482239375611282e-06,
+      "loss": 0.8566,
+      "step": 363
+    },
+    {
+      "epoch": 0.3449419568822554,
+      "grad_norm": 2.1300888287293436,
+      "learning_rate": 9.478831858827105e-06,
+      "loss": 0.9462,
+      "step": 364
+    },
+    {
+      "epoch": 0.3458895996209429,
+      "grad_norm": 1.329321349965101,
+      "learning_rate": 9.475413782304509e-06,
+      "loss": 0.9344,
+      "step": 365
+    },
+    {
+      "epoch": 0.3468372423596304,
+      "grad_norm": 3.4098937413401678,
+      "learning_rate": 9.471985154102292e-06,
+      "loss": 0.881,
+      "step": 366
+    },
+    {
+      "epoch": 0.34778488509831795,
+      "grad_norm": 1.374583167993129,
+      "learning_rate": 9.468545982304132e-06,
+      "loss": 0.8899,
+      "step": 367
+    },
+    {
+      "epoch": 0.3487325278370054,
+      "grad_norm": 1.2132880433358602,
+      "learning_rate": 9.465096275018556e-06,
+      "loss": 0.9016,
+      "step": 368
+    },
+    {
+      "epoch": 0.34968017057569295,
+      "grad_norm": 1.132880559404501,
+      "learning_rate": 9.461636040378941e-06,
+      "loss": 0.9424,
+      "step": 369
+    },
+    {
+      "epoch": 0.3506278133143805,
+      "grad_norm": 1.573588626293436,
+      "learning_rate": 9.458165286543477e-06,
+      "loss": 0.9758,
+      "step": 370
+    },
+    {
+      "epoch": 0.351575456053068,
+      "grad_norm": 1.0016737529772646,
+      "learning_rate": 9.454684021695157e-06,
+      "loss": 0.9522,
+      "step": 371
+    },
+    {
+      "epoch": 0.35252309879175553,
+      "grad_norm": 1.2060571666651005,
+      "learning_rate": 9.451192254041759e-06,
+      "loss": 0.8995,
+      "step": 372
+    },
+    {
+      "epoch": 0.353470741530443,
+      "grad_norm": 1.5491588961886638,
+      "learning_rate": 9.447689991815819e-06,
+      "loss": 0.9497,
+      "step": 373
+    },
+    {
+      "epoch": 0.3544183842691305,
+      "grad_norm": 2.323597523498367,
+      "learning_rate": 9.444177243274619e-06,
+      "loss": 0.9483,
+      "step": 374
+    },
+    {
+      "epoch": 0.3544183842691305,
+      "eval_loss": 0.9546486139297485,
+      "eval_runtime": 67.7741,
+      "eval_samples_per_second": 40.251,
+      "eval_steps_per_second": 0.634,
+      "step": 374
+    },
+    {
+      "epoch": 0.35536602700781805,
+      "grad_norm": 1.1957751593867816,
+      "learning_rate": 9.440654016700161e-06,
+      "loss": 0.9069,
+      "step": 375
+    },
+    {
+      "epoch": 0.3563136697465056,
+      "grad_norm": 1.3480198545553501,
+      "learning_rate": 9.437120320399158e-06,
+      "loss": 0.9206,
+      "step": 376
+    },
+    {
+      "epoch": 0.3572613124851931,
+      "grad_norm": 1.1240947731641266,
+      "learning_rate": 9.433576162703e-06,
+      "loss": 0.9686,
+      "step": 377
+    },
+    {
+      "epoch": 0.3582089552238806,
+      "grad_norm": 1.258961853327028,
+      "learning_rate": 9.430021551967745e-06,
+      "loss": 0.9156,
+      "step": 378
+    },
+    {
+      "epoch": 0.3591565979625681,
+      "grad_norm": 1.1465674821214438,
+      "learning_rate": 9.426456496574095e-06,
+      "loss": 0.9027,
+      "step": 379
+    },
+    {
+      "epoch": 0.36010424070125563,
+      "grad_norm": 1.334135631113088,
+      "learning_rate": 9.422881004927383e-06,
+      "loss": 0.9215,
+      "step": 380
+    },
+    {
+      "epoch": 0.36105188343994316,
+      "grad_norm": 1.052076097463688,
+      "learning_rate": 9.419295085457536e-06,
+      "loss": 0.8708,
+      "step": 381
+    },
+    {
+      "epoch": 0.3619995261786307,
+      "grad_norm": 1.3069872390381696,
+      "learning_rate": 9.41569874661908e-06,
+      "loss": 0.9392,
+      "step": 382
+    },
+    {
+      "epoch": 0.36294716891731815,
+      "grad_norm": 1.1946541917496492,
+      "learning_rate": 9.412091996891097e-06,
+      "loss": 0.9186,
+      "step": 383
+    },
+    {
+      "epoch": 0.3638948116560057,
+      "grad_norm": 1.130570319952377,
+      "learning_rate": 9.408474844777218e-06,
+      "loss": 0.9231,
+      "step": 384
+    },
+    {
+      "epoch": 0.3648424543946932,
+      "grad_norm": 1.230122090333074,
+      "learning_rate": 9.4048472988056e-06,
+      "loss": 1.0082,
+      "step": 385
+    },
+    {
+      "epoch": 0.36579009713338073,
+      "grad_norm": 1.0720696634128188,
+      "learning_rate": 9.401209367528907e-06,
+      "loss": 0.9291,
+      "step": 386
+    },
+    {
+      "epoch": 0.36673773987206826,
+      "grad_norm": 1.1723709465115237,
+      "learning_rate": 9.397561059524285e-06,
+      "loss": 0.9175,
+      "step": 387
+    },
+    {
+      "epoch": 0.36768538261075573,
+      "grad_norm": 1.5238004908651446,
+      "learning_rate": 9.393902383393347e-06,
+      "loss": 0.9621,
+      "step": 388
+    },
+    {
+      "epoch": 0.36863302534944326,
+      "grad_norm": 1.0814097944853873,
+      "learning_rate": 9.39023334776215e-06,
+      "loss": 0.9293,
+      "step": 389
+    },
+    {
+      "epoch": 0.3695806680881308,
+      "grad_norm": 1.1650689858883694,
+      "learning_rate": 9.386553961281179e-06,
+      "loss": 0.9582,
+      "step": 390
+    },
+    {
+      "epoch": 0.3705283108268183,
+      "grad_norm": 1.2458078695599824,
+      "learning_rate": 9.382864232625321e-06,
+      "loss": 0.9581,
+      "step": 391
+    },
+    {
+      "epoch": 0.3714759535655058,
+      "grad_norm": 1.339036266204836,
+      "learning_rate": 9.379164170493844e-06,
+      "loss": 0.8931,
+      "step": 392
+    },
+    {
+      "epoch": 0.3724235963041933,
+      "grad_norm": 1.0125589713218854,
+      "learning_rate": 9.375453783610381e-06,
+      "loss": 0.9012,
+      "step": 393
+    },
+    {
+      "epoch": 0.37337123904288083,
+      "grad_norm": 1.0329885700845731,
+      "learning_rate": 9.371733080722911e-06,
+      "loss": 0.8628,
+      "step": 394
+    },
+    {
+      "epoch": 0.37431888178156836,
+      "grad_norm": 1.439005100467098,
+      "learning_rate": 9.368002070603731e-06,
+      "loss": 0.8827,
+      "step": 395
+    },
+    {
+      "epoch": 0.3752665245202559,
+      "grad_norm": 1.0085800308385358,
+      "learning_rate": 9.36426076204944e-06,
+      "loss": 0.8743,
+      "step": 396
+    },
+    {
+      "epoch": 0.3752665245202559,
+      "eval_loss": 0.9504217505455017,
+      "eval_runtime": 66.3641,
+      "eval_samples_per_second": 41.107,
+      "eval_steps_per_second": 0.648,
+      "step": 396
+    },
+    {
+      "epoch": 0.37621416725894335,
+      "grad_norm": 1.3876480177466899,
+      "learning_rate": 9.36050916388092e-06,
+      "loss": 0.9472,
+      "step": 397
+    },
+    {
+      "epoch": 0.3771618099976309,
+      "grad_norm": 1.1708472397542733,
+      "learning_rate": 9.35674728494331e-06,
+      "loss": 0.9283,
+      "step": 398
+    },
+    {
+      "epoch": 0.3781094527363184,
+      "grad_norm": 1.0918645378867784,
+      "learning_rate": 9.35297513410599e-06,
+      "loss": 0.8862,
+      "step": 399
+    },
+    {
+      "epoch": 0.37905709547500593,
+      "grad_norm": 0.9955698293935606,
+      "learning_rate": 9.349192720262556e-06,
+      "loss": 0.8965,
+      "step": 400
+    },
+    {
+      "epoch": 0.38000473821369346,
+      "grad_norm": 1.305075253905476,
+      "learning_rate": 9.345400052330802e-06,
+      "loss": 0.8806,
+      "step": 401
+    },
+    {
+      "epoch": 0.38095238095238093,
+      "grad_norm": 0.99053214435014,
+      "learning_rate": 9.341597139252698e-06,
+      "loss": 1.0084,
+      "step": 402
+    },
+    {
+      "epoch": 0.38190002369106846,
+      "grad_norm": 1.3393098226066853,
+      "learning_rate": 9.337783989994371e-06,
+      "loss": 0.9356,
+      "step": 403
+    },
+    {
+      "epoch": 0.382847666429756,
+      "grad_norm": 1.01675988520605,
+      "learning_rate": 9.333960613546079e-06,
+      "loss": 0.8987,
+      "step": 404
+    },
+    {
+      "epoch": 0.3837953091684435,
+      "grad_norm": 1.1624028341398043,
+      "learning_rate": 9.330127018922195e-06,
+      "loss": 0.8895,
+      "step": 405
+    },
+    {
+      "epoch": 0.38474295190713104,
+      "grad_norm": 1.2237993995607808,
+      "learning_rate": 9.326283215161177e-06,
+      "loss": 0.8879,
+      "step": 406
+    },
+    {
+      "epoch": 0.3856905946458185,
+      "grad_norm": 1.2652821842567468,
+      "learning_rate": 9.322429211325567e-06,
+      "loss": 0.8893,
+      "step": 407
+    },
+    {
+      "epoch": 0.38663823738450603,
+      "grad_norm": 1.1025981014314234,
+      "learning_rate": 9.31856501650194e-06,
+      "loss": 0.9746,
+      "step": 408
+    },
+    {
+      "epoch": 0.38758588012319356,
+      "grad_norm": 3.9246963935175763,
+      "learning_rate": 9.314690639800906e-06,
+      "loss": 0.9352,
+      "step": 409
+    },
+    {
+      "epoch": 0.3885335228618811,
+      "grad_norm": 1.1259828261006313,
+      "learning_rate": 9.310806090357083e-06,
+      "loss": 0.9083,
+      "step": 410
+    },
+    {
+      "epoch": 0.3894811656005686,
+      "grad_norm": 1.2708123609203328,
+      "learning_rate": 9.306911377329067e-06,
+      "loss": 0.9167,
+      "step": 411
+    },
+    {
+      "epoch": 0.3904288083392561,
+      "grad_norm": 2.090787985556849,
+      "learning_rate": 9.30300650989942e-06,
+      "loss": 0.9976,
+      "step": 412
+    },
+    {
+      "epoch": 0.3913764510779436,
+      "grad_norm": 1.036822985622544,
+      "learning_rate": 9.299091497274647e-06,
+      "loss": 1.002,
+      "step": 413
+    },
+    {
+      "epoch": 0.39232409381663114,
+      "grad_norm": 1.08027979908674,
+      "learning_rate": 9.295166348685169e-06,
+      "loss": 0.883,
+      "step": 414
+    },
+    {
+      "epoch": 0.39327173655531866,
+      "grad_norm": 1.3152353131345889,
+      "learning_rate": 9.291231073385306e-06,
+      "loss": 0.9368,
+      "step": 415
+    },
+    {
+      "epoch": 0.39421937929400613,
+      "grad_norm": 1.3362457501149774,
+      "learning_rate": 9.287285680653254e-06,
+      "loss": 0.9923,
+      "step": 416
+    },
+    {
+      "epoch": 0.39516702203269366,
+      "grad_norm": 1.1378299427204326,
+      "learning_rate": 9.283330179791063e-06,
+      "loss": 0.9013,
+      "step": 417
+    },
+    {
+      "epoch": 0.3961146647713812,
+      "grad_norm": 0.9698571591778787,
+      "learning_rate": 9.279364580124615e-06,
+      "loss": 0.8294,
+      "step": 418
+    },
+    {
+      "epoch": 0.3961146647713812,
+      "eval_loss": 0.9494832754135132,
+      "eval_runtime": 62.1923,
+      "eval_samples_per_second": 43.864,
+      "eval_steps_per_second": 0.691,
+      "step": 418
+    },
+    {
+      "epoch": 0.3970623075100687,
+      "grad_norm": 1.2329603475368258,
+      "learning_rate": 9.275388891003596e-06,
+      "loss": 0.9132,
+      "step": 419
+    },
+    {
+      "epoch": 0.39800995024875624,
+      "grad_norm": 1.0253483109899053,
+      "learning_rate": 9.271403121801492e-06,
+      "loss": 0.9966,
+      "step": 420
+    },
+    {
+      "epoch": 0.3989575929874437,
+      "grad_norm": 1.1122937106526114,
+      "learning_rate": 9.267407281915541e-06,
+      "loss": 0.8949,
+      "step": 421
+    },
+    {
+      "epoch": 0.39990523572613124,
+      "grad_norm": 1.0623316599321453,
+      "learning_rate": 9.263401380766739e-06,
+      "loss": 0.9192,
+      "step": 422
+    },
+    {
+      "epoch": 0.40085287846481876,
+      "grad_norm": 1.109212522270619,
+      "learning_rate": 9.25938542779979e-06,
+      "loss": 0.9212,
+      "step": 423
+    },
+    {
+      "epoch": 0.4018005212035063,
+      "grad_norm": 1.1148931056175715,
+      "learning_rate": 9.255359432483106e-06,
+      "loss": 0.8824,
+      "step": 424
+    },
+    {
+      "epoch": 0.4027481639421938,
+      "grad_norm": 1.469688611294437,
+      "learning_rate": 9.251323404308774e-06,
+      "loss": 0.8941,
+      "step": 425
+    },
+    {
+      "epoch": 0.4036958066808813,
+      "grad_norm": 1.1366864229617593,
+      "learning_rate": 9.247277352792534e-06,
+      "loss": 0.9542,
+      "step": 426
+    },
+    {
+      "epoch": 0.4046434494195688,
+      "grad_norm": 1.2380214332997066,
+      "learning_rate": 9.243221287473755e-06,
+      "loss": 0.9417,
+      "step": 427
+    },
+    {
+      "epoch": 0.40559109215825634,
+      "grad_norm": 1.292587978067118,
+      "learning_rate": 9.239155217915422e-06,
+      "loss": 0.9531,
+      "step": 428
+    },
+    {
+      "epoch": 0.40653873489694387,
+      "grad_norm": 1.1996181211866257,
+      "learning_rate": 9.235079153704108e-06,
+      "loss": 0.993,
+      "step": 429
+    },
+    {
+      "epoch": 0.4074863776356314,
+      "grad_norm": 1.7152618861500344,
+      "learning_rate": 9.23099310444994e-06,
+      "loss": 0.88,
+      "step": 430
+    },
+    {
+      "epoch": 0.40843402037431886,
+      "grad_norm": 1.236710405113469,
+      "learning_rate": 9.226897079786594e-06,
+      "loss": 0.8924,
+      "step": 431
+    },
+    {
+      "epoch": 0.4093816631130064,
+      "grad_norm": 1.026683565261258,
+      "learning_rate": 9.222791089371266e-06,
+      "loss": 0.8627,
+      "step": 432
+    },
+    {
+      "epoch": 0.4103293058516939,
+      "grad_norm": 1.0752239634813958,
+      "learning_rate": 9.218675142884648e-06,
+      "loss": 0.9457,
+      "step": 433
+    },
+    {
+      "epoch": 0.41127694859038144,
+      "grad_norm": 1.1942159706425186,
+      "learning_rate": 9.214549250030899e-06,
+      "loss": 0.9697,
+      "step": 434
+    },
+    {
+      "epoch": 0.41222459132906897,
+      "grad_norm": 1.302875719838314,
+      "learning_rate": 9.210413420537638e-06,
+      "loss": 0.9266,
+      "step": 435
+    },
+    {
+      "epoch": 0.41317223406775644,
+      "grad_norm": 1.2858086492476544,
+      "learning_rate": 9.206267664155906e-06,
+      "loss": 0.8556,
+      "step": 436
+    },
+    {
+      "epoch": 0.41411987680644397,
+      "grad_norm": 1.2092507326298383,
+      "learning_rate": 9.20211199066015e-06,
+      "loss": 0.8873,
+      "step": 437
+    },
+    {
+      "epoch": 0.4150675195451315,
+      "grad_norm": 1.0641345729826912,
+      "learning_rate": 9.197946409848196e-06,
+      "loss": 0.927,
+      "step": 438
+    },
+    {
+      "epoch": 0.416015162283819,
+      "grad_norm": 0.9922730475025484,
+      "learning_rate": 9.19377093154123e-06,
+      "loss": 0.8922,
+      "step": 439
+    },
+    {
+      "epoch": 0.4169628050225065,
+      "grad_norm": 1.1994954411324383,
+      "learning_rate": 9.189585565583779e-06,
+      "loss": 0.934,
+      "step": 440
+    },
+    {
+      "epoch": 0.4169628050225065,
+      "eval_loss": 0.9466658234596252,
+      "eval_runtime": 64.5961,
+      "eval_samples_per_second": 42.232,
+      "eval_steps_per_second": 0.666,
+      "step": 440
+    },
+    {
+      "epoch": 0.417910447761194,
+      "grad_norm": 1.2490962663664558,
+      "learning_rate": 9.185390321843673e-06,
+      "loss": 0.901,
+      "step": 441
+    },
+    {
+      "epoch": 0.41885809049988154,
+      "grad_norm": 1.015254380962658,
+      "learning_rate": 9.181185210212034e-06,
+      "loss": 0.9519,
+      "step": 442
+    },
+    {
+      "epoch": 0.41980573323856907,
+      "grad_norm": 1.1895181384960887,
+      "learning_rate": 9.176970240603253e-06,
+      "loss": 0.8807,
+      "step": 443
+    },
+    {
+      "epoch": 0.4207533759772566,
+      "grad_norm": 1.3706219828971085,
+      "learning_rate": 9.172745422954961e-06,
+      "loss": 0.9148,
+      "step": 444
+    },
+    {
+      "epoch": 0.42170101871594406,
+      "grad_norm": 1.0379378858579145,
+      "learning_rate": 9.168510767228008e-06,
+      "loss": 0.9468,
+      "step": 445
+    },
+    {
+      "epoch": 0.4226486614546316,
+      "grad_norm": 1.2178466709823097,
+      "learning_rate": 9.164266283406433e-06,
+      "loss": 0.9242,
+      "step": 446
+    },
+    {
+      "epoch": 0.4235963041933191,
+      "grad_norm": 1.2808190385423623,
+      "learning_rate": 9.160011981497458e-06,
+      "loss": 0.8654,
+      "step": 447
+    },
+    {
+      "epoch": 0.42454394693200664,
+      "grad_norm": 1.250260948302257,
+      "learning_rate": 9.155747871531444e-06,
+      "loss": 0.9284,
+      "step": 448
+    },
+    {
+      "epoch": 0.42549158967069417,
+      "grad_norm": 1.2672376071125921,
+      "learning_rate": 9.151473963561884e-06,
+      "loss": 0.9568,
+      "step": 449
+    },
+    {
+      "epoch": 0.42643923240938164,
+      "grad_norm": 1.0461013649789057,
+      "learning_rate": 9.147190267665361e-06,
+      "loss": 0.8883,
+      "step": 450
+    },
+    {
+      "epoch": 0.42738687514806917,
+      "grad_norm": 1.1516556206793171,
+      "learning_rate": 9.142896793941546e-06,
+      "loss": 0.9596,
+      "step": 451
+    },
+    {
+      "epoch": 0.4283345178867567,
+      "grad_norm": 1.1510780017093964,
+      "learning_rate": 9.13859355251316e-06,
+      "loss": 0.9444,
+      "step": 452
+    },
+    {
+      "epoch": 0.4292821606254442,
+      "grad_norm": 0.9978574311141366,
+      "learning_rate": 9.134280553525946e-06,
+      "loss": 0.8698,
+      "step": 453
+    },
+    {
+      "epoch": 0.43022980336413175,
+      "grad_norm": 1.0518208149889676,
+      "learning_rate": 9.129957807148666e-06,
+      "loss": 0.8508,
+      "step": 454
+    },
+    {
+      "epoch": 0.4311774461028192,
+      "grad_norm": 1.0777071914790497,
+      "learning_rate": 9.12562532357305e-06,
+      "loss": 0.9219,
+      "step": 455
+    },
+    {
+      "epoch": 0.43212508884150674,
+      "grad_norm": 1.3003109116219143,
+      "learning_rate": 9.121283113013794e-06,
+      "loss": 0.9354,
+      "step": 456
+    },
+    {
+      "epoch": 0.43307273158019427,
+      "grad_norm": 1.231896880939342,
+      "learning_rate": 9.116931185708523e-06,
+      "loss": 0.8797,
+      "step": 457
+    },
+    {
+      "epoch": 0.4340203743188818,
+      "grad_norm": 1.167418023483012,
+      "learning_rate": 9.112569551917773e-06,
+      "loss": 0.9122,
+      "step": 458
+    },
+    {
+      "epoch": 0.4349680170575693,
+      "grad_norm": 1.2433163300824168,
+      "learning_rate": 9.108198221924966e-06,
+      "loss": 0.9241,
+      "step": 459
+    },
+    {
+      "epoch": 0.4359156597962568,
+      "grad_norm": 1.2957389966436808,
+      "learning_rate": 9.103817206036383e-06,
+      "loss": 0.9653,
+      "step": 460
+    },
+    {
+      "epoch": 0.4368633025349443,
+      "grad_norm": 1.1967614874308203,
+      "learning_rate": 9.09942651458114e-06,
+      "loss": 0.9555,
+      "step": 461
+    },
+    {
+      "epoch": 0.43781094527363185,
+      "grad_norm": 1.0311787596301678,
+      "learning_rate": 9.095026157911166e-06,
+      "loss": 0.8532,
+      "step": 462
+    },
+    {
+      "epoch": 0.43781094527363185,
+      "eval_loss": 0.9448354840278625,
+      "eval_runtime": 63.5673,
+      "eval_samples_per_second": 42.915,
+      "eval_steps_per_second": 0.676,
+      "step": 462
+    },
+    {
+      "epoch": 0.4387585880123194,
+      "grad_norm": 1.1614684984564378,
+      "learning_rate": 9.090616146401183e-06,
+      "loss": 0.911,
+      "step": 463
+    },
+    {
+      "epoch": 0.43970623075100684,
+      "grad_norm": 1.1848933141897011,
+      "learning_rate": 9.086196490448668e-06,
+      "loss": 0.8495,
+      "step": 464
+    },
+    {
+      "epoch": 0.44065387348969437,
+      "grad_norm": 1.0920125977106059,
+      "learning_rate": 9.081767200473842e-06,
+      "loss": 0.9195,
+      "step": 465
+    },
+    {
+      "epoch": 0.4416015162283819,
+      "grad_norm": 1.0487746428767522,
+      "learning_rate": 9.077328286919638e-06,
+      "loss": 0.8775,
+      "step": 466
+    },
+    {
+      "epoch": 0.4425491589670694,
+      "grad_norm": 1.0480719750913268,
+      "learning_rate": 9.07287976025168e-06,
+      "loss": 0.8879,
+      "step": 467
+    },
+    {
+      "epoch": 0.44349680170575695,
+      "grad_norm": 1.156105288349571,
+      "learning_rate": 9.068421630958254e-06,
+      "loss": 0.9004,
+      "step": 468
+    },
+    {
+      "epoch": 0.4444444444444444,
+      "grad_norm": 1.1479660233621711,
+      "learning_rate": 9.063953909550289e-06,
+      "loss": 0.9652,
+      "step": 469
+    },
+    {
+      "epoch": 0.44539208718313195,
+      "grad_norm": 1.158618048287916,
+      "learning_rate": 9.059476606561328e-06,
+      "loss": 0.8643,
+      "step": 470
+    },
+    {
+      "epoch": 0.4463397299218195,
+      "grad_norm": 1.1045055506935484,
+      "learning_rate": 9.054989732547507e-06,
+      "loss": 0.8307,
+      "step": 471
+    },
+    {
+      "epoch": 0.447287372660507,
+      "grad_norm": 1.189869710423804,
+      "learning_rate": 9.050493298087523e-06,
+      "loss": 0.8693,
+      "step": 472
+    },
+    {
+      "epoch": 0.4482350153991945,
+      "grad_norm": 1.5017065849353626,
+      "learning_rate": 9.045987313782616e-06,
+      "loss": 0.8868,
+      "step": 473
+    },
+    {
+      "epoch": 0.449182658137882,
+      "grad_norm": 1.288348522111584,
+      "learning_rate": 9.041471790256543e-06,
+      "loss": 0.9984,
+      "step": 474
+    },
+    {
+      "epoch": 0.4501303008765695,
+      "grad_norm": 1.3428427133159277,
+      "learning_rate": 9.036946738155548e-06,
+      "loss": 0.9328,
+      "step": 475
+    },
+    {
+      "epoch": 0.45107794361525705,
+      "grad_norm": 0.9887938032536074,
+      "learning_rate": 9.032412168148345e-06,
+      "loss": 0.9483,
+      "step": 476
+    },
+    {
+      "epoch": 0.4520255863539446,
+      "grad_norm": 1.0713968856815155,
+      "learning_rate": 9.027868090926088e-06,
+      "loss": 0.8861,
+      "step": 477
+    },
+    {
+      "epoch": 0.4529732290926321,
+      "grad_norm": 1.162032207786328,
+      "learning_rate": 9.023314517202341e-06,
+      "loss": 0.9014,
+      "step": 478
+    },
+    {
+      "epoch": 0.4539208718313196,
+      "grad_norm": 1.135173292644661,
+      "learning_rate": 9.018751457713062e-06,
+      "loss": 0.882,
+      "step": 479
+    },
+    {
+      "epoch": 0.4548685145700071,
+      "grad_norm": 1.2191006204661359,
+      "learning_rate": 9.014178923216572e-06,
+      "loss": 0.8936,
+      "step": 480
+    },
+    {
+      "epoch": 0.4558161573086946,
+      "grad_norm": 1.1422417367554563,
+      "learning_rate": 9.009596924493536e-06,
+      "loss": 0.9046,
+      "step": 481
+    },
+    {
+      "epoch": 0.45676380004738215,
+      "grad_norm": 1.0960107607325966,
+      "learning_rate": 9.005005472346923e-06,
+      "loss": 0.8608,
+      "step": 482
+    },
+    {
+      "epoch": 0.4577114427860697,
+      "grad_norm": 1.2860608689094808,
+      "learning_rate": 9.000404577602003e-06,
+      "loss": 0.92,
+      "step": 483
+    },
+    {
+      "epoch": 0.45865908552475715,
+      "grad_norm": 1.148989084195761,
+      "learning_rate": 8.995794251106295e-06,
+      "loss": 0.9675,
+      "step": 484
+    },
+    {
+      "epoch": 0.45865908552475715,
+      "eval_loss": 0.9426133632659912,
+      "eval_runtime": 62.3901,
+      "eval_samples_per_second": 43.725,
+      "eval_steps_per_second": 0.689,
+      "step": 484
+    },
+    {
+      "epoch": 0.4596067282634447,
+      "grad_norm": 1.1715395816498915,
+      "learning_rate": 8.991174503729567e-06,
+      "loss": 0.9505,
+      "step": 485
+    },
+    {
+      "epoch": 0.4605543710021322,
+      "grad_norm": 1.1418428811721806,
+      "learning_rate": 8.986545346363792e-06,
+      "loss": 0.9194,
+      "step": 486
+    },
+    {
+      "epoch": 0.4615020137408197,
+      "grad_norm": 1.2704284828900592,
+      "learning_rate": 8.98190678992313e-06,
+      "loss": 0.9404,
+      "step": 487
+    },
+    {
+      "epoch": 0.4624496564795072,
+      "grad_norm": 1.4180260493906214,
+      "learning_rate": 8.977258845343904e-06,
+      "loss": 0.8881,
+      "step": 488
+    },
+    {
+      "epoch": 0.4633972992181947,
+      "grad_norm": 1.4745602251152343,
+      "learning_rate": 8.97260152358457e-06,
+      "loss": 0.8991,
+      "step": 489
+    },
+    {
+      "epoch": 0.46434494195688225,
+      "grad_norm": 1.5516611931425326,
+      "learning_rate": 8.96793483562569e-06,
+      "loss": 0.8868,
+      "step": 490
+    },
+    {
+      "epoch": 0.4652925846955698,
+      "grad_norm": 1.1672873798559753,
+      "learning_rate": 8.963258792469908e-06,
+      "loss": 0.9032,
+      "step": 491
+    },
+    {
+      "epoch": 0.4662402274342573,
+      "grad_norm": 0.9800479447492024,
+      "learning_rate": 8.958573405141932e-06,
+      "loss": 0.8875,
+      "step": 492
+    },
+    {
+      "epoch": 0.4671878701729448,
+      "grad_norm": 1.3344568834573243,
+      "learning_rate": 8.953878684688492e-06,
+      "loss": 0.8834,
+      "step": 493
+    },
+    {
+      "epoch": 0.4681355129116323,
+      "grad_norm": 1.0491821400957775,
+      "learning_rate": 8.949174642178333e-06,
+      "loss": 0.9002,
+      "step": 494
+    },
+    {
+      "epoch": 0.4690831556503198,
+      "grad_norm": 1.237676770681135,
+      "learning_rate": 8.944461288702166e-06,
+      "loss": 0.8832,
+      "step": 495
+    },
+    {
+      "epoch": 0.47003079838900735,
+      "grad_norm": 1.2759707423338387,
+      "learning_rate": 8.939738635372664e-06,
+      "loss": 0.8949,
+      "step": 496
+    },
+    {
+      "epoch": 0.4709784411276949,
+      "grad_norm": 1.1263638492681127,
+      "learning_rate": 8.935006693324423e-06,
+      "loss": 0.8969,
+      "step": 497
+    },
+    {
+      "epoch": 0.47192608386638235,
+      "grad_norm": 1.154527093025846,
+      "learning_rate": 8.930265473713939e-06,
+      "loss": 0.8759,
+      "step": 498
+    },
+    {
+      "epoch": 0.4728737266050699,
+      "grad_norm": 1.2033690454214934,
+      "learning_rate": 8.92551498771958e-06,
+      "loss": 0.9447,
+      "step": 499
+    },
+    {
+      "epoch": 0.4738213693437574,
+      "grad_norm": 1.188345342085479,
+      "learning_rate": 8.920755246541563e-06,
+      "loss": 0.9698,
+      "step": 500
+    },
+    {
+      "epoch": 0.47476901208244493,
+      "grad_norm": 1.1460258736111513,
+      "learning_rate": 8.91598626140193e-06,
+      "loss": 0.8861,
+      "step": 501
+    },
+    {
+      "epoch": 0.47571665482113246,
+      "grad_norm": 1.0983544593959635,
+      "learning_rate": 8.911208043544513e-06,
+      "loss": 0.9099,
+      "step": 502
+    },
+    {
+      "epoch": 0.4766642975598199,
+      "grad_norm": 1.2526221170984964,
+      "learning_rate": 8.906420604234908e-06,
+      "loss": 0.9153,
+      "step": 503
+    },
+    {
+      "epoch": 0.47761194029850745,
+      "grad_norm": 1.4378576625792787,
+      "learning_rate": 8.90162395476046e-06,
+      "loss": 0.9098,
+      "step": 504
+    },
+    {
+      "epoch": 0.478559583037195,
+      "grad_norm": 1.021190259086082,
+      "learning_rate": 8.896818106430225e-06,
+      "loss": 0.9201,
+      "step": 505
+    },
+    {
+      "epoch": 0.4795072257758825,
+      "grad_norm": 1.2166947590641954,
+      "learning_rate": 8.89200307057495e-06,
+      "loss": 0.9498,
+      "step": 506
+    },
+    {
+      "epoch": 0.4795072257758825,
+      "eval_loss": 0.9415593147277832,
+      "eval_runtime": 63.0823,
+      "eval_samples_per_second": 43.245,
+      "eval_steps_per_second": 0.682,
+      "step": 506
+    },
+    {
+      "epoch": 0.48045486851457003,
+      "grad_norm": 1.099897475733057,
+      "learning_rate": 8.887178858547039e-06,
+      "loss": 0.8785,
+      "step": 507
+    },
+    {
+      "epoch": 0.4814025112532575,
+      "grad_norm": 1.1053789477176734,
+      "learning_rate": 8.882345481720533e-06,
+      "loss": 0.9781,
+      "step": 508
+    },
+    {
+      "epoch": 0.48235015399194503,
+      "grad_norm": 1.2550219679746741,
+      "learning_rate": 8.877502951491083e-06,
+      "loss": 0.9175,
+      "step": 509
+    },
+    {
+      "epoch": 0.48329779673063256,
+      "grad_norm": 1.035777482131784,
+      "learning_rate": 8.872651279275917e-06,
+      "loss": 0.9394,
+      "step": 510
+    },
+    {
+      "epoch": 0.4842454394693201,
+      "grad_norm": 1.1823889985534881,
+      "learning_rate": 8.867790476513818e-06,
+      "loss": 0.8619,
+      "step": 511
+    },
+    {
+      "epoch": 0.4851930822080076,
+      "grad_norm": 1.0806837978842365,
+      "learning_rate": 8.862920554665098e-06,
+      "loss": 0.8847,
+      "step": 512
+    },
+    {
+      "epoch": 0.4861407249466951,
+      "grad_norm": 1.1417084903673171,
+      "learning_rate": 8.858041525211569e-06,
+      "loss": 0.8984,
+      "step": 513
+    },
+    {
+      "epoch": 0.4870883676853826,
+      "grad_norm": 1.046685136616654,
+      "learning_rate": 8.853153399656513e-06,
+      "loss": 0.9343,
+      "step": 514
+    },
+    {
+      "epoch": 0.48803601042407013,
+      "grad_norm": 1.1600934932807847,
+      "learning_rate": 8.848256189524661e-06,
+      "loss": 0.903,
+      "step": 515
+    },
+    {
+      "epoch": 0.48898365316275766,
+      "grad_norm": 0.9999805389325372,
+      "learning_rate": 8.843349906362163e-06,
+      "loss": 0.9087,
+      "step": 516
+    },
+    {
+      "epoch": 0.48993129590144513,
+      "grad_norm": 1.1693797728638526,
+      "learning_rate": 8.838434561736556e-06,
+      "loss": 0.9083,
+      "step": 517
+    },
+    {
+      "epoch": 0.49087893864013266,
+      "grad_norm": 1.1372932570585796,
+      "learning_rate": 8.833510167236747e-06,
+      "loss": 0.9713,
+      "step": 518
+    },
+    {
+      "epoch": 0.4918265813788202,
+      "grad_norm": 1.0947618440390705,
+      "learning_rate": 8.828576734472975e-06,
+      "loss": 0.8689,
+      "step": 519
+    },
+    {
+      "epoch": 0.4927742241175077,
+      "grad_norm": 1.1318492632095214,
+      "learning_rate": 8.823634275076792e-06,
+      "loss": 0.8625,
+      "step": 520
+    },
+    {
+      "epoch": 0.49372186685619524,
+      "grad_norm": 1.3142475847243504,
+      "learning_rate": 8.818682800701028e-06,
+      "loss": 0.8914,
+      "step": 521
+    },
+    {
+      "epoch": 0.4946695095948827,
+      "grad_norm": 1.0542269379359606,
+      "learning_rate": 8.813722323019774e-06,
+      "loss": 0.9204,
+      "step": 522
+    },
+    {
+      "epoch": 0.49561715233357023,
+      "grad_norm": 1.2759846986205978,
+      "learning_rate": 8.808752853728341e-06,
+      "loss": 0.9044,
+      "step": 523
+    },
+    {
+      "epoch": 0.49656479507225776,
+      "grad_norm": 1.0846144562638056,
+      "learning_rate": 8.803774404543246e-06,
+      "loss": 0.9123,
+      "step": 524
+    },
+    {
+      "epoch": 0.4975124378109453,
+      "grad_norm": 1.1086474451297028,
+      "learning_rate": 8.798786987202175e-06,
+      "loss": 0.9293,
+      "step": 525
+    },
+    {
+      "epoch": 0.4984600805496328,
+      "grad_norm": 0.9413825393223179,
+      "learning_rate": 8.793790613463956e-06,
+      "loss": 0.8654,
+      "step": 526
+    },
+    {
+      "epoch": 0.4994077232883203,
+      "grad_norm": 1.1832807749456735,
+      "learning_rate": 8.788785295108536e-06,
+      "loss": 0.8636,
+      "step": 527
+    },
+    {
+      "epoch": 0.5003553660270078,
+      "grad_norm": 1.0977629074376605,
+      "learning_rate": 8.783771043936949e-06,
+      "loss": 0.8765,
+      "step": 528
+    },
+    {
+      "epoch": 0.5003553660270078,
+      "eval_loss": 0.941301167011261,
+      "eval_runtime": 61.1844,
+      "eval_samples_per_second": 44.587,
+      "eval_steps_per_second": 0.703,
+      "step": 528
+    },
+    {
+      "epoch": 0.5013030087656953,
+      "grad_norm": 1.146767921801711,
+      "learning_rate": 8.778747871771293e-06,
+      "loss": 0.8989,
+      "step": 529
+    },
+    {
+      "epoch": 0.5022506515043829,
+      "grad_norm": 1.2639703543113263,
+      "learning_rate": 8.773715790454695e-06,
+      "loss": 0.9151,
+      "step": 530
+    },
+    {
+      "epoch": 0.5031982942430704,
+      "grad_norm": 1.113218960186029,
+      "learning_rate": 8.768674811851293e-06,
+      "loss": 0.8692,
+      "step": 531
+    },
+    {
+      "epoch": 0.5041459369817579,
+      "grad_norm": 0.9991478843453905,
+      "learning_rate": 8.763624947846195e-06,
+      "loss": 0.8764,
+      "step": 532
+    },
+    {
+      "epoch": 0.5050935797204454,
+      "grad_norm": 1.1051839359484277,
+      "learning_rate": 8.758566210345464e-06,
+      "loss": 0.9142,
+      "step": 533
+    },
+    {
+      "epoch": 0.5060412224591329,
+      "grad_norm": 1.5864593937619376,
+      "learning_rate": 8.75349861127608e-06,
+      "loss": 0.9167,
+      "step": 534
+    },
+    {
+      "epoch": 0.5069888651978204,
+      "grad_norm": 1.0055893256047008,
+      "learning_rate": 8.748422162585915e-06,
+      "loss": 0.9583,
+      "step": 535
+    },
+    {
+      "epoch": 0.5079365079365079,
+      "grad_norm": 1.1419438277764564,
+      "learning_rate": 8.743336876243712e-06,
+      "loss": 0.8847,
+      "step": 536
+    },
+    {
+      "epoch": 0.5088841506751954,
+      "grad_norm": 1.1093929329894858,
+      "learning_rate": 8.738242764239046e-06,
+      "loss": 0.9657,
+      "step": 537
+    },
+    {
+      "epoch": 0.509831793413883,
+      "grad_norm": 1.0924153336293334,
+      "learning_rate": 8.733139838582299e-06,
+      "loss": 0.9452,
+      "step": 538
+    },
+    {
+      "epoch": 0.5107794361525705,
+      "grad_norm": 1.100904420569305,
+      "learning_rate": 8.728028111304639e-06,
+      "loss": 0.8705,
+      "step": 539
+    },
+    {
+      "epoch": 0.511727078891258,
+      "grad_norm": 1.0377959902393181,
+      "learning_rate": 8.722907594457975e-06,
+      "loss": 0.9021,
+      "step": 540
+    },
+    {
+      "epoch": 0.5126747216299455,
+      "grad_norm": 1.3028881798201601,
+      "learning_rate": 8.717778300114952e-06,
+      "loss": 0.9004,
+      "step": 541
+    },
+    {
+      "epoch": 0.5136223643686331,
+      "grad_norm": 1.2219633113574593,
+      "learning_rate": 8.712640240368899e-06,
+      "loss": 0.9146,
+      "step": 542
+    },
+    {
+      "epoch": 0.5145700071073206,
+      "grad_norm": 1.16735139559823,
+      "learning_rate": 8.707493427333817e-06,
+      "loss": 0.9336,
+      "step": 543
+    },
+    {
+      "epoch": 0.515517649846008,
+      "grad_norm": 1.1223934613953974,
+      "learning_rate": 8.702337873144343e-06,
+      "loss": 0.8959,
+      "step": 544
+    },
+    {
+      "epoch": 0.5164652925846955,
+      "grad_norm": 1.0381379384688154,
+      "learning_rate": 8.697173589955724e-06,
+      "loss": 0.9147,
+      "step": 545
+    },
+    {
+      "epoch": 0.5174129353233831,
+      "grad_norm": 1.071551123667491,
+      "learning_rate": 8.692000589943785e-06,
+      "loss": 0.8713,
+      "step": 546
+    },
+    {
+      "epoch": 0.5183605780620706,
+      "grad_norm": 1.1572023966023732,
+      "learning_rate": 8.686818885304907e-06,
+      "loss": 0.9468,
+      "step": 547
+    },
+    {
+      "epoch": 0.5193082208007581,
+      "grad_norm": 1.0966755633051661,
+      "learning_rate": 8.681628488255986e-06,
+      "loss": 0.9746,
+      "step": 548
+    },
+    {
+      "epoch": 0.5202558635394456,
+      "grad_norm": 1.0054623347539213,
+      "learning_rate": 8.676429411034423e-06,
+      "loss": 0.889,
+      "step": 549
+    },
+    {
+      "epoch": 0.5212035062781332,
+      "grad_norm": 1.0688410228225136,
+      "learning_rate": 8.671221665898074e-06,
+      "loss": 0.8986,
+      "step": 550
+    },
+    {
+      "epoch": 0.5212035062781332,
+      "eval_loss": 0.9385759234428406,
+      "eval_runtime": 60.7338,
+      "eval_samples_per_second": 44.917,
+      "eval_steps_per_second": 0.708,
+      "step": 550
+    },
+    {
+      "epoch": 0.5221511490168207,
+      "grad_norm": 1.1912290308637075,
+      "learning_rate": 8.666005265125238e-06,
+      "loss": 0.9032,
+      "step": 551
+    },
+    {
+      "epoch": 0.5230987917555082,
+      "grad_norm": 1.0819840961903495,
+      "learning_rate": 8.660780221014617e-06,
+      "loss": 0.9549,
+      "step": 552
+    },
+    {
+      "epoch": 0.5240464344941956,
+      "grad_norm": 1.584365865940181,
+      "learning_rate": 8.655546545885294e-06,
+      "loss": 0.9895,
+      "step": 553
+    },
+    {
+      "epoch": 0.5249940772328832,
+      "grad_norm": 1.5230791449620116,
+      "learning_rate": 8.650304252076704e-06,
+      "loss": 0.9359,
+      "step": 554
+    },
+    {
+      "epoch": 0.5259417199715707,
+      "grad_norm": 1.2812899118028946,
+      "learning_rate": 8.645053351948594e-06,
+      "loss": 0.8863,
+      "step": 555
+    },
+    {
+      "epoch": 0.5268893627102582,
+      "grad_norm": 1.1090479481617728,
+      "learning_rate": 8.63979385788101e-06,
+      "loss": 0.9549,
+      "step": 556
+    },
+    {
+      "epoch": 0.5278370054489457,
+      "grad_norm": 1.0243309497173194,
+      "learning_rate": 8.63452578227426e-06,
+      "loss": 0.8837,
+      "step": 557
+    },
+    {
+      "epoch": 0.5287846481876333,
+      "grad_norm": 1.1652281440552321,
+      "learning_rate": 8.629249137548873e-06,
+      "loss": 0.8833,
+      "step": 558
+    },
+    {
+      "epoch": 0.5297322909263208,
+      "grad_norm": 1.0941817825792766,
+      "learning_rate": 8.6239639361456e-06,
+      "loss": 0.9423,
+      "step": 559
+    },
+    {
+      "epoch": 0.5306799336650083,
+      "grad_norm": 1.2574492154883083,
+      "learning_rate": 8.61867019052535e-06,
+      "loss": 0.9524,
+      "step": 560
+    },
+    {
+      "epoch": 0.5316275764036958,
+      "grad_norm": 1.1528975788038949,
+      "learning_rate": 8.613367913169188e-06,
+      "loss": 0.8843,
+      "step": 561
+    },
+    {
+      "epoch": 0.5325752191423834,
+      "grad_norm": 1.260334993982276,
+      "learning_rate": 8.608057116578283e-06,
+      "loss": 0.9527,
+      "step": 562
+    },
+    {
+      "epoch": 0.5335228618810708,
+      "grad_norm": 1.0336321970328701,
+      "learning_rate": 8.602737813273901e-06,
+      "loss": 0.885,
+      "step": 563
+    },
+    {
+      "epoch": 0.5344705046197583,
+      "grad_norm": 1.4071128107796536,
+      "learning_rate": 8.597410015797358e-06,
+      "loss": 0.9056,
+      "step": 564
+    },
+    {
+      "epoch": 0.5354181473584458,
+      "grad_norm": 1.3243499763253614,
+      "learning_rate": 8.592073736709996e-06,
+      "loss": 0.9816,
+      "step": 565
+    },
+    {
+      "epoch": 0.5363657900971334,
+      "grad_norm": 1.0252110946238864,
+      "learning_rate": 8.586728988593158e-06,
+      "loss": 0.8939,
+      "step": 566
+    },
+    {
+      "epoch": 0.5373134328358209,
+      "grad_norm": 1.5674480203253196,
+      "learning_rate": 8.581375784048154e-06,
+      "loss": 0.8716,
+      "step": 567
+    },
+    {
+      "epoch": 0.5382610755745084,
+      "grad_norm": 1.3373495536241256,
+      "learning_rate": 8.576014135696227e-06,
+      "loss": 0.9189,
+      "step": 568
+    },
+    {
+      "epoch": 0.539208718313196,
+      "grad_norm": 1.0083923948069164,
+      "learning_rate": 8.570644056178533e-06,
+      "loss": 0.8696,
+      "step": 569
+    },
+    {
+      "epoch": 0.5401563610518835,
+      "grad_norm": 1.134010279426964,
+      "learning_rate": 8.565265558156101e-06,
+      "loss": 0.9171,
+      "step": 570
+    },
+    {
+      "epoch": 0.541104003790571,
+      "grad_norm": 1.0122940996397913,
+      "learning_rate": 8.559878654309818e-06,
+      "loss": 0.8536,
+      "step": 571
+    },
+    {
+      "epoch": 0.5420516465292585,
+      "grad_norm": 1.0417709805855406,
+      "learning_rate": 8.554483357340379e-06,
+      "loss": 0.8757,
+      "step": 572
+    },
+    {
+      "epoch": 0.5420516465292585,
+      "eval_loss": 0.9370559453964233,
+      "eval_runtime": 65.2018,
+      "eval_samples_per_second": 41.839,
+      "eval_steps_per_second": 0.659,
+      "step": 572
+    },
+    {
+      "epoch": 0.5429992892679459,
+      "grad_norm": 1.098518656213201,
+      "learning_rate": 8.549079679968272e-06,
+      "loss": 0.8879,
+      "step": 573
+    },
+    {
+      "epoch": 0.5439469320066335,
+      "grad_norm": 1.212951157381051,
+      "learning_rate": 8.543667634933743e-06,
+      "loss": 0.8697,
+      "step": 574
+    },
+    {
+      "epoch": 0.544894574745321,
+      "grad_norm": 1.3330907351600239,
+      "learning_rate": 8.538247234996766e-06,
+      "loss": 0.8615,
+      "step": 575
+    },
+    {
+      "epoch": 0.5458422174840085,
+      "grad_norm": 1.2057308113799874,
+      "learning_rate": 8.532818492937014e-06,
+      "loss": 0.9033,
+      "step": 576
+    },
+    {
+      "epoch": 0.546789860222696,
+      "grad_norm": 1.1709128709827088,
+      "learning_rate": 8.52738142155383e-06,
+      "loss": 0.9136,
+      "step": 577
+    },
+    {
+      "epoch": 0.5477375029613836,
+      "grad_norm": 1.1465991381882117,
+      "learning_rate": 8.521936033666187e-06,
+      "loss": 0.9102,
+      "step": 578
+    },
+    {
+      "epoch": 0.5486851457000711,
+      "grad_norm": 1.4618014976340794,
+      "learning_rate": 8.51648234211268e-06,
+      "loss": 0.8733,
+      "step": 579
+    },
+    {
+      "epoch": 0.5496327884387586,
+      "grad_norm": 1.449685521781311,
+      "learning_rate": 8.511020359751467e-06,
+      "loss": 0.9106,
+      "step": 580
+    },
+    {
+      "epoch": 0.5505804311774462,
+      "grad_norm": 1.0171758381766154,
+      "learning_rate": 8.505550099460264e-06,
+      "loss": 0.9353,
+      "step": 581
+    },
+    {
+      "epoch": 0.5515280739161336,
+      "grad_norm": 1.290290565129861,
+      "learning_rate": 8.500071574136297e-06,
+      "loss": 0.837,
+      "step": 582
+    },
+    {
+      "epoch": 0.5524757166548211,
+      "grad_norm": 1.1275094814541378,
+      "learning_rate": 8.49458479669628e-06,
+      "loss": 0.9316,
+      "step": 583
+    },
+    {
+      "epoch": 0.5534233593935086,
+      "grad_norm": 1.762720464593278,
+      "learning_rate": 8.489089780076387e-06,
+      "loss": 0.9394,
+      "step": 584
+    },
+    {
+      "epoch": 0.5543710021321961,
+      "grad_norm": 1.227259697952017,
+      "learning_rate": 8.483586537232212e-06,
+      "loss": 0.8798,
+      "step": 585
+    },
+    {
+      "epoch": 0.5553186448708837,
+      "grad_norm": 1.1785938474090234,
+      "learning_rate": 8.478075081138746e-06,
+      "loss": 0.9288,
+      "step": 586
+    },
+    {
+      "epoch": 0.5562662876095712,
+      "grad_norm": 1.1067839714490098,
+      "learning_rate": 8.472555424790348e-06,
+      "loss": 0.833,
+      "step": 587
+    },
+    {
+      "epoch": 0.5572139303482587,
+      "grad_norm": 1.1232716949263366,
+      "learning_rate": 8.467027581200702e-06,
+      "loss": 0.9166,
+      "step": 588
+    },
+    {
+      "epoch": 0.5581615730869462,
+      "grad_norm": 1.261715047880492,
+      "learning_rate": 8.461491563402807e-06,
+      "loss": 0.9618,
+      "step": 589
+    },
+    {
+      "epoch": 0.5591092158256338,
+      "grad_norm": 1.1832942718518242,
+      "learning_rate": 8.455947384448926e-06,
+      "loss": 0.8843,
+      "step": 590
+    },
+    {
+      "epoch": 0.5600568585643213,
+      "grad_norm": 1.1707357848301445,
+      "learning_rate": 8.450395057410561e-06,
+      "loss": 0.8667,
+      "step": 591
+    },
+    {
+      "epoch": 0.5610045013030087,
+      "grad_norm": 1.051280206948217,
+      "learning_rate": 8.444834595378434e-06,
+      "loss": 0.9182,
+      "step": 592
+    },
+    {
+      "epoch": 0.5619521440416962,
+      "grad_norm": 1.5197971665007415,
+      "learning_rate": 8.43926601146244e-06,
+      "loss": 0.9023,
+      "step": 593
+    },
+    {
+      "epoch": 0.5628997867803838,
+      "grad_norm": 1.2707540574566858,
+      "learning_rate": 8.433689318791628e-06,
+      "loss": 0.936,
+      "step": 594
+    },
+    {
+      "epoch": 0.5628997867803838,
+      "eval_loss": 0.9368069767951965,
+      "eval_runtime": 59.0081,
+      "eval_samples_per_second": 46.231,
+      "eval_steps_per_second": 0.729,
+      "step": 594
+    },
+    {
+      "epoch": 0.5638474295190713,
+      "grad_norm": 1.1108224917689546,
+      "learning_rate": 8.428104530514156e-06,
+      "loss": 0.853,
+      "step": 595
+    },
+    {
+      "epoch": 0.5647950722577588,
+      "grad_norm": 1.039118804478871,
+      "learning_rate": 8.42251165979728e-06,
+      "loss": 0.9154,
+      "step": 596
+    },
+    {
+      "epoch": 0.5657427149964463,
+      "grad_norm": 1.0970139269789967,
+      "learning_rate": 8.416910719827304e-06,
+      "loss": 0.9166,
+      "step": 597
+    },
+    {
+      "epoch": 0.5666903577351339,
+      "grad_norm": 1.0306693005295113,
+      "learning_rate": 8.411301723809563e-06,
+      "loss": 0.9061,
+      "step": 598
+    },
+    {
+      "epoch": 0.5676380004738214,
+      "grad_norm": 2.3153529152284746,
+      "learning_rate": 8.405684684968383e-06,
+      "loss": 0.9242,
+      "step": 599
+    },
+    {
+      "epoch": 0.5685856432125089,
+      "grad_norm": 1.1053912735757487,
+      "learning_rate": 8.400059616547046e-06,
+      "loss": 0.8394,
+      "step": 600
+    },
+    {
+      "epoch": 0.5695332859511963,
+      "grad_norm": 1.2091214685314218,
+      "learning_rate": 8.394426531807777e-06,
+      "loss": 0.9289,
+      "step": 601
+    },
+    {
+      "epoch": 0.5704809286898839,
+      "grad_norm": 1.1879706774303542,
+      "learning_rate": 8.388785444031695e-06,
+      "loss": 0.9362,
+      "step": 602
+    },
+    {
+      "epoch": 0.5714285714285714,
+      "grad_norm": 1.2386582317865258,
+      "learning_rate": 8.383136366518788e-06,
+      "loss": 0.9061,
+      "step": 603
+    },
+    {
+      "epoch": 0.5723762141672589,
+      "grad_norm": 1.027254780148272,
+      "learning_rate": 8.37747931258788e-06,
+      "loss": 0.9291,
+      "step": 604
+    },
+    {
+      "epoch": 0.5733238569059464,
+      "grad_norm": 1.2061736249322361,
+      "learning_rate": 8.371814295576604e-06,
+      "loss": 0.9435,
+      "step": 605
+    },
+    {
+      "epoch": 0.574271499644634,
+      "grad_norm": 1.1051297934960431,
+      "learning_rate": 8.366141328841367e-06,
+      "loss": 0.9444,
+      "step": 606
+    },
+    {
+      "epoch": 0.5752191423833215,
+      "grad_norm": 1.0492890936420853,
+      "learning_rate": 8.360460425757316e-06,
+      "loss": 0.8896,
+      "step": 607
+    },
+    {
+      "epoch": 0.576166785122009,
+      "grad_norm": 1.1855288112590538,
+      "learning_rate": 8.354771599718313e-06,
+      "loss": 0.9024,
+      "step": 608
+    },
+    {
+      "epoch": 0.5771144278606966,
+      "grad_norm": 1.0894896483096521,
+      "learning_rate": 8.349074864136897e-06,
+      "loss": 0.8718,
+      "step": 609
+    },
+    {
+      "epoch": 0.5780620705993841,
+      "grad_norm": 1.1673204787473177,
+      "learning_rate": 8.34337023244426e-06,
+      "loss": 0.9477,
+      "step": 610
+    },
+    {
+      "epoch": 0.5790097133380715,
+      "grad_norm": 1.1746428108459406,
+      "learning_rate": 8.33765771809021e-06,
+      "loss": 0.9633,
+      "step": 611
+    },
+    {
+      "epoch": 0.579957356076759,
+      "grad_norm": 1.6815219702121096,
+      "learning_rate": 8.331937334543132e-06,
+      "loss": 0.9357,
+      "step": 612
+    },
+    {
+      "epoch": 0.5809049988154465,
+      "grad_norm": 1.284563514540576,
+      "learning_rate": 8.326209095289973e-06,
+      "loss": 0.9576,
+      "step": 613
+    },
+    {
+      "epoch": 0.5818526415541341,
+      "grad_norm": 1.1141153791855245,
+      "learning_rate": 8.320473013836197e-06,
+      "loss": 0.9207,
+      "step": 614
+    },
+    {
+      "epoch": 0.5828002842928216,
+      "grad_norm": 1.0820567139576633,
+      "learning_rate": 8.314729103705758e-06,
+      "loss": 0.8984,
+      "step": 615
+    },
+    {
+      "epoch": 0.5837479270315091,
+      "grad_norm": 1.0636345740480533,
+      "learning_rate": 8.308977378441072e-06,
+      "loss": 0.9086,
+      "step": 616
+    },
+    {
+      "epoch": 0.5837479270315091,
+      "eval_loss": 0.9341678619384766,
+      "eval_runtime": 65.7189,
+      "eval_samples_per_second": 41.51,
+      "eval_steps_per_second": 0.654,
+      "step": 616
+    },
+    {
+      "epoch": 0.5846955697701967,
+      "grad_norm": 1.3632356445316784,
+      "learning_rate": 8.303217851602973e-06,
+      "loss": 0.8918,
+      "step": 617
+    },
+    {
+      "epoch": 0.5856432125088842,
+      "grad_norm": 1.1417039643528692,
+      "learning_rate": 8.297450536770697e-06,
+      "loss": 0.8531,
+      "step": 618
+    },
+    {
+      "epoch": 0.5865908552475717,
+      "grad_norm": 1.03859128947666,
+      "learning_rate": 8.291675447541834e-06,
+      "loss": 0.8609,
+      "step": 619
+    },
+    {
+      "epoch": 0.5875384979862592,
+      "grad_norm": 1.2256793137128281,
+      "learning_rate": 8.285892597532311e-06,
+      "loss": 0.9384,
+      "step": 620
+    },
+    {
+      "epoch": 0.5884861407249466,
+      "grad_norm": 1.1848786072557997,
+      "learning_rate": 8.280102000376346e-06,
+      "loss": 0.8621,
+      "step": 621
+    },
+    {
+      "epoch": 0.5894337834636342,
+      "grad_norm": 1.0897670274263946,
+      "learning_rate": 8.274303669726427e-06,
+      "loss": 0.8895,
+      "step": 622
+    },
+    {
+      "epoch": 0.5903814262023217,
+      "grad_norm": 1.2338961521515757,
+      "learning_rate": 8.268497619253273e-06,
+      "loss": 0.9397,
+      "step": 623
+    },
+    {
+      "epoch": 0.5913290689410092,
+      "grad_norm": 1.1260558549955006,
+      "learning_rate": 8.262683862645804e-06,
+      "loss": 0.8779,
+      "step": 624
+    },
+    {
+      "epoch": 0.5922767116796968,
+      "grad_norm": 1.0331575412446614,
+      "learning_rate": 8.256862413611113e-06,
+      "loss": 0.912,
+      "step": 625
+    },
+    {
+      "epoch": 0.5932243544183843,
+      "grad_norm": 1.192716956765876,
+      "learning_rate": 8.25103328587442e-06,
+      "loss": 0.8503,
+      "step": 626
+    },
+    {
+      "epoch": 0.5941719971570718,
+      "grad_norm": 1.1264420514270421,
+      "learning_rate": 8.245196493179061e-06,
+      "loss": 0.968,
+      "step": 627
+    },
+    {
+      "epoch": 0.5951196398957593,
+      "grad_norm": 1.1664248935291284,
+      "learning_rate": 8.239352049286435e-06,
+      "loss": 0.9293,
+      "step": 628
+    },
+    {
+      "epoch": 0.5960672826344469,
+      "grad_norm": 1.165344238639824,
+      "learning_rate": 8.233499967975981e-06,
+      "loss": 0.9285,
+      "step": 629
+    },
+    {
+      "epoch": 0.5970149253731343,
+      "grad_norm": 1.1132735526032906,
+      "learning_rate": 8.22764026304515e-06,
+      "loss": 0.8583,
+      "step": 630
+    },
+    {
+      "epoch": 0.5979625681118218,
+      "grad_norm": 1.2263330088822129,
+      "learning_rate": 8.221772948309363e-06,
+      "loss": 0.8848,
+      "step": 631
+    },
+    {
+      "epoch": 0.5989102108505093,
+      "grad_norm": 1.242835973780116,
+      "learning_rate": 8.215898037601981e-06,
+      "loss": 0.9078,
+      "step": 632
+    },
+    {
+      "epoch": 0.5998578535891969,
+      "grad_norm": 1.0611322995754056,
+      "learning_rate": 8.210015544774279e-06,
+      "loss": 0.9158,
+      "step": 633
+    },
+    {
+      "epoch": 0.6008054963278844,
+      "grad_norm": 1.0776982828638144,
+      "learning_rate": 8.204125483695403e-06,
+      "loss": 0.8951,
+      "step": 634
+    },
+    {
+      "epoch": 0.6017531390665719,
+      "grad_norm": 1.1010692683885481,
+      "learning_rate": 8.198227868252348e-06,
+      "loss": 0.8796,
+      "step": 635
+    },
+    {
+      "epoch": 0.6027007818052594,
+      "grad_norm": 1.1791589543105867,
+      "learning_rate": 8.192322712349917e-06,
+      "loss": 0.8649,
+      "step": 636
+    },
+    {
+      "epoch": 0.603648424543947,
+      "grad_norm": 1.0601001331133804,
+      "learning_rate": 8.186410029910694e-06,
+      "loss": 0.9523,
+      "step": 637
+    },
+    {
+      "epoch": 0.6045960672826345,
+      "grad_norm": 1.1485122140349338,
+      "learning_rate": 8.180489834875e-06,
+      "loss": 0.9796,
+      "step": 638
+    },
+    {
+      "epoch": 0.6045960672826345,
+      "eval_loss": 0.9337397813796997,
+      "eval_runtime": 64.0008,
+      "eval_samples_per_second": 42.624,
+      "eval_steps_per_second": 0.672,
+      "step": 638
+    },
+    {
+      "epoch": 0.605543710021322,
+      "grad_norm": 1.164013089234412,
+      "learning_rate": 8.174562141200878e-06,
+      "loss": 0.8544,
+      "step": 639
+    },
+    {
+      "epoch": 0.6064913527600094,
+      "grad_norm": 0.9760399939930886,
+      "learning_rate": 8.168626962864045e-06,
+      "loss": 0.9098,
+      "step": 640
+    },
+    {
+      "epoch": 0.607438995498697,
+      "grad_norm": 1.2834957539805654,
+      "learning_rate": 8.162684313857869e-06,
+      "loss": 0.9297,
+      "step": 641
+    },
+    {
+      "epoch": 0.6083866382373845,
+      "grad_norm": 1.1309875805256084,
+      "learning_rate": 8.156734208193327e-06,
+      "loss": 0.8415,
+      "step": 642
+    },
+    {
+      "epoch": 0.609334280976072,
+      "grad_norm": 1.1022824206738733,
+      "learning_rate": 8.15077665989898e-06,
+      "loss": 0.89,
+      "step": 643
+    },
+    {
+      "epoch": 0.6102819237147595,
+      "grad_norm": 1.024845261979357,
+      "learning_rate": 8.144811683020932e-06,
+      "loss": 0.9135,
+      "step": 644
+    },
+    {
+      "epoch": 0.6112295664534471,
+      "grad_norm": 1.1297286487540084,
+      "learning_rate": 8.138839291622807e-06,
+      "loss": 0.9178,
+      "step": 645
+    },
+    {
+      "epoch": 0.6121772091921346,
+      "grad_norm": 1.0969400851261308,
+      "learning_rate": 8.132859499785708e-06,
+      "loss": 0.8944,
+      "step": 646
+    },
+    {
+      "epoch": 0.6131248519308221,
+      "grad_norm": 1.1361902149740226,
+      "learning_rate": 8.126872321608185e-06,
+      "loss": 0.8428,
+      "step": 647
+    },
+    {
+      "epoch": 0.6140724946695096,
+      "grad_norm": 4.987461142356876,
+      "learning_rate": 8.120877771206201e-06,
+      "loss": 0.9267,
+      "step": 648
+    },
+    {
+      "epoch": 0.6150201374081972,
+      "grad_norm": 1.1003270497805429,
+      "learning_rate": 8.114875862713107e-06,
+      "loss": 0.9126,
+      "step": 649
+    },
+    {
+      "epoch": 0.6159677801468846,
+      "grad_norm": 1.1145863010291335,
+      "learning_rate": 8.108866610279595e-06,
+      "loss": 0.9069,
+      "step": 650
+    },
+    {
+      "epoch": 0.6169154228855721,
+      "grad_norm": 1.070440031809025,
+      "learning_rate": 8.102850028073674e-06,
+      "loss": 0.9805,
+      "step": 651
+    },
+    {
+      "epoch": 0.6178630656242596,
+      "grad_norm": 1.1878084074243875,
+      "learning_rate": 8.09682613028064e-06,
+      "loss": 0.8608,
+      "step": 652
+    },
+    {
+      "epoch": 0.6188107083629472,
+      "grad_norm": 1.1666517787266597,
+      "learning_rate": 8.090794931103026e-06,
+      "loss": 0.8649,
+      "step": 653
+    },
+    {
+      "epoch": 0.6197583511016347,
+      "grad_norm": 1.1109949887709072,
+      "learning_rate": 8.08475644476059e-06,
+      "loss": 0.8555,
+      "step": 654
+    },
+    {
+      "epoch": 0.6207059938403222,
+      "grad_norm": 1.1293611851504917,
+      "learning_rate": 8.078710685490266e-06,
+      "loss": 0.9048,
+      "step": 655
+    },
+    {
+      "epoch": 0.6216536365790097,
+      "grad_norm": 1.0517383761314782,
+      "learning_rate": 8.072657667546136e-06,
+      "loss": 0.8665,
+      "step": 656
+    },
+    {
+      "epoch": 0.6226012793176973,
+      "grad_norm": 1.143650330668765,
+      "learning_rate": 8.066597405199393e-06,
+      "loss": 0.8833,
+      "step": 657
+    },
+    {
+      "epoch": 0.6235489220563848,
+      "grad_norm": 1.2088279253982825,
+      "learning_rate": 8.060529912738316e-06,
+      "loss": 0.9369,
+      "step": 658
+    },
+    {
+      "epoch": 0.6244965647950722,
+      "grad_norm": 1.1771192147073226,
+      "learning_rate": 8.054455204468225e-06,
+      "loss": 0.8912,
+      "step": 659
+    },
+    {
+      "epoch": 0.6254442075337597,
+      "grad_norm": 0.9872215985965054,
+      "learning_rate": 8.048373294711455e-06,
+      "loss": 0.8272,
+      "step": 660
+    },
+    {
+      "epoch": 0.6254442075337597,
+      "eval_loss": 0.9312112927436829,
+      "eval_runtime": 61.3917,
+      "eval_samples_per_second": 44.436,
+      "eval_steps_per_second": 0.7,
+      "step": 660
+    },
+    {
+      "epoch": 0.6263918502724473,
+      "grad_norm": 1.112849369485224,
+      "learning_rate": 8.042284197807323e-06,
+      "loss": 0.8914,
+      "step": 661
+    },
+    {
+      "epoch": 0.6273394930111348,
+      "grad_norm": 1.1777170728187258,
+      "learning_rate": 8.036187928112087e-06,
+      "loss": 0.8983,
+      "step": 662
+    },
+    {
+      "epoch": 0.6282871357498223,
+      "grad_norm": 1.1537880977099835,
+      "learning_rate": 8.030084499998916e-06,
+      "loss": 0.8823,
+      "step": 663
+    },
+    {
+      "epoch": 0.6292347784885098,
+      "grad_norm": 1.1620930961053082,
+      "learning_rate": 8.023973927857857e-06,
+      "loss": 0.9361,
+      "step": 664
+    },
+    {
+      "epoch": 0.6301824212271974,
+      "grad_norm": 1.3868544628160782,
+      "learning_rate": 8.017856226095804e-06,
+      "loss": 0.9183,
+      "step": 665
+    },
+    {
+      "epoch": 0.6311300639658849,
+      "grad_norm": 1.1115391562362278,
+      "learning_rate": 8.011731409136454e-06,
+      "loss": 0.8678,
+      "step": 666
+    },
+    {
+      "epoch": 0.6320777067045724,
+      "grad_norm": 1.1189548409555135,
+      "learning_rate": 8.005599491420288e-06,
+      "loss": 0.9562,
+      "step": 667
+    },
+    {
+      "epoch": 0.6330253494432599,
+      "grad_norm": 1.1551587897622888,
+      "learning_rate": 7.99946048740452e-06,
+      "loss": 0.9742,
+      "step": 668
+    },
+    {
+      "epoch": 0.6339729921819474,
+      "grad_norm": 0.9985847295223576,
+      "learning_rate": 7.993314411563075e-06,
+      "loss": 0.8763,
+      "step": 669
+    },
+    {
+      "epoch": 0.6349206349206349,
+      "grad_norm": 0.9938954837958673,
+      "learning_rate": 7.987161278386555e-06,
+      "loss": 0.8941,
+      "step": 670
+    },
+    {
+      "epoch": 0.6358682776593224,
+      "grad_norm": 1.2517564440987765,
+      "learning_rate": 7.981001102382192e-06,
+      "loss": 0.8922,
+      "step": 671
+    },
+    {
+      "epoch": 0.6368159203980099,
+      "grad_norm": 1.669042851630183,
+      "learning_rate": 7.974833898073832e-06,
+      "loss": 0.8734,
+      "step": 672
+    },
+    {
+      "epoch": 0.6377635631366975,
+      "grad_norm": 1.733742728719525,
+      "learning_rate": 7.968659680001887e-06,
+      "loss": 0.9224,
+      "step": 673
+    },
+    {
+      "epoch": 0.638711205875385,
+      "grad_norm": 1.4086875008087318,
+      "learning_rate": 7.962478462723306e-06,
+      "loss": 0.8862,
+      "step": 674
+    },
+    {
+      "epoch": 0.6396588486140725,
+      "grad_norm": 1.118275938120274,
+      "learning_rate": 7.95629026081154e-06,
+      "loss": 0.9075,
+      "step": 675
+    },
+    {
+      "epoch": 0.64060649135276,
+      "grad_norm": 1.2853033943409442,
+      "learning_rate": 7.950095088856509e-06,
+      "loss": 0.857,
+      "step": 676
+    },
+    {
+      "epoch": 0.6415541340914476,
+      "grad_norm": 1.0400548366155555,
+      "learning_rate": 7.943892961464562e-06,
+      "loss": 0.9434,
+      "step": 677
+    },
+    {
+      "epoch": 0.642501776830135,
+      "grad_norm": 1.3041391262819717,
+      "learning_rate": 7.937683893258454e-06,
+      "loss": 0.9685,
+      "step": 678
+    },
+    {
+      "epoch": 0.6434494195688225,
+      "grad_norm": 1.1598673981069736,
+      "learning_rate": 7.931467898877298e-06,
+      "loss": 0.8632,
+      "step": 679
+    },
+    {
+      "epoch": 0.64439706230751,
+      "grad_norm": 1.0009937654408843,
+      "learning_rate": 7.925244992976538e-06,
+      "loss": 0.8824,
+      "step": 680
+    },
+    {
+      "epoch": 0.6453447050461976,
+      "grad_norm": 1.017837058069719,
+      "learning_rate": 7.919015190227919e-06,
+      "loss": 0.8505,
+      "step": 681
+    },
+    {
+      "epoch": 0.6462923477848851,
+      "grad_norm": 1.1641241757138032,
+      "learning_rate": 7.912778505319436e-06,
+      "loss": 0.8432,
+      "step": 682
+    },
+    {
+      "epoch": 0.6462923477848851,
+      "eval_loss": 0.9309917688369751,
+      "eval_runtime": 60.3302,
+      "eval_samples_per_second": 45.218,
+      "eval_steps_per_second": 0.713,
+      "step": 682
+    },
+    {
+      "epoch": 0.6472399905235726,
+      "grad_norm": 1.1169534397607939,
+      "learning_rate": 7.906534952955321e-06,
+      "loss": 0.9085,
+      "step": 683
+    },
+    {
+      "epoch": 0.6481876332622601,
+      "grad_norm": 1.0900655365751404,
+      "learning_rate": 7.900284547855992e-06,
+      "loss": 0.9411,
+      "step": 684
+    },
+    {
+      "epoch": 0.6491352760009477,
+      "grad_norm": 1.2161831558733007,
+      "learning_rate": 7.894027304758023e-06,
+      "loss": 0.8769,
+      "step": 685
+    },
+    {
+      "epoch": 0.6500829187396352,
+      "grad_norm": 1.044138736537594,
+      "learning_rate": 7.88776323841411e-06,
+      "loss": 0.9436,
+      "step": 686
+    },
+    {
+      "epoch": 0.6510305614783227,
+      "grad_norm": 1.0705430979469939,
+      "learning_rate": 7.88149236359304e-06,
+      "loss": 0.8941,
+      "step": 687
+    },
+    {
+      "epoch": 0.6519782042170101,
+      "grad_norm": 1.3845056323680385,
+      "learning_rate": 7.875214695079647e-06,
+      "loss": 0.9501,
+      "step": 688
+    },
+    {
+      "epoch": 0.6529258469556977,
+      "grad_norm": 1.0170616350914143,
+      "learning_rate": 7.868930247674787e-06,
+      "loss": 0.9,
+      "step": 689
+    },
+    {
+      "epoch": 0.6538734896943852,
+      "grad_norm": 1.0921009934181993,
+      "learning_rate": 7.862639036195298e-06,
+      "loss": 0.9174,
+      "step": 690
+    },
+    {
+      "epoch": 0.6548211324330727,
+      "grad_norm": 1.218634642701156,
+      "learning_rate": 7.856341075473963e-06,
+      "loss": 0.9376,
+      "step": 691
+    },
+    {
+      "epoch": 0.6557687751717602,
+      "grad_norm": 0.9907566710047155,
+      "learning_rate": 7.850036380359479e-06,
+      "loss": 0.8849,
+      "step": 692
+    },
+    {
+      "epoch": 0.6567164179104478,
+      "grad_norm": 1.0543716934739653,
+      "learning_rate": 7.843724965716419e-06,
+      "loss": 0.9345,
+      "step": 693
+    },
+    {
+      "epoch": 0.6576640606491353,
+      "grad_norm": 0.9814925522801817,
+      "learning_rate": 7.837406846425205e-06,
+      "loss": 0.8675,
+      "step": 694
+    },
+    {
+      "epoch": 0.6586117033878228,
+      "grad_norm": 1.192089061098573,
+      "learning_rate": 7.831082037382057e-06,
+      "loss": 0.9501,
+      "step": 695
+    },
+    {
+      "epoch": 0.6595593461265103,
+      "grad_norm": 1.1020254975949058,
+      "learning_rate": 7.824750553498977e-06,
+      "loss": 0.9811,
+      "step": 696
+    },
+    {
+      "epoch": 0.6605069888651979,
+      "grad_norm": 1.1111842691891292,
+      "learning_rate": 7.818412409703695e-06,
+      "loss": 0.9328,
+      "step": 697
+    },
+    {
+      "epoch": 0.6614546316038853,
+      "grad_norm": 1.1347234782175453,
+      "learning_rate": 7.812067620939653e-06,
+      "loss": 0.9614,
+      "step": 698
+    },
+    {
+      "epoch": 0.6624022743425728,
+      "grad_norm": 1.065352923892995,
+      "learning_rate": 7.805716202165949e-06,
+      "loss": 0.8818,
+      "step": 699
+    },
+    {
+      "epoch": 0.6633499170812603,
+      "grad_norm": 1.1030004540487208,
+      "learning_rate": 7.799358168357323e-06,
+      "loss": 0.8465,
+      "step": 700
+    },
+    {
+      "epoch": 0.6642975598199479,
+      "grad_norm": 0.9089874996068653,
+      "learning_rate": 7.792993534504103e-06,
+      "loss": 0.9243,
+      "step": 701
+    },
+    {
+      "epoch": 0.6652452025586354,
+      "grad_norm": 1.2054269196194143,
+      "learning_rate": 7.786622315612182e-06,
+      "loss": 0.8688,
+      "step": 702
+    },
+    {
+      "epoch": 0.6661928452973229,
+      "grad_norm": 1.2746014427627002,
+      "learning_rate": 7.78024452670298e-06,
+      "loss": 0.9181,
+      "step": 703
+    },
+    {
+      "epoch": 0.6671404880360104,
+      "grad_norm": 1.077600022154621,
+      "learning_rate": 7.773860182813404e-06,
+      "loss": 0.8492,
+      "step": 704
+    },
+    {
+      "epoch": 0.6671404880360104,
+      "eval_loss": 0.928534984588623,
+      "eval_runtime": 64.6468,
+      "eval_samples_per_second": 42.199,
+      "eval_steps_per_second": 0.665,
+      "step": 704
+    },
+    {
+      "epoch": 0.668088130774698,
+      "grad_norm": 1.033501298641634,
+      "learning_rate": 7.767469298995813e-06,
+      "loss": 0.8854,
+      "step": 705
+    },
+    {
+      "epoch": 0.6690357735133855,
+      "grad_norm": 0.9867061772593498,
+      "learning_rate": 7.761071890317994e-06,
+      "loss": 0.8431,
+      "step": 706
+    },
+    {
+      "epoch": 0.6699834162520729,
+      "grad_norm": 1.1600752814036814,
+      "learning_rate": 7.754667971863112e-06,
+      "loss": 0.9133,
+      "step": 707
+    },
+    {
+      "epoch": 0.6709310589907604,
+      "grad_norm": 1.1443042381662363,
+      "learning_rate": 7.748257558729677e-06,
+      "loss": 0.9184,
+      "step": 708
+    },
+    {
+      "epoch": 0.671878701729448,
+      "grad_norm": 1.1577240117048646,
+      "learning_rate": 7.741840666031517e-06,
+      "loss": 0.8738,
+      "step": 709
+    },
+    {
+      "epoch": 0.6728263444681355,
+      "grad_norm": 1.308361165359508,
+      "learning_rate": 7.735417308897737e-06,
+      "loss": 0.8414,
+      "step": 710
+    },
+    {
+      "epoch": 0.673773987206823,
+      "grad_norm": 1.1739658962601087,
+      "learning_rate": 7.728987502472678e-06,
+      "loss": 0.8551,
+      "step": 711
+    },
+    {
+      "epoch": 0.6747216299455105,
+      "grad_norm": 1.0699527544498066,
+      "learning_rate": 7.72255126191589e-06,
+      "loss": 0.8514,
+      "step": 712
+    },
+    {
+      "epoch": 0.6756692726841981,
+      "grad_norm": 1.481959370807152,
+      "learning_rate": 7.716108602402094e-06,
+      "loss": 0.8944,
+      "step": 713
+    },
+    {
+      "epoch": 0.6766169154228856,
+      "grad_norm": 1.1240913687798249,
+      "learning_rate": 7.709659539121144e-06,
+      "loss": 0.8578,
+      "step": 714
+    },
+    {
+      "epoch": 0.6775645581615731,
+      "grad_norm": 1.1176326008647557,
+      "learning_rate": 7.703204087277989e-06,
+      "loss": 0.9374,
+      "step": 715
+    },
+    {
+      "epoch": 0.6785122009002607,
+      "grad_norm": 1.2863389842208885,
+      "learning_rate": 7.696742262092643e-06,
+      "loss": 0.8846,
+      "step": 716
+    },
+    {
+      "epoch": 0.6794598436389481,
+      "grad_norm": 1.3631041557393186,
+      "learning_rate": 7.690274078800148e-06,
+      "loss": 0.8766,
+      "step": 717
+    },
+    {
+      "epoch": 0.6804074863776356,
+      "grad_norm": 1.4470714060969805,
+      "learning_rate": 7.683799552650534e-06,
+      "loss": 0.9231,
+      "step": 718
+    },
+    {
+      "epoch": 0.6813551291163231,
+      "grad_norm": 1.1150660700565789,
+      "learning_rate": 7.677318698908788e-06,
+      "loss": 0.8391,
+      "step": 719
+    },
+    {
+      "epoch": 0.6823027718550106,
+      "grad_norm": 1.1713456925403845,
+      "learning_rate": 7.670831532854811e-06,
+      "loss": 0.9214,
+      "step": 720
+    },
+    {
+      "epoch": 0.6832504145936982,
+      "grad_norm": 1.219437200101105,
+      "learning_rate": 7.66433806978339e-06,
+      "loss": 0.8944,
+      "step": 721
+    },
+    {
+      "epoch": 0.6841980573323857,
+      "grad_norm": 1.2287162936444869,
+      "learning_rate": 7.65783832500416e-06,
+      "loss": 0.8854,
+      "step": 722
+    },
+    {
+      "epoch": 0.6851457000710732,
+      "grad_norm": 1.344476187588291,
+      "learning_rate": 7.651332313841562e-06,
+      "loss": 0.8488,
+      "step": 723
+    },
+    {
+      "epoch": 0.6860933428097608,
+      "grad_norm": 1.0371186151252827,
+      "learning_rate": 7.644820051634813e-06,
+      "loss": 0.8473,
+      "step": 724
+    },
+    {
+      "epoch": 0.6870409855484483,
+      "grad_norm": 1.1863536224030726,
+      "learning_rate": 7.638301553737871e-06,
+      "loss": 0.9155,
+      "step": 725
+    },
+    {
+      "epoch": 0.6879886282871357,
+      "grad_norm": 1.200025733783506,
+      "learning_rate": 7.63177683551939e-06,
+      "loss": 0.8828,
+      "step": 726
+    },
+    {
+      "epoch": 0.6879886282871357,
+      "eval_loss": 0.9281173944473267,
+      "eval_runtime": 65.2556,
+      "eval_samples_per_second": 41.805,
+      "eval_steps_per_second": 0.659,
+      "step": 726
+    },
+    {
+      "epoch": 0.6889362710258232,
+      "grad_norm": 1.1992637929718537,
+      "learning_rate": 7.625245912362699e-06,
+      "loss": 0.87,
+      "step": 727
+    },
+    {
+      "epoch": 0.6898839137645107,
+      "grad_norm": 1.0955940037433252,
+      "learning_rate": 7.618708799665745e-06,
+      "loss": 0.8636,
+      "step": 728
+    },
+    {
+      "epoch": 0.6908315565031983,
+      "grad_norm": 1.0564308744295217,
+      "learning_rate": 7.612165512841076e-06,
+      "loss": 0.9153,
+      "step": 729
+    },
+    {
+      "epoch": 0.6917791992418858,
+      "grad_norm": 1.2470763038912207,
+      "learning_rate": 7.605616067315793e-06,
+      "loss": 0.9199,
+      "step": 730
+    },
+    {
+      "epoch": 0.6927268419805733,
+      "grad_norm": 1.2122791323172828,
+      "learning_rate": 7.599060478531519e-06,
+      "loss": 0.9248,
+      "step": 731
+    },
+    {
+      "epoch": 0.6936744847192609,
+      "grad_norm": 1.097709504783866,
+      "learning_rate": 7.592498761944363e-06,
+      "loss": 0.8689,
+      "step": 732
+    },
+    {
+      "epoch": 0.6946221274579484,
+      "grad_norm": 1.230662404513117,
+      "learning_rate": 7.585930933024874e-06,
+      "loss": 0.9021,
+      "step": 733
+    },
+    {
+      "epoch": 0.6955697701966359,
+      "grad_norm": 1.0146506247183977,
+      "learning_rate": 7.579357007258022e-06,
+      "loss": 0.9065,
+      "step": 734
+    },
+    {
+      "epoch": 0.6965174129353234,
+      "grad_norm": 1.1522681830781554,
+      "learning_rate": 7.572777000143145e-06,
+      "loss": 0.8689,
+      "step": 735
+    },
+    {
+      "epoch": 0.6974650556740108,
+      "grad_norm": 1.1876718763825516,
+      "learning_rate": 7.56619092719392e-06,
+      "loss": 0.8553,
+      "step": 736
+    },
+    {
+      "epoch": 0.6984126984126984,
+      "grad_norm": 1.254710090679274,
+      "learning_rate": 7.559598803938328e-06,
+      "loss": 0.8994,
+      "step": 737
+    },
+    {
+      "epoch": 0.6993603411513859,
+      "grad_norm": 1.3772630607131078,
+      "learning_rate": 7.5530006459186115e-06,
+      "loss": 0.9072,
+      "step": 738
+    },
+    {
+      "epoch": 0.7003079838900734,
+      "grad_norm": 1.2658717437982785,
+      "learning_rate": 7.546396468691241e-06,
+      "loss": 0.8704,
+      "step": 739
+    },
+    {
+      "epoch": 0.701255626628761,
+      "grad_norm": 1.051563030074167,
+      "learning_rate": 7.539786287826885e-06,
+      "loss": 0.9211,
+      "step": 740
+    },
+    {
+      "epoch": 0.7022032693674485,
+      "grad_norm": 1.0078569587527633,
+      "learning_rate": 7.533170118910356e-06,
+      "loss": 0.8865,
+      "step": 741
+    },
+    {
+      "epoch": 0.703150912106136,
+      "grad_norm": 1.2344336456961045,
+      "learning_rate": 7.526547977540592e-06,
+      "loss": 0.9072,
+      "step": 742
+    },
+    {
+      "epoch": 0.7040985548448235,
+      "grad_norm": 1.1416986384728778,
+      "learning_rate": 7.5199198793306135e-06,
+      "loss": 0.873,
+      "step": 743
+    },
+    {
+      "epoch": 0.7050461975835111,
+      "grad_norm": 1.0678982977792042,
+      "learning_rate": 7.51328583990748e-06,
+      "loss": 0.913,
+      "step": 744
+    },
+    {
+      "epoch": 0.7059938403221986,
+      "grad_norm": 1.1437093215592007,
+      "learning_rate": 7.506645874912264e-06,
+      "loss": 0.9799,
+      "step": 745
+    },
+    {
+      "epoch": 0.706941483060886,
+      "grad_norm": 1.0498302149495389,
+      "learning_rate": 7.500000000000001e-06,
+      "loss": 0.9027,
+      "step": 746
+    },
+    {
+      "epoch": 0.7078891257995735,
+      "grad_norm": 2.056830676476333,
+      "learning_rate": 7.4933482308396686e-06,
+      "loss": 0.8287,
+      "step": 747
+    },
+    {
+      "epoch": 0.708836768538261,
+      "grad_norm": 1.053677680621179,
+      "learning_rate": 7.486690583114137e-06,
+      "loss": 0.9333,
+      "step": 748
+    },
+    {
+      "epoch": 0.708836768538261,
+      "eval_loss": 0.92720627784729,
+      "eval_runtime": 65.9587,
+      "eval_samples_per_second": 41.359,
+      "eval_steps_per_second": 0.652,
+      "step": 748
+    },
+    {
+      "epoch": 0.7097844112769486,
+      "grad_norm": 1.5701143667704405,
+      "learning_rate": 7.480027072520137e-06,
+      "loss": 0.8974,
+      "step": 749
+    },
+    {
+      "epoch": 0.7107320540156361,
+      "grad_norm": 1.235022340393013,
+      "learning_rate": 7.473357714768222e-06,
+      "loss": 0.9207,
+      "step": 750
+    },
+    {
+      "epoch": 0.7116796967543236,
+      "grad_norm": 1.1141250146076045,
+      "learning_rate": 7.466682525582732e-06,
+      "loss": 0.8674,
+      "step": 751
+    },
+    {
+      "epoch": 0.7126273394930112,
+      "grad_norm": 1.1886209864097594,
+      "learning_rate": 7.460001520701756e-06,
+      "loss": 0.8858,
+      "step": 752
+    },
+    {
+      "epoch": 0.7135749822316987,
+      "grad_norm": 1.1322038451733678,
+      "learning_rate": 7.453314715877094e-06,
+      "loss": 0.843,
+      "step": 753
+    },
+    {
+      "epoch": 0.7145226249703862,
+      "grad_norm": 1.0033173758679002,
+      "learning_rate": 7.446622126874219e-06,
+      "loss": 0.8674,
+      "step": 754
+    },
+    {
+      "epoch": 0.7154702677090736,
+      "grad_norm": 1.1688006539249634,
+      "learning_rate": 7.439923769472244e-06,
+      "loss": 0.8825,
+      "step": 755
+    },
+    {
+      "epoch": 0.7164179104477612,
+      "grad_norm": 1.306296172244326,
+      "learning_rate": 7.4332196594638815e-06,
+      "loss": 0.9753,
+      "step": 756
+    },
+    {
+      "epoch": 0.7173655531864487,
+      "grad_norm": 0.9730531759417155,
+      "learning_rate": 7.4265098126554065e-06,
+      "loss": 0.8617,
+      "step": 757
+    },
+    {
+      "epoch": 0.7183131959251362,
+      "grad_norm": 1.385852128592582,
+      "learning_rate": 7.419794244866619e-06,
+      "loss": 0.8978,
+      "step": 758
+    },
+    {
+      "epoch": 0.7192608386638237,
+      "grad_norm": 1.0569360851945016,
+      "learning_rate": 7.413072971930807e-06,
+      "loss": 0.9421,
+      "step": 759
+    },
+    {
+      "epoch": 0.7202084814025113,
+      "grad_norm": 0.9964478050464701,
+      "learning_rate": 7.406346009694713e-06,
+      "loss": 0.85,
+      "step": 760
+    },
+    {
+      "epoch": 0.7211561241411988,
+      "grad_norm": 1.193990300075423,
+      "learning_rate": 7.39961337401849e-06,
+      "loss": 0.8751,
+      "step": 761
+    },
+    {
+      "epoch": 0.7221037668798863,
+      "grad_norm": 2.0570268630743156,
+      "learning_rate": 7.3928750807756656e-06,
+      "loss": 0.9026,
+      "step": 762
+    },
+    {
+      "epoch": 0.7230514096185738,
+      "grad_norm": 40.56532136308673,
+      "learning_rate": 7.386131145853111e-06,
+      "loss": 0.858,
+      "step": 763
+    },
+    {
+      "epoch": 0.7239990523572614,
+      "grad_norm": 1.7276587552930305,
+      "learning_rate": 7.379381585150997e-06,
+      "loss": 0.8743,
+      "step": 764
+    },
+    {
+      "epoch": 0.7249466950959488,
+      "grad_norm": 1.1917953134999557,
+      "learning_rate": 7.372626414582754e-06,
+      "loss": 0.9486,
+      "step": 765
+    },
+    {
+      "epoch": 0.7258943378346363,
+      "grad_norm": 1.1432814350506717,
+      "learning_rate": 7.365865650075046e-06,
+      "loss": 0.9477,
+      "step": 766
+    },
+    {
+      "epoch": 0.7268419805733238,
+      "grad_norm": 1.09550223259366,
+      "learning_rate": 7.359099307567721e-06,
+      "loss": 0.9092,
+      "step": 767
+    },
+    {
+      "epoch": 0.7277896233120114,
+      "grad_norm": 1.150998457087419,
+      "learning_rate": 7.352327403013779e-06,
+      "loss": 0.8982,
+      "step": 768
+    },
+    {
+      "epoch": 0.7287372660506989,
+      "grad_norm": 1.108499327493803,
+      "learning_rate": 7.345549952379334e-06,
+      "loss": 0.9682,
+      "step": 769
+    },
+    {
+      "epoch": 0.7296849087893864,
+      "grad_norm": 1.153466896232557,
+      "learning_rate": 7.338766971643579e-06,
+      "loss": 0.8988,
+      "step": 770
+    },
+    {
+      "epoch": 0.7296849087893864,
+      "eval_loss": 0.926575779914856,
+      "eval_runtime": 68.8528,
+      "eval_samples_per_second": 39.621,
+      "eval_steps_per_second": 0.625,
+      "step": 770
+    },
+    {
+      "epoch": 0.7306325515280739,
+      "grad_norm": 1.0796573599853159,
+      "learning_rate": 7.331978476798738e-06,
+      "loss": 0.8149,
+      "step": 771
+    },
+    {
+      "epoch": 0.7315801942667615,
+      "grad_norm": 1.0931539077757213,
+      "learning_rate": 7.325184483850043e-06,
+      "loss": 0.9123,
+      "step": 772
+    },
+    {
+      "epoch": 0.732527837005449,
+      "grad_norm": 1.4182919844521371,
+      "learning_rate": 7.318385008815686e-06,
+      "loss": 0.95,
+      "step": 773
+    },
+    {
+      "epoch": 0.7334754797441365,
+      "grad_norm": 1.2970117028124304,
+      "learning_rate": 7.311580067726783e-06,
+      "loss": 0.8689,
+      "step": 774
+    },
+    {
+      "epoch": 0.7344231224828239,
+      "grad_norm": 1.0927364681801075,
+      "learning_rate": 7.304769676627339e-06,
+      "loss": 0.8769,
+      "step": 775
+    },
+    {
+      "epoch": 0.7353707652215115,
+      "grad_norm": 1.0964389214826673,
+      "learning_rate": 7.297953851574207e-06,
+      "loss": 0.9555,
+      "step": 776
+    },
+    {
+      "epoch": 0.736318407960199,
+      "grad_norm": 1.0832681682992362,
+      "learning_rate": 7.291132608637053e-06,
+      "loss": 0.9345,
+      "step": 777
+    },
+    {
+      "epoch": 0.7372660506988865,
+      "grad_norm": 1.227070674353931,
+      "learning_rate": 7.284305963898315e-06,
+      "loss": 0.8685,
+      "step": 778
+    },
+    {
+      "epoch": 0.738213693437574,
+      "grad_norm": 3.9684909478672976,
+      "learning_rate": 7.27747393345317e-06,
+      "loss": 0.8362,
+      "step": 779
+    },
+    {
+      "epoch": 0.7391613361762616,
+      "grad_norm": 1.4889622029850116,
+      "learning_rate": 7.270636533409491e-06,
+      "loss": 0.9391,
+      "step": 780
+    },
+    {
+      "epoch": 0.7401089789149491,
+      "grad_norm": 1.1829326412392815,
+      "learning_rate": 7.2637937798878085e-06,
+      "loss": 0.9182,
+      "step": 781
+    },
+    {
+      "epoch": 0.7410566216536366,
+      "grad_norm": 1.190511352024864,
+      "learning_rate": 7.25694568902128e-06,
+      "loss": 0.8477,
+      "step": 782
+    },
+    {
+      "epoch": 0.7420042643923241,
+      "grad_norm": 1.3407968776745007,
+      "learning_rate": 7.250092276955642e-06,
+      "loss": 0.8861,
+      "step": 783
+    },
+    {
+      "epoch": 0.7429519071310116,
+      "grad_norm": 1.0306033495447815,
+      "learning_rate": 7.243233559849179e-06,
+      "loss": 0.8723,
+      "step": 784
+    },
+    {
+      "epoch": 0.7438995498696991,
+      "grad_norm": 1.032943199516836,
+      "learning_rate": 7.236369553872684e-06,
+      "loss": 0.8848,
+      "step": 785
+    },
+    {
+      "epoch": 0.7448471926083866,
+      "grad_norm": 1.1806035892508238,
+      "learning_rate": 7.229500275209418e-06,
+      "loss": 0.9254,
+      "step": 786
+    },
+    {
+      "epoch": 0.7457948353470741,
+      "grad_norm": 1.1412881455081196,
+      "learning_rate": 7.222625740055072e-06,
+      "loss": 0.8766,
+      "step": 787
+    },
+    {
+      "epoch": 0.7467424780857617,
+      "grad_norm": 1.1293924713464558,
+      "learning_rate": 7.215745964617737e-06,
+      "loss": 0.8932,
+      "step": 788
+    },
+    {
+      "epoch": 0.7476901208244492,
+      "grad_norm": 1.0502148008128274,
+      "learning_rate": 7.2088609651178505e-06,
+      "loss": 0.8693,
+      "step": 789
+    },
+    {
+      "epoch": 0.7486377635631367,
+      "grad_norm": 1.1691866376909303,
+      "learning_rate": 7.201970757788172e-06,
+      "loss": 0.9133,
+      "step": 790
+    },
+    {
+      "epoch": 0.7495854063018242,
+      "grad_norm": 1.673809188183371,
+      "learning_rate": 7.195075358873738e-06,
+      "loss": 0.8997,
+      "step": 791
+    },
+    {
+      "epoch": 0.7505330490405118,
+      "grad_norm": 1.136212481513878,
+      "learning_rate": 7.188174784631824e-06,
+      "loss": 0.8343,
+      "step": 792
+    },
+    {
+      "epoch": 0.7505330490405118,
+      "eval_loss": 0.925286591053009,
+      "eval_runtime": 67.6399,
+      "eval_samples_per_second": 40.331,
+      "eval_steps_per_second": 0.636,
+      "step": 792
+    },
+    {
+      "epoch": 0.7514806917791993,
+      "grad_norm": 1.17472555336564,
+      "learning_rate": 7.18126905133191e-06,
+      "loss": 0.8889,
+      "step": 793
+    },
+    {
+      "epoch": 0.7524283345178867,
+      "grad_norm": 1.1001788390092748,
+      "learning_rate": 7.174358175255636e-06,
+      "loss": 0.8534,
+      "step": 794
+    },
+    {
+      "epoch": 0.7533759772565742,
+      "grad_norm": 1.465010157671844,
+      "learning_rate": 7.1674421726967704e-06,
+      "loss": 0.8603,
+      "step": 795
+    },
+    {
+      "epoch": 0.7543236199952618,
+      "grad_norm": 1.1953256640932477,
+      "learning_rate": 7.160521059961169e-06,
+      "loss": 0.8345,
+      "step": 796
+    },
+    {
+      "epoch": 0.7552712627339493,
+      "grad_norm": 3.8996540248644043,
+      "learning_rate": 7.153594853366731e-06,
+      "loss": 0.8398,
+      "step": 797
+    },
+    {
+      "epoch": 0.7562189054726368,
+      "grad_norm": 1.245025773648245,
+      "learning_rate": 7.14666356924337e-06,
+      "loss": 0.9068,
+      "step": 798
+    },
+    {
+      "epoch": 0.7571665482113243,
+      "grad_norm": 1.084110814960542,
+      "learning_rate": 7.1397272239329684e-06,
+      "loss": 0.881,
+      "step": 799
+    },
+    {
+      "epoch": 0.7581141909500119,
+      "grad_norm": 1.4178645946238013,
+      "learning_rate": 7.132785833789344e-06,
+      "loss": 0.9458,
+      "step": 800
+    },
+    {
+      "epoch": 0.7590618336886994,
+      "grad_norm": 1.0724390633923235,
+      "learning_rate": 7.125839415178204e-06,
+      "loss": 0.8477,
+      "step": 801
+    },
+    {
+      "epoch": 0.7600094764273869,
+      "grad_norm": 1.1299800999994787,
+      "learning_rate": 7.118887984477116e-06,
+      "loss": 0.8842,
+      "step": 802
+    },
+    {
+      "epoch": 0.7609571191660743,
+      "grad_norm": 1.1509151559671802,
+      "learning_rate": 7.111931558075465e-06,
+      "loss": 0.8459,
+      "step": 803
+    },
+    {
+      "epoch": 0.7619047619047619,
+      "grad_norm": 1.154455319378924,
+      "learning_rate": 7.104970152374405e-06,
+      "loss": 0.9082,
+      "step": 804
+    },
+    {
+      "epoch": 0.7628524046434494,
+      "grad_norm": 1.15539942235384,
+      "learning_rate": 7.098003783786844e-06,
+      "loss": 0.9114,
+      "step": 805
+    },
+    {
+      "epoch": 0.7638000473821369,
+      "grad_norm": 1.1800047790109787,
+      "learning_rate": 7.091032468737382e-06,
+      "loss": 0.8608,
+      "step": 806
+    },
+    {
+      "epoch": 0.7647476901208244,
+      "grad_norm": 1.1477444619292674,
+      "learning_rate": 7.084056223662282e-06,
+      "loss": 0.8842,
+      "step": 807
+    },
+    {
+      "epoch": 0.765695332859512,
+      "grad_norm": 1.1737093276570716,
+      "learning_rate": 7.0770750650094335e-06,
+      "loss": 0.9007,
+      "step": 808
+    },
+    {
+      "epoch": 0.7666429755981995,
+      "grad_norm": 1.0202933754062562,
+      "learning_rate": 7.070089009238306e-06,
+      "loss": 0.9234,
+      "step": 809
+    },
+    {
+      "epoch": 0.767590618336887,
+      "grad_norm": 1.0550807990536413,
+      "learning_rate": 7.063098072819919e-06,
+      "loss": 0.8696,
+      "step": 810
+    },
+    {
+      "epoch": 0.7685382610755745,
+      "grad_norm": 1.0531655916431704,
+      "learning_rate": 7.056102272236799e-06,
+      "loss": 0.8853,
+      "step": 811
+    },
+    {
+      "epoch": 0.7694859038142621,
+      "grad_norm": 1.2354614191985032,
+      "learning_rate": 7.049101623982938e-06,
+      "loss": 0.883,
+      "step": 812
+    },
+    {
+      "epoch": 0.7704335465529495,
+      "grad_norm": 0.9726437070709893,
+      "learning_rate": 7.04209614456376e-06,
+      "loss": 0.9153,
+      "step": 813
+    },
+    {
+      "epoch": 0.771381189291637,
+      "grad_norm": 2.241432047297512,
+      "learning_rate": 7.035085850496079e-06,
+      "loss": 0.94,
+      "step": 814
+    },
+    {
+      "epoch": 0.771381189291637,
+      "eval_loss": 0.9247336387634277,
+      "eval_runtime": 60.7578,
+      "eval_samples_per_second": 44.9,
+      "eval_steps_per_second": 0.708,
+      "step": 814
+    },
+    {
+      "epoch": 0.7723288320303245,
+      "grad_norm": 1.1356145449628114,
+      "learning_rate": 7.028070758308059e-06,
+      "loss": 0.8219,
+      "step": 815
+    },
+    {
+      "epoch": 0.7732764747690121,
+      "grad_norm": 1.1079321851319959,
+      "learning_rate": 7.021050884539178e-06,
+      "loss": 0.8588,
+      "step": 816
+    },
+    {
+      "epoch": 0.7742241175076996,
+      "grad_norm": 1.3040268308315734,
+      "learning_rate": 7.014026245740185e-06,
+      "loss": 0.8419,
+      "step": 817
+    },
+    {
+      "epoch": 0.7751717602463871,
+      "grad_norm": 1.230458455518736,
+      "learning_rate": 7.006996858473068e-06,
+      "loss": 0.9624,
+      "step": 818
+    },
+    {
+      "epoch": 0.7761194029850746,
+      "grad_norm": 0.9567806690635783,
+      "learning_rate": 6.999962739311008e-06,
+      "loss": 0.8194,
+      "step": 819
+    },
+    {
+      "epoch": 0.7770670457237622,
+      "grad_norm": 1.243710347278979,
+      "learning_rate": 6.992923904838341e-06,
+      "loss": 0.8955,
+      "step": 820
+    },
+    {
+      "epoch": 0.7780146884624497,
+      "grad_norm": 1.0335878204965474,
+      "learning_rate": 6.98588037165052e-06,
+      "loss": 0.9045,
+      "step": 821
+    },
+    {
+      "epoch": 0.7789623312011372,
+      "grad_norm": 1.0941302857809432,
+      "learning_rate": 6.97883215635408e-06,
+      "loss": 0.8902,
+      "step": 822
+    },
+    {
+      "epoch": 0.7799099739398246,
+      "grad_norm": 1.123840857900373,
+      "learning_rate": 6.971779275566593e-06,
+      "loss": 0.8913,
+      "step": 823
+    },
+    {
+      "epoch": 0.7808576166785122,
+      "grad_norm": 1.1195400281687917,
+      "learning_rate": 6.96472174591663e-06,
+      "loss": 0.8474,
+      "step": 824
+    },
+    {
+      "epoch": 0.7818052594171997,
+      "grad_norm": 1.3420603070760255,
+      "learning_rate": 6.957659584043724e-06,
+      "loss": 0.9077,
+      "step": 825
+    },
+    {
+      "epoch": 0.7827529021558872,
+      "grad_norm": 1.0041194424246165,
+      "learning_rate": 6.9505928065983275e-06,
+      "loss": 0.9597,
+      "step": 826
+    },
+    {
+      "epoch": 0.7837005448945747,
+      "grad_norm": 1.0270969195404063,
+      "learning_rate": 6.943521430241777e-06,
+      "loss": 0.8403,
+      "step": 827
+    },
+    {
+      "epoch": 0.7846481876332623,
+      "grad_norm": 1.1552904131971864,
+      "learning_rate": 6.936445471646249e-06,
+      "loss": 0.9044,
+      "step": 828
+    },
+    {
+      "epoch": 0.7855958303719498,
+      "grad_norm": 1.304060055980435,
+      "learning_rate": 6.929364947494729e-06,
+      "loss": 0.9,
+      "step": 829
+    },
+    {
+      "epoch": 0.7865434731106373,
+      "grad_norm": 1.210030944302157,
+      "learning_rate": 6.922279874480959e-06,
+      "loss": 0.9113,
+      "step": 830
+    },
+    {
+      "epoch": 0.7874911158493249,
+      "grad_norm": 1.025883162070808,
+      "learning_rate": 6.915190269309416e-06,
+      "loss": 0.9074,
+      "step": 831
+    },
+    {
+      "epoch": 0.7884387585880123,
+      "grad_norm": 1.1256642999783826,
+      "learning_rate": 6.908096148695251e-06,
+      "loss": 0.9119,
+      "step": 832
+    },
+    {
+      "epoch": 0.7893864013266998,
+      "grad_norm": 1.0860563647829231,
+      "learning_rate": 6.900997529364269e-06,
+      "loss": 0.9093,
+      "step": 833
+    },
+    {
+      "epoch": 0.7903340440653873,
+      "grad_norm": 1.0010442291598631,
+      "learning_rate": 6.893894428052881e-06,
+      "loss": 0.8858,
+      "step": 834
+    },
+    {
+      "epoch": 0.7912816868040748,
+      "grad_norm": 1.0574820455803635,
+      "learning_rate": 6.886786861508061e-06,
+      "loss": 0.8924,
+      "step": 835
+    },
+    {
+      "epoch": 0.7922293295427624,
+      "grad_norm": 1.2017601371365032,
+      "learning_rate": 6.879674846487314e-06,
+      "loss": 0.8959,
+      "step": 836
+    },
+    {
+      "epoch": 0.7922293295427624,
+      "eval_loss": 0.9229027628898621,
+      "eval_runtime": 61.6949,
+      "eval_samples_per_second": 44.218,
+      "eval_steps_per_second": 0.697,
+      "step": 836
+    },
+    {
+      "epoch": 0.7931769722814499,
+      "grad_norm": 1.0725982823665645,
+      "learning_rate": 6.872558399758633e-06,
+      "loss": 0.8485,
+      "step": 837
+    },
+    {
+      "epoch": 0.7941246150201374,
+      "grad_norm": 1.0560214413221218,
+      "learning_rate": 6.865437538100456e-06,
+      "loss": 0.8418,
+      "step": 838
+    },
+    {
+      "epoch": 0.795072257758825,
+      "grad_norm": 1.1398098092881728,
+      "learning_rate": 6.858312278301638e-06,
+      "loss": 0.8506,
+      "step": 839
+    },
+    {
+      "epoch": 0.7960199004975125,
+      "grad_norm": 1.1966491026312496,
+      "learning_rate": 6.8511826371613955e-06,
+      "loss": 0.9207,
+      "step": 840
+    },
+    {
+      "epoch": 0.7969675432362,
+      "grad_norm": 1.100229416684982,
+      "learning_rate": 6.8440486314892775e-06,
+      "loss": 0.8327,
+      "step": 841
+    },
+    {
+      "epoch": 0.7979151859748874,
+      "grad_norm": 1.0044010076247918,
+      "learning_rate": 6.836910278105124e-06,
+      "loss": 0.823,
+      "step": 842
+    },
+    {
+      "epoch": 0.798862828713575,
+      "grad_norm": 1.0289580305146189,
+      "learning_rate": 6.8297675938390275e-06,
+      "loss": 0.8566,
+      "step": 843
+    },
+    {
+      "epoch": 0.7998104714522625,
+      "grad_norm": 1.701693817598629,
+      "learning_rate": 6.822620595531286e-06,
+      "loss": 0.9532,
+      "step": 844
+    },
+    {
+      "epoch": 0.80075811419095,
+      "grad_norm": 1.1349425721498254,
+      "learning_rate": 6.815469300032374e-06,
+      "loss": 0.914,
+      "step": 845
+    },
+    {
+      "epoch": 0.8017057569296375,
+      "grad_norm": 1.139104764424171,
+      "learning_rate": 6.808313724202894e-06,
+      "loss": 0.9461,
+      "step": 846
+    },
+    {
+      "epoch": 0.802653399668325,
+      "grad_norm": 1.4336731001103296,
+      "learning_rate": 6.801153884913541e-06,
+      "loss": 0.8307,
+      "step": 847
+    },
+    {
+      "epoch": 0.8036010424070126,
+      "grad_norm": 1.0889586206123247,
+      "learning_rate": 6.793989799045067e-06,
+      "loss": 0.9337,
+      "step": 848
+    },
+    {
+      "epoch": 0.8045486851457001,
+      "grad_norm": 1.1232729735134666,
+      "learning_rate": 6.7868214834882265e-06,
+      "loss": 0.9321,
+      "step": 849
+    },
+    {
+      "epoch": 0.8054963278843876,
+      "grad_norm": 1.1373251435631513,
+      "learning_rate": 6.779648955143754e-06,
+      "loss": 0.8665,
+      "step": 850
+    },
+    {
+      "epoch": 0.806443970623075,
+      "grad_norm": 1.2552274257774585,
+      "learning_rate": 6.772472230922313e-06,
+      "loss": 0.8871,
+      "step": 851
+    },
+    {
+      "epoch": 0.8073916133617626,
+      "grad_norm": 1.0665372846687498,
+      "learning_rate": 6.765291327744463e-06,
+      "loss": 0.8943,
+      "step": 852
+    },
+    {
+      "epoch": 0.8083392561004501,
+      "grad_norm": 2.382365978345813,
+      "learning_rate": 6.758106262540611e-06,
+      "loss": 0.96,
+      "step": 853
+    },
+    {
+      "epoch": 0.8092868988391376,
+      "grad_norm": 1.2804659583348557,
+      "learning_rate": 6.750917052250981e-06,
+      "loss": 0.9211,
+      "step": 854
+    },
+    {
+      "epoch": 0.8102345415778252,
+      "grad_norm": 1.1113739193035552,
+      "learning_rate": 6.7437237138255686e-06,
+      "loss": 0.9385,
+      "step": 855
+    },
+    {
+      "epoch": 0.8111821843165127,
+      "grad_norm": 1.1014129494663154,
+      "learning_rate": 6.736526264224101e-06,
+      "loss": 0.8579,
+      "step": 856
+    },
+    {
+      "epoch": 0.8121298270552002,
+      "grad_norm": 0.9349690740722351,
+      "learning_rate": 6.7293247204160024e-06,
+      "loss": 0.8415,
+      "step": 857
+    },
+    {
+      "epoch": 0.8130774697938877,
+      "grad_norm": 1.0528528824598096,
+      "learning_rate": 6.722119099380345e-06,
+      "loss": 0.9034,
+      "step": 858
+    },
+    {
+      "epoch": 0.8130774697938877,
+      "eval_loss": 0.9221681952476501,
+      "eval_runtime": 66.5105,
+      "eval_samples_per_second": 41.016,
+      "eval_steps_per_second": 0.647,
+      "step": 858
+    },
+    {
+      "epoch": 0.8140251125325753,
+      "grad_norm": 1.1189012672893626,
+      "learning_rate": 6.714909418105816e-06,
+      "loss": 0.8928,
+      "step": 859
+    },
+    {
+      "epoch": 0.8149727552712628,
+      "grad_norm": 1.0491667848828412,
+      "learning_rate": 6.7076956935906756e-06,
+      "loss": 0.8846,
+      "step": 860
+    },
+    {
+      "epoch": 0.8159203980099502,
+      "grad_norm": 1.1800217977478704,
+      "learning_rate": 6.700477942842717e-06,
+      "loss": 0.8467,
+      "step": 861
+    },
+    {
+      "epoch": 0.8168680407486377,
+      "grad_norm": 1.3635685933149135,
+      "learning_rate": 6.693256182879224e-06,
+      "loss": 0.8875,
+      "step": 862
+    },
+    {
+      "epoch": 0.8178156834873253,
+      "grad_norm": 0.9738480603904128,
+      "learning_rate": 6.686030430726938e-06,
+      "loss": 0.8611,
+      "step": 863
+    },
+    {
+      "epoch": 0.8187633262260128,
+      "grad_norm": 1.0646192263592438,
+      "learning_rate": 6.678800703422004e-06,
+      "loss": 0.9442,
+      "step": 864
+    },
+    {
+      "epoch": 0.8197109689647003,
+      "grad_norm": 1.133138450105503,
+      "learning_rate": 6.671567018009948e-06,
+      "loss": 0.8936,
+      "step": 865
+    },
+    {
+      "epoch": 0.8206586117033878,
+      "grad_norm": 1.1779803833294133,
+      "learning_rate": 6.664329391545625e-06,
+      "loss": 0.8945,
+      "step": 866
+    },
+    {
+      "epoch": 0.8216062544420754,
+      "grad_norm": 1.14213552665385,
+      "learning_rate": 6.657087841093179e-06,
+      "loss": 0.8796,
+      "step": 867
+    },
+    {
+      "epoch": 0.8225538971807629,
+      "grad_norm": 1.0919659396363655,
+      "learning_rate": 6.649842383726011e-06,
+      "loss": 0.9093,
+      "step": 868
+    },
+    {
+      "epoch": 0.8235015399194504,
+      "grad_norm": 1.1540405069787951,
+      "learning_rate": 6.642593036526728e-06,
+      "loss": 0.8398,
+      "step": 869
+    },
+    {
+      "epoch": 0.8244491826581379,
+      "grad_norm": 0.9630828315476891,
+      "learning_rate": 6.635339816587109e-06,
+      "loss": 0.8616,
+      "step": 870
+    },
+    {
+      "epoch": 0.8253968253968254,
+      "grad_norm": 1.138452507177444,
+      "learning_rate": 6.628082741008068e-06,
+      "loss": 0.9328,
+      "step": 871
+    },
+    {
+      "epoch": 0.8263444681355129,
+      "grad_norm": 1.1595340664384783,
+      "learning_rate": 6.620821826899606e-06,
+      "loss": 0.8951,
+      "step": 872
+    },
+    {
+      "epoch": 0.8272921108742004,
+      "grad_norm": 1.0403640766877473,
+      "learning_rate": 6.613557091380771e-06,
+      "loss": 0.8403,
+      "step": 873
+    },
+    {
+      "epoch": 0.8282397536128879,
+      "grad_norm": 1.0491318496593032,
+      "learning_rate": 6.606288551579629e-06,
+      "loss": 0.8726,
+      "step": 874
+    },
+    {
+      "epoch": 0.8291873963515755,
+      "grad_norm": 1.0770673557649837,
+      "learning_rate": 6.599016224633209e-06,
+      "loss": 0.8777,
+      "step": 875
+    },
+    {
+      "epoch": 0.830135039090263,
+      "grad_norm": 1.2586661831963606,
+      "learning_rate": 6.59174012768747e-06,
+      "loss": 0.9406,
+      "step": 876
+    },
+    {
+      "epoch": 0.8310826818289505,
+      "grad_norm": 0.9975414535990195,
+      "learning_rate": 6.584460277897262e-06,
+      "loss": 0.9178,
+      "step": 877
+    },
+    {
+      "epoch": 0.832030324567638,
+      "grad_norm": 1.1950907387823217,
+      "learning_rate": 6.5771766924262795e-06,
+      "loss": 0.8673,
+      "step": 878
+    },
+    {
+      "epoch": 0.8329779673063256,
+      "grad_norm": 1.026575911873265,
+      "learning_rate": 6.569889388447025e-06,
+      "loss": 0.8515,
+      "step": 879
+    },
+    {
+      "epoch": 0.833925610045013,
+      "grad_norm": 1.1555312684415828,
+      "learning_rate": 6.562598383140773e-06,
+      "loss": 0.9227,
+      "step": 880
+    },
+    {
+      "epoch": 0.833925610045013,
+      "eval_loss": 0.9205263257026672,
+      "eval_runtime": 65.5841,
+      "eval_samples_per_second": 41.595,
+      "eval_steps_per_second": 0.656,
+      "step": 880
+    },
+    {
+      "epoch": 0.8348732527837005,
+      "grad_norm": 1.0410870491482918,
+      "learning_rate": 6.555303693697517e-06,
+      "loss": 0.8879,
+      "step": 881
+    },
+    {
+      "epoch": 0.835820895522388,
+      "grad_norm": 1.2046733679595938,
+      "learning_rate": 6.548005337315943e-06,
+      "loss": 0.9327,
+      "step": 882
+    },
+    {
+      "epoch": 0.8367685382610756,
+      "grad_norm": 1.4002394037690635,
+      "learning_rate": 6.540703331203382e-06,
+      "loss": 0.8616,
+      "step": 883
+    },
+    {
+      "epoch": 0.8377161809997631,
+      "grad_norm": 1.0820553537637987,
+      "learning_rate": 6.533397692575766e-06,
+      "loss": 0.8599,
+      "step": 884
+    },
+    {
+      "epoch": 0.8386638237384506,
+      "grad_norm": 1.1367287841963307,
+      "learning_rate": 6.526088438657594e-06,
+      "loss": 0.9047,
+      "step": 885
+    },
+    {
+      "epoch": 0.8396114664771381,
+      "grad_norm": 1.1941704415773284,
+      "learning_rate": 6.518775586681887e-06,
+      "loss": 0.8552,
+      "step": 886
+    },
+    {
+      "epoch": 0.8405591092158257,
+      "grad_norm": 1.059195584331362,
+      "learning_rate": 6.511459153890152e-06,
+      "loss": 0.9146,
+      "step": 887
+    },
+    {
+      "epoch": 0.8415067519545132,
+      "grad_norm": 1.105675547858621,
+      "learning_rate": 6.504139157532338e-06,
+      "loss": 0.8386,
+      "step": 888
+    },
+    {
+      "epoch": 0.8424543946932007,
+      "grad_norm": 1.176419395830192,
+      "learning_rate": 6.496815614866792e-06,
+      "loss": 0.851,
+      "step": 889
+    },
+    {
+      "epoch": 0.8434020374318881,
+      "grad_norm": 1.4493743507868786,
+      "learning_rate": 6.489488543160225e-06,
+      "loss": 0.9137,
+      "step": 890
+    },
+    {
+      "epoch": 0.8443496801705757,
+      "grad_norm": 0.9826791105867756,
+      "learning_rate": 6.4821579596876705e-06,
+      "loss": 0.9117,
+      "step": 891
+    },
+    {
+      "epoch": 0.8452973229092632,
+      "grad_norm": 1.0251650573055395,
+      "learning_rate": 6.4748238817324395e-06,
+      "loss": 0.9214,
+      "step": 892
+    },
+    {
+      "epoch": 0.8462449656479507,
+      "grad_norm": 1.025330926521032,
+      "learning_rate": 6.46748632658608e-06,
+      "loss": 0.8434,
+      "step": 893
+    },
+    {
+      "epoch": 0.8471926083866382,
+      "grad_norm": 1.0829549887894614,
+      "learning_rate": 6.460145311548341e-06,
+      "loss": 0.9142,
+      "step": 894
+    },
+    {
+      "epoch": 0.8481402511253258,
+      "grad_norm": 1.0949066487399224,
+      "learning_rate": 6.452800853927128e-06,
+      "loss": 0.8257,
+      "step": 895
+    },
+    {
+      "epoch": 0.8490878938640133,
+      "grad_norm": 1.153959748533555,
+      "learning_rate": 6.445452971038464e-06,
+      "loss": 0.9253,
+      "step": 896
+    },
+    {
+      "epoch": 0.8500355366027008,
+      "grad_norm": 1.193405218423659,
+      "learning_rate": 6.438101680206444e-06,
+      "loss": 0.9291,
+      "step": 897
+    },
+    {
+      "epoch": 0.8509831793413883,
+      "grad_norm": 1.2544535828853964,
+      "learning_rate": 6.430746998763204e-06,
+      "loss": 0.9173,
+      "step": 898
+    },
+    {
+      "epoch": 0.8519308220800759,
+      "grad_norm": 0.9715729053189112,
+      "learning_rate": 6.42338894404887e-06,
+      "loss": 0.8385,
+      "step": 899
+    },
+    {
+      "epoch": 0.8528784648187633,
+      "grad_norm": 1.0084105628200437,
+      "learning_rate": 6.41602753341152e-06,
+      "loss": 0.886,
+      "step": 900
+    },
+    {
+      "epoch": 0.8538261075574508,
+      "grad_norm": 1.0846787519964416,
+      "learning_rate": 6.408662784207149e-06,
+      "loss": 0.8862,
+      "step": 901
+    },
+    {
+      "epoch": 0.8547737502961383,
+      "grad_norm": 1.553862723805949,
+      "learning_rate": 6.4012947137996175e-06,
+      "loss": 0.9481,
+      "step": 902
+    },
+    {
+      "epoch": 0.8547737502961383,
+      "eval_loss": 0.9192849397659302,
+      "eval_runtime": 61.9949,
+      "eval_samples_per_second": 44.004,
+      "eval_steps_per_second": 0.694,
+      "step": 902
+    },
+    {
+      "epoch": 0.8557213930348259,
+      "grad_norm": 1.190837748503028,
+      "learning_rate": 6.393923339560621e-06,
+      "loss": 0.9056,
+      "step": 903
+    },
+    {
+      "epoch": 0.8566690357735134,
+      "grad_norm": 1.0240140450014306,
+      "learning_rate": 6.386548678869644e-06,
+      "loss": 0.8862,
+      "step": 904
+    },
+    {
+      "epoch": 0.8576166785122009,
+      "grad_norm": 1.1684091911040653,
+      "learning_rate": 6.379170749113918e-06,
+      "loss": 0.9077,
+      "step": 905
+    },
+    {
+      "epoch": 0.8585643212508884,
+      "grad_norm": 1.2458608937767364,
+      "learning_rate": 6.37178956768838e-06,
+      "loss": 0.8883,
+      "step": 906
+    },
+    {
+      "epoch": 0.859511963989576,
+      "grad_norm": 1.0899866220356935,
+      "learning_rate": 6.3644051519956366e-06,
+      "loss": 0.8953,
+      "step": 907
+    },
+    {
+      "epoch": 0.8604596067282635,
+      "grad_norm": 1.0159031729823804,
+      "learning_rate": 6.3570175194459205e-06,
+      "loss": 0.8946,
+      "step": 908
+    },
+    {
+      "epoch": 0.8614072494669509,
+      "grad_norm": 1.0422661890872638,
+      "learning_rate": 6.349626687457045e-06,
+      "loss": 0.9217,
+      "step": 909
+    },
+    {
+      "epoch": 0.8623548922056384,
+      "grad_norm": 1.0650534348419232,
+      "learning_rate": 6.342232673454371e-06,
+      "loss": 0.8993,
+      "step": 910
+    },
+    {
+      "epoch": 0.863302534944326,
+      "grad_norm": 1.162095463386776,
+      "learning_rate": 6.334835494870759e-06,
+      "loss": 0.9435,
+      "step": 911
+    },
+    {
+      "epoch": 0.8642501776830135,
+      "grad_norm": 0.9740991606947988,
+      "learning_rate": 6.3274351691465305e-06,
+      "loss": 0.8874,
+      "step": 912
+    },
+    {
+      "epoch": 0.865197820421701,
+      "grad_norm": 1.0752595651605357,
+      "learning_rate": 6.320031713729429e-06,
+      "loss": 0.8733,
+      "step": 913
+    },
+    {
+      "epoch": 0.8661454631603885,
+      "grad_norm": 1.003012232982504,
+      "learning_rate": 6.312625146074574e-06,
+      "loss": 0.8997,
+      "step": 914
+    },
+    {
+      "epoch": 0.8670931058990761,
+      "grad_norm": 1.2050664252308885,
+      "learning_rate": 6.305215483644427e-06,
+      "loss": 0.9121,
+      "step": 915
+    },
+    {
+      "epoch": 0.8680407486377636,
+      "grad_norm": 1.0509143532390477,
+      "learning_rate": 6.2978027439087405e-06,
+      "loss": 0.9215,
+      "step": 916
+    },
+    {
+      "epoch": 0.8689883913764511,
+      "grad_norm": 1.1000294092216198,
+      "learning_rate": 6.290386944344527e-06,
+      "loss": 0.8209,
+      "step": 917
+    },
+    {
+      "epoch": 0.8699360341151386,
+      "grad_norm": 1.4381680474891718,
+      "learning_rate": 6.28296810243601e-06,
+      "loss": 0.9485,
+      "step": 918
+    },
+    {
+      "epoch": 0.8708836768538261,
+      "grad_norm": 1.0986281816759764,
+      "learning_rate": 6.2755462356745885e-06,
+      "loss": 0.8677,
+      "step": 919
+    },
+    {
+      "epoch": 0.8718313195925136,
+      "grad_norm": 1.1944430191428006,
+      "learning_rate": 6.268121361558792e-06,
+      "loss": 0.932,
+      "step": 920
+    },
+    {
+      "epoch": 0.8727789623312011,
+      "grad_norm": 30.495921119331683,
+      "learning_rate": 6.2606934975942415e-06,
+      "loss": 0.9977,
+      "step": 921
+    },
+    {
+      "epoch": 0.8737266050698886,
+      "grad_norm": 1.0629329656422315,
+      "learning_rate": 6.2532626612936035e-06,
+      "loss": 0.9012,
+      "step": 922
+    },
+    {
+      "epoch": 0.8746742478085762,
+      "grad_norm": 1.212840019137115,
+      "learning_rate": 6.245828870176557e-06,
+      "loss": 0.8842,
+      "step": 923
+    },
+    {
+      "epoch": 0.8756218905472637,
+      "grad_norm": 1.1166380211338318,
+      "learning_rate": 6.238392141769743e-06,
+      "loss": 0.8853,
+      "step": 924
+    },
+    {
+      "epoch": 0.8756218905472637,
+      "eval_loss": 0.9188343286514282,
+      "eval_runtime": 71.0766,
+      "eval_samples_per_second": 38.381,
+      "eval_steps_per_second": 0.605,
+      "step": 924
+    },
+    {
+      "epoch": 0.8765695332859512,
+      "grad_norm": 1.0374942288524216,
+      "learning_rate": 6.2309524936067344e-06,
+      "loss": 0.8285,
+      "step": 925
+    },
+    {
+      "epoch": 0.8775171760246387,
+      "grad_norm": 1.294054885875714,
+      "learning_rate": 6.22350994322798e-06,
+      "loss": 0.9001,
+      "step": 926
+    },
+    {
+      "epoch": 0.8784648187633263,
+      "grad_norm": 1.0637784431739705,
+      "learning_rate": 6.216064508180778e-06,
+      "loss": 0.8865,
+      "step": 927
+    },
+    {
+      "epoch": 0.8794124615020137,
+      "grad_norm": 1.0389026144557505,
+      "learning_rate": 6.208616206019225e-06,
+      "loss": 0.8368,
+      "step": 928
+    },
+    {
+      "epoch": 0.8803601042407012,
+      "grad_norm": 1.008199300013402,
+      "learning_rate": 6.2011650543041734e-06,
+      "loss": 0.8638,
+      "step": 929
+    },
+    {
+      "epoch": 0.8813077469793887,
+      "grad_norm": 1.0169656783697818,
+      "learning_rate": 6.193711070603202e-06,
+      "loss": 0.8854,
+      "step": 930
+    },
+    {
+      "epoch": 0.8822553897180763,
+      "grad_norm": 1.3473009889737735,
+      "learning_rate": 6.1862542724905605e-06,
+      "loss": 0.8851,
+      "step": 931
+    },
+    {
+      "epoch": 0.8832030324567638,
+      "grad_norm": 0.9515757094852794,
+      "learning_rate": 6.178794677547138e-06,
+      "loss": 0.8049,
+      "step": 932
+    },
+    {
+      "epoch": 0.8841506751954513,
+      "grad_norm": 1.0333010505925766,
+      "learning_rate": 6.171332303360411e-06,
+      "loss": 0.8989,
+      "step": 933
+    },
+    {
+      "epoch": 0.8850983179341388,
+      "grad_norm": 1.0350926221697927,
+      "learning_rate": 6.163867167524419e-06,
+      "loss": 0.8401,
+      "step": 934
+    },
+    {
+      "epoch": 0.8860459606728264,
+      "grad_norm": 1.0721173411729321,
+      "learning_rate": 6.156399287639703e-06,
+      "loss": 0.9309,
+      "step": 935
+    },
+    {
+      "epoch": 0.8869936034115139,
+      "grad_norm": 1.3120269262792263,
+      "learning_rate": 6.14892868131328e-06,
+      "loss": 0.8991,
+      "step": 936
+    },
+    {
+      "epoch": 0.8879412461502014,
+      "grad_norm": 1.1865006482607656,
+      "learning_rate": 6.1414553661585905e-06,
+      "loss": 0.8683,
+      "step": 937
+    },
+    {
+      "epoch": 0.8888888888888888,
+      "grad_norm": 1.2132247708632098,
+      "learning_rate": 6.1339793597954675e-06,
+      "loss": 0.8569,
+      "step": 938
+    },
+    {
+      "epoch": 0.8898365316275764,
+      "grad_norm": 1.356211439062163,
+      "learning_rate": 6.126500679850082e-06,
+      "loss": 0.8543,
+      "step": 939
+    },
+    {
+      "epoch": 0.8907841743662639,
+      "grad_norm": 1.0301726611069624,
+      "learning_rate": 6.119019343954914e-06,
+      "loss": 0.8244,
+      "step": 940
+    },
+    {
+      "epoch": 0.8917318171049514,
+      "grad_norm": 1.2284199784911718,
+      "learning_rate": 6.111535369748702e-06,
+      "loss": 0.9085,
+      "step": 941
+    },
+    {
+      "epoch": 0.892679459843639,
+      "grad_norm": 1.0917917658926368,
+      "learning_rate": 6.104048774876407e-06,
+      "loss": 0.9026,
+      "step": 942
+    },
+    {
+      "epoch": 0.8936271025823265,
+      "grad_norm": 1.0918457932637566,
+      "learning_rate": 6.096559576989166e-06,
+      "loss": 0.8416,
+      "step": 943
+    },
+    {
+      "epoch": 0.894574745321014,
+      "grad_norm": 1.090778224144194,
+      "learning_rate": 6.089067793744258e-06,
+      "loss": 0.9331,
+      "step": 944
+    },
+    {
+      "epoch": 0.8955223880597015,
+      "grad_norm": 1.1895282418746094,
+      "learning_rate": 6.0815734428050535e-06,
+      "loss": 0.9023,
+      "step": 945
+    },
+    {
+      "epoch": 0.896470030798389,
+      "grad_norm": 1.1429336151352636,
+      "learning_rate": 6.074076541840978e-06,
+      "loss": 0.8708,
+      "step": 946
+    },
+    {
+      "epoch": 0.896470030798389,
+      "eval_loss": 0.9174872636795044,
+      "eval_runtime": 64.8721,
+      "eval_samples_per_second": 42.052,
+      "eval_steps_per_second": 0.663,
+      "step": 946
+    },
+    {
+      "epoch": 0.8974176735370766,
+      "grad_norm": 1.0370632105036575,
+      "learning_rate": 6.066577108527469e-06,
+      "loss": 0.8657,
+      "step": 947
+    },
+    {
+      "epoch": 0.898365316275764,
+      "grad_norm": 1.0186442974981031,
+      "learning_rate": 6.059075160545933e-06,
+      "loss": 0.8767,
+      "step": 948
+    },
+    {
+      "epoch": 0.8993129590144515,
+      "grad_norm": 1.1803999185158343,
+      "learning_rate": 6.05157071558371e-06,
+      "loss": 0.9213,
+      "step": 949
+    },
+    {
+      "epoch": 0.900260601753139,
+      "grad_norm": 1.2808021169879162,
+      "learning_rate": 6.044063791334023e-06,
+      "loss": 0.7969,
+      "step": 950
+    },
+    {
+      "epoch": 0.9012082444918266,
+      "grad_norm": 1.100561129817383,
+      "learning_rate": 6.03655440549594e-06,
+      "loss": 0.9169,
+      "step": 951
+    },
+    {
+      "epoch": 0.9021558872305141,
+      "grad_norm": 1.0079024114327502,
+      "learning_rate": 6.029042575774334e-06,
+      "loss": 0.8063,
+      "step": 952
+    },
+    {
+      "epoch": 0.9031035299692016,
+      "grad_norm": 1.188940267604656,
+      "learning_rate": 6.021528319879843e-06,
+      "loss": 0.8283,
+      "step": 953
+    },
+    {
+      "epoch": 0.9040511727078892,
+      "grad_norm": 1.4178205131395747,
+      "learning_rate": 6.01401165552882e-06,
+      "loss": 0.9282,
+      "step": 954
+    },
+    {
+      "epoch": 0.9049988154465767,
+      "grad_norm": 1.0472375312060485,
+      "learning_rate": 6.006492600443301e-06,
+      "loss": 0.8396,
+      "step": 955
+    },
+    {
+      "epoch": 0.9059464581852642,
+      "grad_norm": 1.1298052814130883,
+      "learning_rate": 5.998971172350953e-06,
+      "loss": 0.8898,
+      "step": 956
+    },
+    {
+      "epoch": 0.9068941009239516,
+      "grad_norm": 1.1111085147456654,
+      "learning_rate": 5.991447388985045e-06,
+      "loss": 0.8682,
+      "step": 957
+    },
+    {
+      "epoch": 0.9078417436626391,
+      "grad_norm": 1.4266031924717084,
+      "learning_rate": 5.9839212680843925e-06,
+      "loss": 0.8415,
+      "step": 958
+    },
+    {
+      "epoch": 0.9087893864013267,
+      "grad_norm": 1.3571311779441146,
+      "learning_rate": 5.976392827393326e-06,
+      "loss": 0.9395,
+      "step": 959
+    },
+    {
+      "epoch": 0.9097370291400142,
+      "grad_norm": 1.0252667256604506,
+      "learning_rate": 5.968862084661643e-06,
+      "loss": 0.8144,
+      "step": 960
+    },
+    {
+      "epoch": 0.9106846718787017,
+      "grad_norm": 1.0648709592997376,
+      "learning_rate": 5.961329057644571e-06,
+      "loss": 0.8239,
+      "step": 961
+    },
+    {
+      "epoch": 0.9116323146173892,
+      "grad_norm": 1.1431114083793006,
+      "learning_rate": 5.9537937641027225e-06,
+      "loss": 0.8986,
+      "step": 962
+    },
+    {
+      "epoch": 0.9125799573560768,
+      "grad_norm": 1.051205202397749,
+      "learning_rate": 5.946256221802052e-06,
+      "loss": 0.8686,
+      "step": 963
+    },
+    {
+      "epoch": 0.9135276000947643,
+      "grad_norm": 1.1593903949141868,
+      "learning_rate": 5.938716448513819e-06,
+      "loss": 0.8353,
+      "step": 964
+    },
+    {
+      "epoch": 0.9144752428334518,
+      "grad_norm": 1.1838771825939993,
+      "learning_rate": 5.931174462014538e-06,
+      "loss": 0.9348,
+      "step": 965
+    },
+    {
+      "epoch": 0.9154228855721394,
+      "grad_norm": 1.0369730284903498,
+      "learning_rate": 5.923630280085948e-06,
+      "loss": 0.8309,
+      "step": 966
+    },
+    {
+      "epoch": 0.9163705283108268,
+      "grad_norm": 1.1424198652925397,
+      "learning_rate": 5.916083920514959e-06,
+      "loss": 0.8653,
+      "step": 967
+    },
+    {
+      "epoch": 0.9173181710495143,
+      "grad_norm": 1.345398860745349,
+      "learning_rate": 5.908535401093618e-06,
+      "loss": 0.871,
+      "step": 968
+    },
+    {
+      "epoch": 0.9173181710495143,
+      "eval_loss": 0.9164453744888306,
+      "eval_runtime": 70.7681,
+      "eval_samples_per_second": 38.548,
+      "eval_steps_per_second": 0.608,
+      "step": 968
+    },
+    {
+      "epoch": 0.9182658137882018,
+      "grad_norm": 1.2793924011477549,
+      "learning_rate": 5.900984739619062e-06,
+      "loss": 0.9352,
+      "step": 969
+    },
+    {
+      "epoch": 0.9192134565268893,
+      "grad_norm": 1.069579103341338,
+      "learning_rate": 5.893431953893483e-06,
+      "loss": 0.8886,
+      "step": 970
+    },
+    {
+      "epoch": 0.9201610992655769,
+      "grad_norm": 0.9577007372371958,
+      "learning_rate": 5.885877061724075e-06,
+      "loss": 0.9196,
+      "step": 971
+    },
+    {
+      "epoch": 0.9211087420042644,
+      "grad_norm": 1.1384165277972425,
+      "learning_rate": 5.878320080923001e-06,
+      "loss": 0.8944,
+      "step": 972
+    },
+    {
+      "epoch": 0.9220563847429519,
+      "grad_norm": 1.0993789420249829,
+      "learning_rate": 5.8707610293073524e-06,
+      "loss": 0.8718,
+      "step": 973
+    },
+    {
+      "epoch": 0.9230040274816395,
+      "grad_norm": 0.990859843943125,
+      "learning_rate": 5.8631999246990954e-06,
+      "loss": 0.8815,
+      "step": 974
+    },
+    {
+      "epoch": 0.923951670220327,
+      "grad_norm": 0.9596254067235986,
+      "learning_rate": 5.855636784925044e-06,
+      "loss": 0.8873,
+      "step": 975
+    },
+    {
+      "epoch": 0.9248993129590144,
+      "grad_norm": 1.1971458477938866,
+      "learning_rate": 5.848071627816804e-06,
+      "loss": 0.9301,
+      "step": 976
+    },
+    {
+      "epoch": 0.9258469556977019,
+      "grad_norm": 1.293695132429081,
+      "learning_rate": 5.840504471210742e-06,
+      "loss": 0.8826,
+      "step": 977
+    },
+    {
+      "epoch": 0.9267945984363894,
+      "grad_norm": 1.0637632989021661,
+      "learning_rate": 5.832935332947937e-06,
+      "loss": 0.8744,
+      "step": 978
+    },
+    {
+      "epoch": 0.927742241175077,
+      "grad_norm": 1.0066024634519515,
+      "learning_rate": 5.82536423087414e-06,
+      "loss": 0.8836,
+      "step": 979
+    },
+    {
+      "epoch": 0.9286898839137645,
+      "grad_norm": 1.114286515649878,
+      "learning_rate": 5.817791182839734e-06,
+      "loss": 0.8973,
+      "step": 980
+    },
+    {
+      "epoch": 0.929637526652452,
+      "grad_norm": 1.2393549521144707,
+      "learning_rate": 5.810216206699686e-06,
+      "loss": 0.9605,
+      "step": 981
+    },
+    {
+      "epoch": 0.9305851693911396,
+      "grad_norm": 1.0160087719698536,
+      "learning_rate": 5.8026393203135145e-06,
+      "loss": 0.9383,
+      "step": 982
+    },
+    {
+      "epoch": 0.9315328121298271,
+      "grad_norm": 1.1104693027389803,
+      "learning_rate": 5.7950605415452365e-06,
+      "loss": 0.8697,
+      "step": 983
+    },
+    {
+      "epoch": 0.9324804548685146,
+      "grad_norm": 1.0139561721996317,
+      "learning_rate": 5.787479888263333e-06,
+      "loss": 0.8634,
+      "step": 984
+    },
+    {
+      "epoch": 0.9334280976072021,
+      "grad_norm": 0.9149964477354714,
+      "learning_rate": 5.779897378340705e-06,
+      "loss": 0.8692,
+      "step": 985
+    },
+    {
+      "epoch": 0.9343757403458895,
+      "grad_norm": 1.081719984477777,
+      "learning_rate": 5.772313029654631e-06,
+      "loss": 0.8752,
+      "step": 986
+    },
+    {
+      "epoch": 0.9353233830845771,
+      "grad_norm": 0.9686190602510739,
+      "learning_rate": 5.76472686008672e-06,
+      "loss": 0.9374,
+      "step": 987
+    },
+    {
+      "epoch": 0.9362710258232646,
+      "grad_norm": 1.1186604360938692,
+      "learning_rate": 5.757138887522884e-06,
+      "loss": 0.9454,
+      "step": 988
+    },
+    {
+      "epoch": 0.9372186685619521,
+      "grad_norm": 1.1474863690068453,
+      "learning_rate": 5.749549129853277e-06,
+      "loss": 0.9526,
+      "step": 989
+    },
+    {
+      "epoch": 0.9381663113006397,
+      "grad_norm": 1.2065827627859584,
+      "learning_rate": 5.741957604972264e-06,
+      "loss": 0.9015,
+      "step": 990
+    },
+    {
+      "epoch": 0.9381663113006397,
+      "eval_loss": 0.9157132506370544,
+      "eval_runtime": 63.106,
+      "eval_samples_per_second": 43.229,
+      "eval_steps_per_second": 0.681,
+      "step": 990
+    },
+    {
+      "epoch": 0.9391139540393272,
+      "grad_norm": 1.0667864777077534,
+      "learning_rate": 5.734364330778381e-06,
+      "loss": 0.9115,
+      "step": 991
+    },
+    {
+      "epoch": 0.9400615967780147,
+      "grad_norm": 1.0077841132377081,
+      "learning_rate": 5.726769325174279e-06,
+      "loss": 0.8647,
+      "step": 992
+    },
+    {
+      "epoch": 0.9410092395167022,
+      "grad_norm": 1.0542855609546453,
+      "learning_rate": 5.719172606066703e-06,
+      "loss": 0.871,
+      "step": 993
+    },
+    {
+      "epoch": 0.9419568822553898,
+      "grad_norm": 1.0723234157404569,
+      "learning_rate": 5.711574191366427e-06,
+      "loss": 0.8652,
+      "step": 994
+    },
+    {
+      "epoch": 0.9429045249940773,
+      "grad_norm": 1.0614000028665043,
+      "learning_rate": 5.703974098988229e-06,
+      "loss": 0.8693,
+      "step": 995
+    },
+    {
+      "epoch": 0.9438521677327647,
+      "grad_norm": 1.184972220370901,
+      "learning_rate": 5.696372346850842e-06,
+      "loss": 0.8748,
+      "step": 996
+    },
+    {
+      "epoch": 0.9447998104714522,
+      "grad_norm": 3.0693319889655357,
+      "learning_rate": 5.68876895287691e-06,
+      "loss": 0.9179,
+      "step": 997
+    },
+    {
+      "epoch": 0.9457474532101398,
+      "grad_norm": 1.364979369089619,
+      "learning_rate": 5.68116393499295e-06,
+      "loss": 0.9287,
+      "step": 998
+    },
+    {
+      "epoch": 0.9466950959488273,
+      "grad_norm": 1.138619446576553,
+      "learning_rate": 5.673557311129306e-06,
+      "loss": 0.9642,
+      "step": 999
+    },
+    {
+      "epoch": 0.9476427386875148,
+      "grad_norm": 1.1855283956687006,
+      "learning_rate": 5.66594909922011e-06,
+      "loss": 0.8375,
+      "step": 1000
+    },
+    {
+      "epoch": 0.9485903814262023,
+      "grad_norm": 1.1632815768908809,
+      "learning_rate": 5.658339317203235e-06,
+      "loss": 0.8411,
+      "step": 1001
+    },
+    {
+      "epoch": 0.9495380241648899,
+      "grad_norm": 1.2165292478376695,
+      "learning_rate": 5.650727983020262e-06,
+      "loss": 0.802,
+      "step": 1002
+    },
+    {
+      "epoch": 0.9504856669035774,
+      "grad_norm": 1.1272166319806751,
+      "learning_rate": 5.6431151146164255e-06,
+      "loss": 0.8764,
+      "step": 1003
+    },
+    {
+      "epoch": 0.9514333096422649,
+      "grad_norm": 1.179748089906736,
+      "learning_rate": 5.635500729940578e-06,
+      "loss": 0.8728,
+      "step": 1004
+    },
+    {
+      "epoch": 0.9523809523809523,
+      "grad_norm": 1.1413912763569145,
+      "learning_rate": 5.627884846945151e-06,
+      "loss": 0.8815,
+      "step": 1005
+    },
+    {
+      "epoch": 0.9533285951196399,
+      "grad_norm": 1.1198726067886378,
+      "learning_rate": 5.6202674835861045e-06,
+      "loss": 0.8549,
+      "step": 1006
+    },
+    {
+      "epoch": 0.9542762378583274,
+      "grad_norm": 1.0106130328632965,
+      "learning_rate": 5.6126486578228926e-06,
+      "loss": 0.8785,
+      "step": 1007
+    },
+    {
+      "epoch": 0.9552238805970149,
+      "grad_norm": 1.092869267593721,
+      "learning_rate": 5.605028387618412e-06,
+      "loss": 0.9306,
+      "step": 1008
+    },
+    {
+      "epoch": 0.9561715233357024,
+      "grad_norm": 1.2094913165105556,
+      "learning_rate": 5.597406690938969e-06,
+      "loss": 0.8963,
+      "step": 1009
+    },
+    {
+      "epoch": 0.95711916607439,
+      "grad_norm": 1.1750720019167373,
+      "learning_rate": 5.5897835857542315e-06,
+      "loss": 0.8666,
+      "step": 1010
+    },
+    {
+      "epoch": 0.9580668088130775,
+      "grad_norm": 1.3245562535134334,
+      "learning_rate": 5.582159090037189e-06,
+      "loss": 0.8291,
+      "step": 1011
+    },
+    {
+      "epoch": 0.959014451551765,
+      "grad_norm": 1.2042706760957538,
+      "learning_rate": 5.574533221764109e-06,
+      "loss": 0.8684,
+      "step": 1012
+    },
+    {
+      "epoch": 0.959014451551765,
+      "eval_loss": 0.9151268601417542,
+      "eval_runtime": 62.5892,
+      "eval_samples_per_second": 43.586,
+      "eval_steps_per_second": 0.687,
+      "step": 1012
+    },
+    {
+      "epoch": 0.9599620942904525,
+      "grad_norm": 1.3006236475762307,
+      "learning_rate": 5.566905998914496e-06,
+      "loss": 0.8668,
+      "step": 1013
+    },
+    {
+      "epoch": 0.9609097370291401,
+      "grad_norm": 1.0257069903720804,
+      "learning_rate": 5.559277439471047e-06,
+      "loss": 0.8478,
+      "step": 1014
+    },
+    {
+      "epoch": 0.9618573797678275,
+      "grad_norm": 1.511736931943181,
+      "learning_rate": 5.551647561419611e-06,
+      "loss": 0.8859,
+      "step": 1015
+    },
+    {
+      "epoch": 0.962805022506515,
+      "grad_norm": 1.0720441919793242,
+      "learning_rate": 5.544016382749146e-06,
+      "loss": 0.8665,
+      "step": 1016
+    },
+    {
+      "epoch": 0.9637526652452025,
+      "grad_norm": 1.5150355984208133,
+      "learning_rate": 5.536383921451673e-06,
+      "loss": 0.8628,
+      "step": 1017
+    },
+    {
+      "epoch": 0.9647003079838901,
+      "grad_norm": 1.2280290508409115,
+      "learning_rate": 5.528750195522244e-06,
+      "loss": 0.8873,
+      "step": 1018
+    },
+    {
+      "epoch": 0.9656479507225776,
+      "grad_norm": 1.0449090226929965,
+      "learning_rate": 5.521115222958889e-06,
+      "loss": 0.9395,
+      "step": 1019
+    },
+    {
+      "epoch": 0.9665955934612651,
+      "grad_norm": 1.286647376114692,
+      "learning_rate": 5.513479021762573e-06,
+      "loss": 0.8706,
+      "step": 1020
+    },
+    {
+      "epoch": 0.9675432361999526,
+      "grad_norm": 1.2238701214843728,
+      "learning_rate": 5.505841609937162e-06,
+      "loss": 0.85,
+      "step": 1021
+    },
+    {
+      "epoch": 0.9684908789386402,
+      "grad_norm": 1.08714516330967,
+      "learning_rate": 5.498203005489378e-06,
+      "loss": 0.8235,
+      "step": 1022
+    },
+    {
+      "epoch": 0.9694385216773277,
+      "grad_norm": 0.9837819321269746,
+      "learning_rate": 5.490563226428756e-06,
+      "loss": 0.824,
+      "step": 1023
+    },
+    {
+      "epoch": 0.9703861644160152,
+      "grad_norm": 1.0960448192217682,
+      "learning_rate": 5.4829222907675895e-06,
+      "loss": 0.8735,
+      "step": 1024
+    },
+    {
+      "epoch": 0.9713338071547026,
+      "grad_norm": 1.0733602660245445,
+      "learning_rate": 5.475280216520913e-06,
+      "loss": 0.846,
+      "step": 1025
+    },
+    {
+      "epoch": 0.9722814498933902,
+      "grad_norm": 1.0672536201083378,
+      "learning_rate": 5.467637021706438e-06,
+      "loss": 0.8457,
+      "step": 1026
+    },
+    {
+      "epoch": 0.9732290926320777,
+      "grad_norm": 1.2208010933061697,
+      "learning_rate": 5.459992724344516e-06,
+      "loss": 0.8684,
+      "step": 1027
+    },
+    {
+      "epoch": 0.9741767353707652,
+      "grad_norm": 1.0289674507090458,
+      "learning_rate": 5.4523473424581045e-06,
+      "loss": 0.8768,
+      "step": 1028
+    },
+    {
+      "epoch": 0.9751243781094527,
+      "grad_norm": 1.1688616748496152,
+      "learning_rate": 5.444700894072712e-06,
+      "loss": 0.8708,
+      "step": 1029
+    },
+    {
+      "epoch": 0.9760720208481403,
+      "grad_norm": 1.0710612536808917,
+      "learning_rate": 5.437053397216364e-06,
+      "loss": 0.9093,
+      "step": 1030
+    },
+    {
+      "epoch": 0.9770196635868278,
+      "grad_norm": 1.210378896652165,
+      "learning_rate": 5.429404869919559e-06,
+      "loss": 0.788,
+      "step": 1031
+    },
+    {
+      "epoch": 0.9779673063255153,
+      "grad_norm": 1.0438767939858762,
+      "learning_rate": 5.421755330215223e-06,
+      "loss": 0.95,
+      "step": 1032
+    },
+    {
+      "epoch": 0.9789149490642028,
+      "grad_norm": 1.071781685166561,
+      "learning_rate": 5.4141047961386724e-06,
+      "loss": 0.8668,
+      "step": 1033
+    },
+    {
+      "epoch": 0.9798625918028903,
+      "grad_norm": 1.0966538771750634,
+      "learning_rate": 5.4064532857275645e-06,
+      "loss": 0.9063,
+      "step": 1034
+    },
+    {
+      "epoch": 0.9798625918028903,
+      "eval_loss": 0.9142357707023621,
+      "eval_runtime": 67.863,
+      "eval_samples_per_second": 40.199,
+      "eval_steps_per_second": 0.634,
+      "step": 1034
+    },
+    {
+      "epoch": 0.9808102345415778,
+      "grad_norm": 1.0629520496350149,
+      "learning_rate": 5.398800817021857e-06,
+      "loss": 0.9179,
+      "step": 1035
+    },
+    {
+      "epoch": 0.9817578772802653,
+      "grad_norm": 1.315356991713657,
+      "learning_rate": 5.3911474080637705e-06,
+      "loss": 0.862,
+      "step": 1036
+    },
+    {
+      "epoch": 0.9827055200189528,
+      "grad_norm": 1.5792297604082457,
+      "learning_rate": 5.383493076897742e-06,
+      "loss": 0.8413,
+      "step": 1037
+    },
+    {
+      "epoch": 0.9836531627576404,
+      "grad_norm": 1.4810853298698745,
+      "learning_rate": 5.3758378415703825e-06,
+      "loss": 0.845,
+      "step": 1038
+    },
+    {
+      "epoch": 0.9846008054963279,
+      "grad_norm": 1.4008685021146463,
+      "learning_rate": 5.368181720130434e-06,
+      "loss": 0.8588,
+      "step": 1039
+    },
+    {
+      "epoch": 0.9855484482350154,
+      "grad_norm": 1.0609355637628701,
+      "learning_rate": 5.3605247306287275e-06,
+      "loss": 0.8704,
+      "step": 1040
+    },
+    {
+      "epoch": 0.9864960909737029,
+      "grad_norm": 1.147604350423996,
+      "learning_rate": 5.352866891118143e-06,
+      "loss": 0.8918,
+      "step": 1041
+    },
+    {
+      "epoch": 0.9874437337123905,
+      "grad_norm": 1.0985464941272645,
+      "learning_rate": 5.345208219653562e-06,
+      "loss": 0.9016,
+      "step": 1042
+    },
+    {
+      "epoch": 0.988391376451078,
+      "grad_norm": 1.05526373628293,
+      "learning_rate": 5.337548734291827e-06,
+      "loss": 0.8496,
+      "step": 1043
+    },
+    {
+      "epoch": 0.9893390191897654,
+      "grad_norm": 0.9639147858281681,
+      "learning_rate": 5.329888453091701e-06,
+      "loss": 0.8429,
+      "step": 1044
+    },
+    {
+      "epoch": 0.9902866619284529,
+      "grad_norm": 1.0367677286445547,
+      "learning_rate": 5.322227394113826e-06,
+      "loss": 0.9336,
+      "step": 1045
+    },
+    {
+      "epoch": 0.9912343046671405,
+      "grad_norm": 1.2202980010199613,
+      "learning_rate": 5.314565575420671e-06,
+      "loss": 0.8396,
+      "step": 1046
+    },
+    {
+      "epoch": 0.992181947405828,
+      "grad_norm": 1.043650160849041,
+      "learning_rate": 5.306903015076502e-06,
+      "loss": 0.9273,
+      "step": 1047
+    },
+    {
+      "epoch": 0.9931295901445155,
+      "grad_norm": 1.2196944237357306,
+      "learning_rate": 5.299239731147332e-06,
+      "loss": 0.882,
+      "step": 1048
+    },
+    {
+      "epoch": 0.994077232883203,
+      "grad_norm": 1.160879798524974,
+      "learning_rate": 5.291575741700878e-06,
+      "loss": 0.8874,
+      "step": 1049
+    },
+    {
+      "epoch": 0.9950248756218906,
+      "grad_norm": 1.1650239134630531,
+      "learning_rate": 5.283911064806522e-06,
+      "loss": 0.8936,
+      "step": 1050
+    },
+    {
+      "epoch": 0.9959725183605781,
+      "grad_norm": 1.0978030906121916,
+      "learning_rate": 5.2762457185352685e-06,
+      "loss": 0.8426,
+      "step": 1051
+    },
+    {
+      "epoch": 0.9969201610992656,
+      "grad_norm": 1.1662853004003075,
+      "learning_rate": 5.268579720959698e-06,
+      "loss": 0.8447,
+      "step": 1052
+    },
+    {
+      "epoch": 0.997867803837953,
+      "grad_norm": 1.0212044797920623,
+      "learning_rate": 5.260913090153928e-06,
+      "loss": 0.8577,
+      "step": 1053
+    },
+    {
+      "epoch": 0.9988154465766406,
+      "grad_norm": 0.9166563097634649,
+      "learning_rate": 5.253245844193564e-06,
+      "loss": 0.8203,
+      "step": 1054
+    },
+    {
+      "epoch": 0.9997630893153281,
+      "grad_norm": 1.2814929737482674,
+      "learning_rate": 5.24557800115567e-06,
+      "loss": 0.878,
+      "step": 1055
+    },
+    {
+      "epoch": 1.0007107320540156,
+      "grad_norm": 1.029394422791808,
+      "learning_rate": 5.237909579118713e-06,
+      "loss": 0.7881,
+      "step": 1056
+    },
+    {
+      "epoch": 1.0007107320540156,
+      "eval_loss": 0.9143710732460022,
+      "eval_runtime": 61.9673,
+      "eval_samples_per_second": 44.023,
+      "eval_steps_per_second": 0.694,
+      "step": 1056
+    },
+    {
+      "epoch": 1.0016583747927033,
+      "grad_norm": 1.0123693619172263,
+      "learning_rate": 5.2302405961625225e-06,
+      "loss": 0.7238,
+      "step": 1057
+    },
+    {
+      "epoch": 1.0026060175313907,
+      "grad_norm": 0.9727230093690866,
+      "learning_rate": 5.222571070368258e-06,
+      "loss": 0.7209,
+      "step": 1058
+    },
+    {
+      "epoch": 1.003553660270078,
+      "grad_norm": 0.972226576136118,
+      "learning_rate": 5.214901019818353e-06,
+      "loss": 0.7445,
+      "step": 1059
+    },
+    {
+      "epoch": 1.0045013030087657,
+      "grad_norm": 1.0690844562597608,
+      "learning_rate": 5.2072304625964785e-06,
+      "loss": 0.721,
+      "step": 1060
+    },
+    {
+      "epoch": 1.0054489457474531,
+      "grad_norm": 0.892297063646139,
+      "learning_rate": 5.199559416787503e-06,
+      "loss": 0.7467,
+      "step": 1061
+    },
+    {
+      "epoch": 1.0063965884861408,
+      "grad_norm": 0.8630887724649647,
+      "learning_rate": 5.191887900477444e-06,
+      "loss": 0.7242,
+      "step": 1062
+    },
+    {
+      "epoch": 1.0073442312248282,
+      "grad_norm": 0.8982089162859859,
+      "learning_rate": 5.1842159317534304e-06,
+      "loss": 0.6937,
+      "step": 1063
+    },
+    {
+      "epoch": 1.0082918739635158,
+      "grad_norm": 0.9672944077294386,
+      "learning_rate": 5.176543528703657e-06,
+      "loss": 0.7022,
+      "step": 1064
+    },
+    {
+      "epoch": 1.0092395167022032,
+      "grad_norm": 0.8839557121691414,
+      "learning_rate": 5.168870709417342e-06,
+      "loss": 0.7057,
+      "step": 1065
+    },
+    {
+      "epoch": 1.0101871594408909,
+      "grad_norm": 0.9782692878540812,
+      "learning_rate": 5.161197491984684e-06,
+      "loss": 0.7163,
+      "step": 1066
+    },
+    {
+      "epoch": 1.0111348021795783,
+      "grad_norm": 0.886106192089488,
+      "learning_rate": 5.153523894496826e-06,
+      "loss": 0.7415,
+      "step": 1067
+    },
+    {
+      "epoch": 1.0120824449182657,
+      "grad_norm": 0.9342675578725627,
+      "learning_rate": 5.1458499350458e-06,
+      "loss": 0.7005,
+      "step": 1068
+    },
+    {
+      "epoch": 1.0130300876569533,
+      "grad_norm": 0.9845076442772586,
+      "learning_rate": 5.138175631724495e-06,
+      "loss": 0.679,
+      "step": 1069
+    },
+    {
+      "epoch": 1.0139777303956408,
+      "grad_norm": 1.0058057680221788,
+      "learning_rate": 5.130501002626609e-06,
+      "loss": 0.7382,
+      "step": 1070
+    },
+    {
+      "epoch": 1.0149253731343284,
+      "grad_norm": 1.0402911368548144,
+      "learning_rate": 5.12282606584661e-06,
+      "loss": 0.7102,
+      "step": 1071
+    },
+    {
+      "epoch": 1.0158730158730158,
+      "grad_norm": 0.9653271354076595,
+      "learning_rate": 5.11515083947969e-06,
+      "loss": 0.7487,
+      "step": 1072
+    },
+    {
+      "epoch": 1.0168206586117035,
+      "grad_norm": 1.0059830380040296,
+      "learning_rate": 5.107475341621726e-06,
+      "loss": 0.697,
+      "step": 1073
+    },
+    {
+      "epoch": 1.0177683013503909,
+      "grad_norm": 0.9592083024886124,
+      "learning_rate": 5.099799590369231e-06,
+      "loss": 0.7111,
+      "step": 1074
+    },
+    {
+      "epoch": 1.0187159440890785,
+      "grad_norm": 0.9113464173920633,
+      "learning_rate": 5.092123603819318e-06,
+      "loss": 0.6739,
+      "step": 1075
+    },
+    {
+      "epoch": 1.019663586827766,
+      "grad_norm": 0.9658198205738235,
+      "learning_rate": 5.084447400069656e-06,
+      "loss": 0.672,
+      "step": 1076
+    },
+    {
+      "epoch": 1.0206112295664536,
+      "grad_norm": 0.9288542434671724,
+      "learning_rate": 5.076770997218424e-06,
+      "loss": 0.7281,
+      "step": 1077
+    },
+    {
+      "epoch": 1.021558872305141,
+      "grad_norm": 0.9793083780049829,
+      "learning_rate": 5.069094413364272e-06,
+      "loss": 0.6441,
+      "step": 1078
+    },
+    {
+      "epoch": 1.021558872305141,
+      "eval_loss": 0.9252648949623108,
+      "eval_runtime": 63.0278,
+      "eval_samples_per_second": 43.283,
+      "eval_steps_per_second": 0.682,
+      "step": 1078
+    },
+    {
+      "epoch": 1.0225065150438284,
+      "grad_norm": 0.9273171413752292,
+      "learning_rate": 5.061417666606274e-06,
+      "loss": 0.6967,
+      "step": 1079
+    },
+    {
+      "epoch": 1.023454157782516,
+      "grad_norm": 1.5839472443961204,
+      "learning_rate": 5.053740775043891e-06,
+      "loss": 0.7093,
+      "step": 1080
+    },
+    {
+      "epoch": 1.0244018005212034,
+      "grad_norm": 0.9765979302224368,
+      "learning_rate": 5.046063756776926e-06,
+      "loss": 0.6671,
+      "step": 1081
+    },
+    {
+      "epoch": 1.025349443259891,
+      "grad_norm": 0.9457182691315833,
+      "learning_rate": 5.038386629905475e-06,
+      "loss": 0.7088,
+      "step": 1082
+    },
+    {
+      "epoch": 1.0262970859985785,
+      "grad_norm": 1.0209484041244559,
+      "learning_rate": 5.030709412529896e-06,
+      "loss": 0.6753,
+      "step": 1083
+    },
+    {
+      "epoch": 1.0272447287372661,
+      "grad_norm": 1.0234656179924495,
+      "learning_rate": 5.0230321227507595e-06,
+      "loss": 0.7002,
+      "step": 1084
+    },
+    {
+      "epoch": 1.0281923714759535,
+      "grad_norm": 0.9799867444681427,
+      "learning_rate": 5.015354778668805e-06,
+      "loss": 0.6913,
+      "step": 1085
+    },
+    {
+      "epoch": 1.0291400142146412,
+      "grad_norm": 1.0327323089162292,
+      "learning_rate": 5.007677398384902e-06,
+      "loss": 0.7102,
+      "step": 1086
+    },
+    {
+      "epoch": 1.0300876569533286,
+      "grad_norm": 0.8542430531938927,
+      "learning_rate": 5e-06,
+      "loss": 0.7311,
+      "step": 1087
+    },
+    {
+      "epoch": 1.031035299692016,
+      "grad_norm": 0.9551198273884293,
+      "learning_rate": 4.992322601615101e-06,
+      "loss": 0.8065,
+      "step": 1088
+    },
+    {
+      "epoch": 1.0319829424307037,
+      "grad_norm": 1.2082682361729435,
+      "learning_rate": 4.984645221331196e-06,
+      "loss": 0.7087,
+      "step": 1089
+    },
+    {
+      "epoch": 1.032930585169391,
+      "grad_norm": 1.0470114950375882,
+      "learning_rate": 4.976967877249242e-06,
+      "loss": 0.694,
+      "step": 1090
+    },
+    {
+      "epoch": 1.0338782279080787,
+      "grad_norm": 0.8968991535832135,
+      "learning_rate": 4.969290587470106e-06,
+      "loss": 0.6542,
+      "step": 1091
+    },
+    {
+      "epoch": 1.0348258706467661,
+      "grad_norm": 0.9569176042470755,
+      "learning_rate": 4.961613370094526e-06,
+      "loss": 0.7053,
+      "step": 1092
+    },
+    {
+      "epoch": 1.0357735133854538,
+      "grad_norm": 0.956739530222217,
+      "learning_rate": 4.953936243223077e-06,
+      "loss": 0.7299,
+      "step": 1093
+    },
+    {
+      "epoch": 1.0367211561241412,
+      "grad_norm": 0.9576460725333583,
+      "learning_rate": 4.9462592249561095e-06,
+      "loss": 0.7516,
+      "step": 1094
+    },
+    {
+      "epoch": 1.0376687988628288,
+      "grad_norm": 1.314007292694408,
+      "learning_rate": 4.938582333393727e-06,
+      "loss": 0.7014,
+      "step": 1095
+    },
+    {
+      "epoch": 1.0386164416015162,
+      "grad_norm": 0.9732347243923025,
+      "learning_rate": 4.93090558663573e-06,
+      "loss": 0.6131,
+      "step": 1096
+    },
+    {
+      "epoch": 1.0395640843402036,
+      "grad_norm": 0.902465289858781,
+      "learning_rate": 4.923229002781577e-06,
+      "loss": 0.7244,
+      "step": 1097
+    },
+    {
+      "epoch": 1.0405117270788913,
+      "grad_norm": 0.9138525283319687,
+      "learning_rate": 4.915552599930345e-06,
+      "loss": 0.7447,
+      "step": 1098
+    },
+    {
+      "epoch": 1.0414593698175787,
+      "grad_norm": 0.8667199024588413,
+      "learning_rate": 4.907876396180684e-06,
+      "loss": 0.731,
+      "step": 1099
+    },
+    {
+      "epoch": 1.0424070125562663,
+      "grad_norm": 1.0023840202075536,
+      "learning_rate": 4.900200409630771e-06,
+      "loss": 0.7,
+      "step": 1100
+    },
+    {
+      "epoch": 1.0424070125562663,
+      "eval_loss": 0.9256648421287537,
+      "eval_runtime": 65.8282,
+      "eval_samples_per_second": 41.441,
+      "eval_steps_per_second": 0.653,
+      "step": 1100
+    },
+    {
+      "epoch": 1.0433546552949537,
+      "grad_norm": 1.005189996616475,
+      "learning_rate": 4.892524658378276e-06,
+      "loss": 0.662,
+      "step": 1101
+    },
+    {
+      "epoch": 1.0443022980336414,
+      "grad_norm": 1.0552789716104816,
+      "learning_rate": 4.884849160520311e-06,
+      "loss": 0.7296,
+      "step": 1102
+    },
+    {
+      "epoch": 1.0452499407723288,
+      "grad_norm": 0.9361357742033367,
+      "learning_rate": 4.877173934153392e-06,
+      "loss": 0.7036,
+      "step": 1103
+    },
+    {
+      "epoch": 1.0461975835110164,
+      "grad_norm": 0.9030236691224809,
+      "learning_rate": 4.869498997373393e-06,
+      "loss": 0.6941,
+      "step": 1104
+    },
+    {
+      "epoch": 1.0471452262497039,
+      "grad_norm": 1.0038838078279626,
+      "learning_rate": 4.861824368275508e-06,
+      "loss": 0.7321,
+      "step": 1105
+    },
+    {
+      "epoch": 1.0480928689883915,
+      "grad_norm": 0.9603184361433643,
+      "learning_rate": 4.854150064954201e-06,
+      "loss": 0.6711,
+      "step": 1106
+    },
+    {
+      "epoch": 1.049040511727079,
+      "grad_norm": 0.9225275167901936,
+      "learning_rate": 4.846476105503176e-06,
+      "loss": 0.6717,
+      "step": 1107
+    },
+    {
+      "epoch": 1.0499881544657663,
+      "grad_norm": 1.2404804952292134,
+      "learning_rate": 4.838802508015316e-06,
+      "loss": 0.7472,
+      "step": 1108
+    },
+    {
+      "epoch": 1.050935797204454,
+      "grad_norm": 0.9689214802559678,
+      "learning_rate": 4.83112929058266e-06,
+      "loss": 0.6947,
+      "step": 1109
+    },
+    {
+      "epoch": 1.0518834399431414,
+      "grad_norm": 1.1456662341258836,
+      "learning_rate": 4.8234564712963445e-06,
+      "loss": 0.7316,
+      "step": 1110
+    },
+    {
+      "epoch": 1.052831082681829,
+      "grad_norm": 0.9768673269000111,
+      "learning_rate": 4.815784068246571e-06,
+      "loss": 0.7487,
+      "step": 1111
+    },
+    {
+      "epoch": 1.0537787254205164,
+      "grad_norm": 0.9716968504214143,
+      "learning_rate": 4.808112099522558e-06,
+      "loss": 0.7056,
+      "step": 1112
+    },
+    {
+      "epoch": 1.054726368159204,
+      "grad_norm": 0.974665466729692,
+      "learning_rate": 4.800440583212499e-06,
+      "loss": 0.6911,
+      "step": 1113
+    },
+    {
+      "epoch": 1.0556740108978915,
+      "grad_norm": 0.9167885828760524,
+      "learning_rate": 4.792769537403523e-06,
+      "loss": 0.7107,
+      "step": 1114
+    },
+    {
+      "epoch": 1.0566216536365791,
+      "grad_norm": 1.0329746061334233,
+      "learning_rate": 4.785098980181649e-06,
+      "loss": 0.7229,
+      "step": 1115
+    },
+    {
+      "epoch": 1.0575692963752665,
+      "grad_norm": 0.9737952606280224,
+      "learning_rate": 4.777428929631743e-06,
+      "loss": 0.7777,
+      "step": 1116
+    },
+    {
+      "epoch": 1.058516939113954,
+      "grad_norm": 1.1038154299902683,
+      "learning_rate": 4.769759403837479e-06,
+      "loss": 0.6809,
+      "step": 1117
+    },
+    {
+      "epoch": 1.0594645818526416,
+      "grad_norm": 1.08554606134142,
+      "learning_rate": 4.762090420881289e-06,
+      "loss": 0.6669,
+      "step": 1118
+    },
+    {
+      "epoch": 1.060412224591329,
+      "grad_norm": 1.0920359773635873,
+      "learning_rate": 4.754421998844331e-06,
+      "loss": 0.6871,
+      "step": 1119
+    },
+    {
+      "epoch": 1.0613598673300166,
+      "grad_norm": 0.9825699774740995,
+      "learning_rate": 4.746754155806437e-06,
+      "loss": 0.727,
+      "step": 1120
+    },
+    {
+      "epoch": 1.062307510068704,
+      "grad_norm": 0.9674769463454311,
+      "learning_rate": 4.739086909846075e-06,
+      "loss": 0.7189,
+      "step": 1121
+    },
+    {
+      "epoch": 1.0632551528073917,
+      "grad_norm": 1.0091809961919986,
+      "learning_rate": 4.731420279040303e-06,
+      "loss": 0.7278,
+      "step": 1122
+    },
+    {
+      "epoch": 1.0632551528073917,
+      "eval_loss": 0.9248070120811462,
+      "eval_runtime": 64.5875,
+      "eval_samples_per_second": 42.237,
+      "eval_steps_per_second": 0.666,
+      "step": 1122
+    },
+    {
+      "epoch": 1.064202795546079,
+      "grad_norm": 1.073809323574034,
+      "learning_rate": 4.723754281464732e-06,
+      "loss": 0.7729,
+      "step": 1123
+    },
+    {
+      "epoch": 1.0651504382847667,
+      "grad_norm": 0.9206055880032425,
+      "learning_rate": 4.716088935193479e-06,
+      "loss": 0.6833,
+      "step": 1124
+    },
+    {
+      "epoch": 1.0660980810234542,
+      "grad_norm": 0.9675985199743727,
+      "learning_rate": 4.708424258299125e-06,
+      "loss": 0.7201,
+      "step": 1125
+    },
+    {
+      "epoch": 1.0670457237621416,
+      "grad_norm": 0.970501113273963,
+      "learning_rate": 4.700760268852669e-06,
+      "loss": 0.6957,
+      "step": 1126
+    },
+    {
+      "epoch": 1.0679933665008292,
+      "grad_norm": 1.0025167130129373,
+      "learning_rate": 4.693096984923499e-06,
+      "loss": 0.7329,
+      "step": 1127
+    },
+    {
+      "epoch": 1.0689410092395166,
+      "grad_norm": 1.2193095917303223,
+      "learning_rate": 4.68543442457933e-06,
+      "loss": 0.7177,
+      "step": 1128
+    },
+    {
+      "epoch": 1.0698886519782043,
+      "grad_norm": 1.0294113926873432,
+      "learning_rate": 4.677772605886175e-06,
+      "loss": 0.6829,
+      "step": 1129
+    },
+    {
+      "epoch": 1.0708362947168917,
+      "grad_norm": 0.9136644811068081,
+      "learning_rate": 4.670111546908299e-06,
+      "loss": 0.697,
+      "step": 1130
+    },
+    {
+      "epoch": 1.0717839374555793,
+      "grad_norm": 0.9443556825485072,
+      "learning_rate": 4.662451265708174e-06,
+      "loss": 0.6735,
+      "step": 1131
+    },
+    {
+      "epoch": 1.0727315801942667,
+      "grad_norm": 0.9561425995186141,
+      "learning_rate": 4.65479178034644e-06,
+      "loss": 0.7189,
+      "step": 1132
+    },
+    {
+      "epoch": 1.0736792229329544,
+      "grad_norm": 0.9931041504777264,
+      "learning_rate": 4.647133108881858e-06,
+      "loss": 0.6587,
+      "step": 1133
+    },
+    {
+      "epoch": 1.0746268656716418,
+      "grad_norm": 0.925346382362746,
+      "learning_rate": 4.639475269371273e-06,
+      "loss": 0.7157,
+      "step": 1134
+    },
+    {
+      "epoch": 1.0755745084103294,
+      "grad_norm": 1.031232406635297,
+      "learning_rate": 4.631818279869567e-06,
+      "loss": 0.7325,
+      "step": 1135
+    },
+    {
+      "epoch": 1.0765221511490168,
+      "grad_norm": 1.0831354355205014,
+      "learning_rate": 4.624162158429618e-06,
+      "loss": 0.703,
+      "step": 1136
+    },
+    {
+      "epoch": 1.0774697938877043,
+      "grad_norm": 0.9705781822246375,
+      "learning_rate": 4.616506923102259e-06,
+      "loss": 0.6238,
+      "step": 1137
+    },
+    {
+      "epoch": 1.078417436626392,
+      "grad_norm": 1.1371488724998442,
+      "learning_rate": 4.608852591936231e-06,
+      "loss": 0.7601,
+      "step": 1138
+    },
+    {
+      "epoch": 1.0793650793650793,
+      "grad_norm": 1.064474848705152,
+      "learning_rate": 4.601199182978146e-06,
+      "loss": 0.6468,
+      "step": 1139
+    },
+    {
+      "epoch": 1.080312722103767,
+      "grad_norm": 0.9450727325021634,
+      "learning_rate": 4.593546714272438e-06,
+      "loss": 0.7266,
+      "step": 1140
+    },
+    {
+      "epoch": 1.0812603648424544,
+      "grad_norm": 0.9799112965049801,
+      "learning_rate": 4.585895203861328e-06,
+      "loss": 0.7317,
+      "step": 1141
+    },
+    {
+      "epoch": 1.082208007581142,
+      "grad_norm": 0.9097169877735888,
+      "learning_rate": 4.5782446697847775e-06,
+      "loss": 0.746,
+      "step": 1142
+    },
+    {
+      "epoch": 1.0831556503198294,
+      "grad_norm": 1.054415192907425,
+      "learning_rate": 4.5705951300804425e-06,
+      "loss": 0.726,
+      "step": 1143
+    },
+    {
+      "epoch": 1.0841032930585168,
+      "grad_norm": 0.903372224445911,
+      "learning_rate": 4.562946602783637e-06,
+      "loss": 0.7171,
+      "step": 1144
+    },
+    {
+      "epoch": 1.0841032930585168,
+      "eval_loss": 0.9241182804107666,
+      "eval_runtime": 67.5348,
+      "eval_samples_per_second": 40.394,
+      "eval_steps_per_second": 0.637,
+      "step": 1144
+    },
+    {
+      "epoch": 1.0850509357972045,
+      "grad_norm": 1.0495310770662147,
+      "learning_rate": 4.55529910592729e-06,
+      "loss": 0.6606,
+      "step": 1145
+    },
+    {
+      "epoch": 1.0859985785358919,
+      "grad_norm": 1.3054046428477601,
+      "learning_rate": 4.547652657541897e-06,
+      "loss": 0.7109,
+      "step": 1146
+    },
+    {
+      "epoch": 1.0869462212745795,
+      "grad_norm": 0.9385889950812906,
+      "learning_rate": 4.540007275655485e-06,
+      "loss": 0.7101,
+      "step": 1147
+    },
+    {
+      "epoch": 1.087893864013267,
+      "grad_norm": 1.0307846935200982,
+      "learning_rate": 4.532362978293564e-06,
+      "loss": 0.7025,
+      "step": 1148
+    },
+    {
+      "epoch": 1.0888415067519546,
+      "grad_norm": 1.0094586805349344,
+      "learning_rate": 4.524719783479088e-06,
+      "loss": 0.7341,
+      "step": 1149
+    },
+    {
+      "epoch": 1.089789149490642,
+      "grad_norm": 1.0080024003104493,
+      "learning_rate": 4.517077709232411e-06,
+      "loss": 0.7125,
+      "step": 1150
+    },
+    {
+      "epoch": 1.0907367922293296,
+      "grad_norm": 0.9839429831089125,
+      "learning_rate": 4.509436773571247e-06,
+      "loss": 0.7263,
+      "step": 1151
+    },
+    {
+      "epoch": 1.091684434968017,
+      "grad_norm": 0.9578891424409663,
+      "learning_rate": 4.5017969945106225e-06,
+      "loss": 0.7049,
+      "step": 1152
+    },
+    {
+      "epoch": 1.0926320777067047,
+      "grad_norm": 1.589523374002844,
+      "learning_rate": 4.49415839006284e-06,
+      "loss": 0.7045,
+      "step": 1153
+    },
+    {
+      "epoch": 1.093579720445392,
+      "grad_norm": 1.1691430951977255,
+      "learning_rate": 4.486520978237431e-06,
+      "loss": 0.6681,
+      "step": 1154
+    },
+    {
+      "epoch": 1.0945273631840795,
+      "grad_norm": 1.020265471952243,
+      "learning_rate": 4.478884777041115e-06,
+      "loss": 0.7003,
+      "step": 1155
+    },
+    {
+      "epoch": 1.0954750059227671,
+      "grad_norm": 0.9179136320195528,
+      "learning_rate": 4.471249804477758e-06,
+      "loss": 0.7077,
+      "step": 1156
+    },
+    {
+      "epoch": 1.0964226486614546,
+      "grad_norm": 0.9635788033380907,
+      "learning_rate": 4.4636160785483285e-06,
+      "loss": 0.7151,
+      "step": 1157
+    },
+    {
+      "epoch": 1.0973702914001422,
+      "grad_norm": 2.647967184274327,
+      "learning_rate": 4.455983617250857e-06,
+      "loss": 0.7341,
+      "step": 1158
+    },
+    {
+      "epoch": 1.0983179341388296,
+      "grad_norm": 1.0131020212308353,
+      "learning_rate": 4.448352438580391e-06,
+      "loss": 0.6905,
+      "step": 1159
+    },
+    {
+      "epoch": 1.0992655768775172,
+      "grad_norm": 1.0530355520145287,
+      "learning_rate": 4.440722560528955e-06,
+      "loss": 0.6387,
+      "step": 1160
+    },
+    {
+      "epoch": 1.1002132196162047,
+      "grad_norm": 0.9509811853766807,
+      "learning_rate": 4.433094001085505e-06,
+      "loss": 0.7466,
+      "step": 1161
+    },
+    {
+      "epoch": 1.1011608623548923,
+      "grad_norm": 1.0138369505840823,
+      "learning_rate": 4.4254667782358925e-06,
+      "loss": 0.679,
+      "step": 1162
+    },
+    {
+      "epoch": 1.1021085050935797,
+      "grad_norm": 1.0773914127698383,
+      "learning_rate": 4.417840909962813e-06,
+      "loss": 0.7367,
+      "step": 1163
+    },
+    {
+      "epoch": 1.1030561478322674,
+      "grad_norm": 1.3528732394706713,
+      "learning_rate": 4.410216414245771e-06,
+      "loss": 0.7166,
+      "step": 1164
+    },
+    {
+      "epoch": 1.1040037905709548,
+      "grad_norm": 1.0120462677381739,
+      "learning_rate": 4.402593309061034e-06,
+      "loss": 0.6599,
+      "step": 1165
+    },
+    {
+      "epoch": 1.1049514333096422,
+      "grad_norm": 0.9783229280065361,
+      "learning_rate": 4.394971612381591e-06,
+      "loss": 0.7053,
+      "step": 1166
+    },
+    {
+      "epoch": 1.1049514333096422,
+      "eval_loss": 0.9224104285240173,
+      "eval_runtime": 63.0367,
+      "eval_samples_per_second": 43.276,
+      "eval_steps_per_second": 0.682,
+      "step": 1166
+    },
+    {
+      "epoch": 1.1058990760483298,
+      "grad_norm": 0.9564604779753705,
+      "learning_rate": 4.38735134217711e-06,
+      "loss": 0.7522,
+      "step": 1167
+    },
+    {
+      "epoch": 1.1068467187870172,
+      "grad_norm": 0.9360222777010359,
+      "learning_rate": 4.379732516413897e-06,
+      "loss": 0.6734,
+      "step": 1168
+    },
+    {
+      "epoch": 1.1077943615257049,
+      "grad_norm": 0.8943163287031561,
+      "learning_rate": 4.372115153054851e-06,
+      "loss": 0.7118,
+      "step": 1169
+    },
+    {
+      "epoch": 1.1087420042643923,
+      "grad_norm": 1.179413315968657,
+      "learning_rate": 4.364499270059423e-06,
+      "loss": 0.6538,
+      "step": 1170
+    },
+    {
+      "epoch": 1.10968964700308,
+      "grad_norm": 0.9833678397573673,
+      "learning_rate": 4.356884885383578e-06,
+      "loss": 0.7024,
+      "step": 1171
+    },
+    {
+      "epoch": 1.1106372897417673,
+      "grad_norm": 1.1165040405330118,
+      "learning_rate": 4.34927201697974e-06,
+      "loss": 0.7223,
+      "step": 1172
+    },
+    {
+      "epoch": 1.1115849324804548,
+      "grad_norm": 1.0836563622250095,
+      "learning_rate": 4.341660682796766e-06,
+      "loss": 0.7432,
+      "step": 1173
+    },
+    {
+      "epoch": 1.1125325752191424,
+      "grad_norm": 0.9956369650089748,
+      "learning_rate": 4.334050900779893e-06,
+      "loss": 0.6979,
+      "step": 1174
+    },
+    {
+      "epoch": 1.1134802179578298,
+      "grad_norm": 0.9041906906929965,
+      "learning_rate": 4.326442688870697e-06,
+      "loss": 0.7818,
+      "step": 1175
+    },
+    {
+      "epoch": 1.1144278606965174,
+      "grad_norm": 1.0233141405037254,
+      "learning_rate": 4.318836065007052e-06,
+      "loss": 0.6802,
+      "step": 1176
+    },
+    {
+      "epoch": 1.1153755034352049,
+      "grad_norm": 1.0925280426338722,
+      "learning_rate": 4.3112310471230925e-06,
+      "loss": 0.7202,
+      "step": 1177
+    },
+    {
+      "epoch": 1.1163231461738925,
+      "grad_norm": 1.0258996034471566,
+      "learning_rate": 4.303627653149159e-06,
+      "loss": 0.7173,
+      "step": 1178
+    },
+    {
+      "epoch": 1.11727078891258,
+      "grad_norm": 1.0595188906999566,
+      "learning_rate": 4.296025901011773e-06,
+      "loss": 0.7402,
+      "step": 1179
+    },
+    {
+      "epoch": 1.1182184316512676,
+      "grad_norm": 0.9566532678110401,
+      "learning_rate": 4.2884258086335755e-06,
+      "loss": 0.6911,
+      "step": 1180
+    },
+    {
+      "epoch": 1.119166074389955,
+      "grad_norm": 0.9954131257021107,
+      "learning_rate": 4.2808273939333e-06,
+      "loss": 0.6893,
+      "step": 1181
+    },
+    {
+      "epoch": 1.1201137171286426,
+      "grad_norm": 0.9760861787884846,
+      "learning_rate": 4.2732306748257226e-06,
+      "loss": 0.6839,
+      "step": 1182
+    },
+    {
+      "epoch": 1.12106135986733,
+      "grad_norm": 1.116236364521447,
+      "learning_rate": 4.265635669221622e-06,
+      "loss": 0.7272,
+      "step": 1183
+    },
+    {
+      "epoch": 1.1220090026060174,
+      "grad_norm": 0.9772190754750057,
+      "learning_rate": 4.258042395027738e-06,
+      "loss": 0.7048,
+      "step": 1184
+    },
+    {
+      "epoch": 1.122956645344705,
+      "grad_norm": 0.9990968345465719,
+      "learning_rate": 4.250450870146726e-06,
+      "loss": 0.6661,
+      "step": 1185
+    },
+    {
+      "epoch": 1.1239042880833925,
+      "grad_norm": 1.004582020487418,
+      "learning_rate": 4.2428611124771184e-06,
+      "loss": 0.7158,
+      "step": 1186
+    },
+    {
+      "epoch": 1.1248519308220801,
+      "grad_norm": 1.0285222277895798,
+      "learning_rate": 4.235273139913281e-06,
+      "loss": 0.6759,
+      "step": 1187
+    },
+    {
+      "epoch": 1.1257995735607675,
+      "grad_norm": 1.026042016166187,
+      "learning_rate": 4.227686970345373e-06,
+      "loss": 0.6767,
+      "step": 1188
+    },
+    {
+      "epoch": 1.1257995735607675,
+      "eval_loss": 0.9233511090278625,
+      "eval_runtime": 63.5378,
+      "eval_samples_per_second": 42.935,
+      "eval_steps_per_second": 0.677,
+      "step": 1188
+    },
+    {
+      "epoch": 1.1267472162994552,
+      "grad_norm": 0.9839710491649496,
+      "learning_rate": 4.220102621659298e-06,
+      "loss": 0.698,
+      "step": 1189
+    },
+    {
+      "epoch": 1.1276948590381426,
+      "grad_norm": 1.3599383760269543,
+      "learning_rate": 4.21252011173667e-06,
+      "loss": 0.7257,
+      "step": 1190
+    },
+    {
+      "epoch": 1.1286425017768302,
+      "grad_norm": 1.1366178207656392,
+      "learning_rate": 4.204939458454767e-06,
+      "loss": 0.7008,
+      "step": 1191
+    },
+    {
+      "epoch": 1.1295901445155176,
+      "grad_norm": 0.95168166219681,
+      "learning_rate": 4.197360679686489e-06,
+      "loss": 0.6956,
+      "step": 1192
+    },
+    {
+      "epoch": 1.1305377872542053,
+      "grad_norm": 1.0580531496952468,
+      "learning_rate": 4.1897837933003165e-06,
+      "loss": 0.6555,
+      "step": 1193
+    },
+    {
+      "epoch": 1.1314854299928927,
+      "grad_norm": 1.3388307797907961,
+      "learning_rate": 4.182208817160269e-06,
+      "loss": 0.7038,
+      "step": 1194
+    },
+    {
+      "epoch": 1.1324330727315801,
+      "grad_norm": 1.263598798657549,
+      "learning_rate": 4.174635769125862e-06,
+      "loss": 0.6939,
+      "step": 1195
+    },
+    {
+      "epoch": 1.1333807154702678,
+      "grad_norm": 0.9897430245234835,
+      "learning_rate": 4.1670646670520656e-06,
+      "loss": 0.6949,
+      "step": 1196
+    },
+    {
+      "epoch": 1.1343283582089552,
+      "grad_norm": 1.5009873894314265,
+      "learning_rate": 4.15949552878926e-06,
+      "loss": 0.663,
+      "step": 1197
+    },
+    {
+      "epoch": 1.1352760009476428,
+      "grad_norm": 1.022385852836757,
+      "learning_rate": 4.151928372183198e-06,
+      "loss": 0.7124,
+      "step": 1198
+    },
+    {
+      "epoch": 1.1362236436863302,
+      "grad_norm": 1.1789551448297066,
+      "learning_rate": 4.144363215074959e-06,
+      "loss": 0.6713,
+      "step": 1199
+    },
+    {
+      "epoch": 1.1371712864250179,
+      "grad_norm": 1.0079132927023848,
+      "learning_rate": 4.136800075300906e-06,
+      "loss": 0.6997,
+      "step": 1200
+    },
+    {
+      "epoch": 1.1381189291637053,
+      "grad_norm": 0.9647107031990494,
+      "learning_rate": 4.129238970692651e-06,
+      "loss": 0.6968,
+      "step": 1201
+    },
+    {
+      "epoch": 1.1390665719023927,
+      "grad_norm": 1.0120852534707783,
+      "learning_rate": 4.121679919077001e-06,
+      "loss": 0.7705,
+      "step": 1202
+    },
+    {
+      "epoch": 1.1400142146410803,
+      "grad_norm": 3.766970964988497,
+      "learning_rate": 4.114122938275929e-06,
+      "loss": 0.664,
+      "step": 1203
+    },
+    {
+      "epoch": 1.1409618573797677,
+      "grad_norm": 1.0763801437474894,
+      "learning_rate": 4.10656804610652e-06,
+      "loss": 0.7236,
+      "step": 1204
+    },
+    {
+      "epoch": 1.1419095001184554,
+      "grad_norm": 1.0272206175047942,
+      "learning_rate": 4.0990152603809394e-06,
+      "loss": 0.7017,
+      "step": 1205
+    },
+    {
+      "epoch": 1.1428571428571428,
+      "grad_norm": 1.0151383640578906,
+      "learning_rate": 4.091464598906385e-06,
+      "loss": 0.7475,
+      "step": 1206
+    },
+    {
+      "epoch": 1.1438047855958304,
+      "grad_norm": 1.0605465057418049,
+      "learning_rate": 4.083916079485044e-06,
+      "loss": 0.7003,
+      "step": 1207
+    },
+    {
+      "epoch": 1.1447524283345178,
+      "grad_norm": 0.9663496293680089,
+      "learning_rate": 4.076369719914055e-06,
+      "loss": 0.7059,
+      "step": 1208
+    },
+    {
+      "epoch": 1.1457000710732055,
+      "grad_norm": 1.5121184694790173,
+      "learning_rate": 4.068825537985465e-06,
+      "loss": 0.7403,
+      "step": 1209
+    },
+    {
+      "epoch": 1.146647713811893,
+      "grad_norm": 0.999707638130472,
+      "learning_rate": 4.061283551486185e-06,
+      "loss": 0.6822,
+      "step": 1210
+    },
+    {
+      "epoch": 1.146647713811893,
+      "eval_loss": 0.9220083355903625,
+      "eval_runtime": 64.0249,
+      "eval_samples_per_second": 42.608,
+      "eval_steps_per_second": 0.672,
+      "step": 1210
+    },
+    {
+      "epoch": 1.1475953565505805,
+      "grad_norm": 0.9638255107668675,
+      "learning_rate": 4.053743778197951e-06,
+      "loss": 0.6955,
+      "step": 1211
+    },
+    {
+      "epoch": 1.148542999289268,
+      "grad_norm": 1.092006543225386,
+      "learning_rate": 4.04620623589728e-06,
+      "loss": 0.7363,
+      "step": 1212
+    },
+    {
+      "epoch": 1.1494906420279554,
+      "grad_norm": 0.9984016047331302,
+      "learning_rate": 4.038670942355431e-06,
+      "loss": 0.6918,
+      "step": 1213
+    },
+    {
+      "epoch": 1.150438284766643,
+      "grad_norm": 1.0035214440616442,
+      "learning_rate": 4.03113791533836e-06,
+      "loss": 0.6924,
+      "step": 1214
+    },
+    {
+      "epoch": 1.1513859275053304,
+      "grad_norm": 1.0110914807155829,
+      "learning_rate": 4.023607172606676e-06,
+      "loss": 0.6946,
+      "step": 1215
+    },
+    {
+      "epoch": 1.152333570244018,
+      "grad_norm": 0.8736055096829543,
+      "learning_rate": 4.016078731915608e-06,
+      "loss": 0.775,
+      "step": 1216
+    },
+    {
+      "epoch": 1.1532812129827055,
+      "grad_norm": 0.9939438831479498,
+      "learning_rate": 4.008552611014955e-06,
+      "loss": 0.6888,
+      "step": 1217
+    },
+    {
+      "epoch": 1.154228855721393,
+      "grad_norm": 1.049570796703869,
+      "learning_rate": 4.001028827649046e-06,
+      "loss": 0.7094,
+      "step": 1218
+    },
+    {
+      "epoch": 1.1551764984600805,
+      "grad_norm": 1.148462409040153,
+      "learning_rate": 3.993507399556699e-06,
+      "loss": 0.6845,
+      "step": 1219
+    },
+    {
+      "epoch": 1.156124141198768,
+      "grad_norm": 0.9773344806508405,
+      "learning_rate": 3.9859883444711795e-06,
+      "loss": 0.6948,
+      "step": 1220
+    },
+    {
+      "epoch": 1.1570717839374556,
+      "grad_norm": 1.0928186343002937,
+      "learning_rate": 3.978471680120157e-06,
+      "loss": 0.7538,
+      "step": 1221
+    },
+    {
+      "epoch": 1.1580194266761432,
+      "grad_norm": 1.193743573791038,
+      "learning_rate": 3.970957424225666e-06,
+      "loss": 0.7024,
+      "step": 1222
+    },
+    {
+      "epoch": 1.1589670694148306,
+      "grad_norm": 1.1120074499425576,
+      "learning_rate": 3.963445594504062e-06,
+      "loss": 0.6627,
+      "step": 1223
+    },
+    {
+      "epoch": 1.159914712153518,
+      "grad_norm": 1.2788177552822944,
+      "learning_rate": 3.955936208665979e-06,
+      "loss": 0.6673,
+      "step": 1224
+    },
+    {
+      "epoch": 1.1608623548922057,
+      "grad_norm": 1.0546418537225764,
+      "learning_rate": 3.9484292844162905e-06,
+      "loss": 0.6398,
+      "step": 1225
+    },
+    {
+      "epoch": 1.161809997630893,
+      "grad_norm": 0.9380567432599234,
+      "learning_rate": 3.940924839454067e-06,
+      "loss": 0.6736,
+      "step": 1226
+    },
+    {
+      "epoch": 1.1627576403695807,
+      "grad_norm": 1.0622788802891603,
+      "learning_rate": 3.933422891472532e-06,
+      "loss": 0.6881,
+      "step": 1227
+    },
+    {
+      "epoch": 1.1637052831082682,
+      "grad_norm": 1.0284270221411218,
+      "learning_rate": 3.925923458159023e-06,
+      "loss": 0.6836,
+      "step": 1228
+    },
+    {
+      "epoch": 1.1646529258469558,
+      "grad_norm": 1.135279890250597,
+      "learning_rate": 3.918426557194947e-06,
+      "loss": 0.7027,
+      "step": 1229
+    },
+    {
+      "epoch": 1.1656005685856432,
+      "grad_norm": 1.0051047734363374,
+      "learning_rate": 3.910932206255742e-06,
+      "loss": 0.6571,
+      "step": 1230
+    },
+    {
+      "epoch": 1.1665482113243306,
+      "grad_norm": 1.0386743766132205,
+      "learning_rate": 3.903440423010835e-06,
+      "loss": 0.7293,
+      "step": 1231
+    },
+    {
+      "epoch": 1.1674958540630183,
+      "grad_norm": 0.9974944591028375,
+      "learning_rate": 3.895951225123595e-06,
+      "loss": 0.7061,
+      "step": 1232
+    },
+    {
+      "epoch": 1.1674958540630183,
+      "eval_loss": 0.92330402135849,
+      "eval_runtime": 61.1167,
+      "eval_samples_per_second": 44.636,
+      "eval_steps_per_second": 0.704,
+      "step": 1232
+    },
+    {
+      "epoch": 1.1684434968017057,
+      "grad_norm": 1.0523489123192147,
+      "learning_rate": 3.8884646302512985e-06,
+      "loss": 0.6744,
+      "step": 1233
+    },
+    {
+      "epoch": 1.1693911395403933,
+      "grad_norm": 1.0581086896752634,
+      "learning_rate": 3.880980656045087e-06,
+      "loss": 0.7234,
+      "step": 1234
+    },
+    {
+      "epoch": 1.1703387822790807,
+      "grad_norm": 1.081206532147213,
+      "learning_rate": 3.873499320149918e-06,
+      "loss": 0.7075,
+      "step": 1235
+    },
+    {
+      "epoch": 1.1712864250177684,
+      "grad_norm": 1.0406772938616795,
+      "learning_rate": 3.866020640204533e-06,
+      "loss": 0.6703,
+      "step": 1236
+    },
+    {
+      "epoch": 1.1722340677564558,
+      "grad_norm": 1.0457307864336358,
+      "learning_rate": 3.858544633841409e-06,
+      "loss": 0.6763,
+      "step": 1237
+    },
+    {
+      "epoch": 1.1731817104951434,
+      "grad_norm": 0.9946842072910351,
+      "learning_rate": 3.851071318686721e-06,
+      "loss": 0.6393,
+      "step": 1238
+    },
+    {
+      "epoch": 1.1741293532338308,
+      "grad_norm": 0.9269746374884014,
+      "learning_rate": 3.843600712360298e-06,
+      "loss": 0.729,
+      "step": 1239
+    },
+    {
+      "epoch": 1.1750769959725185,
+      "grad_norm": 0.9972213948459809,
+      "learning_rate": 3.836132832475583e-06,
+      "loss": 0.6714,
+      "step": 1240
+    },
+    {
+      "epoch": 1.1760246387112059,
+      "grad_norm": 1.1281466636664788,
+      "learning_rate": 3.8286676966395895e-06,
+      "loss": 0.7375,
+      "step": 1241
+    },
+    {
+      "epoch": 1.1769722814498933,
+      "grad_norm": 1.0775487768465717,
+      "learning_rate": 3.821205322452863e-06,
+      "loss": 0.7771,
+      "step": 1242
+    },
+    {
+      "epoch": 1.177919924188581,
+      "grad_norm": 0.9769500849217977,
+      "learning_rate": 3.813745727509439e-06,
+      "loss": 0.7238,
+      "step": 1243
+    },
+    {
+      "epoch": 1.1788675669272684,
+      "grad_norm": 0.9685742121387568,
+      "learning_rate": 3.806288929396798e-06,
+      "loss": 0.7081,
+      "step": 1244
+    },
+    {
+      "epoch": 1.179815209665956,
+      "grad_norm": 1.096837571136743,
+      "learning_rate": 3.798834945695826e-06,
+      "loss": 0.6977,
+      "step": 1245
+    },
+    {
+      "epoch": 1.1807628524046434,
+      "grad_norm": 1.1006333163134505,
+      "learning_rate": 3.7913837939807763e-06,
+      "loss": 0.6762,
+      "step": 1246
+    },
+    {
+      "epoch": 1.181710495143331,
+      "grad_norm": 0.9658169918126236,
+      "learning_rate": 3.783935491819222e-06,
+      "loss": 0.6904,
+      "step": 1247
+    },
+    {
+      "epoch": 1.1826581378820185,
+      "grad_norm": 1.1429805038475467,
+      "learning_rate": 3.77649005677202e-06,
+      "loss": 0.7098,
+      "step": 1248
+    },
+    {
+      "epoch": 1.1836057806207059,
+      "grad_norm": 1.1176775156483372,
+      "learning_rate": 3.769047506393267e-06,
+      "loss": 0.6764,
+      "step": 1249
+    },
+    {
+      "epoch": 1.1845534233593935,
+      "grad_norm": 1.0183368197142009,
+      "learning_rate": 3.7616078582302575e-06,
+      "loss": 0.731,
+      "step": 1250
+    },
+    {
+      "epoch": 1.1855010660980811,
+      "grad_norm": 0.9859557354856052,
+      "learning_rate": 3.754171129823444e-06,
+      "loss": 0.7222,
+      "step": 1251
+    },
+    {
+      "epoch": 1.1864487088367686,
+      "grad_norm": 0.9582987322265264,
+      "learning_rate": 3.7467373387063973e-06,
+      "loss": 0.6739,
+      "step": 1252
+    },
+    {
+      "epoch": 1.187396351575456,
+      "grad_norm": 0.9031196186509544,
+      "learning_rate": 3.7393065024057597e-06,
+      "loss": 0.7282,
+      "step": 1253
+    },
+    {
+      "epoch": 1.1883439943141436,
+      "grad_norm": 1.0684330693325141,
+      "learning_rate": 3.7318786384412076e-06,
+      "loss": 0.6953,
+      "step": 1254
+    },
+    {
+      "epoch": 1.1883439943141436,
+      "eval_loss": 0.920585036277771,
+      "eval_runtime": 65.1767,
+      "eval_samples_per_second": 41.855,
+      "eval_steps_per_second": 0.66,
+      "step": 1254
+    },
+    {
+      "epoch": 1.189291637052831,
+      "grad_norm": 1.148345248080178,
+      "learning_rate": 3.7244537643254115e-06,
+      "loss": 0.7035,
+      "step": 1255
+    },
+    {
+      "epoch": 1.1902392797915187,
+      "grad_norm": 1.0249604355926194,
+      "learning_rate": 3.7170318975639902e-06,
+      "loss": 0.7582,
+      "step": 1256
+    },
+    {
+      "epoch": 1.191186922530206,
+      "grad_norm": 1.179036054066612,
+      "learning_rate": 3.7096130556554744e-06,
+      "loss": 0.697,
+      "step": 1257
+    },
+    {
+      "epoch": 1.1921345652688937,
+      "grad_norm": 1.036930121403606,
+      "learning_rate": 3.70219725609126e-06,
+      "loss": 0.7452,
+      "step": 1258
+    },
+    {
+      "epoch": 1.1930822080075811,
+      "grad_norm": 0.9553861484853223,
+      "learning_rate": 3.694784516355573e-06,
+      "loss": 0.7419,
+      "step": 1259
+    },
+    {
+      "epoch": 1.1940298507462686,
+      "grad_norm": 0.9117393301073062,
+      "learning_rate": 3.687374853925425e-06,
+      "loss": 0.6818,
+      "step": 1260
+    },
+    {
+      "epoch": 1.1949774934849562,
+      "grad_norm": 1.109221558375404,
+      "learning_rate": 3.679968286270571e-06,
+      "loss": 0.6819,
+      "step": 1261
+    },
+    {
+      "epoch": 1.1959251362236436,
+      "grad_norm": 1.020240570463157,
+      "learning_rate": 3.67256483085347e-06,
+      "loss": 0.7115,
+      "step": 1262
+    },
+    {
+      "epoch": 1.1968727789623312,
+      "grad_norm": 1.0960139903595318,
+      "learning_rate": 3.6651645051292415e-06,
+      "loss": 0.7298,
+      "step": 1263
+    },
+    {
+      "epoch": 1.1978204217010187,
+      "grad_norm": 0.8730491568921783,
+      "learning_rate": 3.6577673265456296e-06,
+      "loss": 0.6626,
+      "step": 1264
+    },
+    {
+      "epoch": 1.1987680644397063,
+      "grad_norm": 1.0528341736215752,
+      "learning_rate": 3.6503733125429557e-06,
+      "loss": 0.7439,
+      "step": 1265
+    },
+    {
+      "epoch": 1.1997157071783937,
+      "grad_norm": 1.0899721179963884,
+      "learning_rate": 3.6429824805540816e-06,
+      "loss": 0.6907,
+      "step": 1266
+    },
+    {
+      "epoch": 1.2006633499170813,
+      "grad_norm": 1.0591609285941943,
+      "learning_rate": 3.6355948480043647e-06,
+      "loss": 0.6818,
+      "step": 1267
+    },
+    {
+      "epoch": 1.2016109926557688,
+      "grad_norm": 1.091729744316094,
+      "learning_rate": 3.628210432311621e-06,
+      "loss": 0.7118,
+      "step": 1268
+    },
+    {
+      "epoch": 1.2025586353944564,
+      "grad_norm": 1.1106751888187103,
+      "learning_rate": 3.620829250886083e-06,
+      "loss": 0.7496,
+      "step": 1269
+    },
+    {
+      "epoch": 1.2035062781331438,
+      "grad_norm": 0.9071769388935164,
+      "learning_rate": 3.6134513211303555e-06,
+      "loss": 0.6996,
+      "step": 1270
+    },
+    {
+      "epoch": 1.2044539208718312,
+      "grad_norm": 0.9816514708234372,
+      "learning_rate": 3.606076660439378e-06,
+      "loss": 0.7154,
+      "step": 1271
+    },
+    {
+      "epoch": 1.2054015636105189,
+      "grad_norm": 0.9314656323457674,
+      "learning_rate": 3.5987052862003824e-06,
+      "loss": 0.7288,
+      "step": 1272
+    },
+    {
+      "epoch": 1.2063492063492063,
+      "grad_norm": 1.0352321626353647,
+      "learning_rate": 3.5913372157928515e-06,
+      "loss": 0.6235,
+      "step": 1273
+    },
+    {
+      "epoch": 1.207296849087894,
+      "grad_norm": 1.3247779592651683,
+      "learning_rate": 3.58397246658848e-06,
+      "loss": 0.7273,
+      "step": 1274
+    },
+    {
+      "epoch": 1.2082444918265813,
+      "grad_norm": 0.9925803897741295,
+      "learning_rate": 3.5766110559511313e-06,
+      "loss": 0.749,
+      "step": 1275
+    },
+    {
+      "epoch": 1.209192134565269,
+      "grad_norm": 0.9515222056049948,
+      "learning_rate": 3.569253001236795e-06,
+      "loss": 0.7559,
+      "step": 1276
+    },
+    {
+      "epoch": 1.209192134565269,
+      "eval_loss": 0.9210101366043091,
+      "eval_runtime": 60.5067,
+      "eval_samples_per_second": 45.086,
+      "eval_steps_per_second": 0.711,
+      "step": 1276
+    },
+    {
+      "epoch": 1.2101397773039564,
+      "grad_norm": 1.0505259866511463,
+      "learning_rate": 3.561898319793555e-06,
+      "loss": 0.6777,
+      "step": 1277
+    },
+    {
+      "epoch": 1.2110874200426438,
+      "grad_norm": 1.0470739963568594,
+      "learning_rate": 3.554547028961537e-06,
+      "loss": 0.6687,
+      "step": 1278
+    },
+    {
+      "epoch": 1.2120350627813314,
+      "grad_norm": 0.9372200609494287,
+      "learning_rate": 3.5471991460728725e-06,
+      "loss": 0.7364,
+      "step": 1279
+    },
+    {
+      "epoch": 1.212982705520019,
+      "grad_norm": 1.0677225546085984,
+      "learning_rate": 3.5398546884516606e-06,
+      "loss": 0.6946,
+      "step": 1280
+    },
+    {
+      "epoch": 1.2139303482587065,
+      "grad_norm": 1.2432772070818632,
+      "learning_rate": 3.5325136734139213e-06,
+      "loss": 0.7216,
+      "step": 1281
+    },
+    {
+      "epoch": 1.214877990997394,
+      "grad_norm": 0.9586638500297385,
+      "learning_rate": 3.5251761182675626e-06,
+      "loss": 0.6836,
+      "step": 1282
+    },
+    {
+      "epoch": 1.2158256337360815,
+      "grad_norm": 1.0193646692337255,
+      "learning_rate": 3.5178420403123307e-06,
+      "loss": 0.7499,
+      "step": 1283
+    },
+    {
+      "epoch": 1.216773276474769,
+      "grad_norm": 0.9934250718994667,
+      "learning_rate": 3.510511456839777e-06,
+      "loss": 0.7127,
+      "step": 1284
+    },
+    {
+      "epoch": 1.2177209192134566,
+      "grad_norm": 1.018374538431729,
+      "learning_rate": 3.5031843851332105e-06,
+      "loss": 0.7211,
+      "step": 1285
+    },
+    {
+      "epoch": 1.218668561952144,
+      "grad_norm": 1.0345735993851575,
+      "learning_rate": 3.495860842467664e-06,
+      "loss": 0.7196,
+      "step": 1286
+    },
+    {
+      "epoch": 1.2196162046908317,
+      "grad_norm": 1.053615402301811,
+      "learning_rate": 3.488540846109849e-06,
+      "loss": 0.6648,
+      "step": 1287
+    },
+    {
+      "epoch": 1.220563847429519,
+      "grad_norm": 1.0164841506591313,
+      "learning_rate": 3.481224413318114e-06,
+      "loss": 0.6602,
+      "step": 1288
+    },
+    {
+      "epoch": 1.2215114901682065,
+      "grad_norm": 1.0367484711368176,
+      "learning_rate": 3.4739115613424078e-06,
+      "loss": 0.7115,
+      "step": 1289
+    },
+    {
+      "epoch": 1.2224591329068941,
+      "grad_norm": 1.038323393287383,
+      "learning_rate": 3.4666023074242356e-06,
+      "loss": 0.6587,
+      "step": 1290
+    },
+    {
+      "epoch": 1.2234067756455815,
+      "grad_norm": 0.9327274906523454,
+      "learning_rate": 3.459296668796619e-06,
+      "loss": 0.6846,
+      "step": 1291
+    },
+    {
+      "epoch": 1.2243544183842692,
+      "grad_norm": 1.0084456960618864,
+      "learning_rate": 3.451994662684057e-06,
+      "loss": 0.7076,
+      "step": 1292
+    },
+    {
+      "epoch": 1.2253020611229566,
+      "grad_norm": 1.0275585469763515,
+      "learning_rate": 3.4446963063024854e-06,
+      "loss": 0.691,
+      "step": 1293
+    },
+    {
+      "epoch": 1.2262497038616442,
+      "grad_norm": 1.125902212799892,
+      "learning_rate": 3.4374016168592296e-06,
+      "loss": 0.7251,
+      "step": 1294
+    },
+    {
+      "epoch": 1.2271973466003316,
+      "grad_norm": 1.0322864795599813,
+      "learning_rate": 3.4301106115529766e-06,
+      "loss": 0.7284,
+      "step": 1295
+    },
+    {
+      "epoch": 1.2281449893390193,
+      "grad_norm": 0.9989097221420168,
+      "learning_rate": 3.4228233075737225e-06,
+      "loss": 0.7035,
+      "step": 1296
+    },
+    {
+      "epoch": 1.2290926320777067,
+      "grad_norm": 0.9373804854464124,
+      "learning_rate": 3.4155397221027396e-06,
+      "loss": 0.7139,
+      "step": 1297
+    },
+    {
+      "epoch": 1.2300402748163943,
+      "grad_norm": 1.0343160679933974,
+      "learning_rate": 3.4082598723125303e-06,
+      "loss": 0.6859,
+      "step": 1298
+    },
+    {
+      "epoch": 1.2300402748163943,
+      "eval_loss": 0.9219260215759277,
+      "eval_runtime": 64.2211,
+      "eval_samples_per_second": 42.478,
+      "eval_steps_per_second": 0.67,
+      "step": 1298
+    },
+    {
+      "epoch": 1.2309879175550817,
+      "grad_norm": 1.0041258596912708,
+      "learning_rate": 3.4009837753667918e-06,
+      "loss": 0.6752,
+      "step": 1299
+    },
+    {
+      "epoch": 1.2319355602937692,
+      "grad_norm": 1.062141619360519,
+      "learning_rate": 3.393711448420372e-06,
+      "loss": 0.7558,
+      "step": 1300
+    },
+    {
+      "epoch": 1.2328832030324568,
+      "grad_norm": 1.1486226243514412,
+      "learning_rate": 3.3864429086192295e-06,
+      "loss": 0.6976,
+      "step": 1301
+    },
+    {
+      "epoch": 1.2338308457711442,
+      "grad_norm": 1.0162684385678484,
+      "learning_rate": 3.379178173100396e-06,
+      "loss": 0.6503,
+      "step": 1302
+    },
+    {
+      "epoch": 1.2347784885098319,
+      "grad_norm": 1.0535985388570441,
+      "learning_rate": 3.371917258991933e-06,
+      "loss": 0.7014,
+      "step": 1303
+    },
+    {
+      "epoch": 1.2357261312485193,
+      "grad_norm": 0.9476340249103999,
+      "learning_rate": 3.3646601834128924e-06,
+      "loss": 0.7243,
+      "step": 1304
+    },
+    {
+      "epoch": 1.236673773987207,
+      "grad_norm": 0.9722138093221362,
+      "learning_rate": 3.3574069634732744e-06,
+      "loss": 0.6936,
+      "step": 1305
+    },
+    {
+      "epoch": 1.2376214167258943,
+      "grad_norm": 0.960280885803354,
+      "learning_rate": 3.3501576162739903e-06,
+      "loss": 0.7258,
+      "step": 1306
+    },
+    {
+      "epoch": 1.2385690594645817,
+      "grad_norm": 1.011953193368571,
+      "learning_rate": 3.3429121589068213e-06,
+      "loss": 0.7573,
+      "step": 1307
+    },
+    {
+      "epoch": 1.2395167022032694,
+      "grad_norm": 1.035071350641064,
+      "learning_rate": 3.3356706084543766e-06,
+      "loss": 0.7303,
+      "step": 1308
+    },
+    {
+      "epoch": 1.2404643449419568,
+      "grad_norm": 1.0243269638647712,
+      "learning_rate": 3.328432981990053e-06,
+      "loss": 0.7117,
+      "step": 1309
+    },
+    {
+      "epoch": 1.2414119876806444,
+      "grad_norm": 1.00968974551286,
+      "learning_rate": 3.3211992965779984e-06,
+      "loss": 0.6356,
+      "step": 1310
+    },
+    {
+      "epoch": 1.2423596304193318,
+      "grad_norm": 0.9221223902659512,
+      "learning_rate": 3.3139695692730644e-06,
+      "loss": 0.6582,
+      "step": 1311
+    },
+    {
+      "epoch": 1.2433072731580195,
+      "grad_norm": 1.063526317074848,
+      "learning_rate": 3.306743817120777e-06,
+      "loss": 0.6458,
+      "step": 1312
+    },
+    {
+      "epoch": 1.244254915896707,
+      "grad_norm": 1.0099993891673738,
+      "learning_rate": 3.2995220571572845e-06,
+      "loss": 0.6945,
+      "step": 1313
+    },
+    {
+      "epoch": 1.2452025586353945,
+      "grad_norm": 1.2682014688129715,
+      "learning_rate": 3.2923043064093252e-06,
+      "loss": 0.7106,
+      "step": 1314
+    },
+    {
+      "epoch": 1.246150201374082,
+      "grad_norm": 0.9957192198152068,
+      "learning_rate": 3.2850905818941853e-06,
+      "loss": 0.7159,
+      "step": 1315
+    },
+    {
+      "epoch": 1.2470978441127696,
+      "grad_norm": 0.970490636667617,
+      "learning_rate": 3.2778809006196564e-06,
+      "loss": 0.7628,
+      "step": 1316
+    },
+    {
+      "epoch": 1.248045486851457,
+      "grad_norm": 1.049773961103418,
+      "learning_rate": 3.2706752795839984e-06,
+      "loss": 0.7065,
+      "step": 1317
+    },
+    {
+      "epoch": 1.2489931295901444,
+      "grad_norm": 1.0171220181984746,
+      "learning_rate": 3.2634737357758994e-06,
+      "loss": 0.6594,
+      "step": 1318
+    },
+    {
+      "epoch": 1.249940772328832,
+      "grad_norm": 1.0689789937287877,
+      "learning_rate": 3.256276286174433e-06,
+      "loss": 0.72,
+      "step": 1319
+    },
+    {
+      "epoch": 1.2508884150675195,
+      "grad_norm": 0.9750893399192391,
+      "learning_rate": 3.2490829477490194e-06,
+      "loss": 0.7237,
+      "step": 1320
+    },
+    {
+      "epoch": 1.2508884150675195,
+      "eval_loss": 0.9202948808670044,
+      "eval_runtime": 64.6354,
+      "eval_samples_per_second": 42.206,
+      "eval_steps_per_second": 0.665,
+      "step": 1320
+    },
+    {
+      "epoch": 1.251836057806207,
+      "grad_norm": 1.0670485098725635,
+      "learning_rate": 3.2418937374593895e-06,
+      "loss": 0.7168,
+      "step": 1321
+    },
+    {
+      "epoch": 1.2527837005448945,
+      "grad_norm": 0.9432261527116877,
+      "learning_rate": 3.2347086722555382e-06,
+      "loss": 0.741,
+      "step": 1322
+    },
+    {
+      "epoch": 1.2537313432835822,
+      "grad_norm": 0.9750755053529253,
+      "learning_rate": 3.2275277690776876e-06,
+      "loss": 0.6547,
+      "step": 1323
+    },
+    {
+      "epoch": 1.2546789860222696,
+      "grad_norm": 0.9301758889182347,
+      "learning_rate": 3.220351044856247e-06,
+      "loss": 0.7478,
+      "step": 1324
+    },
+    {
+      "epoch": 1.255626628760957,
+      "grad_norm": 1.0318451433516014,
+      "learning_rate": 3.2131785165117748e-06,
+      "loss": 0.6562,
+      "step": 1325
+    },
+    {
+      "epoch": 1.2565742714996446,
+      "grad_norm": 0.889824279332266,
+      "learning_rate": 3.206010200954935e-06,
+      "loss": 0.6682,
+      "step": 1326
+    },
+    {
+      "epoch": 1.2575219142383323,
+      "grad_norm": 0.9419702907671557,
+      "learning_rate": 3.198846115086459e-06,
+      "loss": 0.6833,
+      "step": 1327
+    },
+    {
+      "epoch": 1.2584695569770197,
+      "grad_norm": 1.0527246614550414,
+      "learning_rate": 3.191686275797107e-06,
+      "loss": 0.7099,
+      "step": 1328
+    },
+    {
+      "epoch": 1.259417199715707,
+      "grad_norm": 1.055473610231369,
+      "learning_rate": 3.1845306999676274e-06,
+      "loss": 0.6996,
+      "step": 1329
+    },
+    {
+      "epoch": 1.2603648424543947,
+      "grad_norm": 0.9595441733482751,
+      "learning_rate": 3.177379404468715e-06,
+      "loss": 0.6818,
+      "step": 1330
+    },
+    {
+      "epoch": 1.2613124851930821,
+      "grad_norm": 1.0295006726261346,
+      "learning_rate": 3.170232406160974e-06,
+      "loss": 0.6539,
+      "step": 1331
+    },
+    {
+      "epoch": 1.2622601279317698,
+      "grad_norm": 1.124771581323331,
+      "learning_rate": 3.1630897218948765e-06,
+      "loss": 0.6911,
+      "step": 1332
+    },
+    {
+      "epoch": 1.2632077706704572,
+      "grad_norm": 0.9768323438673063,
+      "learning_rate": 3.1559513685107233e-06,
+      "loss": 0.7021,
+      "step": 1333
+    },
+    {
+      "epoch": 1.2641554134091448,
+      "grad_norm": 0.9702345738300058,
+      "learning_rate": 3.1488173628386066e-06,
+      "loss": 0.7039,
+      "step": 1334
+    },
+    {
+      "epoch": 1.2651030561478323,
+      "grad_norm": 0.9740566736999792,
+      "learning_rate": 3.141687721698363e-06,
+      "loss": 0.7201,
+      "step": 1335
+    },
+    {
+      "epoch": 1.2660506988865197,
+      "grad_norm": 1.0341764156676143,
+      "learning_rate": 3.1345624618995444e-06,
+      "loss": 0.6815,
+      "step": 1336
+    },
+    {
+      "epoch": 1.2669983416252073,
+      "grad_norm": 1.0038570117243435,
+      "learning_rate": 3.127441600241369e-06,
+      "loss": 0.6874,
+      "step": 1337
+    },
+    {
+      "epoch": 1.267945984363895,
+      "grad_norm": 0.9492110824297334,
+      "learning_rate": 3.1203251535126867e-06,
+      "loss": 0.6973,
+      "step": 1338
+    },
+    {
+      "epoch": 1.2688936271025824,
+      "grad_norm": 1.2890789269673384,
+      "learning_rate": 3.11321313849194e-06,
+      "loss": 0.7239,
+      "step": 1339
+    },
+    {
+      "epoch": 1.2698412698412698,
+      "grad_norm": 0.9728809474646835,
+      "learning_rate": 3.10610557194712e-06,
+      "loss": 0.6764,
+      "step": 1340
+    },
+    {
+      "epoch": 1.2707889125799574,
+      "grad_norm": 1.1635688288087744,
+      "learning_rate": 3.0990024706357314e-06,
+      "loss": 0.6918,
+      "step": 1341
+    },
+    {
+      "epoch": 1.2717365553186448,
+      "grad_norm": 0.9704804371870352,
+      "learning_rate": 3.0919038513047507e-06,
+      "loss": 0.7398,
+      "step": 1342
+    },
+    {
+      "epoch": 1.2717365553186448,
+      "eval_loss": 0.9205412864685059,
+      "eval_runtime": 64.3962,
+      "eval_samples_per_second": 42.363,
+      "eval_steps_per_second": 0.668,
+      "step": 1342
+    },
+    {
+      "epoch": 1.2726841980573325,
+      "grad_norm": 0.9623053497453619,
+      "learning_rate": 3.084809730690587e-06,
+      "loss": 0.7125,
+      "step": 1343
+    },
+    {
+      "epoch": 1.2736318407960199,
+      "grad_norm": 1.0918361707437365,
+      "learning_rate": 3.077720125519042e-06,
+      "loss": 0.6929,
+      "step": 1344
+    },
+    {
+      "epoch": 1.2745794835347075,
+      "grad_norm": 1.0920621674730842,
+      "learning_rate": 3.070635052505273e-06,
+      "loss": 0.736,
+      "step": 1345
+    },
+    {
+      "epoch": 1.275527126273395,
+      "grad_norm": 0.9952373114086388,
+      "learning_rate": 3.0635545283537523e-06,
+      "loss": 0.687,
+      "step": 1346
+    },
+    {
+      "epoch": 1.2764747690120823,
+      "grad_norm": 1.2534896758069607,
+      "learning_rate": 3.056478569758225e-06,
+      "loss": 0.7381,
+      "step": 1347
+    },
+    {
+      "epoch": 1.27742241175077,
+      "grad_norm": 1.0662944647834347,
+      "learning_rate": 3.0494071934016737e-06,
+      "loss": 0.7478,
+      "step": 1348
+    },
+    {
+      "epoch": 1.2783700544894574,
+      "grad_norm": 1.1088922876494405,
+      "learning_rate": 3.0423404159562776e-06,
+      "loss": 0.7582,
+      "step": 1349
+    },
+    {
+      "epoch": 1.279317697228145,
+      "grad_norm": 1.176182333691132,
+      "learning_rate": 3.03527825408337e-06,
+      "loss": 0.7656,
+      "step": 1350
+    },
+    {
+      "epoch": 1.2802653399668324,
+      "grad_norm": 0.9658766613029017,
+      "learning_rate": 3.0282207244334084e-06,
+      "loss": 0.724,
+      "step": 1351
+    },
+    {
+      "epoch": 1.28121298270552,
+      "grad_norm": 1.0827099691926507,
+      "learning_rate": 3.0211678436459214e-06,
+      "loss": 0.6916,
+      "step": 1352
+    },
+    {
+      "epoch": 1.2821606254442075,
+      "grad_norm": 1.0338922828145432,
+      "learning_rate": 3.014119628349482e-06,
+      "loss": 0.6895,
+      "step": 1353
+    },
+    {
+      "epoch": 1.283108268182895,
+      "grad_norm": 0.9767332618971463,
+      "learning_rate": 3.007076095161662e-06,
+      "loss": 0.6949,
+      "step": 1354
+    },
+    {
+      "epoch": 1.2840559109215826,
+      "grad_norm": 1.0930600930773744,
+      "learning_rate": 3.0000372606889937e-06,
+      "loss": 0.7021,
+      "step": 1355
+    },
+    {
+      "epoch": 1.2850035536602702,
+      "grad_norm": 1.1419255045928394,
+      "learning_rate": 2.9930031415269327e-06,
+      "loss": 0.6816,
+      "step": 1356
+    },
+    {
+      "epoch": 1.2859511963989576,
+      "grad_norm": 1.0003644858641374,
+      "learning_rate": 2.9859737542598157e-06,
+      "loss": 0.7194,
+      "step": 1357
+    },
+    {
+      "epoch": 1.286898839137645,
+      "grad_norm": 0.9900004048868847,
+      "learning_rate": 2.978949115460824e-06,
+      "loss": 0.6978,
+      "step": 1358
+    },
+    {
+      "epoch": 1.2878464818763327,
+      "grad_norm": 1.0245392546866654,
+      "learning_rate": 2.971929241691942e-06,
+      "loss": 0.7067,
+      "step": 1359
+    },
+    {
+      "epoch": 1.28879412461502,
+      "grad_norm": 1.1770400753015886,
+      "learning_rate": 2.9649141495039225e-06,
+      "loss": 0.6811,
+      "step": 1360
+    },
+    {
+      "epoch": 1.2897417673537077,
+      "grad_norm": 0.9716668472533,
+      "learning_rate": 2.9579038554362412e-06,
+      "loss": 0.6944,
+      "step": 1361
+    },
+    {
+      "epoch": 1.2906894100923951,
+      "grad_norm": 0.9803805263564218,
+      "learning_rate": 2.950898376017064e-06,
+      "loss": 0.6599,
+      "step": 1362
+    },
+    {
+      "epoch": 1.2916370528310828,
+      "grad_norm": 1.1276831233974194,
+      "learning_rate": 2.943897727763202e-06,
+      "loss": 0.7439,
+      "step": 1363
+    },
+    {
+      "epoch": 1.2925846955697702,
+      "grad_norm": 0.9936565556974072,
+      "learning_rate": 2.9369019271800827e-06,
+      "loss": 0.7139,
+      "step": 1364
+    },
+    {
+      "epoch": 1.2925846955697702,
+      "eval_loss": 0.9199525117874146,
+      "eval_runtime": 65.9605,
+      "eval_samples_per_second": 41.358,
+      "eval_steps_per_second": 0.652,
+      "step": 1364
+    },
+    {
+      "epoch": 1.2935323383084576,
+      "grad_norm": 1.0225633529982556,
+      "learning_rate": 2.9299109907616956e-06,
+      "loss": 0.7169,
+      "step": 1365
+    },
+    {
+      "epoch": 1.2944799810471452,
+      "grad_norm": 1.1496102662994758,
+      "learning_rate": 2.9229249349905686e-06,
+      "loss": 0.7257,
+      "step": 1366
+    },
+    {
+      "epoch": 1.2954276237858329,
+      "grad_norm": 1.0411288552368323,
+      "learning_rate": 2.9159437763377187e-06,
+      "loss": 0.7057,
+      "step": 1367
+    },
+    {
+      "epoch": 1.2963752665245203,
+      "grad_norm": 1.1200448529620028,
+      "learning_rate": 2.908967531262618e-06,
+      "loss": 0.7623,
+      "step": 1368
+    },
+    {
+      "epoch": 1.2973229092632077,
+      "grad_norm": 1.0477643207405842,
+      "learning_rate": 2.9019962162131564e-06,
+      "loss": 0.6169,
+      "step": 1369
+    },
+    {
+      "epoch": 1.2982705520018953,
+      "grad_norm": 1.008136764087855,
+      "learning_rate": 2.895029847625595e-06,
+      "loss": 0.6862,
+      "step": 1370
+    },
+    {
+      "epoch": 1.2992181947405828,
+      "grad_norm": 1.0600688588701463,
+      "learning_rate": 2.8880684419245387e-06,
+      "loss": 0.7149,
+      "step": 1371
+    },
+    {
+      "epoch": 1.3001658374792704,
+      "grad_norm": 1.0027128694027476,
+      "learning_rate": 2.8811120155228843e-06,
+      "loss": 0.7366,
+      "step": 1372
+    },
+    {
+      "epoch": 1.3011134802179578,
+      "grad_norm": 1.4069108299597186,
+      "learning_rate": 2.874160584821798e-06,
+      "loss": 0.7393,
+      "step": 1373
+    },
+    {
+      "epoch": 1.3020611229566454,
+      "grad_norm": 1.0338612942301875,
+      "learning_rate": 2.8672141662106577e-06,
+      "loss": 0.7036,
+      "step": 1374
+    },
+    {
+      "epoch": 1.3030087656953329,
+      "grad_norm": 1.0688910868363117,
+      "learning_rate": 2.8602727760670336e-06,
+      "loss": 0.7306,
+      "step": 1375
+    },
+    {
+      "epoch": 1.3039564084340203,
+      "grad_norm": 1.242518786141904,
+      "learning_rate": 2.8533364307566313e-06,
+      "loss": 0.6862,
+      "step": 1376
+    },
+    {
+      "epoch": 1.304904051172708,
+      "grad_norm": 1.1911216173581962,
+      "learning_rate": 2.846405146633269e-06,
+      "loss": 0.7568,
+      "step": 1377
+    },
+    {
+      "epoch": 1.3058516939113953,
+      "grad_norm": 1.0039693235836689,
+      "learning_rate": 2.839478940038833e-06,
+      "loss": 0.6523,
+      "step": 1378
+    },
+    {
+      "epoch": 1.306799336650083,
+      "grad_norm": 1.072057083327897,
+      "learning_rate": 2.8325578273032295e-06,
+      "loss": 0.7036,
+      "step": 1379
+    },
+    {
+      "epoch": 1.3077469793887704,
+      "grad_norm": 0.9993390017662981,
+      "learning_rate": 2.8256418247443664e-06,
+      "loss": 0.6887,
+      "step": 1380
+    },
+    {
+      "epoch": 1.308694622127458,
+      "grad_norm": 0.9454808776048585,
+      "learning_rate": 2.8187309486680924e-06,
+      "loss": 0.7237,
+      "step": 1381
+    },
+    {
+      "epoch": 1.3096422648661454,
+      "grad_norm": 0.9371803052679708,
+      "learning_rate": 2.811825215368179e-06,
+      "loss": 0.7279,
+      "step": 1382
+    },
+    {
+      "epoch": 1.3105899076048328,
+      "grad_norm": 1.2827676657928362,
+      "learning_rate": 2.804924641126264e-06,
+      "loss": 0.6878,
+      "step": 1383
+    },
+    {
+      "epoch": 1.3115375503435205,
+      "grad_norm": 1.1459955089807479,
+      "learning_rate": 2.7980292422118282e-06,
+      "loss": 0.7615,
+      "step": 1384
+    },
+    {
+      "epoch": 1.3124851930822081,
+      "grad_norm": 1.0020748955412062,
+      "learning_rate": 2.791139034882151e-06,
+      "loss": 0.7376,
+      "step": 1385
+    },
+    {
+      "epoch": 1.3134328358208955,
+      "grad_norm": 1.277527875810019,
+      "learning_rate": 2.7842540353822634e-06,
+      "loss": 0.7209,
+      "step": 1386
+    },
+    {
+      "epoch": 1.3134328358208955,
+      "eval_loss": 0.91898512840271,
+      "eval_runtime": 64.7739,
+      "eval_samples_per_second": 42.116,
+      "eval_steps_per_second": 0.664,
+      "step": 1386
+    },
+    {
+      "epoch": 1.314380478559583,
+      "grad_norm": 1.1181626221192054,
+      "learning_rate": 2.777374259944929e-06,
+      "loss": 0.7057,
+      "step": 1387
+    },
+    {
+      "epoch": 1.3153281212982706,
+      "grad_norm": 0.9400452950748897,
+      "learning_rate": 2.770499724790584e-06,
+      "loss": 0.721,
+      "step": 1388
+    },
+    {
+      "epoch": 1.316275764036958,
+      "grad_norm": 1.0147521664502355,
+      "learning_rate": 2.763630446127319e-06,
+      "loss": 0.7199,
+      "step": 1389
+    },
+    {
+      "epoch": 1.3172234067756456,
+      "grad_norm": 0.9881704878801858,
+      "learning_rate": 2.7567664401508225e-06,
+      "loss": 0.7116,
+      "step": 1390
+    },
+    {
+      "epoch": 1.318171049514333,
+      "grad_norm": 0.9795454208939292,
+      "learning_rate": 2.7499077230443607e-06,
+      "loss": 0.6953,
+      "step": 1391
+    },
+    {
+      "epoch": 1.3191186922530207,
+      "grad_norm": 1.0069354879696941,
+      "learning_rate": 2.743054310978722e-06,
+      "loss": 0.7098,
+      "step": 1392
+    },
+    {
+      "epoch": 1.3200663349917081,
+      "grad_norm": 0.9429203635412998,
+      "learning_rate": 2.736206220112192e-06,
+      "loss": 0.7004,
+      "step": 1393
+    },
+    {
+      "epoch": 1.3210139777303955,
+      "grad_norm": 1.0682307544212157,
+      "learning_rate": 2.729363466590511e-06,
+      "loss": 0.6745,
+      "step": 1394
+    },
+    {
+      "epoch": 1.3219616204690832,
+      "grad_norm": 1.0079367947967734,
+      "learning_rate": 2.72252606654683e-06,
+      "loss": 0.6362,
+      "step": 1395
+    },
+    {
+      "epoch": 1.3229092632077708,
+      "grad_norm": 0.936909554532062,
+      "learning_rate": 2.7156940361016864e-06,
+      "loss": 0.7282,
+      "step": 1396
+    },
+    {
+      "epoch": 1.3238569059464582,
+      "grad_norm": 1.1619360098958085,
+      "learning_rate": 2.708867391362948e-06,
+      "loss": 0.759,
+      "step": 1397
+    },
+    {
+      "epoch": 1.3248045486851456,
+      "grad_norm": 1.0332898627783107,
+      "learning_rate": 2.7020461484257952e-06,
+      "loss": 0.7224,
+      "step": 1398
+    },
+    {
+      "epoch": 1.3257521914238333,
+      "grad_norm": 1.0120053743179587,
+      "learning_rate": 2.6952303233726628e-06,
+      "loss": 0.7007,
+      "step": 1399
+    },
+    {
+      "epoch": 1.3266998341625207,
+      "grad_norm": 1.0182348658415017,
+      "learning_rate": 2.6884199322732192e-06,
+      "loss": 0.7364,
+      "step": 1400
+    },
+    {
+      "epoch": 1.3276474769012083,
+      "grad_norm": 1.0133603532948297,
+      "learning_rate": 2.681614991184315e-06,
+      "loss": 0.743,
+      "step": 1401
+    },
+    {
+      "epoch": 1.3285951196398957,
+      "grad_norm": 0.9924480633218891,
+      "learning_rate": 2.6748155161499568e-06,
+      "loss": 0.6545,
+      "step": 1402
+    },
+    {
+      "epoch": 1.3295427623785834,
+      "grad_norm": 1.127155396416284,
+      "learning_rate": 2.668021523201263e-06,
+      "loss": 0.7471,
+      "step": 1403
+    },
+    {
+      "epoch": 1.3304904051172708,
+      "grad_norm": 1.0775922171839278,
+      "learning_rate": 2.6612330283564226e-06,
+      "loss": 0.6713,
+      "step": 1404
+    },
+    {
+      "epoch": 1.3314380478559582,
+      "grad_norm": 0.9947061326854107,
+      "learning_rate": 2.6544500476206675e-06,
+      "loss": 0.6725,
+      "step": 1405
+    },
+    {
+      "epoch": 1.3323856905946458,
+      "grad_norm": 1.0208055783943726,
+      "learning_rate": 2.6476725969862227e-06,
+      "loss": 0.7577,
+      "step": 1406
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "grad_norm": 0.9200584970050147,
+      "learning_rate": 2.6409006924322824e-06,
+      "loss": 0.7277,
+      "step": 1407
+    },
+    {
+      "epoch": 1.334280976072021,
+      "grad_norm": 1.1839327711762426,
+      "learning_rate": 2.634134349924956e-06,
+      "loss": 0.7352,
+      "step": 1408
+    },
+    {
+      "epoch": 1.334280976072021,
+      "eval_loss": 0.9188514351844788,
+      "eval_runtime": 65.4529,
+      "eval_samples_per_second": 41.679,
+      "eval_steps_per_second": 0.657,
+      "step": 1408
+    },
+    {
+      "epoch": 1.3352286188107083,
+      "grad_norm": 1.0311881529332272,
+      "learning_rate": 2.6273735854172487e-06,
+      "loss": 0.6348,
+      "step": 1409
+    },
+    {
+      "epoch": 1.336176261549396,
+      "grad_norm": 0.869437188326139,
+      "learning_rate": 2.6206184148490066e-06,
+      "loss": 0.6783,
+      "step": 1410
+    },
+    {
+      "epoch": 1.3371239042880834,
+      "grad_norm": 0.9896388796869628,
+      "learning_rate": 2.6138688541468903e-06,
+      "loss": 0.6565,
+      "step": 1411
+    },
+    {
+      "epoch": 1.3380715470267708,
+      "grad_norm": 0.9975523350097485,
+      "learning_rate": 2.6071249192243365e-06,
+      "loss": 0.7388,
+      "step": 1412
+    },
+    {
+      "epoch": 1.3390191897654584,
+      "grad_norm": 3.814992564369548,
+      "learning_rate": 2.6003866259815123e-06,
+      "loss": 0.7403,
+      "step": 1413
+    },
+    {
+      "epoch": 1.339966832504146,
+      "grad_norm": 1.1226980770763029,
+      "learning_rate": 2.5936539903052893e-06,
+      "loss": 0.7311,
+      "step": 1414
+    },
+    {
+      "epoch": 1.3409144752428335,
+      "grad_norm": 1.0149265229287798,
+      "learning_rate": 2.5869270280691945e-06,
+      "loss": 0.6922,
+      "step": 1415
+    },
+    {
+      "epoch": 1.3418621179815209,
+      "grad_norm": 1.1097881040738804,
+      "learning_rate": 2.580205755133384e-06,
+      "loss": 0.6867,
+      "step": 1416
+    },
+    {
+      "epoch": 1.3428097607202085,
+      "grad_norm": 1.0437635962821175,
+      "learning_rate": 2.573490187344596e-06,
+      "loss": 0.6817,
+      "step": 1417
+    },
+    {
+      "epoch": 1.343757403458896,
+      "grad_norm": 1.0360266239174718,
+      "learning_rate": 2.5667803405361214e-06,
+      "loss": 0.7413,
+      "step": 1418
+    },
+    {
+      "epoch": 1.3447050461975836,
+      "grad_norm": 0.9107405093250232,
+      "learning_rate": 2.560076230527758e-06,
+      "loss": 0.6722,
+      "step": 1419
+    },
+    {
+      "epoch": 1.345652688936271,
+      "grad_norm": 0.9408273813849782,
+      "learning_rate": 2.5533778731257824e-06,
+      "loss": 0.7198,
+      "step": 1420
+    },
+    {
+      "epoch": 1.3466003316749586,
+      "grad_norm": 1.0137206933457714,
+      "learning_rate": 2.546685284122909e-06,
+      "loss": 0.6862,
+      "step": 1421
+    },
+    {
+      "epoch": 1.347547974413646,
+      "grad_norm": 1.207183923292191,
+      "learning_rate": 2.5399984792982457e-06,
+      "loss": 0.7163,
+      "step": 1422
+    },
+    {
+      "epoch": 1.3484956171523335,
+      "grad_norm": 0.9580259639674358,
+      "learning_rate": 2.5333174744172705e-06,
+      "loss": 0.7006,
+      "step": 1423
+    },
+    {
+      "epoch": 1.349443259891021,
+      "grad_norm": 1.0387469510787142,
+      "learning_rate": 2.5266422852317796e-06,
+      "loss": 0.66,
+      "step": 1424
+    },
+    {
+      "epoch": 1.3503909026297087,
+      "grad_norm": 1.0656383066236028,
+      "learning_rate": 2.5199729274798664e-06,
+      "loss": 0.7036,
+      "step": 1425
+    },
+    {
+      "epoch": 1.3513385453683961,
+      "grad_norm": 1.0545231585290438,
+      "learning_rate": 2.513309416885865e-06,
+      "loss": 0.643,
+      "step": 1426
+    },
+    {
+      "epoch": 1.3522861881070836,
+      "grad_norm": 1.1886784682255809,
+      "learning_rate": 2.5066517691603327e-06,
+      "loss": 0.6968,
+      "step": 1427
+    },
+    {
+      "epoch": 1.3532338308457712,
+      "grad_norm": 1.0686385881546185,
+      "learning_rate": 2.5000000000000015e-06,
+      "loss": 0.6948,
+      "step": 1428
+    },
+    {
+      "epoch": 1.3541814735844586,
+      "grad_norm": 1.022360298648431,
+      "learning_rate": 2.493354125087738e-06,
+      "loss": 0.6812,
+      "step": 1429
+    },
+    {
+      "epoch": 1.3551291163231463,
+      "grad_norm": 1.057656689299949,
+      "learning_rate": 2.4867141600925214e-06,
+      "loss": 0.7209,
+      "step": 1430
+    },
+    {
+      "epoch": 1.3551291163231463,
+      "eval_loss": 0.9181008338928223,
+      "eval_runtime": 61.8166,
+      "eval_samples_per_second": 44.131,
+      "eval_steps_per_second": 0.696,
+      "step": 1430
+    },
+    {
+      "epoch": 1.3560767590618337,
+      "grad_norm": 1.0044797087222561,
+      "learning_rate": 2.4800801206693873e-06,
+      "loss": 0.6994,
+      "step": 1431
+    },
+    {
+      "epoch": 1.3570244018005213,
+      "grad_norm": 1.1363501611998355,
+      "learning_rate": 2.4734520224594094e-06,
+      "loss": 0.6967,
+      "step": 1432
+    },
+    {
+      "epoch": 1.3579720445392087,
+      "grad_norm": 1.0735619665932887,
+      "learning_rate": 2.4668298810896463e-06,
+      "loss": 0.6615,
+      "step": 1433
+    },
+    {
+      "epoch": 1.3589196872778961,
+      "grad_norm": 1.2512092821838807,
+      "learning_rate": 2.4602137121731195e-06,
+      "loss": 0.7226,
+      "step": 1434
+    },
+    {
+      "epoch": 1.3598673300165838,
+      "grad_norm": 1.0104639217781053,
+      "learning_rate": 2.4536035313087603e-06,
+      "loss": 0.7748,
+      "step": 1435
+    },
+    {
+      "epoch": 1.3608149727552712,
+      "grad_norm": 0.945616005794055,
+      "learning_rate": 2.44699935408139e-06,
+      "loss": 0.7169,
+      "step": 1436
+    },
+    {
+      "epoch": 1.3617626154939588,
+      "grad_norm": 1.1241609779341695,
+      "learning_rate": 2.4404011960616747e-06,
+      "loss": 0.6734,
+      "step": 1437
+    },
+    {
+      "epoch": 1.3627102582326462,
+      "grad_norm": 1.321111844351736,
+      "learning_rate": 2.4338090728060808e-06,
+      "loss": 0.7567,
+      "step": 1438
+    },
+    {
+      "epoch": 1.3636579009713339,
+      "grad_norm": 0.9913817986983521,
+      "learning_rate": 2.4272229998568576e-06,
+      "loss": 0.6312,
+      "step": 1439
+    },
+    {
+      "epoch": 1.3646055437100213,
+      "grad_norm": 1.0071709428393925,
+      "learning_rate": 2.4206429927419795e-06,
+      "loss": 0.6763,
+      "step": 1440
+    },
+    {
+      "epoch": 1.3655531864487087,
+      "grad_norm": 1.0356649949239918,
+      "learning_rate": 2.414069066975128e-06,
+      "loss": 0.6461,
+      "step": 1441
+    },
+    {
+      "epoch": 1.3665008291873963,
+      "grad_norm": 0.9723682213152954,
+      "learning_rate": 2.40750123805564e-06,
+      "loss": 0.6884,
+      "step": 1442
+    },
+    {
+      "epoch": 1.367448471926084,
+      "grad_norm": 0.9948531066652241,
+      "learning_rate": 2.400939521468484e-06,
+      "loss": 0.7155,
+      "step": 1443
+    },
+    {
+      "epoch": 1.3683961146647714,
+      "grad_norm": 0.9940870834143998,
+      "learning_rate": 2.3943839326842096e-06,
+      "loss": 0.6657,
+      "step": 1444
+    },
+    {
+      "epoch": 1.3693437574034588,
+      "grad_norm": 1.1771867828127296,
+      "learning_rate": 2.387834487158926e-06,
+      "loss": 0.7088,
+      "step": 1445
+    },
+    {
+      "epoch": 1.3702914001421465,
+      "grad_norm": 1.0503694972372863,
+      "learning_rate": 2.381291200334257e-06,
+      "loss": 0.7379,
+      "step": 1446
+    },
+    {
+      "epoch": 1.3712390428808339,
+      "grad_norm": 1.0357654532991112,
+      "learning_rate": 2.3747540876373026e-06,
+      "loss": 0.6843,
+      "step": 1447
+    },
+    {
+      "epoch": 1.3721866856195215,
+      "grad_norm": 0.9841256078266499,
+      "learning_rate": 2.368223164480611e-06,
+      "loss": 0.7251,
+      "step": 1448
+    },
+    {
+      "epoch": 1.373134328358209,
+      "grad_norm": 0.943494428045495,
+      "learning_rate": 2.3616984462621307e-06,
+      "loss": 0.756,
+      "step": 1449
+    },
+    {
+      "epoch": 1.3740819710968966,
+      "grad_norm": 0.9561466461280033,
+      "learning_rate": 2.3551799483651894e-06,
+      "loss": 0.6926,
+      "step": 1450
+    },
+    {
+      "epoch": 1.375029613835584,
+      "grad_norm": 1.2692497517950982,
+      "learning_rate": 2.348667686158441e-06,
+      "loss": 0.6878,
+      "step": 1451
+    },
+    {
+      "epoch": 1.3759772565742714,
+      "grad_norm": 1.2205664546373765,
+      "learning_rate": 2.342161674995843e-06,
+      "loss": 0.7187,
+      "step": 1452
+    },
+    {
+      "epoch": 1.3759772565742714,
+      "eval_loss": 0.9179805517196655,
+      "eval_runtime": 66.4952,
+      "eval_samples_per_second": 41.026,
+      "eval_steps_per_second": 0.647,
+      "step": 1452
+    },
+    {
+      "epoch": 1.376924899312959,
+      "grad_norm": 0.9875355272859734,
+      "learning_rate": 2.335661930216611e-06,
+      "loss": 0.6266,
+      "step": 1453
+    },
+    {
+      "epoch": 1.3778725420516467,
+      "grad_norm": 1.0518188044284036,
+      "learning_rate": 2.3291684671451905e-06,
+      "loss": 0.6734,
+      "step": 1454
+    },
+    {
+      "epoch": 1.378820184790334,
+      "grad_norm": 1.0155553726238449,
+      "learning_rate": 2.322681301091214e-06,
+      "loss": 0.6737,
+      "step": 1455
+    },
+    {
+      "epoch": 1.3797678275290215,
+      "grad_norm": 1.127782826546028,
+      "learning_rate": 2.316200447349466e-06,
+      "loss": 0.7146,
+      "step": 1456
+    },
+    {
+      "epoch": 1.3807154702677091,
+      "grad_norm": 1.0784960552341187,
+      "learning_rate": 2.3097259211998536e-06,
+      "loss": 0.7501,
+      "step": 1457
+    },
+    {
+      "epoch": 1.3816631130063965,
+      "grad_norm": 0.9949725127797444,
+      "learning_rate": 2.3032577379073577e-06,
+      "loss": 0.7015,
+      "step": 1458
+    },
+    {
+      "epoch": 1.3826107557450842,
+      "grad_norm": 0.9785164181620558,
+      "learning_rate": 2.296795912722014e-06,
+      "loss": 0.7015,
+      "step": 1459
+    },
+    {
+      "epoch": 1.3835583984837716,
+      "grad_norm": 1.126981467512306,
+      "learning_rate": 2.2903404608788582e-06,
+      "loss": 0.6766,
+      "step": 1460
+    },
+    {
+      "epoch": 1.3845060412224592,
+      "grad_norm": 1.0927804032807178,
+      "learning_rate": 2.283891397597908e-06,
+      "loss": 0.6693,
+      "step": 1461
+    },
+    {
+      "epoch": 1.3854536839611467,
+      "grad_norm": 1.0258483620038565,
+      "learning_rate": 2.2774487380841116e-06,
+      "loss": 0.6607,
+      "step": 1462
+    },
+    {
+      "epoch": 1.386401326699834,
+      "grad_norm": 1.0139207350796542,
+      "learning_rate": 2.2710124975273236e-06,
+      "loss": 0.7301,
+      "step": 1463
+    },
+    {
+      "epoch": 1.3873489694385217,
+      "grad_norm": 1.2623014945186244,
+      "learning_rate": 2.2645826911022656e-06,
+      "loss": 0.6878,
+      "step": 1464
+    },
+    {
+      "epoch": 1.3882966121772091,
+      "grad_norm": 1.0697558784167414,
+      "learning_rate": 2.258159333968484e-06,
+      "loss": 0.7058,
+      "step": 1465
+    },
+    {
+      "epoch": 1.3892442549158968,
+      "grad_norm": 1.1679882747027204,
+      "learning_rate": 2.2517424412703256e-06,
+      "loss": 0.7337,
+      "step": 1466
+    },
+    {
+      "epoch": 1.3901918976545842,
+      "grad_norm": 1.0802296481928269,
+      "learning_rate": 2.2453320281368903e-06,
+      "loss": 0.686,
+      "step": 1467
+    },
+    {
+      "epoch": 1.3911395403932718,
+      "grad_norm": 0.9794786330880751,
+      "learning_rate": 2.2389281096820077e-06,
+      "loss": 0.7638,
+      "step": 1468
+    },
+    {
+      "epoch": 1.3920871831319592,
+      "grad_norm": 0.9835019275382246,
+      "learning_rate": 2.2325307010041874e-06,
+      "loss": 0.7598,
+      "step": 1469
+    },
+    {
+      "epoch": 1.3930348258706466,
+      "grad_norm": 1.239471343857703,
+      "learning_rate": 2.2261398171865976e-06,
+      "loss": 0.6944,
+      "step": 1470
+    },
+    {
+      "epoch": 1.3939824686093343,
+      "grad_norm": 1.0053386117085916,
+      "learning_rate": 2.21975547329702e-06,
+      "loss": 0.6756,
+      "step": 1471
+    },
+    {
+      "epoch": 1.394930111348022,
+      "grad_norm": 1.0689464802675128,
+      "learning_rate": 2.2133776843878185e-06,
+      "loss": 0.7674,
+      "step": 1472
+    },
+    {
+      "epoch": 1.3958777540867093,
+      "grad_norm": 1.345586768860224,
+      "learning_rate": 2.207006465495898e-06,
+      "loss": 0.6936,
+      "step": 1473
+    },
+    {
+      "epoch": 1.3968253968253967,
+      "grad_norm": 1.155638754133062,
+      "learning_rate": 2.2006418316426773e-06,
+      "loss": 0.6912,
+      "step": 1474
+    },
+    {
+      "epoch": 1.3968253968253967,
+      "eval_loss": 0.9174560308456421,
+      "eval_runtime": 65.4877,
+      "eval_samples_per_second": 41.657,
+      "eval_steps_per_second": 0.657,
+      "step": 1474
+    },
+    {
+      "epoch": 1.3977730395640844,
+      "grad_norm": 1.7458254846447263,
+      "learning_rate": 2.1942837978340516e-06,
+      "loss": 0.7289,
+      "step": 1475
+    },
+    {
+      "epoch": 1.3987206823027718,
+      "grad_norm": 1.104499286780302,
+      "learning_rate": 2.187932379060348e-06,
+      "loss": 0.6773,
+      "step": 1476
+    },
+    {
+      "epoch": 1.3996683250414594,
+      "grad_norm": 1.0183691386946678,
+      "learning_rate": 2.1815875902963058e-06,
+      "loss": 0.7138,
+      "step": 1477
+    },
+    {
+      "epoch": 1.4006159677801469,
+      "grad_norm": 0.9953027863427754,
+      "learning_rate": 2.175249446501024e-06,
+      "loss": 0.6644,
+      "step": 1478
+    },
+    {
+      "epoch": 1.4015636105188345,
+      "grad_norm": 1.119903318819432,
+      "learning_rate": 2.1689179626179442e-06,
+      "loss": 0.673,
+      "step": 1479
+    },
+    {
+      "epoch": 1.402511253257522,
+      "grad_norm": 1.01161813324734,
+      "learning_rate": 2.1625931535747964e-06,
+      "loss": 0.7104,
+      "step": 1480
+    },
+    {
+      "epoch": 1.4034588959962093,
+      "grad_norm": 0.953534813718956,
+      "learning_rate": 2.1562750342835827e-06,
+      "loss": 0.7277,
+      "step": 1481
+    },
+    {
+      "epoch": 1.404406538734897,
+      "grad_norm": 1.1396651805754336,
+      "learning_rate": 2.1499636196405225e-06,
+      "loss": 0.7227,
+      "step": 1482
+    },
+    {
+      "epoch": 1.4053541814735846,
+      "grad_norm": 1.5386081307384067,
+      "learning_rate": 2.1436589245260375e-06,
+      "loss": 0.668,
+      "step": 1483
+    },
+    {
+      "epoch": 1.406301824212272,
+      "grad_norm": 0.9995908645331962,
+      "learning_rate": 2.1373609638047033e-06,
+      "loss": 0.7043,
+      "step": 1484
+    },
+    {
+      "epoch": 1.4072494669509594,
+      "grad_norm": 1.10383243717245,
+      "learning_rate": 2.1310697523252126e-06,
+      "loss": 0.7026,
+      "step": 1485
+    },
+    {
+      "epoch": 1.408197109689647,
+      "grad_norm": 1.0267277972723963,
+      "learning_rate": 2.1247853049203543e-06,
+      "loss": 0.7082,
+      "step": 1486
+    },
+    {
+      "epoch": 1.4091447524283345,
+      "grad_norm": 1.0496913097987108,
+      "learning_rate": 2.118507636406962e-06,
+      "loss": 0.6481,
+      "step": 1487
+    },
+    {
+      "epoch": 1.410092395167022,
+      "grad_norm": 1.1897337956700562,
+      "learning_rate": 2.112236761585892e-06,
+      "loss": 0.7089,
+      "step": 1488
+    },
+    {
+      "epoch": 1.4110400379057095,
+      "grad_norm": 0.9785065766450801,
+      "learning_rate": 2.1059726952419782e-06,
+      "loss": 0.7485,
+      "step": 1489
+    },
+    {
+      "epoch": 1.4119876806443972,
+      "grad_norm": 1.0121891810944206,
+      "learning_rate": 2.09971545214401e-06,
+      "loss": 0.7133,
+      "step": 1490
+    },
+    {
+      "epoch": 1.4129353233830846,
+      "grad_norm": 1.292470404393716,
+      "learning_rate": 2.0934650470446788e-06,
+      "loss": 0.6978,
+      "step": 1491
+    },
+    {
+      "epoch": 1.413882966121772,
+      "grad_norm": 0.9544540655235114,
+      "learning_rate": 2.087221494680563e-06,
+      "loss": 0.7313,
+      "step": 1492
+    },
+    {
+      "epoch": 1.4148306088604596,
+      "grad_norm": 1.1659180349275824,
+      "learning_rate": 2.0809848097720823e-06,
+      "loss": 0.6451,
+      "step": 1493
+    },
+    {
+      "epoch": 1.415778251599147,
+      "grad_norm": 1.037380174383666,
+      "learning_rate": 2.074755007023461e-06,
+      "loss": 0.7405,
+      "step": 1494
+    },
+    {
+      "epoch": 1.4167258943378347,
+      "grad_norm": 1.0242249417898568,
+      "learning_rate": 2.068532101122704e-06,
+      "loss": 0.6708,
+      "step": 1495
+    },
+    {
+      "epoch": 1.417673537076522,
+      "grad_norm": 0.9620726481692868,
+      "learning_rate": 2.0623161067415463e-06,
+      "loss": 0.6707,
+      "step": 1496
+    },
+    {
+      "epoch": 1.417673537076522,
+      "eval_loss": 0.9181029200553894,
+      "eval_runtime": 64.509,
+      "eval_samples_per_second": 42.289,
+      "eval_steps_per_second": 0.667,
+      "step": 1496
+    },
+    {
+      "epoch": 1.4186211798152097,
+      "grad_norm": 1.005417588718906,
+      "learning_rate": 2.0561070385354388e-06,
+      "loss": 0.6731,
+      "step": 1497
+    },
+    {
+      "epoch": 1.4195688225538972,
+      "grad_norm": 1.1896671861120682,
+      "learning_rate": 2.0499049111434922e-06,
+      "loss": 0.7227,
+      "step": 1498
+    },
+    {
+      "epoch": 1.4205164652925846,
+      "grad_norm": 2.0023777404039786,
+      "learning_rate": 2.0437097391884613e-06,
+      "loss": 0.6868,
+      "step": 1499
+    },
+    {
+      "epoch": 1.4214641080312722,
+      "grad_norm": 1.0629425622129343,
+      "learning_rate": 2.0375215372766944e-06,
+      "loss": 0.6846,
+      "step": 1500
+    },
+    {
+      "epoch": 1.4224117507699598,
+      "grad_norm": 0.9445974760544792,
+      "learning_rate": 2.0313403199981125e-06,
+      "loss": 0.7394,
+      "step": 1501
+    },
+    {
+      "epoch": 1.4233593935086473,
+      "grad_norm": 0.9843138879504003,
+      "learning_rate": 2.025166101926168e-06,
+      "loss": 0.7182,
+      "step": 1502
+    },
+    {
+      "epoch": 1.4243070362473347,
+      "grad_norm": 0.9243254028077557,
+      "learning_rate": 2.018998897617808e-06,
+      "loss": 0.6837,
+      "step": 1503
+    },
+    {
+      "epoch": 1.4252546789860223,
+      "grad_norm": 1.0253863539461858,
+      "learning_rate": 2.012838721613447e-06,
+      "loss": 0.667,
+      "step": 1504
+    },
+    {
+      "epoch": 1.4262023217247097,
+      "grad_norm": 1.327749397929995,
+      "learning_rate": 2.0066855884369246e-06,
+      "loss": 0.7177,
+      "step": 1505
+    },
+    {
+      "epoch": 1.4271499644633974,
+      "grad_norm": 1.1076042384170708,
+      "learning_rate": 2.0005395125954814e-06,
+      "loss": 0.7841,
+      "step": 1506
+    },
+    {
+      "epoch": 1.4280976072020848,
+      "grad_norm": 1.037107571912964,
+      "learning_rate": 1.9944005085797124e-06,
+      "loss": 0.6346,
+      "step": 1507
+    },
+    {
+      "epoch": 1.4290452499407724,
+      "grad_norm": 1.0616935978252389,
+      "learning_rate": 1.988268590863546e-06,
+      "loss": 0.7287,
+      "step": 1508
+    },
+    {
+      "epoch": 1.4299928926794598,
+      "grad_norm": 1.1394211171878255,
+      "learning_rate": 1.982143773904197e-06,
+      "loss": 0.7026,
+      "step": 1509
+    },
+    {
+      "epoch": 1.4309405354181473,
+      "grad_norm": 1.0108800469903032,
+      "learning_rate": 1.9760260721421426e-06,
+      "loss": 0.722,
+      "step": 1510
+    },
+    {
+      "epoch": 1.431888178156835,
+      "grad_norm": 1.0306788000447762,
+      "learning_rate": 1.9699155000010853e-06,
+      "loss": 0.6762,
+      "step": 1511
+    },
+    {
+      "epoch": 1.4328358208955223,
+      "grad_norm": 0.963103281240134,
+      "learning_rate": 1.9638120718879133e-06,
+      "loss": 0.7084,
+      "step": 1512
+    },
+    {
+      "epoch": 1.43378346363421,
+      "grad_norm": 0.9474369197260795,
+      "learning_rate": 1.9577158021926774e-06,
+      "loss": 0.6745,
+      "step": 1513
+    },
+    {
+      "epoch": 1.4347311063728974,
+      "grad_norm": 1.9939781034695376,
+      "learning_rate": 1.951626705288544e-06,
+      "loss": 0.7011,
+      "step": 1514
+    },
+    {
+      "epoch": 1.435678749111585,
+      "grad_norm": 0.9725792085154051,
+      "learning_rate": 1.945544795531777e-06,
+      "loss": 0.7155,
+      "step": 1515
+    },
+    {
+      "epoch": 1.4366263918502724,
+      "grad_norm": 1.1197678082060893,
+      "learning_rate": 1.9394700872616856e-06,
+      "loss": 0.6595,
+      "step": 1516
+    },
+    {
+      "epoch": 1.4375740345889598,
+      "grad_norm": 1.0301457084495482,
+      "learning_rate": 1.9334025948006074e-06,
+      "loss": 0.6955,
+      "step": 1517
+    },
+    {
+      "epoch": 1.4385216773276475,
+      "grad_norm": 0.9348949706000595,
+      "learning_rate": 1.927342332453866e-06,
+      "loss": 0.6047,
+      "step": 1518
+    },
+    {
+      "epoch": 1.4385216773276475,
+      "eval_loss": 0.9176700115203857,
+      "eval_runtime": 67.6785,
+      "eval_samples_per_second": 40.308,
+      "eval_steps_per_second": 0.635,
+      "step": 1518
+    },
+    {
+      "epoch": 1.439469320066335,
+      "grad_norm": 1.0644335037153672,
+      "learning_rate": 1.921289314509734e-06,
+      "loss": 0.7127,
+      "step": 1519
+    },
+    {
+      "epoch": 1.4404169628050225,
+      "grad_norm": 1.0834219671738452,
+      "learning_rate": 1.9152435552394105e-06,
+      "loss": 0.7215,
+      "step": 1520
+    },
+    {
+      "epoch": 1.44136460554371,
+      "grad_norm": 1.0745604461477756,
+      "learning_rate": 1.9092050688969736e-06,
+      "loss": 0.678,
+      "step": 1521
+    },
+    {
+      "epoch": 1.4423122482823976,
+      "grad_norm": 1.0371641556496103,
+      "learning_rate": 1.9031738697193618e-06,
+      "loss": 0.633,
+      "step": 1522
+    },
+    {
+      "epoch": 1.443259891021085,
+      "grad_norm": 1.2536801282599777,
+      "learning_rate": 1.8971499719263253e-06,
+      "loss": 0.6985,
+      "step": 1523
+    },
+    {
+      "epoch": 1.4442075337597726,
+      "grad_norm": 0.9951235851029153,
+      "learning_rate": 1.8911333897204071e-06,
+      "loss": 0.7719,
+      "step": 1524
+    },
+    {
+      "epoch": 1.44515517649846,
+      "grad_norm": 1.0116038011102524,
+      "learning_rate": 1.8851241372868938e-06,
+      "loss": 0.6848,
+      "step": 1525
+    },
+    {
+      "epoch": 1.4461028192371477,
+      "grad_norm": 1.4604330607664564,
+      "learning_rate": 1.8791222287937983e-06,
+      "loss": 0.7657,
+      "step": 1526
+    },
+    {
+      "epoch": 1.447050461975835,
+      "grad_norm": 1.0830259379657234,
+      "learning_rate": 1.8731276783918162e-06,
+      "loss": 0.6805,
+      "step": 1527
+    },
+    {
+      "epoch": 1.4479981047145225,
+      "grad_norm": 1.0267731594205933,
+      "learning_rate": 1.8671405002142918e-06,
+      "loss": 0.6707,
+      "step": 1528
+    },
+    {
+      "epoch": 1.4489457474532101,
+      "grad_norm": 1.0461880905401142,
+      "learning_rate": 1.8611607083771931e-06,
+      "loss": 0.7222,
+      "step": 1529
+    },
+    {
+      "epoch": 1.4498933901918978,
+      "grad_norm": 1.5078994577302969,
+      "learning_rate": 1.855188316979068e-06,
+      "loss": 0.7749,
+      "step": 1530
+    },
+    {
+      "epoch": 1.4508410329305852,
+      "grad_norm": 0.9580998087747682,
+      "learning_rate": 1.8492233401010218e-06,
+      "loss": 0.6656,
+      "step": 1531
+    },
+    {
+      "epoch": 1.4517886756692726,
+      "grad_norm": 1.030790506291452,
+      "learning_rate": 1.8432657918066732e-06,
+      "loss": 0.6938,
+      "step": 1532
+    },
+    {
+      "epoch": 1.4527363184079602,
+      "grad_norm": 1.0172225305837883,
+      "learning_rate": 1.8373156861421327e-06,
+      "loss": 0.6944,
+      "step": 1533
+    },
+    {
+      "epoch": 1.4536839611466477,
+      "grad_norm": 1.0683718679060756,
+      "learning_rate": 1.831373037135955e-06,
+      "loss": 0.6548,
+      "step": 1534
+    },
+    {
+      "epoch": 1.4546316038853353,
+      "grad_norm": 1.0948584996468758,
+      "learning_rate": 1.8254378587991229e-06,
+      "loss": 0.7163,
+      "step": 1535
+    },
+    {
+      "epoch": 1.4555792466240227,
+      "grad_norm": 0.9144325805834296,
+      "learning_rate": 1.819510165125002e-06,
+      "loss": 0.6897,
+      "step": 1536
+    },
+    {
+      "epoch": 1.4565268893627104,
+      "grad_norm": 1.0047087823053469,
+      "learning_rate": 1.813589970089308e-06,
+      "loss": 0.682,
+      "step": 1537
+    },
+    {
+      "epoch": 1.4574745321013978,
+      "grad_norm": 1.0404416866339528,
+      "learning_rate": 1.8076772876500831e-06,
+      "loss": 0.7615,
+      "step": 1538
+    },
+    {
+      "epoch": 1.4584221748400852,
+      "grad_norm": 0.9884869531835868,
+      "learning_rate": 1.8017721317476517e-06,
+      "loss": 0.7436,
+      "step": 1539
+    },
+    {
+      "epoch": 1.4593698175787728,
+      "grad_norm": 1.1394802881055484,
+      "learning_rate": 1.7958745163045987e-06,
+      "loss": 0.6969,
+      "step": 1540
+    },
+    {
+      "epoch": 1.4593698175787728,
+      "eval_loss": 0.9182996153831482,
+      "eval_runtime": 64.6749,
+      "eval_samples_per_second": 42.18,
+      "eval_steps_per_second": 0.665,
+      "step": 1540
+    },
+    {
+      "epoch": 1.4603174603174602,
+      "grad_norm": 0.9439166549447204,
+      "learning_rate": 1.7899844552257233e-06,
+      "loss": 0.6422,
+      "step": 1541
+    },
+    {
+      "epoch": 1.4612651030561479,
+      "grad_norm": 1.0656288471157296,
+      "learning_rate": 1.7841019623980215e-06,
+      "loss": 0.7706,
+      "step": 1542
+    },
+    {
+      "epoch": 1.4622127457948353,
+      "grad_norm": 1.1222434957696432,
+      "learning_rate": 1.778227051690639e-06,
+      "loss": 0.7507,
+      "step": 1543
+    },
+    {
+      "epoch": 1.463160388533523,
+      "grad_norm": 0.983832268950965,
+      "learning_rate": 1.77235973695485e-06,
+      "loss": 0.6955,
+      "step": 1544
+    },
+    {
+      "epoch": 1.4641080312722103,
+      "grad_norm": 1.2587712766095054,
+      "learning_rate": 1.76650003202402e-06,
+      "loss": 0.6477,
+      "step": 1545
+    },
+    {
+      "epoch": 1.4650556740108978,
+      "grad_norm": 1.0066028742449087,
+      "learning_rate": 1.760647950713566e-06,
+      "loss": 0.7544,
+      "step": 1546
+    },
+    {
+      "epoch": 1.4660033167495854,
+      "grad_norm": 1.7078692165257152,
+      "learning_rate": 1.7548035068209402e-06,
+      "loss": 0.6756,
+      "step": 1547
+    },
+    {
+      "epoch": 1.466950959488273,
+      "grad_norm": 1.166304208347539,
+      "learning_rate": 1.7489667141255801e-06,
+      "loss": 0.7093,
+      "step": 1548
+    },
+    {
+      "epoch": 1.4678986022269604,
+      "grad_norm": 1.1980159481547823,
+      "learning_rate": 1.74313758638889e-06,
+      "loss": 0.6749,
+      "step": 1549
+    },
+    {
+      "epoch": 1.4688462449656479,
+      "grad_norm": 1.1944578051622323,
+      "learning_rate": 1.7373161373541968e-06,
+      "loss": 0.7281,
+      "step": 1550
+    },
+    {
+      "epoch": 1.4697938877043355,
+      "grad_norm": 0.9485809631995211,
+      "learning_rate": 1.7315023807467297e-06,
+      "loss": 0.7248,
+      "step": 1551
+    },
+    {
+      "epoch": 1.470741530443023,
+      "grad_norm": 1.0931088608413588,
+      "learning_rate": 1.7256963302735752e-06,
+      "loss": 0.7378,
+      "step": 1552
+    },
+    {
+      "epoch": 1.4716891731817106,
+      "grad_norm": 1.060786787419658,
+      "learning_rate": 1.7198979996236548e-06,
+      "loss": 0.7155,
+      "step": 1553
+    },
+    {
+      "epoch": 1.472636815920398,
+      "grad_norm": 1.0652060238495478,
+      "learning_rate": 1.7141074024676913e-06,
+      "loss": 0.7045,
+      "step": 1554
+    },
+    {
+      "epoch": 1.4735844586590856,
+      "grad_norm": 1.0281307395838142,
+      "learning_rate": 1.7083245524581666e-06,
+      "loss": 0.6337,
+      "step": 1555
+    },
+    {
+      "epoch": 1.474532101397773,
+      "grad_norm": 1.0438612628712518,
+      "learning_rate": 1.702549463229305e-06,
+      "loss": 0.71,
+      "step": 1556
+    },
+    {
+      "epoch": 1.4754797441364604,
+      "grad_norm": 1.1014417394767868,
+      "learning_rate": 1.6967821483970277e-06,
+      "loss": 0.7179,
+      "step": 1557
+    },
+    {
+      "epoch": 1.476427386875148,
+      "grad_norm": 1.111247079863271,
+      "learning_rate": 1.6910226215589303e-06,
+      "loss": 0.7377,
+      "step": 1558
+    },
+    {
+      "epoch": 1.4773750296138357,
+      "grad_norm": 0.9295211830798729,
+      "learning_rate": 1.6852708962942426e-06,
+      "loss": 0.6809,
+      "step": 1559
+    },
+    {
+      "epoch": 1.4783226723525231,
+      "grad_norm": 0.962644045007234,
+      "learning_rate": 1.6795269861638041e-06,
+      "loss": 0.6314,
+      "step": 1560
+    },
+    {
+      "epoch": 1.4792703150912105,
+      "grad_norm": 1.0292591372583686,
+      "learning_rate": 1.6737909047100292e-06,
+      "loss": 0.6838,
+      "step": 1561
+    },
+    {
+      "epoch": 1.4802179578298982,
+      "grad_norm": 0.933451941605439,
+      "learning_rate": 1.6680626654568688e-06,
+      "loss": 0.6608,
+      "step": 1562
+    },
+    {
+      "epoch": 1.4802179578298982,
+      "eval_loss": 0.9174679517745972,
+      "eval_runtime": 65.8561,
+      "eval_samples_per_second": 41.424,
+      "eval_steps_per_second": 0.653,
+      "step": 1562
+    },
+    {
+      "epoch": 1.4811656005685856,
+      "grad_norm": 1.2955810557077383,
+      "learning_rate": 1.6623422819097916e-06,
+      "loss": 0.6458,
+      "step": 1563
+    },
+    {
+      "epoch": 1.4821132433072732,
+      "grad_norm": 1.0310551387137157,
+      "learning_rate": 1.6566297675557392e-06,
+      "loss": 0.6919,
+      "step": 1564
+    },
+    {
+      "epoch": 1.4830608860459606,
+      "grad_norm": 1.0315297687360494,
+      "learning_rate": 1.650925135863104e-06,
+      "loss": 0.7086,
+      "step": 1565
+    },
+    {
+      "epoch": 1.4840085287846483,
+      "grad_norm": 1.0726991322217692,
+      "learning_rate": 1.6452284002816893e-06,
+      "loss": 0.7162,
+      "step": 1566
+    },
+    {
+      "epoch": 1.4849561715233357,
+      "grad_norm": 0.9497387623431127,
+      "learning_rate": 1.6395395742426873e-06,
+      "loss": 0.7216,
+      "step": 1567
+    },
+    {
+      "epoch": 1.4859038142620231,
+      "grad_norm": 1.2380479809887557,
+      "learning_rate": 1.6338586711586358e-06,
+      "loss": 0.7606,
+      "step": 1568
+    },
+    {
+      "epoch": 1.4868514570007108,
+      "grad_norm": 1.0796844065547042,
+      "learning_rate": 1.6281857044233968e-06,
+      "loss": 0.7319,
+      "step": 1569
+    },
+    {
+      "epoch": 1.4877990997393982,
+      "grad_norm": 1.1057313142497514,
+      "learning_rate": 1.6225206874121219e-06,
+      "loss": 0.6829,
+      "step": 1570
+    },
+    {
+      "epoch": 1.4887467424780858,
+      "grad_norm": 0.9734912790760845,
+      "learning_rate": 1.6168636334812126e-06,
+      "loss": 0.7407,
+      "step": 1571
+    },
+    {
+      "epoch": 1.4896943852167732,
+      "grad_norm": 1.0421614980005374,
+      "learning_rate": 1.6112145559683057e-06,
+      "loss": 0.7287,
+      "step": 1572
+    },
+    {
+      "epoch": 1.4906420279554609,
+      "grad_norm": 1.04980460946954,
+      "learning_rate": 1.6055734681922225e-06,
+      "loss": 0.7045,
+      "step": 1573
+    },
+    {
+      "epoch": 1.4915896706941483,
+      "grad_norm": 0.9906252177237804,
+      "learning_rate": 1.5999403834529549e-06,
+      "loss": 0.7192,
+      "step": 1574
+    },
+    {
+      "epoch": 1.4925373134328357,
+      "grad_norm": 0.8864390607015001,
+      "learning_rate": 1.5943153150316192e-06,
+      "loss": 0.6814,
+      "step": 1575
+    },
+    {
+      "epoch": 1.4934849561715233,
+      "grad_norm": 1.0203127741430889,
+      "learning_rate": 1.588698276190438e-06,
+      "loss": 0.6962,
+      "step": 1576
+    },
+    {
+      "epoch": 1.494432598910211,
+      "grad_norm": 1.1100506177133262,
+      "learning_rate": 1.583089280172696e-06,
+      "loss": 0.7852,
+      "step": 1577
+    },
+    {
+      "epoch": 1.4953802416488984,
+      "grad_norm": 1.0162831638512881,
+      "learning_rate": 1.5774883402027208e-06,
+      "loss": 0.7059,
+      "step": 1578
+    },
+    {
+      "epoch": 1.4963278843875858,
+      "grad_norm": 0.9936052407213679,
+      "learning_rate": 1.5718954694858457e-06,
+      "loss": 0.6858,
+      "step": 1579
+    },
+    {
+      "epoch": 1.4972755271262734,
+      "grad_norm": 0.9088159832146572,
+      "learning_rate": 1.5663106812083746e-06,
+      "loss": 0.75,
+      "step": 1580
+    },
+    {
+      "epoch": 1.4982231698649608,
+      "grad_norm": 1.078879512723169,
+      "learning_rate": 1.5607339885375616e-06,
+      "loss": 0.7139,
+      "step": 1581
+    },
+    {
+      "epoch": 1.4991708126036485,
+      "grad_norm": 1.096073398162632,
+      "learning_rate": 1.555165404621567e-06,
+      "loss": 0.7086,
+      "step": 1582
+    },
+    {
+      "epoch": 1.500118455342336,
+      "grad_norm": 0.9334643534638518,
+      "learning_rate": 1.549604942589441e-06,
+      "loss": 0.7479,
+      "step": 1583
+    },
+    {
+      "epoch": 1.5010660980810235,
+      "grad_norm": 1.0170090962142344,
+      "learning_rate": 1.5440526155510766e-06,
+      "loss": 0.7369,
+      "step": 1584
+    },
+    {
+      "epoch": 1.5010660980810235,
+      "eval_loss": 0.9167425036430359,
+      "eval_runtime": 65.8794,
+      "eval_samples_per_second": 41.409,
+      "eval_steps_per_second": 0.653,
+      "step": 1584
+    },
+    {
+      "epoch": 1.502013740819711,
+      "grad_norm": 0.9547990185885695,
+      "learning_rate": 1.5385084365971947e-06,
+      "loss": 0.6959,
+      "step": 1585
+    },
+    {
+      "epoch": 1.5029613835583984,
+      "grad_norm": 1.0361542203838576,
+      "learning_rate": 1.5329724187992983e-06,
+      "loss": 0.7212,
+      "step": 1586
+    },
+    {
+      "epoch": 1.503909026297086,
+      "grad_norm": 0.9844297983728068,
+      "learning_rate": 1.527444575209654e-06,
+      "loss": 0.6246,
+      "step": 1587
+    },
+    {
+      "epoch": 1.5048566690357736,
+      "grad_norm": 1.1774971138059982,
+      "learning_rate": 1.5219249188612556e-06,
+      "loss": 0.7104,
+      "step": 1588
+    },
+    {
+      "epoch": 1.505804311774461,
+      "grad_norm": 1.0258042621080319,
+      "learning_rate": 1.5164134627677895e-06,
+      "loss": 0.8074,
+      "step": 1589
+    },
+    {
+      "epoch": 1.5067519545131485,
+      "grad_norm": 1.0211430093660283,
+      "learning_rate": 1.5109102199236152e-06,
+      "loss": 0.7133,
+      "step": 1590
+    },
+    {
+      "epoch": 1.507699597251836,
+      "grad_norm": 0.9827554066548617,
+      "learning_rate": 1.5054152033037206e-06,
+      "loss": 0.6725,
+      "step": 1591
+    },
+    {
+      "epoch": 1.5086472399905235,
+      "grad_norm": 1.039393638590845,
+      "learning_rate": 1.4999284258637054e-06,
+      "loss": 0.7353,
+      "step": 1592
+    },
+    {
+      "epoch": 1.509594882729211,
+      "grad_norm": 0.9523927797678573,
+      "learning_rate": 1.4944499005397372e-06,
+      "loss": 0.7171,
+      "step": 1593
+    },
+    {
+      "epoch": 1.5105425254678986,
+      "grad_norm": 1.1643227627630548,
+      "learning_rate": 1.488979640248534e-06,
+      "loss": 0.7277,
+      "step": 1594
+    },
+    {
+      "epoch": 1.5114901682065862,
+      "grad_norm": 1.3140724772089227,
+      "learning_rate": 1.483517657887321e-06,
+      "loss": 0.6989,
+      "step": 1595
+    },
+    {
+      "epoch": 1.5124378109452736,
+      "grad_norm": 1.1677523249682737,
+      "learning_rate": 1.4780639663338125e-06,
+      "loss": 0.6442,
+      "step": 1596
+    },
+    {
+      "epoch": 1.513385453683961,
+      "grad_norm": 1.054535982093413,
+      "learning_rate": 1.4726185784461726e-06,
+      "loss": 0.7267,
+      "step": 1597
+    },
+    {
+      "epoch": 1.5143330964226487,
+      "grad_norm": 1.0699556405778263,
+      "learning_rate": 1.467181507062987e-06,
+      "loss": 0.7513,
+      "step": 1598
+    },
+    {
+      "epoch": 1.5152807391613363,
+      "grad_norm": 1.0277582958250449,
+      "learning_rate": 1.4617527650032359e-06,
+      "loss": 0.7007,
+      "step": 1599
+    },
+    {
+      "epoch": 1.5162283819000237,
+      "grad_norm": 0.9386085483384984,
+      "learning_rate": 1.4563323650662586e-06,
+      "loss": 0.6309,
+      "step": 1600
+    },
+    {
+      "epoch": 1.5171760246387112,
+      "grad_norm": 0.9866666932951867,
+      "learning_rate": 1.45092032003173e-06,
+      "loss": 0.7236,
+      "step": 1601
+    },
+    {
+      "epoch": 1.5181236673773988,
+      "grad_norm": 1.1091828859845587,
+      "learning_rate": 1.4455166426596222e-06,
+      "loss": 0.6645,
+      "step": 1602
+    },
+    {
+      "epoch": 1.5190713101160862,
+      "grad_norm": 1.0719630621382366,
+      "learning_rate": 1.440121345690182e-06,
+      "loss": 0.6967,
+      "step": 1603
+    },
+    {
+      "epoch": 1.5200189528547736,
+      "grad_norm": 1.2153721393164998,
+      "learning_rate": 1.434734441843899e-06,
+      "loss": 0.6897,
+      "step": 1604
+    },
+    {
+      "epoch": 1.5209665955934613,
+      "grad_norm": 1.4318770673483734,
+      "learning_rate": 1.4293559438214688e-06,
+      "loss": 0.6556,
+      "step": 1605
+    },
+    {
+      "epoch": 1.521914238332149,
+      "grad_norm": 1.2817723869505593,
+      "learning_rate": 1.4239858643037753e-06,
+      "loss": 0.714,
+      "step": 1606
+    },
+    {
+      "epoch": 1.521914238332149,
+      "eval_loss": 0.917027473449707,
+      "eval_runtime": 64.2379,
+      "eval_samples_per_second": 42.467,
+      "eval_steps_per_second": 0.669,
+      "step": 1606
+    },
+    {
+      "epoch": 1.5228618810708363,
+      "grad_norm": 0.8577851406821353,
+      "learning_rate": 1.4186242159518477e-06,
+      "loss": 0.7231,
+      "step": 1607
+    },
+    {
+      "epoch": 1.5238095238095237,
+      "grad_norm": 1.3693963934030193,
+      "learning_rate": 1.4132710114068427e-06,
+      "loss": 0.7009,
+      "step": 1608
+    },
+    {
+      "epoch": 1.5247571665482114,
+      "grad_norm": 1.0677528890480996,
+      "learning_rate": 1.4079262632900048e-06,
+      "loss": 0.7038,
+      "step": 1609
+    },
+    {
+      "epoch": 1.525704809286899,
+      "grad_norm": 1.0765183801869453,
+      "learning_rate": 1.4025899842026442e-06,
+      "loss": 0.6736,
+      "step": 1610
+    },
+    {
+      "epoch": 1.5266524520255862,
+      "grad_norm": 0.913748130871297,
+      "learning_rate": 1.3972621867261e-06,
+      "loss": 0.7614,
+      "step": 1611
+    },
+    {
+      "epoch": 1.5276000947642738,
+      "grad_norm": 0.9806034371545538,
+      "learning_rate": 1.3919428834217163e-06,
+      "loss": 0.7362,
+      "step": 1612
+    },
+    {
+      "epoch": 1.5285477375029615,
+      "grad_norm": 0.9176939063393724,
+      "learning_rate": 1.3866320868308137e-06,
+      "loss": 0.7242,
+      "step": 1613
+    },
+    {
+      "epoch": 1.5294953802416489,
+      "grad_norm": 1.078657631575495,
+      "learning_rate": 1.3813298094746491e-06,
+      "loss": 0.7036,
+      "step": 1614
+    },
+    {
+      "epoch": 1.5304430229803363,
+      "grad_norm": 0.9600837682866002,
+      "learning_rate": 1.3760360638544012e-06,
+      "loss": 0.6084,
+      "step": 1615
+    },
+    {
+      "epoch": 1.531390665719024,
+      "grad_norm": 0.9717695630583644,
+      "learning_rate": 1.3707508624511263e-06,
+      "loss": 0.7243,
+      "step": 1616
+    },
+    {
+      "epoch": 1.5323383084577116,
+      "grad_norm": 1.00289902715343,
+      "learning_rate": 1.3654742177257436e-06,
+      "loss": 0.7266,
+      "step": 1617
+    },
+    {
+      "epoch": 1.533285951196399,
+      "grad_norm": 0.9836974323677292,
+      "learning_rate": 1.3602061421189899e-06,
+      "loss": 0.6669,
+      "step": 1618
+    },
+    {
+      "epoch": 1.5342335939350864,
+      "grad_norm": 0.9697616210673388,
+      "learning_rate": 1.3549466480514079e-06,
+      "loss": 0.6768,
+      "step": 1619
+    },
+    {
+      "epoch": 1.535181236673774,
+      "grad_norm": 1.0209728170363237,
+      "learning_rate": 1.349695747923298e-06,
+      "loss": 0.68,
+      "step": 1620
+    },
+    {
+      "epoch": 1.5361288794124615,
+      "grad_norm": 1.1317276468030417,
+      "learning_rate": 1.3444534541147058e-06,
+      "loss": 0.6391,
+      "step": 1621
+    },
+    {
+      "epoch": 1.5370765221511489,
+      "grad_norm": 0.9314842383246464,
+      "learning_rate": 1.339219778985385e-06,
+      "loss": 0.7117,
+      "step": 1622
+    },
+    {
+      "epoch": 1.5380241648898365,
+      "grad_norm": 1.1323771307347132,
+      "learning_rate": 1.3339947348747633e-06,
+      "loss": 0.7511,
+      "step": 1623
+    },
+    {
+      "epoch": 1.5389718076285241,
+      "grad_norm": 1.0881241560945962,
+      "learning_rate": 1.3287783341019278e-06,
+      "loss": 0.6818,
+      "step": 1624
+    },
+    {
+      "epoch": 1.5399194503672116,
+      "grad_norm": 1.0586689056144305,
+      "learning_rate": 1.3235705889655781e-06,
+      "loss": 0.7126,
+      "step": 1625
+    },
+    {
+      "epoch": 1.540867093105899,
+      "grad_norm": 0.9786185399753199,
+      "learning_rate": 1.3183715117440143e-06,
+      "loss": 0.704,
+      "step": 1626
+    },
+    {
+      "epoch": 1.5418147358445866,
+      "grad_norm": 1.003277545254164,
+      "learning_rate": 1.3131811146950946e-06,
+      "loss": 0.7513,
+      "step": 1627
+    },
+    {
+      "epoch": 1.5427623785832743,
+      "grad_norm": 0.9826596333095564,
+      "learning_rate": 1.307999410056216e-06,
+      "loss": 0.7253,
+      "step": 1628
+    },
+    {
+      "epoch": 1.5427623785832743,
+      "eval_loss": 0.9158708453178406,
+      "eval_runtime": 63.8817,
+      "eval_samples_per_second": 42.704,
+      "eval_steps_per_second": 0.673,
+      "step": 1628
+    },
+    {
+      "epoch": 1.5437100213219617,
+      "grad_norm": 1.0364913585561064,
+      "learning_rate": 1.3028264100442773e-06,
+      "loss": 0.7177,
+      "step": 1629
+    },
+    {
+      "epoch": 1.544657664060649,
+      "grad_norm": 1.0314613393589664,
+      "learning_rate": 1.2976621268556571e-06,
+      "loss": 0.6822,
+      "step": 1630
+    },
+    {
+      "epoch": 1.5456053067993367,
+      "grad_norm": 1.15231691952742,
+      "learning_rate": 1.2925065726661845e-06,
+      "loss": 0.6954,
+      "step": 1631
+    },
+    {
+      "epoch": 1.5465529495380241,
+      "grad_norm": 1.130048978044205,
+      "learning_rate": 1.2873597596311026e-06,
+      "loss": 0.6895,
+      "step": 1632
+    },
+    {
+      "epoch": 1.5475005922767116,
+      "grad_norm": 0.9910577741606257,
+      "learning_rate": 1.2822216998850506e-06,
+      "loss": 0.7672,
+      "step": 1633
+    },
+    {
+      "epoch": 1.5484482350153992,
+      "grad_norm": 1.1500833811393358,
+      "learning_rate": 1.2770924055420258e-06,
+      "loss": 0.6813,
+      "step": 1634
+    },
+    {
+      "epoch": 1.5493958777540868,
+      "grad_norm": 1.0195004139070214,
+      "learning_rate": 1.2719718886953647e-06,
+      "loss": 0.6438,
+      "step": 1635
+    },
+    {
+      "epoch": 1.5503435204927742,
+      "grad_norm": 1.0569551060744469,
+      "learning_rate": 1.2668601614177017e-06,
+      "loss": 0.678,
+      "step": 1636
+    },
+    {
+      "epoch": 1.5512911632314617,
+      "grad_norm": 1.043875396257435,
+      "learning_rate": 1.2617572357609565e-06,
+      "loss": 0.7044,
+      "step": 1637
+    },
+    {
+      "epoch": 1.5522388059701493,
+      "grad_norm": 3.1818927893039124,
+      "learning_rate": 1.2566631237562894e-06,
+      "loss": 0.682,
+      "step": 1638
+    },
+    {
+      "epoch": 1.553186448708837,
+      "grad_norm": 0.9283273042702088,
+      "learning_rate": 1.2515778374140858e-06,
+      "loss": 0.688,
+      "step": 1639
+    },
+    {
+      "epoch": 1.5541340914475241,
+      "grad_norm": 0.9528198487095788,
+      "learning_rate": 1.246501388723923e-06,
+      "loss": 0.7322,
+      "step": 1640
+    },
+    {
+      "epoch": 1.5550817341862118,
+      "grad_norm": 0.9973994825972451,
+      "learning_rate": 1.2414337896545375e-06,
+      "loss": 0.666,
+      "step": 1641
+    },
+    {
+      "epoch": 1.5560293769248994,
+      "grad_norm": 0.9902699851910854,
+      "learning_rate": 1.2363750521538064e-06,
+      "loss": 0.6851,
+      "step": 1642
+    },
+    {
+      "epoch": 1.5569770196635868,
+      "grad_norm": 0.9650904944333506,
+      "learning_rate": 1.2313251881487081e-06,
+      "loss": 0.6672,
+      "step": 1643
+    },
+    {
+      "epoch": 1.5579246624022742,
+      "grad_norm": 1.0589094342875154,
+      "learning_rate": 1.2262842095453065e-06,
+      "loss": 0.7416,
+      "step": 1644
+    },
+    {
+      "epoch": 1.5588723051409619,
+      "grad_norm": 0.9386856438191878,
+      "learning_rate": 1.2212521282287093e-06,
+      "loss": 0.6483,
+      "step": 1645
+    },
+    {
+      "epoch": 1.5598199478796495,
+      "grad_norm": 1.0392724309573975,
+      "learning_rate": 1.2162289560630524e-06,
+      "loss": 0.647,
+      "step": 1646
+    },
+    {
+      "epoch": 1.560767590618337,
+      "grad_norm": 0.9961962290639487,
+      "learning_rate": 1.211214704891467e-06,
+      "loss": 0.6501,
+      "step": 1647
+    },
+    {
+      "epoch": 1.5617152333570243,
+      "grad_norm": 0.9147134754595742,
+      "learning_rate": 1.2062093865360458e-06,
+      "loss": 0.6753,
+      "step": 1648
+    },
+    {
+      "epoch": 1.562662876095712,
+      "grad_norm": 1.1879710589928145,
+      "learning_rate": 1.2012130127978267e-06,
+      "loss": 0.7233,
+      "step": 1649
+    },
+    {
+      "epoch": 1.5636105188343994,
+      "grad_norm": 0.931640687866233,
+      "learning_rate": 1.1962255954567537e-06,
+      "loss": 0.6783,
+      "step": 1650
+    },
+    {
+      "epoch": 1.5636105188343994,
+      "eval_loss": 0.9168549180030823,
+      "eval_runtime": 67.249,
+      "eval_samples_per_second": 40.566,
+      "eval_steps_per_second": 0.639,
+      "step": 1650
+    },
+    {
+      "epoch": 1.5645581615730868,
+      "grad_norm": 1.0551992767048257,
+      "learning_rate": 1.1912471462716596e-06,
+      "loss": 0.7034,
+      "step": 1651
+    },
+    {
+      "epoch": 1.5655058043117744,
+      "grad_norm": 1.0948003208344899,
+      "learning_rate": 1.1862776769802275e-06,
+      "loss": 0.7325,
+      "step": 1652
+    },
+    {
+      "epoch": 1.566453447050462,
+      "grad_norm": 0.9671073218744537,
+      "learning_rate": 1.181317199298974e-06,
+      "loss": 0.6658,
+      "step": 1653
+    },
+    {
+      "epoch": 1.5674010897891495,
+      "grad_norm": 1.0286649464862914,
+      "learning_rate": 1.1763657249232107e-06,
+      "loss": 0.696,
+      "step": 1654
+    },
+    {
+      "epoch": 1.568348732527837,
+      "grad_norm": 1.0682369368320455,
+      "learning_rate": 1.1714232655270264e-06,
+      "loss": 0.6833,
+      "step": 1655
+    },
+    {
+      "epoch": 1.5692963752665245,
+      "grad_norm": 0.9350398037136525,
+      "learning_rate": 1.1664898327632552e-06,
+      "loss": 0.6133,
+      "step": 1656
+    },
+    {
+      "epoch": 1.5702440180052122,
+      "grad_norm": 1.0464212808521496,
+      "learning_rate": 1.1615654382634444e-06,
+      "loss": 0.6935,
+      "step": 1657
+    },
+    {
+      "epoch": 1.5711916607438996,
+      "grad_norm": 1.1421842706854686,
+      "learning_rate": 1.1566500936378389e-06,
+      "loss": 0.6562,
+      "step": 1658
+    },
+    {
+      "epoch": 1.572139303482587,
+      "grad_norm": 1.0126937824702735,
+      "learning_rate": 1.1517438104753386e-06,
+      "loss": 0.7224,
+      "step": 1659
+    },
+    {
+      "epoch": 1.5730869462212747,
+      "grad_norm": 1.033911213832933,
+      "learning_rate": 1.146846600343488e-06,
+      "loss": 0.7106,
+      "step": 1660
+    },
+    {
+      "epoch": 1.574034588959962,
+      "grad_norm": 1.1179259174465113,
+      "learning_rate": 1.1419584747884322e-06,
+      "loss": 0.6983,
+      "step": 1661
+    },
+    {
+      "epoch": 1.5749822316986495,
+      "grad_norm": 1.1682227334057596,
+      "learning_rate": 1.1370794453349039e-06,
+      "loss": 0.7165,
+      "step": 1662
+    },
+    {
+      "epoch": 1.5759298744373371,
+      "grad_norm": 0.977082622032907,
+      "learning_rate": 1.132209523486184e-06,
+      "loss": 0.6902,
+      "step": 1663
+    },
+    {
+      "epoch": 1.5768775171760248,
+      "grad_norm": 1.023609290507064,
+      "learning_rate": 1.1273487207240845e-06,
+      "loss": 0.6784,
+      "step": 1664
+    },
+    {
+      "epoch": 1.5778251599147122,
+      "grad_norm": 1.0160057466393255,
+      "learning_rate": 1.1224970485089193e-06,
+      "loss": 0.6993,
+      "step": 1665
+    },
+    {
+      "epoch": 1.5787728026533996,
+      "grad_norm": 0.9873881250473061,
+      "learning_rate": 1.1176545182794674e-06,
+      "loss": 0.7175,
+      "step": 1666
+    },
+    {
+      "epoch": 1.5797204453920872,
+      "grad_norm": 1.3197902151472374,
+      "learning_rate": 1.1128211414529626e-06,
+      "loss": 0.6993,
+      "step": 1667
+    },
+    {
+      "epoch": 1.5806680881307749,
+      "grad_norm": 1.3071909280159977,
+      "learning_rate": 1.1079969294250515e-06,
+      "loss": 0.7093,
+      "step": 1668
+    },
+    {
+      "epoch": 1.581615730869462,
+      "grad_norm": 1.0541564785946917,
+      "learning_rate": 1.1031818935697763e-06,
+      "loss": 0.7186,
+      "step": 1669
+    },
+    {
+      "epoch": 1.5825633736081497,
+      "grad_norm": 0.9430987398425215,
+      "learning_rate": 1.0983760452395415e-06,
+      "loss": 0.6589,
+      "step": 1670
+    },
+    {
+      "epoch": 1.5835110163468373,
+      "grad_norm": 0.990119702459614,
+      "learning_rate": 1.0935793957650947e-06,
+      "loss": 0.6329,
+      "step": 1671
+    },
+    {
+      "epoch": 1.5844586590855247,
+      "grad_norm": 1.0566708748575848,
+      "learning_rate": 1.0887919564554893e-06,
+      "loss": 0.7004,
+      "step": 1672
+    },
+    {
+      "epoch": 1.5844586590855247,
+      "eval_loss": 0.91568523645401,
+      "eval_runtime": 65.9386,
+      "eval_samples_per_second": 41.372,
+      "eval_steps_per_second": 0.652,
+      "step": 1672
+    },
+    {
+      "epoch": 1.5854063018242122,
+      "grad_norm": 0.9610068854225464,
+      "learning_rate": 1.0840137385980698e-06,
+      "loss": 0.6791,
+      "step": 1673
+    },
+    {
+      "epoch": 1.5863539445628998,
+      "grad_norm": 1.0544927310701904,
+      "learning_rate": 1.079244753458437e-06,
+      "loss": 0.6809,
+      "step": 1674
+    },
+    {
+      "epoch": 1.5873015873015874,
+      "grad_norm": 0.9544448982368492,
+      "learning_rate": 1.0744850122804218e-06,
+      "loss": 0.6979,
+      "step": 1675
+    },
+    {
+      "epoch": 1.5882492300402749,
+      "grad_norm": 1.2640977900333505,
+      "learning_rate": 1.0697345262860638e-06,
+      "loss": 0.6599,
+      "step": 1676
+    },
+    {
+      "epoch": 1.5891968727789623,
+      "grad_norm": 1.0083657972577804,
+      "learning_rate": 1.064993306675578e-06,
+      "loss": 0.6699,
+      "step": 1677
+    },
+    {
+      "epoch": 1.59014451551765,
+      "grad_norm": 1.0076578894558765,
+      "learning_rate": 1.0602613646273374e-06,
+      "loss": 0.6576,
+      "step": 1678
+    },
+    {
+      "epoch": 1.5910921582563373,
+      "grad_norm": 1.0296009587033885,
+      "learning_rate": 1.055538711297835e-06,
+      "loss": 0.6642,
+      "step": 1679
+    },
+    {
+      "epoch": 1.5920398009950247,
+      "grad_norm": 1.031189178917333,
+      "learning_rate": 1.0508253578216693e-06,
+      "loss": 0.5869,
+      "step": 1680
+    },
+    {
+      "epoch": 1.5929874437337124,
+      "grad_norm": 1.0167017867708008,
+      "learning_rate": 1.046121315311508e-06,
+      "loss": 0.699,
+      "step": 1681
+    },
+    {
+      "epoch": 1.5939350864724,
+      "grad_norm": 1.7410568540334943,
+      "learning_rate": 1.0414265948580694e-06,
+      "loss": 0.7248,
+      "step": 1682
+    },
+    {
+      "epoch": 1.5948827292110874,
+      "grad_norm": 1.0198403446768327,
+      "learning_rate": 1.0367412075300942e-06,
+      "loss": 0.7163,
+      "step": 1683
+    },
+    {
+      "epoch": 1.5958303719497748,
+      "grad_norm": 0.9447519143939068,
+      "learning_rate": 1.0320651643743128e-06,
+      "loss": 0.6455,
+      "step": 1684
+    },
+    {
+      "epoch": 1.5967780146884625,
+      "grad_norm": 0.9675157899047281,
+      "learning_rate": 1.0273984764154327e-06,
+      "loss": 0.6627,
+      "step": 1685
+    },
+    {
+      "epoch": 1.5977256574271501,
+      "grad_norm": 1.099687430563817,
+      "learning_rate": 1.0227411546560962e-06,
+      "loss": 0.6868,
+      "step": 1686
+    },
+    {
+      "epoch": 1.5986733001658375,
+      "grad_norm": 1.0425813440851017,
+      "learning_rate": 1.0180932100768714e-06,
+      "loss": 0.7263,
+      "step": 1687
+    },
+    {
+      "epoch": 1.599620942904525,
+      "grad_norm": 1.0467623766365717,
+      "learning_rate": 1.0134546536362099e-06,
+      "loss": 0.7087,
+      "step": 1688
+    },
+    {
+      "epoch": 1.6005685856432126,
+      "grad_norm": 1.1080721212745201,
+      "learning_rate": 1.008825496270434e-06,
+      "loss": 0.708,
+      "step": 1689
+    },
+    {
+      "epoch": 1.6015162283819,
+      "grad_norm": 1.18766161221473,
+      "learning_rate": 1.0042057488937067e-06,
+      "loss": 0.6998,
+      "step": 1690
+    },
+    {
+      "epoch": 1.6024638711205874,
+      "grad_norm": 1.0510182256034266,
+      "learning_rate": 9.995954223979992e-07,
+      "loss": 0.6989,
+      "step": 1691
+    },
+    {
+      "epoch": 1.603411513859275,
+      "grad_norm": 1.3860613450249764,
+      "learning_rate": 9.949945276530782e-07,
+      "loss": 0.7097,
+      "step": 1692
+    },
+    {
+      "epoch": 1.6043591565979627,
+      "grad_norm": 1.2179486037581968,
+      "learning_rate": 9.904030755064659e-07,
+      "loss": 0.6978,
+      "step": 1693
+    },
+    {
+      "epoch": 1.60530679933665,
+      "grad_norm": 0.964495472412476,
+      "learning_rate": 9.858210767834292e-07,
+      "loss": 0.6589,
+      "step": 1694
+    },
+    {
+      "epoch": 1.60530679933665,
+      "eval_loss": 0.9162237644195557,
+      "eval_runtime": 66.3491,
+      "eval_samples_per_second": 41.116,
+      "eval_steps_per_second": 0.648,
+      "step": 1694
+    },
+    {
+      "epoch": 1.6062544420753375,
+      "grad_norm": 0.9604758470801715,
+      "learning_rate": 9.8124854228694e-07,
+      "loss": 0.6931,
+      "step": 1695
+    },
+    {
+      "epoch": 1.6072020848140252,
+      "grad_norm": 1.1003612306707224,
+      "learning_rate": 9.76685482797662e-07,
+      "loss": 0.7113,
+      "step": 1696
+    },
+    {
+      "epoch": 1.6081497275527128,
+      "grad_norm": 1.0752314491639252,
+      "learning_rate": 9.72131909073914e-07,
+      "loss": 0.6719,
+      "step": 1697
+    },
+    {
+      "epoch": 1.6090973702914,
+      "grad_norm": 1.0414996201454678,
+      "learning_rate": 9.675878318516546e-07,
+      "loss": 0.7659,
+      "step": 1698
+    },
+    {
+      "epoch": 1.6100450130300876,
+      "grad_norm": 1.1239598855506638,
+      "learning_rate": 9.630532618444532e-07,
+      "loss": 0.6927,
+      "step": 1699
+    },
+    {
+      "epoch": 1.6109926557687753,
+      "grad_norm": 1.0046753329350298,
+      "learning_rate": 9.58528209743459e-07,
+      "loss": 0.7055,
+      "step": 1700
+    },
+    {
+      "epoch": 1.6119402985074627,
+      "grad_norm": 1.0189187609725145,
+      "learning_rate": 9.540126862173865e-07,
+      "loss": 0.7139,
+      "step": 1701
+    },
+    {
+      "epoch": 1.61288794124615,
+      "grad_norm": 1.719157547437819,
+      "learning_rate": 9.495067019124793e-07,
+      "loss": 0.7117,
+      "step": 1702
+    },
+    {
+      "epoch": 1.6138355839848377,
+      "grad_norm": 1.01602314797504,
+      "learning_rate": 9.450102674524952e-07,
+      "loss": 0.7244,
+      "step": 1703
+    },
+    {
+      "epoch": 1.6147832267235254,
+      "grad_norm": 1.0640275443844036,
+      "learning_rate": 9.405233934386726e-07,
+      "loss": 0.6851,
+      "step": 1704
+    },
+    {
+      "epoch": 1.6157308694622128,
+      "grad_norm": 0.9979744262453907,
+      "learning_rate": 9.360460904497132e-07,
+      "loss": 0.712,
+      "step": 1705
+    },
+    {
+      "epoch": 1.6166785122009002,
+      "grad_norm": 0.9521292764998414,
+      "learning_rate": 9.315783690417479e-07,
+      "loss": 0.6478,
+      "step": 1706
+    },
+    {
+      "epoch": 1.6176261549395878,
+      "grad_norm": 1.0329943572255877,
+      "learning_rate": 9.271202397483214e-07,
+      "loss": 0.692,
+      "step": 1707
+    },
+    {
+      "epoch": 1.6185737976782753,
+      "grad_norm": 0.9688251186485236,
+      "learning_rate": 9.226717130803636e-07,
+      "loss": 0.7099,
+      "step": 1708
+    },
+    {
+      "epoch": 1.6195214404169627,
+      "grad_norm": 1.0095855927099802,
+      "learning_rate": 9.182327995261592e-07,
+      "loss": 0.6799,
+      "step": 1709
+    },
+    {
+      "epoch": 1.6204690831556503,
+      "grad_norm": 0.9336916292986674,
+      "learning_rate": 9.138035095513337e-07,
+      "loss": 0.7118,
+      "step": 1710
+    },
+    {
+      "epoch": 1.621416725894338,
+      "grad_norm": 0.9262678528281649,
+      "learning_rate": 9.093838535988181e-07,
+      "loss": 0.7048,
+      "step": 1711
+    },
+    {
+      "epoch": 1.6223643686330254,
+      "grad_norm": 1.010842317395484,
+      "learning_rate": 9.049738420888349e-07,
+      "loss": 0.6302,
+      "step": 1712
+    },
+    {
+      "epoch": 1.6233120113717128,
+      "grad_norm": 1.0054077234262655,
+      "learning_rate": 9.005734854188625e-07,
+      "loss": 0.7457,
+      "step": 1713
+    },
+    {
+      "epoch": 1.6242596541104004,
+      "grad_norm": 1.1774522725658818,
+      "learning_rate": 8.961827939636198e-07,
+      "loss": 0.6728,
+      "step": 1714
+    },
+    {
+      "epoch": 1.625207296849088,
+      "grad_norm": 1.0498467807029983,
+      "learning_rate": 8.918017780750349e-07,
+      "loss": 0.7334,
+      "step": 1715
+    },
+    {
+      "epoch": 1.6261549395877755,
+      "grad_norm": 0.9901280155483644,
+      "learning_rate": 8.874304480822271e-07,
+      "loss": 0.7517,
+      "step": 1716
+    },
+    {
+      "epoch": 1.6261549395877755,
+      "eval_loss": 0.9154621958732605,
+      "eval_runtime": 65.8149,
+      "eval_samples_per_second": 41.45,
+      "eval_steps_per_second": 0.653,
+      "step": 1716
+    },
+    {
+      "epoch": 1.6271025823264629,
+      "grad_norm": 1.1089800038542108,
+      "learning_rate": 8.830688142914783e-07,
+      "loss": 0.6657,
+      "step": 1717
+    },
+    {
+      "epoch": 1.6280502250651505,
+      "grad_norm": 1.009309929039825,
+      "learning_rate": 8.787168869862067e-07,
+      "loss": 0.6259,
+      "step": 1718
+    },
+    {
+      "epoch": 1.628997867803838,
+      "grad_norm": 0.9117346360474112,
+      "learning_rate": 8.743746764269512e-07,
+      "loss": 0.6988,
+      "step": 1719
+    },
+    {
+      "epoch": 1.6299455105425253,
+      "grad_norm": 2.22961599294848,
+      "learning_rate": 8.700421928513353e-07,
+      "loss": 0.653,
+      "step": 1720
+    },
+    {
+      "epoch": 1.630893153281213,
+      "grad_norm": 0.9850710565653991,
+      "learning_rate": 8.657194464740542e-07,
+      "loss": 0.737,
+      "step": 1721
+    },
+    {
+      "epoch": 1.6318407960199006,
+      "grad_norm": 1.0081901085844467,
+      "learning_rate": 8.614064474868423e-07,
+      "loss": 0.6789,
+      "step": 1722
+    },
+    {
+      "epoch": 1.632788438758588,
+      "grad_norm": 1.0631665749702537,
+      "learning_rate": 8.571032060584555e-07,
+      "loss": 0.7087,
+      "step": 1723
+    },
+    {
+      "epoch": 1.6337360814972754,
+      "grad_norm": 1.0118721550869085,
+      "learning_rate": 8.528097323346408e-07,
+      "loss": 0.6821,
+      "step": 1724
+    },
+    {
+      "epoch": 1.634683724235963,
+      "grad_norm": 1.274936977561588,
+      "learning_rate": 8.485260364381187e-07,
+      "loss": 0.6716,
+      "step": 1725
+    },
+    {
+      "epoch": 1.6356313669746505,
+      "grad_norm": 1.0982132650644516,
+      "learning_rate": 8.442521284685573e-07,
+      "loss": 0.6765,
+      "step": 1726
+    },
+    {
+      "epoch": 1.636579009713338,
+      "grad_norm": 1.1367575465419841,
+      "learning_rate": 8.399880185025439e-07,
+      "loss": 0.6864,
+      "step": 1727
+    },
+    {
+      "epoch": 1.6375266524520256,
+      "grad_norm": 0.9279424119252897,
+      "learning_rate": 8.357337165935675e-07,
+      "loss": 0.7321,
+      "step": 1728
+    },
+    {
+      "epoch": 1.6384742951907132,
+      "grad_norm": 1.0676158883165283,
+      "learning_rate": 8.314892327719937e-07,
+      "loss": 0.7418,
+      "step": 1729
+    },
+    {
+      "epoch": 1.6394219379294006,
+      "grad_norm": 1.014952809890761,
+      "learning_rate": 8.27254577045039e-07,
+      "loss": 0.7356,
+      "step": 1730
+    },
+    {
+      "epoch": 1.640369580668088,
+      "grad_norm": 1.0553676092641795,
+      "learning_rate": 8.230297593967463e-07,
+      "loss": 0.6572,
+      "step": 1731
+    },
+    {
+      "epoch": 1.6413172234067757,
+      "grad_norm": 1.0596022428705136,
+      "learning_rate": 8.188147897879667e-07,
+      "loss": 0.6834,
+      "step": 1732
+    },
+    {
+      "epoch": 1.6422648661454633,
+      "grad_norm": 1.0760233096652914,
+      "learning_rate": 8.146096781563284e-07,
+      "loss": 0.6732,
+      "step": 1733
+    },
+    {
+      "epoch": 1.6432125088841507,
+      "grad_norm": 1.0435516775906648,
+      "learning_rate": 8.104144344162229e-07,
+      "loss": 0.7147,
+      "step": 1734
+    },
+    {
+      "epoch": 1.6441601516228381,
+      "grad_norm": 0.9126453263477479,
+      "learning_rate": 8.062290684587698e-07,
+      "loss": 0.7066,
+      "step": 1735
+    },
+    {
+      "epoch": 1.6451077943615258,
+      "grad_norm": 1.136390471558309,
+      "learning_rate": 8.02053590151805e-07,
+      "loss": 0.675,
+      "step": 1736
+    },
+    {
+      "epoch": 1.6460554371002132,
+      "grad_norm": 0.9443277333193851,
+      "learning_rate": 7.978880093398517e-07,
+      "loss": 0.6556,
+      "step": 1737
+    },
+    {
+      "epoch": 1.6470030798389006,
+      "grad_norm": 0.9782892174224657,
+      "learning_rate": 7.937323358440935e-07,
+      "loss": 0.6771,
+      "step": 1738
+    },
+    {
+      "epoch": 1.6470030798389006,
+      "eval_loss": 0.9155307412147522,
+      "eval_runtime": 69.7327,
+      "eval_samples_per_second": 39.121,
+      "eval_steps_per_second": 0.617,
+      "step": 1738
+    },
+    {
+      "epoch": 1.6479507225775882,
+      "grad_norm": 0.9461043845473808,
+      "learning_rate": 7.89586579462362e-07,
+      "loss": 0.6145,
+      "step": 1739
+    },
+    {
+      "epoch": 1.6488983653162759,
+      "grad_norm": 1.0760119198442148,
+      "learning_rate": 7.854507499691006e-07,
+      "loss": 0.6764,
+      "step": 1740
+    },
+    {
+      "epoch": 1.6498460080549633,
+      "grad_norm": 1.047284386935676,
+      "learning_rate": 7.813248571153542e-07,
+      "loss": 0.7229,
+      "step": 1741
+    },
+    {
+      "epoch": 1.6507936507936507,
+      "grad_norm": 1.0394813864374197,
+      "learning_rate": 7.772089106287345e-07,
+      "loss": 0.7326,
+      "step": 1742
+    },
+    {
+      "epoch": 1.6517412935323383,
+      "grad_norm": 1.045028572772028,
+      "learning_rate": 7.731029202134077e-07,
+      "loss": 0.7167,
+      "step": 1743
+    },
+    {
+      "epoch": 1.652688936271026,
+      "grad_norm": 1.006834434991697,
+      "learning_rate": 7.690068955500623e-07,
+      "loss": 0.705,
+      "step": 1744
+    },
+    {
+      "epoch": 1.6536365790097134,
+      "grad_norm": 1.0207105597171535,
+      "learning_rate": 7.649208462958935e-07,
+      "loss": 0.7293,
+      "step": 1745
+    },
+    {
+      "epoch": 1.6545842217484008,
+      "grad_norm": 0.9666760369440525,
+      "learning_rate": 7.608447820845771e-07,
+      "loss": 0.6882,
+      "step": 1746
+    },
+    {
+      "epoch": 1.6555318644870884,
+      "grad_norm": 0.983892799303487,
+      "learning_rate": 7.567787125262449e-07,
+      "loss": 0.6787,
+      "step": 1747
+    },
+    {
+      "epoch": 1.6564795072257759,
+      "grad_norm": 1.0469953263447322,
+      "learning_rate": 7.527226472074678e-07,
+      "loss": 0.7717,
+      "step": 1748
+    },
+    {
+      "epoch": 1.6574271499644633,
+      "grad_norm": 0.9326611142849608,
+      "learning_rate": 7.486765956912261e-07,
+      "loss": 0.6829,
+      "step": 1749
+    },
+    {
+      "epoch": 1.658374792703151,
+      "grad_norm": 0.9698239262438025,
+      "learning_rate": 7.446405675168938e-07,
+      "loss": 0.6417,
+      "step": 1750
+    },
+    {
+      "epoch": 1.6593224354418386,
+      "grad_norm": 1.0388054521896983,
+      "learning_rate": 7.406145722002101e-07,
+      "loss": 0.661,
+      "step": 1751
+    },
+    {
+      "epoch": 1.660270078180526,
+      "grad_norm": 1.0284488705513752,
+      "learning_rate": 7.365986192332624e-07,
+      "loss": 0.6885,
+      "step": 1752
+    },
+    {
+      "epoch": 1.6612177209192134,
+      "grad_norm": 1.0447857805111578,
+      "learning_rate": 7.325927180844589e-07,
+      "loss": 0.754,
+      "step": 1753
+    },
+    {
+      "epoch": 1.662165363657901,
+      "grad_norm": 1.0316358381680006,
+      "learning_rate": 7.285968781985093e-07,
+      "loss": 0.7376,
+      "step": 1754
+    },
+    {
+      "epoch": 1.6631130063965884,
+      "grad_norm": 0.9473765886693883,
+      "learning_rate": 7.246111089964042e-07,
+      "loss": 0.7222,
+      "step": 1755
+    },
+    {
+      "epoch": 1.6640606491352758,
+      "grad_norm": 1.0459905890378751,
+      "learning_rate": 7.206354198753862e-07,
+      "loss": 0.7092,
+      "step": 1756
+    },
+    {
+      "epoch": 1.6650082918739635,
+      "grad_norm": 1.1687326620774066,
+      "learning_rate": 7.166698202089367e-07,
+      "loss": 0.6543,
+      "step": 1757
+    },
+    {
+      "epoch": 1.6659559346126511,
+      "grad_norm": 0.9534227603353764,
+      "learning_rate": 7.127143193467445e-07,
+      "loss": 0.6816,
+      "step": 1758
+    },
+    {
+      "epoch": 1.6669035773513385,
+      "grad_norm": 1.6473488276372368,
+      "learning_rate": 7.087689266146935e-07,
+      "loss": 0.609,
+      "step": 1759
+    },
+    {
+      "epoch": 1.667851220090026,
+      "grad_norm": 1.0168119921182184,
+      "learning_rate": 7.048336513148307e-07,
+      "loss": 0.7228,
+      "step": 1760
+    },
+    {
+      "epoch": 1.667851220090026,
+      "eval_loss": 0.9152177572250366,
+      "eval_runtime": 64.5647,
+      "eval_samples_per_second": 42.252,
+      "eval_steps_per_second": 0.666,
+      "step": 1760
+    },
+    {
+      "epoch": 1.6687988628287136,
+      "grad_norm": 1.091512400561692,
+      "learning_rate": 7.009085027253543e-07,
+      "loss": 0.7229,
+      "step": 1761
+    },
+    {
+      "epoch": 1.6697465055674012,
+      "grad_norm": 0.9169936965491188,
+      "learning_rate": 6.969934901005809e-07,
+      "loss": 0.6622,
+      "step": 1762
+    },
+    {
+      "epoch": 1.6706941483060886,
+      "grad_norm": 0.9220318537884853,
+      "learning_rate": 6.930886226709344e-07,
+      "loss": 0.6763,
+      "step": 1763
+    },
+    {
+      "epoch": 1.671641791044776,
+      "grad_norm": 1.4840032839975184,
+      "learning_rate": 6.89193909642919e-07,
+      "loss": 0.7216,
+      "step": 1764
+    },
+    {
+      "epoch": 1.6725894337834637,
+      "grad_norm": 1.0184604456491309,
+      "learning_rate": 6.853093601990946e-07,
+      "loss": 0.7152,
+      "step": 1765
+    },
+    {
+      "epoch": 1.6735370765221511,
+      "grad_norm": 0.9526356849525173,
+      "learning_rate": 6.814349834980622e-07,
+      "loss": 0.6673,
+      "step": 1766
+    },
+    {
+      "epoch": 1.6744847192608385,
+      "grad_norm": 1.0461521335211754,
+      "learning_rate": 6.775707886744343e-07,
+      "loss": 0.7344,
+      "step": 1767
+    },
+    {
+      "epoch": 1.6754323619995262,
+      "grad_norm": 0.9731694459589282,
+      "learning_rate": 6.737167848388227e-07,
+      "loss": 0.6401,
+      "step": 1768
+    },
+    {
+      "epoch": 1.6763800047382138,
+      "grad_norm": 1.033021359037975,
+      "learning_rate": 6.698729810778065e-07,
+      "loss": 0.6726,
+      "step": 1769
+    },
+    {
+      "epoch": 1.6773276474769012,
+      "grad_norm": 1.0233932401805885,
+      "learning_rate": 6.660393864539222e-07,
+      "loss": 0.746,
+      "step": 1770
+    },
+    {
+      "epoch": 1.6782752902155886,
+      "grad_norm": 1.1404272415247523,
+      "learning_rate": 6.622160100056296e-07,
+      "loss": 0.7257,
+      "step": 1771
+    },
+    {
+      "epoch": 1.6792229329542763,
+      "grad_norm": 0.9500280589832726,
+      "learning_rate": 6.584028607473019e-07,
+      "loss": 0.6845,
+      "step": 1772
+    },
+    {
+      "epoch": 1.680170575692964,
+      "grad_norm": 1.1155349368000151,
+      "learning_rate": 6.545999476691994e-07,
+      "loss": 0.7388,
+      "step": 1773
+    },
+    {
+      "epoch": 1.681118218431651,
+      "grad_norm": 1.0605447278963818,
+      "learning_rate": 6.508072797374454e-07,
+      "loss": 0.7103,
+      "step": 1774
+    },
+    {
+      "epoch": 1.6820658611703387,
+      "grad_norm": 1.2288918607070451,
+      "learning_rate": 6.470248658940115e-07,
+      "loss": 0.7631,
+      "step": 1775
+    },
+    {
+      "epoch": 1.6830135039090264,
+      "grad_norm": 1.5101120147098823,
+      "learning_rate": 6.432527150566903e-07,
+      "loss": 0.6687,
+      "step": 1776
+    },
+    {
+      "epoch": 1.6839611466477138,
+      "grad_norm": 1.0065803962933693,
+      "learning_rate": 6.394908361190804e-07,
+      "loss": 0.6794,
+      "step": 1777
+    },
+    {
+      "epoch": 1.6849087893864012,
+      "grad_norm": 0.9998197093667816,
+      "learning_rate": 6.3573923795056e-07,
+      "loss": 0.7064,
+      "step": 1778
+    },
+    {
+      "epoch": 1.6858564321250888,
+      "grad_norm": 0.9471515572718026,
+      "learning_rate": 6.319979293962692e-07,
+      "loss": 0.6864,
+      "step": 1779
+    },
+    {
+      "epoch": 1.6868040748637765,
+      "grad_norm": 1.1324852784222434,
+      "learning_rate": 6.282669192770896e-07,
+      "loss": 0.6993,
+      "step": 1780
+    },
+    {
+      "epoch": 1.687751717602464,
+      "grad_norm": 1.1270168615840963,
+      "learning_rate": 6.245462163896188e-07,
+      "loss": 0.6916,
+      "step": 1781
+    },
+    {
+      "epoch": 1.6886993603411513,
+      "grad_norm": 1.0074883404182633,
+      "learning_rate": 6.208358295061572e-07,
+      "loss": 0.6657,
+      "step": 1782
+    },
+    {
+      "epoch": 1.6886993603411513,
+      "eval_loss": 0.9154070615768433,
+      "eval_runtime": 66.96,
+      "eval_samples_per_second": 40.741,
+      "eval_steps_per_second": 0.642,
+      "step": 1782
+    },
+    {
+      "epoch": 1.689647003079839,
+      "grad_norm": 1.054498287010899,
+      "learning_rate": 6.171357673746798e-07,
+      "loss": 0.6781,
+      "step": 1783
+    },
+    {
+      "epoch": 1.6905946458185264,
+      "grad_norm": 1.1173569579127263,
+      "learning_rate": 6.134460387188207e-07,
+      "loss": 0.7066,
+      "step": 1784
+    },
+    {
+      "epoch": 1.6915422885572138,
+      "grad_norm": 1.1694100658433553,
+      "learning_rate": 6.097666522378498e-07,
+      "loss": 0.7334,
+      "step": 1785
+    },
+    {
+      "epoch": 1.6924899312959014,
+      "grad_norm": 1.0277071128785418,
+      "learning_rate": 6.060976166066546e-07,
+      "loss": 0.653,
+      "step": 1786
+    },
+    {
+      "epoch": 1.693437574034589,
+      "grad_norm": 0.9433860569393974,
+      "learning_rate": 6.024389404757164e-07,
+      "loss": 0.7334,
+      "step": 1787
+    },
+    {
+      "epoch": 1.6943852167732765,
+      "grad_norm": 1.1834900094280103,
+      "learning_rate": 5.98790632471094e-07,
+      "loss": 0.6869,
+      "step": 1788
+    },
+    {
+      "epoch": 1.6953328595119639,
+      "grad_norm": 1.0548703068294034,
+      "learning_rate": 5.951527011944008e-07,
+      "loss": 0.6971,
+      "step": 1789
+    },
+    {
+      "epoch": 1.6962805022506515,
+      "grad_norm": 1.04211743004335,
+      "learning_rate": 5.91525155222783e-07,
+      "loss": 0.6956,
+      "step": 1790
+    },
+    {
+      "epoch": 1.6972281449893392,
+      "grad_norm": 0.9772649558392219,
+      "learning_rate": 5.879080031089047e-07,
+      "loss": 0.6854,
+      "step": 1791
+    },
+    {
+      "epoch": 1.6981757877280266,
+      "grad_norm": 1.0335742882872847,
+      "learning_rate": 5.843012533809211e-07,
+      "loss": 0.6413,
+      "step": 1792
+    },
+    {
+      "epoch": 1.699123430466714,
+      "grad_norm": 1.0424548733114698,
+      "learning_rate": 5.807049145424648e-07,
+      "loss": 0.6913,
+      "step": 1793
+    },
+    {
+      "epoch": 1.7000710732054016,
+      "grad_norm": 0.9133463333235484,
+      "learning_rate": 5.771189950726191e-07,
+      "loss": 0.7096,
+      "step": 1794
+    },
+    {
+      "epoch": 1.701018715944089,
+      "grad_norm": 1.052652054283284,
+      "learning_rate": 5.735435034259057e-07,
+      "loss": 0.6999,
+      "step": 1795
+    },
+    {
+      "epoch": 1.7019663586827765,
+      "grad_norm": 0.9804718340800169,
+      "learning_rate": 5.699784480322568e-07,
+      "loss": 0.7222,
+      "step": 1796
+    },
+    {
+      "epoch": 1.702914001421464,
+      "grad_norm": 1.0457563081701153,
+      "learning_rate": 5.664238372970016e-07,
+      "loss": 0.7255,
+      "step": 1797
+    },
+    {
+      "epoch": 1.7038616441601517,
+      "grad_norm": 1.032471323678011,
+      "learning_rate": 5.628796796008435e-07,
+      "loss": 0.7157,
+      "step": 1798
+    },
+    {
+      "epoch": 1.7048092868988391,
+      "grad_norm": 1.060728898729736,
+      "learning_rate": 5.593459832998388e-07,
+      "loss": 0.7115,
+      "step": 1799
+    },
+    {
+      "epoch": 1.7057569296375266,
+      "grad_norm": 0.9198580084720938,
+      "learning_rate": 5.558227567253832e-07,
+      "loss": 0.6637,
+      "step": 1800
+    },
+    {
+      "epoch": 1.7067045723762142,
+      "grad_norm": 1.1801823503070277,
+      "learning_rate": 5.52310008184182e-07,
+      "loss": 0.6761,
+      "step": 1801
+    },
+    {
+      "epoch": 1.7076522151149018,
+      "grad_norm": 0.9488900309898747,
+      "learning_rate": 5.488077459582425e-07,
+      "loss": 0.6881,
+      "step": 1802
+    },
+    {
+      "epoch": 1.708599857853589,
+      "grad_norm": 1.0453768970538024,
+      "learning_rate": 5.453159783048434e-07,
+      "loss": 0.6938,
+      "step": 1803
+    },
+    {
+      "epoch": 1.7095475005922767,
+      "grad_norm": 0.8783306154356906,
+      "learning_rate": 5.418347134565249e-07,
+      "loss": 0.7375,
+      "step": 1804
+    },
+    {
+      "epoch": 1.7095475005922767,
+      "eval_loss": 0.9155663847923279,
+      "eval_runtime": 61.2948,
+      "eval_samples_per_second": 44.506,
+      "eval_steps_per_second": 0.702,
+      "step": 1804
+    },
+    {
+      "epoch": 1.7104951433309643,
+      "grad_norm": 0.9783675322880301,
+      "learning_rate": 5.383639596210605e-07,
+      "loss": 0.7133,
+      "step": 1805
+    },
+    {
+      "epoch": 1.7114427860696517,
+      "grad_norm": 1.071623947559265,
+      "learning_rate": 5.349037249814443e-07,
+      "loss": 0.717,
+      "step": 1806
+    },
+    {
+      "epoch": 1.7123904288083391,
+      "grad_norm": 0.9742720184084963,
+      "learning_rate": 5.314540176958699e-07,
+      "loss": 0.6707,
+      "step": 1807
+    },
+    {
+      "epoch": 1.7133380715470268,
+      "grad_norm": 1.0237973061069328,
+      "learning_rate": 5.28014845897708e-07,
+      "loss": 0.6885,
+      "step": 1808
+    },
+    {
+      "epoch": 1.7142857142857144,
+      "grad_norm": 1.1521501110479333,
+      "learning_rate": 5.24586217695493e-07,
+      "loss": 0.6501,
+      "step": 1809
+    },
+    {
+      "epoch": 1.7152333570244018,
+      "grad_norm": 1.103148137037962,
+      "learning_rate": 5.211681411728969e-07,
+      "loss": 0.7074,
+      "step": 1810
+    },
+    {
+      "epoch": 1.7161809997630892,
+      "grad_norm": 1.0912149190601321,
+      "learning_rate": 5.177606243887184e-07,
+      "loss": 0.6816,
+      "step": 1811
+    },
+    {
+      "epoch": 1.7171286425017769,
+      "grad_norm": 1.1479638151703746,
+      "learning_rate": 5.14363675376855e-07,
+      "loss": 0.6941,
+      "step": 1812
+    },
+    {
+      "epoch": 1.7180762852404643,
+      "grad_norm": 1.0169229566279356,
+      "learning_rate": 5.109773021462921e-07,
+      "loss": 0.6869,
+      "step": 1813
+    },
+    {
+      "epoch": 1.7190239279791517,
+      "grad_norm": 1.0911518292945759,
+      "learning_rate": 5.076015126810784e-07,
+      "loss": 0.6936,
+      "step": 1814
+    },
+    {
+      "epoch": 1.7199715707178393,
+      "grad_norm": 1.0143944326536474,
+      "learning_rate": 5.042363149403106e-07,
+      "loss": 0.6826,
+      "step": 1815
+    },
+    {
+      "epoch": 1.720919213456527,
+      "grad_norm": 1.0145646424072496,
+      "learning_rate": 5.008817168581137e-07,
+      "loss": 0.738,
+      "step": 1816
+    },
+    {
+      "epoch": 1.7218668561952144,
+      "grad_norm": 0.9897806551140146,
+      "learning_rate": 4.975377263436193e-07,
+      "loss": 0.702,
+      "step": 1817
+    },
+    {
+      "epoch": 1.7228144989339018,
+      "grad_norm": 0.9854817267582501,
+      "learning_rate": 4.94204351280953e-07,
+      "loss": 0.7192,
+      "step": 1818
+    },
+    {
+      "epoch": 1.7237621416725895,
+      "grad_norm": 1.5230326854807534,
+      "learning_rate": 4.908815995292082e-07,
+      "loss": 0.7293,
+      "step": 1819
+    },
+    {
+      "epoch": 1.724709784411277,
+      "grad_norm": 1.2847042251851852,
+      "learning_rate": 4.875694789224372e-07,
+      "loss": 0.6911,
+      "step": 1820
+    },
+    {
+      "epoch": 1.7256574271499645,
+      "grad_norm": 1.026112774551678,
+      "learning_rate": 4.842679972696213e-07,
+      "loss": 0.6836,
+      "step": 1821
+    },
+    {
+      "epoch": 1.726605069888652,
+      "grad_norm": 1.0314353073227083,
+      "learning_rate": 4.809771623546627e-07,
+      "loss": 0.6813,
+      "step": 1822
+    },
+    {
+      "epoch": 1.7275527126273396,
+      "grad_norm": 0.9609581306542341,
+      "learning_rate": 4.776969819363614e-07,
+      "loss": 0.7,
+      "step": 1823
+    },
+    {
+      "epoch": 1.728500355366027,
+      "grad_norm": 1.0594554399314495,
+      "learning_rate": 4.7442746374839363e-07,
+      "loss": 0.6848,
+      "step": 1824
+    },
+    {
+      "epoch": 1.7294479981047144,
+      "grad_norm": 1.0911056243232513,
+      "learning_rate": 4.711686154993028e-07,
+      "loss": 0.6629,
+      "step": 1825
+    },
+    {
+      "epoch": 1.730395640843402,
+      "grad_norm": 1.0318006859508377,
+      "learning_rate": 4.6792044487247003e-07,
+      "loss": 0.6968,
+      "step": 1826
+    },
+    {
+      "epoch": 1.730395640843402,
+      "eval_loss": 0.9146263003349304,
+      "eval_runtime": 63.3709,
+      "eval_samples_per_second": 43.048,
+      "eval_steps_per_second": 0.679,
+      "step": 1826
+    },
+    {
+      "epoch": 1.7313432835820897,
+      "grad_norm": 0.9085218539728935,
+      "learning_rate": 4.646829595261071e-07,
+      "loss": 0.6937,
+      "step": 1827
+    },
+    {
+      "epoch": 1.732290926320777,
+      "grad_norm": 1.1715576080637178,
+      "learning_rate": 4.614561670932288e-07,
+      "loss": 0.7269,
+      "step": 1828
+    },
+    {
+      "epoch": 1.7332385690594645,
+      "grad_norm": 1.1027251721128186,
+      "learning_rate": 4.582400751816435e-07,
+      "loss": 0.7023,
+      "step": 1829
+    },
+    {
+      "epoch": 1.7341862117981521,
+      "grad_norm": 1.110203701042969,
+      "learning_rate": 4.5503469137392565e-07,
+      "loss": 0.6782,
+      "step": 1830
+    },
+    {
+      "epoch": 1.7351338545368398,
+      "grad_norm": 1.208247198988513,
+      "learning_rate": 4.5184002322740784e-07,
+      "loss": 0.7379,
+      "step": 1831
+    },
+    {
+      "epoch": 1.736081497275527,
+      "grad_norm": 0.9596889918326993,
+      "learning_rate": 4.486560782741578e-07,
+      "loss": 0.7485,
+      "step": 1832
+    },
+    {
+      "epoch": 1.7370291400142146,
+      "grad_norm": 0.9856266219344122,
+      "learning_rate": 4.454828640209574e-07,
+      "loss": 0.7021,
+      "step": 1833
+    },
+    {
+      "epoch": 1.7379767827529022,
+      "grad_norm": 1.0066120094983713,
+      "learning_rate": 4.423203879492943e-07,
+      "loss": 0.6334,
+      "step": 1834
+    },
+    {
+      "epoch": 1.7389244254915897,
+      "grad_norm": 1.0174910309231802,
+      "learning_rate": 4.3916865751533313e-07,
+      "loss": 0.6737,
+      "step": 1835
+    },
+    {
+      "epoch": 1.739872068230277,
+      "grad_norm": 1.090557902374621,
+      "learning_rate": 4.360276801499086e-07,
+      "loss": 0.6986,
+      "step": 1836
+    },
+    {
+      "epoch": 1.7408197109689647,
+      "grad_norm": 0.9525400709898934,
+      "learning_rate": 4.3289746325849924e-07,
+      "loss": 0.6387,
+      "step": 1837
+    },
+    {
+      "epoch": 1.7417673537076523,
+      "grad_norm": 0.9714172712407362,
+      "learning_rate": 4.29778014221216e-07,
+      "loss": 0.7426,
+      "step": 1838
+    },
+    {
+      "epoch": 1.7427149964463398,
+      "grad_norm": 1.011041205364556,
+      "learning_rate": 4.2666934039278017e-07,
+      "loss": 0.7251,
+      "step": 1839
+    },
+    {
+      "epoch": 1.7436626391850272,
+      "grad_norm": 1.0835244258679044,
+      "learning_rate": 4.2357144910251003e-07,
+      "loss": 0.7394,
+      "step": 1840
+    },
+    {
+      "epoch": 1.7446102819237148,
+      "grad_norm": 0.9136527438313549,
+      "learning_rate": 4.20484347654303e-07,
+      "loss": 0.6833,
+      "step": 1841
+    },
+    {
+      "epoch": 1.7455579246624022,
+      "grad_norm": 0.9394753418742889,
+      "learning_rate": 4.1740804332661365e-07,
+      "loss": 0.7183,
+      "step": 1842
+    },
+    {
+      "epoch": 1.7465055674010896,
+      "grad_norm": 1.1163656927143548,
+      "learning_rate": 4.1434254337244404e-07,
+      "loss": 0.6688,
+      "step": 1843
+    },
+    {
+      "epoch": 1.7474532101397773,
+      "grad_norm": 1.0767321655039874,
+      "learning_rate": 4.1128785501931947e-07,
+      "loss": 0.7301,
+      "step": 1844
+    },
+    {
+      "epoch": 1.748400852878465,
+      "grad_norm": 1.091800385892591,
+      "learning_rate": 4.0824398546927823e-07,
+      "loss": 0.7628,
+      "step": 1845
+    },
+    {
+      "epoch": 1.7493484956171523,
+      "grad_norm": 1.1875176453886316,
+      "learning_rate": 4.05210941898847e-07,
+      "loss": 0.7532,
+      "step": 1846
+    },
+    {
+      "epoch": 1.7502961383558397,
+      "grad_norm": 0.9616131150389049,
+      "learning_rate": 4.021887314590323e-07,
+      "loss": 0.7407,
+      "step": 1847
+    },
+    {
+      "epoch": 1.7512437810945274,
+      "grad_norm": 1.03865176835429,
+      "learning_rate": 3.9917736127529525e-07,
+      "loss": 0.7331,
+      "step": 1848
+    },
+    {
+      "epoch": 1.7512437810945274,
+      "eval_loss": 0.914626955986023,
+      "eval_runtime": 61.9304,
+      "eval_samples_per_second": 44.049,
+      "eval_steps_per_second": 0.694,
+      "step": 1848
+    },
+    {
+      "epoch": 1.752191423833215,
+      "grad_norm": 1.023686798198027,
+      "learning_rate": 3.9617683844754284e-07,
+      "loss": 0.7311,
+      "step": 1849
+    },
+    {
+      "epoch": 1.7531390665719024,
+      "grad_norm": 0.9592372092729335,
+      "learning_rate": 3.9318717005010496e-07,
+      "loss": 0.7405,
+      "step": 1850
+    },
+    {
+      "epoch": 1.7540867093105899,
+      "grad_norm": 1.0255950736345492,
+      "learning_rate": 3.902083631317194e-07,
+      "loss": 0.6882,
+      "step": 1851
+    },
+    {
+      "epoch": 1.7550343520492775,
+      "grad_norm": 1.0334498715269957,
+      "learning_rate": 3.8724042471551925e-07,
+      "loss": 0.6409,
+      "step": 1852
+    },
+    {
+      "epoch": 1.755981994787965,
+      "grad_norm": 1.187230644417929,
+      "learning_rate": 3.8428336179900773e-07,
+      "loss": 0.687,
+      "step": 1853
+    },
+    {
+      "epoch": 1.7569296375266523,
+      "grad_norm": 1.0889047565138557,
+      "learning_rate": 3.8133718135405283e-07,
+      "loss": 0.713,
+      "step": 1854
+    },
+    {
+      "epoch": 1.75787728026534,
+      "grad_norm": 0.9442970597032486,
+      "learning_rate": 3.784018903268588e-07,
+      "loss": 0.6456,
+      "step": 1855
+    },
+    {
+      "epoch": 1.7588249230040276,
+      "grad_norm": 0.9511254579043233,
+      "learning_rate": 3.7547749563796144e-07,
+      "loss": 0.675,
+      "step": 1856
+    },
+    {
+      "epoch": 1.759772565742715,
+      "grad_norm": 1.07549297963586,
+      "learning_rate": 3.725640041822026e-07,
+      "loss": 0.7639,
+      "step": 1857
+    },
+    {
+      "epoch": 1.7607202084814024,
+      "grad_norm": 1.2232858699290627,
+      "learning_rate": 3.6966142282871873e-07,
+      "loss": 0.738,
+      "step": 1858
+    },
+    {
+      "epoch": 1.76166785122009,
+      "grad_norm": 0.977573869114317,
+      "learning_rate": 3.667697584209251e-07,
+      "loss": 0.6537,
+      "step": 1859
+    },
+    {
+      "epoch": 1.7626154939587777,
+      "grad_norm": 1.0901080710066895,
+      "learning_rate": 3.638890177764948e-07,
+      "loss": 0.6607,
+      "step": 1860
+    },
+    {
+      "epoch": 1.763563136697465,
+      "grad_norm": 1.0047668118955564,
+      "learning_rate": 3.610192076873498e-07,
+      "loss": 0.6992,
+      "step": 1861
+    },
+    {
+      "epoch": 1.7645107794361525,
+      "grad_norm": 1.0810551846393102,
+      "learning_rate": 3.581603349196372e-07,
+      "loss": 0.7749,
+      "step": 1862
+    },
+    {
+      "epoch": 1.7654584221748402,
+      "grad_norm": 1.2746674219972365,
+      "learning_rate": 3.553124062137203e-07,
+      "loss": 0.697,
+      "step": 1863
+    },
+    {
+      "epoch": 1.7664060649135276,
+      "grad_norm": 1.0162449684395298,
+      "learning_rate": 3.524754282841575e-07,
+      "loss": 0.741,
+      "step": 1864
+    },
+    {
+      "epoch": 1.767353707652215,
+      "grad_norm": 0.9261105845228721,
+      "learning_rate": 3.49649407819691e-07,
+      "loss": 0.6527,
+      "step": 1865
+    },
+    {
+      "epoch": 1.7683013503909026,
+      "grad_norm": 1.0754817790075664,
+      "learning_rate": 3.468343514832251e-07,
+      "loss": 0.6518,
+      "step": 1866
+    },
+    {
+      "epoch": 1.7692489931295903,
+      "grad_norm": 0.9487301149199158,
+      "learning_rate": 3.440302659118172e-07,
+      "loss": 0.7055,
+      "step": 1867
+    },
+    {
+      "epoch": 1.7701966358682777,
+      "grad_norm": 0.9988153915506769,
+      "learning_rate": 3.4123715771665786e-07,
+      "loss": 0.6693,
+      "step": 1868
+    },
+    {
+      "epoch": 1.771144278606965,
+      "grad_norm": 1.1376902119777852,
+      "learning_rate": 3.3845503348305554e-07,
+      "loss": 0.6901,
+      "step": 1869
+    },
+    {
+      "epoch": 1.7720919213456527,
+      "grad_norm": 0.9271287842743362,
+      "learning_rate": 3.356838997704226e-07,
+      "loss": 0.6715,
+      "step": 1870
+    },
+    {
+      "epoch": 1.7720919213456527,
+      "eval_loss": 0.9146553874015808,
+      "eval_runtime": 65.9577,
+      "eval_samples_per_second": 41.36,
+      "eval_steps_per_second": 0.652,
+      "step": 1870
+    },
+    {
+      "epoch": 1.7730395640843402,
+      "grad_norm": 1.114859990972608,
+      "learning_rate": 3.3292376311225837e-07,
+      "loss": 0.7206,
+      "step": 1871
+    },
+    {
+      "epoch": 1.7739872068230276,
+      "grad_norm": 0.9957104184290693,
+      "learning_rate": 3.3017463001613625e-07,
+      "loss": 0.7175,
+      "step": 1872
+    },
+    {
+      "epoch": 1.7749348495617152,
+      "grad_norm": 1.085353272133234,
+      "learning_rate": 3.274365069636831e-07,
+      "loss": 0.7183,
+      "step": 1873
+    },
+    {
+      "epoch": 1.7758824923004028,
+      "grad_norm": 0.9620062890136332,
+      "learning_rate": 3.247094004105711e-07,
+      "loss": 0.6941,
+      "step": 1874
+    },
+    {
+      "epoch": 1.7768301350390903,
+      "grad_norm": 1.0208277814345714,
+      "learning_rate": 3.2199331678649804e-07,
+      "loss": 0.6735,
+      "step": 1875
+    },
+    {
+      "epoch": 1.7777777777777777,
+      "grad_norm": 1.085385982876263,
+      "learning_rate": 3.1928826249516984e-07,
+      "loss": 0.7081,
+      "step": 1876
+    },
+    {
+      "epoch": 1.7787254205164653,
+      "grad_norm": 0.9829670594766973,
+      "learning_rate": 3.165942439142927e-07,
+      "loss": 0.6604,
+      "step": 1877
+    },
+    {
+      "epoch": 1.779673063255153,
+      "grad_norm": 1.0546523994379018,
+      "learning_rate": 3.1391126739555134e-07,
+      "loss": 0.6916,
+      "step": 1878
+    },
+    {
+      "epoch": 1.7806207059938404,
+      "grad_norm": 1.3474084051089132,
+      "learning_rate": 3.112393392645985e-07,
+      "loss": 0.7241,
+      "step": 1879
+    },
+    {
+      "epoch": 1.7815683487325278,
+      "grad_norm": 1.2195465607727498,
+      "learning_rate": 3.0857846582103504e-07,
+      "loss": 0.7133,
+      "step": 1880
+    },
+    {
+      "epoch": 1.7825159914712154,
+      "grad_norm": 1.0574688930127023,
+      "learning_rate": 3.059286533384021e-07,
+      "loss": 0.6827,
+      "step": 1881
+    },
+    {
+      "epoch": 1.7834636342099028,
+      "grad_norm": 0.9882282740053548,
+      "learning_rate": 3.0328990806415935e-07,
+      "loss": 0.6634,
+      "step": 1882
+    },
+    {
+      "epoch": 1.7844112769485903,
+      "grad_norm": 1.6186322230221908,
+      "learning_rate": 3.006622362196748e-07,
+      "loss": 0.681,
+      "step": 1883
+    },
+    {
+      "epoch": 1.785358919687278,
+      "grad_norm": 0.9919895774024126,
+      "learning_rate": 2.9804564400021e-07,
+      "loss": 0.6462,
+      "step": 1884
+    },
+    {
+      "epoch": 1.7863065624259655,
+      "grad_norm": 1.023431960722124,
+      "learning_rate": 2.9544013757489944e-07,
+      "loss": 0.6782,
+      "step": 1885
+    },
+    {
+      "epoch": 1.787254205164653,
+      "grad_norm": 1.4703356457868497,
+      "learning_rate": 2.92845723086746e-07,
+      "loss": 0.7125,
+      "step": 1886
+    },
+    {
+      "epoch": 1.7882018479033404,
+      "grad_norm": 1.044872020512602,
+      "learning_rate": 2.9026240665259717e-07,
+      "loss": 0.6705,
+      "step": 1887
+    },
+    {
+      "epoch": 1.789149490642028,
+      "grad_norm": 1.0114171562637975,
+      "learning_rate": 2.876901943631372e-07,
+      "loss": 0.7051,
+      "step": 1888
+    },
+    {
+      "epoch": 1.7900971333807156,
+      "grad_norm": 0.9319783340627764,
+      "learning_rate": 2.8512909228286814e-07,
+      "loss": 0.6933,
+      "step": 1889
+    },
+    {
+      "epoch": 1.7910447761194028,
+      "grad_norm": 1.1744614079136493,
+      "learning_rate": 2.8257910645009935e-07,
+      "loss": 0.6957,
+      "step": 1890
+    },
+    {
+      "epoch": 1.7919924188580905,
+      "grad_norm": 0.9991417606881348,
+      "learning_rate": 2.8004024287692944e-07,
+      "loss": 0.7323,
+      "step": 1891
+    },
+    {
+      "epoch": 1.792940061596778,
+      "grad_norm": 1.0062578000109592,
+      "learning_rate": 2.7751250754923574e-07,
+      "loss": 0.6934,
+      "step": 1892
+    },
+    {
+      "epoch": 1.792940061596778,
+      "eval_loss": 0.9148725867271423,
+      "eval_runtime": 60.4722,
+      "eval_samples_per_second": 45.112,
+      "eval_steps_per_second": 0.711,
+      "step": 1892
+    },
+    {
+      "epoch": 1.7938877043354655,
+      "grad_norm": 1.0646128012425906,
+      "learning_rate": 2.7499590642665773e-07,
+      "loss": 0.6725,
+      "step": 1893
+    },
+    {
+      "epoch": 1.794835347074153,
+      "grad_norm": 0.9591060924803608,
+      "learning_rate": 2.724904454425836e-07,
+      "loss": 0.7088,
+      "step": 1894
+    },
+    {
+      "epoch": 1.7957829898128406,
+      "grad_norm": 1.0631806918984852,
+      "learning_rate": 2.699961305041382e-07,
+      "loss": 0.698,
+      "step": 1895
+    },
+    {
+      "epoch": 1.7967306325515282,
+      "grad_norm": 1.082828132414062,
+      "learning_rate": 2.6751296749216395e-07,
+      "loss": 0.6522,
+      "step": 1896
+    },
+    {
+      "epoch": 1.7976782752902156,
+      "grad_norm": 1.3644267899875255,
+      "learning_rate": 2.650409622612138e-07,
+      "loss": 0.6988,
+      "step": 1897
+    },
+    {
+      "epoch": 1.798625918028903,
+      "grad_norm": 1.1184995105345898,
+      "learning_rate": 2.625801206395312e-07,
+      "loss": 0.6482,
+      "step": 1898
+    },
+    {
+      "epoch": 1.7995735607675907,
+      "grad_norm": 1.1815217570033627,
+      "learning_rate": 2.6013044842904233e-07,
+      "loss": 0.6416,
+      "step": 1899
+    },
+    {
+      "epoch": 1.800521203506278,
+      "grad_norm": 0.9571223280821131,
+      "learning_rate": 2.5769195140533556e-07,
+      "loss": 0.7289,
+      "step": 1900
+    },
+    {
+      "epoch": 1.8014688462449655,
+      "grad_norm": 0.9341994296803589,
+      "learning_rate": 2.5526463531765467e-07,
+      "loss": 0.6686,
+      "step": 1901
+    },
+    {
+      "epoch": 1.8024164889836531,
+      "grad_norm": 0.9035497177092859,
+      "learning_rate": 2.528485058888813e-07,
+      "loss": 0.7046,
+      "step": 1902
+    },
+    {
+      "epoch": 1.8033641317223408,
+      "grad_norm": 1.0743252934046903,
+      "learning_rate": 2.5044356881552045e-07,
+      "loss": 0.7197,
+      "step": 1903
+    },
+    {
+      "epoch": 1.8043117744610282,
+      "grad_norm": 1.0079857675814266,
+      "learning_rate": 2.4804982976769197e-07,
+      "loss": 0.6867,
+      "step": 1904
+    },
+    {
+      "epoch": 1.8052594171997156,
+      "grad_norm": 1.059517658637514,
+      "learning_rate": 2.456672943891114e-07,
+      "loss": 0.6749,
+      "step": 1905
+    },
+    {
+      "epoch": 1.8062070599384032,
+      "grad_norm": 1.114382931333417,
+      "learning_rate": 2.4329596829708145e-07,
+      "loss": 0.6778,
+      "step": 1906
+    },
+    {
+      "epoch": 1.8071547026770909,
+      "grad_norm": 1.0664257741407888,
+      "learning_rate": 2.409358570824749e-07,
+      "loss": 0.7155,
+      "step": 1907
+    },
+    {
+      "epoch": 1.8081023454157783,
+      "grad_norm": 1.1085710156266713,
+      "learning_rate": 2.385869663097251e-07,
+      "loss": 0.712,
+      "step": 1908
+    },
+    {
+      "epoch": 1.8090499881544657,
+      "grad_norm": 1.0500505818112218,
+      "learning_rate": 2.362493015168088e-07,
+      "loss": 0.657,
+      "step": 1909
+    },
+    {
+      "epoch": 1.8099976308931534,
+      "grad_norm": 1.1320348644595801,
+      "learning_rate": 2.3392286821523723e-07,
+      "loss": 0.7458,
+      "step": 1910
+    },
+    {
+      "epoch": 1.8109452736318408,
+      "grad_norm": 1.1363033352104308,
+      "learning_rate": 2.316076718900412e-07,
+      "loss": 0.7449,
+      "step": 1911
+    },
+    {
+      "epoch": 1.8118929163705282,
+      "grad_norm": 0.9155284671345113,
+      "learning_rate": 2.2930371799975593e-07,
+      "loss": 0.6543,
+      "step": 1912
+    },
+    {
+      "epoch": 1.8128405591092158,
+      "grad_norm": 1.0126916247727809,
+      "learning_rate": 2.270110119764124e-07,
+      "loss": 0.7248,
+      "step": 1913
+    },
+    {
+      "epoch": 1.8137882018479035,
+      "grad_norm": 1.0272524735780668,
+      "learning_rate": 2.2472955922552164e-07,
+      "loss": 0.7114,
+      "step": 1914
+    },
+    {
+      "epoch": 1.8137882018479035,
+      "eval_loss": 0.9147893786430359,
+      "eval_runtime": 67.7847,
+      "eval_samples_per_second": 40.245,
+      "eval_steps_per_second": 0.634,
+      "step": 1914
+    },
+    {
+      "epoch": 1.8147358445865909,
+      "grad_norm": 1.0985166320474906,
+      "learning_rate": 2.2245936512606314e-07,
+      "loss": 0.6455,
+      "step": 1915
+    },
+    {
+      "epoch": 1.8156834873252783,
+      "grad_norm": 1.0448066285286555,
+      "learning_rate": 2.202004350304715e-07,
+      "loss": 0.6757,
+      "step": 1916
+    },
+    {
+      "epoch": 1.816631130063966,
+      "grad_norm": 1.2157625524349163,
+      "learning_rate": 2.179527742646248e-07,
+      "loss": 0.6647,
+      "step": 1917
+    },
+    {
+      "epoch": 1.8175787728026536,
+      "grad_norm": 0.9580609539987758,
+      "learning_rate": 2.1571638812783125e-07,
+      "loss": 0.6307,
+      "step": 1918
+    },
+    {
+      "epoch": 1.8185264155413408,
+      "grad_norm": 1.0743648151551368,
+      "learning_rate": 2.1349128189281587e-07,
+      "loss": 0.7276,
+      "step": 1919
+    },
+    {
+      "epoch": 1.8194740582800284,
+      "grad_norm": 1.1869777760221731,
+      "learning_rate": 2.112774608057111e-07,
+      "loss": 0.7087,
+      "step": 1920
+    },
+    {
+      "epoch": 1.820421701018716,
+      "grad_norm": 1.0016495300786694,
+      "learning_rate": 2.0907493008604007e-07,
+      "loss": 0.6908,
+      "step": 1921
+    },
+    {
+      "epoch": 1.8213693437574034,
+      "grad_norm": 1.1603016404098847,
+      "learning_rate": 2.068836949267089e-07,
+      "loss": 0.6766,
+      "step": 1922
+    },
+    {
+      "epoch": 1.8223169864960909,
+      "grad_norm": 1.0187010201658582,
+      "learning_rate": 2.0470376049398944e-07,
+      "loss": 0.7093,
+      "step": 1923
+    },
+    {
+      "epoch": 1.8232646292347785,
+      "grad_norm": 1.3203086559264363,
+      "learning_rate": 2.0253513192751374e-07,
+      "loss": 0.6744,
+      "step": 1924
+    },
+    {
+      "epoch": 1.8242122719734661,
+      "grad_norm": 0.9700557122361928,
+      "learning_rate": 2.003778143402535e-07,
+      "loss": 0.6905,
+      "step": 1925
+    },
+    {
+      "epoch": 1.8251599147121536,
+      "grad_norm": 1.3517239993246017,
+      "learning_rate": 1.9823181281851513e-07,
+      "loss": 0.6834,
+      "step": 1926
+    },
+    {
+      "epoch": 1.826107557450841,
+      "grad_norm": 1.02906230765169,
+      "learning_rate": 1.960971324219263e-07,
+      "loss": 0.7265,
+      "step": 1927
+    },
+    {
+      "epoch": 1.8270552001895286,
+      "grad_norm": 1.0585910384244444,
+      "learning_rate": 1.9397377818341945e-07,
+      "loss": 0.6877,
+      "step": 1928
+    },
+    {
+      "epoch": 1.828002842928216,
+      "grad_norm": 0.9475218916837848,
+      "learning_rate": 1.9186175510922666e-07,
+      "loss": 0.7416,
+      "step": 1929
+    },
+    {
+      "epoch": 1.8289504856669034,
+      "grad_norm": 1.0203159431279483,
+      "learning_rate": 1.8976106817886197e-07,
+      "loss": 0.714,
+      "step": 1930
+    },
+    {
+      "epoch": 1.829898128405591,
+      "grad_norm": 1.127014657550726,
+      "learning_rate": 1.876717223451141e-07,
+      "loss": 0.7112,
+      "step": 1931
+    },
+    {
+      "epoch": 1.8308457711442787,
+      "grad_norm": 1.353971401648652,
+      "learning_rate": 1.8559372253403152e-07,
+      "loss": 0.714,
+      "step": 1932
+    },
+    {
+      "epoch": 1.8317934138829661,
+      "grad_norm": 1.0446112826008636,
+      "learning_rate": 1.8352707364491352e-07,
+      "loss": 0.6958,
+      "step": 1933
+    },
+    {
+      "epoch": 1.8327410566216535,
+      "grad_norm": 1.0577797650889291,
+      "learning_rate": 1.814717805502958e-07,
+      "loss": 0.736,
+      "step": 1934
+    },
+    {
+      "epoch": 1.8336886993603412,
+      "grad_norm": 1.2894072021725473,
+      "learning_rate": 1.794278480959416e-07,
+      "loss": 0.7035,
+      "step": 1935
+    },
+    {
+      "epoch": 1.8346363420990288,
+      "grad_norm": 1.0041014130507704,
+      "learning_rate": 1.7739528110083003e-07,
+      "loss": 0.6661,
+      "step": 1936
+    },
+    {
+      "epoch": 1.8346363420990288,
+      "eval_loss": 0.9146416783332825,
+      "eval_runtime": 66.1092,
+      "eval_samples_per_second": 41.265,
+      "eval_steps_per_second": 0.65,
+      "step": 1936
+    },
+    {
+      "epoch": 1.8355839848377162,
+      "grad_norm": 1.2009141696461565,
+      "learning_rate": 1.7537408435714054e-07,
+      "loss": 0.698,
+      "step": 1937
+    },
+    {
+      "epoch": 1.8365316275764036,
+      "grad_norm": 1.0557423818039546,
+      "learning_rate": 1.7336426263024896e-07,
+      "loss": 0.6599,
+      "step": 1938
+    },
+    {
+      "epoch": 1.8374792703150913,
+      "grad_norm": 1.1355253989528187,
+      "learning_rate": 1.7136582065870876e-07,
+      "loss": 0.7389,
+      "step": 1939
+    },
+    {
+      "epoch": 1.8384269130537787,
+      "grad_norm": 0.947567680408928,
+      "learning_rate": 1.6937876315424707e-07,
+      "loss": 0.6902,
+      "step": 1940
+    },
+    {
+      "epoch": 1.8393745557924661,
+      "grad_norm": 1.0469689776321616,
+      "learning_rate": 1.6740309480174633e-07,
+      "loss": 0.6955,
+      "step": 1941
+    },
+    {
+      "epoch": 1.8403221985311538,
+      "grad_norm": 1.0956794304972344,
+      "learning_rate": 1.6543882025923884e-07,
+      "loss": 0.7019,
+      "step": 1942
+    },
+    {
+      "epoch": 1.8412698412698414,
+      "grad_norm": 0.9499513224862992,
+      "learning_rate": 1.6348594415789286e-07,
+      "loss": 0.7197,
+      "step": 1943
+    },
+    {
+      "epoch": 1.8422174840085288,
+      "grad_norm": 1.0290878955410225,
+      "learning_rate": 1.6154447110200256e-07,
+      "loss": 0.6963,
+      "step": 1944
+    },
+    {
+      "epoch": 1.8431651267472162,
+      "grad_norm": 1.084819106183627,
+      "learning_rate": 1.5961440566897913e-07,
+      "loss": 0.6618,
+      "step": 1945
+    },
+    {
+      "epoch": 1.8441127694859039,
+      "grad_norm": 0.9712359809714644,
+      "learning_rate": 1.5769575240933422e-07,
+      "loss": 0.7188,
+      "step": 1946
+    },
+    {
+      "epoch": 1.8450604122245915,
+      "grad_norm": 1.053958386842615,
+      "learning_rate": 1.5578851584667654e-07,
+      "loss": 0.6487,
+      "step": 1947
+    },
+    {
+      "epoch": 1.8460080549632787,
+      "grad_norm": 1.0254427828467692,
+      "learning_rate": 1.5389270047769578e-07,
+      "loss": 0.7443,
+      "step": 1948
+    },
+    {
+      "epoch": 1.8469556977019663,
+      "grad_norm": 1.0100347718010834,
+      "learning_rate": 1.520083107721543e-07,
+      "loss": 0.7099,
+      "step": 1949
+    },
+    {
+      "epoch": 1.847903340440654,
+      "grad_norm": 1.0591527642673004,
+      "learning_rate": 1.5013535117287648e-07,
+      "loss": 0.7101,
+      "step": 1950
+    },
+    {
+      "epoch": 1.8488509831793414,
+      "grad_norm": 0.9196055199586443,
+      "learning_rate": 1.482738260957378e-07,
+      "loss": 0.6968,
+      "step": 1951
+    },
+    {
+      "epoch": 1.8497986259180288,
+      "grad_norm": 1.082437628745268,
+      "learning_rate": 1.4642373992965365e-07,
+      "loss": 0.6848,
+      "step": 1952
+    },
+    {
+      "epoch": 1.8507462686567164,
+      "grad_norm": 1.020089742858143,
+      "learning_rate": 1.4458509703657197e-07,
+      "loss": 0.7327,
+      "step": 1953
+    },
+    {
+      "epoch": 1.851693911395404,
+      "grad_norm": 1.138043400664436,
+      "learning_rate": 1.427579017514591e-07,
+      "loss": 0.7166,
+      "step": 1954
+    },
+    {
+      "epoch": 1.8526415541340915,
+      "grad_norm": 1.034045450725446,
+      "learning_rate": 1.4094215838229176e-07,
+      "loss": 0.7585,
+      "step": 1955
+    },
+    {
+      "epoch": 1.853589196872779,
+      "grad_norm": 1.7484699081096364,
+      "learning_rate": 1.3913787121004717e-07,
+      "loss": 0.6699,
+      "step": 1956
+    },
+    {
+      "epoch": 1.8545368396114665,
+      "grad_norm": 1.1518516482534196,
+      "learning_rate": 1.3734504448869147e-07,
+      "loss": 0.7528,
+      "step": 1957
+    },
+    {
+      "epoch": 1.855484482350154,
+      "grad_norm": 1.2727867307662963,
+      "learning_rate": 1.3556368244517116e-07,
+      "loss": 0.7042,
+      "step": 1958
+    },
+    {
+      "epoch": 1.855484482350154,
+      "eval_loss": 0.9145249128341675,
+      "eval_runtime": 68.0932,
+      "eval_samples_per_second": 40.063,
+      "eval_steps_per_second": 0.631,
+      "step": 1958
+    },
+    {
+      "epoch": 1.8564321250888414,
+      "grad_norm": 0.9279042520126958,
+      "learning_rate": 1.3379378927940167e-07,
+      "loss": 0.7096,
+      "step": 1959
+    },
+    {
+      "epoch": 1.857379767827529,
+      "grad_norm": 0.8860547537873784,
+      "learning_rate": 1.3203536916425842e-07,
+      "loss": 0.6665,
+      "step": 1960
+    },
+    {
+      "epoch": 1.8583274105662166,
+      "grad_norm": 0.9885824244176509,
+      "learning_rate": 1.3028842624556893e-07,
+      "loss": 0.6769,
+      "step": 1961
+    },
+    {
+      "epoch": 1.859275053304904,
+      "grad_norm": 1.1914468020222337,
+      "learning_rate": 1.2855296464209687e-07,
+      "loss": 0.6548,
+      "step": 1962
+    },
+    {
+      "epoch": 1.8602226960435915,
+      "grad_norm": 1.2643990216476162,
+      "learning_rate": 1.2682898844554093e-07,
+      "loss": 0.7257,
+      "step": 1963
+    },
+    {
+      "epoch": 1.861170338782279,
+      "grad_norm": 0.9779836857682124,
+      "learning_rate": 1.2511650172051636e-07,
+      "loss": 0.6888,
+      "step": 1964
+    },
+    {
+      "epoch": 1.8621179815209667,
+      "grad_norm": 0.9438092146007916,
+      "learning_rate": 1.2341550850455353e-07,
+      "loss": 0.6962,
+      "step": 1965
+    },
+    {
+      "epoch": 1.8630656242596542,
+      "grad_norm": 1.24403617918052,
+      "learning_rate": 1.217260128080816e-07,
+      "loss": 0.733,
+      "step": 1966
+    },
+    {
+      "epoch": 1.8640132669983416,
+      "grad_norm": 0.9517426709373272,
+      "learning_rate": 1.2004801861442373e-07,
+      "loss": 0.7037,
+      "step": 1967
+    },
+    {
+      "epoch": 1.8649609097370292,
+      "grad_norm": 1.1778147961281975,
+      "learning_rate": 1.183815298797858e-07,
+      "loss": 0.7429,
+      "step": 1968
+    },
+    {
+      "epoch": 1.8659085524757166,
+      "grad_norm": 1.104468634647898,
+      "learning_rate": 1.1672655053324655e-07,
+      "loss": 0.712,
+      "step": 1969
+    },
+    {
+      "epoch": 1.866856195214404,
+      "grad_norm": 1.0425031543126493,
+      "learning_rate": 1.1508308447674977e-07,
+      "loss": 0.7324,
+      "step": 1970
+    },
+    {
+      "epoch": 1.8678038379530917,
+      "grad_norm": 1.0608934372919678,
+      "learning_rate": 1.1345113558509424e-07,
+      "loss": 0.7224,
+      "step": 1971
+    },
+    {
+      "epoch": 1.8687514806917793,
+      "grad_norm": 0.9785171863226142,
+      "learning_rate": 1.1183070770592442e-07,
+      "loss": 0.7362,
+      "step": 1972
+    },
+    {
+      "epoch": 1.8696991234304667,
+      "grad_norm": 1.0041114112851635,
+      "learning_rate": 1.1022180465972198e-07,
+      "loss": 0.7145,
+      "step": 1973
+    },
+    {
+      "epoch": 1.8706467661691542,
+      "grad_norm": 1.0344356033639341,
+      "learning_rate": 1.0862443023979651e-07,
+      "loss": 0.6638,
+      "step": 1974
+    },
+    {
+      "epoch": 1.8715944089078418,
+      "grad_norm": 1.1440137970112205,
+      "learning_rate": 1.0703858821227541e-07,
+      "loss": 0.72,
+      "step": 1975
+    },
+    {
+      "epoch": 1.8725420516465292,
+      "grad_norm": 1.011318166969755,
+      "learning_rate": 1.0546428231609896e-07,
+      "loss": 0.7001,
+      "step": 1976
+    },
+    {
+      "epoch": 1.8734896943852166,
+      "grad_norm": 1.1534751759952744,
+      "learning_rate": 1.0390151626300527e-07,
+      "loss": 0.7046,
+      "step": 1977
+    },
+    {
+      "epoch": 1.8744373371239043,
+      "grad_norm": 1.090411632798464,
+      "learning_rate": 1.0235029373752758e-07,
+      "loss": 0.6901,
+      "step": 1978
+    },
+    {
+      "epoch": 1.875384979862592,
+      "grad_norm": 1.07336526868862,
+      "learning_rate": 1.0081061839698259e-07,
+      "loss": 0.723,
+      "step": 1979
+    },
+    {
+      "epoch": 1.8763326226012793,
+      "grad_norm": 1.0079673427741604,
+      "learning_rate": 9.928249387145983e-08,
+      "loss": 0.6381,
+      "step": 1980
+    },
+    {
+      "epoch": 1.8763326226012793,
+      "eval_loss": 0.9143509268760681,
+      "eval_runtime": 64.728,
+      "eval_samples_per_second": 42.146,
+      "eval_steps_per_second": 0.664,
+      "step": 1980
+    },
+    {
+      "epoch": 1.8772802653399667,
+      "grad_norm": 1.1089070458655792,
+      "learning_rate": 9.776592376381955e-08,
+      "loss": 0.684,
+      "step": 1981
+    },
+    {
+      "epoch": 1.8782279080786544,
+      "grad_norm": 1.0295711500308624,
+      "learning_rate": 9.626091164967599e-08,
+      "loss": 0.6981,
+      "step": 1982
+    },
+    {
+      "epoch": 1.879175550817342,
+      "grad_norm": 1.1306401873193062,
+      "learning_rate": 9.476746107739577e-08,
+      "loss": 0.6419,
+      "step": 1983
+    },
+    {
+      "epoch": 1.8801231935560294,
+      "grad_norm": 1.0143892851114753,
+      "learning_rate": 9.32855755680867e-08,
+      "loss": 0.7298,
+      "step": 1984
+    },
+    {
+      "epoch": 1.8810708362947168,
+      "grad_norm": 1.200139953828078,
+      "learning_rate": 9.181525861558849e-08,
+      "loss": 0.7129,
+      "step": 1985
+    },
+    {
+      "epoch": 1.8820184790334045,
+      "grad_norm": 1.1828188126049186,
+      "learning_rate": 9.035651368646647e-08,
+      "loss": 0.6156,
+      "step": 1986
+    },
+    {
+      "epoch": 1.8829661217720919,
+      "grad_norm": 0.9893546670383357,
+      "learning_rate": 8.89093442200023e-08,
+      "loss": 0.6687,
+      "step": 1987
+    },
+    {
+      "epoch": 1.8839137645107793,
+      "grad_norm": 1.3094641319150206,
+      "learning_rate": 8.747375362818667e-08,
+      "loss": 0.6492,
+      "step": 1988
+    },
+    {
+      "epoch": 1.884861407249467,
+      "grad_norm": 0.9994122322268034,
+      "learning_rate": 8.604974529571042e-08,
+      "loss": 0.6722,
+      "step": 1989
+    },
+    {
+      "epoch": 1.8858090499881546,
+      "grad_norm": 1.0426017376290013,
+      "learning_rate": 8.463732257995571e-08,
+      "loss": 0.727,
+      "step": 1990
+    },
+    {
+      "epoch": 1.886756692726842,
+      "grad_norm": 1.2180348697029497,
+      "learning_rate": 8.323648881099211e-08,
+      "loss": 0.7231,
+      "step": 1991
+    },
+    {
+      "epoch": 1.8877043354655294,
+      "grad_norm": 1.0642272839244786,
+      "learning_rate": 8.184724729156379e-08,
+      "loss": 0.7435,
+      "step": 1992
+    },
+    {
+      "epoch": 1.888651978204217,
+      "grad_norm": 0.9269037955466151,
+      "learning_rate": 8.046960129708348e-08,
+      "loss": 0.6886,
+      "step": 1993
+    },
+    {
+      "epoch": 1.8895996209429047,
+      "grad_norm": 0.9229485938696481,
+      "learning_rate": 7.910355407562742e-08,
+      "loss": 0.7321,
+      "step": 1994
+    },
+    {
+      "epoch": 1.890547263681592,
+      "grad_norm": 1.0976428013397825,
+      "learning_rate": 7.774910884792319e-08,
+      "loss": 0.7073,
+      "step": 1995
+    },
+    {
+      "epoch": 1.8914949064202795,
+      "grad_norm": 0.9509585098963864,
+      "learning_rate": 7.640626880734581e-08,
+      "loss": 0.6638,
+      "step": 1996
+    },
+    {
+      "epoch": 1.8924425491589671,
+      "grad_norm": 1.0689281848692378,
+      "learning_rate": 7.507503711990771e-08,
+      "loss": 0.6994,
+      "step": 1997
+    },
+    {
+      "epoch": 1.8933901918976546,
+      "grad_norm": 0.9440594653903914,
+      "learning_rate": 7.375541692425325e-08,
+      "loss": 0.7163,
+      "step": 1998
+    },
+    {
+      "epoch": 1.894337834636342,
+      "grad_norm": 1.231877807532716,
+      "learning_rate": 7.244741133164979e-08,
+      "loss": 0.7402,
+      "step": 1999
+    },
+    {
+      "epoch": 1.8952854773750296,
+      "grad_norm": 1.021884925509466,
+      "learning_rate": 7.115102342598101e-08,
+      "loss": 0.7227,
+      "step": 2000
+    },
+    {
+      "epoch": 1.8962331201137173,
+      "grad_norm": 1.003242355792749,
+      "learning_rate": 6.986625626373978e-08,
+      "loss": 0.7284,
+      "step": 2001
+    },
+    {
+      "epoch": 1.8971807628524047,
+      "grad_norm": 1.0311894967892232,
+      "learning_rate": 6.859311287402081e-08,
+      "loss": 0.745,
+      "step": 2002
+    },
+    {
+      "epoch": 1.8971807628524047,
+      "eval_loss": 0.9144185185432434,
+      "eval_runtime": 63.7067,
+      "eval_samples_per_second": 42.821,
+      "eval_steps_per_second": 0.675,
+      "step": 2002
+    },
+    {
+      "epoch": 1.898128405591092,
+      "grad_norm": 0.933859188399113,
+      "learning_rate": 6.733159625851304e-08,
+      "loss": 0.7088,
+      "step": 2003
+    },
+    {
+      "epoch": 1.8990760483297797,
+      "grad_norm": 0.9503318061850076,
+      "learning_rate": 6.608170939149283e-08,
+      "loss": 0.666,
+      "step": 2004
+    },
+    {
+      "epoch": 1.9000236910684671,
+      "grad_norm": 1.028500893291808,
+      "learning_rate": 6.48434552198185e-08,
+      "loss": 0.646,
+      "step": 2005
+    },
+    {
+      "epoch": 1.9009713338071546,
+      "grad_norm": 1.1161543459809187,
+      "learning_rate": 6.361683666291973e-08,
+      "loss": 0.6776,
+      "step": 2006
+    },
+    {
+      "epoch": 1.9019189765458422,
+      "grad_norm": 1.0620526873003586,
+      "learning_rate": 6.240185661279541e-08,
+      "loss": 0.7249,
+      "step": 2007
+    },
+    {
+      "epoch": 1.9028666192845298,
+      "grad_norm": 1.209070343804771,
+      "learning_rate": 6.119851793400188e-08,
+      "loss": 0.6793,
+      "step": 2008
+    },
+    {
+      "epoch": 1.9038142620232172,
+      "grad_norm": 0.9596501498455559,
+      "learning_rate": 6.000682346365084e-08,
+      "loss": 0.6838,
+      "step": 2009
+    },
+    {
+      "epoch": 1.9047619047619047,
+      "grad_norm": 1.0459464382958554,
+      "learning_rate": 5.882677601139919e-08,
+      "loss": 0.6883,
+      "step": 2010
+    },
+    {
+      "epoch": 1.9057095475005923,
+      "grad_norm": 1.3695482020370489,
+      "learning_rate": 5.7658378359443104e-08,
+      "loss": 0.721,
+      "step": 2011
+    },
+    {
+      "epoch": 1.90665719023928,
+      "grad_norm": 1.056846758068541,
+      "learning_rate": 5.6501633262513454e-08,
+      "loss": 0.6852,
+      "step": 2012
+    },
+    {
+      "epoch": 1.9076048329779673,
+      "grad_norm": 1.0176824003025227,
+      "learning_rate": 5.535654344786756e-08,
+      "loss": 0.6672,
+      "step": 2013
+    },
+    {
+      "epoch": 1.9085524757166548,
+      "grad_norm": 1.186973736079735,
+      "learning_rate": 5.4223111615281935e-08,
+      "loss": 0.7098,
+      "step": 2014
+    },
+    {
+      "epoch": 1.9095001184553424,
+      "grad_norm": 1.0942365940476848,
+      "learning_rate": 5.310134043704895e-08,
+      "loss": 0.7302,
+      "step": 2015
+    },
+    {
+      "epoch": 1.9104477611940298,
+      "grad_norm": 1.1024694043360106,
+      "learning_rate": 5.1991232557966344e-08,
+      "loss": 0.689,
+      "step": 2016
+    },
+    {
+      "epoch": 1.9113954039327172,
+      "grad_norm": 0.9659220513288693,
+      "learning_rate": 5.089279059533658e-08,
+      "loss": 0.6806,
+      "step": 2017
+    },
+    {
+      "epoch": 1.9123430466714049,
+      "grad_norm": 1.0448744385892244,
+      "learning_rate": 4.9806017138953053e-08,
+      "loss": 0.7005,
+      "step": 2018
+    },
+    {
+      "epoch": 1.9132906894100925,
+      "grad_norm": 1.2099073891780832,
+      "learning_rate": 4.873091475110281e-08,
+      "loss": 0.701,
+      "step": 2019
+    },
+    {
+      "epoch": 1.91423833214878,
+      "grad_norm": 1.144622221873965,
+      "learning_rate": 4.766748596655268e-08,
+      "loss": 0.7041,
+      "step": 2020
+    },
+    {
+      "epoch": 1.9151859748874673,
+      "grad_norm": 1.0628184982525557,
+      "learning_rate": 4.66157332925482e-08,
+      "loss": 0.6452,
+      "step": 2021
+    },
+    {
+      "epoch": 1.916133617626155,
+      "grad_norm": 1.0240120660236798,
+      "learning_rate": 4.55756592088058e-08,
+      "loss": 0.6759,
+      "step": 2022
+    },
+    {
+      "epoch": 1.9170812603648426,
+      "grad_norm": 1.0346055793091427,
+      "learning_rate": 4.4547266167507264e-08,
+      "loss": 0.6916,
+      "step": 2023
+    },
+    {
+      "epoch": 1.9180289031035298,
+      "grad_norm": 1.0007416620575995,
+      "learning_rate": 4.3530556593294194e-08,
+      "loss": 0.704,
+      "step": 2024
+    },
+    {
+      "epoch": 1.9180289031035298,
+      "eval_loss": 0.9143268465995789,
+      "eval_runtime": 66.9808,
+      "eval_samples_per_second": 40.728,
+      "eval_steps_per_second": 0.642,
+      "step": 2024
+    },
+    {
+      "epoch": 1.9189765458422174,
+      "grad_norm": 1.1078567701211084,
+      "learning_rate": 4.2525532883261886e-08,
+      "loss": 0.6868,
+      "step": 2025
+    },
+    {
+      "epoch": 1.919924188580905,
+      "grad_norm": 1.107371658240934,
+      "learning_rate": 4.1532197406954357e-08,
+      "loss": 0.6794,
+      "step": 2026
+    },
+    {
+      "epoch": 1.9208718313195925,
+      "grad_norm": 1.1152223254534468,
+      "learning_rate": 4.0550552506357646e-08,
+      "loss": 0.6733,
+      "step": 2027
+    },
+    {
+      "epoch": 1.92181947405828,
+      "grad_norm": 0.9814552628881259,
+      "learning_rate": 3.958060049589485e-08,
+      "loss": 0.6746,
+      "step": 2028
+    },
+    {
+      "epoch": 1.9227671167969675,
+      "grad_norm": 1.1148524114158356,
+      "learning_rate": 3.862234366242168e-08,
+      "loss": 0.6809,
+      "step": 2029
+    },
+    {
+      "epoch": 1.9237147595356552,
+      "grad_norm": 1.1264125552667186,
+      "learning_rate": 3.767578426521923e-08,
+      "loss": 0.6624,
+      "step": 2030
+    },
+    {
+      "epoch": 1.9246624022743426,
+      "grad_norm": 0.8821749884652544,
+      "learning_rate": 3.674092453598954e-08,
+      "loss": 0.7492,
+      "step": 2031
+    },
+    {
+      "epoch": 1.92561004501303,
+      "grad_norm": 0.970645505659204,
+      "learning_rate": 3.581776667885062e-08,
+      "loss": 0.6561,
+      "step": 2032
+    },
+    {
+      "epoch": 1.9265576877517177,
+      "grad_norm": 1.1049159464316436,
+      "learning_rate": 3.4906312870331973e-08,
+      "loss": 0.7361,
+      "step": 2033
+    },
+    {
+      "epoch": 1.927505330490405,
+      "grad_norm": 1.0950647602834038,
+      "learning_rate": 3.40065652593663e-08,
+      "loss": 0.7006,
+      "step": 2034
+    },
+    {
+      "epoch": 1.9284529732290925,
+      "grad_norm": 1.0452185514606323,
+      "learning_rate": 3.311852596728948e-08,
+      "loss": 0.7276,
+      "step": 2035
+    },
+    {
+      "epoch": 1.9294006159677801,
+      "grad_norm": 1.1747706021025806,
+      "learning_rate": 3.2242197087828944e-08,
+      "loss": 0.691,
+      "step": 2036
+    },
+    {
+      "epoch": 1.9303482587064678,
+      "grad_norm": 1.2178046074293865,
+      "learning_rate": 3.137758068710694e-08,
+      "loss": 0.6611,
+      "step": 2037
+    },
+    {
+      "epoch": 1.9312959014451552,
+      "grad_norm": 0.9619015695423282,
+      "learning_rate": 3.052467880362675e-08,
+      "loss": 0.6696,
+      "step": 2038
+    },
+    {
+      "epoch": 1.9322435441838426,
+      "grad_norm": 1.157318529310348,
+      "learning_rate": 2.9683493448275925e-08,
+      "loss": 0.681,
+      "step": 2039
+    },
+    {
+      "epoch": 1.9331911869225302,
+      "grad_norm": 1.0061721079680708,
+      "learning_rate": 2.8854026604315798e-08,
+      "loss": 0.6822,
+      "step": 2040
+    },
+    {
+      "epoch": 1.9341388296612179,
+      "grad_norm": 1.0526406685824896,
+      "learning_rate": 2.8036280227379808e-08,
+      "loss": 0.6901,
+      "step": 2041
+    },
+    {
+      "epoch": 1.9350864723999053,
+      "grad_norm": 0.968442328631709,
+      "learning_rate": 2.723025624546849e-08,
+      "loss": 0.6801,
+      "step": 2042
+    },
+    {
+      "epoch": 1.9360341151385927,
+      "grad_norm": 0.9496600506704314,
+      "learning_rate": 2.6435956558943375e-08,
+      "loss": 0.7094,
+      "step": 2043
+    },
+    {
+      "epoch": 1.9369817578772803,
+      "grad_norm": 0.9299456148661794,
+      "learning_rate": 2.5653383040524228e-08,
+      "loss": 0.6135,
+      "step": 2044
+    },
+    {
+      "epoch": 1.9379294006159677,
+      "grad_norm": 0.9516242679649526,
+      "learning_rate": 2.488253753528458e-08,
+      "loss": 0.7323,
+      "step": 2045
+    },
+    {
+      "epoch": 1.9388770433546552,
+      "grad_norm": 1.076415703445146,
+      "learning_rate": 2.4123421860645646e-08,
+      "loss": 0.7032,
+      "step": 2046
+    },
+    {
+      "epoch": 1.9388770433546552,
+      "eval_loss": 0.9144385457038879,
+      "eval_runtime": 67.2,
+      "eval_samples_per_second": 40.595,
+      "eval_steps_per_second": 0.64,
+      "step": 2046
+    },
+    {
+      "epoch": 1.9398246860933428,
+      "grad_norm": 2.5186162393004623,
+      "learning_rate": 2.3376037806374097e-08,
+      "loss": 0.714,
+      "step": 2047
+    },
+    {
+      "epoch": 1.9407723288320304,
+      "grad_norm": 1.008486947731255,
+      "learning_rate": 2.264038713457706e-08,
+      "loss": 0.6927,
+      "step": 2048
+    },
+    {
+      "epoch": 1.9417199715707179,
+      "grad_norm": 0.9617322705423318,
+      "learning_rate": 2.1916471579697117e-08,
+      "loss": 0.691,
+      "step": 2049
+    },
+    {
+      "epoch": 1.9426676143094053,
+      "grad_norm": 0.9939767442168306,
+      "learning_rate": 2.1204292848509557e-08,
+      "loss": 0.7493,
+      "step": 2050
+    },
+    {
+      "epoch": 1.943615257048093,
+      "grad_norm": 0.9853441684157483,
+      "learning_rate": 2.050385262011789e-08,
+      "loss": 0.6878,
+      "step": 2051
+    },
+    {
+      "epoch": 1.9445628997867805,
+      "grad_norm": 1.0326884095472177,
+      "learning_rate": 1.98151525459489e-08,
+      "loss": 0.7353,
+      "step": 2052
+    },
+    {
+      "epoch": 1.9455105425254677,
+      "grad_norm": 1.0951236722722288,
+      "learning_rate": 1.9138194249750386e-08,
+      "loss": 0.6593,
+      "step": 2053
+    },
+    {
+      "epoch": 1.9464581852641554,
+      "grad_norm": 1.1769494445073123,
+      "learning_rate": 1.8472979327587292e-08,
+      "loss": 0.6544,
+      "step": 2054
+    },
+    {
+      "epoch": 1.947405828002843,
+      "grad_norm": 1.0395960653177918,
+      "learning_rate": 1.781950934783505e-08,
+      "loss": 0.7335,
+      "step": 2055
+    },
+    {
+      "epoch": 1.9483534707415304,
+      "grad_norm": 1.0211404503318975,
+      "learning_rate": 1.7177785851180127e-08,
+      "loss": 0.7096,
+      "step": 2056
+    },
+    {
+      "epoch": 1.9493011134802178,
+      "grad_norm": 1.2278758807843733,
+      "learning_rate": 1.654781035061337e-08,
+      "loss": 0.6919,
+      "step": 2057
+    },
+    {
+      "epoch": 1.9502487562189055,
+      "grad_norm": 1.1176872580144226,
+      "learning_rate": 1.5929584331427218e-08,
+      "loss": 0.6904,
+      "step": 2058
+    },
+    {
+      "epoch": 1.9511963989575931,
+      "grad_norm": 1.2525283554104196,
+      "learning_rate": 1.532310925121294e-08,
+      "loss": 0.733,
+      "step": 2059
+    },
+    {
+      "epoch": 1.9521440416962805,
+      "grad_norm": 1.0083196464834074,
+      "learning_rate": 1.4728386539856754e-08,
+      "loss": 0.6684,
+      "step": 2060
+    },
+    {
+      "epoch": 1.953091684434968,
+      "grad_norm": 0.9670996702496595,
+      "learning_rate": 1.4145417599534805e-08,
+      "loss": 0.6882,
+      "step": 2061
+    },
+    {
+      "epoch": 1.9540393271736556,
+      "grad_norm": 2.2195094052226754,
+      "learning_rate": 1.3574203804713748e-08,
+      "loss": 0.7289,
+      "step": 2062
+    },
+    {
+      "epoch": 1.954986969912343,
+      "grad_norm": 1.0493072981480793,
+      "learning_rate": 1.3014746502142962e-08,
+      "loss": 0.7345,
+      "step": 2063
+    },
+    {
+      "epoch": 1.9559346126510304,
+      "grad_norm": 1.1402015478284233,
+      "learning_rate": 1.2467047010855659e-08,
+      "loss": 0.7084,
+      "step": 2064
+    },
+    {
+      "epoch": 1.956882255389718,
+      "grad_norm": 1.028808624981634,
+      "learning_rate": 1.1931106622161127e-08,
+      "loss": 0.7665,
+      "step": 2065
+    },
+    {
+      "epoch": 1.9578298981284057,
+      "grad_norm": 0.9887477552469968,
+      "learning_rate": 1.1406926599646373e-08,
+      "loss": 0.7191,
+      "step": 2066
+    },
+    {
+      "epoch": 1.958777540867093,
+      "grad_norm": 1.093798325618811,
+      "learning_rate": 1.0894508179170038e-08,
+      "loss": 0.6972,
+      "step": 2067
+    },
+    {
+      "epoch": 1.9597251836057805,
+      "grad_norm": 0.9804042235247051,
+      "learning_rate": 1.0393852568860718e-08,
+      "loss": 0.7328,
+      "step": 2068
+    },
+    {
+      "epoch": 1.9597251836057805,
+      "eval_loss": 0.914358913898468,
+      "eval_runtime": 68.5205,
+      "eval_samples_per_second": 39.813,
+      "eval_steps_per_second": 0.628,
+      "step": 2068
+    },
+    {
+      "epoch": 1.9606728263444682,
+      "grad_norm": 1.2343380039639555,
+      "learning_rate": 9.904960949114195e-09,
+      "loss": 0.7172,
+      "step": 2069
+    },
+    {
+      "epoch": 1.9616204690831558,
+      "grad_norm": 1.0811426709912628,
+      "learning_rate": 9.427834472588992e-09,
+      "loss": 0.721,
+      "step": 2070
+    },
+    {
+      "epoch": 1.9625681118218432,
+      "grad_norm": 1.3595974625306733,
+      "learning_rate": 8.962474264206378e-09,
+      "loss": 0.7008,
+      "step": 2071
+    },
+    {
+      "epoch": 1.9635157545605306,
+      "grad_norm": 1.0348382536874432,
+      "learning_rate": 8.508881421145366e-09,
+      "loss": 0.7039,
+      "step": 2072
+    },
+    {
+      "epoch": 1.9644633972992183,
+      "grad_norm": 1.2153800966637154,
+      "learning_rate": 8.067057012842161e-09,
+      "loss": 0.7865,
+      "step": 2073
+    },
+    {
+      "epoch": 1.9654110400379057,
+      "grad_norm": 0.9289644211293113,
+      "learning_rate": 7.637002080985167e-09,
+      "loss": 0.7485,
+      "step": 2074
+    },
+    {
+      "epoch": 1.966358682776593,
+      "grad_norm": 0.9661916287589752,
+      "learning_rate": 7.218717639514983e-09,
+      "loss": 0.666,
+      "step": 2075
+    },
+    {
+      "epoch": 1.9673063255152807,
+      "grad_norm": 0.9787686202940714,
+      "learning_rate": 6.81220467461996e-09,
+      "loss": 0.7334,
+      "step": 2076
+    },
+    {
+      "epoch": 1.9682539682539684,
+      "grad_norm": 0.9964620209319142,
+      "learning_rate": 6.417464144736208e-09,
+      "loss": 0.67,
+      "step": 2077
+    },
+    {
+      "epoch": 1.9692016109926558,
+      "grad_norm": 1.030464390987384,
+      "learning_rate": 6.034496980542037e-09,
+      "loss": 0.705,
+      "step": 2078
+    },
+    {
+      "epoch": 1.9701492537313432,
+      "grad_norm": 1.1369647103723564,
+      "learning_rate": 5.6633040849601865e-09,
+      "loss": 0.7031,
+      "step": 2079
+    },
+    {
+      "epoch": 1.9710968964700308,
+      "grad_norm": 1.17782549137388,
+      "learning_rate": 5.303886333151154e-09,
+      "loss": 0.7033,
+      "step": 2080
+    },
+    {
+      "epoch": 1.9720445392087185,
+      "grad_norm": 1.123167425449723,
+      "learning_rate": 4.956244572513203e-09,
+      "loss": 0.7173,
+      "step": 2081
+    },
+    {
+      "epoch": 1.9729921819474057,
+      "grad_norm": 0.9688837536293665,
+      "learning_rate": 4.620379622682358e-09,
+      "loss": 0.7403,
+      "step": 2082
+    },
+    {
+      "epoch": 1.9739398246860933,
+      "grad_norm": 1.234375631478768,
+      "learning_rate": 4.296292275526859e-09,
+      "loss": 0.6677,
+      "step": 2083
+    },
+    {
+      "epoch": 1.974887467424781,
+      "grad_norm": 1.0712274738868512,
+      "learning_rate": 3.983983295146599e-09,
+      "loss": 0.68,
+      "step": 2084
+    },
+    {
+      "epoch": 1.9758351101634684,
+      "grad_norm": 1.0128865650855263,
+      "learning_rate": 3.6834534178725734e-09,
+      "loss": 0.7062,
+      "step": 2085
+    },
+    {
+      "epoch": 1.9767827529021558,
+      "grad_norm": 1.1056609912623454,
+      "learning_rate": 3.394703352263551e-09,
+      "loss": 0.6977,
+      "step": 2086
+    },
+    {
+      "epoch": 1.9777303956408434,
+      "grad_norm": 0.9556835224953595,
+      "learning_rate": 3.117733779105514e-09,
+      "loss": 0.7046,
+      "step": 2087
+    },
+    {
+      "epoch": 1.978678038379531,
+      "grad_norm": 0.9749594888727722,
+      "learning_rate": 2.8525453514099966e-09,
+      "loss": 0.6734,
+      "step": 2088
+    },
+    {
+      "epoch": 1.9796256811182185,
+      "grad_norm": 1.0169544561219785,
+      "learning_rate": 2.5991386944107524e-09,
+      "loss": 0.7337,
+      "step": 2089
+    },
+    {
+      "epoch": 1.9805733238569059,
+      "grad_norm": 1.0181317147660969,
+      "learning_rate": 2.3575144055643094e-09,
+      "loss": 0.6938,
+      "step": 2090
+    },
+    {
+      "epoch": 1.9805733238569059,
+      "eval_loss": 0.914404034614563,
+      "eval_runtime": 63.6123,
+      "eval_samples_per_second": 42.885,
+      "eval_steps_per_second": 0.676,
+      "step": 2090
+    },
+    {
+      "epoch": 1.9815209665955935,
+      "grad_norm": 1.1555804337504532,
+      "learning_rate": 2.1276730545488623e-09,
+      "loss": 0.6347,
+      "step": 2091
+    },
+    {
+      "epoch": 1.982468609334281,
+      "grad_norm": 1.1894163816979972,
+      "learning_rate": 1.9096151832609378e-09,
+      "loss": 0.7038,
+      "step": 2092
+    },
+    {
+      "epoch": 1.9834162520729683,
+      "grad_norm": 1.045940279368652,
+      "learning_rate": 1.703341305815398e-09,
+      "loss": 0.7356,
+      "step": 2093
+    },
+    {
+      "epoch": 1.984363894811656,
+      "grad_norm": 1.016015756301957,
+      "learning_rate": 1.5088519085437736e-09,
+      "loss": 0.6858,
+      "step": 2094
+    },
+    {
+      "epoch": 1.9853115375503436,
+      "grad_norm": 1.0947801703603017,
+      "learning_rate": 1.326147449993709e-09,
+      "loss": 0.6883,
+      "step": 2095
+    },
+    {
+      "epoch": 1.986259180289031,
+      "grad_norm": 0.960911476029797,
+      "learning_rate": 1.1552283609272962e-09,
+      "loss": 0.6925,
+      "step": 2096
+    },
+    {
+      "epoch": 1.9872068230277184,
+      "grad_norm": 0.9695849235179069,
+      "learning_rate": 9.96095044320522e-10,
+      "loss": 0.7557,
+      "step": 2097
+    },
+    {
+      "epoch": 1.988154465766406,
+      "grad_norm": 0.9926261452237545,
+      "learning_rate": 8.487478753615997e-10,
+      "loss": 0.6702,
+      "step": 2098
+    },
+    {
+      "epoch": 1.9891021085050937,
+      "grad_norm": 0.9680825511217606,
+      "learning_rate": 7.131872014509711e-10,
+      "loss": 0.6212,
+      "step": 2099
+    },
+    {
+      "epoch": 1.9900497512437811,
+      "grad_norm": 1.1257192398766538,
+      "learning_rate": 5.894133422001957e-10,
+      "loss": 0.7271,
+      "step": 2100
+    },
+    {
+      "epoch": 1.9909973939824686,
+      "grad_norm": 1.0549297797609254,
+      "learning_rate": 4.774265894302854e-10,
+      "loss": 0.6968,
+      "step": 2101
+    },
+    {
+      "epoch": 1.9919450367211562,
+      "grad_norm": 1.0063870271651199,
+      "learning_rate": 3.772272071722594e-10,
+      "loss": 0.7127,
+      "step": 2102
+    },
+    {
+      "epoch": 1.9928926794598436,
+      "grad_norm": 0.9955451047912606,
+      "learning_rate": 2.888154316671443e-10,
+      "loss": 0.6535,
+      "step": 2103
+    },
+    {
+      "epoch": 1.993840322198531,
+      "grad_norm": 1.0303767113264195,
+      "learning_rate": 2.1219147136264383e-10,
+      "loss": 0.7685,
+      "step": 2104
+    },
+    {
+      "epoch": 1.9947879649372187,
+      "grad_norm": 0.9819925019424903,
+      "learning_rate": 1.473555069148036e-10,
+      "loss": 0.6488,
+      "step": 2105
+    },
+    {
+      "epoch": 1.9957356076759063,
+      "grad_norm": 1.7129723621352664,
+      "learning_rate": 9.43076911874563e-11,
+      "loss": 0.7282,
+      "step": 2106
+    },
+    {
+      "epoch": 1.9966832504145937,
+      "grad_norm": 1.0109896100493034,
+      "learning_rate": 5.3048149251111456e-11,
+      "loss": 0.6369,
+      "step": 2107
+    },
+    {
+      "epoch": 1.9976308931532811,
+      "grad_norm": 0.9652839280997483,
+      "learning_rate": 2.3576978384065585e-11,
+      "loss": 0.731,
+      "step": 2108
+    },
+    {
+      "epoch": 1.9985785358919688,
+      "grad_norm": 1.1017969508811174,
+      "learning_rate": 5.8942480701817965e-12,
+      "loss": 0.6985,
+      "step": 2109
+    },
+    {
+      "epoch": 1.9995261786306564,
+      "grad_norm": 1.030266403397907,
+      "learning_rate": 0.0,
+      "loss": 0.6768,
+      "step": 2110
+    },
+    {
+      "epoch": 1.9995261786306564,
+      "step": 2110,
+      "total_flos": 7068238416445440.0,
+      "train_loss": 0.820082382003278,
+      "train_runtime": 57795.8559,
+      "train_samples_per_second": 9.348,
+      "train_steps_per_second": 0.037
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 2110,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "total_flos": 7068238416445440.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}