{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.06363278537871711, "eval_steps": 73, "global_step": 9417, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 7.270200174029405, "learning_rate": 2e-07, "loss": 1.5886, "step": 1 }, { "epoch": 0.0, "eval_loss": 1.8586688041687012, "eval_runtime": 4.4089, "eval_samples_per_second": 2.041, "eval_steps_per_second": 1.134, "step": 1 }, { "epoch": 0.0, "grad_norm": 7.088138615023864, "learning_rate": 4e-07, "loss": 1.4809, "step": 2 }, { "epoch": 0.0, "grad_norm": 7.409449066839206, "learning_rate": 6e-07, "loss": 1.6585, "step": 3 }, { "epoch": 0.0, "grad_norm": 9.366466325593624, "learning_rate": 8e-07, "loss": 1.6965, "step": 4 }, { "epoch": 0.0, "grad_norm": 7.247338916406195, "learning_rate": 1e-06, "loss": 1.532, "step": 5 }, { "epoch": 0.0, "grad_norm": 8.049902060629968, "learning_rate": 1.2e-06, "loss": 1.596, "step": 6 }, { "epoch": 0.0, "grad_norm": 7.1820303407795745, "learning_rate": 1.4e-06, "loss": 1.5625, "step": 7 }, { "epoch": 0.0, "grad_norm": 6.7480024222126165, "learning_rate": 1.6e-06, "loss": 1.4882, "step": 8 }, { "epoch": 0.0, "grad_norm": 6.09119480827286, "learning_rate": 1.8e-06, "loss": 1.5345, "step": 9 }, { "epoch": 0.0, "grad_norm": 5.701895875897247, "learning_rate": 2e-06, "loss": 1.5403, "step": 10 }, { "epoch": 0.0, "grad_norm": 12.444180689164233, "learning_rate": 1.9999999999859167e-06, "loss": 1.7593, "step": 11 }, { "epoch": 0.0, "grad_norm": 6.671206404255702, "learning_rate": 1.9999999999436667e-06, "loss": 1.5666, "step": 12 }, { "epoch": 0.0, "grad_norm": 5.879938934255732, "learning_rate": 1.99999999987325e-06, "loss": 1.5774, "step": 13 }, { "epoch": 0.0, "grad_norm": 5.598165698155055, "learning_rate": 1.9999999997746663e-06, "loss": 1.3795, "step": 14 }, { "epoch": 0.0, "grad_norm": 5.241997738434293, "learning_rate": 1.9999999996479163e-06, "loss": 1.5449, "step": 15 }, { "epoch": 0.0, "grad_norm": 9.92151785969858, "learning_rate": 1.999999999493e-06, "loss": 1.6951, "step": 16 }, { "epoch": 0.0, "grad_norm": 5.539525947721768, "learning_rate": 1.9999999993099162e-06, "loss": 1.5409, "step": 17 }, { "epoch": 0.0, "grad_norm": 5.351267308329018, "learning_rate": 1.9999999990986662e-06, "loss": 1.426, "step": 18 }, { "epoch": 0.0, "grad_norm": 5.705234009184205, "learning_rate": 1.99999999885925e-06, "loss": 1.5517, "step": 19 }, { "epoch": 0.0, "grad_norm": 5.5021957856088175, "learning_rate": 1.999999998591666e-06, "loss": 1.6403, "step": 20 }, { "epoch": 0.0, "grad_norm": 5.230793498668246, "learning_rate": 1.999999998295916e-06, "loss": 1.3916, "step": 21 }, { "epoch": 0.0, "grad_norm": 7.632016493197849, "learning_rate": 1.9999999979719993e-06, "loss": 1.5029, "step": 22 }, { "epoch": 0.0, "grad_norm": 5.224094987664232, "learning_rate": 1.999999997619916e-06, "loss": 1.5172, "step": 23 }, { "epoch": 0.0, "grad_norm": 4.9052975256388995, "learning_rate": 1.9999999972396656e-06, "loss": 1.2809, "step": 24 }, { "epoch": 0.0, "grad_norm": 5.749261607817126, "learning_rate": 1.9999999968312492e-06, "loss": 1.682, "step": 25 }, { "epoch": 0.0, "grad_norm": 5.255433017333782, "learning_rate": 1.9999999963946656e-06, "loss": 1.5094, "step": 26 }, { "epoch": 0.0, "grad_norm": 5.093790043356135, "learning_rate": 1.9999999959299155e-06, "loss": 1.4338, "step": 27 }, { "epoch": 0.0, "grad_norm": 6.001582030581129, "learning_rate": 1.9999999954369987e-06, "loss": 1.5456, "step": 28 }, { "epoch": 0.0, "grad_norm": 8.863618311056822, "learning_rate": 1.999999994915915e-06, "loss": 1.5252, "step": 29 }, { "epoch": 0.0, "grad_norm": 5.926702449626208, "learning_rate": 1.9999999943666654e-06, "loss": 1.5579, "step": 30 }, { "epoch": 0.0, "grad_norm": 5.798968821403948, "learning_rate": 1.9999999937892486e-06, "loss": 1.5616, "step": 31 }, { "epoch": 0.0, "grad_norm": 5.306082122633555, "learning_rate": 1.999999993183665e-06, "loss": 1.6885, "step": 32 }, { "epoch": 0.0, "grad_norm": 7.324360862135967, "learning_rate": 1.999999992549915e-06, "loss": 1.6009, "step": 33 }, { "epoch": 0.0, "grad_norm": 5.712084687468965, "learning_rate": 1.9999999918879976e-06, "loss": 1.5022, "step": 34 }, { "epoch": 0.0, "grad_norm": 5.562761534033398, "learning_rate": 1.9999999911979143e-06, "loss": 1.7051, "step": 35 }, { "epoch": 0.0, "grad_norm": 11.4469400814171, "learning_rate": 1.9999999904796643e-06, "loss": 1.5599, "step": 36 }, { "epoch": 0.0, "grad_norm": 5.361519499368759, "learning_rate": 1.9999999897332474e-06, "loss": 1.5086, "step": 37 }, { "epoch": 0.0, "grad_norm": 4.9815078648545965, "learning_rate": 1.9999999889586637e-06, "loss": 1.5386, "step": 38 }, { "epoch": 0.0, "grad_norm": 5.162059963508055, "learning_rate": 1.9999999881559136e-06, "loss": 1.4769, "step": 39 }, { "epoch": 0.0, "grad_norm": 5.122293609109329, "learning_rate": 1.9999999873249968e-06, "loss": 1.561, "step": 40 }, { "epoch": 0.0, "grad_norm": 5.152152754881128, "learning_rate": 1.999999986465913e-06, "loss": 1.5329, "step": 41 }, { "epoch": 0.0, "grad_norm": 4.991233383631504, "learning_rate": 1.999999985578663e-06, "loss": 1.6348, "step": 42 }, { "epoch": 0.0, "grad_norm": 5.1180827908562545, "learning_rate": 1.999999984663246e-06, "loss": 1.5922, "step": 43 }, { "epoch": 0.0, "grad_norm": 5.447368695015986, "learning_rate": 1.9999999837196624e-06, "loss": 1.3119, "step": 44 }, { "epoch": 0.0, "grad_norm": 5.058434450512851, "learning_rate": 1.9999999827479124e-06, "loss": 1.4098, "step": 45 }, { "epoch": 0.0, "grad_norm": 6.792299679667143, "learning_rate": 1.9999999817479955e-06, "loss": 1.427, "step": 46 }, { "epoch": 0.0, "grad_norm": 6.86507140594805, "learning_rate": 1.9999999807199118e-06, "loss": 1.6528, "step": 47 }, { "epoch": 0.0, "grad_norm": 5.121726652742493, "learning_rate": 1.9999999796636617e-06, "loss": 1.4907, "step": 48 }, { "epoch": 0.0, "grad_norm": 4.897225770537551, "learning_rate": 1.9999999785792444e-06, "loss": 1.5129, "step": 49 }, { "epoch": 0.0, "grad_norm": 5.447285160025109, "learning_rate": 1.999999977466661e-06, "loss": 1.4732, "step": 50 }, { "epoch": 0.0, "grad_norm": 6.435992424750984, "learning_rate": 1.9999999763259106e-06, "loss": 1.5531, "step": 51 }, { "epoch": 0.0, "grad_norm": 5.126344987263065, "learning_rate": 1.9999999751569937e-06, "loss": 1.4008, "step": 52 }, { "epoch": 0.0, "grad_norm": 6.003394959358567, "learning_rate": 1.99999997395991e-06, "loss": 1.5076, "step": 53 }, { "epoch": 0.0, "grad_norm": 5.375009945600638, "learning_rate": 1.99999997273466e-06, "loss": 1.3866, "step": 54 }, { "epoch": 0.0, "grad_norm": 4.849278700300698, "learning_rate": 1.999999971481243e-06, "loss": 1.427, "step": 55 }, { "epoch": 0.0, "grad_norm": 5.532239141247253, "learning_rate": 1.9999999701996592e-06, "loss": 1.6364, "step": 56 }, { "epoch": 0.0, "grad_norm": 5.438678986247806, "learning_rate": 1.999999968889909e-06, "loss": 1.4648, "step": 57 }, { "epoch": 0.0, "grad_norm": 5.393102110640265, "learning_rate": 1.9999999675519918e-06, "loss": 1.5226, "step": 58 }, { "epoch": 0.0, "grad_norm": 5.602009023037625, "learning_rate": 1.999999966185908e-06, "loss": 1.745, "step": 59 }, { "epoch": 0.0, "grad_norm": 5.30661712292468, "learning_rate": 1.999999964791658e-06, "loss": 1.5059, "step": 60 }, { "epoch": 0.0, "grad_norm": 5.15102423620726, "learning_rate": 1.999999963369241e-06, "loss": 1.3974, "step": 61 }, { "epoch": 0.0, "grad_norm": 5.870501279592248, "learning_rate": 1.9999999619186573e-06, "loss": 1.5026, "step": 62 }, { "epoch": 0.0, "grad_norm": 5.270272741292025, "learning_rate": 1.9999999604399067e-06, "loss": 1.6544, "step": 63 }, { "epoch": 0.0, "grad_norm": 4.886919913741941, "learning_rate": 1.99999995893299e-06, "loss": 1.5181, "step": 64 }, { "epoch": 0.0, "grad_norm": 7.05398621579166, "learning_rate": 1.999999957397906e-06, "loss": 1.5616, "step": 65 }, { "epoch": 0.0, "grad_norm": 4.836134104240407, "learning_rate": 1.9999999558346555e-06, "loss": 1.54, "step": 66 }, { "epoch": 0.0, "grad_norm": 5.082059874604205, "learning_rate": 1.9999999542432386e-06, "loss": 1.5239, "step": 67 }, { "epoch": 0.0, "grad_norm": 4.874373987895782, "learning_rate": 1.999999952623655e-06, "loss": 1.5878, "step": 68 }, { "epoch": 0.0, "grad_norm": 4.946163953606462, "learning_rate": 1.9999999509759043e-06, "loss": 1.4434, "step": 69 }, { "epoch": 0.0, "grad_norm": 4.876405244918954, "learning_rate": 1.9999999492999873e-06, "loss": 1.4608, "step": 70 }, { "epoch": 0.0, "grad_norm": 5.5778213104294005, "learning_rate": 1.9999999475959036e-06, "loss": 1.5291, "step": 71 }, { "epoch": 0.0, "grad_norm": 4.99889754273985, "learning_rate": 1.9999999458636535e-06, "loss": 1.3987, "step": 72 }, { "epoch": 0.0, "grad_norm": 5.130434143656013, "learning_rate": 1.999999944103236e-06, "loss": 1.6093, "step": 73 }, { "epoch": 0.0, "eval_loss": 1.7611451148986816, "eval_runtime": 4.6217, "eval_samples_per_second": 1.947, "eval_steps_per_second": 1.082, "step": 73 }, { "epoch": 0.0, "grad_norm": 5.212510949695165, "learning_rate": 1.9999999423146523e-06, "loss": 1.7044, "step": 74 }, { "epoch": 0.0, "grad_norm": 5.2823971109355705, "learning_rate": 1.999999940497902e-06, "loss": 1.7013, "step": 75 }, { "epoch": 0.0, "grad_norm": 5.045619546308826, "learning_rate": 1.999999938652985e-06, "loss": 1.5755, "step": 76 }, { "epoch": 0.0, "grad_norm": 5.87457170524847, "learning_rate": 1.999999936779901e-06, "loss": 1.6232, "step": 77 }, { "epoch": 0.0, "grad_norm": 6.082677620793644, "learning_rate": 1.999999934878651e-06, "loss": 1.6031, "step": 78 }, { "epoch": 0.0, "grad_norm": 5.872910823814863, "learning_rate": 1.9999999329492335e-06, "loss": 1.5715, "step": 79 }, { "epoch": 0.0, "grad_norm": 5.244165093478454, "learning_rate": 1.9999999309916497e-06, "loss": 1.3898, "step": 80 }, { "epoch": 0.0, "grad_norm": 5.113300123505611, "learning_rate": 1.999999929005899e-06, "loss": 1.5425, "step": 81 }, { "epoch": 0.0, "grad_norm": 6.028851212735335, "learning_rate": 1.999999926991982e-06, "loss": 1.5812, "step": 82 }, { "epoch": 0.0, "grad_norm": 5.11348885382359, "learning_rate": 1.9999999249498984e-06, "loss": 1.4328, "step": 83 }, { "epoch": 0.0, "grad_norm": 5.535550818095342, "learning_rate": 1.9999999228796482e-06, "loss": 1.6637, "step": 84 }, { "epoch": 0.0, "grad_norm": 7.007537107122481, "learning_rate": 1.999999920781231e-06, "loss": 1.5714, "step": 85 }, { "epoch": 0.0, "grad_norm": 6.267096315507558, "learning_rate": 1.9999999186546466e-06, "loss": 1.5447, "step": 86 }, { "epoch": 0.0, "grad_norm": 5.796533721972671, "learning_rate": 1.9999999164998964e-06, "loss": 1.5971, "step": 87 }, { "epoch": 0.0, "grad_norm": 6.228074731568103, "learning_rate": 1.999999914316979e-06, "loss": 1.7089, "step": 88 }, { "epoch": 0.0, "grad_norm": 5.7122834140340295, "learning_rate": 1.9999999121058956e-06, "loss": 1.2932, "step": 89 }, { "epoch": 0.0, "grad_norm": 6.2806919781906965, "learning_rate": 1.999999909866645e-06, "loss": 1.653, "step": 90 }, { "epoch": 0.0, "grad_norm": 5.368043140767265, "learning_rate": 1.9999999075992276e-06, "loss": 1.6132, "step": 91 }, { "epoch": 0.0, "grad_norm": 5.76243769319436, "learning_rate": 1.999999905303644e-06, "loss": 1.6384, "step": 92 }, { "epoch": 0.0, "grad_norm": 5.672061728664766, "learning_rate": 1.9999999029798932e-06, "loss": 1.4254, "step": 93 }, { "epoch": 0.0, "grad_norm": 5.002515171971739, "learning_rate": 1.9999999006279762e-06, "loss": 1.4739, "step": 94 }, { "epoch": 0.0, "grad_norm": 5.2077060992013084, "learning_rate": 1.9999998982478924e-06, "loss": 1.4923, "step": 95 }, { "epoch": 0.0, "grad_norm": 5.167170360061274, "learning_rate": 1.999999895839642e-06, "loss": 1.6129, "step": 96 }, { "epoch": 0.0, "grad_norm": 4.886352788159953, "learning_rate": 1.9999998934032244e-06, "loss": 1.4177, "step": 97 }, { "epoch": 0.0, "grad_norm": 5.408894422523236, "learning_rate": 1.999999890938641e-06, "loss": 1.6144, "step": 98 }, { "epoch": 0.0, "grad_norm": 5.490500856061367, "learning_rate": 1.9999998884458904e-06, "loss": 1.5951, "step": 99 }, { "epoch": 0.0, "grad_norm": 5.1356644085000065, "learning_rate": 1.9999998859249733e-06, "loss": 1.587, "step": 100 }, { "epoch": 0.0, "grad_norm": 14.038552559799147, "learning_rate": 1.999999883375889e-06, "loss": 1.2883, "step": 101 }, { "epoch": 0.0, "grad_norm": 5.226960512898749, "learning_rate": 1.999999880798639e-06, "loss": 1.593, "step": 102 }, { "epoch": 0.0, "grad_norm": 6.414801916717659, "learning_rate": 1.9999998781932214e-06, "loss": 1.444, "step": 103 }, { "epoch": 0.0, "grad_norm": 4.966168140745907, "learning_rate": 1.9999998755596376e-06, "loss": 1.4407, "step": 104 }, { "epoch": 0.0, "grad_norm": 6.159957368813785, "learning_rate": 1.999999872897887e-06, "loss": 1.465, "step": 105 }, { "epoch": 0.0, "grad_norm": 5.6281356469376655, "learning_rate": 1.9999998702079695e-06, "loss": 1.5183, "step": 106 }, { "epoch": 0.0, "grad_norm": 4.823355492437726, "learning_rate": 1.9999998674898857e-06, "loss": 1.3937, "step": 107 }, { "epoch": 0.0, "grad_norm": 5.900910155703639, "learning_rate": 1.999999864743635e-06, "loss": 1.5294, "step": 108 }, { "epoch": 0.0, "grad_norm": 5.025636152106037, "learning_rate": 1.999999861969218e-06, "loss": 1.6249, "step": 109 }, { "epoch": 0.0, "grad_norm": 5.116169125449381, "learning_rate": 1.999999859166634e-06, "loss": 1.4079, "step": 110 }, { "epoch": 0.0, "grad_norm": 4.778904603716487, "learning_rate": 1.999999856335883e-06, "loss": 1.5865, "step": 111 }, { "epoch": 0.0, "grad_norm": 5.756222368715884, "learning_rate": 1.999999853476966e-06, "loss": 1.7328, "step": 112 }, { "epoch": 0.0, "grad_norm": 5.412891077813045, "learning_rate": 1.9999998505898822e-06, "loss": 1.6096, "step": 113 }, { "epoch": 0.0, "grad_norm": 4.918651260002986, "learning_rate": 1.9999998476746316e-06, "loss": 1.4487, "step": 114 }, { "epoch": 0.0, "grad_norm": 5.5264820158788845, "learning_rate": 1.9999998447312145e-06, "loss": 1.5762, "step": 115 }, { "epoch": 0.0, "grad_norm": 4.9444017993692375, "learning_rate": 1.9999998417596302e-06, "loss": 1.6108, "step": 116 }, { "epoch": 0.0, "grad_norm": 5.3741051561343065, "learning_rate": 1.9999998387598796e-06, "loss": 1.4945, "step": 117 }, { "epoch": 0.0, "grad_norm": 5.086038907073674, "learning_rate": 1.9999998357319625e-06, "loss": 1.4727, "step": 118 }, { "epoch": 0.0, "grad_norm": 5.374970852866187, "learning_rate": 1.9999998326758787e-06, "loss": 1.5108, "step": 119 }, { "epoch": 0.0, "grad_norm": 5.046226292058247, "learning_rate": 1.999999829591628e-06, "loss": 1.547, "step": 120 }, { "epoch": 0.0, "grad_norm": 4.920252413086519, "learning_rate": 1.9999998264792105e-06, "loss": 1.5885, "step": 121 }, { "epoch": 0.0, "grad_norm": 4.9199662171825, "learning_rate": 1.9999998233386266e-06, "loss": 1.5461, "step": 122 }, { "epoch": 0.0, "grad_norm": 4.849224216338868, "learning_rate": 1.9999998201698764e-06, "loss": 1.3918, "step": 123 }, { "epoch": 0.0, "grad_norm": 5.122690985334921, "learning_rate": 1.999999816972959e-06, "loss": 1.7063, "step": 124 }, { "epoch": 0.0, "grad_norm": 5.4642737814450495, "learning_rate": 1.999999813747875e-06, "loss": 1.4925, "step": 125 }, { "epoch": 0.0, "grad_norm": 6.985722953022008, "learning_rate": 1.9999998104946243e-06, "loss": 1.6928, "step": 126 }, { "epoch": 0.0, "grad_norm": 9.95531201412864, "learning_rate": 1.999999807213207e-06, "loss": 1.3453, "step": 127 }, { "epoch": 0.0, "grad_norm": 4.9615775535071025, "learning_rate": 1.999999803903623e-06, "loss": 1.4873, "step": 128 }, { "epoch": 0.0, "grad_norm": 4.995275322635195, "learning_rate": 1.9999998005658727e-06, "loss": 1.4477, "step": 129 }, { "epoch": 0.0, "grad_norm": 5.042565222809337, "learning_rate": 1.999999797199955e-06, "loss": 1.4494, "step": 130 }, { "epoch": 0.0, "grad_norm": 6.123208053454304, "learning_rate": 1.999999793805871e-06, "loss": 1.6236, "step": 131 }, { "epoch": 0.0, "grad_norm": 5.125753408156544, "learning_rate": 1.9999997903836206e-06, "loss": 1.4867, "step": 132 }, { "epoch": 0.0, "grad_norm": 4.953051829249889, "learning_rate": 1.999999786933203e-06, "loss": 1.4471, "step": 133 }, { "epoch": 0.0, "grad_norm": 5.01487688139561, "learning_rate": 1.999999783454619e-06, "loss": 1.5202, "step": 134 }, { "epoch": 0.0, "grad_norm": 4.9862855831810355, "learning_rate": 1.9999997799478685e-06, "loss": 1.2726, "step": 135 }, { "epoch": 0.0, "grad_norm": 4.664158110626264, "learning_rate": 1.9999997764129514e-06, "loss": 1.453, "step": 136 }, { "epoch": 0.0, "grad_norm": 4.515072838378242, "learning_rate": 1.9999997728498675e-06, "loss": 1.4493, "step": 137 }, { "epoch": 0.0, "grad_norm": 4.638297190169102, "learning_rate": 1.9999997692586168e-06, "loss": 1.4888, "step": 138 }, { "epoch": 0.0, "grad_norm": 4.88971951842883, "learning_rate": 1.9999997656391992e-06, "loss": 1.4683, "step": 139 }, { "epoch": 0.0, "grad_norm": 5.907276772043835, "learning_rate": 1.9999997619916153e-06, "loss": 1.4255, "step": 140 }, { "epoch": 0.0, "grad_norm": 5.130307226886763, "learning_rate": 1.9999997583158646e-06, "loss": 1.4295, "step": 141 }, { "epoch": 0.0, "grad_norm": 5.878724833576819, "learning_rate": 1.9999997546119475e-06, "loss": 1.6857, "step": 142 }, { "epoch": 0.0, "grad_norm": 4.889147537162318, "learning_rate": 1.9999997508798636e-06, "loss": 1.2668, "step": 143 }, { "epoch": 0.0, "grad_norm": 5.079666872019813, "learning_rate": 1.9999997471196124e-06, "loss": 1.4661, "step": 144 }, { "epoch": 0.0, "grad_norm": 4.896421399772935, "learning_rate": 1.9999997433311953e-06, "loss": 1.4462, "step": 145 }, { "epoch": 0.0, "grad_norm": 4.904873841992958, "learning_rate": 1.9999997395146114e-06, "loss": 1.5005, "step": 146 }, { "epoch": 0.0, "eval_loss": 1.7462446689605713, "eval_runtime": 4.5857, "eval_samples_per_second": 1.963, "eval_steps_per_second": 1.09, "step": 146 }, { "epoch": 0.0, "grad_norm": 4.780194658732313, "learning_rate": 1.9999997356698607e-06, "loss": 1.2281, "step": 147 }, { "epoch": 0.0, "grad_norm": 4.947954070905395, "learning_rate": 1.9999997317969435e-06, "loss": 1.5465, "step": 148 }, { "epoch": 0.0, "grad_norm": 5.262663754638768, "learning_rate": 1.9999997278958596e-06, "loss": 1.7203, "step": 149 }, { "epoch": 0.0, "grad_norm": 4.980503880353371, "learning_rate": 1.999999723966609e-06, "loss": 1.5602, "step": 150 }, { "epoch": 0.0, "grad_norm": 5.070738329007044, "learning_rate": 1.9999997200091917e-06, "loss": 1.4116, "step": 151 }, { "epoch": 0.0, "grad_norm": 5.443381763060975, "learning_rate": 1.999999716023608e-06, "loss": 1.6409, "step": 152 }, { "epoch": 0.0, "grad_norm": 4.836870306399049, "learning_rate": 1.999999712009857e-06, "loss": 1.3384, "step": 153 }, { "epoch": 0.0, "grad_norm": 7.427614614695787, "learning_rate": 1.9999997079679395e-06, "loss": 1.4659, "step": 154 }, { "epoch": 0.0, "grad_norm": 5.172785821462023, "learning_rate": 1.999999703897856e-06, "loss": 1.4818, "step": 155 }, { "epoch": 0.0, "grad_norm": 5.300155696147022, "learning_rate": 1.9999996997996052e-06, "loss": 1.4057, "step": 156 }, { "epoch": 0.0, "grad_norm": 4.918600876049223, "learning_rate": 1.9999996956731877e-06, "loss": 1.5059, "step": 157 }, { "epoch": 0.0, "grad_norm": 6.916667098440986, "learning_rate": 1.999999691518604e-06, "loss": 1.4173, "step": 158 }, { "epoch": 0.0, "grad_norm": 4.8728284717219905, "learning_rate": 1.999999687335853e-06, "loss": 1.3795, "step": 159 }, { "epoch": 0.0, "grad_norm": 5.287532891218524, "learning_rate": 1.999999683124936e-06, "loss": 1.4764, "step": 160 }, { "epoch": 0.0, "grad_norm": 4.987191884182559, "learning_rate": 1.999999678885852e-06, "loss": 1.5181, "step": 161 }, { "epoch": 0.0, "grad_norm": 5.186035530995563, "learning_rate": 1.999999674618601e-06, "loss": 1.4576, "step": 162 }, { "epoch": 0.0, "grad_norm": 4.918294071223459, "learning_rate": 1.999999670323184e-06, "loss": 1.4588, "step": 163 }, { "epoch": 0.0, "grad_norm": 5.065760262070265, "learning_rate": 1.9999996659996e-06, "loss": 1.5348, "step": 164 }, { "epoch": 0.0, "grad_norm": 5.017202795011469, "learning_rate": 1.999999661647849e-06, "loss": 1.5338, "step": 165 }, { "epoch": 0.0, "grad_norm": 5.6648484254216225, "learning_rate": 1.999999657267932e-06, "loss": 1.6196, "step": 166 }, { "epoch": 0.0, "grad_norm": 5.4750502865711965, "learning_rate": 1.999999652859848e-06, "loss": 1.5665, "step": 167 }, { "epoch": 0.0, "grad_norm": 5.586077526270329, "learning_rate": 1.9999996484235977e-06, "loss": 1.5768, "step": 168 }, { "epoch": 0.0, "grad_norm": 5.064098969414124, "learning_rate": 1.9999996439591805e-06, "loss": 1.5485, "step": 169 }, { "epoch": 0.0, "grad_norm": 6.831991975556203, "learning_rate": 1.999999639466596e-06, "loss": 1.6605, "step": 170 }, { "epoch": 0.0, "grad_norm": 5.386693105255357, "learning_rate": 1.9999996349458458e-06, "loss": 1.5403, "step": 171 }, { "epoch": 0.0, "grad_norm": 5.383469172835463, "learning_rate": 1.9999996303969286e-06, "loss": 1.4123, "step": 172 }, { "epoch": 0.0, "grad_norm": 6.136249870917203, "learning_rate": 1.9999996258198446e-06, "loss": 1.5223, "step": 173 }, { "epoch": 0.0, "grad_norm": 5.497494715675666, "learning_rate": 1.999999621214594e-06, "loss": 1.4759, "step": 174 }, { "epoch": 0.0, "grad_norm": 5.359298992694515, "learning_rate": 1.9999996165811766e-06, "loss": 1.5216, "step": 175 }, { "epoch": 0.0, "grad_norm": 4.554159062585914, "learning_rate": 1.9999996119195926e-06, "loss": 1.3022, "step": 176 }, { "epoch": 0.0, "grad_norm": 5.655783926270953, "learning_rate": 1.9999996072298423e-06, "loss": 1.6985, "step": 177 }, { "epoch": 0.0, "grad_norm": 6.815355669323773, "learning_rate": 1.999999602511925e-06, "loss": 1.6323, "step": 178 }, { "epoch": 0.0, "grad_norm": 5.090330865005731, "learning_rate": 1.999999597765841e-06, "loss": 1.4405, "step": 179 }, { "epoch": 0.0, "grad_norm": 5.453237104330645, "learning_rate": 1.9999995929915903e-06, "loss": 1.6569, "step": 180 }, { "epoch": 0.0, "grad_norm": 5.140949180379015, "learning_rate": 1.999999588189173e-06, "loss": 1.5043, "step": 181 }, { "epoch": 0.0, "grad_norm": 6.312297253169766, "learning_rate": 1.999999583358589e-06, "loss": 1.4393, "step": 182 }, { "epoch": 0.0, "grad_norm": 5.5602461468607745, "learning_rate": 1.9999995784998387e-06, "loss": 1.5034, "step": 183 }, { "epoch": 0.0, "grad_norm": 4.984563051637153, "learning_rate": 1.9999995736129215e-06, "loss": 1.5951, "step": 184 }, { "epoch": 0.0, "grad_norm": 5.210447188187252, "learning_rate": 1.9999995686978374e-06, "loss": 1.4994, "step": 185 }, { "epoch": 0.0, "grad_norm": 4.986374115612521, "learning_rate": 1.999999563754587e-06, "loss": 1.3567, "step": 186 }, { "epoch": 0.0, "grad_norm": 5.868262699823536, "learning_rate": 1.99999955878317e-06, "loss": 1.5591, "step": 187 }, { "epoch": 0.0, "grad_norm": 5.532695287284847, "learning_rate": 1.999999553783586e-06, "loss": 1.6909, "step": 188 }, { "epoch": 0.0, "grad_norm": 5.064305786007753, "learning_rate": 1.9999995487558354e-06, "loss": 1.6499, "step": 189 }, { "epoch": 0.0, "grad_norm": 5.3615235293142645, "learning_rate": 1.999999543699918e-06, "loss": 1.5116, "step": 190 }, { "epoch": 0.0, "grad_norm": 5.820360691431748, "learning_rate": 1.999999538615834e-06, "loss": 1.585, "step": 191 }, { "epoch": 0.0, "grad_norm": 4.848927324752224, "learning_rate": 1.9999995335035838e-06, "loss": 1.5208, "step": 192 }, { "epoch": 0.0, "grad_norm": 5.008810784644451, "learning_rate": 1.999999528363167e-06, "loss": 1.4648, "step": 193 }, { "epoch": 0.0, "grad_norm": 5.012428143646969, "learning_rate": 1.999999523194583e-06, "loss": 1.5394, "step": 194 }, { "epoch": 0.0, "grad_norm": 5.014801660701698, "learning_rate": 1.9999995179978325e-06, "loss": 1.5681, "step": 195 }, { "epoch": 0.0, "grad_norm": 5.044808653366915, "learning_rate": 1.9999995127729153e-06, "loss": 1.5071, "step": 196 }, { "epoch": 0.0, "grad_norm": 5.138850239659894, "learning_rate": 1.9999995075198317e-06, "loss": 1.5622, "step": 197 }, { "epoch": 0.0, "grad_norm": 4.942376781806205, "learning_rate": 1.999999502238581e-06, "loss": 1.4336, "step": 198 }, { "epoch": 0.0, "grad_norm": 5.454348532164084, "learning_rate": 1.999999496929164e-06, "loss": 1.5379, "step": 199 }, { "epoch": 0.0, "grad_norm": 4.515385692801986, "learning_rate": 1.99999949159158e-06, "loss": 1.3406, "step": 200 }, { "epoch": 0.0, "grad_norm": 5.124129865470126, "learning_rate": 1.9999994862258295e-06, "loss": 1.3091, "step": 201 }, { "epoch": 0.0, "grad_norm": 6.04812044323597, "learning_rate": 1.9999994808319127e-06, "loss": 1.4656, "step": 202 }, { "epoch": 0.0, "grad_norm": 4.832399296186657, "learning_rate": 1.9999994754098286e-06, "loss": 1.4201, "step": 203 }, { "epoch": 0.0, "grad_norm": 5.180488033599416, "learning_rate": 1.999999469959578e-06, "loss": 1.385, "step": 204 }, { "epoch": 0.0, "grad_norm": 5.190369524582554, "learning_rate": 1.9999994644811614e-06, "loss": 1.561, "step": 205 }, { "epoch": 0.0, "grad_norm": 5.554453318862103, "learning_rate": 1.9999994589745773e-06, "loss": 1.67, "step": 206 }, { "epoch": 0.0, "grad_norm": 7.035958799888537, "learning_rate": 1.999999453439827e-06, "loss": 1.3742, "step": 207 }, { "epoch": 0.0, "grad_norm": 5.829678242084621, "learning_rate": 1.99999944787691e-06, "loss": 1.5053, "step": 208 }, { "epoch": 0.0, "grad_norm": 5.67709513251063, "learning_rate": 1.999999442285826e-06, "loss": 1.6848, "step": 209 }, { "epoch": 0.0, "grad_norm": 4.611739029374384, "learning_rate": 1.999999436666576e-06, "loss": 1.534, "step": 210 }, { "epoch": 0.0, "grad_norm": 7.303224421644464, "learning_rate": 1.9999994310191587e-06, "loss": 1.4115, "step": 211 }, { "epoch": 0.0, "grad_norm": 4.739064239711381, "learning_rate": 1.999999425343575e-06, "loss": 1.5154, "step": 212 }, { "epoch": 0.0, "grad_norm": 4.998482123577443, "learning_rate": 1.999999419639825e-06, "loss": 1.4069, "step": 213 }, { "epoch": 0.0, "grad_norm": 4.777605807898373, "learning_rate": 1.9999994139079077e-06, "loss": 1.4424, "step": 214 }, { "epoch": 0.0, "grad_norm": 4.91838714700508, "learning_rate": 1.999999408147824e-06, "loss": 1.4662, "step": 215 }, { "epoch": 0.0, "grad_norm": 5.080194035605747, "learning_rate": 1.999999402359574e-06, "loss": 1.5027, "step": 216 }, { "epoch": 0.0, "grad_norm": 7.028785668898977, "learning_rate": 1.9999993965431567e-06, "loss": 1.4228, "step": 217 }, { "epoch": 0.0, "grad_norm": 4.840444295707605, "learning_rate": 1.999999390698573e-06, "loss": 1.415, "step": 218 }, { "epoch": 0.0, "grad_norm": 5.684443121932812, "learning_rate": 1.9999993848258226e-06, "loss": 1.5323, "step": 219 }, { "epoch": 0.0, "eval_loss": 1.7417716979980469, "eval_runtime": 4.6034, "eval_samples_per_second": 1.955, "eval_steps_per_second": 1.086, "step": 219 }, { "epoch": 0.0, "grad_norm": 4.888070245726793, "learning_rate": 1.9999993789249058e-06, "loss": 1.3426, "step": 220 }, { "epoch": 0.0, "grad_norm": 6.874791209837943, "learning_rate": 1.999999372995822e-06, "loss": 1.5123, "step": 221 }, { "epoch": 0.0, "grad_norm": 5.351391729703106, "learning_rate": 1.999999367038572e-06, "loss": 1.511, "step": 222 }, { "epoch": 0.0, "grad_norm": 5.252575209784133, "learning_rate": 1.999999361053155e-06, "loss": 1.5191, "step": 223 }, { "epoch": 0.0, "grad_norm": 5.0084979782736765, "learning_rate": 1.9999993550395715e-06, "loss": 1.4319, "step": 224 }, { "epoch": 0.0, "grad_norm": 5.032795555975619, "learning_rate": 1.999999348997821e-06, "loss": 1.5326, "step": 225 }, { "epoch": 0.0, "grad_norm": 4.882732392699978, "learning_rate": 1.9999993429279045e-06, "loss": 1.5562, "step": 226 }, { "epoch": 0.0, "grad_norm": 10.5857954829159, "learning_rate": 1.999999336829821e-06, "loss": 1.5575, "step": 227 }, { "epoch": 0.0, "grad_norm": 5.368713396037795, "learning_rate": 1.9999993307035704e-06, "loss": 1.5341, "step": 228 }, { "epoch": 0.0, "grad_norm": 6.564926482513187, "learning_rate": 1.999999324549154e-06, "loss": 1.5136, "step": 229 }, { "epoch": 0.0, "grad_norm": 4.87983027120232, "learning_rate": 1.9999993183665702e-06, "loss": 1.5424, "step": 230 }, { "epoch": 0.0, "grad_norm": 5.130333500849336, "learning_rate": 1.99999931215582e-06, "loss": 1.4305, "step": 231 }, { "epoch": 0.0, "grad_norm": 4.810216498710757, "learning_rate": 1.9999993059169032e-06, "loss": 1.3897, "step": 232 }, { "epoch": 0.0, "grad_norm": 4.75131060987687, "learning_rate": 1.99999929964982e-06, "loss": 1.4905, "step": 233 }, { "epoch": 0.0, "grad_norm": 4.9417367696494185, "learning_rate": 1.99999929335457e-06, "loss": 1.5133, "step": 234 }, { "epoch": 0.0, "grad_norm": 5.129642959459062, "learning_rate": 1.999999287031153e-06, "loss": 1.4786, "step": 235 }, { "epoch": 0.0, "grad_norm": 5.589926026863031, "learning_rate": 1.9999992806795697e-06, "loss": 1.4966, "step": 236 }, { "epoch": 0.0, "grad_norm": 5.509054909009049, "learning_rate": 1.999999274299819e-06, "loss": 1.4371, "step": 237 }, { "epoch": 0.0, "grad_norm": 4.6496971486682135, "learning_rate": 1.9999992678919027e-06, "loss": 1.4595, "step": 238 }, { "epoch": 0.0, "grad_norm": 5.832332761745741, "learning_rate": 1.999999261455819e-06, "loss": 1.5185, "step": 239 }, { "epoch": 0.0, "grad_norm": 5.257353906344085, "learning_rate": 1.9999992549915693e-06, "loss": 1.5056, "step": 240 }, { "epoch": 0.0, "grad_norm": 7.37222280076076, "learning_rate": 1.9999992484991524e-06, "loss": 1.504, "step": 241 }, { "epoch": 0.0, "grad_norm": 4.824423532897361, "learning_rate": 1.999999241978569e-06, "loss": 1.4981, "step": 242 }, { "epoch": 0.0, "grad_norm": 4.9091796444197575, "learning_rate": 1.999999235429819e-06, "loss": 1.4832, "step": 243 }, { "epoch": 0.0, "grad_norm": 7.748583674899383, "learning_rate": 1.9999992288529025e-06, "loss": 1.3462, "step": 244 }, { "epoch": 0.0, "grad_norm": 5.125022502509408, "learning_rate": 1.9999992222478192e-06, "loss": 1.4612, "step": 245 }, { "epoch": 0.0, "grad_norm": 32.53860605587701, "learning_rate": 1.999999215614569e-06, "loss": 1.4692, "step": 246 }, { "epoch": 0.0, "grad_norm": 5.052996211809871, "learning_rate": 1.9999992089531526e-06, "loss": 1.4238, "step": 247 }, { "epoch": 0.0, "grad_norm": 5.542095695226945, "learning_rate": 1.9999992022635693e-06, "loss": 1.4174, "step": 248 }, { "epoch": 0.0, "grad_norm": 5.686250494352825, "learning_rate": 1.999999195545819e-06, "loss": 1.6754, "step": 249 }, { "epoch": 0.0, "grad_norm": 5.6359874929300915, "learning_rate": 1.9999991887999027e-06, "loss": 1.5792, "step": 250 }, { "epoch": 0.0, "grad_norm": 5.0936043548140795, "learning_rate": 1.9999991820258194e-06, "loss": 1.4941, "step": 251 }, { "epoch": 0.0, "grad_norm": 6.406141836123647, "learning_rate": 1.9999991752235697e-06, "loss": 1.5123, "step": 252 }, { "epoch": 0.0, "grad_norm": 6.199896874003351, "learning_rate": 1.999999168393153e-06, "loss": 1.476, "step": 253 }, { "epoch": 0.0, "grad_norm": 5.66865264194522, "learning_rate": 1.99999916153457e-06, "loss": 1.5464, "step": 254 }, { "epoch": 0.0, "grad_norm": 5.021383409806155, "learning_rate": 1.99999915464782e-06, "loss": 1.5831, "step": 255 }, { "epoch": 0.0, "grad_norm": 6.615349724765676, "learning_rate": 1.9999991477329036e-06, "loss": 1.6743, "step": 256 }, { "epoch": 0.0, "grad_norm": 5.024216598608762, "learning_rate": 1.9999991407898203e-06, "loss": 1.5263, "step": 257 }, { "epoch": 0.0, "grad_norm": 5.352770335446249, "learning_rate": 1.9999991338185706e-06, "loss": 1.6316, "step": 258 }, { "epoch": 0.0, "grad_norm": 5.472919888558574, "learning_rate": 1.9999991268191545e-06, "loss": 1.4915, "step": 259 }, { "epoch": 0.0, "grad_norm": 5.46803445951313, "learning_rate": 1.999999119791571e-06, "loss": 1.5079, "step": 260 }, { "epoch": 0.0, "grad_norm": 5.154033494012857, "learning_rate": 1.9999991127358214e-06, "loss": 1.5658, "step": 261 }, { "epoch": 0.0, "grad_norm": 5.032995728347807, "learning_rate": 1.9999991056519053e-06, "loss": 1.5069, "step": 262 }, { "epoch": 0.0, "grad_norm": 4.916728694225458, "learning_rate": 1.9999990985398224e-06, "loss": 1.5897, "step": 263 }, { "epoch": 0.0, "grad_norm": 5.075661387114088, "learning_rate": 1.9999990913995727e-06, "loss": 1.5507, "step": 264 }, { "epoch": 0.0, "grad_norm": 10.051445604812477, "learning_rate": 1.999999084231156e-06, "loss": 1.6803, "step": 265 }, { "epoch": 0.0, "grad_norm": 4.971780062105953, "learning_rate": 1.999999077034573e-06, "loss": 1.4367, "step": 266 }, { "epoch": 0.0, "grad_norm": 4.934330515559447, "learning_rate": 1.9999990698098235e-06, "loss": 1.356, "step": 267 }, { "epoch": 0.0, "grad_norm": 6.5083027384301015, "learning_rate": 1.9999990625569073e-06, "loss": 1.4274, "step": 268 }, { "epoch": 0.0, "grad_norm": 5.109968483536287, "learning_rate": 1.9999990552758244e-06, "loss": 1.4728, "step": 269 }, { "epoch": 0.0, "grad_norm": 5.775385612499377, "learning_rate": 1.999999047966575e-06, "loss": 1.5146, "step": 270 }, { "epoch": 0.0, "grad_norm": 4.948468121480936, "learning_rate": 1.9999990406291585e-06, "loss": 1.4915, "step": 271 }, { "epoch": 0.0, "grad_norm": 5.965168560424121, "learning_rate": 1.999999033263576e-06, "loss": 1.4582, "step": 272 }, { "epoch": 0.0, "grad_norm": 5.442431763765304, "learning_rate": 1.9999990258698267e-06, "loss": 1.4613, "step": 273 }, { "epoch": 0.0, "grad_norm": 5.7776067027301545, "learning_rate": 1.99999901844791e-06, "loss": 1.503, "step": 274 }, { "epoch": 0.0, "grad_norm": 5.178892269148893, "learning_rate": 1.999999010997827e-06, "loss": 1.4328, "step": 275 }, { "epoch": 0.0, "grad_norm": 4.939222731616965, "learning_rate": 1.9999990035195783e-06, "loss": 1.4259, "step": 276 }, { "epoch": 0.0, "grad_norm": 4.900888732024022, "learning_rate": 1.9999989960131617e-06, "loss": 1.4883, "step": 277 }, { "epoch": 0.0, "grad_norm": 5.480965646206889, "learning_rate": 1.999998988478579e-06, "loss": 1.4998, "step": 278 }, { "epoch": 0.0, "grad_norm": 5.387613428010502, "learning_rate": 1.99999898091583e-06, "loss": 1.5284, "step": 279 }, { "epoch": 0.0, "grad_norm": 6.93375972130229, "learning_rate": 1.9999989733249137e-06, "loss": 1.5188, "step": 280 }, { "epoch": 0.0, "grad_norm": 4.887766431921025, "learning_rate": 1.999998965705831e-06, "loss": 1.4875, "step": 281 }, { "epoch": 0.0, "grad_norm": 4.872411589272914, "learning_rate": 1.999998958058582e-06, "loss": 1.4924, "step": 282 }, { "epoch": 0.0, "grad_norm": 4.8912427815939825, "learning_rate": 1.999998950383166e-06, "loss": 1.3855, "step": 283 }, { "epoch": 0.0, "grad_norm": 4.849266593984057, "learning_rate": 1.9999989426795835e-06, "loss": 1.252, "step": 284 }, { "epoch": 0.0, "grad_norm": 4.642331959124537, "learning_rate": 1.999998934947834e-06, "loss": 1.4346, "step": 285 }, { "epoch": 0.0, "grad_norm": 4.824479351331792, "learning_rate": 1.999998927187918e-06, "loss": 1.3413, "step": 286 }, { "epoch": 0.0, "grad_norm": 5.088834406666165, "learning_rate": 1.999998919399836e-06, "loss": 1.5249, "step": 287 }, { "epoch": 0.0, "grad_norm": 6.380115793381058, "learning_rate": 1.9999989115835865e-06, "loss": 1.6594, "step": 288 }, { "epoch": 0.0, "grad_norm": 4.560993714860048, "learning_rate": 1.9999989037391708e-06, "loss": 1.378, "step": 289 }, { "epoch": 0.0, "grad_norm": 11.285453090943186, "learning_rate": 1.9999988958665882e-06, "loss": 1.5047, "step": 290 }, { "epoch": 0.0, "grad_norm": 6.939941853112505, "learning_rate": 1.9999988879658393e-06, "loss": 1.4266, "step": 291 }, { "epoch": 0.0, "grad_norm": 4.842983984638459, "learning_rate": 1.9999988800369235e-06, "loss": 1.4714, "step": 292 }, { "epoch": 0.0, "eval_loss": 1.7301859855651855, "eval_runtime": 4.6111, "eval_samples_per_second": 1.952, "eval_steps_per_second": 1.084, "step": 292 }, { "epoch": 0.0, "grad_norm": 6.779278420529169, "learning_rate": 1.999998872079841e-06, "loss": 1.59, "step": 293 }, { "epoch": 0.0, "grad_norm": 5.66455555017548, "learning_rate": 1.999998864094592e-06, "loss": 1.5995, "step": 294 }, { "epoch": 0.0, "grad_norm": 5.542518840920991, "learning_rate": 1.9999988560811762e-06, "loss": 1.3785, "step": 295 }, { "epoch": 0.0, "grad_norm": 4.741540663363095, "learning_rate": 1.999998848039594e-06, "loss": 1.3725, "step": 296 }, { "epoch": 0.0, "grad_norm": 4.958319396026163, "learning_rate": 1.999998839969845e-06, "loss": 1.4389, "step": 297 }, { "epoch": 0.0, "grad_norm": 4.858392697584495, "learning_rate": 1.99999883187193e-06, "loss": 1.4766, "step": 298 }, { "epoch": 0.0, "grad_norm": 4.77843632873252, "learning_rate": 1.9999988237458472e-06, "loss": 1.4784, "step": 299 }, { "epoch": 0.0, "grad_norm": 5.166101873536924, "learning_rate": 1.9999988155915983e-06, "loss": 1.5636, "step": 300 }, { "epoch": 0.0, "grad_norm": 5.541352000059288, "learning_rate": 1.999998807409183e-06, "loss": 1.3294, "step": 301 }, { "epoch": 0.0, "grad_norm": 4.738532728807263, "learning_rate": 1.9999987991986007e-06, "loss": 1.3765, "step": 302 }, { "epoch": 0.0, "grad_norm": 6.12811751220863, "learning_rate": 1.9999987909598518e-06, "loss": 1.6032, "step": 303 }, { "epoch": 0.0, "grad_norm": 5.897961031920527, "learning_rate": 1.9999987826929364e-06, "loss": 1.5486, "step": 304 }, { "epoch": 0.0, "grad_norm": 5.108120627298669, "learning_rate": 1.9999987743978542e-06, "loss": 1.5455, "step": 305 }, { "epoch": 0.0, "grad_norm": 6.654624244259197, "learning_rate": 1.9999987660746057e-06, "loss": 1.563, "step": 306 }, { "epoch": 0.0, "grad_norm": 6.32748407900337, "learning_rate": 1.9999987577231903e-06, "loss": 1.5657, "step": 307 }, { "epoch": 0.0, "grad_norm": 6.643090403562411, "learning_rate": 1.9999987493436086e-06, "loss": 1.4312, "step": 308 }, { "epoch": 0.0, "grad_norm": 5.16915443300704, "learning_rate": 1.99999874093586e-06, "loss": 1.4448, "step": 309 }, { "epoch": 0.0, "grad_norm": 5.277599731706892, "learning_rate": 1.9999987324999446e-06, "loss": 1.6571, "step": 310 }, { "epoch": 0.0, "grad_norm": 5.2349500478147215, "learning_rate": 1.9999987240358625e-06, "loss": 1.4609, "step": 311 }, { "epoch": 0.0, "grad_norm": 6.248179668341996, "learning_rate": 1.9999987155436143e-06, "loss": 1.6418, "step": 312 }, { "epoch": 0.0, "grad_norm": 5.223835299607056, "learning_rate": 1.9999987070231985e-06, "loss": 1.495, "step": 313 }, { "epoch": 0.0, "grad_norm": 5.202665887318379, "learning_rate": 1.999998698474617e-06, "loss": 1.5282, "step": 314 }, { "epoch": 0.0, "grad_norm": 4.938213910400009, "learning_rate": 1.9999986898978686e-06, "loss": 1.4169, "step": 315 }, { "epoch": 0.0, "grad_norm": 9.193414540211652, "learning_rate": 1.9999986812929536e-06, "loss": 1.3747, "step": 316 }, { "epoch": 0.0, "grad_norm": 4.870538982226375, "learning_rate": 1.999998672659872e-06, "loss": 1.3901, "step": 317 }, { "epoch": 0.0, "grad_norm": 5.491542785298596, "learning_rate": 1.9999986639986233e-06, "loss": 1.5331, "step": 318 }, { "epoch": 0.0, "grad_norm": 5.500143758119544, "learning_rate": 1.9999986553092083e-06, "loss": 1.6651, "step": 319 }, { "epoch": 0.0, "grad_norm": 10.286451328879394, "learning_rate": 1.9999986465916265e-06, "loss": 1.6562, "step": 320 }, { "epoch": 0.0, "grad_norm": 5.19858420548839, "learning_rate": 1.9999986378458784e-06, "loss": 1.4969, "step": 321 }, { "epoch": 0.0, "grad_norm": 5.524050041906956, "learning_rate": 1.9999986290719634e-06, "loss": 1.4227, "step": 322 }, { "epoch": 0.0, "grad_norm": 7.65324091844115, "learning_rate": 1.999998620269882e-06, "loss": 1.5175, "step": 323 }, { "epoch": 0.0, "grad_norm": 4.955466116973382, "learning_rate": 1.999998611439634e-06, "loss": 1.3881, "step": 324 }, { "epoch": 0.0, "grad_norm": 4.98640055971755, "learning_rate": 1.9999986025812193e-06, "loss": 1.6281, "step": 325 }, { "epoch": 0.0, "grad_norm": 5.372570880498082, "learning_rate": 1.9999985936946375e-06, "loss": 1.4439, "step": 326 }, { "epoch": 0.0, "grad_norm": 5.307836526026473, "learning_rate": 1.9999985847798893e-06, "loss": 1.5315, "step": 327 }, { "epoch": 0.0, "grad_norm": 4.967657152244879, "learning_rate": 1.9999985758369748e-06, "loss": 1.5171, "step": 328 }, { "epoch": 0.0, "grad_norm": 6.189830333449287, "learning_rate": 1.9999985668658934e-06, "loss": 1.5468, "step": 329 }, { "epoch": 0.0, "grad_norm": 5.903849880093316, "learning_rate": 1.9999985578666452e-06, "loss": 1.6242, "step": 330 }, { "epoch": 0.0, "grad_norm": 4.784805252959356, "learning_rate": 1.999998548839231e-06, "loss": 1.4167, "step": 331 }, { "epoch": 0.0, "grad_norm": 5.196004132700368, "learning_rate": 1.9999985397836497e-06, "loss": 1.5647, "step": 332 }, { "epoch": 0.0, "grad_norm": 5.103521406136126, "learning_rate": 1.999998530699902e-06, "loss": 1.4414, "step": 333 }, { "epoch": 0.0, "grad_norm": 5.570598621348803, "learning_rate": 1.9999985215879873e-06, "loss": 1.3089, "step": 334 }, { "epoch": 0.0, "grad_norm": 5.777301480121715, "learning_rate": 1.999998512447906e-06, "loss": 1.5408, "step": 335 }, { "epoch": 0.0, "grad_norm": 4.997876127994012, "learning_rate": 1.9999985032796586e-06, "loss": 1.4277, "step": 336 }, { "epoch": 0.0, "grad_norm": 5.633061500236633, "learning_rate": 1.999998494083244e-06, "loss": 1.6895, "step": 337 }, { "epoch": 0.0, "grad_norm": 4.834617334661858, "learning_rate": 1.999998484858663e-06, "loss": 1.426, "step": 338 }, { "epoch": 0.0, "grad_norm": 6.993606113796682, "learning_rate": 1.9999984756059153e-06, "loss": 1.3854, "step": 339 }, { "epoch": 0.0, "grad_norm": 5.101502771813274, "learning_rate": 1.999998466325001e-06, "loss": 1.6404, "step": 340 }, { "epoch": 0.0, "grad_norm": 5.089236533265877, "learning_rate": 1.9999984570159197e-06, "loss": 1.4611, "step": 341 }, { "epoch": 0.0, "grad_norm": 5.206319847977299, "learning_rate": 1.9999984476786723e-06, "loss": 1.3741, "step": 342 }, { "epoch": 0.0, "grad_norm": 5.331926916079775, "learning_rate": 1.999998438313258e-06, "loss": 1.5426, "step": 343 }, { "epoch": 0.0, "grad_norm": 7.3624137401783445, "learning_rate": 1.9999984289196776e-06, "loss": 1.6399, "step": 344 }, { "epoch": 0.0, "grad_norm": 6.918510892290634, "learning_rate": 1.99999841949793e-06, "loss": 1.4012, "step": 345 }, { "epoch": 0.0, "grad_norm": 4.5972792983175665, "learning_rate": 1.999998410048016e-06, "loss": 1.4264, "step": 346 }, { "epoch": 0.0, "grad_norm": 4.916021369254276, "learning_rate": 1.999998400569935e-06, "loss": 1.5523, "step": 347 }, { "epoch": 0.0, "grad_norm": 5.5315684631864, "learning_rate": 1.9999983910636877e-06, "loss": 1.5335, "step": 348 }, { "epoch": 0.0, "grad_norm": 4.656760353737745, "learning_rate": 1.999998381529274e-06, "loss": 1.4952, "step": 349 }, { "epoch": 0.0, "grad_norm": 5.583073365446635, "learning_rate": 1.9999983719666933e-06, "loss": 1.4312, "step": 350 }, { "epoch": 0.0, "grad_norm": 5.293004596545783, "learning_rate": 1.999998362375946e-06, "loss": 1.5153, "step": 351 }, { "epoch": 0.0, "grad_norm": 4.998189663482702, "learning_rate": 1.999998352757032e-06, "loss": 1.5468, "step": 352 }, { "epoch": 0.0, "grad_norm": 4.704748816779011, "learning_rate": 1.9999983431099516e-06, "loss": 1.4175, "step": 353 }, { "epoch": 0.0, "grad_norm": 4.816706794317481, "learning_rate": 1.9999983334347046e-06, "loss": 1.5202, "step": 354 }, { "epoch": 0.0, "grad_norm": 4.973712391629575, "learning_rate": 1.999998323731291e-06, "loss": 1.4952, "step": 355 }, { "epoch": 0.0, "grad_norm": 5.4799024863000305, "learning_rate": 1.9999983139997107e-06, "loss": 1.6129, "step": 356 }, { "epoch": 0.0, "grad_norm": 5.773895048702807, "learning_rate": 1.9999983042399637e-06, "loss": 1.5075, "step": 357 }, { "epoch": 0.0, "grad_norm": 5.786637205360941, "learning_rate": 1.99999829445205e-06, "loss": 1.5998, "step": 358 }, { "epoch": 0.0, "grad_norm": 5.417325890208001, "learning_rate": 1.9999982846359697e-06, "loss": 1.491, "step": 359 }, { "epoch": 0.0, "grad_norm": 6.441996636074185, "learning_rate": 1.9999982747917228e-06, "loss": 1.452, "step": 360 }, { "epoch": 0.0, "grad_norm": 5.484858685210035, "learning_rate": 1.9999982649193094e-06, "loss": 1.5286, "step": 361 }, { "epoch": 0.0, "grad_norm": 4.304285744838827, "learning_rate": 1.999998255018729e-06, "loss": 1.261, "step": 362 }, { "epoch": 0.0, "grad_norm": 5.00715136108103, "learning_rate": 1.9999982450899826e-06, "loss": 1.5213, "step": 363 }, { "epoch": 0.0, "grad_norm": 7.9421250934625744, "learning_rate": 1.9999982351330693e-06, "loss": 1.5245, "step": 364 }, { "epoch": 0.0, "grad_norm": 7.249784350703182, "learning_rate": 1.999998225147989e-06, "loss": 1.4959, "step": 365 }, { "epoch": 0.0, "eval_loss": 1.7152172327041626, "eval_runtime": 4.5993, "eval_samples_per_second": 1.957, "eval_steps_per_second": 1.087, "step": 365 }, { "epoch": 0.0, "grad_norm": 4.950903475262251, "learning_rate": 1.9999982151347425e-06, "loss": 1.5132, "step": 366 }, { "epoch": 0.0, "grad_norm": 5.001256076322384, "learning_rate": 1.999998205093329e-06, "loss": 1.4178, "step": 367 }, { "epoch": 0.0, "grad_norm": 5.145343234500232, "learning_rate": 1.9999981950237494e-06, "loss": 1.6928, "step": 368 }, { "epoch": 0.0, "grad_norm": 4.926796515825282, "learning_rate": 1.999998184926003e-06, "loss": 1.4183, "step": 369 }, { "epoch": 0.0, "grad_norm": 6.205577250811191, "learning_rate": 1.99999817480009e-06, "loss": 1.4067, "step": 370 }, { "epoch": 0.0, "grad_norm": 6.181548092407497, "learning_rate": 1.99999816464601e-06, "loss": 1.3839, "step": 371 }, { "epoch": 0.0, "grad_norm": 4.8304254951229355, "learning_rate": 1.9999981544637634e-06, "loss": 1.3712, "step": 372 }, { "epoch": 0.0, "grad_norm": 12.315569431338385, "learning_rate": 1.9999981442533505e-06, "loss": 1.5337, "step": 373 }, { "epoch": 0.0, "grad_norm": 4.969790738605887, "learning_rate": 1.999998134014771e-06, "loss": 1.5229, "step": 374 }, { "epoch": 0.0, "grad_norm": 5.073608092190098, "learning_rate": 1.999998123748025e-06, "loss": 1.579, "step": 375 }, { "epoch": 0.0, "grad_norm": 5.111176704390247, "learning_rate": 1.999998113453112e-06, "loss": 1.4727, "step": 376 }, { "epoch": 0.0, "grad_norm": 4.404278529357815, "learning_rate": 1.9999981031300326e-06, "loss": 1.3756, "step": 377 }, { "epoch": 0.0, "grad_norm": 4.66133544977496, "learning_rate": 1.9999980927787864e-06, "loss": 1.3958, "step": 378 }, { "epoch": 0.0, "grad_norm": 4.811042205639708, "learning_rate": 1.999998082399374e-06, "loss": 1.4358, "step": 379 }, { "epoch": 0.0, "grad_norm": 5.082316240684894, "learning_rate": 1.9999980719917945e-06, "loss": 1.5925, "step": 380 }, { "epoch": 0.0, "grad_norm": 5.316025491311415, "learning_rate": 1.9999980615560487e-06, "loss": 1.3726, "step": 381 }, { "epoch": 0.0, "grad_norm": 4.922423456498873, "learning_rate": 1.9999980510921357e-06, "loss": 1.3629, "step": 382 }, { "epoch": 0.0, "grad_norm": 4.966088631224505, "learning_rate": 1.9999980406000568e-06, "loss": 1.4834, "step": 383 }, { "epoch": 0.0, "grad_norm": 4.8135477821063475, "learning_rate": 1.999998030079811e-06, "loss": 1.5864, "step": 384 }, { "epoch": 0.0, "grad_norm": 7.2288779061936035, "learning_rate": 1.999998019531399e-06, "loss": 1.5929, "step": 385 }, { "epoch": 0.0, "grad_norm": 7.583909712278632, "learning_rate": 1.9999980089548195e-06, "loss": 1.4818, "step": 386 }, { "epoch": 0.0, "grad_norm": 5.161770592187534, "learning_rate": 1.999997998350074e-06, "loss": 1.7042, "step": 387 }, { "epoch": 0.0, "grad_norm": 5.53786371138208, "learning_rate": 1.999997987717162e-06, "loss": 1.4949, "step": 388 }, { "epoch": 0.0, "grad_norm": 5.547955232044561, "learning_rate": 1.999997977056083e-06, "loss": 1.5075, "step": 389 }, { "epoch": 0.0, "grad_norm": 5.443846222937035, "learning_rate": 1.999997966366837e-06, "loss": 1.5852, "step": 390 }, { "epoch": 0.0, "grad_norm": 4.948645185345369, "learning_rate": 1.9999979556494255e-06, "loss": 1.3367, "step": 391 }, { "epoch": 0.0, "grad_norm": 4.580424596028237, "learning_rate": 1.9999979449038465e-06, "loss": 1.2409, "step": 392 }, { "epoch": 0.0, "grad_norm": 5.704258128824565, "learning_rate": 1.999997934130101e-06, "loss": 1.4568, "step": 393 }, { "epoch": 0.0, "grad_norm": 5.441676548521125, "learning_rate": 1.9999979233281894e-06, "loss": 1.5953, "step": 394 }, { "epoch": 0.0, "grad_norm": 4.912142407168767, "learning_rate": 1.999997912498111e-06, "loss": 1.54, "step": 395 }, { "epoch": 0.0, "grad_norm": 5.1706944951472895, "learning_rate": 1.9999979016398654e-06, "loss": 1.3123, "step": 396 }, { "epoch": 0.0, "grad_norm": 8.26460889745062, "learning_rate": 1.9999978907534537e-06, "loss": 1.5813, "step": 397 }, { "epoch": 0.0, "grad_norm": 4.564397081017155, "learning_rate": 1.999997879838875e-06, "loss": 1.3763, "step": 398 }, { "epoch": 0.0, "grad_norm": 4.9015032586368905, "learning_rate": 1.99999786889613e-06, "loss": 1.4756, "step": 399 }, { "epoch": 0.0, "grad_norm": 4.917803358556267, "learning_rate": 1.9999978579252184e-06, "loss": 1.2537, "step": 400 }, { "epoch": 0.0, "grad_norm": 4.5181742977699555, "learning_rate": 1.9999978469261402e-06, "loss": 1.4328, "step": 401 }, { "epoch": 0.0, "grad_norm": 4.660663659308694, "learning_rate": 1.9999978358988953e-06, "loss": 1.4581, "step": 402 }, { "epoch": 0.0, "grad_norm": 5.0474102698666385, "learning_rate": 1.999997824843484e-06, "loss": 1.4002, "step": 403 }, { "epoch": 0.0, "grad_norm": 9.834707543136538, "learning_rate": 1.9999978137599058e-06, "loss": 1.4236, "step": 404 }, { "epoch": 0.0, "grad_norm": 5.168260756918322, "learning_rate": 1.9999978026481612e-06, "loss": 1.5046, "step": 405 }, { "epoch": 0.0, "grad_norm": 5.035029265083008, "learning_rate": 1.99999779150825e-06, "loss": 1.3636, "step": 406 }, { "epoch": 0.0, "grad_norm": 5.28716001715498, "learning_rate": 1.999997780340172e-06, "loss": 1.4576, "step": 407 }, { "epoch": 0.0, "grad_norm": 4.8852077758822015, "learning_rate": 1.999997769143927e-06, "loss": 1.4481, "step": 408 }, { "epoch": 0.0, "grad_norm": 5.232260975458147, "learning_rate": 1.999997757919516e-06, "loss": 1.5196, "step": 409 }, { "epoch": 0.0, "grad_norm": 5.264068624514251, "learning_rate": 1.9999977466669385e-06, "loss": 1.4934, "step": 410 }, { "epoch": 0.0, "grad_norm": 5.469538493605676, "learning_rate": 1.999997735386194e-06, "loss": 1.505, "step": 411 }, { "epoch": 0.0, "grad_norm": 4.999328912549101, "learning_rate": 1.999997724077283e-06, "loss": 1.4756, "step": 412 }, { "epoch": 0.0, "grad_norm": 5.076217295124521, "learning_rate": 1.9999977127402057e-06, "loss": 1.5694, "step": 413 }, { "epoch": 0.0, "grad_norm": 5.226359452459192, "learning_rate": 1.999997701374961e-06, "loss": 1.4211, "step": 414 }, { "epoch": 0.0, "grad_norm": 4.819718845384241, "learning_rate": 1.99999768998155e-06, "loss": 1.5322, "step": 415 }, { "epoch": 0.0, "grad_norm": 5.122043265582762, "learning_rate": 1.9999976785599732e-06, "loss": 1.4893, "step": 416 }, { "epoch": 0.0, "grad_norm": 8.297154589932777, "learning_rate": 1.9999976671102287e-06, "loss": 1.5449, "step": 417 }, { "epoch": 0.0, "grad_norm": 4.890157121974458, "learning_rate": 1.999997655632318e-06, "loss": 1.4776, "step": 418 }, { "epoch": 0.0, "grad_norm": 5.589475875269356, "learning_rate": 1.9999976441262408e-06, "loss": 1.4896, "step": 419 }, { "epoch": 0.0, "grad_norm": 5.277939766849428, "learning_rate": 1.999997632591997e-06, "loss": 1.5828, "step": 420 }, { "epoch": 0.0, "grad_norm": 4.470441711000115, "learning_rate": 1.9999976210295865e-06, "loss": 1.3217, "step": 421 }, { "epoch": 0.0, "grad_norm": 5.995769226585237, "learning_rate": 1.9999976094390096e-06, "loss": 1.5426, "step": 422 }, { "epoch": 0.0, "grad_norm": 6.160211017382697, "learning_rate": 1.999997597820266e-06, "loss": 1.5534, "step": 423 }, { "epoch": 0.0, "grad_norm": 5.8941787869537325, "learning_rate": 1.9999975861733557e-06, "loss": 1.4683, "step": 424 }, { "epoch": 0.0, "grad_norm": 5.213637265959196, "learning_rate": 1.9999975744982788e-06, "loss": 1.7138, "step": 425 }, { "epoch": 0.0, "grad_norm": 4.62520919093602, "learning_rate": 1.999997562795035e-06, "loss": 1.4748, "step": 426 }, { "epoch": 0.0, "grad_norm": 4.836310097570104, "learning_rate": 1.999997551063625e-06, "loss": 1.5654, "step": 427 }, { "epoch": 0.0, "grad_norm": 4.4455635910745555, "learning_rate": 1.9999975393040484e-06, "loss": 1.3387, "step": 428 }, { "epoch": 0.0, "grad_norm": 6.820553071368411, "learning_rate": 1.999997527516305e-06, "loss": 1.4821, "step": 429 }, { "epoch": 0.0, "grad_norm": 4.97958037440551, "learning_rate": 1.9999975157003953e-06, "loss": 1.5386, "step": 430 }, { "epoch": 0.0, "grad_norm": 5.218479803099335, "learning_rate": 1.9999975038563184e-06, "loss": 1.5253, "step": 431 }, { "epoch": 0.0, "grad_norm": 4.817934555265224, "learning_rate": 1.9999974919840755e-06, "loss": 1.4677, "step": 432 }, { "epoch": 0.0, "grad_norm": 7.04962891837311, "learning_rate": 1.9999974800836658e-06, "loss": 1.4149, "step": 433 }, { "epoch": 0.0, "grad_norm": 4.977731627262668, "learning_rate": 1.9999974681550892e-06, "loss": 1.4752, "step": 434 }, { "epoch": 0.0, "grad_norm": 5.593445830695836, "learning_rate": 1.9999974561983467e-06, "loss": 1.317, "step": 435 }, { "epoch": 0.0, "grad_norm": 5.061091292458903, "learning_rate": 1.999997444213437e-06, "loss": 1.486, "step": 436 }, { "epoch": 0.0, "grad_norm": 4.608804550083991, "learning_rate": 1.999997432200361e-06, "loss": 1.3927, "step": 437 }, { "epoch": 0.0, "grad_norm": 4.670859026053921, "learning_rate": 1.999997420159118e-06, "loss": 1.4257, "step": 438 }, { "epoch": 0.0, "eval_loss": 1.7094385623931885, "eval_runtime": 4.6238, "eval_samples_per_second": 1.946, "eval_steps_per_second": 1.081, "step": 438 }, { "epoch": 0.0, "grad_norm": 5.859564981544054, "learning_rate": 1.9999974080897087e-06, "loss": 1.3354, "step": 439 }, { "epoch": 0.0, "grad_norm": 5.52294064206441, "learning_rate": 1.999997395992133e-06, "loss": 1.6065, "step": 440 }, { "epoch": 0.0, "grad_norm": 4.424488305091629, "learning_rate": 1.99999738386639e-06, "loss": 1.4306, "step": 441 }, { "epoch": 0.0, "grad_norm": 4.798968031648101, "learning_rate": 1.999997371712481e-06, "loss": 1.4365, "step": 442 }, { "epoch": 0.0, "grad_norm": 4.932924371225362, "learning_rate": 1.9999973595304054e-06, "loss": 1.3648, "step": 443 }, { "epoch": 0.0, "grad_norm": 5.234896304295093, "learning_rate": 1.999997347320163e-06, "loss": 1.4025, "step": 444 }, { "epoch": 0.0, "grad_norm": 7.0542656681469715, "learning_rate": 1.9999973350817544e-06, "loss": 1.2087, "step": 445 }, { "epoch": 0.0, "grad_norm": 5.129429012048715, "learning_rate": 1.9999973228151787e-06, "loss": 1.4321, "step": 446 }, { "epoch": 0.0, "grad_norm": 4.613105028702654, "learning_rate": 1.999997310520436e-06, "loss": 1.4545, "step": 447 }, { "epoch": 0.0, "grad_norm": 5.0864337050088295, "learning_rate": 1.9999972981975277e-06, "loss": 1.3047, "step": 448 }, { "epoch": 0.0, "grad_norm": 5.123209043678265, "learning_rate": 1.9999972858464524e-06, "loss": 1.4923, "step": 449 }, { "epoch": 0.0, "grad_norm": 4.770437981538585, "learning_rate": 1.9999972734672107e-06, "loss": 1.3795, "step": 450 }, { "epoch": 0.0, "grad_norm": 4.66047161014276, "learning_rate": 1.999997261059802e-06, "loss": 1.4359, "step": 451 }, { "epoch": 0.0, "grad_norm": 5.661063937671921, "learning_rate": 1.999997248624227e-06, "loss": 1.6533, "step": 452 }, { "epoch": 0.0, "grad_norm": 4.631944386384085, "learning_rate": 1.9999972361604853e-06, "loss": 1.4383, "step": 453 }, { "epoch": 0.0, "grad_norm": 4.732909501037933, "learning_rate": 1.9999972236685768e-06, "loss": 1.3426, "step": 454 }, { "epoch": 0.0, "grad_norm": 5.460195121758093, "learning_rate": 1.999997211148502e-06, "loss": 1.5001, "step": 455 }, { "epoch": 0.0, "grad_norm": 4.942983319180353, "learning_rate": 1.99999719860026e-06, "loss": 1.5882, "step": 456 }, { "epoch": 0.0, "grad_norm": 4.541373356599531, "learning_rate": 1.999997186023852e-06, "loss": 1.3362, "step": 457 }, { "epoch": 0.0, "grad_norm": 4.6766662288287515, "learning_rate": 1.9999971734192776e-06, "loss": 1.4117, "step": 458 }, { "epoch": 0.0, "grad_norm": 5.240283985252159, "learning_rate": 1.9999971607865364e-06, "loss": 1.2851, "step": 459 }, { "epoch": 0.0, "grad_norm": 6.6165761980037265, "learning_rate": 1.9999971481256283e-06, "loss": 1.4075, "step": 460 }, { "epoch": 0.0, "grad_norm": 5.356136217997657, "learning_rate": 1.999997135436554e-06, "loss": 1.2614, "step": 461 }, { "epoch": 0.0, "grad_norm": 5.235383295905265, "learning_rate": 1.999997122719313e-06, "loss": 1.5025, "step": 462 }, { "epoch": 0.0, "grad_norm": 5.59435469640636, "learning_rate": 1.999997109973905e-06, "loss": 1.6514, "step": 463 }, { "epoch": 0.0, "grad_norm": 8.674661542805518, "learning_rate": 1.9999970972003308e-06, "loss": 1.5387, "step": 464 }, { "epoch": 0.0, "grad_norm": 5.954501972594385, "learning_rate": 1.99999708439859e-06, "loss": 1.376, "step": 465 }, { "epoch": 0.0, "grad_norm": 5.031043958621965, "learning_rate": 1.9999970715686827e-06, "loss": 1.5981, "step": 466 }, { "epoch": 0.0, "grad_norm": 5.849912677371338, "learning_rate": 1.9999970587106086e-06, "loss": 1.4769, "step": 467 }, { "epoch": 0.0, "grad_norm": 5.18062121080789, "learning_rate": 1.9999970458243677e-06, "loss": 1.5745, "step": 468 }, { "epoch": 0.0, "grad_norm": 6.171389079148798, "learning_rate": 1.999997032909961e-06, "loss": 1.4238, "step": 469 }, { "epoch": 0.0, "grad_norm": 4.888754083257385, "learning_rate": 1.999997019967387e-06, "loss": 1.5669, "step": 470 }, { "epoch": 0.0, "grad_norm": 4.661884685242049, "learning_rate": 1.9999970069966463e-06, "loss": 1.2735, "step": 471 }, { "epoch": 0.0, "grad_norm": 5.231399197062376, "learning_rate": 1.9999969939977395e-06, "loss": 1.4674, "step": 472 }, { "epoch": 0.0, "grad_norm": 4.392315151719007, "learning_rate": 1.999996980970666e-06, "loss": 1.2746, "step": 473 }, { "epoch": 0.0, "grad_norm": 14.143521137726825, "learning_rate": 1.999996967915426e-06, "loss": 1.4949, "step": 474 }, { "epoch": 0.0, "grad_norm": 7.609979863275451, "learning_rate": 1.999996954832019e-06, "loss": 1.5421, "step": 475 }, { "epoch": 0.0, "grad_norm": 4.901316944779091, "learning_rate": 1.9999969417204457e-06, "loss": 1.467, "step": 476 }, { "epoch": 0.0, "grad_norm": 4.9893724431034085, "learning_rate": 1.999996928580706e-06, "loss": 1.5033, "step": 477 }, { "epoch": 0.0, "grad_norm": 7.145370058739066, "learning_rate": 1.9999969154127992e-06, "loss": 1.6459, "step": 478 }, { "epoch": 0.0, "grad_norm": 5.6040849521366045, "learning_rate": 1.999996902216726e-06, "loss": 1.39, "step": 479 }, { "epoch": 0.0, "grad_norm": 4.871162617374601, "learning_rate": 1.9999968889924863e-06, "loss": 1.5008, "step": 480 }, { "epoch": 0.0, "grad_norm": 5.041180910719072, "learning_rate": 1.9999968757400803e-06, "loss": 1.417, "step": 481 }, { "epoch": 0.0, "grad_norm": 7.1416379779540415, "learning_rate": 1.9999968624595075e-06, "loss": 1.3683, "step": 482 }, { "epoch": 0.0, "grad_norm": 4.655432088463851, "learning_rate": 1.999996849150768e-06, "loss": 1.2764, "step": 483 }, { "epoch": 0.0, "grad_norm": 5.390658631829691, "learning_rate": 1.999996835813862e-06, "loss": 1.4183, "step": 484 }, { "epoch": 0.0, "grad_norm": 5.139267857868251, "learning_rate": 1.9999968224487894e-06, "loss": 1.5219, "step": 485 }, { "epoch": 0.0, "grad_norm": 4.98265828385138, "learning_rate": 1.9999968090555498e-06, "loss": 1.5828, "step": 486 }, { "epoch": 0.0, "grad_norm": 4.947025615061189, "learning_rate": 1.999996795634144e-06, "loss": 1.4291, "step": 487 }, { "epoch": 0.0, "grad_norm": 4.8361628814359445, "learning_rate": 1.999996782184572e-06, "loss": 1.5098, "step": 488 }, { "epoch": 0.0, "grad_norm": 6.213733320329145, "learning_rate": 1.999996768706833e-06, "loss": 1.5318, "step": 489 }, { "epoch": 0.0, "grad_norm": 5.354255255395692, "learning_rate": 1.9999967552009273e-06, "loss": 1.5049, "step": 490 }, { "epoch": 0.0, "grad_norm": 5.509817618529389, "learning_rate": 1.9999967416668553e-06, "loss": 1.5207, "step": 491 }, { "epoch": 0.0, "grad_norm": 4.820085190259705, "learning_rate": 1.9999967281046165e-06, "loss": 1.4661, "step": 492 }, { "epoch": 0.0, "grad_norm": 6.5842798335932855, "learning_rate": 1.9999967145142113e-06, "loss": 1.5606, "step": 493 }, { "epoch": 0.0, "grad_norm": 4.832919175129925, "learning_rate": 1.9999967008956397e-06, "loss": 1.5222, "step": 494 }, { "epoch": 0.0, "grad_norm": 5.401711540963192, "learning_rate": 1.9999966872489013e-06, "loss": 1.4282, "step": 495 }, { "epoch": 0.0, "grad_norm": 5.352942988278965, "learning_rate": 1.999996673573996e-06, "loss": 1.4907, "step": 496 }, { "epoch": 0.0, "grad_norm": 4.767236823164161, "learning_rate": 1.9999966598709245e-06, "loss": 1.2979, "step": 497 }, { "epoch": 0.0, "grad_norm": 4.7839487752581995, "learning_rate": 1.999996646139686e-06, "loss": 1.1931, "step": 498 }, { "epoch": 0.0, "grad_norm": 4.526284836165884, "learning_rate": 1.9999966323802813e-06, "loss": 1.4276, "step": 499 }, { "epoch": 0.0, "grad_norm": 4.968983179245585, "learning_rate": 1.99999661859271e-06, "loss": 1.5143, "step": 500 }, { "epoch": 0.0, "grad_norm": 4.929416318558409, "learning_rate": 1.999996604776972e-06, "loss": 1.4274, "step": 501 }, { "epoch": 0.0, "grad_norm": 5.319727005695438, "learning_rate": 1.999996590933068e-06, "loss": 1.4896, "step": 502 }, { "epoch": 0.0, "grad_norm": 5.166912706207141, "learning_rate": 1.9999965770609966e-06, "loss": 1.4971, "step": 503 }, { "epoch": 0.0, "grad_norm": 9.724664850608164, "learning_rate": 1.999996563160759e-06, "loss": 1.314, "step": 504 }, { "epoch": 0.0, "grad_norm": 6.033309554063063, "learning_rate": 1.9999965492323547e-06, "loss": 1.6318, "step": 505 }, { "epoch": 0.0, "grad_norm": 4.693812921757236, "learning_rate": 1.999996535275784e-06, "loss": 1.5998, "step": 506 }, { "epoch": 0.0, "grad_norm": 5.027615373760868, "learning_rate": 1.9999965212910468e-06, "loss": 1.4803, "step": 507 }, { "epoch": 0.0, "grad_norm": 5.150014426528141, "learning_rate": 1.999996507278143e-06, "loss": 1.5533, "step": 508 }, { "epoch": 0.0, "grad_norm": 6.251365830445326, "learning_rate": 1.999996493237072e-06, "loss": 1.5345, "step": 509 }, { "epoch": 0.0, "grad_norm": 5.068491757206887, "learning_rate": 1.999996479167835e-06, "loss": 1.4934, "step": 510 }, { "epoch": 0.0, "grad_norm": 6.510297092824997, "learning_rate": 1.9999964650704313e-06, "loss": 1.4957, "step": 511 }, { "epoch": 0.0, "eval_loss": 1.702190637588501, "eval_runtime": 4.6182, "eval_samples_per_second": 1.949, "eval_steps_per_second": 1.083, "step": 511 }, { "epoch": 0.0, "grad_norm": 5.755438253224136, "learning_rate": 1.999996450944861e-06, "loss": 1.4702, "step": 512 }, { "epoch": 0.0, "grad_norm": 4.529822887512446, "learning_rate": 1.9999964367911242e-06, "loss": 1.4433, "step": 513 }, { "epoch": 0.0, "grad_norm": 4.923234715248454, "learning_rate": 1.9999964226092207e-06, "loss": 1.2158, "step": 514 }, { "epoch": 0.0, "grad_norm": 4.99275767811329, "learning_rate": 1.9999964083991507e-06, "loss": 1.4035, "step": 515 }, { "epoch": 0.0, "grad_norm": 5.540700855208618, "learning_rate": 1.9999963941609144e-06, "loss": 1.4089, "step": 516 }, { "epoch": 0.0, "grad_norm": 5.423934034968752, "learning_rate": 1.9999963798945113e-06, "loss": 1.4395, "step": 517 }, { "epoch": 0.0, "grad_norm": 4.938286770911095, "learning_rate": 1.9999963655999413e-06, "loss": 1.5267, "step": 518 }, { "epoch": 0.0, "grad_norm": 6.135159232589015, "learning_rate": 1.9999963512772054e-06, "loss": 1.5788, "step": 519 }, { "epoch": 0.0, "grad_norm": 41.00137108815898, "learning_rate": 1.9999963369263023e-06, "loss": 1.681, "step": 520 }, { "epoch": 0.0, "grad_norm": 4.982490976686388, "learning_rate": 1.9999963225472327e-06, "loss": 1.3849, "step": 521 }, { "epoch": 0.0, "grad_norm": 5.200076278688873, "learning_rate": 1.999996308139997e-06, "loss": 1.4505, "step": 522 }, { "epoch": 0.0, "grad_norm": 5.0374386572637935, "learning_rate": 1.9999962937045945e-06, "loss": 1.6146, "step": 523 }, { "epoch": 0.0, "grad_norm": 5.38514239730456, "learning_rate": 1.9999962792410254e-06, "loss": 1.4862, "step": 524 }, { "epoch": 0.0, "grad_norm": 5.278831951738085, "learning_rate": 1.9999962647492895e-06, "loss": 1.5097, "step": 525 }, { "epoch": 0.0, "grad_norm": 4.988682511151002, "learning_rate": 1.999996250229387e-06, "loss": 1.5579, "step": 526 }, { "epoch": 0.0, "grad_norm": 6.6329832866037854, "learning_rate": 1.999996235681319e-06, "loss": 1.494, "step": 527 }, { "epoch": 0.0, "grad_norm": 6.101238841635504, "learning_rate": 1.999996221105083e-06, "loss": 1.5725, "step": 528 }, { "epoch": 0.0, "grad_norm": 5.618252608407, "learning_rate": 1.999996206500681e-06, "loss": 1.4834, "step": 529 }, { "epoch": 0.0, "grad_norm": 5.783224896235011, "learning_rate": 1.999996191868113e-06, "loss": 1.7878, "step": 530 }, { "epoch": 0.0, "grad_norm": 5.039904911976764, "learning_rate": 1.9999961772073773e-06, "loss": 1.5341, "step": 531 }, { "epoch": 0.0, "grad_norm": 4.749210275476864, "learning_rate": 1.999996162518476e-06, "loss": 1.5232, "step": 532 }, { "epoch": 0.0, "grad_norm": 4.75083641214273, "learning_rate": 1.999996147801408e-06, "loss": 1.4457, "step": 533 }, { "epoch": 0.0, "grad_norm": 4.797512529634022, "learning_rate": 1.999996133056173e-06, "loss": 1.3914, "step": 534 }, { "epoch": 0.0, "grad_norm": 4.996223642108566, "learning_rate": 1.999996118282772e-06, "loss": 1.53, "step": 535 }, { "epoch": 0.0, "grad_norm": 4.7829974351407305, "learning_rate": 1.9999961034812035e-06, "loss": 1.3418, "step": 536 }, { "epoch": 0.0, "grad_norm": 4.718076792623708, "learning_rate": 1.9999960886514693e-06, "loss": 1.3486, "step": 537 }, { "epoch": 0.0, "grad_norm": 5.156179924159171, "learning_rate": 1.9999960737935678e-06, "loss": 1.5375, "step": 538 }, { "epoch": 0.0, "grad_norm": 5.3419660684441705, "learning_rate": 1.9999960589075003e-06, "loss": 1.4394, "step": 539 }, { "epoch": 0.0, "grad_norm": 4.969835171700577, "learning_rate": 1.999996043993266e-06, "loss": 1.4885, "step": 540 }, { "epoch": 0.0, "grad_norm": 5.141114094857767, "learning_rate": 1.9999960290508654e-06, "loss": 1.6062, "step": 541 }, { "epoch": 0.0, "grad_norm": 6.732914023923969, "learning_rate": 1.9999960140802984e-06, "loss": 1.7097, "step": 542 }, { "epoch": 0.0, "grad_norm": 5.405085642943276, "learning_rate": 1.9999959990815645e-06, "loss": 1.5708, "step": 543 }, { "epoch": 0.0, "grad_norm": 4.951768383120494, "learning_rate": 1.9999959840546643e-06, "loss": 1.5517, "step": 544 }, { "epoch": 0.0, "grad_norm": 4.805093975628436, "learning_rate": 1.9999959689995972e-06, "loss": 1.4752, "step": 545 }, { "epoch": 0.0, "grad_norm": 4.475429973625488, "learning_rate": 1.9999959539163634e-06, "loss": 1.3248, "step": 546 }, { "epoch": 0.0, "grad_norm": 5.138477383552902, "learning_rate": 1.9999959388049636e-06, "loss": 1.5739, "step": 547 }, { "epoch": 0.0, "grad_norm": 4.991661833650293, "learning_rate": 1.9999959236653965e-06, "loss": 1.4348, "step": 548 }, { "epoch": 0.0, "grad_norm": 4.701391682646717, "learning_rate": 1.9999959084976635e-06, "loss": 1.3925, "step": 549 }, { "epoch": 0.0, "grad_norm": 4.910083477148781, "learning_rate": 1.9999958933017637e-06, "loss": 1.5543, "step": 550 }, { "epoch": 0.0, "grad_norm": 5.027398384260184, "learning_rate": 1.9999958780776975e-06, "loss": 1.4994, "step": 551 }, { "epoch": 0.0, "grad_norm": 5.5869467738586645, "learning_rate": 1.9999958628254645e-06, "loss": 1.5429, "step": 552 }, { "epoch": 0.0, "grad_norm": 4.951650316590327, "learning_rate": 1.999995847545065e-06, "loss": 1.4204, "step": 553 }, { "epoch": 0.0, "grad_norm": 4.779335727504338, "learning_rate": 1.999995832236499e-06, "loss": 1.5872, "step": 554 }, { "epoch": 0.0, "grad_norm": 4.995816902206547, "learning_rate": 1.9999958168997667e-06, "loss": 1.434, "step": 555 }, { "epoch": 0.0, "grad_norm": 5.2068709341822235, "learning_rate": 1.9999958015348677e-06, "loss": 1.5262, "step": 556 }, { "epoch": 0.0, "grad_norm": 4.993826056871235, "learning_rate": 1.9999957861418014e-06, "loss": 1.4638, "step": 557 }, { "epoch": 0.0, "grad_norm": 6.2251786792625206, "learning_rate": 1.9999957707205693e-06, "loss": 1.508, "step": 558 }, { "epoch": 0.0, "grad_norm": 6.073040511213923, "learning_rate": 1.9999957552711707e-06, "loss": 1.4745, "step": 559 }, { "epoch": 0.0, "grad_norm": 5.738891638073386, "learning_rate": 1.9999957397936053e-06, "loss": 1.6243, "step": 560 }, { "epoch": 0.0, "grad_norm": 5.419662722088751, "learning_rate": 1.9999957242878735e-06, "loss": 1.2774, "step": 561 }, { "epoch": 0.0, "grad_norm": 5.122152529238781, "learning_rate": 1.999995708753975e-06, "loss": 1.4376, "step": 562 }, { "epoch": 0.0, "grad_norm": 5.4113125087183205, "learning_rate": 1.99999569319191e-06, "loss": 1.5806, "step": 563 }, { "epoch": 0.0, "grad_norm": 5.379407688726184, "learning_rate": 1.999995677601678e-06, "loss": 1.4604, "step": 564 }, { "epoch": 0.0, "grad_norm": 5.47622178703681, "learning_rate": 1.9999956619832805e-06, "loss": 1.4344, "step": 565 }, { "epoch": 0.0, "grad_norm": 5.117263096048395, "learning_rate": 1.999995646336716e-06, "loss": 1.4896, "step": 566 }, { "epoch": 0.0, "grad_norm": 5.4922547794799, "learning_rate": 1.9999956306619846e-06, "loss": 1.7709, "step": 567 }, { "epoch": 0.0, "grad_norm": 6.489699330478132, "learning_rate": 1.999995614959087e-06, "loss": 1.5452, "step": 568 }, { "epoch": 0.0, "grad_norm": 11.437005541277756, "learning_rate": 1.9999955992280223e-06, "loss": 1.347, "step": 569 }, { "epoch": 0.0, "grad_norm": 5.224691095393556, "learning_rate": 1.9999955834687917e-06, "loss": 1.5102, "step": 570 }, { "epoch": 0.0, "grad_norm": 5.56916203726456, "learning_rate": 1.999995567681394e-06, "loss": 1.2866, "step": 571 }, { "epoch": 0.0, "grad_norm": 5.203471880109494, "learning_rate": 1.99999555186583e-06, "loss": 1.2939, "step": 572 }, { "epoch": 0.0, "grad_norm": 5.295050199339514, "learning_rate": 1.9999955360220997e-06, "loss": 1.244, "step": 573 }, { "epoch": 0.0, "grad_norm": 5.558252219814004, "learning_rate": 1.9999955201502024e-06, "loss": 1.381, "step": 574 }, { "epoch": 0.0, "grad_norm": 4.693879682065885, "learning_rate": 1.999995504250139e-06, "loss": 1.5172, "step": 575 }, { "epoch": 0.0, "grad_norm": 5.054746659752432, "learning_rate": 1.9999954883219086e-06, "loss": 1.5245, "step": 576 }, { "epoch": 0.0, "grad_norm": 6.288250112135058, "learning_rate": 1.999995472365512e-06, "loss": 1.6557, "step": 577 }, { "epoch": 0.0, "grad_norm": 4.7842439911659005, "learning_rate": 1.9999954563809488e-06, "loss": 1.4294, "step": 578 }, { "epoch": 0.0, "grad_norm": 4.995544020303297, "learning_rate": 1.999995440368219e-06, "loss": 1.3965, "step": 579 }, { "epoch": 0.0, "grad_norm": 4.759630572595074, "learning_rate": 1.9999954243273226e-06, "loss": 1.3996, "step": 580 }, { "epoch": 0.0, "grad_norm": 4.712211700693446, "learning_rate": 1.99999540825826e-06, "loss": 1.4983, "step": 581 }, { "epoch": 0.0, "grad_norm": 5.9470327763676405, "learning_rate": 1.9999953921610304e-06, "loss": 1.355, "step": 582 }, { "epoch": 0.0, "grad_norm": 5.063038997344527, "learning_rate": 1.9999953760356343e-06, "loss": 1.4099, "step": 583 }, { "epoch": 0.0, "grad_norm": 4.844081361582967, "learning_rate": 1.999995359882072e-06, "loss": 1.5044, "step": 584 }, { "epoch": 0.0, "eval_loss": 1.696353793144226, "eval_runtime": 4.6168, "eval_samples_per_second": 1.949, "eval_steps_per_second": 1.083, "step": 584 }, { "epoch": 0.0, "grad_norm": 6.493708899953276, "learning_rate": 1.999995343700343e-06, "loss": 1.6627, "step": 585 }, { "epoch": 0.0, "grad_norm": 4.954875007169818, "learning_rate": 1.9999953274904473e-06, "loss": 1.4136, "step": 586 }, { "epoch": 0.0, "grad_norm": 4.910325383229665, "learning_rate": 1.9999953112523853e-06, "loss": 1.4251, "step": 587 }, { "epoch": 0.0, "grad_norm": 4.714961872146683, "learning_rate": 1.9999952949861564e-06, "loss": 1.5491, "step": 588 }, { "epoch": 0.0, "grad_norm": 5.2328265295940435, "learning_rate": 1.999995278691761e-06, "loss": 1.4251, "step": 589 }, { "epoch": 0.0, "grad_norm": 4.533584320037035, "learning_rate": 1.9999952623691995e-06, "loss": 1.4121, "step": 590 }, { "epoch": 0.0, "grad_norm": 4.882067176722944, "learning_rate": 1.999995246018471e-06, "loss": 1.3394, "step": 591 }, { "epoch": 0.0, "grad_norm": 4.9509375278675405, "learning_rate": 1.9999952296395762e-06, "loss": 1.3379, "step": 592 }, { "epoch": 0.0, "grad_norm": 5.653441569963669, "learning_rate": 1.999995213232515e-06, "loss": 1.4337, "step": 593 }, { "epoch": 0.0, "grad_norm": 4.426491174400903, "learning_rate": 1.999995196797287e-06, "loss": 1.2995, "step": 594 }, { "epoch": 0.0, "grad_norm": 5.485353755158814, "learning_rate": 1.999995180333893e-06, "loss": 1.379, "step": 595 }, { "epoch": 0.0, "grad_norm": 4.79710641338462, "learning_rate": 1.9999951638423317e-06, "loss": 1.4514, "step": 596 }, { "epoch": 0.0, "grad_norm": 4.92546100162479, "learning_rate": 1.999995147322604e-06, "loss": 1.4666, "step": 597 }, { "epoch": 0.0, "grad_norm": 5.9766360974271056, "learning_rate": 1.99999513077471e-06, "loss": 1.353, "step": 598 }, { "epoch": 0.0, "grad_norm": 6.442918937982415, "learning_rate": 1.9999951141986493e-06, "loss": 1.6356, "step": 599 }, { "epoch": 0.0, "grad_norm": 5.523537086483564, "learning_rate": 1.9999950975944225e-06, "loss": 1.5245, "step": 600 }, { "epoch": 0.0, "grad_norm": 4.955021932634082, "learning_rate": 1.9999950809620285e-06, "loss": 1.3658, "step": 601 }, { "epoch": 0.0, "grad_norm": 4.8036782108306975, "learning_rate": 1.999995064301468e-06, "loss": 1.4261, "step": 602 }, { "epoch": 0.0, "grad_norm": 4.952441088003973, "learning_rate": 1.9999950476127418e-06, "loss": 1.5592, "step": 603 }, { "epoch": 0.0, "grad_norm": 4.712458785442027, "learning_rate": 1.999995030895848e-06, "loss": 1.6329, "step": 604 }, { "epoch": 0.0, "grad_norm": 6.39848861393844, "learning_rate": 1.9999950141507886e-06, "loss": 1.3637, "step": 605 }, { "epoch": 0.0, "grad_norm": 4.565908788566706, "learning_rate": 1.9999949973775623e-06, "loss": 1.3901, "step": 606 }, { "epoch": 0.0, "grad_norm": 4.839741237419302, "learning_rate": 1.9999949805761695e-06, "loss": 1.5408, "step": 607 }, { "epoch": 0.0, "grad_norm": 5.65666455626493, "learning_rate": 1.99999496374661e-06, "loss": 1.3274, "step": 608 }, { "epoch": 0.0, "grad_norm": 5.003729201972364, "learning_rate": 1.999994946888884e-06, "loss": 1.506, "step": 609 }, { "epoch": 0.0, "grad_norm": 6.133503970487184, "learning_rate": 1.9999949300029917e-06, "loss": 1.5401, "step": 610 }, { "epoch": 0.0, "grad_norm": 5.6024698311473795, "learning_rate": 1.9999949130889325e-06, "loss": 1.6896, "step": 611 }, { "epoch": 0.0, "grad_norm": 4.532215717740302, "learning_rate": 1.999994896146707e-06, "loss": 1.3587, "step": 612 }, { "epoch": 0.0, "grad_norm": 5.70340339038857, "learning_rate": 1.9999948791763146e-06, "loss": 1.577, "step": 613 }, { "epoch": 0.0, "grad_norm": 5.082395639793893, "learning_rate": 1.9999948621777563e-06, "loss": 1.5421, "step": 614 }, { "epoch": 0.0, "grad_norm": 4.988618029308243, "learning_rate": 1.9999948451510312e-06, "loss": 1.5131, "step": 615 }, { "epoch": 0.0, "grad_norm": 6.413069306779731, "learning_rate": 1.9999948280961393e-06, "loss": 1.4763, "step": 616 }, { "epoch": 0.0, "grad_norm": 4.723299915394361, "learning_rate": 1.999994811013081e-06, "loss": 1.4926, "step": 617 }, { "epoch": 0.0, "grad_norm": 4.818607426839722, "learning_rate": 1.9999947939018567e-06, "loss": 1.4112, "step": 618 }, { "epoch": 0.0, "grad_norm": 5.090960497753154, "learning_rate": 1.9999947767624656e-06, "loss": 1.6447, "step": 619 }, { "epoch": 0.0, "grad_norm": 4.869884243493298, "learning_rate": 1.9999947595949073e-06, "loss": 1.4467, "step": 620 }, { "epoch": 0.0, "grad_norm": 4.952829084694713, "learning_rate": 1.999994742399183e-06, "loss": 1.5202, "step": 621 }, { "epoch": 0.0, "grad_norm": 5.000534437688967, "learning_rate": 1.9999947251752923e-06, "loss": 1.569, "step": 622 }, { "epoch": 0.0, "grad_norm": 4.891724084780857, "learning_rate": 1.9999947079232353e-06, "loss": 1.2758, "step": 623 }, { "epoch": 0.0, "grad_norm": 4.720768304907384, "learning_rate": 1.999994690643011e-06, "loss": 1.5005, "step": 624 }, { "epoch": 0.0, "grad_norm": 4.8481849499077, "learning_rate": 1.9999946733346208e-06, "loss": 1.5816, "step": 625 }, { "epoch": 0.0, "grad_norm": 5.153018684172472, "learning_rate": 1.9999946559980637e-06, "loss": 1.6261, "step": 626 }, { "epoch": 0.0, "grad_norm": 5.041326934986502, "learning_rate": 1.9999946386333402e-06, "loss": 1.4737, "step": 627 }, { "epoch": 0.0, "grad_norm": 6.615624017109531, "learning_rate": 1.9999946212404504e-06, "loss": 1.3258, "step": 628 }, { "epoch": 0.0, "grad_norm": 5.5498876909447405, "learning_rate": 1.999994603819394e-06, "loss": 1.6407, "step": 629 }, { "epoch": 0.0, "grad_norm": 5.466395618769352, "learning_rate": 1.999994586370171e-06, "loss": 1.3873, "step": 630 }, { "epoch": 0.0, "grad_norm": 4.84924829351922, "learning_rate": 1.9999945688927813e-06, "loss": 1.4199, "step": 631 }, { "epoch": 0.0, "grad_norm": 5.489922257516625, "learning_rate": 1.999994551387225e-06, "loss": 1.4957, "step": 632 }, { "epoch": 0.0, "grad_norm": 7.178583734354207, "learning_rate": 1.999994533853503e-06, "loss": 1.3008, "step": 633 }, { "epoch": 0.0, "grad_norm": 4.631980034454648, "learning_rate": 1.9999945162916135e-06, "loss": 1.2049, "step": 634 }, { "epoch": 0.0, "grad_norm": 4.715631420481108, "learning_rate": 1.999994498701558e-06, "loss": 1.506, "step": 635 }, { "epoch": 0.0, "grad_norm": 4.866461878636338, "learning_rate": 1.999994481083336e-06, "loss": 1.4913, "step": 636 }, { "epoch": 0.0, "grad_norm": 4.81038769741456, "learning_rate": 1.9999944634369473e-06, "loss": 1.4033, "step": 637 }, { "epoch": 0.0, "grad_norm": 5.08078031950503, "learning_rate": 1.999994445762392e-06, "loss": 1.5048, "step": 638 }, { "epoch": 0.0, "grad_norm": 5.047631225535758, "learning_rate": 1.9999944280596705e-06, "loss": 1.4223, "step": 639 }, { "epoch": 0.0, "grad_norm": 5.040805784015986, "learning_rate": 1.9999944103287824e-06, "loss": 1.5275, "step": 640 }, { "epoch": 0.0, "grad_norm": 5.349390518960892, "learning_rate": 1.9999943925697274e-06, "loss": 1.3653, "step": 641 }, { "epoch": 0.0, "grad_norm": 5.378881266474736, "learning_rate": 1.9999943747825064e-06, "loss": 1.4901, "step": 642 }, { "epoch": 0.0, "grad_norm": 5.959820929884786, "learning_rate": 1.9999943569671187e-06, "loss": 1.5757, "step": 643 }, { "epoch": 0.0, "grad_norm": 4.839111038325676, "learning_rate": 1.999994339123564e-06, "loss": 1.3587, "step": 644 }, { "epoch": 0.0, "grad_norm": 4.487918310261601, "learning_rate": 1.999994321251843e-06, "loss": 1.2875, "step": 645 }, { "epoch": 0.0, "grad_norm": 4.622421469454646, "learning_rate": 1.9999943033519563e-06, "loss": 1.4613, "step": 646 }, { "epoch": 0.0, "grad_norm": 4.851420800533353, "learning_rate": 1.999994285423902e-06, "loss": 1.4116, "step": 647 }, { "epoch": 0.0, "grad_norm": 5.13772278531517, "learning_rate": 1.999994267467682e-06, "loss": 1.503, "step": 648 }, { "epoch": 0.0, "grad_norm": 4.995119073749364, "learning_rate": 1.9999942494832955e-06, "loss": 1.5454, "step": 649 }, { "epoch": 0.0, "grad_norm": 5.260803959027427, "learning_rate": 1.999994231470742e-06, "loss": 1.5893, "step": 650 }, { "epoch": 0.0, "grad_norm": 4.836602513098617, "learning_rate": 1.999994213430022e-06, "loss": 1.5162, "step": 651 }, { "epoch": 0.0, "grad_norm": 5.155960472302761, "learning_rate": 1.9999941953611356e-06, "loss": 1.5372, "step": 652 }, { "epoch": 0.0, "grad_norm": 4.746462550620719, "learning_rate": 1.999994177264083e-06, "loss": 1.4229, "step": 653 }, { "epoch": 0.0, "grad_norm": 5.028762735096975, "learning_rate": 1.9999941591388634e-06, "loss": 1.4108, "step": 654 }, { "epoch": 0.0, "grad_norm": 4.8256605605224, "learning_rate": 1.9999941409854773e-06, "loss": 1.4537, "step": 655 }, { "epoch": 0.0, "grad_norm": 4.976124834899918, "learning_rate": 1.9999941228039253e-06, "loss": 1.4796, "step": 656 }, { "epoch": 0.0, "grad_norm": 8.040837341128276, "learning_rate": 1.999994104594206e-06, "loss": 1.5095, "step": 657 }, { "epoch": 0.0, "eval_loss": 1.6910474300384521, "eval_runtime": 4.6444, "eval_samples_per_second": 1.938, "eval_steps_per_second": 1.077, "step": 657 }, { "epoch": 0.0, "grad_norm": 5.4913712932783305, "learning_rate": 1.9999940863563208e-06, "loss": 1.7822, "step": 658 }, { "epoch": 0.0, "grad_norm": 5.723500971046963, "learning_rate": 1.999994068090269e-06, "loss": 1.3798, "step": 659 }, { "epoch": 0.0, "grad_norm": 5.445742218350867, "learning_rate": 1.9999940497960503e-06, "loss": 1.2929, "step": 660 }, { "epoch": 0.0, "grad_norm": 4.422756108863784, "learning_rate": 1.9999940314736654e-06, "loss": 1.3135, "step": 661 }, { "epoch": 0.0, "grad_norm": 4.932728683114205, "learning_rate": 1.9999940131231142e-06, "loss": 1.5933, "step": 662 }, { "epoch": 0.0, "grad_norm": 4.725603325024966, "learning_rate": 1.999993994744396e-06, "loss": 1.4112, "step": 663 }, { "epoch": 0.0, "grad_norm": 4.709408296070619, "learning_rate": 1.9999939763375114e-06, "loss": 1.3498, "step": 664 }, { "epoch": 0.0, "grad_norm": 4.936026121310175, "learning_rate": 1.9999939579024606e-06, "loss": 1.4758, "step": 665 }, { "epoch": 0.0, "grad_norm": 4.71294086987452, "learning_rate": 1.999993939439243e-06, "loss": 1.4319, "step": 666 }, { "epoch": 0.0, "grad_norm": 6.066575513242086, "learning_rate": 1.999993920947859e-06, "loss": 1.6162, "step": 667 }, { "epoch": 0.0, "grad_norm": 5.234786123258312, "learning_rate": 1.9999939024283086e-06, "loss": 1.4918, "step": 668 }, { "epoch": 0.0, "grad_norm": 4.8552625381814565, "learning_rate": 1.999993883880592e-06, "loss": 1.4518, "step": 669 }, { "epoch": 0.0, "grad_norm": 4.497782097949712, "learning_rate": 1.9999938653047082e-06, "loss": 1.394, "step": 670 }, { "epoch": 0.0, "grad_norm": 6.189146286839885, "learning_rate": 1.9999938467006583e-06, "loss": 1.4676, "step": 671 }, { "epoch": 0.0, "grad_norm": 5.318939072932948, "learning_rate": 1.999993828068442e-06, "loss": 1.4845, "step": 672 }, { "epoch": 0.0, "grad_norm": 5.231313892198888, "learning_rate": 1.9999938094080588e-06, "loss": 1.5264, "step": 673 }, { "epoch": 0.0, "grad_norm": 5.190824126212358, "learning_rate": 1.9999937907195092e-06, "loss": 1.5604, "step": 674 }, { "epoch": 0.0, "grad_norm": 4.911434823319875, "learning_rate": 1.9999937720027937e-06, "loss": 1.4599, "step": 675 }, { "epoch": 0.0, "grad_norm": 5.527825329074028, "learning_rate": 1.999993753257911e-06, "loss": 1.4444, "step": 676 }, { "epoch": 0.0, "grad_norm": 4.612727754818132, "learning_rate": 1.9999937344848622e-06, "loss": 1.4543, "step": 677 }, { "epoch": 0.0, "grad_norm": 4.912996568592968, "learning_rate": 1.9999937156836463e-06, "loss": 1.4475, "step": 678 }, { "epoch": 0.0, "grad_norm": 4.899245327794625, "learning_rate": 1.9999936968542644e-06, "loss": 1.4971, "step": 679 }, { "epoch": 0.0, "grad_norm": 5.683408208952821, "learning_rate": 1.999993677996716e-06, "loss": 1.4405, "step": 680 }, { "epoch": 0.0, "grad_norm": 4.868881120149723, "learning_rate": 1.999993659111001e-06, "loss": 1.3217, "step": 681 }, { "epoch": 0.0, "grad_norm": 4.725153463356434, "learning_rate": 1.9999936401971195e-06, "loss": 1.3043, "step": 682 }, { "epoch": 0.0, "grad_norm": 4.909021709681444, "learning_rate": 1.9999936212550716e-06, "loss": 1.4595, "step": 683 }, { "epoch": 0.0, "grad_norm": 4.906414442711592, "learning_rate": 1.9999936022848574e-06, "loss": 1.4195, "step": 684 }, { "epoch": 0.0, "grad_norm": 6.200952942030242, "learning_rate": 1.9999935832864763e-06, "loss": 1.3076, "step": 685 }, { "epoch": 0.0, "grad_norm": 4.601568858946815, "learning_rate": 1.999993564259929e-06, "loss": 1.3687, "step": 686 }, { "epoch": 0.0, "grad_norm": 4.694173785690684, "learning_rate": 1.999993545205215e-06, "loss": 1.3167, "step": 687 }, { "epoch": 0.0, "grad_norm": 5.925517734547198, "learning_rate": 1.9999935261223344e-06, "loss": 1.524, "step": 688 }, { "epoch": 0.0, "grad_norm": 6.058783499529197, "learning_rate": 1.9999935070112877e-06, "loss": 1.3536, "step": 689 }, { "epoch": 0.0, "grad_norm": 4.7403320365456185, "learning_rate": 1.9999934878720743e-06, "loss": 1.45, "step": 690 }, { "epoch": 0.0, "grad_norm": 8.216158970114753, "learning_rate": 1.9999934687046945e-06, "loss": 1.4329, "step": 691 }, { "epoch": 0.0, "grad_norm": 4.907151976261744, "learning_rate": 1.999993449509148e-06, "loss": 1.579, "step": 692 }, { "epoch": 0.0, "grad_norm": 5.26679103924561, "learning_rate": 1.9999934302854353e-06, "loss": 1.6084, "step": 693 }, { "epoch": 0.0, "grad_norm": 5.831000211583377, "learning_rate": 1.999993411033556e-06, "loss": 1.574, "step": 694 }, { "epoch": 0.0, "grad_norm": 4.9068622026627065, "learning_rate": 1.99999339175351e-06, "loss": 1.5251, "step": 695 }, { "epoch": 0.0, "grad_norm": 4.824731944389374, "learning_rate": 1.9999933724452975e-06, "loss": 1.4736, "step": 696 }, { "epoch": 0.0, "grad_norm": 4.737465979855922, "learning_rate": 1.999993353108919e-06, "loss": 1.3993, "step": 697 }, { "epoch": 0.0, "grad_norm": 5.601956191342637, "learning_rate": 1.9999933337443736e-06, "loss": 1.4853, "step": 698 }, { "epoch": 0.0, "grad_norm": 4.891756552820562, "learning_rate": 1.999993314351662e-06, "loss": 1.3733, "step": 699 }, { "epoch": 0.0, "grad_norm": 5.343260811587546, "learning_rate": 1.9999932949307833e-06, "loss": 1.5779, "step": 700 }, { "epoch": 0.0, "grad_norm": 4.646914554700779, "learning_rate": 1.9999932754817387e-06, "loss": 1.4926, "step": 701 }, { "epoch": 0.0, "grad_norm": 5.004390716593502, "learning_rate": 1.9999932560045274e-06, "loss": 1.343, "step": 702 }, { "epoch": 0.0, "grad_norm": 4.8435215224734, "learning_rate": 1.9999932364991497e-06, "loss": 1.5128, "step": 703 }, { "epoch": 0.0, "grad_norm": 5.094980526251057, "learning_rate": 1.9999932169656056e-06, "loss": 1.4424, "step": 704 }, { "epoch": 0.0, "grad_norm": 4.71606625952716, "learning_rate": 1.9999931974038946e-06, "loss": 1.2818, "step": 705 }, { "epoch": 0.0, "grad_norm": 5.019306981145199, "learning_rate": 1.9999931778140178e-06, "loss": 1.3311, "step": 706 }, { "epoch": 0.0, "grad_norm": 4.402365143728885, "learning_rate": 1.999993158195974e-06, "loss": 1.269, "step": 707 }, { "epoch": 0.0, "grad_norm": 5.4490937352418065, "learning_rate": 1.999993138549764e-06, "loss": 1.4177, "step": 708 }, { "epoch": 0.0, "grad_norm": 4.710938425019419, "learning_rate": 1.999993118875387e-06, "loss": 1.3903, "step": 709 }, { "epoch": 0.0, "grad_norm": 4.853647057982472, "learning_rate": 1.999993099172844e-06, "loss": 1.3143, "step": 710 }, { "epoch": 0.0, "grad_norm": 5.583393631131937, "learning_rate": 1.9999930794421346e-06, "loss": 1.5035, "step": 711 }, { "epoch": 0.0, "grad_norm": 4.9723347976661625, "learning_rate": 1.9999930596832585e-06, "loss": 1.4268, "step": 712 }, { "epoch": 0.0, "grad_norm": 5.280480245670652, "learning_rate": 1.999993039896216e-06, "loss": 1.5342, "step": 713 }, { "epoch": 0.0, "grad_norm": 5.594520301277882, "learning_rate": 1.999993020081007e-06, "loss": 1.5057, "step": 714 }, { "epoch": 0.0, "grad_norm": 5.0248961526344385, "learning_rate": 1.9999930002376316e-06, "loss": 1.5489, "step": 715 }, { "epoch": 0.0, "grad_norm": 4.710291069541927, "learning_rate": 1.999992980366089e-06, "loss": 1.4276, "step": 716 }, { "epoch": 0.0, "grad_norm": 5.12715968906524, "learning_rate": 1.999992960466381e-06, "loss": 1.4865, "step": 717 }, { "epoch": 0.0, "grad_norm": 4.916214864105662, "learning_rate": 1.999992940538506e-06, "loss": 1.4147, "step": 718 }, { "epoch": 0.0, "grad_norm": 5.383613097319036, "learning_rate": 1.999992920582465e-06, "loss": 1.3599, "step": 719 }, { "epoch": 0.0, "grad_norm": 5.341289838892921, "learning_rate": 1.999992900598257e-06, "loss": 1.4866, "step": 720 }, { "epoch": 0.0, "grad_norm": 5.032528541347688, "learning_rate": 1.9999928805858824e-06, "loss": 1.5285, "step": 721 }, { "epoch": 0.0, "grad_norm": 5.113668938434738, "learning_rate": 1.9999928605453417e-06, "loss": 1.4797, "step": 722 }, { "epoch": 0.0, "grad_norm": 4.891988446714629, "learning_rate": 1.9999928404766345e-06, "loss": 1.6038, "step": 723 }, { "epoch": 0.0, "grad_norm": 4.679783775035857, "learning_rate": 1.9999928203797606e-06, "loss": 1.4729, "step": 724 }, { "epoch": 0.0, "grad_norm": 4.772986476245937, "learning_rate": 1.9999928002547202e-06, "loss": 1.429, "step": 725 }, { "epoch": 0.0, "grad_norm": 4.468260713060045, "learning_rate": 1.999992780101514e-06, "loss": 1.3496, "step": 726 }, { "epoch": 0.0, "grad_norm": 4.7466257837247365, "learning_rate": 1.9999927599201404e-06, "loss": 1.5408, "step": 727 }, { "epoch": 0.0, "grad_norm": 4.584370484620455, "learning_rate": 1.999992739710601e-06, "loss": 1.4405, "step": 728 }, { "epoch": 0.0, "grad_norm": 4.783272641625955, "learning_rate": 1.9999927194728945e-06, "loss": 1.4221, "step": 729 }, { "epoch": 0.0, "grad_norm": 4.52556643023439, "learning_rate": 1.9999926992070223e-06, "loss": 1.3328, "step": 730 }, { "epoch": 0.0, "eval_loss": 1.6846652030944824, "eval_runtime": 4.641, "eval_samples_per_second": 1.939, "eval_steps_per_second": 1.077, "step": 730 }, { "epoch": 0.0, "grad_norm": 4.657217221686939, "learning_rate": 1.999992678912983e-06, "loss": 1.4454, "step": 731 }, { "epoch": 0.0, "grad_norm": 4.590847103289048, "learning_rate": 1.9999926585907777e-06, "loss": 1.3924, "step": 732 }, { "epoch": 0.0, "grad_norm": 5.336722077748262, "learning_rate": 1.9999926382404054e-06, "loss": 1.4235, "step": 733 }, { "epoch": 0.0, "grad_norm": 4.450736230018037, "learning_rate": 1.999992617861867e-06, "loss": 1.3447, "step": 734 }, { "epoch": 0.0, "grad_norm": 5.138005519098043, "learning_rate": 1.9999925974551625e-06, "loss": 1.5023, "step": 735 }, { "epoch": 0.0, "grad_norm": 5.723499031096059, "learning_rate": 1.9999925770202907e-06, "loss": 1.5875, "step": 736 }, { "epoch": 0.0, "grad_norm": 6.103542969210838, "learning_rate": 1.999992556557253e-06, "loss": 1.6389, "step": 737 }, { "epoch": 0.0, "grad_norm": 5.358708666669913, "learning_rate": 1.9999925360660486e-06, "loss": 1.5501, "step": 738 }, { "epoch": 0.0, "grad_norm": 4.8728681154106, "learning_rate": 1.999992515546678e-06, "loss": 1.5199, "step": 739 }, { "epoch": 0.01, "grad_norm": 4.254099594123959, "learning_rate": 1.9999924949991406e-06, "loss": 1.2561, "step": 740 }, { "epoch": 0.01, "grad_norm": 4.841976889554841, "learning_rate": 1.999992474423437e-06, "loss": 1.4955, "step": 741 }, { "epoch": 0.01, "grad_norm": 5.3286918973656965, "learning_rate": 1.999992453819567e-06, "loss": 1.5191, "step": 742 }, { "epoch": 0.01, "grad_norm": 12.100923749893232, "learning_rate": 1.99999243318753e-06, "loss": 1.5339, "step": 743 }, { "epoch": 0.01, "grad_norm": 4.937435968281523, "learning_rate": 1.999992412527327e-06, "loss": 1.5689, "step": 744 }, { "epoch": 0.01, "grad_norm": 4.794383156069311, "learning_rate": 1.9999923918389573e-06, "loss": 1.4932, "step": 745 }, { "epoch": 0.01, "grad_norm": 6.328354730784901, "learning_rate": 1.9999923711224215e-06, "loss": 1.4243, "step": 746 }, { "epoch": 0.01, "grad_norm": 5.585973437130427, "learning_rate": 1.999992350377719e-06, "loss": 1.5275, "step": 747 }, { "epoch": 0.01, "grad_norm": 5.252037658328408, "learning_rate": 1.99999232960485e-06, "loss": 1.6859, "step": 748 }, { "epoch": 0.01, "grad_norm": 4.999962938796536, "learning_rate": 1.9999923088038147e-06, "loss": 1.3763, "step": 749 }, { "epoch": 0.01, "grad_norm": 4.6706037411530135, "learning_rate": 1.999992287974613e-06, "loss": 1.4393, "step": 750 }, { "epoch": 0.01, "grad_norm": 5.644118215700577, "learning_rate": 1.9999922671172445e-06, "loss": 1.3404, "step": 751 }, { "epoch": 0.01, "grad_norm": 4.762262865598859, "learning_rate": 1.9999922462317096e-06, "loss": 1.1912, "step": 752 }, { "epoch": 0.01, "grad_norm": 6.509565742435461, "learning_rate": 1.9999922253180088e-06, "loss": 1.5547, "step": 753 }, { "epoch": 0.01, "grad_norm": 4.906236638847442, "learning_rate": 1.999992204376141e-06, "loss": 1.5497, "step": 754 }, { "epoch": 0.01, "grad_norm": 5.389720327672479, "learning_rate": 1.999992183406107e-06, "loss": 1.5383, "step": 755 }, { "epoch": 0.01, "grad_norm": 6.936731396149539, "learning_rate": 1.9999921624079066e-06, "loss": 1.5598, "step": 756 }, { "epoch": 0.01, "grad_norm": 5.891074624493145, "learning_rate": 1.9999921413815393e-06, "loss": 1.4646, "step": 757 }, { "epoch": 0.01, "grad_norm": 5.931816574873316, "learning_rate": 1.9999921203270057e-06, "loss": 1.5956, "step": 758 }, { "epoch": 0.01, "grad_norm": 4.631245635122659, "learning_rate": 1.9999920992443056e-06, "loss": 1.3381, "step": 759 }, { "epoch": 0.01, "grad_norm": 4.69596344452636, "learning_rate": 1.9999920781334392e-06, "loss": 1.4328, "step": 760 }, { "epoch": 0.01, "grad_norm": 4.630359889510624, "learning_rate": 1.999992056994407e-06, "loss": 1.4445, "step": 761 }, { "epoch": 0.01, "grad_norm": 5.4898331473385795, "learning_rate": 1.9999920358272076e-06, "loss": 1.4693, "step": 762 }, { "epoch": 0.01, "grad_norm": 4.976976788582667, "learning_rate": 1.9999920146318416e-06, "loss": 1.4112, "step": 763 }, { "epoch": 0.01, "grad_norm": 5.642721073810829, "learning_rate": 1.9999919934083092e-06, "loss": 1.507, "step": 764 }, { "epoch": 0.01, "grad_norm": 4.938224038260358, "learning_rate": 1.999991972156611e-06, "loss": 1.4828, "step": 765 }, { "epoch": 0.01, "grad_norm": 5.4124846916190545, "learning_rate": 1.9999919508767457e-06, "loss": 1.5935, "step": 766 }, { "epoch": 0.01, "grad_norm": 5.0298956818677665, "learning_rate": 1.9999919295687146e-06, "loss": 1.5506, "step": 767 }, { "epoch": 0.01, "grad_norm": 4.902554353229615, "learning_rate": 1.9999919082325163e-06, "loss": 1.4076, "step": 768 }, { "epoch": 0.01, "grad_norm": 5.046670733247908, "learning_rate": 1.999991886868152e-06, "loss": 1.5376, "step": 769 }, { "epoch": 0.01, "grad_norm": 5.029796705277875, "learning_rate": 1.999991865475621e-06, "loss": 1.5587, "step": 770 }, { "epoch": 0.01, "grad_norm": 5.392708356030036, "learning_rate": 1.9999918440549237e-06, "loss": 1.5396, "step": 771 }, { "epoch": 0.01, "grad_norm": 6.421457119639919, "learning_rate": 1.9999918226060602e-06, "loss": 1.4929, "step": 772 }, { "epoch": 0.01, "grad_norm": 4.752656726954079, "learning_rate": 1.99999180112903e-06, "loss": 1.1638, "step": 773 }, { "epoch": 0.01, "grad_norm": 4.831954478385221, "learning_rate": 1.9999917796238333e-06, "loss": 1.4306, "step": 774 }, { "epoch": 0.01, "grad_norm": 5.101830458146313, "learning_rate": 1.99999175809047e-06, "loss": 1.4116, "step": 775 }, { "epoch": 0.01, "grad_norm": 4.404228318451748, "learning_rate": 1.9999917365289403e-06, "loss": 1.5313, "step": 776 }, { "epoch": 0.01, "grad_norm": 4.736260085200967, "learning_rate": 1.9999917149392445e-06, "loss": 1.4743, "step": 777 }, { "epoch": 0.01, "grad_norm": 4.839914114082853, "learning_rate": 1.9999916933213823e-06, "loss": 1.4311, "step": 778 }, { "epoch": 0.01, "grad_norm": 4.591247776677874, "learning_rate": 1.9999916716753536e-06, "loss": 1.4233, "step": 779 }, { "epoch": 0.01, "grad_norm": 4.6578387294024335, "learning_rate": 1.9999916500011582e-06, "loss": 1.5085, "step": 780 }, { "epoch": 0.01, "grad_norm": 4.93242923456189, "learning_rate": 1.9999916282987964e-06, "loss": 1.5293, "step": 781 }, { "epoch": 0.01, "grad_norm": 4.84156183065501, "learning_rate": 1.9999916065682682e-06, "loss": 1.4976, "step": 782 }, { "epoch": 0.01, "grad_norm": 4.780256207356861, "learning_rate": 1.9999915848095736e-06, "loss": 1.3376, "step": 783 }, { "epoch": 0.01, "grad_norm": 4.519528759054672, "learning_rate": 1.9999915630227127e-06, "loss": 1.4027, "step": 784 }, { "epoch": 0.01, "grad_norm": 10.363927799642175, "learning_rate": 1.9999915412076853e-06, "loss": 1.4659, "step": 785 }, { "epoch": 0.01, "grad_norm": 4.342877391406383, "learning_rate": 1.9999915193644916e-06, "loss": 1.4013, "step": 786 }, { "epoch": 0.01, "grad_norm": 4.813148676207006, "learning_rate": 1.999991497493131e-06, "loss": 1.4737, "step": 787 }, { "epoch": 0.01, "grad_norm": 5.745668191994972, "learning_rate": 1.999991475593604e-06, "loss": 1.2512, "step": 788 }, { "epoch": 0.01, "grad_norm": 5.796669035814396, "learning_rate": 1.999991453665911e-06, "loss": 1.318, "step": 789 }, { "epoch": 0.01, "grad_norm": 5.807933615414672, "learning_rate": 1.999991431710051e-06, "loss": 1.3862, "step": 790 }, { "epoch": 0.01, "grad_norm": 5.092568125176769, "learning_rate": 1.9999914097260254e-06, "loss": 1.5014, "step": 791 }, { "epoch": 0.01, "grad_norm": 6.824966187432624, "learning_rate": 1.999991387713833e-06, "loss": 1.4623, "step": 792 }, { "epoch": 0.01, "grad_norm": 5.0068907060455485, "learning_rate": 1.9999913656734736e-06, "loss": 1.4753, "step": 793 }, { "epoch": 0.01, "grad_norm": 4.838451440188585, "learning_rate": 1.9999913436049483e-06, "loss": 1.4478, "step": 794 }, { "epoch": 0.01, "grad_norm": 4.553033347591773, "learning_rate": 1.9999913215082567e-06, "loss": 1.5087, "step": 795 }, { "epoch": 0.01, "grad_norm": 4.639914766985077, "learning_rate": 1.9999912993833987e-06, "loss": 1.331, "step": 796 }, { "epoch": 0.01, "grad_norm": 5.035741243327864, "learning_rate": 1.999991277230374e-06, "loss": 1.4473, "step": 797 }, { "epoch": 0.01, "grad_norm": 6.0968663437701975, "learning_rate": 1.9999912550491826e-06, "loss": 1.5815, "step": 798 }, { "epoch": 0.01, "grad_norm": 4.730455270173384, "learning_rate": 1.999991232839825e-06, "loss": 1.4369, "step": 799 }, { "epoch": 0.01, "grad_norm": 5.219089987469455, "learning_rate": 1.9999912106023014e-06, "loss": 1.5169, "step": 800 }, { "epoch": 0.01, "grad_norm": 5.750971191979698, "learning_rate": 1.999991188336611e-06, "loss": 1.4153, "step": 801 }, { "epoch": 0.01, "grad_norm": 5.406182419591723, "learning_rate": 1.9999911660427543e-06, "loss": 1.4437, "step": 802 }, { "epoch": 0.01, "grad_norm": 5.063524970869385, "learning_rate": 1.9999911437207307e-06, "loss": 1.507, "step": 803 }, { "epoch": 0.01, "eval_loss": 1.6881712675094604, "eval_runtime": 4.626, "eval_samples_per_second": 1.946, "eval_steps_per_second": 1.081, "step": 803 }, { "epoch": 0.01, "grad_norm": 5.467776894504789, "learning_rate": 1.999991121370541e-06, "loss": 1.522, "step": 804 }, { "epoch": 0.01, "grad_norm": 4.971642369149256, "learning_rate": 1.999991098992185e-06, "loss": 1.5321, "step": 805 }, { "epoch": 0.01, "grad_norm": 4.814938121020308, "learning_rate": 1.999991076585663e-06, "loss": 1.433, "step": 806 }, { "epoch": 0.01, "grad_norm": 5.2209822250293065, "learning_rate": 1.9999910541509737e-06, "loss": 1.335, "step": 807 }, { "epoch": 0.01, "grad_norm": 5.296510549803633, "learning_rate": 1.9999910316881186e-06, "loss": 1.7574, "step": 808 }, { "epoch": 0.01, "grad_norm": 4.913991460057143, "learning_rate": 1.9999910091970967e-06, "loss": 1.5179, "step": 809 }, { "epoch": 0.01, "grad_norm": 4.668407347559244, "learning_rate": 1.9999909866779085e-06, "loss": 1.3712, "step": 810 }, { "epoch": 0.01, "grad_norm": 5.453144611957878, "learning_rate": 1.9999909641305538e-06, "loss": 1.3764, "step": 811 }, { "epoch": 0.01, "grad_norm": 5.2650106360957185, "learning_rate": 1.9999909415550327e-06, "loss": 1.4595, "step": 812 }, { "epoch": 0.01, "grad_norm": 5.250833767951291, "learning_rate": 1.9999909189513457e-06, "loss": 1.3007, "step": 813 }, { "epoch": 0.01, "grad_norm": 5.354917246649793, "learning_rate": 1.999990896319492e-06, "loss": 1.6322, "step": 814 }, { "epoch": 0.01, "grad_norm": 5.012749978414504, "learning_rate": 1.9999908736594716e-06, "loss": 1.5338, "step": 815 }, { "epoch": 0.01, "grad_norm": 4.848442817388502, "learning_rate": 1.999990850971285e-06, "loss": 1.2774, "step": 816 }, { "epoch": 0.01, "grad_norm": 6.984924234695706, "learning_rate": 1.9999908282549316e-06, "loss": 1.4143, "step": 817 }, { "epoch": 0.01, "grad_norm": 4.762256555459132, "learning_rate": 1.9999908055104122e-06, "loss": 1.3228, "step": 818 }, { "epoch": 0.01, "grad_norm": 4.844771964077254, "learning_rate": 1.9999907827377265e-06, "loss": 1.4724, "step": 819 }, { "epoch": 0.01, "grad_norm": 4.952200259787622, "learning_rate": 1.999990759936874e-06, "loss": 1.4344, "step": 820 }, { "epoch": 0.01, "grad_norm": 4.711382910161586, "learning_rate": 1.999990737107855e-06, "loss": 1.5817, "step": 821 }, { "epoch": 0.01, "grad_norm": 5.57192836371835, "learning_rate": 1.99999071425067e-06, "loss": 1.4359, "step": 822 }, { "epoch": 0.01, "grad_norm": 5.2612613709184775, "learning_rate": 1.9999906913653187e-06, "loss": 1.683, "step": 823 }, { "epoch": 0.01, "grad_norm": 4.651207872271117, "learning_rate": 1.9999906684518005e-06, "loss": 1.4202, "step": 824 }, { "epoch": 0.01, "grad_norm": 4.33548743621847, "learning_rate": 1.999990645510116e-06, "loss": 1.2962, "step": 825 }, { "epoch": 0.01, "grad_norm": 4.998357754076746, "learning_rate": 1.9999906225402655e-06, "loss": 1.4576, "step": 826 }, { "epoch": 0.01, "grad_norm": 5.936966406696661, "learning_rate": 1.9999905995422483e-06, "loss": 1.4039, "step": 827 }, { "epoch": 0.01, "grad_norm": 5.330845387260364, "learning_rate": 1.9999905765160646e-06, "loss": 1.5228, "step": 828 }, { "epoch": 0.01, "grad_norm": 4.8056099201876705, "learning_rate": 1.9999905534617145e-06, "loss": 1.3755, "step": 829 }, { "epoch": 0.01, "grad_norm": 5.029573312366041, "learning_rate": 1.999990530379198e-06, "loss": 1.5501, "step": 830 }, { "epoch": 0.01, "grad_norm": 4.539685125822293, "learning_rate": 1.999990507268515e-06, "loss": 1.3357, "step": 831 }, { "epoch": 0.01, "grad_norm": 4.880928277155648, "learning_rate": 1.9999904841296656e-06, "loss": 1.6046, "step": 832 }, { "epoch": 0.01, "grad_norm": 5.503556293375853, "learning_rate": 1.99999046096265e-06, "loss": 1.7223, "step": 833 }, { "epoch": 0.01, "grad_norm": 6.225897163215507, "learning_rate": 1.999990437767468e-06, "loss": 1.3195, "step": 834 }, { "epoch": 0.01, "grad_norm": 7.690064080550938, "learning_rate": 1.9999904145441196e-06, "loss": 1.6465, "step": 835 }, { "epoch": 0.01, "grad_norm": 4.5459726864219725, "learning_rate": 1.999990391292605e-06, "loss": 1.4026, "step": 836 }, { "epoch": 0.01, "grad_norm": 4.680412720191206, "learning_rate": 1.9999903680129237e-06, "loss": 1.4353, "step": 837 }, { "epoch": 0.01, "grad_norm": 7.622275997037577, "learning_rate": 1.9999903447050758e-06, "loss": 1.5267, "step": 838 }, { "epoch": 0.01, "grad_norm": 4.741060382625769, "learning_rate": 1.999990321369062e-06, "loss": 1.4955, "step": 839 }, { "epoch": 0.01, "grad_norm": 4.875869067274173, "learning_rate": 1.999990298004881e-06, "loss": 1.5815, "step": 840 }, { "epoch": 0.01, "grad_norm": 4.6600192764255395, "learning_rate": 1.9999902746125344e-06, "loss": 1.3205, "step": 841 }, { "epoch": 0.01, "grad_norm": 5.551113040685454, "learning_rate": 1.999990251192021e-06, "loss": 1.6559, "step": 842 }, { "epoch": 0.01, "grad_norm": 5.3125600816949365, "learning_rate": 1.9999902277433414e-06, "loss": 1.422, "step": 843 }, { "epoch": 0.01, "grad_norm": 5.0897853002323155, "learning_rate": 1.999990204266495e-06, "loss": 1.429, "step": 844 }, { "epoch": 0.01, "grad_norm": 4.620329234269747, "learning_rate": 1.999990180761483e-06, "loss": 1.3901, "step": 845 }, { "epoch": 0.01, "grad_norm": 4.952006426703027, "learning_rate": 1.999990157228304e-06, "loss": 1.4605, "step": 846 }, { "epoch": 0.01, "grad_norm": 5.051790815462307, "learning_rate": 1.999990133666959e-06, "loss": 1.5361, "step": 847 }, { "epoch": 0.01, "grad_norm": 4.723454559230461, "learning_rate": 1.999990110077447e-06, "loss": 1.4063, "step": 848 }, { "epoch": 0.01, "grad_norm": 4.714623591910726, "learning_rate": 1.999990086459769e-06, "loss": 1.4844, "step": 849 }, { "epoch": 0.01, "grad_norm": 4.866772682439225, "learning_rate": 1.999990062813924e-06, "loss": 1.4243, "step": 850 }, { "epoch": 0.01, "grad_norm": 4.982727315839908, "learning_rate": 1.999990039139913e-06, "loss": 1.5169, "step": 851 }, { "epoch": 0.01, "grad_norm": 4.88323303732355, "learning_rate": 1.9999900154377363e-06, "loss": 1.331, "step": 852 }, { "epoch": 0.01, "grad_norm": 4.98858734123965, "learning_rate": 1.9999899917073925e-06, "loss": 1.4386, "step": 853 }, { "epoch": 0.01, "grad_norm": 5.7732616301555, "learning_rate": 1.9999899679488823e-06, "loss": 1.483, "step": 854 }, { "epoch": 0.01, "grad_norm": 10.195015855643826, "learning_rate": 1.9999899441622062e-06, "loss": 1.489, "step": 855 }, { "epoch": 0.01, "grad_norm": 4.777377833871161, "learning_rate": 1.9999899203473633e-06, "loss": 1.3607, "step": 856 }, { "epoch": 0.01, "grad_norm": 8.851948399516926, "learning_rate": 1.999989896504354e-06, "loss": 1.3547, "step": 857 }, { "epoch": 0.01, "grad_norm": 4.7729850075029905, "learning_rate": 1.9999898726331787e-06, "loss": 1.3561, "step": 858 }, { "epoch": 0.01, "grad_norm": 4.764811136272429, "learning_rate": 1.9999898487338367e-06, "loss": 1.3792, "step": 859 }, { "epoch": 0.01, "grad_norm": 4.889712073966274, "learning_rate": 1.999989824806328e-06, "loss": 1.3774, "step": 860 }, { "epoch": 0.01, "grad_norm": 5.465422723384496, "learning_rate": 1.9999898008506533e-06, "loss": 1.3464, "step": 861 }, { "epoch": 0.01, "grad_norm": 5.480222802088414, "learning_rate": 1.9999897768668125e-06, "loss": 1.5737, "step": 862 }, { "epoch": 0.01, "grad_norm": 5.492784136252579, "learning_rate": 1.999989752854805e-06, "loss": 1.5299, "step": 863 }, { "epoch": 0.01, "grad_norm": 5.194980321543087, "learning_rate": 1.999989728814631e-06, "loss": 1.2601, "step": 864 }, { "epoch": 0.01, "grad_norm": 5.373628545087017, "learning_rate": 1.9999897047462905e-06, "loss": 1.6205, "step": 865 }, { "epoch": 0.01, "grad_norm": 4.834528620174113, "learning_rate": 1.9999896806497837e-06, "loss": 1.3969, "step": 866 }, { "epoch": 0.01, "grad_norm": 5.365382013388016, "learning_rate": 1.999989656525111e-06, "loss": 1.4079, "step": 867 }, { "epoch": 0.01, "grad_norm": 5.10671741683615, "learning_rate": 1.9999896323722714e-06, "loss": 1.5323, "step": 868 }, { "epoch": 0.01, "grad_norm": 4.794974200564266, "learning_rate": 1.9999896081912655e-06, "loss": 1.5685, "step": 869 }, { "epoch": 0.01, "grad_norm": 5.460786387110753, "learning_rate": 1.999989583982093e-06, "loss": 1.4878, "step": 870 }, { "epoch": 0.01, "grad_norm": 4.954812125157006, "learning_rate": 1.999989559744755e-06, "loss": 1.6783, "step": 871 }, { "epoch": 0.01, "grad_norm": 4.893553227272597, "learning_rate": 1.9999895354792497e-06, "loss": 1.5179, "step": 872 }, { "epoch": 0.01, "grad_norm": 4.971125258081157, "learning_rate": 1.9999895111855786e-06, "loss": 1.42, "step": 873 }, { "epoch": 0.01, "grad_norm": 5.466531237826114, "learning_rate": 1.9999894868637408e-06, "loss": 1.4594, "step": 874 }, { "epoch": 0.01, "grad_norm": 4.643918053264017, "learning_rate": 1.9999894625137365e-06, "loss": 1.4087, "step": 875 }, { "epoch": 0.01, "grad_norm": 4.946505174466154, "learning_rate": 1.999989438135566e-06, "loss": 1.5287, "step": 876 }, { "epoch": 0.01, "eval_loss": 1.6835453510284424, "eval_runtime": 4.6442, "eval_samples_per_second": 1.938, "eval_steps_per_second": 1.077, "step": 876 }, { "epoch": 0.01, "grad_norm": 5.239412305756631, "learning_rate": 1.9999894137292292e-06, "loss": 1.3614, "step": 877 }, { "epoch": 0.01, "grad_norm": 5.5395595579858075, "learning_rate": 1.999989389294726e-06, "loss": 1.5485, "step": 878 }, { "epoch": 0.01, "grad_norm": 4.712593321081159, "learning_rate": 1.9999893648320564e-06, "loss": 1.4338, "step": 879 }, { "epoch": 0.01, "grad_norm": 4.458574377375978, "learning_rate": 1.9999893403412202e-06, "loss": 1.41, "step": 880 }, { "epoch": 0.01, "grad_norm": 5.551492101493687, "learning_rate": 1.999989315822218e-06, "loss": 1.3577, "step": 881 }, { "epoch": 0.01, "grad_norm": 5.207575524594766, "learning_rate": 1.999989291275049e-06, "loss": 1.5347, "step": 882 }, { "epoch": 0.01, "grad_norm": 5.6630265828087, "learning_rate": 1.999989266699714e-06, "loss": 1.5364, "step": 883 }, { "epoch": 0.01, "grad_norm": 4.879665992047349, "learning_rate": 1.9999892420962124e-06, "loss": 1.4612, "step": 884 }, { "epoch": 0.01, "grad_norm": 5.561642320920106, "learning_rate": 1.9999892174645447e-06, "loss": 1.3238, "step": 885 }, { "epoch": 0.01, "grad_norm": 4.803746896535269, "learning_rate": 1.9999891928047106e-06, "loss": 1.3759, "step": 886 }, { "epoch": 0.01, "grad_norm": 4.881001838381561, "learning_rate": 1.99998916811671e-06, "loss": 1.3259, "step": 887 }, { "epoch": 0.01, "grad_norm": 4.9185588818445165, "learning_rate": 1.9999891434005433e-06, "loss": 1.468, "step": 888 }, { "epoch": 0.01, "grad_norm": 6.4140094573174045, "learning_rate": 1.9999891186562096e-06, "loss": 1.5786, "step": 889 }, { "epoch": 0.01, "grad_norm": 5.4016730660955625, "learning_rate": 1.99998909388371e-06, "loss": 1.5057, "step": 890 }, { "epoch": 0.01, "grad_norm": 5.4186135870000145, "learning_rate": 1.999989069083044e-06, "loss": 1.5624, "step": 891 }, { "epoch": 0.01, "grad_norm": 4.747116381476095, "learning_rate": 1.9999890442542115e-06, "loss": 1.4637, "step": 892 }, { "epoch": 0.01, "grad_norm": 5.40275388365515, "learning_rate": 1.9999890193972127e-06, "loss": 1.5146, "step": 893 }, { "epoch": 0.01, "grad_norm": 5.991497086501674, "learning_rate": 1.9999889945120475e-06, "loss": 1.4213, "step": 894 }, { "epoch": 0.01, "grad_norm": 5.054323720643013, "learning_rate": 1.999988969598716e-06, "loss": 1.523, "step": 895 }, { "epoch": 0.01, "grad_norm": 4.610288116972087, "learning_rate": 1.9999889446572184e-06, "loss": 1.351, "step": 896 }, { "epoch": 0.01, "grad_norm": 5.3918041618620896, "learning_rate": 1.999988919687554e-06, "loss": 1.5724, "step": 897 }, { "epoch": 0.01, "grad_norm": 5.833705802062882, "learning_rate": 1.9999888946897233e-06, "loss": 1.572, "step": 898 }, { "epoch": 0.01, "grad_norm": 5.166823884179871, "learning_rate": 1.999988869663726e-06, "loss": 1.3641, "step": 899 }, { "epoch": 0.01, "grad_norm": 4.444407021344696, "learning_rate": 1.999988844609563e-06, "loss": 1.4219, "step": 900 }, { "epoch": 0.01, "grad_norm": 4.960252287317098, "learning_rate": 1.9999888195272332e-06, "loss": 1.4062, "step": 901 }, { "epoch": 0.01, "grad_norm": 5.074258501650094, "learning_rate": 1.9999887944167374e-06, "loss": 1.5055, "step": 902 }, { "epoch": 0.01, "grad_norm": 4.815381295386099, "learning_rate": 1.9999887692780747e-06, "loss": 1.5434, "step": 903 }, { "epoch": 0.01, "grad_norm": 4.811556472980446, "learning_rate": 1.999988744111246e-06, "loss": 1.4531, "step": 904 }, { "epoch": 0.01, "grad_norm": 6.5338580513110704, "learning_rate": 1.999988718916251e-06, "loss": 1.5654, "step": 905 }, { "epoch": 0.01, "grad_norm": 4.8547503509093985, "learning_rate": 1.9999886936930897e-06, "loss": 1.4963, "step": 906 }, { "epoch": 0.01, "grad_norm": 4.9529825947141655, "learning_rate": 1.9999886684417614e-06, "loss": 1.424, "step": 907 }, { "epoch": 0.01, "grad_norm": 4.992257236115625, "learning_rate": 1.9999886431622673e-06, "loss": 1.4438, "step": 908 }, { "epoch": 0.01, "grad_norm": 4.607129503512642, "learning_rate": 1.999988617854607e-06, "loss": 1.4505, "step": 909 }, { "epoch": 0.01, "grad_norm": 6.157982067067892, "learning_rate": 1.99998859251878e-06, "loss": 1.5385, "step": 910 }, { "epoch": 0.01, "grad_norm": 5.084561010511645, "learning_rate": 1.999988567154787e-06, "loss": 1.4966, "step": 911 }, { "epoch": 0.01, "grad_norm": 4.81877282821209, "learning_rate": 1.999988541762627e-06, "loss": 1.4731, "step": 912 }, { "epoch": 0.01, "grad_norm": 5.009288310745413, "learning_rate": 1.9999885163423014e-06, "loss": 1.432, "step": 913 }, { "epoch": 0.01, "grad_norm": 5.2256414414976184, "learning_rate": 1.999988490893809e-06, "loss": 1.5411, "step": 914 }, { "epoch": 0.01, "grad_norm": 5.261047012186723, "learning_rate": 1.99998846541715e-06, "loss": 1.6397, "step": 915 }, { "epoch": 0.01, "grad_norm": 4.657445695072026, "learning_rate": 1.9999884399123252e-06, "loss": 1.4764, "step": 916 }, { "epoch": 0.01, "grad_norm": 5.668390029076262, "learning_rate": 1.999988414379334e-06, "loss": 1.5984, "step": 917 }, { "epoch": 0.01, "grad_norm": 4.810866617300379, "learning_rate": 1.9999883888181764e-06, "loss": 1.4278, "step": 918 }, { "epoch": 0.01, "grad_norm": 8.825331474808909, "learning_rate": 1.9999883632288524e-06, "loss": 1.5525, "step": 919 }, { "epoch": 0.01, "grad_norm": 6.055755208418581, "learning_rate": 1.999988337611362e-06, "loss": 1.4505, "step": 920 }, { "epoch": 0.01, "grad_norm": 4.750972505525963, "learning_rate": 1.999988311965705e-06, "loss": 1.486, "step": 921 }, { "epoch": 0.01, "grad_norm": 6.217010575239663, "learning_rate": 1.999988286291882e-06, "loss": 1.7214, "step": 922 }, { "epoch": 0.01, "grad_norm": 5.507300171459353, "learning_rate": 1.9999882605898925e-06, "loss": 1.346, "step": 923 }, { "epoch": 0.01, "grad_norm": 5.017321187244063, "learning_rate": 1.999988234859737e-06, "loss": 1.5818, "step": 924 }, { "epoch": 0.01, "grad_norm": 5.028645991367795, "learning_rate": 1.999988209101415e-06, "loss": 1.5805, "step": 925 }, { "epoch": 0.01, "grad_norm": 7.2062677616231765, "learning_rate": 1.9999881833149264e-06, "loss": 1.6537, "step": 926 }, { "epoch": 0.01, "grad_norm": 5.145636588072226, "learning_rate": 1.9999881575002717e-06, "loss": 1.4681, "step": 927 }, { "epoch": 0.01, "grad_norm": 4.828710624451239, "learning_rate": 1.99998813165745e-06, "loss": 1.496, "step": 928 }, { "epoch": 0.01, "grad_norm": 5.549188105899666, "learning_rate": 1.9999881057864627e-06, "loss": 1.3865, "step": 929 }, { "epoch": 0.01, "grad_norm": 5.443520100837284, "learning_rate": 1.9999880798873093e-06, "loss": 1.2829, "step": 930 }, { "epoch": 0.01, "grad_norm": 5.13373771642738, "learning_rate": 1.999988053959989e-06, "loss": 1.5362, "step": 931 }, { "epoch": 0.01, "grad_norm": 4.999494186839008, "learning_rate": 1.9999880280045025e-06, "loss": 1.3462, "step": 932 }, { "epoch": 0.01, "grad_norm": 5.190959037095125, "learning_rate": 1.9999880020208495e-06, "loss": 1.4841, "step": 933 }, { "epoch": 0.01, "grad_norm": 4.780119303450498, "learning_rate": 1.9999879760090306e-06, "loss": 1.5854, "step": 934 }, { "epoch": 0.01, "grad_norm": 5.066755607666266, "learning_rate": 1.9999879499690452e-06, "loss": 1.3359, "step": 935 }, { "epoch": 0.01, "grad_norm": 5.252072684548736, "learning_rate": 1.999987923900893e-06, "loss": 1.4493, "step": 936 }, { "epoch": 0.01, "grad_norm": 4.711513346090706, "learning_rate": 1.999987897804575e-06, "loss": 1.2207, "step": 937 }, { "epoch": 0.01, "grad_norm": 7.728638763732817, "learning_rate": 1.9999878716800904e-06, "loss": 1.3846, "step": 938 }, { "epoch": 0.01, "grad_norm": 5.278677797997787, "learning_rate": 1.9999878455274396e-06, "loss": 1.336, "step": 939 }, { "epoch": 0.01, "grad_norm": 5.0220126825749425, "learning_rate": 1.9999878193466223e-06, "loss": 1.4228, "step": 940 }, { "epoch": 0.01, "grad_norm": 5.034910106052013, "learning_rate": 1.999987793137639e-06, "loss": 1.3907, "step": 941 }, { "epoch": 0.01, "grad_norm": 4.889404163229133, "learning_rate": 1.9999877669004894e-06, "loss": 1.4144, "step": 942 }, { "epoch": 0.01, "grad_norm": 5.162469539125017, "learning_rate": 1.999987740635173e-06, "loss": 1.5852, "step": 943 }, { "epoch": 0.01, "grad_norm": 5.399193356261801, "learning_rate": 1.9999877143416906e-06, "loss": 1.5513, "step": 944 }, { "epoch": 0.01, "grad_norm": 5.138316370644956, "learning_rate": 1.9999876880200418e-06, "loss": 1.467, "step": 945 }, { "epoch": 0.01, "grad_norm": 4.767554379371538, "learning_rate": 1.9999876616702266e-06, "loss": 1.4015, "step": 946 }, { "epoch": 0.01, "grad_norm": 4.432154096584277, "learning_rate": 1.9999876352922455e-06, "loss": 1.3144, "step": 947 }, { "epoch": 0.01, "grad_norm": 5.6901051217643035, "learning_rate": 1.9999876088860975e-06, "loss": 1.6762, "step": 948 }, { "epoch": 0.01, "grad_norm": 4.89608317041777, "learning_rate": 1.999987582451783e-06, "loss": 1.4469, "step": 949 }, { "epoch": 0.01, "eval_loss": 1.6775009632110596, "eval_runtime": 4.6401, "eval_samples_per_second": 1.94, "eval_steps_per_second": 1.078, "step": 949 }, { "epoch": 0.01, "grad_norm": 4.685652449039854, "learning_rate": 1.999987555989303e-06, "loss": 1.3646, "step": 950 }, { "epoch": 0.01, "grad_norm": 5.980010862799234, "learning_rate": 1.9999875294986562e-06, "loss": 1.5188, "step": 951 }, { "epoch": 0.01, "grad_norm": 5.218097124666707, "learning_rate": 1.999987502979843e-06, "loss": 1.4779, "step": 952 }, { "epoch": 0.01, "grad_norm": 4.803466971822877, "learning_rate": 1.9999874764328637e-06, "loss": 1.3129, "step": 953 }, { "epoch": 0.01, "grad_norm": 4.729648111188237, "learning_rate": 1.999987449857718e-06, "loss": 1.4093, "step": 954 }, { "epoch": 0.01, "grad_norm": 5.666417668066331, "learning_rate": 1.999987423254406e-06, "loss": 1.637, "step": 955 }, { "epoch": 0.01, "grad_norm": 4.92053994601316, "learning_rate": 1.999987396622928e-06, "loss": 1.4616, "step": 956 }, { "epoch": 0.01, "grad_norm": 4.729794144404417, "learning_rate": 1.999987369963283e-06, "loss": 1.2795, "step": 957 }, { "epoch": 0.01, "grad_norm": 5.48059643960075, "learning_rate": 1.999987343275472e-06, "loss": 1.4871, "step": 958 }, { "epoch": 0.01, "grad_norm": 4.807955183113965, "learning_rate": 1.9999873165594945e-06, "loss": 1.4542, "step": 959 }, { "epoch": 0.01, "grad_norm": 5.3782702475558475, "learning_rate": 1.999987289815351e-06, "loss": 1.4009, "step": 960 }, { "epoch": 0.01, "grad_norm": 5.465383132137198, "learning_rate": 1.999987263043041e-06, "loss": 1.5234, "step": 961 }, { "epoch": 0.01, "grad_norm": 4.863141270288387, "learning_rate": 1.9999872362425646e-06, "loss": 1.328, "step": 962 }, { "epoch": 0.01, "grad_norm": 5.513281389437473, "learning_rate": 1.999987209413922e-06, "loss": 1.3662, "step": 963 }, { "epoch": 0.01, "grad_norm": 6.182406119541521, "learning_rate": 1.9999871825571133e-06, "loss": 1.4304, "step": 964 }, { "epoch": 0.01, "grad_norm": 4.682777815258765, "learning_rate": 1.999987155672138e-06, "loss": 1.3698, "step": 965 }, { "epoch": 0.01, "grad_norm": 5.698433297833218, "learning_rate": 1.999987128758997e-06, "loss": 1.4941, "step": 966 }, { "epoch": 0.01, "grad_norm": 5.122381225960383, "learning_rate": 1.9999871018176888e-06, "loss": 1.4768, "step": 967 }, { "epoch": 0.01, "grad_norm": 4.72681263242357, "learning_rate": 1.999987074848215e-06, "loss": 1.3749, "step": 968 }, { "epoch": 0.01, "grad_norm": 5.507215592270518, "learning_rate": 1.999987047850574e-06, "loss": 1.5244, "step": 969 }, { "epoch": 0.01, "grad_norm": 4.870388076906783, "learning_rate": 1.9999870208247677e-06, "loss": 1.5131, "step": 970 }, { "epoch": 0.01, "grad_norm": 4.513166729875274, "learning_rate": 1.999986993770794e-06, "loss": 1.4241, "step": 971 }, { "epoch": 0.01, "grad_norm": 4.820438122653338, "learning_rate": 1.999986966688655e-06, "loss": 1.2826, "step": 972 }, { "epoch": 0.01, "grad_norm": 4.839651321923382, "learning_rate": 1.9999869395783495e-06, "loss": 1.2476, "step": 973 }, { "epoch": 0.01, "grad_norm": 5.720429959744313, "learning_rate": 1.999986912439877e-06, "loss": 1.4291, "step": 974 }, { "epoch": 0.01, "grad_norm": 4.4799874056887745, "learning_rate": 1.999986885273239e-06, "loss": 1.3034, "step": 975 }, { "epoch": 0.01, "grad_norm": 5.966740554354801, "learning_rate": 1.9999868580784343e-06, "loss": 1.346, "step": 976 }, { "epoch": 0.01, "grad_norm": 4.748631285522209, "learning_rate": 1.9999868308554637e-06, "loss": 1.3915, "step": 977 }, { "epoch": 0.01, "grad_norm": 5.117205503667891, "learning_rate": 1.9999868036043262e-06, "loss": 1.531, "step": 978 }, { "epoch": 0.01, "grad_norm": 4.761977448039064, "learning_rate": 1.999986776325023e-06, "loss": 1.5092, "step": 979 }, { "epoch": 0.01, "grad_norm": 5.383839203745775, "learning_rate": 1.999986749017553e-06, "loss": 1.4312, "step": 980 }, { "epoch": 0.01, "grad_norm": 4.898589027829305, "learning_rate": 1.999986721681917e-06, "loss": 1.4059, "step": 981 }, { "epoch": 0.01, "grad_norm": 7.05243832236956, "learning_rate": 1.9999866943181144e-06, "loss": 1.4345, "step": 982 }, { "epoch": 0.01, "grad_norm": 8.171888830122464, "learning_rate": 1.999986666926146e-06, "loss": 1.5769, "step": 983 }, { "epoch": 0.01, "grad_norm": 5.268910529823187, "learning_rate": 1.999986639506011e-06, "loss": 1.5645, "step": 984 }, { "epoch": 0.01, "grad_norm": 5.1967337373113525, "learning_rate": 1.9999866120577097e-06, "loss": 1.5157, "step": 985 }, { "epoch": 0.01, "grad_norm": 5.42988030195917, "learning_rate": 1.999986584581242e-06, "loss": 1.5816, "step": 986 }, { "epoch": 0.01, "grad_norm": 4.73138904838603, "learning_rate": 1.999986557076608e-06, "loss": 1.5813, "step": 987 }, { "epoch": 0.01, "grad_norm": 4.753580436572943, "learning_rate": 1.999986529543808e-06, "loss": 1.4126, "step": 988 }, { "epoch": 0.01, "grad_norm": 4.7046114145354565, "learning_rate": 1.999986501982841e-06, "loss": 1.5255, "step": 989 }, { "epoch": 0.01, "grad_norm": 7.691510890894805, "learning_rate": 1.9999864743937083e-06, "loss": 1.4457, "step": 990 }, { "epoch": 0.01, "grad_norm": 5.010108387001166, "learning_rate": 1.9999864467764095e-06, "loss": 1.2545, "step": 991 }, { "epoch": 0.01, "grad_norm": 5.100682664462816, "learning_rate": 1.9999864191309444e-06, "loss": 1.39, "step": 992 }, { "epoch": 0.01, "grad_norm": 5.918040690814786, "learning_rate": 1.9999863914573124e-06, "loss": 1.3931, "step": 993 }, { "epoch": 0.01, "grad_norm": 13.32376088870836, "learning_rate": 1.9999863637555145e-06, "loss": 1.5364, "step": 994 }, { "epoch": 0.01, "grad_norm": 6.404294035760159, "learning_rate": 1.9999863360255502e-06, "loss": 1.6188, "step": 995 }, { "epoch": 0.01, "grad_norm": 4.395708410571946, "learning_rate": 1.99998630826742e-06, "loss": 1.4324, "step": 996 }, { "epoch": 0.01, "grad_norm": 4.64843688225223, "learning_rate": 1.999986280481123e-06, "loss": 1.4221, "step": 997 }, { "epoch": 0.01, "grad_norm": 9.483948754696783, "learning_rate": 1.99998625266666e-06, "loss": 1.4758, "step": 998 }, { "epoch": 0.01, "grad_norm": 6.680984970928925, "learning_rate": 1.999986224824031e-06, "loss": 1.3525, "step": 999 }, { "epoch": 0.01, "grad_norm": 4.5483509269337326, "learning_rate": 1.9999861969532346e-06, "loss": 1.5931, "step": 1000 }, { "epoch": 0.01, "grad_norm": 4.685591189533344, "learning_rate": 1.999986169054273e-06, "loss": 1.4752, "step": 1001 }, { "epoch": 0.01, "grad_norm": 5.027111632819377, "learning_rate": 1.9999861411271447e-06, "loss": 1.4839, "step": 1002 }, { "epoch": 0.01, "grad_norm": 4.813630389301654, "learning_rate": 1.99998611317185e-06, "loss": 1.2419, "step": 1003 }, { "epoch": 0.01, "grad_norm": 4.8555527438063875, "learning_rate": 1.9999860851883896e-06, "loss": 1.5733, "step": 1004 }, { "epoch": 0.01, "grad_norm": 4.863459059811092, "learning_rate": 1.9999860571767623e-06, "loss": 1.4519, "step": 1005 }, { "epoch": 0.01, "grad_norm": 8.069266897105678, "learning_rate": 1.999986029136969e-06, "loss": 1.5143, "step": 1006 }, { "epoch": 0.01, "grad_norm": 6.396817433935851, "learning_rate": 1.9999860010690093e-06, "loss": 1.4442, "step": 1007 }, { "epoch": 0.01, "grad_norm": 4.846992814886912, "learning_rate": 1.9999859729728833e-06, "loss": 1.514, "step": 1008 }, { "epoch": 0.01, "grad_norm": 4.610189787028616, "learning_rate": 1.9999859448485912e-06, "loss": 1.3104, "step": 1009 }, { "epoch": 0.01, "grad_norm": 5.131296471286178, "learning_rate": 1.999985916696133e-06, "loss": 1.3135, "step": 1010 }, { "epoch": 0.01, "grad_norm": 5.091140293402042, "learning_rate": 1.999985888515508e-06, "loss": 1.5342, "step": 1011 }, { "epoch": 0.01, "grad_norm": 5.120127148783947, "learning_rate": 1.999985860306717e-06, "loss": 1.5248, "step": 1012 }, { "epoch": 0.01, "grad_norm": 5.479120892520487, "learning_rate": 1.9999858320697597e-06, "loss": 1.5342, "step": 1013 }, { "epoch": 0.01, "grad_norm": 5.065215097181712, "learning_rate": 1.999985803804636e-06, "loss": 1.5236, "step": 1014 }, { "epoch": 0.01, "grad_norm": 4.790915278833422, "learning_rate": 1.9999857755113463e-06, "loss": 1.5493, "step": 1015 }, { "epoch": 0.01, "grad_norm": 5.245342572510602, "learning_rate": 1.99998574718989e-06, "loss": 1.4241, "step": 1016 }, { "epoch": 0.01, "grad_norm": 5.450685311013884, "learning_rate": 1.9999857188402677e-06, "loss": 1.5392, "step": 1017 }, { "epoch": 0.01, "grad_norm": 5.167929668678754, "learning_rate": 1.9999856904624786e-06, "loss": 1.4839, "step": 1018 }, { "epoch": 0.01, "grad_norm": 4.655577606549074, "learning_rate": 1.999985662056524e-06, "loss": 1.3708, "step": 1019 }, { "epoch": 0.01, "grad_norm": 5.083589051170835, "learning_rate": 1.9999856336224026e-06, "loss": 1.4276, "step": 1020 }, { "epoch": 0.01, "grad_norm": 7.518979837632458, "learning_rate": 1.9999856051601156e-06, "loss": 1.5506, "step": 1021 }, { "epoch": 0.01, "grad_norm": 9.022963635961508, "learning_rate": 1.9999855766696614e-06, "loss": 1.503, "step": 1022 }, { "epoch": 0.01, "eval_loss": 1.6751856803894043, "eval_runtime": 4.6457, "eval_samples_per_second": 1.937, "eval_steps_per_second": 1.076, "step": 1022 }, { "epoch": 0.01, "grad_norm": 4.888363610346068, "learning_rate": 1.9999855481510416e-06, "loss": 1.3999, "step": 1023 }, { "epoch": 0.01, "grad_norm": 6.785844821190048, "learning_rate": 1.999985519604255e-06, "loss": 1.5704, "step": 1024 }, { "epoch": 0.01, "grad_norm": 5.817704365569473, "learning_rate": 1.9999854910293026e-06, "loss": 1.5568, "step": 1025 }, { "epoch": 0.01, "grad_norm": 4.359712088912163, "learning_rate": 1.9999854624261837e-06, "loss": 1.2568, "step": 1026 }, { "epoch": 0.01, "grad_norm": 5.591954237737313, "learning_rate": 1.999985433794899e-06, "loss": 1.4747, "step": 1027 }, { "epoch": 0.01, "grad_norm": 5.102013647933071, "learning_rate": 1.9999854051354476e-06, "loss": 1.4146, "step": 1028 }, { "epoch": 0.01, "grad_norm": 4.931384249700325, "learning_rate": 1.99998537644783e-06, "loss": 1.4324, "step": 1029 }, { "epoch": 0.01, "grad_norm": 4.59036254886226, "learning_rate": 1.999985347732046e-06, "loss": 1.467, "step": 1030 }, { "epoch": 0.01, "grad_norm": 4.994175894124642, "learning_rate": 1.999985318988096e-06, "loss": 1.3865, "step": 1031 }, { "epoch": 0.01, "grad_norm": 4.545053819093002, "learning_rate": 1.999985290215979e-06, "loss": 1.4351, "step": 1032 }, { "epoch": 0.01, "grad_norm": 5.004106893385651, "learning_rate": 1.999985261415697e-06, "loss": 1.4621, "step": 1033 }, { "epoch": 0.01, "grad_norm": 6.279172341870177, "learning_rate": 1.9999852325872476e-06, "loss": 1.7436, "step": 1034 }, { "epoch": 0.01, "grad_norm": 5.343377788723042, "learning_rate": 1.9999852037306325e-06, "loss": 1.5041, "step": 1035 }, { "epoch": 0.01, "grad_norm": 5.268385681335901, "learning_rate": 1.999985174845851e-06, "loss": 1.5105, "step": 1036 }, { "epoch": 0.01, "grad_norm": 6.048875950329858, "learning_rate": 1.9999851459329036e-06, "loss": 1.5769, "step": 1037 }, { "epoch": 0.01, "grad_norm": 5.74228089002837, "learning_rate": 1.9999851169917897e-06, "loss": 1.5507, "step": 1038 }, { "epoch": 0.01, "grad_norm": 4.698397540213852, "learning_rate": 1.9999850880225095e-06, "loss": 1.3009, "step": 1039 }, { "epoch": 0.01, "grad_norm": 4.641564143897353, "learning_rate": 1.999985059025063e-06, "loss": 1.3415, "step": 1040 }, { "epoch": 0.01, "grad_norm": 4.77396271067677, "learning_rate": 1.99998502999945e-06, "loss": 1.4983, "step": 1041 }, { "epoch": 0.01, "grad_norm": 4.772762227532076, "learning_rate": 1.9999850009456713e-06, "loss": 1.3664, "step": 1042 }, { "epoch": 0.01, "grad_norm": 4.914962455888292, "learning_rate": 1.999984971863726e-06, "loss": 1.4483, "step": 1043 }, { "epoch": 0.01, "grad_norm": 5.292636683131925, "learning_rate": 1.9999849427536147e-06, "loss": 1.614, "step": 1044 }, { "epoch": 0.01, "grad_norm": 4.816438676458731, "learning_rate": 1.999984913615337e-06, "loss": 1.4438, "step": 1045 }, { "epoch": 0.01, "grad_norm": 4.61943572442077, "learning_rate": 1.999984884448893e-06, "loss": 1.3256, "step": 1046 }, { "epoch": 0.01, "grad_norm": 16.818243184224574, "learning_rate": 1.999984855254283e-06, "loss": 1.5516, "step": 1047 }, { "epoch": 0.01, "grad_norm": 4.944906131189107, "learning_rate": 1.9999848260315064e-06, "loss": 1.4782, "step": 1048 }, { "epoch": 0.01, "grad_norm": 5.372035876331552, "learning_rate": 1.9999847967805635e-06, "loss": 1.5814, "step": 1049 }, { "epoch": 0.01, "grad_norm": 5.05598468387018, "learning_rate": 1.9999847675014548e-06, "loss": 1.4904, "step": 1050 }, { "epoch": 0.01, "grad_norm": 8.983166835216965, "learning_rate": 1.9999847381941796e-06, "loss": 1.5657, "step": 1051 }, { "epoch": 0.01, "grad_norm": 5.880234134964031, "learning_rate": 1.999984708858738e-06, "loss": 1.2615, "step": 1052 }, { "epoch": 0.01, "grad_norm": 5.383111410300504, "learning_rate": 1.9999846794951305e-06, "loss": 1.5257, "step": 1053 }, { "epoch": 0.01, "grad_norm": 5.319783609516555, "learning_rate": 1.9999846501033566e-06, "loss": 1.3974, "step": 1054 }, { "epoch": 0.01, "grad_norm": 5.276781780830751, "learning_rate": 1.9999846206834163e-06, "loss": 1.5434, "step": 1055 }, { "epoch": 0.01, "grad_norm": 6.704196444791288, "learning_rate": 1.99998459123531e-06, "loss": 1.4933, "step": 1056 }, { "epoch": 0.01, "grad_norm": 7.249291359747204, "learning_rate": 1.9999845617590374e-06, "loss": 1.4506, "step": 1057 }, { "epoch": 0.01, "grad_norm": 4.675801641902775, "learning_rate": 1.9999845322545983e-06, "loss": 1.582, "step": 1058 }, { "epoch": 0.01, "grad_norm": 5.63075829111936, "learning_rate": 1.9999845027219933e-06, "loss": 1.5674, "step": 1059 }, { "epoch": 0.01, "grad_norm": 6.305134362562168, "learning_rate": 1.999984473161222e-06, "loss": 1.1858, "step": 1060 }, { "epoch": 0.01, "grad_norm": 4.923187333888181, "learning_rate": 1.999984443572284e-06, "loss": 1.5338, "step": 1061 }, { "epoch": 0.01, "grad_norm": 5.615589160701651, "learning_rate": 1.9999844139551804e-06, "loss": 1.3402, "step": 1062 }, { "epoch": 0.01, "grad_norm": 5.080506923679196, "learning_rate": 1.9999843843099103e-06, "loss": 1.4556, "step": 1063 }, { "epoch": 0.01, "grad_norm": 4.739895475840536, "learning_rate": 1.9999843546364742e-06, "loss": 1.4932, "step": 1064 }, { "epoch": 0.01, "grad_norm": 4.404861163545689, "learning_rate": 1.9999843249348718e-06, "loss": 1.3194, "step": 1065 }, { "epoch": 0.01, "grad_norm": 6.345308311669248, "learning_rate": 1.999984295205103e-06, "loss": 1.3354, "step": 1066 }, { "epoch": 0.01, "grad_norm": 5.01753123609855, "learning_rate": 1.9999842654471677e-06, "loss": 1.4866, "step": 1067 }, { "epoch": 0.01, "grad_norm": 5.5591172662604595, "learning_rate": 1.9999842356610664e-06, "loss": 1.3546, "step": 1068 }, { "epoch": 0.01, "grad_norm": 4.7557963260332565, "learning_rate": 1.999984205846799e-06, "loss": 1.4518, "step": 1069 }, { "epoch": 0.01, "grad_norm": 4.562175319819017, "learning_rate": 1.9999841760043657e-06, "loss": 1.3072, "step": 1070 }, { "epoch": 0.01, "grad_norm": 4.812788016867621, "learning_rate": 1.9999841461337653e-06, "loss": 1.371, "step": 1071 }, { "epoch": 0.01, "grad_norm": 4.67438764053753, "learning_rate": 1.9999841162349994e-06, "loss": 1.4096, "step": 1072 }, { "epoch": 0.01, "grad_norm": 4.993252181714291, "learning_rate": 1.999984086308067e-06, "loss": 1.4942, "step": 1073 }, { "epoch": 0.01, "grad_norm": 4.910843860455304, "learning_rate": 1.9999840563529685e-06, "loss": 1.4151, "step": 1074 }, { "epoch": 0.01, "grad_norm": 19.5766042127858, "learning_rate": 1.999984026369704e-06, "loss": 1.4966, "step": 1075 }, { "epoch": 0.01, "grad_norm": 5.483294588888528, "learning_rate": 1.9999839963582724e-06, "loss": 1.4772, "step": 1076 }, { "epoch": 0.01, "grad_norm": 4.999652845868447, "learning_rate": 1.9999839663186754e-06, "loss": 1.496, "step": 1077 }, { "epoch": 0.01, "grad_norm": 5.272781233458372, "learning_rate": 1.999983936250912e-06, "loss": 1.4132, "step": 1078 }, { "epoch": 0.01, "grad_norm": 4.980996814545554, "learning_rate": 1.9999839061549822e-06, "loss": 1.2901, "step": 1079 }, { "epoch": 0.01, "grad_norm": 4.953763987949976, "learning_rate": 1.999983876030886e-06, "loss": 1.5155, "step": 1080 }, { "epoch": 0.01, "grad_norm": 5.403058384399181, "learning_rate": 1.9999838458786244e-06, "loss": 1.5269, "step": 1081 }, { "epoch": 0.01, "grad_norm": 6.962939986871904, "learning_rate": 1.999983815698196e-06, "loss": 1.5753, "step": 1082 }, { "epoch": 0.01, "grad_norm": 5.283384276439903, "learning_rate": 1.999983785489601e-06, "loss": 1.5311, "step": 1083 }, { "epoch": 0.01, "grad_norm": 5.614239854059685, "learning_rate": 1.9999837552528406e-06, "loss": 1.5236, "step": 1084 }, { "epoch": 0.01, "grad_norm": 4.5938014662143, "learning_rate": 1.9999837249879133e-06, "loss": 1.449, "step": 1085 }, { "epoch": 0.01, "grad_norm": 4.828053203793281, "learning_rate": 1.99998369469482e-06, "loss": 1.3608, "step": 1086 }, { "epoch": 0.01, "grad_norm": 4.86723210959235, "learning_rate": 1.999983664373561e-06, "loss": 1.3185, "step": 1087 }, { "epoch": 0.01, "grad_norm": 5.15651657506136, "learning_rate": 1.999983634024135e-06, "loss": 1.3037, "step": 1088 }, { "epoch": 0.01, "grad_norm": 4.781841302352801, "learning_rate": 1.999983603646543e-06, "loss": 1.5157, "step": 1089 }, { "epoch": 0.01, "grad_norm": 5.6250573996377256, "learning_rate": 1.999983573240785e-06, "loss": 1.4612, "step": 1090 }, { "epoch": 0.01, "grad_norm": 4.853270627798325, "learning_rate": 1.999983542806861e-06, "loss": 1.3915, "step": 1091 }, { "epoch": 0.01, "grad_norm": 5.416508032931958, "learning_rate": 1.99998351234477e-06, "loss": 1.4846, "step": 1092 }, { "epoch": 0.01, "grad_norm": 5.154167527363172, "learning_rate": 1.9999834818545135e-06, "loss": 1.4909, "step": 1093 }, { "epoch": 0.01, "grad_norm": 5.015726983035818, "learning_rate": 1.9999834513360905e-06, "loss": 1.4462, "step": 1094 }, { "epoch": 0.01, "grad_norm": 4.859626471859029, "learning_rate": 1.9999834207895015e-06, "loss": 1.5644, "step": 1095 }, { "epoch": 0.01, "eval_loss": 1.6735153198242188, "eval_runtime": 4.6256, "eval_samples_per_second": 1.946, "eval_steps_per_second": 1.081, "step": 1095 }, { "epoch": 0.01, "grad_norm": 4.566823707743832, "learning_rate": 1.999983390214746e-06, "loss": 1.4629, "step": 1096 }, { "epoch": 0.01, "grad_norm": 4.735335408472524, "learning_rate": 1.9999833596118244e-06, "loss": 1.4391, "step": 1097 }, { "epoch": 0.01, "grad_norm": 4.790636259147894, "learning_rate": 1.999983328980737e-06, "loss": 1.4981, "step": 1098 }, { "epoch": 0.01, "grad_norm": 5.349178643835508, "learning_rate": 1.999983298321483e-06, "loss": 1.3753, "step": 1099 }, { "epoch": 0.01, "grad_norm": 5.263976566555674, "learning_rate": 1.9999832676340625e-06, "loss": 1.55, "step": 1100 }, { "epoch": 0.01, "grad_norm": 4.657912236522682, "learning_rate": 1.9999832369184764e-06, "loss": 1.4841, "step": 1101 }, { "epoch": 0.01, "grad_norm": 4.867208669497604, "learning_rate": 1.999983206174724e-06, "loss": 1.4229, "step": 1102 }, { "epoch": 0.01, "grad_norm": 4.862064804403664, "learning_rate": 1.9999831754028052e-06, "loss": 1.4944, "step": 1103 }, { "epoch": 0.01, "grad_norm": 5.3872647723902585, "learning_rate": 1.99998314460272e-06, "loss": 1.4938, "step": 1104 }, { "epoch": 0.01, "grad_norm": 4.878142365865869, "learning_rate": 1.9999831137744693e-06, "loss": 1.4478, "step": 1105 }, { "epoch": 0.01, "grad_norm": 4.632417620908414, "learning_rate": 1.9999830829180517e-06, "loss": 1.4551, "step": 1106 }, { "epoch": 0.01, "grad_norm": 4.666710624917455, "learning_rate": 1.999983052033468e-06, "loss": 1.4641, "step": 1107 }, { "epoch": 0.01, "grad_norm": 4.732751527383862, "learning_rate": 1.9999830211207183e-06, "loss": 1.3838, "step": 1108 }, { "epoch": 0.01, "grad_norm": 6.192163188860683, "learning_rate": 1.9999829901798025e-06, "loss": 1.5361, "step": 1109 }, { "epoch": 0.01, "grad_norm": 6.029036683181163, "learning_rate": 1.9999829592107202e-06, "loss": 1.5428, "step": 1110 }, { "epoch": 0.01, "grad_norm": 5.235770433822521, "learning_rate": 1.999982928213472e-06, "loss": 1.4727, "step": 1111 }, { "epoch": 0.01, "grad_norm": 17.301123468125486, "learning_rate": 1.9999828971880574e-06, "loss": 1.3757, "step": 1112 }, { "epoch": 0.01, "grad_norm": 4.804238410996875, "learning_rate": 1.9999828661344764e-06, "loss": 1.3589, "step": 1113 }, { "epoch": 0.01, "grad_norm": 4.616566753097097, "learning_rate": 1.9999828350527295e-06, "loss": 1.3691, "step": 1114 }, { "epoch": 0.01, "grad_norm": 5.347116928777432, "learning_rate": 1.999982803942817e-06, "loss": 1.4647, "step": 1115 }, { "epoch": 0.01, "grad_norm": 4.932531917778219, "learning_rate": 1.9999827728047373e-06, "loss": 1.4675, "step": 1116 }, { "epoch": 0.01, "grad_norm": 4.660294529470657, "learning_rate": 1.999982741638492e-06, "loss": 1.4151, "step": 1117 }, { "epoch": 0.01, "grad_norm": 7.620813961955892, "learning_rate": 1.99998271044408e-06, "loss": 1.6348, "step": 1118 }, { "epoch": 0.01, "grad_norm": 5.446077565129274, "learning_rate": 1.9999826792215024e-06, "loss": 1.3873, "step": 1119 }, { "epoch": 0.01, "grad_norm": 5.0629846402349195, "learning_rate": 1.9999826479707584e-06, "loss": 1.4435, "step": 1120 }, { "epoch": 0.01, "grad_norm": 5.390133111401672, "learning_rate": 1.999982616691848e-06, "loss": 1.5374, "step": 1121 }, { "epoch": 0.01, "grad_norm": 4.635104269305596, "learning_rate": 1.9999825853847717e-06, "loss": 1.4144, "step": 1122 }, { "epoch": 0.01, "grad_norm": 4.932450441815705, "learning_rate": 1.999982554049529e-06, "loss": 1.6113, "step": 1123 }, { "epoch": 0.01, "grad_norm": 4.571391891954345, "learning_rate": 1.9999825226861202e-06, "loss": 1.385, "step": 1124 }, { "epoch": 0.01, "grad_norm": 4.463278206524636, "learning_rate": 1.999982491294545e-06, "loss": 1.3112, "step": 1125 }, { "epoch": 0.01, "grad_norm": 5.169524463711344, "learning_rate": 1.999982459874804e-06, "loss": 1.3639, "step": 1126 }, { "epoch": 0.01, "grad_norm": 4.706816189602849, "learning_rate": 1.999982428426897e-06, "loss": 1.4906, "step": 1127 }, { "epoch": 0.01, "grad_norm": 5.4003758033814435, "learning_rate": 1.9999823969508233e-06, "loss": 1.2288, "step": 1128 }, { "epoch": 0.01, "grad_norm": 5.07733560063141, "learning_rate": 1.999982365446584e-06, "loss": 1.4117, "step": 1129 }, { "epoch": 0.01, "grad_norm": 7.969998640669905, "learning_rate": 1.9999823339141778e-06, "loss": 1.3877, "step": 1130 }, { "epoch": 0.01, "grad_norm": 5.142101849234914, "learning_rate": 1.9999823023536056e-06, "loss": 1.4276, "step": 1131 }, { "epoch": 0.01, "grad_norm": 5.223738858689869, "learning_rate": 1.9999822707648675e-06, "loss": 1.4385, "step": 1132 }, { "epoch": 0.01, "grad_norm": 4.955334460812677, "learning_rate": 1.999982239147963e-06, "loss": 1.4981, "step": 1133 }, { "epoch": 0.01, "grad_norm": 5.707569119145383, "learning_rate": 1.9999822075028926e-06, "loss": 1.5537, "step": 1134 }, { "epoch": 0.01, "grad_norm": 6.259173620055015, "learning_rate": 1.999982175829656e-06, "loss": 1.4162, "step": 1135 }, { "epoch": 0.01, "grad_norm": 6.160930511739169, "learning_rate": 1.999982144128253e-06, "loss": 1.6684, "step": 1136 }, { "epoch": 0.01, "grad_norm": 12.436420759166786, "learning_rate": 1.999982112398684e-06, "loss": 1.2647, "step": 1137 }, { "epoch": 0.01, "grad_norm": 5.264705476269943, "learning_rate": 1.9999820806409487e-06, "loss": 1.5094, "step": 1138 }, { "epoch": 0.01, "grad_norm": 4.82448740492418, "learning_rate": 1.9999820488550476e-06, "loss": 1.4738, "step": 1139 }, { "epoch": 0.01, "grad_norm": 5.590049555555406, "learning_rate": 1.9999820170409797e-06, "loss": 1.4759, "step": 1140 }, { "epoch": 0.01, "grad_norm": 4.864218721440189, "learning_rate": 1.9999819851987462e-06, "loss": 1.5362, "step": 1141 }, { "epoch": 0.01, "grad_norm": 5.32497769317765, "learning_rate": 1.9999819533283464e-06, "loss": 1.4001, "step": 1142 }, { "epoch": 0.01, "grad_norm": 4.788543241974733, "learning_rate": 1.99998192142978e-06, "loss": 1.5497, "step": 1143 }, { "epoch": 0.01, "grad_norm": 8.851113964577435, "learning_rate": 1.999981889503048e-06, "loss": 1.4904, "step": 1144 }, { "epoch": 0.01, "grad_norm": 5.107648195516455, "learning_rate": 1.9999818575481494e-06, "loss": 1.5098, "step": 1145 }, { "epoch": 0.01, "grad_norm": 4.583169283696968, "learning_rate": 1.999981825565085e-06, "loss": 1.3993, "step": 1146 }, { "epoch": 0.01, "grad_norm": 4.807776598248037, "learning_rate": 1.999981793553854e-06, "loss": 1.3587, "step": 1147 }, { "epoch": 0.01, "grad_norm": 4.614047081189212, "learning_rate": 1.999981761514457e-06, "loss": 1.3482, "step": 1148 }, { "epoch": 0.01, "grad_norm": 6.35657561717519, "learning_rate": 1.999981729446894e-06, "loss": 1.5398, "step": 1149 }, { "epoch": 0.01, "grad_norm": 4.933553997697057, "learning_rate": 1.999981697351165e-06, "loss": 1.3589, "step": 1150 }, { "epoch": 0.01, "grad_norm": 5.872175454134656, "learning_rate": 1.9999816652272697e-06, "loss": 1.4732, "step": 1151 }, { "epoch": 0.01, "grad_norm": 5.288241183644056, "learning_rate": 1.999981633075208e-06, "loss": 1.3574, "step": 1152 }, { "epoch": 0.01, "grad_norm": 5.009781459825191, "learning_rate": 1.99998160089498e-06, "loss": 1.394, "step": 1153 }, { "epoch": 0.01, "grad_norm": 4.55801046260358, "learning_rate": 1.9999815686865867e-06, "loss": 1.3084, "step": 1154 }, { "epoch": 0.01, "grad_norm": 4.821680285232777, "learning_rate": 1.9999815364500263e-06, "loss": 1.4516, "step": 1155 }, { "epoch": 0.01, "grad_norm": 4.952975805784374, "learning_rate": 1.9999815041853005e-06, "loss": 1.4385, "step": 1156 }, { "epoch": 0.01, "grad_norm": 5.576070368663535, "learning_rate": 1.9999814718924082e-06, "loss": 1.3069, "step": 1157 }, { "epoch": 0.01, "grad_norm": 4.951804621604359, "learning_rate": 1.9999814395713496e-06, "loss": 1.4112, "step": 1158 }, { "epoch": 0.01, "grad_norm": 4.741717751755461, "learning_rate": 1.9999814072221254e-06, "loss": 1.4899, "step": 1159 }, { "epoch": 0.01, "grad_norm": 4.893581963366542, "learning_rate": 1.9999813748447344e-06, "loss": 1.3654, "step": 1160 }, { "epoch": 0.01, "grad_norm": 4.688325242766382, "learning_rate": 1.9999813424391775e-06, "loss": 1.4792, "step": 1161 }, { "epoch": 0.01, "grad_norm": 4.958921548979553, "learning_rate": 1.9999813100054546e-06, "loss": 1.1737, "step": 1162 }, { "epoch": 0.01, "grad_norm": 4.988069853619337, "learning_rate": 1.9999812775435653e-06, "loss": 1.5248, "step": 1163 }, { "epoch": 0.01, "grad_norm": 4.323386669021145, "learning_rate": 1.99998124505351e-06, "loss": 1.3352, "step": 1164 }, { "epoch": 0.01, "grad_norm": 4.863439791281708, "learning_rate": 1.9999812125352883e-06, "loss": 1.4522, "step": 1165 }, { "epoch": 0.01, "grad_norm": 4.50022212533205, "learning_rate": 1.9999811799889007e-06, "loss": 1.4328, "step": 1166 }, { "epoch": 0.01, "grad_norm": 5.283635450013209, "learning_rate": 1.9999811474143467e-06, "loss": 1.4352, "step": 1167 }, { "epoch": 0.01, "grad_norm": 4.613546626420899, "learning_rate": 1.999981114811627e-06, "loss": 1.4733, "step": 1168 }, { "epoch": 0.01, "eval_loss": 1.6666209697723389, "eval_runtime": 4.627, "eval_samples_per_second": 1.945, "eval_steps_per_second": 1.081, "step": 1168 }, { "epoch": 0.01, "grad_norm": 5.243713431888752, "learning_rate": 1.999981082180741e-06, "loss": 1.4229, "step": 1169 }, { "epoch": 0.01, "grad_norm": 5.180916470723546, "learning_rate": 1.999981049521689e-06, "loss": 1.5532, "step": 1170 }, { "epoch": 0.01, "grad_norm": 4.634457636781247, "learning_rate": 1.9999810168344707e-06, "loss": 1.4628, "step": 1171 }, { "epoch": 0.01, "grad_norm": 4.841975621960265, "learning_rate": 1.999980984119086e-06, "loss": 1.4714, "step": 1172 }, { "epoch": 0.01, "grad_norm": 4.856407984159355, "learning_rate": 1.9999809513755354e-06, "loss": 1.3953, "step": 1173 }, { "epoch": 0.01, "grad_norm": 4.924593632584216, "learning_rate": 1.9999809186038184e-06, "loss": 1.4777, "step": 1174 }, { "epoch": 0.01, "grad_norm": 10.804521073545573, "learning_rate": 1.999980885803936e-06, "loss": 1.5477, "step": 1175 }, { "epoch": 0.01, "grad_norm": 4.849633875980324, "learning_rate": 1.9999808529758865e-06, "loss": 1.3616, "step": 1176 }, { "epoch": 0.01, "grad_norm": 4.918561776125233, "learning_rate": 1.9999808201196716e-06, "loss": 1.4449, "step": 1177 }, { "epoch": 0.01, "grad_norm": 4.559134203442522, "learning_rate": 1.9999807872352903e-06, "loss": 1.373, "step": 1178 }, { "epoch": 0.01, "grad_norm": 4.7385730138344675, "learning_rate": 1.9999807543227426e-06, "loss": 1.4561, "step": 1179 }, { "epoch": 0.01, "grad_norm": 5.477240093311455, "learning_rate": 1.9999807213820294e-06, "loss": 1.3705, "step": 1180 }, { "epoch": 0.01, "grad_norm": 6.069434481225469, "learning_rate": 1.99998068841315e-06, "loss": 1.2873, "step": 1181 }, { "epoch": 0.01, "grad_norm": 4.697371499630333, "learning_rate": 1.999980655416104e-06, "loss": 1.4569, "step": 1182 }, { "epoch": 0.01, "grad_norm": 4.533455071498911, "learning_rate": 1.999980622390892e-06, "loss": 1.4891, "step": 1183 }, { "epoch": 0.01, "grad_norm": 4.807849990271847, "learning_rate": 1.9999805893375135e-06, "loss": 1.4576, "step": 1184 }, { "epoch": 0.01, "grad_norm": 5.812651120417968, "learning_rate": 1.9999805562559696e-06, "loss": 1.4451, "step": 1185 }, { "epoch": 0.01, "grad_norm": 4.764537937046826, "learning_rate": 1.9999805231462594e-06, "loss": 1.4257, "step": 1186 }, { "epoch": 0.01, "grad_norm": 5.106952053198868, "learning_rate": 1.9999804900083827e-06, "loss": 1.5414, "step": 1187 }, { "epoch": 0.01, "grad_norm": 4.655479441251472, "learning_rate": 1.99998045684234e-06, "loss": 1.4561, "step": 1188 }, { "epoch": 0.01, "grad_norm": 4.473182229797437, "learning_rate": 1.9999804236481316e-06, "loss": 1.4701, "step": 1189 }, { "epoch": 0.01, "grad_norm": 5.065099661197906, "learning_rate": 1.9999803904257566e-06, "loss": 1.4211, "step": 1190 }, { "epoch": 0.01, "grad_norm": 4.538986656004763, "learning_rate": 1.9999803571752157e-06, "loss": 1.5027, "step": 1191 }, { "epoch": 0.01, "grad_norm": 4.4128695506388365, "learning_rate": 1.9999803238965084e-06, "loss": 1.5027, "step": 1192 }, { "epoch": 0.01, "grad_norm": 6.032103944177775, "learning_rate": 1.999980290589635e-06, "loss": 1.5644, "step": 1193 }, { "epoch": 0.01, "grad_norm": 4.682343321057221, "learning_rate": 1.9999802572545963e-06, "loss": 1.5309, "step": 1194 }, { "epoch": 0.01, "grad_norm": 4.343314622774007, "learning_rate": 1.9999802238913906e-06, "loss": 1.3003, "step": 1195 }, { "epoch": 0.01, "grad_norm": 4.630952351276132, "learning_rate": 1.999980190500019e-06, "loss": 1.4954, "step": 1196 }, { "epoch": 0.01, "grad_norm": 5.198619248496427, "learning_rate": 1.9999801570804815e-06, "loss": 1.397, "step": 1197 }, { "epoch": 0.01, "grad_norm": 4.618621734517426, "learning_rate": 1.9999801236327776e-06, "loss": 1.3981, "step": 1198 }, { "epoch": 0.01, "grad_norm": 5.147623660905904, "learning_rate": 1.999980090156908e-06, "loss": 1.3596, "step": 1199 }, { "epoch": 0.01, "grad_norm": 4.946475818289631, "learning_rate": 1.999980056652872e-06, "loss": 1.5295, "step": 1200 }, { "epoch": 0.01, "grad_norm": 4.702884825661346, "learning_rate": 1.9999800231206696e-06, "loss": 1.4685, "step": 1201 }, { "epoch": 0.01, "grad_norm": 7.029321508506549, "learning_rate": 1.9999799895603013e-06, "loss": 1.2344, "step": 1202 }, { "epoch": 0.01, "grad_norm": 5.130362694352546, "learning_rate": 1.999979955971767e-06, "loss": 1.4078, "step": 1203 }, { "epoch": 0.01, "grad_norm": 4.638399765026865, "learning_rate": 1.9999799223550666e-06, "loss": 1.2877, "step": 1204 }, { "epoch": 0.01, "grad_norm": 6.419197466434014, "learning_rate": 1.9999798887102e-06, "loss": 1.5245, "step": 1205 }, { "epoch": 0.01, "grad_norm": 5.048810252708306, "learning_rate": 1.999979855037167e-06, "loss": 1.6089, "step": 1206 }, { "epoch": 0.01, "grad_norm": 5.594986736232871, "learning_rate": 1.9999798213359687e-06, "loss": 1.4686, "step": 1207 }, { "epoch": 0.01, "grad_norm": 4.341694734185055, "learning_rate": 1.999979787606604e-06, "loss": 1.2083, "step": 1208 }, { "epoch": 0.01, "grad_norm": 4.721844492494166, "learning_rate": 1.999979753849073e-06, "loss": 1.3191, "step": 1209 }, { "epoch": 0.01, "grad_norm": 4.660083437293136, "learning_rate": 1.9999797200633755e-06, "loss": 1.405, "step": 1210 }, { "epoch": 0.01, "grad_norm": 7.550917622485102, "learning_rate": 1.9999796862495127e-06, "loss": 1.392, "step": 1211 }, { "epoch": 0.01, "grad_norm": 4.855465281579127, "learning_rate": 1.999979652407483e-06, "loss": 1.2981, "step": 1212 }, { "epoch": 0.01, "grad_norm": 4.7605359185049645, "learning_rate": 1.999979618537288e-06, "loss": 1.475, "step": 1213 }, { "epoch": 0.01, "grad_norm": 4.992826613342054, "learning_rate": 1.9999795846389267e-06, "loss": 1.309, "step": 1214 }, { "epoch": 0.01, "grad_norm": 5.751213987512799, "learning_rate": 1.999979550712399e-06, "loss": 1.3181, "step": 1215 }, { "epoch": 0.01, "grad_norm": 5.56222348172062, "learning_rate": 1.999979516757705e-06, "loss": 1.4476, "step": 1216 }, { "epoch": 0.01, "grad_norm": 7.211498865140055, "learning_rate": 1.9999794827748456e-06, "loss": 1.6613, "step": 1217 }, { "epoch": 0.01, "grad_norm": 4.76843858033082, "learning_rate": 1.9999794487638195e-06, "loss": 1.455, "step": 1218 }, { "epoch": 0.01, "grad_norm": 4.971537042597962, "learning_rate": 1.9999794147246278e-06, "loss": 1.4828, "step": 1219 }, { "epoch": 0.01, "grad_norm": 4.740101198670973, "learning_rate": 1.9999793806572697e-06, "loss": 1.4563, "step": 1220 }, { "epoch": 0.01, "grad_norm": 4.772585640252896, "learning_rate": 1.9999793465617452e-06, "loss": 1.4642, "step": 1221 }, { "epoch": 0.01, "grad_norm": 7.057122514777892, "learning_rate": 1.999979312438055e-06, "loss": 1.4538, "step": 1222 }, { "epoch": 0.01, "grad_norm": 5.39056314627121, "learning_rate": 1.999979278286199e-06, "loss": 1.547, "step": 1223 }, { "epoch": 0.01, "grad_norm": 4.991611184305103, "learning_rate": 1.9999792441061764e-06, "loss": 1.4423, "step": 1224 }, { "epoch": 0.01, "grad_norm": 4.647894296933476, "learning_rate": 1.999979209897988e-06, "loss": 1.4384, "step": 1225 }, { "epoch": 0.01, "grad_norm": 4.99949285749195, "learning_rate": 1.9999791756616334e-06, "loss": 1.5835, "step": 1226 }, { "epoch": 0.01, "grad_norm": 4.567738724749353, "learning_rate": 1.9999791413971127e-06, "loss": 1.3663, "step": 1227 }, { "epoch": 0.01, "grad_norm": 5.394852681906166, "learning_rate": 1.9999791071044256e-06, "loss": 1.3405, "step": 1228 }, { "epoch": 0.01, "grad_norm": 5.3926441113626185, "learning_rate": 1.999979072783573e-06, "loss": 1.4969, "step": 1229 }, { "epoch": 0.01, "grad_norm": 5.406148666652971, "learning_rate": 1.999979038434554e-06, "loss": 1.5782, "step": 1230 }, { "epoch": 0.01, "grad_norm": 4.894426333285634, "learning_rate": 1.999979004057369e-06, "loss": 1.4171, "step": 1231 }, { "epoch": 0.01, "grad_norm": 4.662885913776113, "learning_rate": 1.9999789696520178e-06, "loss": 1.3504, "step": 1232 }, { "epoch": 0.01, "grad_norm": 4.859148631113401, "learning_rate": 1.9999789352185005e-06, "loss": 1.3593, "step": 1233 }, { "epoch": 0.01, "grad_norm": 4.501735921525826, "learning_rate": 1.999978900756817e-06, "loss": 1.497, "step": 1234 }, { "epoch": 0.01, "grad_norm": 4.767981300489952, "learning_rate": 1.999978866266968e-06, "loss": 1.4222, "step": 1235 }, { "epoch": 0.01, "grad_norm": 5.601659778166056, "learning_rate": 1.9999788317489523e-06, "loss": 1.3368, "step": 1236 }, { "epoch": 0.01, "grad_norm": 5.981059658355397, "learning_rate": 1.999978797202771e-06, "loss": 1.3621, "step": 1237 }, { "epoch": 0.01, "grad_norm": 6.128261931608396, "learning_rate": 1.9999787626284236e-06, "loss": 1.6156, "step": 1238 }, { "epoch": 0.01, "grad_norm": 4.881645282270224, "learning_rate": 1.9999787280259097e-06, "loss": 1.3842, "step": 1239 }, { "epoch": 0.01, "grad_norm": 5.893724788852448, "learning_rate": 1.9999786933952302e-06, "loss": 1.6442, "step": 1240 }, { "epoch": 0.01, "grad_norm": 5.222605242350468, "learning_rate": 1.9999786587363844e-06, "loss": 1.373, "step": 1241 }, { "epoch": 0.01, "eval_loss": 1.6639937162399292, "eval_runtime": 4.616, "eval_samples_per_second": 1.95, "eval_steps_per_second": 1.083, "step": 1241 }, { "epoch": 0.01, "grad_norm": 4.66908931719967, "learning_rate": 1.999978624049372e-06, "loss": 1.4993, "step": 1242 }, { "epoch": 0.01, "grad_norm": 4.9307281292109035, "learning_rate": 1.9999785893341944e-06, "loss": 1.5608, "step": 1243 }, { "epoch": 0.01, "grad_norm": 4.826571091140564, "learning_rate": 1.99997855459085e-06, "loss": 1.3162, "step": 1244 }, { "epoch": 0.01, "grad_norm": 4.8493801546756465, "learning_rate": 1.9999785198193405e-06, "loss": 1.4282, "step": 1245 }, { "epoch": 0.01, "grad_norm": 4.697208964299474, "learning_rate": 1.999978485019664e-06, "loss": 1.5024, "step": 1246 }, { "epoch": 0.01, "grad_norm": 5.919002989067272, "learning_rate": 1.999978450191822e-06, "loss": 1.5431, "step": 1247 }, { "epoch": 0.01, "grad_norm": 4.99900985273033, "learning_rate": 1.9999784153358135e-06, "loss": 1.4855, "step": 1248 }, { "epoch": 0.01, "grad_norm": 4.617964526314292, "learning_rate": 1.999978380451639e-06, "loss": 1.5545, "step": 1249 }, { "epoch": 0.01, "grad_norm": 5.192495194044367, "learning_rate": 1.9999783455392987e-06, "loss": 1.4625, "step": 1250 }, { "epoch": 0.01, "grad_norm": 5.017628302905397, "learning_rate": 1.9999783105987924e-06, "loss": 1.5347, "step": 1251 }, { "epoch": 0.01, "grad_norm": 4.576487300446409, "learning_rate": 1.9999782756301197e-06, "loss": 1.5079, "step": 1252 }, { "epoch": 0.01, "grad_norm": 5.5622763050535085, "learning_rate": 1.999978240633281e-06, "loss": 1.4778, "step": 1253 }, { "epoch": 0.01, "grad_norm": 4.320462635223293, "learning_rate": 1.9999782056082763e-06, "loss": 1.1527, "step": 1254 }, { "epoch": 0.01, "grad_norm": 4.753621834652253, "learning_rate": 1.9999781705551057e-06, "loss": 1.4019, "step": 1255 }, { "epoch": 0.01, "grad_norm": 4.399262354558424, "learning_rate": 1.9999781354737687e-06, "loss": 1.4149, "step": 1256 }, { "epoch": 0.01, "grad_norm": 4.602639150018978, "learning_rate": 1.9999781003642658e-06, "loss": 1.3364, "step": 1257 }, { "epoch": 0.01, "grad_norm": 4.589898719773274, "learning_rate": 1.999978065226597e-06, "loss": 1.4087, "step": 1258 }, { "epoch": 0.01, "grad_norm": 4.536362353334885, "learning_rate": 1.999978030060762e-06, "loss": 1.4561, "step": 1259 }, { "epoch": 0.01, "grad_norm": 4.565066571845603, "learning_rate": 1.999977994866761e-06, "loss": 1.4477, "step": 1260 }, { "epoch": 0.01, "grad_norm": 4.82724395797132, "learning_rate": 1.999977959644594e-06, "loss": 1.385, "step": 1261 }, { "epoch": 0.01, "grad_norm": 4.485711386546417, "learning_rate": 1.999977924394261e-06, "loss": 1.3381, "step": 1262 }, { "epoch": 0.01, "grad_norm": 5.007143806674197, "learning_rate": 1.9999778891157615e-06, "loss": 1.4557, "step": 1263 }, { "epoch": 0.01, "grad_norm": 5.361949819262946, "learning_rate": 1.9999778538090964e-06, "loss": 1.3389, "step": 1264 }, { "epoch": 0.01, "grad_norm": 4.503528627850973, "learning_rate": 1.999977818474265e-06, "loss": 1.4698, "step": 1265 }, { "epoch": 0.01, "grad_norm": 5.233324651443236, "learning_rate": 1.999977783111268e-06, "loss": 1.3247, "step": 1266 }, { "epoch": 0.01, "grad_norm": 5.5046307792398865, "learning_rate": 1.9999777477201044e-06, "loss": 1.3497, "step": 1267 }, { "epoch": 0.01, "grad_norm": 5.645109391595665, "learning_rate": 1.999977712300775e-06, "loss": 1.3201, "step": 1268 }, { "epoch": 0.01, "grad_norm": 5.1446712059940465, "learning_rate": 1.9999776768532796e-06, "loss": 1.4262, "step": 1269 }, { "epoch": 0.01, "grad_norm": 5.28587028516593, "learning_rate": 1.999977641377618e-06, "loss": 1.6932, "step": 1270 }, { "epoch": 0.01, "grad_norm": 5.598379302979389, "learning_rate": 1.9999776058737906e-06, "loss": 1.4316, "step": 1271 }, { "epoch": 0.01, "grad_norm": 6.016449286637428, "learning_rate": 1.9999775703417965e-06, "loss": 1.5183, "step": 1272 }, { "epoch": 0.01, "grad_norm": 5.415644315937934, "learning_rate": 1.999977534781637e-06, "loss": 1.3129, "step": 1273 }, { "epoch": 0.01, "grad_norm": 4.704232037612598, "learning_rate": 1.9999774991933113e-06, "loss": 1.4044, "step": 1274 }, { "epoch": 0.01, "grad_norm": 4.6149493250979, "learning_rate": 1.9999774635768197e-06, "loss": 1.3827, "step": 1275 }, { "epoch": 0.01, "grad_norm": 5.530098577109759, "learning_rate": 1.9999774279321617e-06, "loss": 1.6327, "step": 1276 }, { "epoch": 0.01, "grad_norm": 4.743687061339907, "learning_rate": 1.9999773922593383e-06, "loss": 1.5278, "step": 1277 }, { "epoch": 0.01, "grad_norm": 4.8692755474083205, "learning_rate": 1.9999773565583484e-06, "loss": 1.5159, "step": 1278 }, { "epoch": 0.01, "grad_norm": 7.0274537662792556, "learning_rate": 1.9999773208291925e-06, "loss": 1.6578, "step": 1279 }, { "epoch": 0.01, "grad_norm": 5.493325452863065, "learning_rate": 1.9999772850718703e-06, "loss": 1.3068, "step": 1280 }, { "epoch": 0.01, "grad_norm": 4.623446755309784, "learning_rate": 1.9999772492863825e-06, "loss": 1.3591, "step": 1281 }, { "epoch": 0.01, "grad_norm": 6.58471292058994, "learning_rate": 1.999977213472729e-06, "loss": 1.2837, "step": 1282 }, { "epoch": 0.01, "grad_norm": 4.718580482751096, "learning_rate": 1.9999771776309083e-06, "loss": 1.5876, "step": 1283 }, { "epoch": 0.01, "grad_norm": 6.699986818479373, "learning_rate": 1.999977141760922e-06, "loss": 1.3901, "step": 1284 }, { "epoch": 0.01, "grad_norm": 5.60582750403323, "learning_rate": 1.99997710586277e-06, "loss": 1.4277, "step": 1285 }, { "epoch": 0.01, "grad_norm": 4.673408978804247, "learning_rate": 1.9999770699364526e-06, "loss": 1.4454, "step": 1286 }, { "epoch": 0.01, "grad_norm": 4.9768592067046695, "learning_rate": 1.999977033981968e-06, "loss": 1.3688, "step": 1287 }, { "epoch": 0.01, "grad_norm": 6.49039104639861, "learning_rate": 1.999976997999318e-06, "loss": 1.3585, "step": 1288 }, { "epoch": 0.01, "grad_norm": 5.441000592991114, "learning_rate": 1.999976961988502e-06, "loss": 1.4013, "step": 1289 }, { "epoch": 0.01, "grad_norm": 5.248224180467002, "learning_rate": 1.9999769259495197e-06, "loss": 1.6231, "step": 1290 }, { "epoch": 0.01, "grad_norm": 4.967205396228199, "learning_rate": 1.9999768898823714e-06, "loss": 1.3776, "step": 1291 }, { "epoch": 0.01, "grad_norm": 4.786827176028182, "learning_rate": 1.999976853787057e-06, "loss": 1.5409, "step": 1292 }, { "epoch": 0.01, "grad_norm": 4.629480539543215, "learning_rate": 1.999976817663577e-06, "loss": 1.4354, "step": 1293 }, { "epoch": 0.01, "grad_norm": 5.100068802921767, "learning_rate": 1.9999767815119305e-06, "loss": 1.4632, "step": 1294 }, { "epoch": 0.01, "grad_norm": 5.134078229009721, "learning_rate": 1.999976745332118e-06, "loss": 1.4747, "step": 1295 }, { "epoch": 0.01, "grad_norm": 5.039774129292199, "learning_rate": 1.99997670912414e-06, "loss": 1.5499, "step": 1296 }, { "epoch": 0.01, "grad_norm": 5.581809616341501, "learning_rate": 1.9999766728879955e-06, "loss": 1.5581, "step": 1297 }, { "epoch": 0.01, "grad_norm": 4.769459899834711, "learning_rate": 1.999976636623685e-06, "loss": 1.6019, "step": 1298 }, { "epoch": 0.01, "grad_norm": 4.789913009365324, "learning_rate": 1.9999766003312087e-06, "loss": 1.4384, "step": 1299 }, { "epoch": 0.01, "grad_norm": 4.948273546122202, "learning_rate": 1.9999765640105663e-06, "loss": 1.4859, "step": 1300 }, { "epoch": 0.01, "grad_norm": 5.046098251037359, "learning_rate": 1.999976527661758e-06, "loss": 1.3215, "step": 1301 }, { "epoch": 0.01, "grad_norm": 4.759646769803917, "learning_rate": 1.9999764912847834e-06, "loss": 1.26, "step": 1302 }, { "epoch": 0.01, "grad_norm": 5.049312397622781, "learning_rate": 1.999976454879643e-06, "loss": 1.2966, "step": 1303 }, { "epoch": 0.01, "grad_norm": 4.7730400364294985, "learning_rate": 1.9999764184463365e-06, "loss": 1.5075, "step": 1304 }, { "epoch": 0.01, "grad_norm": 4.615090990630283, "learning_rate": 1.999976381984864e-06, "loss": 1.3308, "step": 1305 }, { "epoch": 0.01, "grad_norm": 5.050723699922145, "learning_rate": 1.999976345495226e-06, "loss": 1.5391, "step": 1306 }, { "epoch": 0.01, "grad_norm": 5.2292497493973995, "learning_rate": 1.999976308977421e-06, "loss": 1.4581, "step": 1307 }, { "epoch": 0.01, "grad_norm": 4.484046582647355, "learning_rate": 1.9999762724314504e-06, "loss": 1.2275, "step": 1308 }, { "epoch": 0.01, "grad_norm": 4.7418804258275, "learning_rate": 1.9999762358573144e-06, "loss": 1.4074, "step": 1309 }, { "epoch": 0.01, "grad_norm": 4.656737686412653, "learning_rate": 1.9999761992550116e-06, "loss": 1.4102, "step": 1310 }, { "epoch": 0.01, "grad_norm": 4.913690315220175, "learning_rate": 1.9999761626245433e-06, "loss": 1.5153, "step": 1311 }, { "epoch": 0.01, "grad_norm": 4.631259670029744, "learning_rate": 1.9999761259659085e-06, "loss": 1.4168, "step": 1312 }, { "epoch": 0.01, "grad_norm": 6.352234843978522, "learning_rate": 1.9999760892791083e-06, "loss": 1.7813, "step": 1313 }, { "epoch": 0.01, "grad_norm": 4.806086460573851, "learning_rate": 1.9999760525641416e-06, "loss": 1.4341, "step": 1314 }, { "epoch": 0.01, "eval_loss": 1.664663314819336, "eval_runtime": 4.6478, "eval_samples_per_second": 1.936, "eval_steps_per_second": 1.076, "step": 1314 }, { "epoch": 0.01, "grad_norm": 4.865234880117637, "learning_rate": 1.999976015821009e-06, "loss": 1.5092, "step": 1315 }, { "epoch": 0.01, "grad_norm": 4.593356313971813, "learning_rate": 1.999975979049711e-06, "loss": 1.2732, "step": 1316 }, { "epoch": 0.01, "grad_norm": 4.765904676157058, "learning_rate": 1.9999759422502462e-06, "loss": 1.4894, "step": 1317 }, { "epoch": 0.01, "grad_norm": 4.869472003176806, "learning_rate": 1.9999759054226157e-06, "loss": 1.3162, "step": 1318 }, { "epoch": 0.01, "grad_norm": 6.707074378059004, "learning_rate": 1.9999758685668192e-06, "loss": 1.3817, "step": 1319 }, { "epoch": 0.01, "grad_norm": 4.360722245763356, "learning_rate": 1.999975831682857e-06, "loss": 1.3389, "step": 1320 }, { "epoch": 0.01, "grad_norm": 5.432944358728482, "learning_rate": 1.9999757947707284e-06, "loss": 1.4327, "step": 1321 }, { "epoch": 0.01, "grad_norm": 4.82335037337341, "learning_rate": 1.9999757578304336e-06, "loss": 1.6028, "step": 1322 }, { "epoch": 0.01, "grad_norm": 4.585722041095415, "learning_rate": 1.9999757208619733e-06, "loss": 1.4382, "step": 1323 }, { "epoch": 0.01, "grad_norm": 4.668470608848745, "learning_rate": 1.999975683865347e-06, "loss": 1.2558, "step": 1324 }, { "epoch": 0.01, "grad_norm": 4.731181861451524, "learning_rate": 1.9999756468405543e-06, "loss": 1.4787, "step": 1325 }, { "epoch": 0.01, "grad_norm": 5.300289798255382, "learning_rate": 1.999975609787596e-06, "loss": 1.4782, "step": 1326 }, { "epoch": 0.01, "grad_norm": 4.942011068638616, "learning_rate": 1.9999755727064715e-06, "loss": 1.5667, "step": 1327 }, { "epoch": 0.01, "grad_norm": 5.309328004860009, "learning_rate": 1.9999755355971813e-06, "loss": 1.5442, "step": 1328 }, { "epoch": 0.01, "grad_norm": 4.915409373782858, "learning_rate": 1.9999754984597248e-06, "loss": 1.3834, "step": 1329 }, { "epoch": 0.01, "grad_norm": 4.89211942651002, "learning_rate": 1.9999754612941023e-06, "loss": 1.3551, "step": 1330 }, { "epoch": 0.01, "grad_norm": 5.687686388314984, "learning_rate": 1.999975424100314e-06, "loss": 1.2831, "step": 1331 }, { "epoch": 0.01, "grad_norm": 4.582159025430598, "learning_rate": 1.9999753868783594e-06, "loss": 1.4689, "step": 1332 }, { "epoch": 0.01, "grad_norm": 4.8213772730034865, "learning_rate": 1.9999753496282394e-06, "loss": 1.4182, "step": 1333 }, { "epoch": 0.01, "grad_norm": 4.5280013653649105, "learning_rate": 1.999975312349953e-06, "loss": 1.4439, "step": 1334 }, { "epoch": 0.01, "grad_norm": 4.893199334359817, "learning_rate": 1.9999752750435008e-06, "loss": 1.5473, "step": 1335 }, { "epoch": 0.01, "grad_norm": 4.869413876075934, "learning_rate": 1.9999752377088825e-06, "loss": 1.3372, "step": 1336 }, { "epoch": 0.01, "grad_norm": 6.9958240347842695, "learning_rate": 1.9999752003460982e-06, "loss": 1.3613, "step": 1337 }, { "epoch": 0.01, "grad_norm": 5.4921822785420655, "learning_rate": 1.999975162955148e-06, "loss": 1.4645, "step": 1338 }, { "epoch": 0.01, "grad_norm": 5.304921830032534, "learning_rate": 1.999975125536032e-06, "loss": 1.4629, "step": 1339 }, { "epoch": 0.01, "grad_norm": 4.649096231830719, "learning_rate": 1.9999750880887497e-06, "loss": 1.2108, "step": 1340 }, { "epoch": 0.01, "grad_norm": 5.420599578392317, "learning_rate": 1.9999750506133017e-06, "loss": 1.3255, "step": 1341 }, { "epoch": 0.01, "grad_norm": 4.8401246330468695, "learning_rate": 1.9999750131096876e-06, "loss": 1.4344, "step": 1342 }, { "epoch": 0.01, "grad_norm": 5.15766506468985, "learning_rate": 1.9999749755779076e-06, "loss": 1.5095, "step": 1343 }, { "epoch": 0.01, "grad_norm": 4.838423380109326, "learning_rate": 1.999974938017961e-06, "loss": 1.4351, "step": 1344 }, { "epoch": 0.01, "grad_norm": 5.7944677876331205, "learning_rate": 1.9999749004298492e-06, "loss": 1.4865, "step": 1345 }, { "epoch": 0.01, "grad_norm": 5.857475428823912, "learning_rate": 1.9999748628135713e-06, "loss": 1.5694, "step": 1346 }, { "epoch": 0.01, "grad_norm": 5.117298413799976, "learning_rate": 1.9999748251691275e-06, "loss": 1.3554, "step": 1347 }, { "epoch": 0.01, "grad_norm": 4.729812229132196, "learning_rate": 1.9999747874965176e-06, "loss": 1.365, "step": 1348 }, { "epoch": 0.01, "grad_norm": 4.918522074472892, "learning_rate": 1.999974749795742e-06, "loss": 1.5329, "step": 1349 }, { "epoch": 0.01, "grad_norm": 6.128757606688177, "learning_rate": 1.9999747120668e-06, "loss": 1.4496, "step": 1350 }, { "epoch": 0.01, "grad_norm": 5.512588294782333, "learning_rate": 1.9999746743096923e-06, "loss": 1.4654, "step": 1351 }, { "epoch": 0.01, "grad_norm": 5.932892066430468, "learning_rate": 1.9999746365244182e-06, "loss": 1.5322, "step": 1352 }, { "epoch": 0.01, "grad_norm": 4.9351926580424585, "learning_rate": 1.999974598710979e-06, "loss": 1.509, "step": 1353 }, { "epoch": 0.01, "grad_norm": 4.73360746783656, "learning_rate": 1.999974560869373e-06, "loss": 1.4276, "step": 1354 }, { "epoch": 0.01, "grad_norm": 5.323019209743392, "learning_rate": 1.9999745229996014e-06, "loss": 1.4279, "step": 1355 }, { "epoch": 0.01, "grad_norm": 5.121493184418707, "learning_rate": 1.999974485101664e-06, "loss": 1.3617, "step": 1356 }, { "epoch": 0.01, "grad_norm": 4.924408494387465, "learning_rate": 1.9999744471755604e-06, "loss": 1.4852, "step": 1357 }, { "epoch": 0.01, "grad_norm": 4.577577155791868, "learning_rate": 1.999974409221291e-06, "loss": 1.463, "step": 1358 }, { "epoch": 0.01, "grad_norm": 4.677557438554256, "learning_rate": 1.9999743712388555e-06, "loss": 1.4702, "step": 1359 }, { "epoch": 0.01, "grad_norm": 4.869471619730696, "learning_rate": 1.999974333228254e-06, "loss": 1.4982, "step": 1360 }, { "epoch": 0.01, "grad_norm": 5.23329639876476, "learning_rate": 1.9999742951894867e-06, "loss": 1.4118, "step": 1361 }, { "epoch": 0.01, "grad_norm": 4.664165574708151, "learning_rate": 1.9999742571225534e-06, "loss": 1.4708, "step": 1362 }, { "epoch": 0.01, "grad_norm": 4.5907041789083705, "learning_rate": 1.999974219027454e-06, "loss": 1.4604, "step": 1363 }, { "epoch": 0.01, "grad_norm": 4.940547135047092, "learning_rate": 1.999974180904189e-06, "loss": 1.4571, "step": 1364 }, { "epoch": 0.01, "grad_norm": 5.234304088486649, "learning_rate": 1.9999741427527577e-06, "loss": 1.4042, "step": 1365 }, { "epoch": 0.01, "grad_norm": 5.075734957191831, "learning_rate": 1.9999741045731605e-06, "loss": 1.4649, "step": 1366 }, { "epoch": 0.01, "grad_norm": 4.545302713138621, "learning_rate": 1.999974066365398e-06, "loss": 1.3497, "step": 1367 }, { "epoch": 0.01, "grad_norm": 4.55221088039147, "learning_rate": 1.9999740281294687e-06, "loss": 1.342, "step": 1368 }, { "epoch": 0.01, "grad_norm": 4.6125447731949185, "learning_rate": 1.9999739898653736e-06, "loss": 1.4971, "step": 1369 }, { "epoch": 0.01, "grad_norm": 9.07610846685092, "learning_rate": 1.999973951573113e-06, "loss": 1.5226, "step": 1370 }, { "epoch": 0.01, "grad_norm": 5.318700322087232, "learning_rate": 1.999973913252686e-06, "loss": 1.4651, "step": 1371 }, { "epoch": 0.01, "grad_norm": 4.537300685685728, "learning_rate": 1.999973874904093e-06, "loss": 1.2609, "step": 1372 }, { "epoch": 0.01, "grad_norm": 5.193376063176117, "learning_rate": 1.9999738365273346e-06, "loss": 1.5545, "step": 1373 }, { "epoch": 0.01, "grad_norm": 5.273750459431588, "learning_rate": 1.99997379812241e-06, "loss": 1.4439, "step": 1374 }, { "epoch": 0.01, "grad_norm": 4.595241626337541, "learning_rate": 1.9999737596893198e-06, "loss": 1.4768, "step": 1375 }, { "epoch": 0.01, "grad_norm": 5.259906353516755, "learning_rate": 1.9999737212280634e-06, "loss": 1.5442, "step": 1376 }, { "epoch": 0.01, "grad_norm": 7.201372009358403, "learning_rate": 1.999973682738641e-06, "loss": 1.368, "step": 1377 }, { "epoch": 0.01, "grad_norm": 4.977174887944729, "learning_rate": 1.9999736442210523e-06, "loss": 1.469, "step": 1378 }, { "epoch": 0.01, "grad_norm": 4.782568509322288, "learning_rate": 1.999973605675298e-06, "loss": 1.4649, "step": 1379 }, { "epoch": 0.01, "grad_norm": 4.9003324240880515, "learning_rate": 1.999973567101378e-06, "loss": 1.4912, "step": 1380 }, { "epoch": 0.01, "grad_norm": 4.443690275403413, "learning_rate": 1.9999735284992916e-06, "loss": 1.3528, "step": 1381 }, { "epoch": 0.01, "grad_norm": 5.103063174767599, "learning_rate": 1.9999734898690395e-06, "loss": 1.5669, "step": 1382 }, { "epoch": 0.01, "grad_norm": 4.799160062093434, "learning_rate": 1.999973451210622e-06, "loss": 1.4682, "step": 1383 }, { "epoch": 0.01, "grad_norm": 4.89475639433693, "learning_rate": 1.9999734125240377e-06, "loss": 1.5212, "step": 1384 }, { "epoch": 0.01, "grad_norm": 4.458491045381737, "learning_rate": 1.999973373809288e-06, "loss": 1.2801, "step": 1385 }, { "epoch": 0.01, "grad_norm": 5.308111135518119, "learning_rate": 1.999973335066372e-06, "loss": 1.4821, "step": 1386 }, { "epoch": 0.01, "grad_norm": 4.548350666808942, "learning_rate": 1.9999732962952905e-06, "loss": 1.4494, "step": 1387 }, { "epoch": 0.01, "eval_loss": 1.6605433225631714, "eval_runtime": 4.645, "eval_samples_per_second": 1.938, "eval_steps_per_second": 1.076, "step": 1387 }, { "epoch": 0.01, "grad_norm": 5.872935272294134, "learning_rate": 1.999973257496043e-06, "loss": 1.5121, "step": 1388 }, { "epoch": 0.01, "grad_norm": 4.421100493500692, "learning_rate": 1.999973218668629e-06, "loss": 1.4706, "step": 1389 }, { "epoch": 0.01, "grad_norm": 4.651724509052579, "learning_rate": 1.99997317981305e-06, "loss": 1.3924, "step": 1390 }, { "epoch": 0.01, "grad_norm": 8.375754147106289, "learning_rate": 1.9999731409293047e-06, "loss": 1.6767, "step": 1391 }, { "epoch": 0.01, "grad_norm": 4.849899617039911, "learning_rate": 1.9999731020173934e-06, "loss": 1.4921, "step": 1392 }, { "epoch": 0.01, "grad_norm": 5.47624020449905, "learning_rate": 1.999973063077316e-06, "loss": 1.4762, "step": 1393 }, { "epoch": 0.01, "grad_norm": 4.357532784726696, "learning_rate": 1.999973024109073e-06, "loss": 1.3477, "step": 1394 }, { "epoch": 0.01, "grad_norm": 4.627544754203488, "learning_rate": 1.999972985112664e-06, "loss": 1.4234, "step": 1395 }, { "epoch": 0.01, "grad_norm": 4.782545342694373, "learning_rate": 1.999972946088089e-06, "loss": 1.3345, "step": 1396 }, { "epoch": 0.01, "grad_norm": 4.918312434237096, "learning_rate": 1.999972907035348e-06, "loss": 1.4585, "step": 1397 }, { "epoch": 0.01, "grad_norm": 4.74711940647535, "learning_rate": 1.9999728679544412e-06, "loss": 1.5588, "step": 1398 }, { "epoch": 0.01, "grad_norm": 6.756574444935555, "learning_rate": 1.999972828845369e-06, "loss": 1.5253, "step": 1399 }, { "epoch": 0.01, "grad_norm": 4.825155321890688, "learning_rate": 1.99997278970813e-06, "loss": 1.4386, "step": 1400 }, { "epoch": 0.01, "grad_norm": 5.893816643811335, "learning_rate": 1.9999727505427257e-06, "loss": 1.2703, "step": 1401 }, { "epoch": 0.01, "grad_norm": 5.271403904383769, "learning_rate": 1.999972711349155e-06, "loss": 1.332, "step": 1402 }, { "epoch": 0.01, "grad_norm": 5.094872014502599, "learning_rate": 1.999972672127419e-06, "loss": 1.4158, "step": 1403 }, { "epoch": 0.01, "grad_norm": 4.785102973962074, "learning_rate": 1.9999726328775166e-06, "loss": 1.4165, "step": 1404 }, { "epoch": 0.01, "grad_norm": 7.7983947530400535, "learning_rate": 1.9999725935994485e-06, "loss": 1.4353, "step": 1405 }, { "epoch": 0.01, "grad_norm": 14.646416220267023, "learning_rate": 1.9999725542932145e-06, "loss": 1.3781, "step": 1406 }, { "epoch": 0.01, "grad_norm": 5.037078769476571, "learning_rate": 1.9999725149588146e-06, "loss": 1.3702, "step": 1407 }, { "epoch": 0.01, "grad_norm": 4.9356881553322545, "learning_rate": 1.9999724755962486e-06, "loss": 1.5146, "step": 1408 }, { "epoch": 0.01, "grad_norm": 8.717638454921318, "learning_rate": 1.999972436205517e-06, "loss": 1.2775, "step": 1409 }, { "epoch": 0.01, "grad_norm": 4.998493967980832, "learning_rate": 1.9999723967866197e-06, "loss": 1.3866, "step": 1410 }, { "epoch": 0.01, "grad_norm": 4.7378104129164464, "learning_rate": 1.9999723573395564e-06, "loss": 1.32, "step": 1411 }, { "epoch": 0.01, "grad_norm": 5.599759309706441, "learning_rate": 1.9999723178643266e-06, "loss": 1.5095, "step": 1412 }, { "epoch": 0.01, "grad_norm": 4.876824835044584, "learning_rate": 1.9999722783609313e-06, "loss": 1.283, "step": 1413 }, { "epoch": 0.01, "grad_norm": 4.37874420839293, "learning_rate": 1.99997223882937e-06, "loss": 1.3672, "step": 1414 }, { "epoch": 0.01, "grad_norm": 6.233951321610936, "learning_rate": 1.999972199269643e-06, "loss": 1.5104, "step": 1415 }, { "epoch": 0.01, "grad_norm": 4.6266662891163195, "learning_rate": 1.99997215968175e-06, "loss": 1.422, "step": 1416 }, { "epoch": 0.01, "grad_norm": 5.264895342929483, "learning_rate": 1.9999721200656912e-06, "loss": 1.3256, "step": 1417 }, { "epoch": 0.01, "grad_norm": 5.126522085971787, "learning_rate": 1.9999720804214665e-06, "loss": 1.5388, "step": 1418 }, { "epoch": 0.01, "grad_norm": 5.0904993436779655, "learning_rate": 1.999972040749076e-06, "loss": 1.6427, "step": 1419 }, { "epoch": 0.01, "grad_norm": 4.557223446909417, "learning_rate": 1.999972001048519e-06, "loss": 1.4148, "step": 1420 }, { "epoch": 0.01, "grad_norm": 4.7803697348815914, "learning_rate": 1.999971961319797e-06, "loss": 1.5555, "step": 1421 }, { "epoch": 0.01, "grad_norm": 4.800921782137216, "learning_rate": 1.9999719215629085e-06, "loss": 1.4876, "step": 1422 }, { "epoch": 0.01, "grad_norm": 6.9994358843439, "learning_rate": 1.999971881777855e-06, "loss": 1.4667, "step": 1423 }, { "epoch": 0.01, "grad_norm": 4.464016639284502, "learning_rate": 1.9999718419646343e-06, "loss": 1.2817, "step": 1424 }, { "epoch": 0.01, "grad_norm": 4.791881059083396, "learning_rate": 1.9999718021232487e-06, "loss": 1.3099, "step": 1425 }, { "epoch": 0.01, "grad_norm": 5.046707728618333, "learning_rate": 1.9999717622536963e-06, "loss": 1.3161, "step": 1426 }, { "epoch": 0.01, "grad_norm": 4.941858775320615, "learning_rate": 1.9999717223559788e-06, "loss": 1.4566, "step": 1427 }, { "epoch": 0.01, "grad_norm": 5.008676930939444, "learning_rate": 1.9999716824300953e-06, "loss": 1.4165, "step": 1428 }, { "epoch": 0.01, "grad_norm": 5.280497134099123, "learning_rate": 1.999971642476046e-06, "loss": 1.4435, "step": 1429 }, { "epoch": 0.01, "grad_norm": 4.899113042250123, "learning_rate": 1.999971602493831e-06, "loss": 1.4571, "step": 1430 }, { "epoch": 0.01, "grad_norm": 5.250624631469121, "learning_rate": 1.9999715624834494e-06, "loss": 1.4758, "step": 1431 }, { "epoch": 0.01, "grad_norm": 4.646881476231068, "learning_rate": 1.999971522444902e-06, "loss": 1.581, "step": 1432 }, { "epoch": 0.01, "grad_norm": 6.1511243697760705, "learning_rate": 1.999971482378189e-06, "loss": 1.3673, "step": 1433 }, { "epoch": 0.01, "grad_norm": 4.963495678210403, "learning_rate": 1.9999714422833108e-06, "loss": 1.4672, "step": 1434 }, { "epoch": 0.01, "grad_norm": 4.879620074849547, "learning_rate": 1.999971402160266e-06, "loss": 1.4164, "step": 1435 }, { "epoch": 0.01, "grad_norm": 4.393239464097148, "learning_rate": 1.9999713620090556e-06, "loss": 1.2775, "step": 1436 }, { "epoch": 0.01, "grad_norm": 5.371614688959383, "learning_rate": 1.999971321829679e-06, "loss": 1.3789, "step": 1437 }, { "epoch": 0.01, "grad_norm": 4.859029231950951, "learning_rate": 1.9999712816221366e-06, "loss": 1.2764, "step": 1438 }, { "epoch": 0.01, "grad_norm": 5.978190883538404, "learning_rate": 1.9999712413864284e-06, "loss": 1.5097, "step": 1439 }, { "epoch": 0.01, "grad_norm": 4.859269754595811, "learning_rate": 1.9999712011225546e-06, "loss": 1.3549, "step": 1440 }, { "epoch": 0.01, "grad_norm": 4.598800701532648, "learning_rate": 1.9999711608305144e-06, "loss": 1.4791, "step": 1441 }, { "epoch": 0.01, "grad_norm": 5.233668406367012, "learning_rate": 1.9999711205103087e-06, "loss": 1.5436, "step": 1442 }, { "epoch": 0.01, "grad_norm": 6.585328618747414, "learning_rate": 1.9999710801619375e-06, "loss": 1.4573, "step": 1443 }, { "epoch": 0.01, "grad_norm": 4.975282503115869, "learning_rate": 1.9999710397854e-06, "loss": 1.3631, "step": 1444 }, { "epoch": 0.01, "grad_norm": 6.060428307246658, "learning_rate": 1.9999709993806967e-06, "loss": 1.4522, "step": 1445 }, { "epoch": 0.01, "grad_norm": 5.344016376470663, "learning_rate": 1.999970958947827e-06, "loss": 1.4702, "step": 1446 }, { "epoch": 0.01, "grad_norm": 6.153848988906273, "learning_rate": 1.999970918486792e-06, "loss": 1.5144, "step": 1447 }, { "epoch": 0.01, "grad_norm": 5.957762192580373, "learning_rate": 1.9999708779975914e-06, "loss": 1.5009, "step": 1448 }, { "epoch": 0.01, "grad_norm": 4.791562507924204, "learning_rate": 1.999970837480225e-06, "loss": 1.3169, "step": 1449 }, { "epoch": 0.01, "grad_norm": 4.757399799969445, "learning_rate": 1.9999707969346922e-06, "loss": 1.4207, "step": 1450 }, { "epoch": 0.01, "grad_norm": 4.939646292701733, "learning_rate": 1.9999707563609937e-06, "loss": 1.5557, "step": 1451 }, { "epoch": 0.01, "grad_norm": 5.213045670839742, "learning_rate": 1.9999707157591296e-06, "loss": 1.4483, "step": 1452 }, { "epoch": 0.01, "grad_norm": 4.262064916528809, "learning_rate": 1.999970675129099e-06, "loss": 1.395, "step": 1453 }, { "epoch": 0.01, "grad_norm": 4.936947311373785, "learning_rate": 1.999970634470903e-06, "loss": 1.4634, "step": 1454 }, { "epoch": 0.01, "grad_norm": 4.362006992850526, "learning_rate": 1.9999705937845413e-06, "loss": 1.2776, "step": 1455 }, { "epoch": 0.01, "grad_norm": 4.643820891960016, "learning_rate": 1.9999705530700138e-06, "loss": 1.2722, "step": 1456 }, { "epoch": 0.01, "grad_norm": 4.949308239528696, "learning_rate": 1.9999705123273203e-06, "loss": 1.5015, "step": 1457 }, { "epoch": 0.01, "grad_norm": 6.259335084505092, "learning_rate": 1.999970471556461e-06, "loss": 1.4663, "step": 1458 }, { "epoch": 0.01, "grad_norm": 5.267704451871602, "learning_rate": 1.9999704307574355e-06, "loss": 1.4835, "step": 1459 }, { "epoch": 0.01, "grad_norm": 5.251718898458907, "learning_rate": 1.9999703899302446e-06, "loss": 1.3878, "step": 1460 }, { "epoch": 0.01, "eval_loss": 1.6609848737716675, "eval_runtime": 4.6315, "eval_samples_per_second": 1.943, "eval_steps_per_second": 1.08, "step": 1460 }, { "epoch": 0.01, "grad_norm": 4.908979692543608, "learning_rate": 1.9999703490748877e-06, "loss": 1.4374, "step": 1461 }, { "epoch": 0.01, "grad_norm": 5.071932947528256, "learning_rate": 1.999970308191365e-06, "loss": 1.5086, "step": 1462 }, { "epoch": 0.01, "grad_norm": 4.519660989077838, "learning_rate": 1.9999702672796765e-06, "loss": 1.3081, "step": 1463 }, { "epoch": 0.01, "grad_norm": 5.423290223247818, "learning_rate": 1.999970226339822e-06, "loss": 1.5415, "step": 1464 }, { "epoch": 0.01, "grad_norm": 4.276573152012557, "learning_rate": 1.999970185371802e-06, "loss": 1.3497, "step": 1465 }, { "epoch": 0.01, "grad_norm": 4.873791431302587, "learning_rate": 1.9999701443756155e-06, "loss": 1.3745, "step": 1466 }, { "epoch": 0.01, "grad_norm": 4.9749684436238315, "learning_rate": 1.9999701033512637e-06, "loss": 1.4114, "step": 1467 }, { "epoch": 0.01, "grad_norm": 4.646373215103361, "learning_rate": 1.999970062298746e-06, "loss": 1.4112, "step": 1468 }, { "epoch": 0.01, "grad_norm": 4.864454376294363, "learning_rate": 1.9999700212180626e-06, "loss": 1.3469, "step": 1469 }, { "epoch": 0.01, "grad_norm": 4.937829789499327, "learning_rate": 1.999969980109213e-06, "loss": 1.5815, "step": 1470 }, { "epoch": 0.01, "grad_norm": 4.639905252344842, "learning_rate": 1.9999699389721977e-06, "loss": 1.3537, "step": 1471 }, { "epoch": 0.01, "grad_norm": 4.78299140918665, "learning_rate": 1.999969897807017e-06, "loss": 1.4826, "step": 1472 }, { "epoch": 0.01, "grad_norm": 4.793192549086234, "learning_rate": 1.9999698566136697e-06, "loss": 1.436, "step": 1473 }, { "epoch": 0.01, "grad_norm": 4.792423303861806, "learning_rate": 1.999969815392157e-06, "loss": 1.4389, "step": 1474 }, { "epoch": 0.01, "grad_norm": 8.817682445317553, "learning_rate": 1.9999697741424787e-06, "loss": 1.3081, "step": 1475 }, { "epoch": 0.01, "grad_norm": 4.763447598313762, "learning_rate": 1.9999697328646345e-06, "loss": 1.419, "step": 1476 }, { "epoch": 0.01, "grad_norm": 5.553254419945732, "learning_rate": 1.9999696915586243e-06, "loss": 1.5496, "step": 1477 }, { "epoch": 0.01, "grad_norm": 6.8449155578371785, "learning_rate": 1.999969650224448e-06, "loss": 1.4168, "step": 1478 }, { "epoch": 0.01, "grad_norm": 4.936042392237835, "learning_rate": 1.999969608862106e-06, "loss": 1.4295, "step": 1479 }, { "epoch": 0.01, "grad_norm": 4.692520927812083, "learning_rate": 1.9999695674715985e-06, "loss": 1.4999, "step": 1480 }, { "epoch": 0.01, "grad_norm": 4.618917093841338, "learning_rate": 1.9999695260529253e-06, "loss": 1.4672, "step": 1481 }, { "epoch": 0.01, "grad_norm": 4.540512172018715, "learning_rate": 1.999969484606086e-06, "loss": 1.4142, "step": 1482 }, { "epoch": 0.01, "grad_norm": 6.257964249055953, "learning_rate": 1.999969443131081e-06, "loss": 1.5545, "step": 1483 }, { "epoch": 0.01, "grad_norm": 5.361199744025823, "learning_rate": 1.99996940162791e-06, "loss": 1.4301, "step": 1484 }, { "epoch": 0.01, "grad_norm": 4.762262068823818, "learning_rate": 1.9999693600965733e-06, "loss": 1.3145, "step": 1485 }, { "epoch": 0.01, "grad_norm": 4.507782598355478, "learning_rate": 1.9999693185370708e-06, "loss": 1.346, "step": 1486 }, { "epoch": 0.01, "grad_norm": 5.20616920405698, "learning_rate": 1.9999692769494027e-06, "loss": 1.3654, "step": 1487 }, { "epoch": 0.01, "grad_norm": 4.9199475980705385, "learning_rate": 1.9999692353335686e-06, "loss": 1.4154, "step": 1488 }, { "epoch": 0.01, "grad_norm": 4.789863379301854, "learning_rate": 1.9999691936895685e-06, "loss": 1.4946, "step": 1489 }, { "epoch": 0.01, "grad_norm": 5.2869321337326225, "learning_rate": 1.9999691520174025e-06, "loss": 1.3195, "step": 1490 }, { "epoch": 0.01, "grad_norm": 5.757207256999973, "learning_rate": 1.999969110317071e-06, "loss": 1.6459, "step": 1491 }, { "epoch": 0.01, "grad_norm": 4.856756819873707, "learning_rate": 1.999969068588574e-06, "loss": 1.3793, "step": 1492 }, { "epoch": 0.01, "grad_norm": 5.1406066928499605, "learning_rate": 1.999969026831911e-06, "loss": 1.6314, "step": 1493 }, { "epoch": 0.01, "grad_norm": 4.339460225757653, "learning_rate": 1.9999689850470814e-06, "loss": 1.2897, "step": 1494 }, { "epoch": 0.01, "grad_norm": 5.118439798561242, "learning_rate": 1.999968943234087e-06, "loss": 1.3565, "step": 1495 }, { "epoch": 0.01, "grad_norm": 5.239267360652984, "learning_rate": 1.9999689013929264e-06, "loss": 1.3343, "step": 1496 }, { "epoch": 0.01, "grad_norm": 4.413381580899369, "learning_rate": 1.9999688595236003e-06, "loss": 1.3758, "step": 1497 }, { "epoch": 0.01, "grad_norm": 5.032731231134136, "learning_rate": 1.999968817626108e-06, "loss": 1.4616, "step": 1498 }, { "epoch": 0.01, "grad_norm": 5.181684854242491, "learning_rate": 1.99996877570045e-06, "loss": 1.5541, "step": 1499 }, { "epoch": 0.01, "grad_norm": 4.529939429132134, "learning_rate": 1.9999687337466264e-06, "loss": 1.4291, "step": 1500 }, { "epoch": 0.01, "grad_norm": 6.083882159448674, "learning_rate": 1.9999686917646365e-06, "loss": 1.527, "step": 1501 }, { "epoch": 0.01, "grad_norm": 5.290245063554081, "learning_rate": 1.999968649754481e-06, "loss": 1.4742, "step": 1502 }, { "epoch": 0.01, "grad_norm": 4.717789412166867, "learning_rate": 1.9999686077161605e-06, "loss": 1.4765, "step": 1503 }, { "epoch": 0.01, "grad_norm": 4.545533558985268, "learning_rate": 1.999968565649673e-06, "loss": 1.306, "step": 1504 }, { "epoch": 0.01, "grad_norm": 4.728947694233556, "learning_rate": 1.9999685235550206e-06, "loss": 1.3828, "step": 1505 }, { "epoch": 0.01, "grad_norm": 4.662992982900369, "learning_rate": 1.999968481432202e-06, "loss": 1.4146, "step": 1506 }, { "epoch": 0.01, "grad_norm": 6.039232945794986, "learning_rate": 1.9999684392812178e-06, "loss": 1.5415, "step": 1507 }, { "epoch": 0.01, "grad_norm": 5.039673874673663, "learning_rate": 1.999968397102068e-06, "loss": 1.3505, "step": 1508 }, { "epoch": 0.01, "grad_norm": 5.7877009049623025, "learning_rate": 1.9999683548947523e-06, "loss": 1.4874, "step": 1509 }, { "epoch": 0.01, "grad_norm": 4.641088327797642, "learning_rate": 1.9999683126592705e-06, "loss": 1.4175, "step": 1510 }, { "epoch": 0.01, "grad_norm": 4.831959829153163, "learning_rate": 1.999968270395623e-06, "loss": 1.4567, "step": 1511 }, { "epoch": 0.01, "grad_norm": 5.699928337044365, "learning_rate": 1.99996822810381e-06, "loss": 1.392, "step": 1512 }, { "epoch": 0.01, "grad_norm": 4.608406926276191, "learning_rate": 1.9999681857838308e-06, "loss": 1.4505, "step": 1513 }, { "epoch": 0.01, "grad_norm": 4.867903533500069, "learning_rate": 1.9999681434356863e-06, "loss": 1.3949, "step": 1514 }, { "epoch": 0.01, "grad_norm": 4.944562213455762, "learning_rate": 1.9999681010593754e-06, "loss": 1.2331, "step": 1515 }, { "epoch": 0.01, "grad_norm": 5.2571821063164474, "learning_rate": 1.9999680586548995e-06, "loss": 1.5634, "step": 1516 }, { "epoch": 0.01, "grad_norm": 4.907465548338297, "learning_rate": 1.999968016222257e-06, "loss": 1.3562, "step": 1517 }, { "epoch": 0.01, "grad_norm": 4.276801096528798, "learning_rate": 1.9999679737614493e-06, "loss": 1.1667, "step": 1518 }, { "epoch": 0.01, "grad_norm": 5.073955336024891, "learning_rate": 1.999967931272476e-06, "loss": 1.5431, "step": 1519 }, { "epoch": 0.01, "grad_norm": 5.183148059254548, "learning_rate": 1.9999678887553364e-06, "loss": 1.5844, "step": 1520 }, { "epoch": 0.01, "grad_norm": 4.591593727529927, "learning_rate": 1.9999678462100315e-06, "loss": 1.4007, "step": 1521 }, { "epoch": 0.01, "grad_norm": 4.83807070074554, "learning_rate": 1.99996780363656e-06, "loss": 1.4673, "step": 1522 }, { "epoch": 0.01, "grad_norm": 5.564201157030677, "learning_rate": 1.9999677610349238e-06, "loss": 1.4229, "step": 1523 }, { "epoch": 0.01, "grad_norm": 6.506652623301389, "learning_rate": 1.9999677184051214e-06, "loss": 1.4544, "step": 1524 }, { "epoch": 0.01, "grad_norm": 5.110861074492502, "learning_rate": 1.999967675747153e-06, "loss": 1.4452, "step": 1525 }, { "epoch": 0.01, "grad_norm": 4.550965773070968, "learning_rate": 1.999967633061019e-06, "loss": 1.39, "step": 1526 }, { "epoch": 0.01, "grad_norm": 15.977881886992321, "learning_rate": 1.9999675903467193e-06, "loss": 1.5182, "step": 1527 }, { "epoch": 0.01, "grad_norm": 4.886960525204347, "learning_rate": 1.999967547604254e-06, "loss": 1.3334, "step": 1528 }, { "epoch": 0.01, "grad_norm": 5.035221458256328, "learning_rate": 1.9999675048336225e-06, "loss": 1.5484, "step": 1529 }, { "epoch": 0.01, "grad_norm": 5.127298418854591, "learning_rate": 1.9999674620348256e-06, "loss": 1.5482, "step": 1530 }, { "epoch": 0.01, "grad_norm": 7.393021516760661, "learning_rate": 1.9999674192078627e-06, "loss": 1.3923, "step": 1531 }, { "epoch": 0.01, "grad_norm": 4.597322785666696, "learning_rate": 1.9999673763527343e-06, "loss": 1.4507, "step": 1532 }, { "epoch": 0.01, "grad_norm": 4.759409590128979, "learning_rate": 1.99996733346944e-06, "loss": 1.328, "step": 1533 }, { "epoch": 0.01, "eval_loss": 1.6537013053894043, "eval_runtime": 4.6319, "eval_samples_per_second": 1.943, "eval_steps_per_second": 1.079, "step": 1533 }, { "epoch": 0.01, "grad_norm": 5.004922521706707, "learning_rate": 1.99996729055798e-06, "loss": 1.3943, "step": 1534 }, { "epoch": 0.01, "grad_norm": 4.645165024309459, "learning_rate": 1.999967247618354e-06, "loss": 1.3322, "step": 1535 }, { "epoch": 0.01, "grad_norm": 4.999318863504048, "learning_rate": 1.9999672046505628e-06, "loss": 1.5395, "step": 1536 }, { "epoch": 0.01, "grad_norm": 4.86361645206254, "learning_rate": 1.9999671616546054e-06, "loss": 1.5212, "step": 1537 }, { "epoch": 0.01, "grad_norm": 5.153420847923612, "learning_rate": 1.9999671186304825e-06, "loss": 1.5427, "step": 1538 }, { "epoch": 0.01, "grad_norm": 5.0394644534786135, "learning_rate": 1.9999670755781936e-06, "loss": 1.2973, "step": 1539 }, { "epoch": 0.01, "grad_norm": 4.633634743770092, "learning_rate": 1.9999670324977388e-06, "loss": 1.5085, "step": 1540 }, { "epoch": 0.01, "grad_norm": 8.357519744982609, "learning_rate": 1.999966989389119e-06, "loss": 1.3645, "step": 1541 }, { "epoch": 0.01, "grad_norm": 4.802169847384737, "learning_rate": 1.9999669462523325e-06, "loss": 1.5779, "step": 1542 }, { "epoch": 0.01, "grad_norm": 4.668930926325562, "learning_rate": 1.999966903087381e-06, "loss": 1.4067, "step": 1543 }, { "epoch": 0.01, "grad_norm": 4.43999149148277, "learning_rate": 1.9999668598942636e-06, "loss": 1.2107, "step": 1544 }, { "epoch": 0.01, "grad_norm": 4.802567954958664, "learning_rate": 1.99996681667298e-06, "loss": 1.4996, "step": 1545 }, { "epoch": 0.01, "grad_norm": 5.043050576777043, "learning_rate": 1.999966773423531e-06, "loss": 1.4864, "step": 1546 }, { "epoch": 0.01, "grad_norm": 4.838293228729252, "learning_rate": 1.9999667301459164e-06, "loss": 1.5079, "step": 1547 }, { "epoch": 0.01, "grad_norm": 4.8303265370783235, "learning_rate": 1.999966686840136e-06, "loss": 1.6446, "step": 1548 }, { "epoch": 0.01, "grad_norm": 5.4088008516017005, "learning_rate": 1.9999666435061896e-06, "loss": 1.4987, "step": 1549 }, { "epoch": 0.01, "grad_norm": 4.441236329740386, "learning_rate": 1.9999666001440776e-06, "loss": 1.4233, "step": 1550 }, { "epoch": 0.01, "grad_norm": 4.854304337374531, "learning_rate": 1.9999665567538e-06, "loss": 1.5069, "step": 1551 }, { "epoch": 0.01, "grad_norm": 4.651751008584424, "learning_rate": 1.9999665133353567e-06, "loss": 1.4673, "step": 1552 }, { "epoch": 0.01, "grad_norm": 4.650463359683428, "learning_rate": 1.9999664698887473e-06, "loss": 1.4744, "step": 1553 }, { "epoch": 0.01, "grad_norm": 6.294289457136488, "learning_rate": 1.999966426413973e-06, "loss": 1.7254, "step": 1554 }, { "epoch": 0.01, "grad_norm": 5.026568380786172, "learning_rate": 1.9999663829110324e-06, "loss": 1.574, "step": 1555 }, { "epoch": 0.01, "grad_norm": 5.258151411074215, "learning_rate": 1.999966339379926e-06, "loss": 1.3902, "step": 1556 }, { "epoch": 0.01, "grad_norm": 4.636549612306679, "learning_rate": 1.999966295820654e-06, "loss": 1.3695, "step": 1557 }, { "epoch": 0.01, "grad_norm": 5.752515784505921, "learning_rate": 1.999966252233216e-06, "loss": 1.5815, "step": 1558 }, { "epoch": 0.01, "grad_norm": 5.150495198663999, "learning_rate": 1.9999662086176125e-06, "loss": 1.4708, "step": 1559 }, { "epoch": 0.01, "grad_norm": 4.696102556808951, "learning_rate": 1.9999661649738435e-06, "loss": 1.3237, "step": 1560 }, { "epoch": 0.01, "grad_norm": 4.487064828854456, "learning_rate": 1.9999661213019085e-06, "loss": 1.367, "step": 1561 }, { "epoch": 0.01, "grad_norm": 4.674090800118098, "learning_rate": 1.9999660776018076e-06, "loss": 1.3795, "step": 1562 }, { "epoch": 0.01, "grad_norm": 4.884374781206321, "learning_rate": 1.9999660338735415e-06, "loss": 1.4984, "step": 1563 }, { "epoch": 0.01, "grad_norm": 4.8404936570617, "learning_rate": 1.9999659901171095e-06, "loss": 1.4553, "step": 1564 }, { "epoch": 0.01, "grad_norm": 5.0291200173626, "learning_rate": 1.9999659463325115e-06, "loss": 1.4493, "step": 1565 }, { "epoch": 0.01, "grad_norm": 4.489038208651526, "learning_rate": 1.999965902519748e-06, "loss": 1.4916, "step": 1566 }, { "epoch": 0.01, "grad_norm": 5.325129603155862, "learning_rate": 1.999965858678819e-06, "loss": 1.4466, "step": 1567 }, { "epoch": 0.01, "grad_norm": 4.845547041345931, "learning_rate": 1.999965814809724e-06, "loss": 1.4709, "step": 1568 }, { "epoch": 0.01, "grad_norm": 4.6323997434080955, "learning_rate": 1.999965770912463e-06, "loss": 1.2593, "step": 1569 }, { "epoch": 0.01, "grad_norm": 5.263469443114738, "learning_rate": 1.999965726987037e-06, "loss": 1.4309, "step": 1570 }, { "epoch": 0.01, "grad_norm": 4.7842493763504, "learning_rate": 1.9999656830334447e-06, "loss": 1.5784, "step": 1571 }, { "epoch": 0.01, "grad_norm": 5.581499884497734, "learning_rate": 1.999965639051687e-06, "loss": 1.4264, "step": 1572 }, { "epoch": 0.01, "grad_norm": 4.713589565139688, "learning_rate": 1.9999655950417635e-06, "loss": 1.3676, "step": 1573 }, { "epoch": 0.01, "grad_norm": 5.224520331971595, "learning_rate": 1.9999655510036744e-06, "loss": 1.5145, "step": 1574 }, { "epoch": 0.01, "grad_norm": 5.044254687476926, "learning_rate": 1.9999655069374193e-06, "loss": 1.382, "step": 1575 }, { "epoch": 0.01, "grad_norm": 4.998067415237716, "learning_rate": 1.999965462842999e-06, "loss": 1.5149, "step": 1576 }, { "epoch": 0.01, "grad_norm": 5.376451146617778, "learning_rate": 1.9999654187204126e-06, "loss": 1.4605, "step": 1577 }, { "epoch": 0.01, "grad_norm": 5.62147315713685, "learning_rate": 1.9999653745696605e-06, "loss": 1.5111, "step": 1578 }, { "epoch": 0.01, "grad_norm": 5.386593885174143, "learning_rate": 1.999965330390743e-06, "loss": 1.5952, "step": 1579 }, { "epoch": 0.01, "grad_norm": 4.6360487402587784, "learning_rate": 1.999965286183659e-06, "loss": 1.3293, "step": 1580 }, { "epoch": 0.01, "grad_norm": 4.759388015554171, "learning_rate": 1.9999652419484104e-06, "loss": 1.3158, "step": 1581 }, { "epoch": 0.01, "grad_norm": 4.720044228281999, "learning_rate": 1.9999651976849957e-06, "loss": 1.5035, "step": 1582 }, { "epoch": 0.01, "grad_norm": 4.571250672292211, "learning_rate": 1.999965153393415e-06, "loss": 1.3919, "step": 1583 }, { "epoch": 0.01, "grad_norm": 4.512671101809202, "learning_rate": 1.9999651090736693e-06, "loss": 1.3005, "step": 1584 }, { "epoch": 0.01, "grad_norm": 5.311600498717284, "learning_rate": 1.999965064725757e-06, "loss": 1.5076, "step": 1585 }, { "epoch": 0.01, "grad_norm": 5.4380918504811815, "learning_rate": 1.9999650203496794e-06, "loss": 1.5773, "step": 1586 }, { "epoch": 0.01, "grad_norm": 4.821728820130227, "learning_rate": 1.999964975945436e-06, "loss": 1.4923, "step": 1587 }, { "epoch": 0.01, "grad_norm": 4.701896651829335, "learning_rate": 1.9999649315130274e-06, "loss": 1.3324, "step": 1588 }, { "epoch": 0.01, "grad_norm": 4.826124943857317, "learning_rate": 1.9999648870524526e-06, "loss": 1.3942, "step": 1589 }, { "epoch": 0.01, "grad_norm": 5.252549552664715, "learning_rate": 1.9999648425637123e-06, "loss": 1.4413, "step": 1590 }, { "epoch": 0.01, "grad_norm": 4.33848280401789, "learning_rate": 1.9999647980468065e-06, "loss": 1.3627, "step": 1591 }, { "epoch": 0.01, "grad_norm": 6.232495688477331, "learning_rate": 1.999964753501735e-06, "loss": 1.371, "step": 1592 }, { "epoch": 0.01, "grad_norm": 4.559527771002142, "learning_rate": 1.9999647089284974e-06, "loss": 1.4434, "step": 1593 }, { "epoch": 0.01, "grad_norm": 4.70894162267966, "learning_rate": 1.9999646643270945e-06, "loss": 1.4673, "step": 1594 }, { "epoch": 0.01, "grad_norm": 5.676920877976759, "learning_rate": 1.999964619697526e-06, "loss": 1.4621, "step": 1595 }, { "epoch": 0.01, "grad_norm": 6.232169194197823, "learning_rate": 1.9999645750397918e-06, "loss": 1.4599, "step": 1596 }, { "epoch": 0.01, "grad_norm": 4.693130001219432, "learning_rate": 1.9999645303538914e-06, "loss": 1.5755, "step": 1597 }, { "epoch": 0.01, "grad_norm": 4.616061797152542, "learning_rate": 1.9999644856398255e-06, "loss": 1.525, "step": 1598 }, { "epoch": 0.01, "grad_norm": 5.082035414984437, "learning_rate": 1.999964440897594e-06, "loss": 1.4748, "step": 1599 }, { "epoch": 0.01, "grad_norm": 5.530227379151722, "learning_rate": 1.999964396127197e-06, "loss": 1.4081, "step": 1600 }, { "epoch": 0.01, "grad_norm": 11.714257093847134, "learning_rate": 1.9999643513286343e-06, "loss": 1.4166, "step": 1601 }, { "epoch": 0.01, "grad_norm": 5.188422008714217, "learning_rate": 1.999964306501906e-06, "loss": 1.3882, "step": 1602 }, { "epoch": 0.01, "grad_norm": 5.037793578152508, "learning_rate": 1.999964261647012e-06, "loss": 1.4275, "step": 1603 }, { "epoch": 0.01, "grad_norm": 5.775212123850943, "learning_rate": 1.9999642167639523e-06, "loss": 1.6035, "step": 1604 }, { "epoch": 0.01, "grad_norm": 5.478124459351955, "learning_rate": 1.9999641718527268e-06, "loss": 1.3335, "step": 1605 }, { "epoch": 0.01, "grad_norm": 5.055970312783448, "learning_rate": 1.9999641269133357e-06, "loss": 1.5906, "step": 1606 }, { "epoch": 0.01, "eval_loss": 1.6535860300064087, "eval_runtime": 4.6131, "eval_samples_per_second": 1.951, "eval_steps_per_second": 1.084, "step": 1606 }, { "epoch": 0.01, "grad_norm": 4.7695927713899335, "learning_rate": 1.9999640819457787e-06, "loss": 1.4862, "step": 1607 }, { "epoch": 0.01, "grad_norm": 4.631860591367596, "learning_rate": 1.9999640369500566e-06, "loss": 1.3437, "step": 1608 }, { "epoch": 0.01, "grad_norm": 5.184619042074156, "learning_rate": 1.9999639919261685e-06, "loss": 1.3648, "step": 1609 }, { "epoch": 0.01, "grad_norm": 4.273228198303511, "learning_rate": 1.999963946874115e-06, "loss": 1.2881, "step": 1610 }, { "epoch": 0.01, "grad_norm": 5.019924351174469, "learning_rate": 1.9999639017938953e-06, "loss": 1.4094, "step": 1611 }, { "epoch": 0.01, "grad_norm": 6.796825605500974, "learning_rate": 1.99996385668551e-06, "loss": 1.4356, "step": 1612 }, { "epoch": 0.01, "grad_norm": 7.039100317720765, "learning_rate": 1.9999638115489595e-06, "loss": 1.507, "step": 1613 }, { "epoch": 0.01, "grad_norm": 4.760579560458608, "learning_rate": 1.9999637663842433e-06, "loss": 1.2805, "step": 1614 }, { "epoch": 0.01, "grad_norm": 5.289967418587153, "learning_rate": 1.999963721191361e-06, "loss": 1.5173, "step": 1615 }, { "epoch": 0.01, "grad_norm": 4.905881261758454, "learning_rate": 1.9999636759703135e-06, "loss": 1.4773, "step": 1616 }, { "epoch": 0.01, "grad_norm": 4.601006920242404, "learning_rate": 1.9999636307211002e-06, "loss": 1.4564, "step": 1617 }, { "epoch": 0.01, "grad_norm": 4.623950290324913, "learning_rate": 1.9999635854437214e-06, "loss": 1.5243, "step": 1618 }, { "epoch": 0.01, "grad_norm": 5.797135728663394, "learning_rate": 1.9999635401381767e-06, "loss": 1.6152, "step": 1619 }, { "epoch": 0.01, "grad_norm": 4.494921471069805, "learning_rate": 1.9999634948044664e-06, "loss": 1.3623, "step": 1620 }, { "epoch": 0.01, "grad_norm": 5.786655122435106, "learning_rate": 1.9999634494425906e-06, "loss": 1.3719, "step": 1621 }, { "epoch": 0.01, "grad_norm": 6.5772063648225405, "learning_rate": 1.9999634040525492e-06, "loss": 1.6291, "step": 1622 }, { "epoch": 0.01, "grad_norm": 4.797055299395125, "learning_rate": 1.999963358634342e-06, "loss": 1.4054, "step": 1623 }, { "epoch": 0.01, "grad_norm": 4.881210206846928, "learning_rate": 1.999963313187969e-06, "loss": 1.3922, "step": 1624 }, { "epoch": 0.01, "grad_norm": 4.7416575099427405, "learning_rate": 1.9999632677134306e-06, "loss": 1.3498, "step": 1625 }, { "epoch": 0.01, "grad_norm": 5.094353766754562, "learning_rate": 1.9999632222107262e-06, "loss": 1.4907, "step": 1626 }, { "epoch": 0.01, "grad_norm": 5.136116876869801, "learning_rate": 1.9999631766798568e-06, "loss": 1.4637, "step": 1627 }, { "epoch": 0.01, "grad_norm": 4.842724728052074, "learning_rate": 1.9999631311208213e-06, "loss": 1.5065, "step": 1628 }, { "epoch": 0.01, "grad_norm": 4.470556712718268, "learning_rate": 1.9999630855336203e-06, "loss": 1.4163, "step": 1629 }, { "epoch": 0.01, "grad_norm": 5.4357945417775015, "learning_rate": 1.999963039918254e-06, "loss": 1.5127, "step": 1630 }, { "epoch": 0.01, "grad_norm": 5.283784706095175, "learning_rate": 1.9999629942747213e-06, "loss": 1.5162, "step": 1631 }, { "epoch": 0.01, "grad_norm": 6.171430776129922, "learning_rate": 1.9999629486030233e-06, "loss": 1.4975, "step": 1632 }, { "epoch": 0.01, "grad_norm": 4.883035958460619, "learning_rate": 1.9999629029031597e-06, "loss": 1.5646, "step": 1633 }, { "epoch": 0.01, "grad_norm": 4.868003718244226, "learning_rate": 1.999962857175131e-06, "loss": 1.4711, "step": 1634 }, { "epoch": 0.01, "grad_norm": 4.688207116642901, "learning_rate": 1.999962811418936e-06, "loss": 1.235, "step": 1635 }, { "epoch": 0.01, "grad_norm": 4.779397974493194, "learning_rate": 1.9999627656345753e-06, "loss": 1.4816, "step": 1636 }, { "epoch": 0.01, "grad_norm": 4.787958938157594, "learning_rate": 1.9999627198220496e-06, "loss": 1.3246, "step": 1637 }, { "epoch": 0.01, "grad_norm": 5.294916108987848, "learning_rate": 1.999962673981358e-06, "loss": 1.4679, "step": 1638 }, { "epoch": 0.01, "grad_norm": 4.849618123624191, "learning_rate": 1.9999626281125002e-06, "loss": 1.4614, "step": 1639 }, { "epoch": 0.01, "grad_norm": 4.677011611090417, "learning_rate": 1.9999625822154775e-06, "loss": 1.3398, "step": 1640 }, { "epoch": 0.01, "grad_norm": 4.707575677733155, "learning_rate": 1.999962536290289e-06, "loss": 1.4338, "step": 1641 }, { "epoch": 0.01, "grad_norm": 5.1791912413051815, "learning_rate": 1.999962490336935e-06, "loss": 1.6644, "step": 1642 }, { "epoch": 0.01, "grad_norm": 4.636258199528408, "learning_rate": 1.999962444355415e-06, "loss": 1.3262, "step": 1643 }, { "epoch": 0.01, "grad_norm": 9.631481491029284, "learning_rate": 1.9999623983457297e-06, "loss": 1.3487, "step": 1644 }, { "epoch": 0.01, "grad_norm": 5.406573344850598, "learning_rate": 1.9999623523078784e-06, "loss": 1.4415, "step": 1645 }, { "epoch": 0.01, "grad_norm": 4.936420173298707, "learning_rate": 1.999962306241862e-06, "loss": 1.5284, "step": 1646 }, { "epoch": 0.01, "grad_norm": 7.408101415603079, "learning_rate": 1.9999622601476796e-06, "loss": 1.5724, "step": 1647 }, { "epoch": 0.01, "grad_norm": 4.91033489810393, "learning_rate": 1.999962214025332e-06, "loss": 1.4389, "step": 1648 }, { "epoch": 0.01, "grad_norm": 4.7490093326999965, "learning_rate": 1.9999621678748186e-06, "loss": 1.4127, "step": 1649 }, { "epoch": 0.01, "grad_norm": 5.02865055772892, "learning_rate": 1.999962121696139e-06, "loss": 1.5252, "step": 1650 }, { "epoch": 0.01, "grad_norm": 4.941820991121326, "learning_rate": 1.9999620754892946e-06, "loss": 1.5824, "step": 1651 }, { "epoch": 0.01, "grad_norm": 4.584733328893442, "learning_rate": 1.9999620292542845e-06, "loss": 1.4465, "step": 1652 }, { "epoch": 0.01, "grad_norm": 4.668886241818989, "learning_rate": 1.9999619829911084e-06, "loss": 1.5671, "step": 1653 }, { "epoch": 0.01, "grad_norm": 4.8595287693689455, "learning_rate": 1.9999619366997673e-06, "loss": 1.4179, "step": 1654 }, { "epoch": 0.01, "grad_norm": 4.505667922340234, "learning_rate": 1.99996189038026e-06, "loss": 1.3717, "step": 1655 }, { "epoch": 0.01, "grad_norm": 4.854733211903465, "learning_rate": 1.999961844032587e-06, "loss": 1.5526, "step": 1656 }, { "epoch": 0.01, "grad_norm": 6.052863076519862, "learning_rate": 1.999961797656749e-06, "loss": 1.5102, "step": 1657 }, { "epoch": 0.01, "grad_norm": 4.391873652615683, "learning_rate": 1.999961751252745e-06, "loss": 1.313, "step": 1658 }, { "epoch": 0.01, "grad_norm": 4.905334859088026, "learning_rate": 1.9999617048205758e-06, "loss": 1.4915, "step": 1659 }, { "epoch": 0.01, "grad_norm": 4.90511142571514, "learning_rate": 1.9999616583602405e-06, "loss": 1.3705, "step": 1660 }, { "epoch": 0.01, "grad_norm": 5.696590131572682, "learning_rate": 1.99996161187174e-06, "loss": 1.3774, "step": 1661 }, { "epoch": 0.01, "grad_norm": 4.901553847339917, "learning_rate": 1.999961565355074e-06, "loss": 1.4858, "step": 1662 }, { "epoch": 0.01, "grad_norm": 5.080014172214551, "learning_rate": 1.9999615188102415e-06, "loss": 1.3527, "step": 1663 }, { "epoch": 0.01, "grad_norm": 5.286819461235812, "learning_rate": 1.999961472237244e-06, "loss": 1.5443, "step": 1664 }, { "epoch": 0.01, "grad_norm": 5.083890130164741, "learning_rate": 1.999961425636081e-06, "loss": 1.4417, "step": 1665 }, { "epoch": 0.01, "grad_norm": 4.8993271722773235, "learning_rate": 1.9999613790067526e-06, "loss": 1.4137, "step": 1666 }, { "epoch": 0.01, "grad_norm": 5.130884125485172, "learning_rate": 1.9999613323492586e-06, "loss": 1.5135, "step": 1667 }, { "epoch": 0.01, "grad_norm": 4.560714520466941, "learning_rate": 1.9999612856635986e-06, "loss": 1.2326, "step": 1668 }, { "epoch": 0.01, "grad_norm": 4.469864395669994, "learning_rate": 1.9999612389497735e-06, "loss": 1.3952, "step": 1669 }, { "epoch": 0.01, "grad_norm": 5.0576333320329425, "learning_rate": 1.9999611922077824e-06, "loss": 1.4062, "step": 1670 }, { "epoch": 0.01, "grad_norm": 4.550473014043539, "learning_rate": 1.999961145437626e-06, "loss": 1.3792, "step": 1671 }, { "epoch": 0.01, "grad_norm": 4.749509524045929, "learning_rate": 1.9999610986393037e-06, "loss": 1.4487, "step": 1672 }, { "epoch": 0.01, "grad_norm": 5.365020045883757, "learning_rate": 1.999961051812816e-06, "loss": 1.4974, "step": 1673 }, { "epoch": 0.01, "grad_norm": 4.624114515501705, "learning_rate": 1.9999610049581627e-06, "loss": 1.388, "step": 1674 }, { "epoch": 0.01, "grad_norm": 4.510101567218574, "learning_rate": 1.999960958075344e-06, "loss": 1.3547, "step": 1675 }, { "epoch": 0.01, "grad_norm": 5.07080055515291, "learning_rate": 1.9999609111643596e-06, "loss": 1.4748, "step": 1676 }, { "epoch": 0.01, "grad_norm": 4.8404551347295985, "learning_rate": 1.9999608642252094e-06, "loss": 1.4301, "step": 1677 }, { "epoch": 0.01, "grad_norm": 5.636720567872511, "learning_rate": 1.999960817257894e-06, "loss": 1.351, "step": 1678 }, { "epoch": 0.01, "grad_norm": 4.652209867428249, "learning_rate": 1.999960770262413e-06, "loss": 1.4968, "step": 1679 }, { "epoch": 0.01, "eval_loss": 1.6513854265213013, "eval_runtime": 4.6169, "eval_samples_per_second": 1.949, "eval_steps_per_second": 1.083, "step": 1679 }, { "epoch": 0.01, "grad_norm": 4.725930228784767, "learning_rate": 1.999960723238766e-06, "loss": 1.409, "step": 1680 }, { "epoch": 0.01, "grad_norm": 4.954953461456659, "learning_rate": 1.9999606761869537e-06, "loss": 1.4448, "step": 1681 }, { "epoch": 0.01, "grad_norm": 5.255376181858065, "learning_rate": 1.9999606291069757e-06, "loss": 1.3616, "step": 1682 }, { "epoch": 0.01, "grad_norm": 4.709113389093899, "learning_rate": 1.9999605819988326e-06, "loss": 1.4076, "step": 1683 }, { "epoch": 0.01, "grad_norm": 5.766680383528532, "learning_rate": 1.9999605348625236e-06, "loss": 1.2462, "step": 1684 }, { "epoch": 0.01, "grad_norm": 4.409591894443744, "learning_rate": 1.999960487698049e-06, "loss": 1.4091, "step": 1685 }, { "epoch": 0.01, "grad_norm": 4.886994914850528, "learning_rate": 1.9999604405054093e-06, "loss": 1.4295, "step": 1686 }, { "epoch": 0.01, "grad_norm": 5.933768715491502, "learning_rate": 1.9999603932846036e-06, "loss": 1.3117, "step": 1687 }, { "epoch": 0.01, "grad_norm": 4.404087310320486, "learning_rate": 1.9999603460356324e-06, "loss": 1.4082, "step": 1688 }, { "epoch": 0.01, "grad_norm": 6.132686826605006, "learning_rate": 1.9999602987584952e-06, "loss": 1.4659, "step": 1689 }, { "epoch": 0.01, "grad_norm": 4.9809185718468205, "learning_rate": 1.999960251453193e-06, "loss": 1.5435, "step": 1690 }, { "epoch": 0.01, "grad_norm": 5.719735058251011, "learning_rate": 1.999960204119725e-06, "loss": 1.3241, "step": 1691 }, { "epoch": 0.01, "grad_norm": 5.016858779358495, "learning_rate": 1.9999601567580917e-06, "loss": 1.3907, "step": 1692 }, { "epoch": 0.01, "grad_norm": 5.518980098450484, "learning_rate": 1.999960109368293e-06, "loss": 1.5071, "step": 1693 }, { "epoch": 0.01, "grad_norm": 4.56680393314666, "learning_rate": 1.9999600619503284e-06, "loss": 1.277, "step": 1694 }, { "epoch": 0.01, "grad_norm": 5.221095667195953, "learning_rate": 1.9999600145041984e-06, "loss": 1.5565, "step": 1695 }, { "epoch": 0.01, "grad_norm": 4.838855837516108, "learning_rate": 1.9999599670299024e-06, "loss": 1.5075, "step": 1696 }, { "epoch": 0.01, "grad_norm": 5.181585087689867, "learning_rate": 1.9999599195274414e-06, "loss": 1.4358, "step": 1697 }, { "epoch": 0.01, "grad_norm": 4.775117548464921, "learning_rate": 1.9999598719968148e-06, "loss": 1.3676, "step": 1698 }, { "epoch": 0.01, "grad_norm": 4.675367411416997, "learning_rate": 1.9999598244380226e-06, "loss": 1.4902, "step": 1699 }, { "epoch": 0.01, "grad_norm": 4.534748298661586, "learning_rate": 1.999959776851065e-06, "loss": 1.3374, "step": 1700 }, { "epoch": 0.01, "grad_norm": 6.092872429354068, "learning_rate": 1.9999597292359413e-06, "loss": 1.3938, "step": 1701 }, { "epoch": 0.01, "grad_norm": 5.1363468139465835, "learning_rate": 1.9999596815926525e-06, "loss": 1.4641, "step": 1702 }, { "epoch": 0.01, "grad_norm": 4.679526003354978, "learning_rate": 1.9999596339211982e-06, "loss": 1.4826, "step": 1703 }, { "epoch": 0.01, "grad_norm": 5.409980918956625, "learning_rate": 1.999959586221578e-06, "loss": 1.459, "step": 1704 }, { "epoch": 0.01, "grad_norm": 4.506106987900174, "learning_rate": 1.9999595384937926e-06, "loss": 1.4694, "step": 1705 }, { "epoch": 0.01, "grad_norm": 5.139999841003741, "learning_rate": 1.9999594907378412e-06, "loss": 1.497, "step": 1706 }, { "epoch": 0.01, "grad_norm": 4.966305341143314, "learning_rate": 1.999959442953725e-06, "loss": 1.4117, "step": 1707 }, { "epoch": 0.01, "grad_norm": 11.81613488843635, "learning_rate": 1.9999593951414427e-06, "loss": 1.6682, "step": 1708 }, { "epoch": 0.01, "grad_norm": 5.377588601757201, "learning_rate": 1.999959347300995e-06, "loss": 1.3065, "step": 1709 }, { "epoch": 0.01, "grad_norm": 6.023397142367646, "learning_rate": 1.999959299432382e-06, "loss": 1.4304, "step": 1710 }, { "epoch": 0.01, "grad_norm": 4.675493936243796, "learning_rate": 1.9999592515356035e-06, "loss": 1.2623, "step": 1711 }, { "epoch": 0.01, "grad_norm": 4.86058388014828, "learning_rate": 1.9999592036106593e-06, "loss": 1.4823, "step": 1712 }, { "epoch": 0.01, "grad_norm": 4.556345518412603, "learning_rate": 1.9999591556575496e-06, "loss": 1.4409, "step": 1713 }, { "epoch": 0.01, "grad_norm": 7.066520926861509, "learning_rate": 1.9999591076762744e-06, "loss": 1.3931, "step": 1714 }, { "epoch": 0.01, "grad_norm": 5.416558606687308, "learning_rate": 1.9999590596668336e-06, "loss": 1.5513, "step": 1715 }, { "epoch": 0.01, "grad_norm": 4.566717284747848, "learning_rate": 1.9999590116292273e-06, "loss": 1.4196, "step": 1716 }, { "epoch": 0.01, "grad_norm": 4.774215759192912, "learning_rate": 1.9999589635634554e-06, "loss": 1.2179, "step": 1717 }, { "epoch": 0.01, "grad_norm": 5.871975120855977, "learning_rate": 1.999958915469518e-06, "loss": 1.3091, "step": 1718 }, { "epoch": 0.01, "grad_norm": 4.667731197924079, "learning_rate": 1.999958867347415e-06, "loss": 1.4477, "step": 1719 }, { "epoch": 0.01, "grad_norm": 8.363688597457996, "learning_rate": 1.9999588191971466e-06, "loss": 1.4968, "step": 1720 }, { "epoch": 0.01, "grad_norm": 6.524755329539832, "learning_rate": 1.999958771018713e-06, "loss": 1.5737, "step": 1721 }, { "epoch": 0.01, "grad_norm": 6.844327612634827, "learning_rate": 1.9999587228121135e-06, "loss": 1.5503, "step": 1722 }, { "epoch": 0.01, "grad_norm": 4.545944405243781, "learning_rate": 1.999958674577349e-06, "loss": 1.227, "step": 1723 }, { "epoch": 0.01, "grad_norm": 5.569019618596105, "learning_rate": 1.999958626314418e-06, "loss": 1.3704, "step": 1724 }, { "epoch": 0.01, "grad_norm": 4.840943336713329, "learning_rate": 1.9999585780233224e-06, "loss": 1.5026, "step": 1725 }, { "epoch": 0.01, "grad_norm": 5.04053411151362, "learning_rate": 1.9999585297040607e-06, "loss": 1.5642, "step": 1726 }, { "epoch": 0.01, "grad_norm": 5.6356258265297985, "learning_rate": 1.999958481356634e-06, "loss": 1.5226, "step": 1727 }, { "epoch": 0.01, "grad_norm": 4.640699731713796, "learning_rate": 1.9999584329810415e-06, "loss": 1.3299, "step": 1728 }, { "epoch": 0.01, "grad_norm": 8.472243462527622, "learning_rate": 1.9999583845772836e-06, "loss": 1.4519, "step": 1729 }, { "epoch": 0.01, "grad_norm": 7.611553303462497, "learning_rate": 1.99995833614536e-06, "loss": 1.4071, "step": 1730 }, { "epoch": 0.01, "grad_norm": 4.944281632533813, "learning_rate": 1.999958287685271e-06, "loss": 1.2544, "step": 1731 }, { "epoch": 0.01, "grad_norm": 4.6772733567861176, "learning_rate": 1.9999582391970166e-06, "loss": 1.3753, "step": 1732 }, { "epoch": 0.01, "grad_norm": 4.263194033542804, "learning_rate": 1.999958190680597e-06, "loss": 1.2437, "step": 1733 }, { "epoch": 0.01, "grad_norm": 6.654617300777728, "learning_rate": 1.9999581421360114e-06, "loss": 1.3443, "step": 1734 }, { "epoch": 0.01, "grad_norm": 5.17256863920859, "learning_rate": 1.9999580935632606e-06, "loss": 1.5056, "step": 1735 }, { "epoch": 0.01, "grad_norm": 5.491095887490403, "learning_rate": 1.999958044962344e-06, "loss": 1.32, "step": 1736 }, { "epoch": 0.01, "grad_norm": 5.391828873655179, "learning_rate": 1.999957996333262e-06, "loss": 1.342, "step": 1737 }, { "epoch": 0.01, "grad_norm": 4.966305374933118, "learning_rate": 1.999957947676015e-06, "loss": 1.3956, "step": 1738 }, { "epoch": 0.01, "grad_norm": 4.995740008299241, "learning_rate": 1.999957898990602e-06, "loss": 1.5289, "step": 1739 }, { "epoch": 0.01, "grad_norm": 4.381812845234866, "learning_rate": 1.9999578502770235e-06, "loss": 1.2572, "step": 1740 }, { "epoch": 0.01, "grad_norm": 4.61281321178712, "learning_rate": 1.9999578015352795e-06, "loss": 1.2066, "step": 1741 }, { "epoch": 0.01, "grad_norm": 4.441293918501137, "learning_rate": 1.99995775276537e-06, "loss": 1.3348, "step": 1742 }, { "epoch": 0.01, "grad_norm": 5.187132615217963, "learning_rate": 1.9999577039672954e-06, "loss": 1.5402, "step": 1743 }, { "epoch": 0.01, "grad_norm": 4.842621185034447, "learning_rate": 1.9999576551410553e-06, "loss": 1.3723, "step": 1744 }, { "epoch": 0.01, "grad_norm": 4.4907842747894895, "learning_rate": 1.9999576062866496e-06, "loss": 1.3577, "step": 1745 }, { "epoch": 0.01, "grad_norm": 4.71227076890854, "learning_rate": 1.999957557404078e-06, "loss": 1.3866, "step": 1746 }, { "epoch": 0.01, "grad_norm": 5.279369312058059, "learning_rate": 1.999957508493341e-06, "loss": 1.4093, "step": 1747 }, { "epoch": 0.01, "grad_norm": 4.646829536444046, "learning_rate": 1.9999574595544392e-06, "loss": 1.4286, "step": 1748 }, { "epoch": 0.01, "grad_norm": 4.593261228259904, "learning_rate": 1.9999574105873714e-06, "loss": 1.4716, "step": 1749 }, { "epoch": 0.01, "grad_norm": 5.0886616589245, "learning_rate": 1.999957361592138e-06, "loss": 1.6598, "step": 1750 }, { "epoch": 0.01, "grad_norm": 4.753829674453155, "learning_rate": 1.9999573125687395e-06, "loss": 1.3023, "step": 1751 }, { "epoch": 0.01, "grad_norm": 7.911294692900917, "learning_rate": 1.9999572635171754e-06, "loss": 1.4902, "step": 1752 }, { "epoch": 0.01, "eval_loss": 1.6478168964385986, "eval_runtime": 4.6461, "eval_samples_per_second": 1.937, "eval_steps_per_second": 1.076, "step": 1752 }, { "epoch": 0.01, "grad_norm": 4.700414649164786, "learning_rate": 1.999957214437446e-06, "loss": 1.482, "step": 1753 }, { "epoch": 0.01, "grad_norm": 4.9281821284726615, "learning_rate": 1.9999571653295507e-06, "loss": 1.306, "step": 1754 }, { "epoch": 0.01, "grad_norm": 4.787957118049158, "learning_rate": 1.99995711619349e-06, "loss": 1.3873, "step": 1755 }, { "epoch": 0.01, "grad_norm": 4.631636647001677, "learning_rate": 1.999957067029264e-06, "loss": 1.4089, "step": 1756 }, { "epoch": 0.01, "grad_norm": 4.754702249860543, "learning_rate": 1.9999570178368725e-06, "loss": 1.4074, "step": 1757 }, { "epoch": 0.01, "grad_norm": 5.069526479435708, "learning_rate": 1.9999569686163157e-06, "loss": 1.4378, "step": 1758 }, { "epoch": 0.01, "grad_norm": 4.9557014533023835, "learning_rate": 1.9999569193675933e-06, "loss": 1.3766, "step": 1759 }, { "epoch": 0.01, "grad_norm": 4.889794648818529, "learning_rate": 1.9999568700907053e-06, "loss": 1.5023, "step": 1760 }, { "epoch": 0.01, "grad_norm": 4.6039304576772, "learning_rate": 1.9999568207856523e-06, "loss": 1.3601, "step": 1761 }, { "epoch": 0.01, "grad_norm": 4.626450487742758, "learning_rate": 1.9999567714524337e-06, "loss": 1.3328, "step": 1762 }, { "epoch": 0.01, "grad_norm": 4.935339476011332, "learning_rate": 1.999956722091049e-06, "loss": 1.4359, "step": 1763 }, { "epoch": 0.01, "grad_norm": 5.059440744798255, "learning_rate": 1.9999566727014994e-06, "loss": 1.4256, "step": 1764 }, { "epoch": 0.01, "grad_norm": 4.856890423684793, "learning_rate": 1.999956623283784e-06, "loss": 1.3171, "step": 1765 }, { "epoch": 0.01, "grad_norm": 5.243931477116334, "learning_rate": 1.999956573837904e-06, "loss": 1.369, "step": 1766 }, { "epoch": 0.01, "grad_norm": 6.998539551416535, "learning_rate": 1.9999565243638576e-06, "loss": 1.439, "step": 1767 }, { "epoch": 0.01, "grad_norm": 4.836947003615344, "learning_rate": 1.999956474861646e-06, "loss": 1.4922, "step": 1768 }, { "epoch": 0.01, "grad_norm": 5.000816070522979, "learning_rate": 1.999956425331269e-06, "loss": 1.5798, "step": 1769 }, { "epoch": 0.01, "grad_norm": 5.162103021341265, "learning_rate": 1.9999563757727267e-06, "loss": 1.4635, "step": 1770 }, { "epoch": 0.01, "grad_norm": 4.754537659429389, "learning_rate": 1.999956326186019e-06, "loss": 1.4111, "step": 1771 }, { "epoch": 0.01, "grad_norm": 5.0189050672383, "learning_rate": 1.999956276571146e-06, "loss": 1.332, "step": 1772 }, { "epoch": 0.01, "grad_norm": 4.489911679774594, "learning_rate": 1.999956226928107e-06, "loss": 1.3697, "step": 1773 }, { "epoch": 0.01, "grad_norm": 5.139806714207968, "learning_rate": 1.999956177256903e-06, "loss": 1.4717, "step": 1774 }, { "epoch": 0.01, "grad_norm": 4.74100968652323, "learning_rate": 1.9999561275575333e-06, "loss": 1.4658, "step": 1775 }, { "epoch": 0.01, "grad_norm": 4.879178648686916, "learning_rate": 1.9999560778299984e-06, "loss": 1.4636, "step": 1776 }, { "epoch": 0.01, "grad_norm": 4.891078192372891, "learning_rate": 1.999956028074298e-06, "loss": 1.5723, "step": 1777 }, { "epoch": 0.01, "grad_norm": 5.653962016600203, "learning_rate": 1.999955978290432e-06, "loss": 1.5495, "step": 1778 }, { "epoch": 0.01, "grad_norm": 5.707164030187875, "learning_rate": 1.9999559284784005e-06, "loss": 1.64, "step": 1779 }, { "epoch": 0.01, "grad_norm": 4.454220280449505, "learning_rate": 1.999955878638204e-06, "loss": 1.336, "step": 1780 }, { "epoch": 0.01, "grad_norm": 4.593524682406172, "learning_rate": 1.9999558287698418e-06, "loss": 1.387, "step": 1781 }, { "epoch": 0.01, "grad_norm": 5.265214222108159, "learning_rate": 1.999955778873314e-06, "loss": 1.5196, "step": 1782 }, { "epoch": 0.01, "grad_norm": 4.8572705429997525, "learning_rate": 1.999955728948621e-06, "loss": 1.3624, "step": 1783 }, { "epoch": 0.01, "grad_norm": 4.789880139543268, "learning_rate": 1.9999556789957625e-06, "loss": 1.4488, "step": 1784 }, { "epoch": 0.01, "grad_norm": 6.621010536150032, "learning_rate": 1.9999556290147386e-06, "loss": 1.4625, "step": 1785 }, { "epoch": 0.01, "grad_norm": 5.383083285855116, "learning_rate": 1.999955579005549e-06, "loss": 1.4843, "step": 1786 }, { "epoch": 0.01, "grad_norm": 4.724436780612743, "learning_rate": 1.9999555289681946e-06, "loss": 1.3203, "step": 1787 }, { "epoch": 0.01, "grad_norm": 5.410873580372611, "learning_rate": 1.999955478902674e-06, "loss": 1.5224, "step": 1788 }, { "epoch": 0.01, "grad_norm": 5.290692087535022, "learning_rate": 1.9999554288089885e-06, "loss": 1.6514, "step": 1789 }, { "epoch": 0.01, "grad_norm": 4.756689362126989, "learning_rate": 1.9999553786871377e-06, "loss": 1.4927, "step": 1790 }, { "epoch": 0.01, "grad_norm": 5.336049328185806, "learning_rate": 1.9999553285371214e-06, "loss": 1.4515, "step": 1791 }, { "epoch": 0.01, "grad_norm": 5.194031618429659, "learning_rate": 1.999955278358939e-06, "loss": 1.5161, "step": 1792 }, { "epoch": 0.01, "grad_norm": 6.0237033121897845, "learning_rate": 1.9999552281525923e-06, "loss": 1.4292, "step": 1793 }, { "epoch": 0.01, "grad_norm": 4.482776055356631, "learning_rate": 1.9999551779180794e-06, "loss": 1.402, "step": 1794 }, { "epoch": 0.01, "grad_norm": 6.737389022108268, "learning_rate": 1.9999551276554014e-06, "loss": 1.5099, "step": 1795 }, { "epoch": 0.01, "grad_norm": 4.878318638241726, "learning_rate": 1.999955077364558e-06, "loss": 1.529, "step": 1796 }, { "epoch": 0.01, "grad_norm": 4.54561426906885, "learning_rate": 1.999955027045549e-06, "loss": 1.451, "step": 1797 }, { "epoch": 0.01, "grad_norm": 4.560824329023515, "learning_rate": 1.999954976698375e-06, "loss": 1.3774, "step": 1798 }, { "epoch": 0.01, "grad_norm": 4.798803868724221, "learning_rate": 1.9999549263230347e-06, "loss": 1.4121, "step": 1799 }, { "epoch": 0.01, "grad_norm": 6.383273482325183, "learning_rate": 1.99995487591953e-06, "loss": 1.4391, "step": 1800 }, { "epoch": 0.01, "grad_norm": 4.4091547065978425, "learning_rate": 1.9999548254878595e-06, "loss": 1.2512, "step": 1801 }, { "epoch": 0.01, "grad_norm": 4.838374021260141, "learning_rate": 1.999954775028023e-06, "loss": 1.4316, "step": 1802 }, { "epoch": 0.01, "grad_norm": 5.461055735973189, "learning_rate": 1.999954724540022e-06, "loss": 1.5318, "step": 1803 }, { "epoch": 0.01, "grad_norm": 5.633136261316013, "learning_rate": 1.9999546740238554e-06, "loss": 1.5676, "step": 1804 }, { "epoch": 0.01, "grad_norm": 4.303894260809806, "learning_rate": 1.9999546234795233e-06, "loss": 1.3127, "step": 1805 }, { "epoch": 0.01, "grad_norm": 5.072682679306013, "learning_rate": 1.9999545729070256e-06, "loss": 1.655, "step": 1806 }, { "epoch": 0.01, "grad_norm": 4.8324040981937815, "learning_rate": 1.999954522306363e-06, "loss": 1.4801, "step": 1807 }, { "epoch": 0.01, "grad_norm": 4.656940444517245, "learning_rate": 1.9999544716775345e-06, "loss": 1.3215, "step": 1808 }, { "epoch": 0.01, "grad_norm": 5.390009789147993, "learning_rate": 1.9999544210205406e-06, "loss": 1.4592, "step": 1809 }, { "epoch": 0.01, "grad_norm": 4.190582805213608, "learning_rate": 1.9999543703353816e-06, "loss": 1.2857, "step": 1810 }, { "epoch": 0.01, "grad_norm": 4.814450823568367, "learning_rate": 1.999954319622057e-06, "loss": 1.3498, "step": 1811 }, { "epoch": 0.01, "grad_norm": 4.868225374359218, "learning_rate": 1.9999542688805674e-06, "loss": 1.414, "step": 1812 }, { "epoch": 0.01, "grad_norm": 4.711417209971132, "learning_rate": 1.9999542181109123e-06, "loss": 1.355, "step": 1813 }, { "epoch": 0.01, "grad_norm": 5.007722725809002, "learning_rate": 1.9999541673130915e-06, "loss": 1.4713, "step": 1814 }, { "epoch": 0.01, "grad_norm": 4.8626655741355, "learning_rate": 1.9999541164871053e-06, "loss": 1.4264, "step": 1815 }, { "epoch": 0.01, "grad_norm": 4.723792462194043, "learning_rate": 1.999954065632954e-06, "loss": 1.4045, "step": 1816 }, { "epoch": 0.01, "grad_norm": 8.00100690699963, "learning_rate": 1.9999540147506374e-06, "loss": 1.4712, "step": 1817 }, { "epoch": 0.01, "grad_norm": 12.253942127148356, "learning_rate": 1.9999539638401553e-06, "loss": 1.4985, "step": 1818 }, { "epoch": 0.01, "grad_norm": 5.246538314584338, "learning_rate": 1.9999539129015073e-06, "loss": 1.4158, "step": 1819 }, { "epoch": 0.01, "grad_norm": 5.063213658151458, "learning_rate": 1.9999538619346947e-06, "loss": 1.4696, "step": 1820 }, { "epoch": 0.01, "grad_norm": 5.3593617360186405, "learning_rate": 1.9999538109397164e-06, "loss": 1.4141, "step": 1821 }, { "epoch": 0.01, "grad_norm": 4.648351531479633, "learning_rate": 1.9999537599165726e-06, "loss": 1.4502, "step": 1822 }, { "epoch": 0.01, "grad_norm": 6.134127084206851, "learning_rate": 1.9999537088652638e-06, "loss": 1.628, "step": 1823 }, { "epoch": 0.01, "grad_norm": 5.20685547745487, "learning_rate": 1.9999536577857893e-06, "loss": 1.2484, "step": 1824 }, { "epoch": 0.01, "grad_norm": 5.404803458184888, "learning_rate": 1.9999536066781494e-06, "loss": 1.4614, "step": 1825 }, { "epoch": 0.01, "eval_loss": 1.6452957391738892, "eval_runtime": 4.6377, "eval_samples_per_second": 1.941, "eval_steps_per_second": 1.078, "step": 1825 }, { "epoch": 0.01, "grad_norm": 5.325255851791553, "learning_rate": 1.9999535555423443e-06, "loss": 1.2913, "step": 1826 }, { "epoch": 0.01, "grad_norm": 4.812379984138036, "learning_rate": 1.9999535043783737e-06, "loss": 1.4321, "step": 1827 }, { "epoch": 0.01, "grad_norm": 4.574060048853807, "learning_rate": 1.999953453186238e-06, "loss": 1.4385, "step": 1828 }, { "epoch": 0.01, "grad_norm": 4.690167042783861, "learning_rate": 1.9999534019659366e-06, "loss": 1.4505, "step": 1829 }, { "epoch": 0.01, "grad_norm": 4.963037184486775, "learning_rate": 1.9999533507174702e-06, "loss": 1.3754, "step": 1830 }, { "epoch": 0.01, "grad_norm": 7.202540422032654, "learning_rate": 1.9999532994408383e-06, "loss": 1.5731, "step": 1831 }, { "epoch": 0.01, "grad_norm": 4.747566281161776, "learning_rate": 1.999953248136041e-06, "loss": 1.35, "step": 1832 }, { "epoch": 0.01, "grad_norm": 5.016639574119175, "learning_rate": 1.9999531968030782e-06, "loss": 1.5687, "step": 1833 }, { "epoch": 0.01, "grad_norm": 5.195599580383433, "learning_rate": 1.99995314544195e-06, "loss": 1.3277, "step": 1834 }, { "epoch": 0.01, "grad_norm": 4.71924385337544, "learning_rate": 1.999953094052657e-06, "loss": 1.3144, "step": 1835 }, { "epoch": 0.01, "grad_norm": 4.830368171047517, "learning_rate": 1.999953042635198e-06, "loss": 1.5285, "step": 1836 }, { "epoch": 0.01, "grad_norm": 8.34374594117172, "learning_rate": 1.999952991189574e-06, "loss": 1.4602, "step": 1837 }, { "epoch": 0.01, "grad_norm": 5.041136604255877, "learning_rate": 1.9999529397157847e-06, "loss": 1.4805, "step": 1838 }, { "epoch": 0.01, "grad_norm": 4.619671333702356, "learning_rate": 1.9999528882138298e-06, "loss": 1.4413, "step": 1839 }, { "epoch": 0.01, "grad_norm": 4.975416181507071, "learning_rate": 1.9999528366837097e-06, "loss": 1.395, "step": 1840 }, { "epoch": 0.01, "grad_norm": 5.04709900703668, "learning_rate": 1.9999527851254245e-06, "loss": 1.4027, "step": 1841 }, { "epoch": 0.01, "grad_norm": 5.834514593532968, "learning_rate": 1.9999527335389733e-06, "loss": 1.359, "step": 1842 }, { "epoch": 0.01, "grad_norm": 5.770095193729858, "learning_rate": 1.9999526819243574e-06, "loss": 1.5645, "step": 1843 }, { "epoch": 0.01, "grad_norm": 5.082331923840744, "learning_rate": 1.9999526302815756e-06, "loss": 1.436, "step": 1844 }, { "epoch": 0.01, "grad_norm": 4.516471050914241, "learning_rate": 1.999952578610629e-06, "loss": 1.338, "step": 1845 }, { "epoch": 0.01, "grad_norm": 4.952184920350597, "learning_rate": 1.999952526911517e-06, "loss": 1.4534, "step": 1846 }, { "epoch": 0.01, "grad_norm": 4.797375795198566, "learning_rate": 1.9999524751842394e-06, "loss": 1.3259, "step": 1847 }, { "epoch": 0.01, "grad_norm": 4.909335461688805, "learning_rate": 1.9999524234287963e-06, "loss": 1.464, "step": 1848 }, { "epoch": 0.01, "grad_norm": 4.7122725461586, "learning_rate": 1.9999523716451884e-06, "loss": 1.4756, "step": 1849 }, { "epoch": 0.01, "grad_norm": 4.836041938475466, "learning_rate": 1.9999523198334146e-06, "loss": 1.4444, "step": 1850 }, { "epoch": 0.01, "grad_norm": 4.42950126517274, "learning_rate": 1.999952267993476e-06, "loss": 1.3947, "step": 1851 }, { "epoch": 0.01, "grad_norm": 15.611894041325478, "learning_rate": 1.9999522161253717e-06, "loss": 1.5477, "step": 1852 }, { "epoch": 0.01, "grad_norm": 4.816487768685785, "learning_rate": 1.999952164229102e-06, "loss": 1.4487, "step": 1853 }, { "epoch": 0.01, "grad_norm": 4.705134898369493, "learning_rate": 1.9999521123046674e-06, "loss": 1.2334, "step": 1854 }, { "epoch": 0.01, "grad_norm": 4.9204906496645675, "learning_rate": 1.999952060352067e-06, "loss": 1.367, "step": 1855 }, { "epoch": 0.01, "grad_norm": 5.302614564464106, "learning_rate": 1.999952008371302e-06, "loss": 1.503, "step": 1856 }, { "epoch": 0.01, "grad_norm": 6.290262594106217, "learning_rate": 1.999951956362371e-06, "loss": 1.6158, "step": 1857 }, { "epoch": 0.01, "grad_norm": 4.832322054118164, "learning_rate": 1.999951904325275e-06, "loss": 1.3938, "step": 1858 }, { "epoch": 0.01, "grad_norm": 5.918339792868505, "learning_rate": 1.9999518522600134e-06, "loss": 1.5048, "step": 1859 }, { "epoch": 0.01, "grad_norm": 4.590992917749576, "learning_rate": 1.999951800166587e-06, "loss": 1.3696, "step": 1860 }, { "epoch": 0.01, "grad_norm": 6.614974294086556, "learning_rate": 1.9999517480449946e-06, "loss": 1.5505, "step": 1861 }, { "epoch": 0.01, "grad_norm": 4.7241251284973576, "learning_rate": 1.9999516958952373e-06, "loss": 1.3711, "step": 1862 }, { "epoch": 0.01, "grad_norm": 11.122789301908437, "learning_rate": 1.999951643717315e-06, "loss": 1.5607, "step": 1863 }, { "epoch": 0.01, "grad_norm": 4.849382872781319, "learning_rate": 1.9999515915112265e-06, "loss": 1.1324, "step": 1864 }, { "epoch": 0.01, "grad_norm": 4.873163835881678, "learning_rate": 1.9999515392769734e-06, "loss": 1.5033, "step": 1865 }, { "epoch": 0.01, "grad_norm": 5.173654117397512, "learning_rate": 1.9999514870145548e-06, "loss": 1.5117, "step": 1866 }, { "epoch": 0.01, "grad_norm": 4.642950123093243, "learning_rate": 1.9999514347239706e-06, "loss": 1.4977, "step": 1867 }, { "epoch": 0.01, "grad_norm": 6.846117886376132, "learning_rate": 1.9999513824052214e-06, "loss": 1.4254, "step": 1868 }, { "epoch": 0.01, "grad_norm": 5.101156060458363, "learning_rate": 1.999951330058307e-06, "loss": 1.5828, "step": 1869 }, { "epoch": 0.01, "grad_norm": 4.688953673802245, "learning_rate": 1.999951277683227e-06, "loss": 1.4109, "step": 1870 }, { "epoch": 0.01, "grad_norm": 4.703112399070017, "learning_rate": 1.999951225279982e-06, "loss": 1.4139, "step": 1871 }, { "epoch": 0.01, "grad_norm": 5.310901288329353, "learning_rate": 1.999951172848572e-06, "loss": 1.6931, "step": 1872 }, { "epoch": 0.01, "grad_norm": 5.741193705049953, "learning_rate": 1.9999511203889957e-06, "loss": 1.5699, "step": 1873 }, { "epoch": 0.01, "grad_norm": 4.6035588106756835, "learning_rate": 1.9999510679012545e-06, "loss": 1.4299, "step": 1874 }, { "epoch": 0.01, "grad_norm": 5.662657586420871, "learning_rate": 1.999951015385348e-06, "loss": 1.3702, "step": 1875 }, { "epoch": 0.01, "grad_norm": 4.8990643238694735, "learning_rate": 1.9999509628412766e-06, "loss": 1.4812, "step": 1876 }, { "epoch": 0.01, "grad_norm": 4.948600983425892, "learning_rate": 1.9999509102690396e-06, "loss": 1.4146, "step": 1877 }, { "epoch": 0.01, "grad_norm": 5.635028291248577, "learning_rate": 1.9999508576686375e-06, "loss": 1.4875, "step": 1878 }, { "epoch": 0.01, "grad_norm": 7.294273694280311, "learning_rate": 1.9999508050400702e-06, "loss": 1.332, "step": 1879 }, { "epoch": 0.01, "grad_norm": 5.184690906135398, "learning_rate": 1.999950752383337e-06, "loss": 1.3164, "step": 1880 }, { "epoch": 0.01, "grad_norm": 4.720673987223821, "learning_rate": 1.999950699698439e-06, "loss": 1.455, "step": 1881 }, { "epoch": 0.01, "grad_norm": 4.367468043629218, "learning_rate": 1.9999506469853757e-06, "loss": 1.3821, "step": 1882 }, { "epoch": 0.01, "grad_norm": 5.599591528503925, "learning_rate": 1.999950594244147e-06, "loss": 1.4663, "step": 1883 }, { "epoch": 0.01, "grad_norm": 4.428585648964182, "learning_rate": 1.999950541474753e-06, "loss": 1.3065, "step": 1884 }, { "epoch": 0.01, "grad_norm": 6.224684072029214, "learning_rate": 1.999950488677194e-06, "loss": 1.414, "step": 1885 }, { "epoch": 0.01, "grad_norm": 4.640404642843782, "learning_rate": 1.9999504358514695e-06, "loss": 1.4714, "step": 1886 }, { "epoch": 0.01, "grad_norm": 4.799354311682633, "learning_rate": 1.9999503829975796e-06, "loss": 1.3347, "step": 1887 }, { "epoch": 0.01, "grad_norm": 4.543001580495537, "learning_rate": 1.9999503301155246e-06, "loss": 1.3697, "step": 1888 }, { "epoch": 0.01, "grad_norm": 4.736298613103733, "learning_rate": 1.999950277205304e-06, "loss": 1.3371, "step": 1889 }, { "epoch": 0.01, "grad_norm": 4.571900592136959, "learning_rate": 1.999950224266919e-06, "loss": 1.385, "step": 1890 }, { "epoch": 0.01, "grad_norm": 6.969693151715899, "learning_rate": 1.9999501713003677e-06, "loss": 1.3239, "step": 1891 }, { "epoch": 0.01, "grad_norm": 4.655737378919456, "learning_rate": 1.999950118305652e-06, "loss": 1.5423, "step": 1892 }, { "epoch": 0.01, "grad_norm": 4.714447970923717, "learning_rate": 1.9999500652827704e-06, "loss": 1.5264, "step": 1893 }, { "epoch": 0.01, "grad_norm": 4.780305107625251, "learning_rate": 1.9999500122317235e-06, "loss": 1.3597, "step": 1894 }, { "epoch": 0.01, "grad_norm": 5.133517088409541, "learning_rate": 1.999949959152512e-06, "loss": 1.3169, "step": 1895 }, { "epoch": 0.01, "grad_norm": 4.924450244844968, "learning_rate": 1.9999499060451342e-06, "loss": 1.4864, "step": 1896 }, { "epoch": 0.01, "grad_norm": 6.275295688447123, "learning_rate": 1.9999498529095924e-06, "loss": 1.5597, "step": 1897 }, { "epoch": 0.01, "grad_norm": 4.7042297343124515, "learning_rate": 1.999949799745884e-06, "loss": 1.31, "step": 1898 }, { "epoch": 0.01, "eval_loss": 1.6416113376617432, "eval_runtime": 4.6259, "eval_samples_per_second": 1.946, "eval_steps_per_second": 1.081, "step": 1898 }, { "epoch": 0.01, "grad_norm": 6.423106928459506, "learning_rate": 1.9999497465540116e-06, "loss": 1.4495, "step": 1899 }, { "epoch": 0.01, "grad_norm": 5.0399225789912245, "learning_rate": 1.999949693333973e-06, "loss": 1.3536, "step": 1900 }, { "epoch": 0.01, "grad_norm": 4.928948443819752, "learning_rate": 1.9999496400857695e-06, "loss": 1.4797, "step": 1901 }, { "epoch": 0.01, "grad_norm": 4.67479846437279, "learning_rate": 1.9999495868094007e-06, "loss": 1.3, "step": 1902 }, { "epoch": 0.01, "grad_norm": 8.103095669421286, "learning_rate": 1.999949533504867e-06, "loss": 1.4622, "step": 1903 }, { "epoch": 0.01, "grad_norm": 5.134179165961643, "learning_rate": 1.9999494801721675e-06, "loss": 1.5459, "step": 1904 }, { "epoch": 0.01, "grad_norm": 5.025236385305749, "learning_rate": 1.999949426811303e-06, "loss": 1.5205, "step": 1905 }, { "epoch": 0.01, "grad_norm": 5.22119763423394, "learning_rate": 1.9999493734222734e-06, "loss": 1.3004, "step": 1906 }, { "epoch": 0.01, "grad_norm": 4.669191447439257, "learning_rate": 1.9999493200050783e-06, "loss": 1.3201, "step": 1907 }, { "epoch": 0.01, "grad_norm": 4.968754828299794, "learning_rate": 1.999949266559718e-06, "loss": 1.4346, "step": 1908 }, { "epoch": 0.01, "grad_norm": 4.487597166469777, "learning_rate": 1.9999492130861926e-06, "loss": 1.4025, "step": 1909 }, { "epoch": 0.01, "grad_norm": 4.879843563485657, "learning_rate": 1.9999491595845017e-06, "loss": 1.3078, "step": 1910 }, { "epoch": 0.01, "grad_norm": 4.910078993452133, "learning_rate": 1.9999491060546456e-06, "loss": 1.5774, "step": 1911 }, { "epoch": 0.01, "grad_norm": 4.647585620113467, "learning_rate": 1.9999490524966245e-06, "loss": 1.5707, "step": 1912 }, { "epoch": 0.01, "grad_norm": 4.567026839443758, "learning_rate": 1.999948998910438e-06, "loss": 1.3951, "step": 1913 }, { "epoch": 0.01, "grad_norm": 4.687048241254901, "learning_rate": 1.9999489452960864e-06, "loss": 1.4327, "step": 1914 }, { "epoch": 0.01, "grad_norm": 4.902103912285808, "learning_rate": 1.9999488916535695e-06, "loss": 1.477, "step": 1915 }, { "epoch": 0.01, "grad_norm": 4.57359910064146, "learning_rate": 1.9999488379828874e-06, "loss": 1.4144, "step": 1916 }, { "epoch": 0.01, "grad_norm": 5.105713137074651, "learning_rate": 1.99994878428404e-06, "loss": 1.4075, "step": 1917 }, { "epoch": 0.01, "grad_norm": 7.001514955863734, "learning_rate": 1.999948730557027e-06, "loss": 1.5653, "step": 1918 }, { "epoch": 0.01, "grad_norm": 5.65944563546782, "learning_rate": 1.9999486768018493e-06, "loss": 1.162, "step": 1919 }, { "epoch": 0.01, "grad_norm": 6.019156415872438, "learning_rate": 1.999948623018506e-06, "loss": 1.6276, "step": 1920 }, { "epoch": 0.01, "grad_norm": 4.677363975537384, "learning_rate": 1.999948569206998e-06, "loss": 1.4125, "step": 1921 }, { "epoch": 0.01, "grad_norm": 5.929950546593268, "learning_rate": 1.999948515367324e-06, "loss": 1.6226, "step": 1922 }, { "epoch": 0.01, "grad_norm": 4.6654710600977, "learning_rate": 1.9999484614994856e-06, "loss": 1.3902, "step": 1923 }, { "epoch": 0.01, "grad_norm": 5.450030912502321, "learning_rate": 1.9999484076034814e-06, "loss": 1.4725, "step": 1924 }, { "epoch": 0.01, "grad_norm": 4.994408282983105, "learning_rate": 1.9999483536793124e-06, "loss": 1.5852, "step": 1925 }, { "epoch": 0.01, "grad_norm": 4.714476017763944, "learning_rate": 1.999948299726978e-06, "loss": 1.4207, "step": 1926 }, { "epoch": 0.01, "grad_norm": 4.767073689418852, "learning_rate": 1.999948245746478e-06, "loss": 1.5189, "step": 1927 }, { "epoch": 0.01, "grad_norm": 4.392481238520719, "learning_rate": 1.9999481917378133e-06, "loss": 1.2504, "step": 1928 }, { "epoch": 0.01, "grad_norm": 4.354120579996221, "learning_rate": 1.9999481377009834e-06, "loss": 1.3287, "step": 1929 }, { "epoch": 0.01, "grad_norm": 4.643549574305054, "learning_rate": 1.999948083635988e-06, "loss": 1.4252, "step": 1930 }, { "epoch": 0.01, "grad_norm": 4.6243115760185605, "learning_rate": 1.9999480295428276e-06, "loss": 1.4642, "step": 1931 }, { "epoch": 0.01, "grad_norm": 5.261417138349398, "learning_rate": 1.9999479754215016e-06, "loss": 1.4865, "step": 1932 }, { "epoch": 0.01, "grad_norm": 4.618981173105002, "learning_rate": 1.999947921272011e-06, "loss": 1.495, "step": 1933 }, { "epoch": 0.01, "grad_norm": 4.958757287653689, "learning_rate": 1.9999478670943546e-06, "loss": 1.4359, "step": 1934 }, { "epoch": 0.01, "grad_norm": 4.930067872156233, "learning_rate": 1.9999478128885332e-06, "loss": 1.4284, "step": 1935 }, { "epoch": 0.01, "grad_norm": 4.470181062731635, "learning_rate": 1.9999477586545468e-06, "loss": 1.2989, "step": 1936 }, { "epoch": 0.01, "grad_norm": 5.127129338294685, "learning_rate": 1.999947704392395e-06, "loss": 1.443, "step": 1937 }, { "epoch": 0.01, "grad_norm": 5.127632526299122, "learning_rate": 1.999947650102078e-06, "loss": 1.3732, "step": 1938 }, { "epoch": 0.01, "grad_norm": 5.166443772114098, "learning_rate": 1.9999475957835958e-06, "loss": 1.4847, "step": 1939 }, { "epoch": 0.01, "grad_norm": 5.096431897023338, "learning_rate": 1.999947541436949e-06, "loss": 1.4193, "step": 1940 }, { "epoch": 0.01, "grad_norm": 4.836191132604963, "learning_rate": 1.999947487062136e-06, "loss": 1.4965, "step": 1941 }, { "epoch": 0.01, "grad_norm": 4.925828422364088, "learning_rate": 1.9999474326591583e-06, "loss": 1.4444, "step": 1942 }, { "epoch": 0.01, "grad_norm": 4.667390491716781, "learning_rate": 1.999947378228015e-06, "loss": 1.5763, "step": 1943 }, { "epoch": 0.01, "grad_norm": 5.279038785770578, "learning_rate": 1.9999473237687073e-06, "loss": 1.5275, "step": 1944 }, { "epoch": 0.01, "grad_norm": 5.641029202907494, "learning_rate": 1.999947269281234e-06, "loss": 1.5775, "step": 1945 }, { "epoch": 0.01, "grad_norm": 5.332390674887097, "learning_rate": 1.9999472147655955e-06, "loss": 1.5664, "step": 1946 }, { "epoch": 0.01, "grad_norm": 5.622675952825197, "learning_rate": 1.999947160221792e-06, "loss": 1.4099, "step": 1947 }, { "epoch": 0.01, "grad_norm": 4.613192639788906, "learning_rate": 1.9999471056498227e-06, "loss": 1.4579, "step": 1948 }, { "epoch": 0.01, "grad_norm": 4.879403704365523, "learning_rate": 1.999947051049689e-06, "loss": 1.4041, "step": 1949 }, { "epoch": 0.01, "grad_norm": 4.482648258267666, "learning_rate": 1.9999469964213895e-06, "loss": 1.317, "step": 1950 }, { "epoch": 0.01, "grad_norm": 5.108898563911747, "learning_rate": 1.999946941764925e-06, "loss": 1.4241, "step": 1951 }, { "epoch": 0.01, "grad_norm": 5.568831706403162, "learning_rate": 1.9999468870802954e-06, "loss": 1.5296, "step": 1952 }, { "epoch": 0.01, "grad_norm": 4.786414340270627, "learning_rate": 1.9999468323675007e-06, "loss": 1.5818, "step": 1953 }, { "epoch": 0.01, "grad_norm": 4.78919679188355, "learning_rate": 1.999946777626541e-06, "loss": 1.2252, "step": 1954 }, { "epoch": 0.01, "grad_norm": 4.524987858157338, "learning_rate": 1.9999467228574154e-06, "loss": 1.4747, "step": 1955 }, { "epoch": 0.01, "grad_norm": 4.948724773134819, "learning_rate": 1.9999466680601254e-06, "loss": 1.3693, "step": 1956 }, { "epoch": 0.01, "grad_norm": 4.498995965734504, "learning_rate": 1.9999466132346697e-06, "loss": 1.3926, "step": 1957 }, { "epoch": 0.01, "grad_norm": 6.690852300877899, "learning_rate": 1.9999465583810494e-06, "loss": 1.5302, "step": 1958 }, { "epoch": 0.01, "grad_norm": 4.628604279451408, "learning_rate": 1.9999465034992636e-06, "loss": 1.406, "step": 1959 }, { "epoch": 0.01, "grad_norm": 5.079437190752291, "learning_rate": 1.999946448589312e-06, "loss": 1.5523, "step": 1960 }, { "epoch": 0.01, "grad_norm": 4.966397313206571, "learning_rate": 1.9999463936511966e-06, "loss": 1.3872, "step": 1961 }, { "epoch": 0.01, "grad_norm": 4.652429476663953, "learning_rate": 1.999946338684915e-06, "loss": 1.4646, "step": 1962 }, { "epoch": 0.01, "grad_norm": 4.87657353554175, "learning_rate": 1.999946283690468e-06, "loss": 1.392, "step": 1963 }, { "epoch": 0.01, "grad_norm": 4.917643825546739, "learning_rate": 1.999946228667857e-06, "loss": 1.3851, "step": 1964 }, { "epoch": 0.01, "grad_norm": 5.67340721917725, "learning_rate": 1.9999461736170802e-06, "loss": 1.5101, "step": 1965 }, { "epoch": 0.01, "grad_norm": 4.39919847434518, "learning_rate": 1.999946118538138e-06, "loss": 1.3595, "step": 1966 }, { "epoch": 0.01, "grad_norm": 5.203293308343834, "learning_rate": 1.999946063431031e-06, "loss": 1.4727, "step": 1967 }, { "epoch": 0.01, "grad_norm": 4.678751954911345, "learning_rate": 1.9999460082957586e-06, "loss": 1.3947, "step": 1968 }, { "epoch": 0.01, "grad_norm": 4.569173814364991, "learning_rate": 1.999945953132321e-06, "loss": 1.4007, "step": 1969 }, { "epoch": 0.01, "grad_norm": 5.204406855731333, "learning_rate": 1.999945897940719e-06, "loss": 1.3463, "step": 1970 }, { "epoch": 0.01, "grad_norm": 6.853776529480603, "learning_rate": 1.999945842720951e-06, "loss": 1.6373, "step": 1971 }, { "epoch": 0.01, "eval_loss": 1.6432719230651855, "eval_runtime": 4.6393, "eval_samples_per_second": 1.94, "eval_steps_per_second": 1.078, "step": 1971 }, { "epoch": 0.01, "grad_norm": 4.975365230474304, "learning_rate": 1.9999457874730182e-06, "loss": 1.4259, "step": 1972 }, { "epoch": 0.01, "grad_norm": 5.106259928022218, "learning_rate": 1.9999457321969203e-06, "loss": 1.531, "step": 1973 }, { "epoch": 0.01, "grad_norm": 4.805401584156609, "learning_rate": 1.999945676892657e-06, "loss": 1.4088, "step": 1974 }, { "epoch": 0.01, "grad_norm": 6.312695783521927, "learning_rate": 1.9999456215602288e-06, "loss": 1.572, "step": 1975 }, { "epoch": 0.01, "grad_norm": 8.74462546951479, "learning_rate": 1.9999455661996355e-06, "loss": 1.2957, "step": 1976 }, { "epoch": 0.01, "grad_norm": 4.625243006738303, "learning_rate": 1.9999455108108767e-06, "loss": 1.4034, "step": 1977 }, { "epoch": 0.01, "grad_norm": 5.178605492022322, "learning_rate": 1.999945455393953e-06, "loss": 1.5405, "step": 1978 }, { "epoch": 0.01, "grad_norm": 4.655718915504933, "learning_rate": 1.999945399948864e-06, "loss": 1.5224, "step": 1979 }, { "epoch": 0.01, "grad_norm": 4.65830388317099, "learning_rate": 1.99994534447561e-06, "loss": 1.5357, "step": 1980 }, { "epoch": 0.01, "grad_norm": 5.288686031793038, "learning_rate": 1.999945288974191e-06, "loss": 1.5392, "step": 1981 }, { "epoch": 0.01, "grad_norm": 4.43642936625311, "learning_rate": 1.999945233444607e-06, "loss": 1.488, "step": 1982 }, { "epoch": 0.01, "grad_norm": 4.555178280796159, "learning_rate": 1.9999451778868575e-06, "loss": 1.3151, "step": 1983 }, { "epoch": 0.01, "grad_norm": 5.236202376421231, "learning_rate": 1.999945122300943e-06, "loss": 1.4033, "step": 1984 }, { "epoch": 0.01, "grad_norm": 4.887741373566786, "learning_rate": 1.999945066686863e-06, "loss": 1.506, "step": 1985 }, { "epoch": 0.01, "grad_norm": 4.9138979134608975, "learning_rate": 1.9999450110446183e-06, "loss": 1.4881, "step": 1986 }, { "epoch": 0.01, "grad_norm": 4.542007061012194, "learning_rate": 1.9999449553742084e-06, "loss": 1.4365, "step": 1987 }, { "epoch": 0.01, "grad_norm": 5.528814673826827, "learning_rate": 1.9999448996756337e-06, "loss": 1.4825, "step": 1988 }, { "epoch": 0.01, "grad_norm": 4.4557691220675375, "learning_rate": 1.9999448439488936e-06, "loss": 1.4531, "step": 1989 }, { "epoch": 0.01, "grad_norm": 4.758397625075794, "learning_rate": 1.999944788193988e-06, "loss": 1.4119, "step": 1990 }, { "epoch": 0.01, "grad_norm": 4.583454128627948, "learning_rate": 1.999944732410918e-06, "loss": 1.4116, "step": 1991 }, { "epoch": 0.01, "grad_norm": 4.532783999892837, "learning_rate": 1.999944676599682e-06, "loss": 1.2279, "step": 1992 }, { "epoch": 0.01, "grad_norm": 4.906450343559313, "learning_rate": 1.9999446207602813e-06, "loss": 1.4905, "step": 1993 }, { "epoch": 0.01, "grad_norm": 4.346899749377525, "learning_rate": 1.999944564892716e-06, "loss": 1.413, "step": 1994 }, { "epoch": 0.01, "grad_norm": 5.523495511164613, "learning_rate": 1.999944508996985e-06, "loss": 1.5664, "step": 1995 }, { "epoch": 0.01, "grad_norm": 4.959351356739527, "learning_rate": 1.999944453073089e-06, "loss": 1.3644, "step": 1996 }, { "epoch": 0.01, "grad_norm": 4.871410142653787, "learning_rate": 1.999944397121028e-06, "loss": 1.396, "step": 1997 }, { "epoch": 0.01, "grad_norm": 4.76408338524292, "learning_rate": 1.9999443411408018e-06, "loss": 1.4451, "step": 1998 }, { "epoch": 0.01, "grad_norm": 4.498293300971013, "learning_rate": 1.9999442851324104e-06, "loss": 1.397, "step": 1999 }, { "epoch": 0.01, "grad_norm": 4.9463867469913, "learning_rate": 1.999944229095854e-06, "loss": 1.459, "step": 2000 }, { "epoch": 0.01, "grad_norm": 4.4931004236648775, "learning_rate": 1.9999441730311324e-06, "loss": 1.4252, "step": 2001 }, { "epoch": 0.01, "grad_norm": 4.718233309528229, "learning_rate": 1.999944116938246e-06, "loss": 1.4942, "step": 2002 }, { "epoch": 0.01, "grad_norm": 5.889040911882571, "learning_rate": 1.9999440608171944e-06, "loss": 1.5412, "step": 2003 }, { "epoch": 0.01, "grad_norm": 5.014283756243838, "learning_rate": 1.9999440046679775e-06, "loss": 1.5232, "step": 2004 }, { "epoch": 0.01, "grad_norm": 5.395377515573, "learning_rate": 1.999943948490596e-06, "loss": 1.3687, "step": 2005 }, { "epoch": 0.01, "grad_norm": 5.1324825308243005, "learning_rate": 1.9999438922850487e-06, "loss": 1.459, "step": 2006 }, { "epoch": 0.01, "grad_norm": 5.302528839998102, "learning_rate": 1.9999438360513364e-06, "loss": 1.1898, "step": 2007 }, { "epoch": 0.01, "grad_norm": 4.921387927464629, "learning_rate": 1.9999437797894595e-06, "loss": 1.4922, "step": 2008 }, { "epoch": 0.01, "grad_norm": 4.66789838005482, "learning_rate": 1.9999437234994174e-06, "loss": 1.2908, "step": 2009 }, { "epoch": 0.01, "grad_norm": 4.466483592815993, "learning_rate": 1.99994366718121e-06, "loss": 1.3559, "step": 2010 }, { "epoch": 0.01, "grad_norm": 4.989407567640141, "learning_rate": 1.9999436108348375e-06, "loss": 1.3186, "step": 2011 }, { "epoch": 0.01, "grad_norm": 4.7584380055697135, "learning_rate": 1.9999435544602996e-06, "loss": 1.481, "step": 2012 }, { "epoch": 0.01, "grad_norm": 8.410113901804026, "learning_rate": 1.999943498057597e-06, "loss": 1.4516, "step": 2013 }, { "epoch": 0.01, "grad_norm": 5.145181164985424, "learning_rate": 1.99994344162673e-06, "loss": 1.2735, "step": 2014 }, { "epoch": 0.01, "grad_norm": 4.997375983666718, "learning_rate": 1.9999433851676967e-06, "loss": 1.5026, "step": 2015 }, { "epoch": 0.01, "grad_norm": 4.501900393004941, "learning_rate": 1.9999433286804992e-06, "loss": 1.5178, "step": 2016 }, { "epoch": 0.01, "grad_norm": 6.183628468382797, "learning_rate": 1.9999432721651362e-06, "loss": 1.5511, "step": 2017 }, { "epoch": 0.01, "grad_norm": 5.615198435575112, "learning_rate": 1.999943215621608e-06, "loss": 1.4319, "step": 2018 }, { "epoch": 0.01, "grad_norm": 4.91098537314749, "learning_rate": 1.999943159049915e-06, "loss": 1.5055, "step": 2019 }, { "epoch": 0.01, "grad_norm": 5.533623257731385, "learning_rate": 1.999943102450057e-06, "loss": 1.3607, "step": 2020 }, { "epoch": 0.01, "grad_norm": 4.823264533879004, "learning_rate": 1.9999430458220335e-06, "loss": 1.3773, "step": 2021 }, { "epoch": 0.01, "grad_norm": 4.610357246741694, "learning_rate": 1.9999429891658453e-06, "loss": 1.4646, "step": 2022 }, { "epoch": 0.01, "grad_norm": 6.279580425295216, "learning_rate": 1.999942932481492e-06, "loss": 1.3543, "step": 2023 }, { "epoch": 0.01, "grad_norm": 4.920902713497585, "learning_rate": 1.9999428757689737e-06, "loss": 1.4141, "step": 2024 }, { "epoch": 0.01, "grad_norm": 5.301852036011902, "learning_rate": 1.99994281902829e-06, "loss": 1.5213, "step": 2025 }, { "epoch": 0.01, "grad_norm": 5.135724505311549, "learning_rate": 1.9999427622594415e-06, "loss": 1.5948, "step": 2026 }, { "epoch": 0.01, "grad_norm": 4.852010935690981, "learning_rate": 1.9999427054624282e-06, "loss": 1.4197, "step": 2027 }, { "epoch": 0.01, "grad_norm": 4.9165669204888625, "learning_rate": 1.9999426486372494e-06, "loss": 1.5093, "step": 2028 }, { "epoch": 0.01, "grad_norm": 4.575212660119817, "learning_rate": 1.999942591783906e-06, "loss": 1.3396, "step": 2029 }, { "epoch": 0.01, "grad_norm": 5.259797384037423, "learning_rate": 1.9999425349023967e-06, "loss": 1.4153, "step": 2030 }, { "epoch": 0.01, "grad_norm": 5.548169399634219, "learning_rate": 1.999942477992723e-06, "loss": 1.3866, "step": 2031 }, { "epoch": 0.01, "grad_norm": 5.282143331135165, "learning_rate": 1.999942421054884e-06, "loss": 1.4014, "step": 2032 }, { "epoch": 0.01, "grad_norm": 4.574333910357711, "learning_rate": 1.99994236408888e-06, "loss": 1.3933, "step": 2033 }, { "epoch": 0.01, "grad_norm": 6.629247995921187, "learning_rate": 1.9999423070947114e-06, "loss": 1.5145, "step": 2034 }, { "epoch": 0.01, "grad_norm": 4.71772592571776, "learning_rate": 1.999942250072377e-06, "loss": 1.353, "step": 2035 }, { "epoch": 0.01, "grad_norm": 8.908977840581366, "learning_rate": 1.999942193021878e-06, "loss": 1.5628, "step": 2036 }, { "epoch": 0.01, "grad_norm": 6.021790328358759, "learning_rate": 1.9999421359432137e-06, "loss": 1.4148, "step": 2037 }, { "epoch": 0.01, "grad_norm": 5.176490897391262, "learning_rate": 1.999942078836385e-06, "loss": 1.586, "step": 2038 }, { "epoch": 0.01, "grad_norm": 4.99627582571255, "learning_rate": 1.9999420217013907e-06, "loss": 1.5591, "step": 2039 }, { "epoch": 0.01, "grad_norm": 5.02431904656082, "learning_rate": 1.9999419645382313e-06, "loss": 1.4402, "step": 2040 }, { "epoch": 0.01, "grad_norm": 4.713468056290315, "learning_rate": 1.999941907346907e-06, "loss": 1.339, "step": 2041 }, { "epoch": 0.01, "grad_norm": 5.7923489185700126, "learning_rate": 1.9999418501274176e-06, "loss": 1.4056, "step": 2042 }, { "epoch": 0.01, "grad_norm": 4.776165026219362, "learning_rate": 1.9999417928797632e-06, "loss": 1.5542, "step": 2043 }, { "epoch": 0.01, "grad_norm": 4.990681472067374, "learning_rate": 1.999941735603944e-06, "loss": 1.3998, "step": 2044 }, { "epoch": 0.01, "eval_loss": 1.6361159086227417, "eval_runtime": 4.6533, "eval_samples_per_second": 1.934, "eval_steps_per_second": 1.075, "step": 2044 }, { "epoch": 0.01, "grad_norm": 4.903134049101306, "learning_rate": 1.9999416782999592e-06, "loss": 1.4561, "step": 2045 }, { "epoch": 0.01, "grad_norm": 5.5369642093418285, "learning_rate": 1.99994162096781e-06, "loss": 1.4306, "step": 2046 }, { "epoch": 0.01, "grad_norm": 4.674123365349024, "learning_rate": 1.9999415636074956e-06, "loss": 1.397, "step": 2047 }, { "epoch": 0.01, "grad_norm": 5.020382697207387, "learning_rate": 1.9999415062190157e-06, "loss": 1.3635, "step": 2048 }, { "epoch": 0.01, "grad_norm": 5.402239102827217, "learning_rate": 1.9999414488023716e-06, "loss": 1.3843, "step": 2049 }, { "epoch": 0.01, "grad_norm": 5.367641687630019, "learning_rate": 1.999941391357562e-06, "loss": 1.4433, "step": 2050 }, { "epoch": 0.01, "grad_norm": 4.567561738037097, "learning_rate": 1.999941333884587e-06, "loss": 1.4309, "step": 2051 }, { "epoch": 0.01, "grad_norm": 5.525282756054969, "learning_rate": 1.9999412763834475e-06, "loss": 1.3293, "step": 2052 }, { "epoch": 0.01, "grad_norm": 5.908627704395454, "learning_rate": 1.999941218854143e-06, "loss": 1.3566, "step": 2053 }, { "epoch": 0.01, "grad_norm": 7.432919107784135, "learning_rate": 1.999941161296673e-06, "loss": 1.2394, "step": 2054 }, { "epoch": 0.01, "grad_norm": 4.778341393380081, "learning_rate": 1.9999411037110387e-06, "loss": 1.3844, "step": 2055 }, { "epoch": 0.01, "grad_norm": 4.954440841765967, "learning_rate": 1.9999410460972387e-06, "loss": 1.4503, "step": 2056 }, { "epoch": 0.01, "grad_norm": 4.758941261308998, "learning_rate": 1.999940988455274e-06, "loss": 1.4572, "step": 2057 }, { "epoch": 0.01, "grad_norm": 6.564392211755666, "learning_rate": 1.9999409307851446e-06, "loss": 1.1965, "step": 2058 }, { "epoch": 0.01, "grad_norm": 6.725945941216895, "learning_rate": 1.9999408730868497e-06, "loss": 1.3813, "step": 2059 }, { "epoch": 0.01, "grad_norm": 6.292635293917362, "learning_rate": 1.9999408153603897e-06, "loss": 1.4697, "step": 2060 }, { "epoch": 0.01, "grad_norm": 10.371826291437516, "learning_rate": 1.9999407576057654e-06, "loss": 1.4433, "step": 2061 }, { "epoch": 0.01, "grad_norm": 4.624824451917616, "learning_rate": 1.9999406998229756e-06, "loss": 1.5372, "step": 2062 }, { "epoch": 0.01, "grad_norm": 4.828294055435367, "learning_rate": 1.9999406420120206e-06, "loss": 1.4827, "step": 2063 }, { "epoch": 0.01, "grad_norm": 4.927358950382133, "learning_rate": 1.999940584172901e-06, "loss": 1.4385, "step": 2064 }, { "epoch": 0.01, "grad_norm": 5.4353174579637775, "learning_rate": 1.9999405263056162e-06, "loss": 1.4795, "step": 2065 }, { "epoch": 0.01, "grad_norm": 4.675940989803078, "learning_rate": 1.9999404684101663e-06, "loss": 1.4257, "step": 2066 }, { "epoch": 0.01, "grad_norm": 6.0245884345574225, "learning_rate": 1.9999404104865518e-06, "loss": 1.2672, "step": 2067 }, { "epoch": 0.01, "grad_norm": 6.1119365663762055, "learning_rate": 1.999940352534772e-06, "loss": 1.251, "step": 2068 }, { "epoch": 0.01, "grad_norm": 5.0121477273349315, "learning_rate": 1.9999402945548273e-06, "loss": 1.426, "step": 2069 }, { "epoch": 0.01, "grad_norm": 5.128014707129984, "learning_rate": 1.9999402365467174e-06, "loss": 1.3379, "step": 2070 }, { "epoch": 0.01, "grad_norm": 4.862057104969277, "learning_rate": 1.9999401785104428e-06, "loss": 1.424, "step": 2071 }, { "epoch": 0.01, "grad_norm": 4.873753921801018, "learning_rate": 1.999940120446003e-06, "loss": 1.3945, "step": 2072 }, { "epoch": 0.01, "grad_norm": 5.1012581647028945, "learning_rate": 1.999940062353398e-06, "loss": 1.5296, "step": 2073 }, { "epoch": 0.01, "grad_norm": 4.6157521798841055, "learning_rate": 1.9999400042326282e-06, "loss": 1.2629, "step": 2074 }, { "epoch": 0.01, "grad_norm": 4.803622065399791, "learning_rate": 1.999939946083694e-06, "loss": 1.408, "step": 2075 }, { "epoch": 0.01, "grad_norm": 4.740381722750265, "learning_rate": 1.9999398879065943e-06, "loss": 1.3605, "step": 2076 }, { "epoch": 0.01, "grad_norm": 4.831139523992052, "learning_rate": 1.99993982970133e-06, "loss": 1.2995, "step": 2077 }, { "epoch": 0.01, "grad_norm": 4.829459069984273, "learning_rate": 1.9999397714679002e-06, "loss": 1.3909, "step": 2078 }, { "epoch": 0.01, "grad_norm": 4.827491011717043, "learning_rate": 1.9999397132063055e-06, "loss": 1.559, "step": 2079 }, { "epoch": 0.01, "grad_norm": 4.947759137746265, "learning_rate": 1.9999396549165457e-06, "loss": 1.4971, "step": 2080 }, { "epoch": 0.01, "grad_norm": 4.739533384237727, "learning_rate": 1.9999395965986212e-06, "loss": 1.3768, "step": 2081 }, { "epoch": 0.01, "grad_norm": 7.339254038218756, "learning_rate": 1.9999395382525316e-06, "loss": 1.4479, "step": 2082 }, { "epoch": 0.01, "grad_norm": 4.537488781200089, "learning_rate": 1.9999394798782773e-06, "loss": 1.3548, "step": 2083 }, { "epoch": 0.01, "grad_norm": 4.7315607432035005, "learning_rate": 1.999939421475858e-06, "loss": 1.3923, "step": 2084 }, { "epoch": 0.01, "grad_norm": 5.154239758619694, "learning_rate": 1.9999393630452733e-06, "loss": 1.3989, "step": 2085 }, { "epoch": 0.01, "grad_norm": 4.535664302600297, "learning_rate": 1.999939304586524e-06, "loss": 1.3719, "step": 2086 }, { "epoch": 0.01, "grad_norm": 4.655332915728781, "learning_rate": 1.9999392460996098e-06, "loss": 1.3278, "step": 2087 }, { "epoch": 0.01, "grad_norm": 4.806855739816848, "learning_rate": 1.9999391875845303e-06, "loss": 1.5083, "step": 2088 }, { "epoch": 0.01, "grad_norm": 5.213130734185716, "learning_rate": 1.999939129041286e-06, "loss": 1.4663, "step": 2089 }, { "epoch": 0.01, "grad_norm": 6.229568233154151, "learning_rate": 1.999939070469877e-06, "loss": 1.3439, "step": 2090 }, { "epoch": 0.01, "grad_norm": 4.086635517733998, "learning_rate": 1.999939011870303e-06, "loss": 1.2555, "step": 2091 }, { "epoch": 0.01, "grad_norm": 4.727112810450526, "learning_rate": 1.999938953242564e-06, "loss": 1.4361, "step": 2092 }, { "epoch": 0.01, "grad_norm": 5.052258554851617, "learning_rate": 1.9999388945866596e-06, "loss": 1.3848, "step": 2093 }, { "epoch": 0.01, "grad_norm": 4.88161487935287, "learning_rate": 1.9999388359025903e-06, "loss": 1.4832, "step": 2094 }, { "epoch": 0.01, "grad_norm": 4.6739895014787685, "learning_rate": 1.9999387771903563e-06, "loss": 1.478, "step": 2095 }, { "epoch": 0.01, "grad_norm": 4.728323614320543, "learning_rate": 1.9999387184499576e-06, "loss": 1.5456, "step": 2096 }, { "epoch": 0.01, "grad_norm": 4.765256915622586, "learning_rate": 1.999938659681394e-06, "loss": 1.407, "step": 2097 }, { "epoch": 0.01, "grad_norm": 4.431982504309138, "learning_rate": 1.999938600884665e-06, "loss": 1.2764, "step": 2098 }, { "epoch": 0.01, "grad_norm": 4.902178089633812, "learning_rate": 1.999938542059771e-06, "loss": 1.5341, "step": 2099 }, { "epoch": 0.01, "grad_norm": 4.510226095063566, "learning_rate": 1.999938483206712e-06, "loss": 1.3526, "step": 2100 }, { "epoch": 0.01, "grad_norm": 5.074107216841095, "learning_rate": 1.9999384243254887e-06, "loss": 1.3247, "step": 2101 }, { "epoch": 0.01, "grad_norm": 4.667756343723743, "learning_rate": 1.9999383654160997e-06, "loss": 1.4049, "step": 2102 }, { "epoch": 0.01, "grad_norm": 4.752073321972883, "learning_rate": 1.999938306478546e-06, "loss": 1.4703, "step": 2103 }, { "epoch": 0.01, "grad_norm": 4.742868907788746, "learning_rate": 1.9999382475128277e-06, "loss": 1.2977, "step": 2104 }, { "epoch": 0.01, "grad_norm": 4.886382117755776, "learning_rate": 1.999938188518944e-06, "loss": 1.4292, "step": 2105 }, { "epoch": 0.01, "grad_norm": 4.781559805396283, "learning_rate": 1.999938129496896e-06, "loss": 1.3088, "step": 2106 }, { "epoch": 0.01, "grad_norm": 4.593766296713237, "learning_rate": 1.9999380704466828e-06, "loss": 1.3449, "step": 2107 }, { "epoch": 0.01, "grad_norm": 5.524007597424321, "learning_rate": 1.9999380113683048e-06, "loss": 1.4613, "step": 2108 }, { "epoch": 0.01, "grad_norm": 4.622852323760914, "learning_rate": 1.9999379522617613e-06, "loss": 1.4825, "step": 2109 }, { "epoch": 0.01, "grad_norm": 5.616671774319379, "learning_rate": 1.999937893127053e-06, "loss": 1.5888, "step": 2110 }, { "epoch": 0.01, "grad_norm": 4.481567608026818, "learning_rate": 1.99993783396418e-06, "loss": 1.4353, "step": 2111 }, { "epoch": 0.01, "grad_norm": 4.602477850579874, "learning_rate": 1.9999377747731426e-06, "loss": 1.3397, "step": 2112 }, { "epoch": 0.01, "grad_norm": 4.81449808655875, "learning_rate": 1.9999377155539395e-06, "loss": 1.4046, "step": 2113 }, { "epoch": 0.01, "grad_norm": 5.073953878891228, "learning_rate": 1.9999376563065716e-06, "loss": 1.4595, "step": 2114 }, { "epoch": 0.01, "grad_norm": 6.743457594007675, "learning_rate": 1.999937597031039e-06, "loss": 1.3975, "step": 2115 }, { "epoch": 0.01, "grad_norm": 4.530772105704062, "learning_rate": 1.9999375377273415e-06, "loss": 1.5028, "step": 2116 }, { "epoch": 0.01, "grad_norm": 5.257434903260648, "learning_rate": 1.999937478395479e-06, "loss": 1.4187, "step": 2117 }, { "epoch": 0.01, "eval_loss": 1.6335363388061523, "eval_runtime": 4.6337, "eval_samples_per_second": 1.942, "eval_steps_per_second": 1.079, "step": 2117 }, { "epoch": 0.01, "grad_norm": 4.722957307355561, "learning_rate": 1.9999374190354517e-06, "loss": 1.4206, "step": 2118 }, { "epoch": 0.01, "grad_norm": 4.473561914847432, "learning_rate": 1.999937359647259e-06, "loss": 1.3537, "step": 2119 }, { "epoch": 0.01, "grad_norm": 4.580086304567218, "learning_rate": 1.999937300230902e-06, "loss": 1.3944, "step": 2120 }, { "epoch": 0.01, "grad_norm": 4.245944086564923, "learning_rate": 1.9999372407863796e-06, "loss": 1.4074, "step": 2121 }, { "epoch": 0.01, "grad_norm": 4.676469646761249, "learning_rate": 1.999937181313693e-06, "loss": 1.4051, "step": 2122 }, { "epoch": 0.01, "grad_norm": 4.922681086761417, "learning_rate": 1.9999371218128408e-06, "loss": 1.4609, "step": 2123 }, { "epoch": 0.01, "grad_norm": 4.654470497689943, "learning_rate": 1.999937062283824e-06, "loss": 1.3944, "step": 2124 }, { "epoch": 0.01, "grad_norm": 4.961121386583948, "learning_rate": 1.9999370027266423e-06, "loss": 1.3877, "step": 2125 }, { "epoch": 0.01, "grad_norm": 5.590317374681009, "learning_rate": 1.9999369431412957e-06, "loss": 1.3661, "step": 2126 }, { "epoch": 0.01, "grad_norm": 5.530242304003091, "learning_rate": 1.9999368835277843e-06, "loss": 1.4709, "step": 2127 }, { "epoch": 0.01, "grad_norm": 4.619061402439089, "learning_rate": 1.999936823886108e-06, "loss": 1.3906, "step": 2128 }, { "epoch": 0.01, "grad_norm": 5.012913828479663, "learning_rate": 1.9999367642162662e-06, "loss": 1.5088, "step": 2129 }, { "epoch": 0.01, "grad_norm": 4.979952662806363, "learning_rate": 1.99993670451826e-06, "loss": 1.3039, "step": 2130 }, { "epoch": 0.01, "grad_norm": 4.5714181931648445, "learning_rate": 1.999936644792089e-06, "loss": 1.3549, "step": 2131 }, { "epoch": 0.01, "grad_norm": 4.770544347901491, "learning_rate": 1.999936585037753e-06, "loss": 1.3643, "step": 2132 }, { "epoch": 0.01, "grad_norm": 5.401807178237035, "learning_rate": 1.999936525255252e-06, "loss": 1.2191, "step": 2133 }, { "epoch": 0.01, "grad_norm": 5.894088878558767, "learning_rate": 1.999936465444586e-06, "loss": 1.5738, "step": 2134 }, { "epoch": 0.01, "grad_norm": 5.829142286770646, "learning_rate": 1.9999364056057555e-06, "loss": 1.5857, "step": 2135 }, { "epoch": 0.01, "grad_norm": 4.585395214505738, "learning_rate": 1.99993634573876e-06, "loss": 1.4653, "step": 2136 }, { "epoch": 0.01, "grad_norm": 4.752105085256273, "learning_rate": 1.9999362858435994e-06, "loss": 1.5093, "step": 2137 }, { "epoch": 0.01, "grad_norm": 4.567527923711839, "learning_rate": 1.9999362259202743e-06, "loss": 1.3605, "step": 2138 }, { "epoch": 0.01, "grad_norm": 4.928571238482233, "learning_rate": 1.999936165968784e-06, "loss": 1.4562, "step": 2139 }, { "epoch": 0.01, "grad_norm": 4.328826801769719, "learning_rate": 1.9999361059891288e-06, "loss": 1.347, "step": 2140 }, { "epoch": 0.01, "grad_norm": 4.886917145275371, "learning_rate": 1.9999360459813087e-06, "loss": 1.4164, "step": 2141 }, { "epoch": 0.01, "grad_norm": 6.291684965837117, "learning_rate": 1.999935985945324e-06, "loss": 1.4878, "step": 2142 }, { "epoch": 0.01, "grad_norm": 4.683286308224392, "learning_rate": 1.999935925881174e-06, "loss": 1.4892, "step": 2143 }, { "epoch": 0.01, "grad_norm": 5.5644324951779875, "learning_rate": 1.9999358657888597e-06, "loss": 1.4277, "step": 2144 }, { "epoch": 0.01, "grad_norm": 4.719071022798878, "learning_rate": 1.99993580566838e-06, "loss": 1.3753, "step": 2145 }, { "epoch": 0.01, "grad_norm": 5.384593412220494, "learning_rate": 1.9999357455197357e-06, "loss": 1.4575, "step": 2146 }, { "epoch": 0.01, "grad_norm": 4.49104950913911, "learning_rate": 1.999935685342927e-06, "loss": 1.4833, "step": 2147 }, { "epoch": 0.01, "grad_norm": 4.386345133556867, "learning_rate": 1.9999356251379525e-06, "loss": 1.2825, "step": 2148 }, { "epoch": 0.01, "grad_norm": 7.084357385014738, "learning_rate": 1.999935564904814e-06, "loss": 1.5971, "step": 2149 }, { "epoch": 0.01, "grad_norm": 4.483804488981741, "learning_rate": 1.99993550464351e-06, "loss": 1.3572, "step": 2150 }, { "epoch": 0.01, "grad_norm": 4.9800344533105045, "learning_rate": 1.999935444354042e-06, "loss": 1.4381, "step": 2151 }, { "epoch": 0.01, "grad_norm": 5.2020100481802025, "learning_rate": 1.999935384036408e-06, "loss": 1.5282, "step": 2152 }, { "epoch": 0.01, "grad_norm": 4.513002376188213, "learning_rate": 1.9999353236906097e-06, "loss": 1.4226, "step": 2153 }, { "epoch": 0.01, "grad_norm": 7.271851960236858, "learning_rate": 1.9999352633166466e-06, "loss": 1.4555, "step": 2154 }, { "epoch": 0.01, "grad_norm": 4.505644532012899, "learning_rate": 1.9999352029145187e-06, "loss": 1.3634, "step": 2155 }, { "epoch": 0.01, "grad_norm": 4.878236341705409, "learning_rate": 1.9999351424842258e-06, "loss": 1.3889, "step": 2156 }, { "epoch": 0.01, "grad_norm": 4.398260948524359, "learning_rate": 1.999935082025768e-06, "loss": 1.2472, "step": 2157 }, { "epoch": 0.01, "grad_norm": 4.83937816831458, "learning_rate": 1.9999350215391454e-06, "loss": 1.3623, "step": 2158 }, { "epoch": 0.01, "grad_norm": 4.865952677064821, "learning_rate": 1.999934961024358e-06, "loss": 1.4206, "step": 2159 }, { "epoch": 0.01, "grad_norm": 4.5376685001437, "learning_rate": 1.9999349004814058e-06, "loss": 1.4301, "step": 2160 }, { "epoch": 0.01, "grad_norm": 5.823420924569888, "learning_rate": 1.999934839910289e-06, "loss": 1.3083, "step": 2161 }, { "epoch": 0.01, "grad_norm": 4.854880352037673, "learning_rate": 1.999934779311007e-06, "loss": 1.4037, "step": 2162 }, { "epoch": 0.01, "grad_norm": 4.888876031412537, "learning_rate": 1.9999347186835603e-06, "loss": 1.4077, "step": 2163 }, { "epoch": 0.01, "grad_norm": 4.311155169710946, "learning_rate": 1.9999346580279485e-06, "loss": 1.2698, "step": 2164 }, { "epoch": 0.01, "grad_norm": 4.665174656087581, "learning_rate": 1.999934597344172e-06, "loss": 1.3712, "step": 2165 }, { "epoch": 0.01, "grad_norm": 5.162416137409393, "learning_rate": 1.999934536632231e-06, "loss": 1.4851, "step": 2166 }, { "epoch": 0.01, "grad_norm": 5.018889909992565, "learning_rate": 1.9999344758921247e-06, "loss": 1.452, "step": 2167 }, { "epoch": 0.01, "grad_norm": 4.793825358446103, "learning_rate": 1.999934415123854e-06, "loss": 1.4716, "step": 2168 }, { "epoch": 0.01, "grad_norm": 5.377258615565779, "learning_rate": 1.999934354327418e-06, "loss": 1.3063, "step": 2169 }, { "epoch": 0.01, "grad_norm": 4.656539358484254, "learning_rate": 1.9999342935028177e-06, "loss": 1.3844, "step": 2170 }, { "epoch": 0.01, "grad_norm": 5.869287289839831, "learning_rate": 1.9999342326500522e-06, "loss": 1.44, "step": 2171 }, { "epoch": 0.01, "grad_norm": 6.642787385567338, "learning_rate": 1.999934171769122e-06, "loss": 1.3566, "step": 2172 }, { "epoch": 0.01, "grad_norm": 4.8759975771352755, "learning_rate": 1.999934110860027e-06, "loss": 1.4363, "step": 2173 }, { "epoch": 0.01, "grad_norm": 5.319719105020677, "learning_rate": 1.999934049922767e-06, "loss": 1.3242, "step": 2174 }, { "epoch": 0.01, "grad_norm": 5.482396020182398, "learning_rate": 1.9999339889573426e-06, "loss": 1.5786, "step": 2175 }, { "epoch": 0.01, "grad_norm": 4.8751202038012265, "learning_rate": 1.999933927963753e-06, "loss": 1.5066, "step": 2176 }, { "epoch": 0.01, "grad_norm": 5.557230982959098, "learning_rate": 1.9999338669419984e-06, "loss": 1.4745, "step": 2177 }, { "epoch": 0.01, "grad_norm": 4.424760214415525, "learning_rate": 1.9999338058920797e-06, "loss": 1.3904, "step": 2178 }, { "epoch": 0.01, "grad_norm": 7.892348644844545, "learning_rate": 1.999933744813996e-06, "loss": 1.4981, "step": 2179 }, { "epoch": 0.01, "grad_norm": 4.751102176031589, "learning_rate": 1.999933683707747e-06, "loss": 1.4348, "step": 2180 }, { "epoch": 0.01, "grad_norm": 5.242736776599296, "learning_rate": 1.9999336225733336e-06, "loss": 1.5039, "step": 2181 }, { "epoch": 0.01, "grad_norm": 4.817718856225954, "learning_rate": 1.9999335614107553e-06, "loss": 1.4643, "step": 2182 }, { "epoch": 0.01, "grad_norm": 4.8671512340556315, "learning_rate": 1.999933500220012e-06, "loss": 1.4266, "step": 2183 }, { "epoch": 0.01, "grad_norm": 5.643685486109339, "learning_rate": 1.999933439001104e-06, "loss": 1.5044, "step": 2184 }, { "epoch": 0.01, "grad_norm": 4.812633898353339, "learning_rate": 1.9999333777540312e-06, "loss": 1.5403, "step": 2185 }, { "epoch": 0.01, "grad_norm": 4.5061691305089235, "learning_rate": 1.9999333164787937e-06, "loss": 1.2902, "step": 2186 }, { "epoch": 0.01, "grad_norm": 4.888665712532743, "learning_rate": 1.9999332551753915e-06, "loss": 1.4128, "step": 2187 }, { "epoch": 0.01, "grad_norm": 4.587451641455223, "learning_rate": 1.9999331938438245e-06, "loss": 1.4712, "step": 2188 }, { "epoch": 0.01, "grad_norm": 6.71997025307305, "learning_rate": 1.9999331324840925e-06, "loss": 1.3803, "step": 2189 }, { "epoch": 0.01, "grad_norm": 4.756832246172219, "learning_rate": 1.9999330710961957e-06, "loss": 1.4978, "step": 2190 }, { "epoch": 0.01, "eval_loss": 1.6319785118103027, "eval_runtime": 4.6267, "eval_samples_per_second": 1.945, "eval_steps_per_second": 1.081, "step": 2190 }, { "epoch": 0.01, "grad_norm": 5.050699124286467, "learning_rate": 1.9999330096801343e-06, "loss": 1.4176, "step": 2191 }, { "epoch": 0.01, "grad_norm": 4.898809642977276, "learning_rate": 1.9999329482359078e-06, "loss": 1.3222, "step": 2192 }, { "epoch": 0.01, "grad_norm": 4.704754463426861, "learning_rate": 1.999932886763517e-06, "loss": 1.3086, "step": 2193 }, { "epoch": 0.01, "grad_norm": 4.924235736627156, "learning_rate": 1.999932825262961e-06, "loss": 1.4768, "step": 2194 }, { "epoch": 0.01, "grad_norm": 7.042352873967065, "learning_rate": 1.9999327637342404e-06, "loss": 1.6917, "step": 2195 }, { "epoch": 0.01, "grad_norm": 4.9553596330060925, "learning_rate": 1.999932702177355e-06, "loss": 1.4411, "step": 2196 }, { "epoch": 0.01, "grad_norm": 4.963498435657598, "learning_rate": 1.9999326405923047e-06, "loss": 1.395, "step": 2197 }, { "epoch": 0.01, "grad_norm": 4.506052602096603, "learning_rate": 1.9999325789790895e-06, "loss": 1.247, "step": 2198 }, { "epoch": 0.01, "grad_norm": 5.278628706173458, "learning_rate": 1.99993251733771e-06, "loss": 1.4507, "step": 2199 }, { "epoch": 0.01, "grad_norm": 4.53129051287259, "learning_rate": 1.9999324556681656e-06, "loss": 1.4194, "step": 2200 }, { "epoch": 0.01, "grad_norm": 4.821670837144131, "learning_rate": 1.9999323939704564e-06, "loss": 1.3976, "step": 2201 }, { "epoch": 0.01, "grad_norm": 5.011321422146149, "learning_rate": 1.999932332244582e-06, "loss": 1.6157, "step": 2202 }, { "epoch": 0.01, "grad_norm": 4.951896374917011, "learning_rate": 1.9999322704905435e-06, "loss": 1.3892, "step": 2203 }, { "epoch": 0.01, "grad_norm": 4.653655462890473, "learning_rate": 1.99993220870834e-06, "loss": 1.4186, "step": 2204 }, { "epoch": 0.01, "grad_norm": 4.9073760614579784, "learning_rate": 1.9999321468979714e-06, "loss": 1.3918, "step": 2205 }, { "epoch": 0.01, "grad_norm": 4.615946898949113, "learning_rate": 1.9999320850594383e-06, "loss": 1.4513, "step": 2206 }, { "epoch": 0.01, "grad_norm": 4.781700938808737, "learning_rate": 1.99993202319274e-06, "loss": 1.4084, "step": 2207 }, { "epoch": 0.01, "grad_norm": 4.919243252625909, "learning_rate": 1.9999319612978777e-06, "loss": 1.3552, "step": 2208 }, { "epoch": 0.01, "grad_norm": 11.350832956279241, "learning_rate": 1.9999318993748505e-06, "loss": 1.6017, "step": 2209 }, { "epoch": 0.01, "grad_norm": 5.273831701204187, "learning_rate": 1.9999318374236587e-06, "loss": 1.4666, "step": 2210 }, { "epoch": 0.01, "grad_norm": 5.29032597581334, "learning_rate": 1.9999317754443013e-06, "loss": 1.6475, "step": 2211 }, { "epoch": 0.01, "grad_norm": 4.679461340547445, "learning_rate": 1.99993171343678e-06, "loss": 1.4847, "step": 2212 }, { "epoch": 0.01, "grad_norm": 4.881691601287856, "learning_rate": 1.9999316514010932e-06, "loss": 1.6707, "step": 2213 }, { "epoch": 0.01, "grad_norm": 4.880588356691005, "learning_rate": 1.999931589337242e-06, "loss": 1.4591, "step": 2214 }, { "epoch": 0.01, "grad_norm": 5.198012469615865, "learning_rate": 1.999931527245226e-06, "loss": 1.557, "step": 2215 }, { "epoch": 0.01, "grad_norm": 4.599284399092693, "learning_rate": 1.9999314651250456e-06, "loss": 1.4037, "step": 2216 }, { "epoch": 0.01, "grad_norm": 4.55023727427313, "learning_rate": 1.9999314029767004e-06, "loss": 1.3229, "step": 2217 }, { "epoch": 0.01, "grad_norm": 4.916366612795883, "learning_rate": 1.99993134080019e-06, "loss": 1.4152, "step": 2218 }, { "epoch": 0.01, "grad_norm": 4.272257622372692, "learning_rate": 1.9999312785955153e-06, "loss": 1.2069, "step": 2219 }, { "epoch": 0.02, "grad_norm": 5.793645780553963, "learning_rate": 1.9999312163626756e-06, "loss": 1.3437, "step": 2220 }, { "epoch": 0.02, "grad_norm": 4.765355672753897, "learning_rate": 1.9999311541016713e-06, "loss": 1.3828, "step": 2221 }, { "epoch": 0.02, "grad_norm": 4.609573170614245, "learning_rate": 1.9999310918125023e-06, "loss": 1.3513, "step": 2222 }, { "epoch": 0.02, "grad_norm": 4.705044702022353, "learning_rate": 1.9999310294951686e-06, "loss": 1.4292, "step": 2223 }, { "epoch": 0.02, "grad_norm": 6.549299970897536, "learning_rate": 1.99993096714967e-06, "loss": 1.3031, "step": 2224 }, { "epoch": 0.02, "grad_norm": 5.685926142426188, "learning_rate": 1.9999309047760067e-06, "loss": 1.4263, "step": 2225 }, { "epoch": 0.02, "grad_norm": 4.892295754105913, "learning_rate": 1.999930842374179e-06, "loss": 1.5274, "step": 2226 }, { "epoch": 0.02, "grad_norm": 4.723207695873954, "learning_rate": 1.999930779944186e-06, "loss": 1.2833, "step": 2227 }, { "epoch": 0.02, "grad_norm": 9.43058673722676, "learning_rate": 1.9999307174860284e-06, "loss": 1.3916, "step": 2228 }, { "epoch": 0.02, "grad_norm": 4.442144662758888, "learning_rate": 1.9999306549997065e-06, "loss": 1.3429, "step": 2229 }, { "epoch": 0.02, "grad_norm": 5.263310396091934, "learning_rate": 1.9999305924852196e-06, "loss": 1.4077, "step": 2230 }, { "epoch": 0.02, "grad_norm": 4.693930449937829, "learning_rate": 1.999930529942568e-06, "loss": 1.3103, "step": 2231 }, { "epoch": 0.02, "grad_norm": 4.394685919756182, "learning_rate": 1.9999304673717515e-06, "loss": 1.368, "step": 2232 }, { "epoch": 0.02, "grad_norm": 4.808214571784509, "learning_rate": 1.9999304047727705e-06, "loss": 1.5119, "step": 2233 }, { "epoch": 0.02, "grad_norm": 4.5647319860721165, "learning_rate": 1.9999303421456247e-06, "loss": 1.4842, "step": 2234 }, { "epoch": 0.02, "grad_norm": 5.270790805945803, "learning_rate": 1.9999302794903143e-06, "loss": 1.5416, "step": 2235 }, { "epoch": 0.02, "grad_norm": 5.115712741622331, "learning_rate": 1.999930216806839e-06, "loss": 1.6271, "step": 2236 }, { "epoch": 0.02, "grad_norm": 4.867749398037803, "learning_rate": 1.9999301540951993e-06, "loss": 1.4521, "step": 2237 }, { "epoch": 0.02, "grad_norm": 4.759697223017583, "learning_rate": 1.999930091355395e-06, "loss": 1.4084, "step": 2238 }, { "epoch": 0.02, "grad_norm": 4.910296412113315, "learning_rate": 1.999930028587425e-06, "loss": 1.3314, "step": 2239 }, { "epoch": 0.02, "grad_norm": 4.978260498852945, "learning_rate": 1.9999299657912913e-06, "loss": 1.389, "step": 2240 }, { "epoch": 0.02, "grad_norm": 5.148168402542014, "learning_rate": 1.9999299029669927e-06, "loss": 1.376, "step": 2241 }, { "epoch": 0.02, "grad_norm": 5.290115867763426, "learning_rate": 1.9999298401145294e-06, "loss": 1.6375, "step": 2242 }, { "epoch": 0.02, "grad_norm": 4.97467209793556, "learning_rate": 1.999929777233901e-06, "loss": 1.2882, "step": 2243 }, { "epoch": 0.02, "grad_norm": 5.344906181950449, "learning_rate": 1.9999297143251083e-06, "loss": 1.4118, "step": 2244 }, { "epoch": 0.02, "grad_norm": 4.308318004204632, "learning_rate": 1.999929651388151e-06, "loss": 1.3563, "step": 2245 }, { "epoch": 0.02, "grad_norm": 4.568152176449807, "learning_rate": 1.999929588423029e-06, "loss": 1.5212, "step": 2246 }, { "epoch": 0.02, "grad_norm": 4.601269314744774, "learning_rate": 1.9999295254297417e-06, "loss": 1.4808, "step": 2247 }, { "epoch": 0.02, "grad_norm": 4.854159941498488, "learning_rate": 1.9999294624082903e-06, "loss": 1.3002, "step": 2248 }, { "epoch": 0.02, "grad_norm": 4.718540879766618, "learning_rate": 1.9999293993586737e-06, "loss": 1.4872, "step": 2249 }, { "epoch": 0.02, "grad_norm": 4.7452734133361325, "learning_rate": 1.9999293362808933e-06, "loss": 1.3559, "step": 2250 }, { "epoch": 0.02, "grad_norm": 4.526122852996041, "learning_rate": 1.9999292731749473e-06, "loss": 1.4187, "step": 2251 }, { "epoch": 0.02, "grad_norm": 4.963529673562205, "learning_rate": 1.999929210040837e-06, "loss": 1.4198, "step": 2252 }, { "epoch": 0.02, "grad_norm": 4.77049575346727, "learning_rate": 1.999929146878562e-06, "loss": 1.2526, "step": 2253 }, { "epoch": 0.02, "grad_norm": 4.6993939688150395, "learning_rate": 1.999929083688122e-06, "loss": 1.5895, "step": 2254 }, { "epoch": 0.02, "grad_norm": 12.118619735033707, "learning_rate": 1.999929020469518e-06, "loss": 1.429, "step": 2255 }, { "epoch": 0.02, "grad_norm": 4.6599775134774815, "learning_rate": 1.999928957222749e-06, "loss": 1.5317, "step": 2256 }, { "epoch": 0.02, "grad_norm": 5.7355804324143405, "learning_rate": 1.9999288939478153e-06, "loss": 1.4884, "step": 2257 }, { "epoch": 0.02, "grad_norm": 5.3932795591976275, "learning_rate": 1.999928830644717e-06, "loss": 1.4564, "step": 2258 }, { "epoch": 0.02, "grad_norm": 5.091350488878629, "learning_rate": 1.9999287673134542e-06, "loss": 1.5135, "step": 2259 }, { "epoch": 0.02, "grad_norm": 5.7772806452872265, "learning_rate": 1.999928703954026e-06, "loss": 1.3913, "step": 2260 }, { "epoch": 0.02, "grad_norm": 4.604236988471221, "learning_rate": 1.999928640566434e-06, "loss": 1.5703, "step": 2261 }, { "epoch": 0.02, "grad_norm": 5.270866871119281, "learning_rate": 1.999928577150677e-06, "loss": 1.3042, "step": 2262 }, { "epoch": 0.02, "grad_norm": 4.55553737525265, "learning_rate": 1.9999285137067555e-06, "loss": 1.3917, "step": 2263 }, { "epoch": 0.02, "eval_loss": 1.6318128108978271, "eval_runtime": 4.6236, "eval_samples_per_second": 1.947, "eval_steps_per_second": 1.081, "step": 2263 }, { "epoch": 0.02, "grad_norm": 4.57569740895167, "learning_rate": 1.999928450234669e-06, "loss": 1.3676, "step": 2264 }, { "epoch": 0.02, "grad_norm": 5.233907230202161, "learning_rate": 1.9999283867344178e-06, "loss": 1.4965, "step": 2265 }, { "epoch": 0.02, "grad_norm": 4.36719675852659, "learning_rate": 1.9999283232060023e-06, "loss": 1.2795, "step": 2266 }, { "epoch": 0.02, "grad_norm": 6.6618468973225005, "learning_rate": 1.999928259649422e-06, "loss": 1.5947, "step": 2267 }, { "epoch": 0.02, "grad_norm": 5.211675232112218, "learning_rate": 1.999928196064677e-06, "loss": 1.6732, "step": 2268 }, { "epoch": 0.02, "grad_norm": 4.691350080075693, "learning_rate": 1.9999281324517672e-06, "loss": 1.4264, "step": 2269 }, { "epoch": 0.02, "grad_norm": 4.310562396402286, "learning_rate": 1.9999280688106934e-06, "loss": 1.3275, "step": 2270 }, { "epoch": 0.02, "grad_norm": 4.653521933252809, "learning_rate": 1.999928005141454e-06, "loss": 1.2394, "step": 2271 }, { "epoch": 0.02, "grad_norm": 5.251196276782816, "learning_rate": 1.999927941444051e-06, "loss": 1.4138, "step": 2272 }, { "epoch": 0.02, "grad_norm": 4.73612278780909, "learning_rate": 1.9999278777184825e-06, "loss": 1.3869, "step": 2273 }, { "epoch": 0.02, "grad_norm": 4.700582189189819, "learning_rate": 1.99992781396475e-06, "loss": 1.2867, "step": 2274 }, { "epoch": 0.02, "grad_norm": 4.995811821783478, "learning_rate": 1.999927750182852e-06, "loss": 1.4574, "step": 2275 }, { "epoch": 0.02, "grad_norm": 5.7048294811265405, "learning_rate": 1.99992768637279e-06, "loss": 1.5312, "step": 2276 }, { "epoch": 0.02, "grad_norm": 6.015500835378081, "learning_rate": 1.9999276225345635e-06, "loss": 1.4603, "step": 2277 }, { "epoch": 0.02, "grad_norm": 5.038403242723015, "learning_rate": 1.9999275586681717e-06, "loss": 1.4336, "step": 2278 }, { "epoch": 0.02, "grad_norm": 4.94366068107535, "learning_rate": 1.9999274947736156e-06, "loss": 1.4457, "step": 2279 }, { "epoch": 0.02, "grad_norm": 4.853975513709176, "learning_rate": 1.9999274308508953e-06, "loss": 1.4836, "step": 2280 }, { "epoch": 0.02, "grad_norm": 10.250005051833288, "learning_rate": 1.99992736690001e-06, "loss": 1.4746, "step": 2281 }, { "epoch": 0.02, "grad_norm": 4.706101925023441, "learning_rate": 1.9999273029209597e-06, "loss": 1.5153, "step": 2282 }, { "epoch": 0.02, "grad_norm": 8.691916068442643, "learning_rate": 1.9999272389137453e-06, "loss": 1.6037, "step": 2283 }, { "epoch": 0.02, "grad_norm": 4.891521294365354, "learning_rate": 1.999927174878366e-06, "loss": 1.4662, "step": 2284 }, { "epoch": 0.02, "grad_norm": 4.299833570856004, "learning_rate": 1.9999271108148224e-06, "loss": 1.4442, "step": 2285 }, { "epoch": 0.02, "grad_norm": 5.233339122045351, "learning_rate": 1.9999270467231143e-06, "loss": 1.593, "step": 2286 }, { "epoch": 0.02, "grad_norm": 4.876412995521755, "learning_rate": 1.999926982603241e-06, "loss": 1.4954, "step": 2287 }, { "epoch": 0.02, "grad_norm": 4.83398554670689, "learning_rate": 1.9999269184552033e-06, "loss": 1.4952, "step": 2288 }, { "epoch": 0.02, "grad_norm": 4.659977577870166, "learning_rate": 1.999926854279001e-06, "loss": 1.5399, "step": 2289 }, { "epoch": 0.02, "grad_norm": 4.709246532034214, "learning_rate": 1.9999267900746343e-06, "loss": 1.4852, "step": 2290 }, { "epoch": 0.02, "grad_norm": 4.469809734473597, "learning_rate": 1.9999267258421028e-06, "loss": 1.4133, "step": 2291 }, { "epoch": 0.02, "grad_norm": 4.470510103145094, "learning_rate": 1.999926661581407e-06, "loss": 1.393, "step": 2292 }, { "epoch": 0.02, "grad_norm": 5.045637874654517, "learning_rate": 1.999926597292546e-06, "loss": 1.4533, "step": 2293 }, { "epoch": 0.02, "grad_norm": 7.56360911520591, "learning_rate": 1.9999265329755204e-06, "loss": 1.4627, "step": 2294 }, { "epoch": 0.02, "grad_norm": 4.90860484176041, "learning_rate": 1.9999264686303306e-06, "loss": 1.4674, "step": 2295 }, { "epoch": 0.02, "grad_norm": 4.513345359016672, "learning_rate": 1.999926404256976e-06, "loss": 1.2764, "step": 2296 }, { "epoch": 0.02, "grad_norm": 5.223838246271355, "learning_rate": 1.9999263398554567e-06, "loss": 1.5127, "step": 2297 }, { "epoch": 0.02, "grad_norm": 5.195325189394897, "learning_rate": 1.999926275425773e-06, "loss": 1.3677, "step": 2298 }, { "epoch": 0.02, "grad_norm": 5.198912590441455, "learning_rate": 1.999926210967925e-06, "loss": 1.3427, "step": 2299 }, { "epoch": 0.02, "grad_norm": 4.614705914091713, "learning_rate": 1.999926146481912e-06, "loss": 1.4733, "step": 2300 }, { "epoch": 0.02, "grad_norm": 4.975373741434217, "learning_rate": 1.9999260819677345e-06, "loss": 1.4271, "step": 2301 }, { "epoch": 0.02, "grad_norm": 4.970231553749659, "learning_rate": 1.9999260174253926e-06, "loss": 1.52, "step": 2302 }, { "epoch": 0.02, "grad_norm": 5.1470565052529516, "learning_rate": 1.9999259528548856e-06, "loss": 1.4938, "step": 2303 }, { "epoch": 0.02, "grad_norm": 4.804932112180643, "learning_rate": 1.9999258882562143e-06, "loss": 1.3168, "step": 2304 }, { "epoch": 0.02, "grad_norm": 5.117885144420734, "learning_rate": 1.9999258236293784e-06, "loss": 1.4501, "step": 2305 }, { "epoch": 0.02, "grad_norm": 4.891670263126413, "learning_rate": 1.999925758974378e-06, "loss": 1.4911, "step": 2306 }, { "epoch": 0.02, "grad_norm": 5.195863211639049, "learning_rate": 1.9999256942912132e-06, "loss": 1.4502, "step": 2307 }, { "epoch": 0.02, "grad_norm": 5.2574150657434, "learning_rate": 1.9999256295798832e-06, "loss": 1.5026, "step": 2308 }, { "epoch": 0.02, "grad_norm": 5.872600619683232, "learning_rate": 1.9999255648403893e-06, "loss": 1.5434, "step": 2309 }, { "epoch": 0.02, "grad_norm": 4.686714043104333, "learning_rate": 1.9999255000727303e-06, "loss": 1.6448, "step": 2310 }, { "epoch": 0.02, "grad_norm": 4.631001470345563, "learning_rate": 1.999925435276907e-06, "loss": 1.3702, "step": 2311 }, { "epoch": 0.02, "grad_norm": 5.337508348563074, "learning_rate": 1.999925370452919e-06, "loss": 1.3578, "step": 2312 }, { "epoch": 0.02, "grad_norm": 4.65818564635677, "learning_rate": 1.999925305600767e-06, "loss": 1.4024, "step": 2313 }, { "epoch": 0.02, "grad_norm": 4.379822492720928, "learning_rate": 1.99992524072045e-06, "loss": 1.4887, "step": 2314 }, { "epoch": 0.02, "grad_norm": 5.299169016401586, "learning_rate": 1.9999251758119684e-06, "loss": 1.4817, "step": 2315 }, { "epoch": 0.02, "grad_norm": 5.225815129184746, "learning_rate": 1.999925110875322e-06, "loss": 1.4054, "step": 2316 }, { "epoch": 0.02, "grad_norm": 4.568549504456492, "learning_rate": 1.999925045910511e-06, "loss": 1.4551, "step": 2317 }, { "epoch": 0.02, "grad_norm": 5.431025457178463, "learning_rate": 1.999924980917536e-06, "loss": 1.4775, "step": 2318 }, { "epoch": 0.02, "grad_norm": 4.8399275566764235, "learning_rate": 1.999924915896396e-06, "loss": 1.4959, "step": 2319 }, { "epoch": 0.02, "grad_norm": 6.473563523649748, "learning_rate": 1.9999248508470912e-06, "loss": 1.4546, "step": 2320 }, { "epoch": 0.02, "grad_norm": 4.740884057821344, "learning_rate": 1.9999247857696227e-06, "loss": 1.4305, "step": 2321 }, { "epoch": 0.02, "grad_norm": 4.860828036066125, "learning_rate": 1.9999247206639887e-06, "loss": 1.4348, "step": 2322 }, { "epoch": 0.02, "grad_norm": 4.6662009244963, "learning_rate": 1.999924655530191e-06, "loss": 1.55, "step": 2323 }, { "epoch": 0.02, "grad_norm": 5.9234684189906925, "learning_rate": 1.9999245903682283e-06, "loss": 1.4198, "step": 2324 }, { "epoch": 0.02, "grad_norm": 4.492554780845503, "learning_rate": 1.999924525178101e-06, "loss": 1.3904, "step": 2325 }, { "epoch": 0.02, "grad_norm": 6.763408364164771, "learning_rate": 1.9999244599598095e-06, "loss": 1.2435, "step": 2326 }, { "epoch": 0.02, "grad_norm": 5.2225195105349975, "learning_rate": 1.9999243947133532e-06, "loss": 1.4942, "step": 2327 }, { "epoch": 0.02, "grad_norm": 5.06710806212036, "learning_rate": 1.9999243294387323e-06, "loss": 1.3485, "step": 2328 }, { "epoch": 0.02, "grad_norm": 4.97215088823951, "learning_rate": 1.9999242641359467e-06, "loss": 1.4744, "step": 2329 }, { "epoch": 0.02, "grad_norm": 6.085684880572574, "learning_rate": 1.999924198804997e-06, "loss": 1.347, "step": 2330 }, { "epoch": 0.02, "grad_norm": 7.378163474560379, "learning_rate": 1.9999241334458823e-06, "loss": 1.3741, "step": 2331 }, { "epoch": 0.02, "grad_norm": 4.503096219870561, "learning_rate": 1.999924068058603e-06, "loss": 1.5229, "step": 2332 }, { "epoch": 0.02, "grad_norm": 5.36927927637507, "learning_rate": 1.99992400264316e-06, "loss": 1.207, "step": 2333 }, { "epoch": 0.02, "grad_norm": 4.457857920180457, "learning_rate": 1.9999239371995517e-06, "loss": 1.4385, "step": 2334 }, { "epoch": 0.02, "grad_norm": 4.6240106572076956, "learning_rate": 1.999923871727779e-06, "loss": 1.4643, "step": 2335 }, { "epoch": 0.02, "grad_norm": 4.861127113148184, "learning_rate": 1.999923806227842e-06, "loss": 1.423, "step": 2336 }, { "epoch": 0.02, "eval_loss": 1.6273260116577148, "eval_runtime": 4.6368, "eval_samples_per_second": 1.941, "eval_steps_per_second": 1.078, "step": 2336 }, { "epoch": 0.02, "grad_norm": 4.9052593670852405, "learning_rate": 1.99992374069974e-06, "loss": 1.4129, "step": 2337 }, { "epoch": 0.02, "grad_norm": 5.35788095136664, "learning_rate": 1.9999236751434744e-06, "loss": 1.5536, "step": 2338 }, { "epoch": 0.02, "grad_norm": 4.5895028769970505, "learning_rate": 1.9999236095590435e-06, "loss": 1.3911, "step": 2339 }, { "epoch": 0.02, "grad_norm": 4.831901014472646, "learning_rate": 1.999923543946448e-06, "loss": 1.4226, "step": 2340 }, { "epoch": 0.02, "grad_norm": 4.898166932033894, "learning_rate": 1.9999234783056886e-06, "loss": 1.5627, "step": 2341 }, { "epoch": 0.02, "grad_norm": 4.579327466598639, "learning_rate": 1.999923412636764e-06, "loss": 1.4576, "step": 2342 }, { "epoch": 0.02, "grad_norm": 5.894443631125458, "learning_rate": 1.9999233469396754e-06, "loss": 1.1901, "step": 2343 }, { "epoch": 0.02, "grad_norm": 5.691429649313104, "learning_rate": 1.999923281214422e-06, "loss": 1.6006, "step": 2344 }, { "epoch": 0.02, "grad_norm": 5.688934834826396, "learning_rate": 1.999923215461004e-06, "loss": 1.562, "step": 2345 }, { "epoch": 0.02, "grad_norm": 5.587544882543127, "learning_rate": 1.999923149679422e-06, "loss": 1.4466, "step": 2346 }, { "epoch": 0.02, "grad_norm": 4.412183277646008, "learning_rate": 1.999923083869675e-06, "loss": 1.4389, "step": 2347 }, { "epoch": 0.02, "grad_norm": 4.939665962691669, "learning_rate": 1.9999230180317637e-06, "loss": 1.3812, "step": 2348 }, { "epoch": 0.02, "grad_norm": 5.20037573679403, "learning_rate": 1.9999229521656876e-06, "loss": 1.3061, "step": 2349 }, { "epoch": 0.02, "grad_norm": 5.219055105472269, "learning_rate": 1.9999228862714473e-06, "loss": 1.5609, "step": 2350 }, { "epoch": 0.02, "grad_norm": 4.346948682121618, "learning_rate": 1.9999228203490427e-06, "loss": 1.3565, "step": 2351 }, { "epoch": 0.02, "grad_norm": 4.644800626615502, "learning_rate": 1.999922754398473e-06, "loss": 1.3957, "step": 2352 }, { "epoch": 0.02, "grad_norm": 4.526521338106829, "learning_rate": 1.9999226884197394e-06, "loss": 1.3943, "step": 2353 }, { "epoch": 0.02, "grad_norm": 4.860775731784836, "learning_rate": 1.999922622412841e-06, "loss": 1.5376, "step": 2354 }, { "epoch": 0.02, "grad_norm": 4.987367583243536, "learning_rate": 1.999922556377778e-06, "loss": 1.1198, "step": 2355 }, { "epoch": 0.02, "grad_norm": 4.734455961994287, "learning_rate": 1.999922490314551e-06, "loss": 1.5315, "step": 2356 }, { "epoch": 0.02, "grad_norm": 4.71775855055711, "learning_rate": 1.999922424223159e-06, "loss": 1.4829, "step": 2357 }, { "epoch": 0.02, "grad_norm": 6.340625782014353, "learning_rate": 1.999922358103603e-06, "loss": 1.3968, "step": 2358 }, { "epoch": 0.02, "grad_norm": 4.624947314913078, "learning_rate": 1.999922291955882e-06, "loss": 1.3363, "step": 2359 }, { "epoch": 0.02, "grad_norm": 4.617825324071575, "learning_rate": 1.9999222257799964e-06, "loss": 1.5, "step": 2360 }, { "epoch": 0.02, "grad_norm": 4.485319822796489, "learning_rate": 1.999922159575947e-06, "loss": 1.4177, "step": 2361 }, { "epoch": 0.02, "grad_norm": 4.850158299358528, "learning_rate": 1.9999220933437324e-06, "loss": 1.3689, "step": 2362 }, { "epoch": 0.02, "grad_norm": 5.219052152565596, "learning_rate": 1.999922027083354e-06, "loss": 1.4092, "step": 2363 }, { "epoch": 0.02, "grad_norm": 4.931613378755357, "learning_rate": 1.999921960794811e-06, "loss": 1.326, "step": 2364 }, { "epoch": 0.02, "grad_norm": 4.992683247482379, "learning_rate": 1.9999218944781032e-06, "loss": 1.4233, "step": 2365 }, { "epoch": 0.02, "grad_norm": 4.349977050699164, "learning_rate": 1.9999218281332308e-06, "loss": 1.3647, "step": 2366 }, { "epoch": 0.02, "grad_norm": 4.701955352762745, "learning_rate": 1.999921761760194e-06, "loss": 1.4519, "step": 2367 }, { "epoch": 0.02, "grad_norm": 4.494673397335141, "learning_rate": 1.999921695358993e-06, "loss": 1.4392, "step": 2368 }, { "epoch": 0.02, "grad_norm": 4.7365593408579345, "learning_rate": 1.9999216289296273e-06, "loss": 1.3825, "step": 2369 }, { "epoch": 0.02, "grad_norm": 4.856324736948551, "learning_rate": 1.9999215624720974e-06, "loss": 1.5478, "step": 2370 }, { "epoch": 0.02, "grad_norm": 4.8131720405427645, "learning_rate": 1.999921495986403e-06, "loss": 1.4974, "step": 2371 }, { "epoch": 0.02, "grad_norm": 4.6424157982225935, "learning_rate": 1.9999214294725442e-06, "loss": 1.5017, "step": 2372 }, { "epoch": 0.02, "grad_norm": 5.7994216756118595, "learning_rate": 1.9999213629305206e-06, "loss": 1.3935, "step": 2373 }, { "epoch": 0.02, "grad_norm": 12.161522599209015, "learning_rate": 1.9999212963603328e-06, "loss": 1.5953, "step": 2374 }, { "epoch": 0.02, "grad_norm": 4.663179583938319, "learning_rate": 1.9999212297619806e-06, "loss": 1.3695, "step": 2375 }, { "epoch": 0.02, "grad_norm": 4.992382036334836, "learning_rate": 1.9999211631354633e-06, "loss": 1.5055, "step": 2376 }, { "epoch": 0.02, "grad_norm": 4.8753351925338695, "learning_rate": 1.9999210964807822e-06, "loss": 1.3948, "step": 2377 }, { "epoch": 0.02, "grad_norm": 6.222563145132559, "learning_rate": 1.9999210297979364e-06, "loss": 1.3826, "step": 2378 }, { "epoch": 0.02, "grad_norm": 4.88643245267929, "learning_rate": 1.9999209630869264e-06, "loss": 1.3931, "step": 2379 }, { "epoch": 0.02, "grad_norm": 5.560779789666663, "learning_rate": 1.999920896347752e-06, "loss": 1.4941, "step": 2380 }, { "epoch": 0.02, "grad_norm": 5.296411387581933, "learning_rate": 1.999920829580413e-06, "loss": 1.5771, "step": 2381 }, { "epoch": 0.02, "grad_norm": 4.739960436369839, "learning_rate": 1.9999207627849093e-06, "loss": 1.402, "step": 2382 }, { "epoch": 0.02, "grad_norm": 4.494606888309802, "learning_rate": 1.9999206959612413e-06, "loss": 1.3666, "step": 2383 }, { "epoch": 0.02, "grad_norm": 6.102337146702119, "learning_rate": 1.999920629109409e-06, "loss": 1.4514, "step": 2384 }, { "epoch": 0.02, "grad_norm": 4.2918281175179365, "learning_rate": 1.999920562229412e-06, "loss": 1.4275, "step": 2385 }, { "epoch": 0.02, "grad_norm": 4.5428032056841845, "learning_rate": 1.999920495321251e-06, "loss": 1.4875, "step": 2386 }, { "epoch": 0.02, "grad_norm": 4.981403898596884, "learning_rate": 1.9999204283849253e-06, "loss": 1.4604, "step": 2387 }, { "epoch": 0.02, "grad_norm": 4.656946904533867, "learning_rate": 1.999920361420435e-06, "loss": 1.3959, "step": 2388 }, { "epoch": 0.02, "grad_norm": 4.69793455074141, "learning_rate": 1.9999202944277807e-06, "loss": 1.3798, "step": 2389 }, { "epoch": 0.02, "grad_norm": 4.8697187270242415, "learning_rate": 1.9999202274069615e-06, "loss": 1.5877, "step": 2390 }, { "epoch": 0.02, "grad_norm": 6.303206163342291, "learning_rate": 1.999920160357978e-06, "loss": 1.3649, "step": 2391 }, { "epoch": 0.02, "grad_norm": 8.428043235458622, "learning_rate": 1.9999200932808304e-06, "loss": 1.4989, "step": 2392 }, { "epoch": 0.02, "grad_norm": 4.69775284325294, "learning_rate": 1.999920026175518e-06, "loss": 1.4359, "step": 2393 }, { "epoch": 0.02, "grad_norm": 7.468054893971967, "learning_rate": 1.9999199590420414e-06, "loss": 1.3861, "step": 2394 }, { "epoch": 0.02, "grad_norm": 5.1096088777638675, "learning_rate": 1.9999198918804005e-06, "loss": 1.3725, "step": 2395 }, { "epoch": 0.02, "grad_norm": 5.09309736334753, "learning_rate": 1.999919824690595e-06, "loss": 1.354, "step": 2396 }, { "epoch": 0.02, "grad_norm": 5.061146836059262, "learning_rate": 1.999919757472625e-06, "loss": 1.4047, "step": 2397 }, { "epoch": 0.02, "grad_norm": 4.698403918335731, "learning_rate": 1.999919690226491e-06, "loss": 1.5047, "step": 2398 }, { "epoch": 0.02, "grad_norm": 4.722675479564567, "learning_rate": 1.999919622952192e-06, "loss": 1.455, "step": 2399 }, { "epoch": 0.02, "grad_norm": 4.676824374912994, "learning_rate": 1.999919555649729e-06, "loss": 1.3603, "step": 2400 }, { "epoch": 0.02, "grad_norm": 4.660790706723943, "learning_rate": 1.999919488319101e-06, "loss": 1.4138, "step": 2401 }, { "epoch": 0.02, "grad_norm": 5.8886919642276006, "learning_rate": 1.9999194209603095e-06, "loss": 1.4662, "step": 2402 }, { "epoch": 0.02, "grad_norm": 4.728150466755214, "learning_rate": 1.999919353573353e-06, "loss": 1.3994, "step": 2403 }, { "epoch": 0.02, "grad_norm": 5.2893263295633, "learning_rate": 1.999919286158232e-06, "loss": 1.5503, "step": 2404 }, { "epoch": 0.02, "grad_norm": 4.462205296842043, "learning_rate": 1.9999192187149468e-06, "loss": 1.4898, "step": 2405 }, { "epoch": 0.02, "grad_norm": 4.171445485236358, "learning_rate": 1.9999191512434972e-06, "loss": 1.2432, "step": 2406 }, { "epoch": 0.02, "grad_norm": 7.065808066331224, "learning_rate": 1.9999190837438834e-06, "loss": 1.6226, "step": 2407 }, { "epoch": 0.02, "grad_norm": 4.94314214884708, "learning_rate": 1.999919016216105e-06, "loss": 1.4387, "step": 2408 }, { "epoch": 0.02, "grad_norm": 5.321472894774982, "learning_rate": 1.9999189486601625e-06, "loss": 1.525, "step": 2409 }, { "epoch": 0.02, "eval_loss": 1.6314270496368408, "eval_runtime": 4.6321, "eval_samples_per_second": 1.943, "eval_steps_per_second": 1.079, "step": 2409 }, { "epoch": 0.02, "grad_norm": 5.2667747042009365, "learning_rate": 1.9999188810760554e-06, "loss": 1.4374, "step": 2410 }, { "epoch": 0.02, "grad_norm": 8.33459038400234, "learning_rate": 1.9999188134637836e-06, "loss": 1.5239, "step": 2411 }, { "epoch": 0.02, "grad_norm": 4.682545029673098, "learning_rate": 1.9999187458233476e-06, "loss": 1.503, "step": 2412 }, { "epoch": 0.02, "grad_norm": 5.067629662325673, "learning_rate": 1.9999186781547473e-06, "loss": 1.5611, "step": 2413 }, { "epoch": 0.02, "grad_norm": 4.885518592851602, "learning_rate": 1.9999186104579827e-06, "loss": 1.399, "step": 2414 }, { "epoch": 0.02, "grad_norm": 4.8623966665305245, "learning_rate": 1.9999185427330535e-06, "loss": 1.5801, "step": 2415 }, { "epoch": 0.02, "grad_norm": 5.4540876235305555, "learning_rate": 1.9999184749799604e-06, "loss": 1.5821, "step": 2416 }, { "epoch": 0.02, "grad_norm": 4.705447443475058, "learning_rate": 1.999918407198702e-06, "loss": 1.3744, "step": 2417 }, { "epoch": 0.02, "grad_norm": 4.51004939941681, "learning_rate": 1.99991833938928e-06, "loss": 1.5278, "step": 2418 }, { "epoch": 0.02, "grad_norm": 5.781795445026661, "learning_rate": 1.9999182715516937e-06, "loss": 1.5668, "step": 2419 }, { "epoch": 0.02, "grad_norm": 5.4984493425238945, "learning_rate": 1.9999182036859427e-06, "loss": 1.4154, "step": 2420 }, { "epoch": 0.02, "grad_norm": 5.6682629310048345, "learning_rate": 1.999918135792027e-06, "loss": 1.1895, "step": 2421 }, { "epoch": 0.02, "grad_norm": 5.157730325288316, "learning_rate": 1.9999180678699474e-06, "loss": 1.3339, "step": 2422 }, { "epoch": 0.02, "grad_norm": 4.318416120479912, "learning_rate": 1.999917999919703e-06, "loss": 1.2886, "step": 2423 }, { "epoch": 0.02, "grad_norm": 6.782830315820668, "learning_rate": 1.999917931941295e-06, "loss": 1.3462, "step": 2424 }, { "epoch": 0.02, "grad_norm": 4.9884978695023685, "learning_rate": 1.9999178639347223e-06, "loss": 1.4021, "step": 2425 }, { "epoch": 0.02, "grad_norm": 4.93256932892002, "learning_rate": 1.999917795899985e-06, "loss": 1.493, "step": 2426 }, { "epoch": 0.02, "grad_norm": 4.556244224765811, "learning_rate": 1.9999177278370834e-06, "loss": 1.4557, "step": 2427 }, { "epoch": 0.02, "grad_norm": 5.629046540813747, "learning_rate": 1.999917659746018e-06, "loss": 1.5647, "step": 2428 }, { "epoch": 0.02, "grad_norm": 5.073793267819167, "learning_rate": 1.9999175916267875e-06, "loss": 1.5932, "step": 2429 }, { "epoch": 0.02, "grad_norm": 4.560810629230434, "learning_rate": 1.999917523479393e-06, "loss": 1.4904, "step": 2430 }, { "epoch": 0.02, "grad_norm": 5.097830006203227, "learning_rate": 1.999917455303834e-06, "loss": 1.4545, "step": 2431 }, { "epoch": 0.02, "grad_norm": 4.589161304086774, "learning_rate": 1.999917387100111e-06, "loss": 1.4455, "step": 2432 }, { "epoch": 0.02, "grad_norm": 5.262666403698471, "learning_rate": 1.9999173188682232e-06, "loss": 1.5229, "step": 2433 }, { "epoch": 0.02, "grad_norm": 6.509535961863751, "learning_rate": 1.999917250608171e-06, "loss": 1.4361, "step": 2434 }, { "epoch": 0.02, "grad_norm": 4.508763347593167, "learning_rate": 1.999917182319955e-06, "loss": 1.2921, "step": 2435 }, { "epoch": 0.02, "grad_norm": 5.40129312190739, "learning_rate": 1.999917114003574e-06, "loss": 1.3886, "step": 2436 }, { "epoch": 0.02, "grad_norm": 4.645761685555377, "learning_rate": 1.9999170456590293e-06, "loss": 1.3333, "step": 2437 }, { "epoch": 0.02, "grad_norm": 4.642316987637496, "learning_rate": 1.99991697728632e-06, "loss": 1.4236, "step": 2438 }, { "epoch": 0.02, "grad_norm": 5.516269375769559, "learning_rate": 1.9999169088854464e-06, "loss": 1.4596, "step": 2439 }, { "epoch": 0.02, "grad_norm": 4.96343766345647, "learning_rate": 1.9999168404564082e-06, "loss": 1.3929, "step": 2440 }, { "epoch": 0.02, "grad_norm": 4.69420089914068, "learning_rate": 1.999916771999206e-06, "loss": 1.4285, "step": 2441 }, { "epoch": 0.02, "grad_norm": 4.942122889945423, "learning_rate": 1.9999167035138392e-06, "loss": 1.4459, "step": 2442 }, { "epoch": 0.02, "grad_norm": 4.716647864815381, "learning_rate": 1.9999166350003083e-06, "loss": 1.4375, "step": 2443 }, { "epoch": 0.02, "grad_norm": 4.558659758531409, "learning_rate": 1.999916566458613e-06, "loss": 1.5505, "step": 2444 }, { "epoch": 0.02, "grad_norm": 5.032420136381452, "learning_rate": 1.9999164978887537e-06, "loss": 1.3798, "step": 2445 }, { "epoch": 0.02, "grad_norm": 4.387046415116763, "learning_rate": 1.9999164292907295e-06, "loss": 1.4069, "step": 2446 }, { "epoch": 0.02, "grad_norm": 4.829694624857794, "learning_rate": 1.999916360664541e-06, "loss": 1.4942, "step": 2447 }, { "epoch": 0.02, "grad_norm": 4.304283704762636, "learning_rate": 1.9999162920101885e-06, "loss": 1.3792, "step": 2448 }, { "epoch": 0.02, "grad_norm": 4.957652008482992, "learning_rate": 1.9999162233276715e-06, "loss": 1.5595, "step": 2449 }, { "epoch": 0.02, "grad_norm": 4.881099312639281, "learning_rate": 1.9999161546169903e-06, "loss": 1.3463, "step": 2450 }, { "epoch": 0.02, "grad_norm": 6.55918290195136, "learning_rate": 1.999916085878145e-06, "loss": 1.3157, "step": 2451 }, { "epoch": 0.02, "grad_norm": 4.933187617415705, "learning_rate": 1.999916017111135e-06, "loss": 1.325, "step": 2452 }, { "epoch": 0.02, "grad_norm": 5.522893367254299, "learning_rate": 1.999915948315961e-06, "loss": 1.3697, "step": 2453 }, { "epoch": 0.02, "grad_norm": 4.753016879907898, "learning_rate": 1.9999158794926227e-06, "loss": 1.5241, "step": 2454 }, { "epoch": 0.02, "grad_norm": 4.738367977115928, "learning_rate": 1.9999158106411197e-06, "loss": 1.5124, "step": 2455 }, { "epoch": 0.02, "grad_norm": 5.259436967977112, "learning_rate": 1.999915741761453e-06, "loss": 1.1845, "step": 2456 }, { "epoch": 0.02, "grad_norm": 5.327412482017732, "learning_rate": 1.9999156728536214e-06, "loss": 1.4195, "step": 2457 }, { "epoch": 0.02, "grad_norm": 4.815255649938258, "learning_rate": 1.9999156039176256e-06, "loss": 1.4678, "step": 2458 }, { "epoch": 0.02, "grad_norm": 4.830224872542197, "learning_rate": 1.999915534953466e-06, "loss": 1.4407, "step": 2459 }, { "epoch": 0.02, "grad_norm": 5.055447650223887, "learning_rate": 1.9999154659611416e-06, "loss": 1.3783, "step": 2460 }, { "epoch": 0.02, "grad_norm": 4.582184501347047, "learning_rate": 1.999915396940653e-06, "loss": 1.4415, "step": 2461 }, { "epoch": 0.02, "grad_norm": 4.691006151625649, "learning_rate": 1.999915327892e-06, "loss": 1.5197, "step": 2462 }, { "epoch": 0.02, "grad_norm": 5.591425262583514, "learning_rate": 1.999915258815183e-06, "loss": 1.4104, "step": 2463 }, { "epoch": 0.02, "grad_norm": 4.871239888750451, "learning_rate": 1.9999151897102016e-06, "loss": 1.3372, "step": 2464 }, { "epoch": 0.02, "grad_norm": 4.330668450921265, "learning_rate": 1.999915120577056e-06, "loss": 1.4389, "step": 2465 }, { "epoch": 0.02, "grad_norm": 5.340186565522926, "learning_rate": 1.999915051415746e-06, "loss": 1.4118, "step": 2466 }, { "epoch": 0.02, "grad_norm": 6.261449629532745, "learning_rate": 1.9999149822262718e-06, "loss": 1.7365, "step": 2467 }, { "epoch": 0.02, "grad_norm": 4.742851588374142, "learning_rate": 1.999914913008633e-06, "loss": 1.4563, "step": 2468 }, { "epoch": 0.02, "grad_norm": 5.025387402777135, "learning_rate": 1.99991484376283e-06, "loss": 1.4834, "step": 2469 }, { "epoch": 0.02, "grad_norm": 4.355185254680489, "learning_rate": 1.999914774488863e-06, "loss": 1.443, "step": 2470 }, { "epoch": 0.02, "grad_norm": 4.520118266978153, "learning_rate": 1.999914705186732e-06, "loss": 1.2398, "step": 2471 }, { "epoch": 0.02, "grad_norm": 4.679385582459207, "learning_rate": 1.9999146358564362e-06, "loss": 1.3546, "step": 2472 }, { "epoch": 0.02, "grad_norm": 4.653878537687508, "learning_rate": 1.9999145664979764e-06, "loss": 1.3641, "step": 2473 }, { "epoch": 0.02, "grad_norm": 5.288484525559772, "learning_rate": 1.9999144971113523e-06, "loss": 1.4576, "step": 2474 }, { "epoch": 0.02, "grad_norm": 5.113711451335311, "learning_rate": 1.999914427696564e-06, "loss": 1.1885, "step": 2475 }, { "epoch": 0.02, "grad_norm": 4.792513033095759, "learning_rate": 1.9999143582536113e-06, "loss": 1.49, "step": 2476 }, { "epoch": 0.02, "grad_norm": 4.916268869163306, "learning_rate": 1.999914288782494e-06, "loss": 1.4821, "step": 2477 }, { "epoch": 0.02, "grad_norm": 4.9324603436046415, "learning_rate": 1.999914219283213e-06, "loss": 1.5012, "step": 2478 }, { "epoch": 0.02, "grad_norm": 5.283635481534406, "learning_rate": 1.9999141497557674e-06, "loss": 1.3083, "step": 2479 }, { "epoch": 0.02, "grad_norm": 5.3789031660655935, "learning_rate": 1.9999140802001576e-06, "loss": 1.5692, "step": 2480 }, { "epoch": 0.02, "grad_norm": 4.496253992299153, "learning_rate": 1.9999140106163837e-06, "loss": 1.475, "step": 2481 }, { "epoch": 0.02, "grad_norm": 4.810348760053238, "learning_rate": 1.9999139410044454e-06, "loss": 1.3871, "step": 2482 }, { "epoch": 0.02, "eval_loss": 1.6317201852798462, "eval_runtime": 4.6311, "eval_samples_per_second": 1.943, "eval_steps_per_second": 1.08, "step": 2482 }, { "epoch": 0.02, "grad_norm": 4.936385744644064, "learning_rate": 1.999913871364343e-06, "loss": 1.5163, "step": 2483 }, { "epoch": 0.02, "grad_norm": 5.3932940350781315, "learning_rate": 1.999913801696076e-06, "loss": 1.2527, "step": 2484 }, { "epoch": 0.02, "grad_norm": 4.527212771343054, "learning_rate": 1.999913731999645e-06, "loss": 1.3808, "step": 2485 }, { "epoch": 0.02, "grad_norm": 4.6367894135268415, "learning_rate": 1.9999136622750497e-06, "loss": 1.4644, "step": 2486 }, { "epoch": 0.02, "grad_norm": 5.431008065838847, "learning_rate": 1.9999135925222905e-06, "loss": 1.4108, "step": 2487 }, { "epoch": 0.02, "grad_norm": 4.60955842936837, "learning_rate": 1.9999135227413667e-06, "loss": 1.5293, "step": 2488 }, { "epoch": 0.02, "grad_norm": 4.815249149207515, "learning_rate": 1.9999134529322785e-06, "loss": 1.4405, "step": 2489 }, { "epoch": 0.02, "grad_norm": 6.905408881518468, "learning_rate": 1.9999133830950266e-06, "loss": 1.5907, "step": 2490 }, { "epoch": 0.02, "grad_norm": 5.3654121964040815, "learning_rate": 1.99991331322961e-06, "loss": 1.5178, "step": 2491 }, { "epoch": 0.02, "grad_norm": 4.58569484491462, "learning_rate": 1.9999132433360294e-06, "loss": 1.3849, "step": 2492 }, { "epoch": 0.02, "grad_norm": 7.861754836669881, "learning_rate": 1.9999131734142846e-06, "loss": 1.5712, "step": 2493 }, { "epoch": 0.02, "grad_norm": 4.651684688922827, "learning_rate": 1.9999131034643755e-06, "loss": 1.4253, "step": 2494 }, { "epoch": 0.02, "grad_norm": 4.7784530707603885, "learning_rate": 1.9999130334863022e-06, "loss": 1.4367, "step": 2495 }, { "epoch": 0.02, "grad_norm": 4.616133306685504, "learning_rate": 1.9999129634800646e-06, "loss": 1.3498, "step": 2496 }, { "epoch": 0.02, "grad_norm": 5.119118711647458, "learning_rate": 1.9999128934456628e-06, "loss": 1.484, "step": 2497 }, { "epoch": 0.02, "grad_norm": 5.283146448319006, "learning_rate": 1.999912823383097e-06, "loss": 1.3398, "step": 2498 }, { "epoch": 0.02, "grad_norm": 4.571981013911557, "learning_rate": 1.9999127532923667e-06, "loss": 1.2996, "step": 2499 }, { "epoch": 0.02, "grad_norm": 5.332851655290826, "learning_rate": 1.999912683173472e-06, "loss": 1.5005, "step": 2500 }, { "epoch": 0.02, "grad_norm": 10.210017666226172, "learning_rate": 1.9999126130264135e-06, "loss": 1.3147, "step": 2501 }, { "epoch": 0.02, "grad_norm": 8.862337302086877, "learning_rate": 1.9999125428511907e-06, "loss": 1.3428, "step": 2502 }, { "epoch": 0.02, "grad_norm": 4.576195646780718, "learning_rate": 1.9999124726478036e-06, "loss": 1.4371, "step": 2503 }, { "epoch": 0.02, "grad_norm": 5.339924831875166, "learning_rate": 1.9999124024162523e-06, "loss": 1.5637, "step": 2504 }, { "epoch": 0.02, "grad_norm": 4.7192701371337025, "learning_rate": 1.9999123321565367e-06, "loss": 1.4242, "step": 2505 }, { "epoch": 0.02, "grad_norm": 5.163473966004869, "learning_rate": 1.999912261868657e-06, "loss": 1.3637, "step": 2506 }, { "epoch": 0.02, "grad_norm": 4.337334857544459, "learning_rate": 1.999912191552613e-06, "loss": 1.2784, "step": 2507 }, { "epoch": 0.02, "grad_norm": 6.051444830035015, "learning_rate": 1.999912121208405e-06, "loss": 1.6266, "step": 2508 }, { "epoch": 0.02, "grad_norm": 4.479649509721164, "learning_rate": 1.999912050836033e-06, "loss": 1.2662, "step": 2509 }, { "epoch": 0.02, "grad_norm": 4.472943621796013, "learning_rate": 1.9999119804354964e-06, "loss": 1.4028, "step": 2510 }, { "epoch": 0.02, "grad_norm": 4.710024013128741, "learning_rate": 1.9999119100067957e-06, "loss": 1.3498, "step": 2511 }, { "epoch": 0.02, "grad_norm": 4.970475591078188, "learning_rate": 1.9999118395499306e-06, "loss": 1.5096, "step": 2512 }, { "epoch": 0.02, "grad_norm": 4.75371637466236, "learning_rate": 1.9999117690649017e-06, "loss": 1.4432, "step": 2513 }, { "epoch": 0.02, "grad_norm": 4.468537061799282, "learning_rate": 1.9999116985517085e-06, "loss": 1.4252, "step": 2514 }, { "epoch": 0.02, "grad_norm": 4.49951522854794, "learning_rate": 1.999911628010351e-06, "loss": 1.3306, "step": 2515 }, { "epoch": 0.02, "grad_norm": 4.719020079035169, "learning_rate": 1.9999115574408294e-06, "loss": 1.3402, "step": 2516 }, { "epoch": 0.02, "grad_norm": 4.795670467668128, "learning_rate": 1.9999114868431434e-06, "loss": 1.4464, "step": 2517 }, { "epoch": 0.02, "grad_norm": 4.36164451676725, "learning_rate": 1.9999114162172936e-06, "loss": 1.3221, "step": 2518 }, { "epoch": 0.02, "grad_norm": 4.752857588130247, "learning_rate": 1.999911345563279e-06, "loss": 1.2821, "step": 2519 }, { "epoch": 0.02, "grad_norm": 5.279730766268046, "learning_rate": 1.9999112748811008e-06, "loss": 1.4016, "step": 2520 }, { "epoch": 0.02, "grad_norm": 4.695047237818786, "learning_rate": 1.9999112041707586e-06, "loss": 1.4613, "step": 2521 }, { "epoch": 0.02, "grad_norm": 4.692585340555871, "learning_rate": 1.9999111334322517e-06, "loss": 1.4694, "step": 2522 }, { "epoch": 0.02, "grad_norm": 4.771936002396662, "learning_rate": 1.999911062665581e-06, "loss": 1.3274, "step": 2523 }, { "epoch": 0.02, "grad_norm": 5.220174817764933, "learning_rate": 1.999910991870746e-06, "loss": 1.571, "step": 2524 }, { "epoch": 0.02, "grad_norm": 5.672229165440042, "learning_rate": 1.9999109210477466e-06, "loss": 1.5377, "step": 2525 }, { "epoch": 0.02, "grad_norm": 5.1839178319765935, "learning_rate": 1.999910850196583e-06, "loss": 1.3993, "step": 2526 }, { "epoch": 0.02, "grad_norm": 4.681220552283899, "learning_rate": 1.9999107793172557e-06, "loss": 1.4291, "step": 2527 }, { "epoch": 0.02, "grad_norm": 4.972007639821219, "learning_rate": 1.999910708409764e-06, "loss": 1.431, "step": 2528 }, { "epoch": 0.02, "grad_norm": 4.4861243909511295, "learning_rate": 1.999910637474108e-06, "loss": 1.4019, "step": 2529 }, { "epoch": 0.02, "grad_norm": 4.851005767238794, "learning_rate": 1.9999105665102883e-06, "loss": 1.4337, "step": 2530 }, { "epoch": 0.02, "grad_norm": 4.846084074130138, "learning_rate": 1.999910495518304e-06, "loss": 1.5946, "step": 2531 }, { "epoch": 0.02, "grad_norm": 4.536951575081545, "learning_rate": 1.9999104244981556e-06, "loss": 1.4743, "step": 2532 }, { "epoch": 0.02, "grad_norm": 7.248194758577965, "learning_rate": 1.9999103534498434e-06, "loss": 1.3195, "step": 2533 }, { "epoch": 0.02, "grad_norm": 6.056484295400697, "learning_rate": 1.9999102823733665e-06, "loss": 1.2914, "step": 2534 }, { "epoch": 0.02, "grad_norm": 4.505711846073996, "learning_rate": 1.999910211268726e-06, "loss": 1.4096, "step": 2535 }, { "epoch": 0.02, "grad_norm": 6.9063691562807055, "learning_rate": 1.999910140135921e-06, "loss": 1.2676, "step": 2536 }, { "epoch": 0.02, "grad_norm": 4.783198227824001, "learning_rate": 1.9999100689749517e-06, "loss": 1.4266, "step": 2537 }, { "epoch": 0.02, "grad_norm": 4.4803987517358665, "learning_rate": 1.9999099977858186e-06, "loss": 1.3878, "step": 2538 }, { "epoch": 0.02, "grad_norm": 21.125075865811542, "learning_rate": 1.9999099265685212e-06, "loss": 1.481, "step": 2539 }, { "epoch": 0.02, "grad_norm": 4.551735343167499, "learning_rate": 1.99990985532306e-06, "loss": 1.3335, "step": 2540 }, { "epoch": 0.02, "grad_norm": 4.525931993787852, "learning_rate": 1.999909784049434e-06, "loss": 1.4145, "step": 2541 }, { "epoch": 0.02, "grad_norm": 4.723242340525657, "learning_rate": 1.9999097127476444e-06, "loss": 1.425, "step": 2542 }, { "epoch": 0.02, "grad_norm": 11.52159628767104, "learning_rate": 1.9999096414176904e-06, "loss": 1.5459, "step": 2543 }, { "epoch": 0.02, "grad_norm": 4.499238976068291, "learning_rate": 1.9999095700595726e-06, "loss": 1.5052, "step": 2544 }, { "epoch": 0.02, "grad_norm": 4.3943003554534315, "learning_rate": 1.999909498673291e-06, "loss": 1.3532, "step": 2545 }, { "epoch": 0.02, "grad_norm": 5.384405241916104, "learning_rate": 1.9999094272588445e-06, "loss": 1.5439, "step": 2546 }, { "epoch": 0.02, "grad_norm": 5.401136108906954, "learning_rate": 1.999909355816234e-06, "loss": 1.4723, "step": 2547 }, { "epoch": 0.02, "grad_norm": 5.4369310319106114, "learning_rate": 1.9999092843454593e-06, "loss": 1.4131, "step": 2548 }, { "epoch": 0.02, "grad_norm": 4.975825034724867, "learning_rate": 1.999909212846521e-06, "loss": 1.5112, "step": 2549 }, { "epoch": 0.02, "grad_norm": 6.256239787239192, "learning_rate": 1.999909141319418e-06, "loss": 1.5494, "step": 2550 }, { "epoch": 0.02, "grad_norm": 4.563176202007188, "learning_rate": 1.9999090697641515e-06, "loss": 1.3635, "step": 2551 }, { "epoch": 0.02, "grad_norm": 4.431366662952086, "learning_rate": 1.9999089981807203e-06, "loss": 1.3677, "step": 2552 }, { "epoch": 0.02, "grad_norm": 4.189139491517448, "learning_rate": 1.9999089265691253e-06, "loss": 1.3525, "step": 2553 }, { "epoch": 0.02, "grad_norm": 5.080861898020357, "learning_rate": 1.999908854929366e-06, "loss": 1.3792, "step": 2554 }, { "epoch": 0.02, "grad_norm": 6.3571862989208325, "learning_rate": 1.9999087832614425e-06, "loss": 1.5974, "step": 2555 }, { "epoch": 0.02, "eval_loss": 1.626267433166504, "eval_runtime": 4.6223, "eval_samples_per_second": 1.947, "eval_steps_per_second": 1.082, "step": 2555 }, { "epoch": 0.02, "grad_norm": 5.82494449812951, "learning_rate": 1.9999087115653556e-06, "loss": 1.4692, "step": 2556 }, { "epoch": 0.02, "grad_norm": 4.929797834611915, "learning_rate": 1.999908639841104e-06, "loss": 1.4674, "step": 2557 }, { "epoch": 0.02, "grad_norm": 5.589160350904509, "learning_rate": 1.9999085680886884e-06, "loss": 1.4853, "step": 2558 }, { "epoch": 0.02, "grad_norm": 4.657411083617213, "learning_rate": 1.9999084963081087e-06, "loss": 1.3349, "step": 2559 }, { "epoch": 0.02, "grad_norm": 5.635155884205405, "learning_rate": 1.999908424499365e-06, "loss": 1.3812, "step": 2560 }, { "epoch": 0.02, "grad_norm": 11.039436368621972, "learning_rate": 1.9999083526624568e-06, "loss": 1.4228, "step": 2561 }, { "epoch": 0.02, "grad_norm": 4.731509118685059, "learning_rate": 1.999908280797385e-06, "loss": 1.5188, "step": 2562 }, { "epoch": 0.02, "grad_norm": 4.845729221450556, "learning_rate": 1.999908208904149e-06, "loss": 1.3457, "step": 2563 }, { "epoch": 0.02, "grad_norm": 4.6098361015362705, "learning_rate": 1.9999081369827488e-06, "loss": 1.4256, "step": 2564 }, { "epoch": 0.02, "grad_norm": 4.916273960056245, "learning_rate": 1.9999080650331847e-06, "loss": 1.3279, "step": 2565 }, { "epoch": 0.02, "grad_norm": 4.438335756166127, "learning_rate": 1.9999079930554563e-06, "loss": 1.241, "step": 2566 }, { "epoch": 0.02, "grad_norm": 4.817882655917825, "learning_rate": 1.999907921049564e-06, "loss": 1.4117, "step": 2567 }, { "epoch": 0.02, "grad_norm": 5.003123275534057, "learning_rate": 1.9999078490155076e-06, "loss": 1.2974, "step": 2568 }, { "epoch": 0.02, "grad_norm": 5.017000975631727, "learning_rate": 1.999907776953287e-06, "loss": 1.363, "step": 2569 }, { "epoch": 0.02, "grad_norm": 4.600005159137907, "learning_rate": 1.9999077048629022e-06, "loss": 1.4458, "step": 2570 }, { "epoch": 0.02, "grad_norm": 4.9265496357161425, "learning_rate": 1.9999076327443538e-06, "loss": 1.3172, "step": 2571 }, { "epoch": 0.02, "grad_norm": 5.824876884767345, "learning_rate": 1.9999075605976406e-06, "loss": 1.4476, "step": 2572 }, { "epoch": 0.02, "grad_norm": 4.361863130512629, "learning_rate": 1.9999074884227636e-06, "loss": 1.3836, "step": 2573 }, { "epoch": 0.02, "grad_norm": 4.7955376481293115, "learning_rate": 1.999907416219723e-06, "loss": 1.3473, "step": 2574 }, { "epoch": 0.02, "grad_norm": 4.666469831511099, "learning_rate": 1.9999073439885177e-06, "loss": 1.4382, "step": 2575 }, { "epoch": 0.02, "grad_norm": 4.73863410034199, "learning_rate": 1.999907271729149e-06, "loss": 1.3452, "step": 2576 }, { "epoch": 0.02, "grad_norm": 4.535702568291937, "learning_rate": 1.999907199441616e-06, "loss": 1.433, "step": 2577 }, { "epoch": 0.02, "grad_norm": 4.577074935520658, "learning_rate": 1.999907127125919e-06, "loss": 1.4831, "step": 2578 }, { "epoch": 0.02, "grad_norm": 5.722599811411352, "learning_rate": 1.9999070547820576e-06, "loss": 1.5383, "step": 2579 }, { "epoch": 0.02, "grad_norm": 22.655065655121753, "learning_rate": 1.999906982410032e-06, "loss": 1.3815, "step": 2580 }, { "epoch": 0.02, "grad_norm": 6.003296650995224, "learning_rate": 1.9999069100098425e-06, "loss": 1.5457, "step": 2581 }, { "epoch": 0.02, "grad_norm": 4.741277967056868, "learning_rate": 1.9999068375814896e-06, "loss": 1.5165, "step": 2582 }, { "epoch": 0.02, "grad_norm": 6.300018975344748, "learning_rate": 1.999906765124972e-06, "loss": 1.4036, "step": 2583 }, { "epoch": 0.02, "grad_norm": 7.552215789433446, "learning_rate": 1.9999066926402907e-06, "loss": 1.3368, "step": 2584 }, { "epoch": 0.02, "grad_norm": 5.470582591235478, "learning_rate": 1.999906620127445e-06, "loss": 1.5863, "step": 2585 }, { "epoch": 0.02, "grad_norm": 6.100878877106844, "learning_rate": 1.9999065475864355e-06, "loss": 1.501, "step": 2586 }, { "epoch": 0.02, "grad_norm": 4.389860076860747, "learning_rate": 1.9999064750172617e-06, "loss": 1.4151, "step": 2587 }, { "epoch": 0.02, "grad_norm": 5.63645147327467, "learning_rate": 1.999906402419924e-06, "loss": 1.5282, "step": 2588 }, { "epoch": 0.02, "grad_norm": 7.229014935819337, "learning_rate": 1.9999063297944226e-06, "loss": 1.5142, "step": 2589 }, { "epoch": 0.02, "grad_norm": 4.590820624310537, "learning_rate": 1.999906257140757e-06, "loss": 1.3774, "step": 2590 }, { "epoch": 0.02, "grad_norm": 4.724381873263039, "learning_rate": 1.9999061844589268e-06, "loss": 1.3214, "step": 2591 }, { "epoch": 0.02, "grad_norm": 4.484001614536028, "learning_rate": 1.999906111748933e-06, "loss": 1.3858, "step": 2592 }, { "epoch": 0.02, "grad_norm": 4.208403452992326, "learning_rate": 1.999906039010775e-06, "loss": 1.3302, "step": 2593 }, { "epoch": 0.02, "grad_norm": 4.505618278783764, "learning_rate": 1.9999059662444537e-06, "loss": 1.4443, "step": 2594 }, { "epoch": 0.02, "grad_norm": 4.591195710426086, "learning_rate": 1.999905893449968e-06, "loss": 1.3943, "step": 2595 }, { "epoch": 0.02, "grad_norm": 4.696805772544854, "learning_rate": 1.9999058206273177e-06, "loss": 1.4307, "step": 2596 }, { "epoch": 0.02, "grad_norm": 5.021073562891561, "learning_rate": 1.999905747776504e-06, "loss": 1.5345, "step": 2597 }, { "epoch": 0.02, "grad_norm": 4.7598018312861194, "learning_rate": 1.999905674897526e-06, "loss": 1.6453, "step": 2598 }, { "epoch": 0.02, "grad_norm": 4.581994132757299, "learning_rate": 1.9999056019903835e-06, "loss": 1.4905, "step": 2599 }, { "epoch": 0.02, "grad_norm": 4.841310271459569, "learning_rate": 1.999905529055078e-06, "loss": 1.4203, "step": 2600 }, { "epoch": 0.02, "grad_norm": 5.189830617523313, "learning_rate": 1.999905456091608e-06, "loss": 1.3274, "step": 2601 }, { "epoch": 0.02, "grad_norm": 4.720146352163821, "learning_rate": 1.999905383099974e-06, "loss": 1.4674, "step": 2602 }, { "epoch": 0.02, "grad_norm": 4.543963809407407, "learning_rate": 1.999905310080176e-06, "loss": 1.4255, "step": 2603 }, { "epoch": 0.02, "grad_norm": 5.18614032345166, "learning_rate": 1.999905237032214e-06, "loss": 1.4297, "step": 2604 }, { "epoch": 0.02, "grad_norm": 4.465791115799426, "learning_rate": 1.999905163956088e-06, "loss": 1.4399, "step": 2605 }, { "epoch": 0.02, "grad_norm": 4.694449197808517, "learning_rate": 1.9999050908517977e-06, "loss": 1.4938, "step": 2606 }, { "epoch": 0.02, "grad_norm": 5.4688966732418365, "learning_rate": 1.9999050177193436e-06, "loss": 1.4578, "step": 2607 }, { "epoch": 0.02, "grad_norm": 5.10844282586031, "learning_rate": 1.9999049445587256e-06, "loss": 1.3347, "step": 2608 }, { "epoch": 0.02, "grad_norm": 4.762680506088013, "learning_rate": 1.999904871369944e-06, "loss": 1.4342, "step": 2609 }, { "epoch": 0.02, "grad_norm": 5.979551926427287, "learning_rate": 1.999904798152998e-06, "loss": 1.5078, "step": 2610 }, { "epoch": 0.02, "grad_norm": 4.533251775082327, "learning_rate": 1.999904724907888e-06, "loss": 1.2065, "step": 2611 }, { "epoch": 0.02, "grad_norm": 5.121922341187516, "learning_rate": 1.9999046516346138e-06, "loss": 1.5454, "step": 2612 }, { "epoch": 0.02, "grad_norm": 5.781680034327118, "learning_rate": 1.9999045783331757e-06, "loss": 1.4578, "step": 2613 }, { "epoch": 0.02, "grad_norm": 5.826495337980005, "learning_rate": 1.999904505003574e-06, "loss": 1.4422, "step": 2614 }, { "epoch": 0.02, "grad_norm": 6.335079927329629, "learning_rate": 1.9999044316458078e-06, "loss": 1.3345, "step": 2615 }, { "epoch": 0.02, "grad_norm": 4.778422328814399, "learning_rate": 1.9999043582598782e-06, "loss": 1.3202, "step": 2616 }, { "epoch": 0.02, "grad_norm": 4.8282702985765145, "learning_rate": 1.999904284845784e-06, "loss": 1.407, "step": 2617 }, { "epoch": 0.02, "grad_norm": 4.867494484747974, "learning_rate": 1.999904211403526e-06, "loss": 1.5508, "step": 2618 }, { "epoch": 0.02, "grad_norm": 4.781917993855085, "learning_rate": 1.9999041379331044e-06, "loss": 1.3034, "step": 2619 }, { "epoch": 0.02, "grad_norm": 4.712392259942616, "learning_rate": 1.9999040644345186e-06, "loss": 1.5253, "step": 2620 }, { "epoch": 0.02, "grad_norm": 5.030532462772803, "learning_rate": 1.9999039909077686e-06, "loss": 1.5149, "step": 2621 }, { "epoch": 0.02, "grad_norm": 6.621139707916859, "learning_rate": 1.999903917352855e-06, "loss": 1.5093, "step": 2622 }, { "epoch": 0.02, "grad_norm": 8.008006829313361, "learning_rate": 1.9999038437697774e-06, "loss": 1.4026, "step": 2623 }, { "epoch": 0.02, "grad_norm": 6.136006728609445, "learning_rate": 1.9999037701585354e-06, "loss": 1.5477, "step": 2624 }, { "epoch": 0.02, "grad_norm": 4.760430548827488, "learning_rate": 1.9999036965191295e-06, "loss": 1.4864, "step": 2625 }, { "epoch": 0.02, "grad_norm": 5.006634707089645, "learning_rate": 1.9999036228515603e-06, "loss": 1.4774, "step": 2626 }, { "epoch": 0.02, "grad_norm": 4.588292407337218, "learning_rate": 1.9999035491558263e-06, "loss": 1.3735, "step": 2627 }, { "epoch": 0.02, "grad_norm": 4.816326208591829, "learning_rate": 1.999903475431929e-06, "loss": 1.2208, "step": 2628 }, { "epoch": 0.02, "eval_loss": 1.6295270919799805, "eval_runtime": 4.6318, "eval_samples_per_second": 1.943, "eval_steps_per_second": 1.079, "step": 2628 }, { "epoch": 0.02, "grad_norm": 4.638403311998232, "learning_rate": 1.9999034016798672e-06, "loss": 1.4943, "step": 2629 }, { "epoch": 0.02, "grad_norm": 4.961202684709964, "learning_rate": 1.999903327899642e-06, "loss": 1.4069, "step": 2630 }, { "epoch": 0.02, "grad_norm": 4.348996064319459, "learning_rate": 1.9999032540912524e-06, "loss": 1.1698, "step": 2631 }, { "epoch": 0.02, "grad_norm": 4.444823433064274, "learning_rate": 1.999903180254699e-06, "loss": 1.3074, "step": 2632 }, { "epoch": 0.02, "grad_norm": 4.965888101259268, "learning_rate": 1.9999031063899817e-06, "loss": 1.3823, "step": 2633 }, { "epoch": 0.02, "grad_norm": 6.453473043817252, "learning_rate": 1.999903032497101e-06, "loss": 1.4969, "step": 2634 }, { "epoch": 0.02, "grad_norm": 5.068247346576678, "learning_rate": 1.9999029585760553e-06, "loss": 1.5847, "step": 2635 }, { "epoch": 0.02, "grad_norm": 5.6600389675468294, "learning_rate": 1.9999028846268463e-06, "loss": 1.3724, "step": 2636 }, { "epoch": 0.02, "grad_norm": 4.374383238249181, "learning_rate": 1.9999028106494735e-06, "loss": 1.4313, "step": 2637 }, { "epoch": 0.02, "grad_norm": 4.680029067986695, "learning_rate": 1.999902736643936e-06, "loss": 1.3181, "step": 2638 }, { "epoch": 0.02, "grad_norm": 4.580712621593848, "learning_rate": 1.999902662610235e-06, "loss": 1.3987, "step": 2639 }, { "epoch": 0.02, "grad_norm": 6.092256320436487, "learning_rate": 1.9999025885483706e-06, "loss": 1.6909, "step": 2640 }, { "epoch": 0.02, "grad_norm": 4.7827050109074385, "learning_rate": 1.9999025144583415e-06, "loss": 1.517, "step": 2641 }, { "epoch": 0.02, "grad_norm": 5.011057538279476, "learning_rate": 1.9999024403401486e-06, "loss": 1.4349, "step": 2642 }, { "epoch": 0.02, "grad_norm": 4.8294099232047785, "learning_rate": 1.9999023661937923e-06, "loss": 1.4704, "step": 2643 }, { "epoch": 0.02, "grad_norm": 4.597159590269809, "learning_rate": 1.9999022920192712e-06, "loss": 1.3426, "step": 2644 }, { "epoch": 0.02, "grad_norm": 4.9150712542829496, "learning_rate": 1.9999022178165868e-06, "loss": 1.3268, "step": 2645 }, { "epoch": 0.02, "grad_norm": 5.027583647063347, "learning_rate": 1.9999021435857385e-06, "loss": 1.4659, "step": 2646 }, { "epoch": 0.02, "grad_norm": 4.68442647731963, "learning_rate": 1.9999020693267263e-06, "loss": 1.3447, "step": 2647 }, { "epoch": 0.02, "grad_norm": 4.9605688635740695, "learning_rate": 1.9999019950395503e-06, "loss": 1.4796, "step": 2648 }, { "epoch": 0.02, "grad_norm": 4.809892082871139, "learning_rate": 1.9999019207242097e-06, "loss": 1.4104, "step": 2649 }, { "epoch": 0.02, "grad_norm": 4.743744228034844, "learning_rate": 1.999901846380706e-06, "loss": 1.4046, "step": 2650 }, { "epoch": 0.02, "grad_norm": 4.269118159347582, "learning_rate": 1.999901772009038e-06, "loss": 1.2958, "step": 2651 }, { "epoch": 0.02, "grad_norm": 5.853949838326959, "learning_rate": 1.999901697609206e-06, "loss": 1.4926, "step": 2652 }, { "epoch": 0.02, "grad_norm": 4.776685722649282, "learning_rate": 1.99990162318121e-06, "loss": 1.4355, "step": 2653 }, { "epoch": 0.02, "grad_norm": 6.208703600062207, "learning_rate": 1.9999015487250503e-06, "loss": 1.5673, "step": 2654 }, { "epoch": 0.02, "grad_norm": 6.7039599749841186, "learning_rate": 1.999901474240727e-06, "loss": 1.5676, "step": 2655 }, { "epoch": 0.02, "grad_norm": 4.417347561144367, "learning_rate": 1.9999013997282393e-06, "loss": 1.3797, "step": 2656 }, { "epoch": 0.02, "grad_norm": 4.6270515827313465, "learning_rate": 1.999901325187588e-06, "loss": 1.4198, "step": 2657 }, { "epoch": 0.02, "grad_norm": 4.644490932379133, "learning_rate": 1.9999012506187726e-06, "loss": 1.3909, "step": 2658 }, { "epoch": 0.02, "grad_norm": 4.766952782820096, "learning_rate": 1.9999011760217935e-06, "loss": 1.5499, "step": 2659 }, { "epoch": 0.02, "grad_norm": 5.024704187001623, "learning_rate": 1.99990110139665e-06, "loss": 1.5967, "step": 2660 }, { "epoch": 0.02, "grad_norm": 4.484156877401999, "learning_rate": 1.9999010267433432e-06, "loss": 1.3459, "step": 2661 }, { "epoch": 0.02, "grad_norm": 4.960235392776955, "learning_rate": 1.9999009520618725e-06, "loss": 1.47, "step": 2662 }, { "epoch": 0.02, "grad_norm": 5.013252259638264, "learning_rate": 1.9999008773522376e-06, "loss": 1.2694, "step": 2663 }, { "epoch": 0.02, "grad_norm": 4.796804229686202, "learning_rate": 1.9999008026144392e-06, "loss": 1.4616, "step": 2664 }, { "epoch": 0.02, "grad_norm": 5.1481769265464825, "learning_rate": 1.9999007278484766e-06, "loss": 1.494, "step": 2665 }, { "epoch": 0.02, "grad_norm": 4.895324247068634, "learning_rate": 1.99990065305435e-06, "loss": 1.5384, "step": 2666 }, { "epoch": 0.02, "grad_norm": 4.655033924444686, "learning_rate": 1.99990057823206e-06, "loss": 1.427, "step": 2667 }, { "epoch": 0.02, "grad_norm": 4.7945063377020425, "learning_rate": 1.999900503381606e-06, "loss": 1.4635, "step": 2668 }, { "epoch": 0.02, "grad_norm": 6.282804236505679, "learning_rate": 1.999900428502988e-06, "loss": 1.3777, "step": 2669 }, { "epoch": 0.02, "grad_norm": 5.956119769772323, "learning_rate": 1.9999003535962058e-06, "loss": 1.5537, "step": 2670 }, { "epoch": 0.02, "grad_norm": 4.744533892549758, "learning_rate": 1.99990027866126e-06, "loss": 1.411, "step": 2671 }, { "epoch": 0.02, "grad_norm": 4.685437895155002, "learning_rate": 1.9999002036981505e-06, "loss": 1.3089, "step": 2672 }, { "epoch": 0.02, "grad_norm": 4.679001375905665, "learning_rate": 1.999900128706877e-06, "loss": 1.5004, "step": 2673 }, { "epoch": 0.02, "grad_norm": 4.816836882292446, "learning_rate": 1.99990005368744e-06, "loss": 1.4415, "step": 2674 }, { "epoch": 0.02, "grad_norm": 5.397353037395355, "learning_rate": 1.9998999786398383e-06, "loss": 1.457, "step": 2675 }, { "epoch": 0.02, "grad_norm": 4.836128128995581, "learning_rate": 1.9998999035640734e-06, "loss": 1.3645, "step": 2676 }, { "epoch": 0.02, "grad_norm": 6.345520428845227, "learning_rate": 1.9998998284601446e-06, "loss": 1.5424, "step": 2677 }, { "epoch": 0.02, "grad_norm": 5.582942854801658, "learning_rate": 1.999899753328052e-06, "loss": 1.4421, "step": 2678 }, { "epoch": 0.02, "grad_norm": 4.5571891655235275, "learning_rate": 1.999899678167795e-06, "loss": 1.3543, "step": 2679 }, { "epoch": 0.02, "grad_norm": 5.349049923012939, "learning_rate": 1.999899602979375e-06, "loss": 1.209, "step": 2680 }, { "epoch": 0.02, "grad_norm": 4.584099365206942, "learning_rate": 1.9998995277627906e-06, "loss": 1.4517, "step": 2681 }, { "epoch": 0.02, "grad_norm": 4.851375584879138, "learning_rate": 1.9998994525180426e-06, "loss": 1.3411, "step": 2682 }, { "epoch": 0.02, "grad_norm": 4.6601703251560025, "learning_rate": 1.9998993772451304e-06, "loss": 1.3298, "step": 2683 }, { "epoch": 0.02, "grad_norm": 4.8561771787958365, "learning_rate": 1.9998993019440547e-06, "loss": 1.4937, "step": 2684 }, { "epoch": 0.02, "grad_norm": 5.1790245554459755, "learning_rate": 1.999899226614815e-06, "loss": 1.471, "step": 2685 }, { "epoch": 0.02, "grad_norm": 4.742734422892583, "learning_rate": 1.9998991512574113e-06, "loss": 1.5035, "step": 2686 }, { "epoch": 0.02, "grad_norm": 4.785648242343571, "learning_rate": 1.999899075871844e-06, "loss": 1.5267, "step": 2687 }, { "epoch": 0.02, "grad_norm": 5.057841994753448, "learning_rate": 1.999899000458113e-06, "loss": 1.3789, "step": 2688 }, { "epoch": 0.02, "grad_norm": 5.0118905395912945, "learning_rate": 1.999898925016218e-06, "loss": 1.3579, "step": 2689 }, { "epoch": 0.02, "grad_norm": 5.35192729500763, "learning_rate": 1.999898849546159e-06, "loss": 1.4838, "step": 2690 }, { "epoch": 0.02, "grad_norm": 4.337685327376343, "learning_rate": 1.9998987740479367e-06, "loss": 1.3297, "step": 2691 }, { "epoch": 0.02, "grad_norm": 4.808629946983388, "learning_rate": 1.99989869852155e-06, "loss": 1.4873, "step": 2692 }, { "epoch": 0.02, "grad_norm": 5.2407858594510035, "learning_rate": 1.999898622967e-06, "loss": 1.5086, "step": 2693 }, { "epoch": 0.02, "grad_norm": 4.941322758554874, "learning_rate": 1.9998985473842858e-06, "loss": 1.3952, "step": 2694 }, { "epoch": 0.02, "grad_norm": 4.260790698502375, "learning_rate": 1.9998984717734078e-06, "loss": 1.3652, "step": 2695 }, { "epoch": 0.02, "grad_norm": 5.2005982502957595, "learning_rate": 1.999898396134366e-06, "loss": 1.5483, "step": 2696 }, { "epoch": 0.02, "grad_norm": 4.772216539864026, "learning_rate": 1.9998983204671606e-06, "loss": 1.2309, "step": 2697 }, { "epoch": 0.02, "grad_norm": 4.448621297124078, "learning_rate": 1.999898244771791e-06, "loss": 1.3967, "step": 2698 }, { "epoch": 0.02, "grad_norm": 5.6464418666503295, "learning_rate": 1.9998981690482577e-06, "loss": 1.2706, "step": 2699 }, { "epoch": 0.02, "grad_norm": 5.142174365370682, "learning_rate": 1.999898093296561e-06, "loss": 1.3629, "step": 2700 }, { "epoch": 0.02, "grad_norm": 4.489321589199872, "learning_rate": 1.9998980175167003e-06, "loss": 1.4768, "step": 2701 }, { "epoch": 0.02, "eval_loss": 1.6249644756317139, "eval_runtime": 4.6538, "eval_samples_per_second": 1.934, "eval_steps_per_second": 1.074, "step": 2701 }, { "epoch": 0.02, "grad_norm": 4.728941606346881, "learning_rate": 1.9998979417086758e-06, "loss": 1.4208, "step": 2702 }, { "epoch": 0.02, "grad_norm": 6.721435664278658, "learning_rate": 1.9998978658724874e-06, "loss": 1.6635, "step": 2703 }, { "epoch": 0.02, "grad_norm": 5.672729490886054, "learning_rate": 1.9998977900081353e-06, "loss": 1.3433, "step": 2704 }, { "epoch": 0.02, "grad_norm": 5.476896381988856, "learning_rate": 1.9998977141156192e-06, "loss": 1.4389, "step": 2705 }, { "epoch": 0.02, "grad_norm": 4.783782128937111, "learning_rate": 1.9998976381949394e-06, "loss": 1.4866, "step": 2706 }, { "epoch": 0.02, "grad_norm": 4.724902284051836, "learning_rate": 1.999897562246096e-06, "loss": 1.3447, "step": 2707 }, { "epoch": 0.02, "grad_norm": 4.850028686058834, "learning_rate": 1.999897486269089e-06, "loss": 1.3209, "step": 2708 }, { "epoch": 0.02, "grad_norm": 4.83053553627955, "learning_rate": 1.9998974102639175e-06, "loss": 1.5586, "step": 2709 }, { "epoch": 0.02, "grad_norm": 4.712477754987166, "learning_rate": 1.9998973342305827e-06, "loss": 1.4225, "step": 2710 }, { "epoch": 0.02, "grad_norm": 4.54227448455875, "learning_rate": 1.999897258169084e-06, "loss": 1.3462, "step": 2711 }, { "epoch": 0.02, "grad_norm": 5.5411282379403985, "learning_rate": 1.9998971820794215e-06, "loss": 1.4259, "step": 2712 }, { "epoch": 0.02, "grad_norm": 4.739705320723445, "learning_rate": 1.999897105961595e-06, "loss": 1.457, "step": 2713 }, { "epoch": 0.02, "grad_norm": 4.701603405635517, "learning_rate": 1.9998970298156053e-06, "loss": 1.4297, "step": 2714 }, { "epoch": 0.02, "grad_norm": 4.991641204953335, "learning_rate": 1.9998969536414512e-06, "loss": 1.4822, "step": 2715 }, { "epoch": 0.02, "grad_norm": 5.480036657625506, "learning_rate": 1.9998968774391338e-06, "loss": 1.3061, "step": 2716 }, { "epoch": 0.02, "grad_norm": 4.857570508327938, "learning_rate": 1.9998968012086524e-06, "loss": 1.5446, "step": 2717 }, { "epoch": 0.02, "grad_norm": 4.917848715898165, "learning_rate": 1.9998967249500073e-06, "loss": 1.4604, "step": 2718 }, { "epoch": 0.02, "grad_norm": 4.532991154379635, "learning_rate": 1.9998966486631987e-06, "loss": 1.3497, "step": 2719 }, { "epoch": 0.02, "grad_norm": 4.516960068763919, "learning_rate": 1.999896572348226e-06, "loss": 1.3371, "step": 2720 }, { "epoch": 0.02, "grad_norm": 5.4294845884659875, "learning_rate": 1.9998964960050895e-06, "loss": 1.4655, "step": 2721 }, { "epoch": 0.02, "grad_norm": 6.465778207032482, "learning_rate": 1.9998964196337894e-06, "loss": 1.1239, "step": 2722 }, { "epoch": 0.02, "grad_norm": 4.380579112263108, "learning_rate": 1.9998963432343254e-06, "loss": 1.4072, "step": 2723 }, { "epoch": 0.02, "grad_norm": 5.347121577221788, "learning_rate": 1.999896266806698e-06, "loss": 1.4905, "step": 2724 }, { "epoch": 0.02, "grad_norm": 4.091313845837881, "learning_rate": 1.9998961903509063e-06, "loss": 1.1784, "step": 2725 }, { "epoch": 0.02, "grad_norm": 4.9326180566194155, "learning_rate": 1.9998961138669512e-06, "loss": 1.5945, "step": 2726 }, { "epoch": 0.02, "grad_norm": 12.547464104001262, "learning_rate": 1.9998960373548323e-06, "loss": 1.7427, "step": 2727 }, { "epoch": 0.02, "grad_norm": 4.955919797294654, "learning_rate": 1.9998959608145495e-06, "loss": 1.5829, "step": 2728 }, { "epoch": 0.02, "grad_norm": 4.259954508473689, "learning_rate": 1.9998958842461033e-06, "loss": 1.2882, "step": 2729 }, { "epoch": 0.02, "grad_norm": 4.242648662389775, "learning_rate": 1.9998958076494933e-06, "loss": 1.2997, "step": 2730 }, { "epoch": 0.02, "grad_norm": 4.442253262087044, "learning_rate": 1.9998957310247194e-06, "loss": 1.2864, "step": 2731 }, { "epoch": 0.02, "grad_norm": 4.672333737798939, "learning_rate": 1.9998956543717816e-06, "loss": 1.5532, "step": 2732 }, { "epoch": 0.02, "grad_norm": 4.768920768366485, "learning_rate": 1.9998955776906805e-06, "loss": 1.4113, "step": 2733 }, { "epoch": 0.02, "grad_norm": 5.365298258133692, "learning_rate": 1.9998955009814155e-06, "loss": 1.4779, "step": 2734 }, { "epoch": 0.02, "grad_norm": 4.977640813439696, "learning_rate": 1.9998954242439866e-06, "loss": 1.3955, "step": 2735 }, { "epoch": 0.02, "grad_norm": 4.779169735265989, "learning_rate": 1.999895347478394e-06, "loss": 1.4087, "step": 2736 }, { "epoch": 0.02, "grad_norm": 5.7600337936905275, "learning_rate": 1.9998952706846378e-06, "loss": 1.46, "step": 2737 }, { "epoch": 0.02, "grad_norm": 4.654467318840138, "learning_rate": 1.999895193862718e-06, "loss": 1.3913, "step": 2738 }, { "epoch": 0.02, "grad_norm": 4.704589355819833, "learning_rate": 1.999895117012634e-06, "loss": 1.5544, "step": 2739 }, { "epoch": 0.02, "grad_norm": 5.1941897065206595, "learning_rate": 1.9998950401343863e-06, "loss": 1.273, "step": 2740 }, { "epoch": 0.02, "grad_norm": 4.439869003538217, "learning_rate": 1.9998949632279753e-06, "loss": 1.4237, "step": 2741 }, { "epoch": 0.02, "grad_norm": 4.50147659109475, "learning_rate": 1.9998948862934008e-06, "loss": 1.3462, "step": 2742 }, { "epoch": 0.02, "grad_norm": 5.584850957877264, "learning_rate": 1.999894809330662e-06, "loss": 1.2504, "step": 2743 }, { "epoch": 0.02, "grad_norm": 6.042375090827711, "learning_rate": 1.99989473233976e-06, "loss": 1.3003, "step": 2744 }, { "epoch": 0.02, "grad_norm": 5.023492077637436, "learning_rate": 1.9998946553206938e-06, "loss": 1.4422, "step": 2745 }, { "epoch": 0.02, "grad_norm": 4.748553264507999, "learning_rate": 1.999894578273464e-06, "loss": 1.5238, "step": 2746 }, { "epoch": 0.02, "grad_norm": 4.570660659610173, "learning_rate": 1.9998945011980706e-06, "loss": 1.4269, "step": 2747 }, { "epoch": 0.02, "grad_norm": 4.452679880713028, "learning_rate": 1.9998944240945134e-06, "loss": 1.4164, "step": 2748 }, { "epoch": 0.02, "grad_norm": 5.836656376655193, "learning_rate": 1.999894346962793e-06, "loss": 1.2856, "step": 2749 }, { "epoch": 0.02, "grad_norm": 4.798771542642795, "learning_rate": 1.999894269802908e-06, "loss": 1.3372, "step": 2750 }, { "epoch": 0.02, "grad_norm": 5.170216529268188, "learning_rate": 1.9998941926148597e-06, "loss": 1.4021, "step": 2751 }, { "epoch": 0.02, "grad_norm": 7.167356093869082, "learning_rate": 1.999894115398648e-06, "loss": 1.4013, "step": 2752 }, { "epoch": 0.02, "grad_norm": 4.587890795315358, "learning_rate": 1.9998940381542725e-06, "loss": 1.4176, "step": 2753 }, { "epoch": 0.02, "grad_norm": 5.177127842863576, "learning_rate": 1.999893960881733e-06, "loss": 1.5869, "step": 2754 }, { "epoch": 0.02, "grad_norm": 6.574137972828784, "learning_rate": 1.9998938835810304e-06, "loss": 1.5673, "step": 2755 }, { "epoch": 0.02, "grad_norm": 5.778126829684162, "learning_rate": 1.9998938062521637e-06, "loss": 1.5361, "step": 2756 }, { "epoch": 0.02, "grad_norm": 7.851913885459397, "learning_rate": 1.9998937288951332e-06, "loss": 1.412, "step": 2757 }, { "epoch": 0.02, "grad_norm": 5.122129218110728, "learning_rate": 1.9998936515099393e-06, "loss": 1.4406, "step": 2758 }, { "epoch": 0.02, "grad_norm": 4.775466238362049, "learning_rate": 1.9998935740965815e-06, "loss": 1.3279, "step": 2759 }, { "epoch": 0.02, "grad_norm": 10.097236710699216, "learning_rate": 1.9998934966550604e-06, "loss": 1.5103, "step": 2760 }, { "epoch": 0.02, "grad_norm": 4.852445289044562, "learning_rate": 1.999893419185375e-06, "loss": 1.469, "step": 2761 }, { "epoch": 0.02, "grad_norm": 4.912077857670805, "learning_rate": 1.9998933416875265e-06, "loss": 1.4218, "step": 2762 }, { "epoch": 0.02, "grad_norm": 4.689088161317069, "learning_rate": 1.999893264161514e-06, "loss": 1.3336, "step": 2763 }, { "epoch": 0.02, "grad_norm": 4.766112013484904, "learning_rate": 1.999893186607338e-06, "loss": 1.3066, "step": 2764 }, { "epoch": 0.02, "grad_norm": 5.593303699801557, "learning_rate": 1.9998931090249985e-06, "loss": 1.5657, "step": 2765 }, { "epoch": 0.02, "grad_norm": 5.163779733210183, "learning_rate": 1.9998930314144947e-06, "loss": 1.4395, "step": 2766 }, { "epoch": 0.02, "grad_norm": 5.674315487003727, "learning_rate": 1.999892953775828e-06, "loss": 1.5407, "step": 2767 }, { "epoch": 0.02, "grad_norm": 4.676661266930629, "learning_rate": 1.999892876108997e-06, "loss": 1.5139, "step": 2768 }, { "epoch": 0.02, "grad_norm": 9.128307746492878, "learning_rate": 1.9998927984140026e-06, "loss": 1.3218, "step": 2769 }, { "epoch": 0.02, "grad_norm": 4.240640196066125, "learning_rate": 1.9998927206908447e-06, "loss": 1.2989, "step": 2770 }, { "epoch": 0.02, "grad_norm": 6.27816898885737, "learning_rate": 1.999892642939523e-06, "loss": 1.3487, "step": 2771 }, { "epoch": 0.02, "grad_norm": 5.025010385489242, "learning_rate": 1.9998925651600377e-06, "loss": 1.3706, "step": 2772 }, { "epoch": 0.02, "grad_norm": 4.464981368592326, "learning_rate": 1.9998924873523886e-06, "loss": 1.3258, "step": 2773 }, { "epoch": 0.02, "grad_norm": 4.665177992679757, "learning_rate": 1.999892409516576e-06, "loss": 1.3428, "step": 2774 }, { "epoch": 0.02, "eval_loss": 1.6228926181793213, "eval_runtime": 4.6339, "eval_samples_per_second": 1.942, "eval_steps_per_second": 1.079, "step": 2774 }, { "epoch": 0.02, "grad_norm": 4.868974430046027, "learning_rate": 1.9998923316526e-06, "loss": 1.4719, "step": 2775 }, { "epoch": 0.02, "grad_norm": 5.304645907019113, "learning_rate": 1.99989225376046e-06, "loss": 1.5306, "step": 2776 }, { "epoch": 0.02, "grad_norm": 5.387713736296703, "learning_rate": 1.9998921758401565e-06, "loss": 1.3475, "step": 2777 }, { "epoch": 0.02, "grad_norm": 4.442792610487523, "learning_rate": 1.9998920978916895e-06, "loss": 1.36, "step": 2778 }, { "epoch": 0.02, "grad_norm": 5.683582295474543, "learning_rate": 1.9998920199150587e-06, "loss": 1.4342, "step": 2779 }, { "epoch": 0.02, "grad_norm": 5.777504817382186, "learning_rate": 1.9998919419102644e-06, "loss": 1.3633, "step": 2780 }, { "epoch": 0.02, "grad_norm": 4.598408814061361, "learning_rate": 1.9998918638773063e-06, "loss": 1.3561, "step": 2781 }, { "epoch": 0.02, "grad_norm": 4.531887898739241, "learning_rate": 1.9998917858161843e-06, "loss": 1.4105, "step": 2782 }, { "epoch": 0.02, "grad_norm": 4.379373053575446, "learning_rate": 1.999891707726899e-06, "loss": 1.3188, "step": 2783 }, { "epoch": 0.02, "grad_norm": 4.7609097936376195, "learning_rate": 1.99989162960945e-06, "loss": 1.3523, "step": 2784 }, { "epoch": 0.02, "grad_norm": 4.7932273540706545, "learning_rate": 1.9998915514638374e-06, "loss": 1.4839, "step": 2785 }, { "epoch": 0.02, "grad_norm": 4.403408099341777, "learning_rate": 1.9998914732900618e-06, "loss": 1.3651, "step": 2786 }, { "epoch": 0.02, "grad_norm": 5.363493337417038, "learning_rate": 1.9998913950881214e-06, "loss": 1.3903, "step": 2787 }, { "epoch": 0.02, "grad_norm": 4.703872779282757, "learning_rate": 1.999891316858018e-06, "loss": 1.3602, "step": 2788 }, { "epoch": 0.02, "grad_norm": 4.594866601682083, "learning_rate": 1.9998912385997513e-06, "loss": 1.2933, "step": 2789 }, { "epoch": 0.02, "grad_norm": 4.606930331365836, "learning_rate": 1.9998911603133207e-06, "loss": 1.4318, "step": 2790 }, { "epoch": 0.02, "grad_norm": 7.67703505544561, "learning_rate": 1.9998910819987262e-06, "loss": 1.1987, "step": 2791 }, { "epoch": 0.02, "grad_norm": 4.858347802822455, "learning_rate": 1.9998910036559688e-06, "loss": 1.3342, "step": 2792 }, { "epoch": 0.02, "grad_norm": 4.238675085196649, "learning_rate": 1.999890925285047e-06, "loss": 1.1874, "step": 2793 }, { "epoch": 0.02, "grad_norm": 4.636225801921176, "learning_rate": 1.999890846885962e-06, "loss": 1.3584, "step": 2794 }, { "epoch": 0.02, "grad_norm": 4.633817615345395, "learning_rate": 1.9998907684587133e-06, "loss": 1.4063, "step": 2795 }, { "epoch": 0.02, "grad_norm": 4.893305859480175, "learning_rate": 1.999890690003301e-06, "loss": 1.4289, "step": 2796 }, { "epoch": 0.02, "grad_norm": 4.7172189836983796, "learning_rate": 1.999890611519725e-06, "loss": 1.4481, "step": 2797 }, { "epoch": 0.02, "grad_norm": 4.881737643176644, "learning_rate": 1.999890533007986e-06, "loss": 1.4956, "step": 2798 }, { "epoch": 0.02, "grad_norm": 4.986642114604307, "learning_rate": 1.9998904544680827e-06, "loss": 1.6169, "step": 2799 }, { "epoch": 0.02, "grad_norm": 5.2687198972018905, "learning_rate": 1.999890375900016e-06, "loss": 1.3991, "step": 2800 }, { "epoch": 0.02, "grad_norm": 4.692782105304108, "learning_rate": 1.9998902973037858e-06, "loss": 1.3991, "step": 2801 }, { "epoch": 0.02, "grad_norm": 5.509363415701804, "learning_rate": 1.999890218679392e-06, "loss": 1.4011, "step": 2802 }, { "epoch": 0.02, "grad_norm": 6.431731471169426, "learning_rate": 1.9998901400268348e-06, "loss": 1.3621, "step": 2803 }, { "epoch": 0.02, "grad_norm": 5.881655716824346, "learning_rate": 1.9998900613461137e-06, "loss": 1.4308, "step": 2804 }, { "epoch": 0.02, "grad_norm": 5.081486388140841, "learning_rate": 1.9998899826372292e-06, "loss": 1.3531, "step": 2805 }, { "epoch": 0.02, "grad_norm": 5.1521080991642965, "learning_rate": 1.9998899039001813e-06, "loss": 1.31, "step": 2806 }, { "epoch": 0.02, "grad_norm": 5.210301192592582, "learning_rate": 1.9998898251349696e-06, "loss": 1.5408, "step": 2807 }, { "epoch": 0.02, "grad_norm": 4.589178232793303, "learning_rate": 1.9998897463415944e-06, "loss": 1.3702, "step": 2808 }, { "epoch": 0.02, "grad_norm": 4.375935678979782, "learning_rate": 1.999889667520056e-06, "loss": 1.3905, "step": 2809 }, { "epoch": 0.02, "grad_norm": 6.436455510679924, "learning_rate": 1.999889588670353e-06, "loss": 1.3327, "step": 2810 }, { "epoch": 0.02, "grad_norm": 5.29249042088829, "learning_rate": 1.9998895097924875e-06, "loss": 1.4343, "step": 2811 }, { "epoch": 0.02, "grad_norm": 4.784394796780278, "learning_rate": 1.9998894308864578e-06, "loss": 1.4381, "step": 2812 }, { "epoch": 0.02, "grad_norm": 4.946717504980601, "learning_rate": 1.9998893519522646e-06, "loss": 1.4603, "step": 2813 }, { "epoch": 0.02, "grad_norm": 4.766983915055353, "learning_rate": 1.999889272989908e-06, "loss": 1.5148, "step": 2814 }, { "epoch": 0.02, "grad_norm": 4.708307718420187, "learning_rate": 1.9998891939993877e-06, "loss": 1.3662, "step": 2815 }, { "epoch": 0.02, "grad_norm": 5.562866445292407, "learning_rate": 1.999889114980704e-06, "loss": 1.5425, "step": 2816 }, { "epoch": 0.02, "grad_norm": 4.8892392384319745, "learning_rate": 1.9998890359338566e-06, "loss": 1.5102, "step": 2817 }, { "epoch": 0.02, "grad_norm": 6.681445930399619, "learning_rate": 1.999888956858846e-06, "loss": 1.5407, "step": 2818 }, { "epoch": 0.02, "grad_norm": 4.932383185973984, "learning_rate": 1.9998888777556714e-06, "loss": 1.3519, "step": 2819 }, { "epoch": 0.02, "grad_norm": 4.573974144160327, "learning_rate": 1.9998887986243335e-06, "loss": 1.3792, "step": 2820 }, { "epoch": 0.02, "grad_norm": 5.945964809681693, "learning_rate": 1.999888719464832e-06, "loss": 1.3079, "step": 2821 }, { "epoch": 0.02, "grad_norm": 4.6784540645297366, "learning_rate": 1.999888640277167e-06, "loss": 1.3658, "step": 2822 }, { "epoch": 0.02, "grad_norm": 5.4758180832015775, "learning_rate": 1.9998885610613383e-06, "loss": 1.4252, "step": 2823 }, { "epoch": 0.02, "grad_norm": 4.924403479676831, "learning_rate": 1.9998884818173467e-06, "loss": 1.4362, "step": 2824 }, { "epoch": 0.02, "grad_norm": 5.1946491484073745, "learning_rate": 1.999888402545191e-06, "loss": 1.4022, "step": 2825 }, { "epoch": 0.02, "grad_norm": 5.116594932180298, "learning_rate": 1.999888323244872e-06, "loss": 1.458, "step": 2826 }, { "epoch": 0.02, "grad_norm": 4.641238557929875, "learning_rate": 1.999888243916389e-06, "loss": 1.3547, "step": 2827 }, { "epoch": 0.02, "grad_norm": 4.594999462463972, "learning_rate": 1.999888164559743e-06, "loss": 1.4215, "step": 2828 }, { "epoch": 0.02, "grad_norm": 5.047703148420556, "learning_rate": 1.9998880851749335e-06, "loss": 1.4627, "step": 2829 }, { "epoch": 0.02, "grad_norm": 5.422103915078788, "learning_rate": 1.99988800576196e-06, "loss": 1.5064, "step": 2830 }, { "epoch": 0.02, "grad_norm": 4.927470474587449, "learning_rate": 1.999887926320823e-06, "loss": 1.4422, "step": 2831 }, { "epoch": 0.02, "grad_norm": 4.8891825059531095, "learning_rate": 1.999887846851523e-06, "loss": 1.3049, "step": 2832 }, { "epoch": 0.02, "grad_norm": 4.89929973889463, "learning_rate": 1.9998877673540592e-06, "loss": 1.4641, "step": 2833 }, { "epoch": 0.02, "grad_norm": 4.330474087668212, "learning_rate": 1.999887687828432e-06, "loss": 1.2395, "step": 2834 }, { "epoch": 0.02, "grad_norm": 4.689316103906117, "learning_rate": 1.999887608274641e-06, "loss": 1.5746, "step": 2835 }, { "epoch": 0.02, "grad_norm": 30.022522598437778, "learning_rate": 1.999887528692687e-06, "loss": 1.552, "step": 2836 }, { "epoch": 0.02, "grad_norm": 4.979619290046022, "learning_rate": 1.999887449082569e-06, "loss": 1.4355, "step": 2837 }, { "epoch": 0.02, "grad_norm": 4.5639834865515185, "learning_rate": 1.9998873694442878e-06, "loss": 1.383, "step": 2838 }, { "epoch": 0.02, "grad_norm": 4.572884425917966, "learning_rate": 1.9998872897778427e-06, "loss": 1.4367, "step": 2839 }, { "epoch": 0.02, "grad_norm": 4.935183007183054, "learning_rate": 1.9998872100832346e-06, "loss": 1.2201, "step": 2840 }, { "epoch": 0.02, "grad_norm": 6.211909738145334, "learning_rate": 1.9998871303604627e-06, "loss": 1.3496, "step": 2841 }, { "epoch": 0.02, "grad_norm": 4.5122328941008, "learning_rate": 1.9998870506095274e-06, "loss": 1.2845, "step": 2842 }, { "epoch": 0.02, "grad_norm": 4.924447610654198, "learning_rate": 1.9998869708304286e-06, "loss": 1.4161, "step": 2843 }, { "epoch": 0.02, "grad_norm": 5.2054369143001145, "learning_rate": 1.9998868910231665e-06, "loss": 1.5187, "step": 2844 }, { "epoch": 0.02, "grad_norm": 4.861035173442859, "learning_rate": 1.9998868111877404e-06, "loss": 1.3305, "step": 2845 }, { "epoch": 0.02, "grad_norm": 4.63315608539278, "learning_rate": 1.9998867313241514e-06, "loss": 1.3722, "step": 2846 }, { "epoch": 0.02, "grad_norm": 4.566229003342197, "learning_rate": 1.9998866514323985e-06, "loss": 1.3612, "step": 2847 }, { "epoch": 0.02, "eval_loss": 1.6186871528625488, "eval_runtime": 4.6182, "eval_samples_per_second": 1.949, "eval_steps_per_second": 1.083, "step": 2847 }, { "epoch": 0.02, "grad_norm": 4.9773867457031615, "learning_rate": 1.9998865715124822e-06, "loss": 1.3684, "step": 2848 }, { "epoch": 0.02, "grad_norm": 4.776322816477926, "learning_rate": 1.9998864915644025e-06, "loss": 1.355, "step": 2849 }, { "epoch": 0.02, "grad_norm": 5.200825295575172, "learning_rate": 1.9998864115881594e-06, "loss": 1.4849, "step": 2850 }, { "epoch": 0.02, "grad_norm": 8.669774310904668, "learning_rate": 1.999886331583753e-06, "loss": 1.439, "step": 2851 }, { "epoch": 0.02, "grad_norm": 4.961641023055851, "learning_rate": 1.9998862515511824e-06, "loss": 1.3667, "step": 2852 }, { "epoch": 0.02, "grad_norm": 4.822249685134192, "learning_rate": 1.999886171490449e-06, "loss": 1.3395, "step": 2853 }, { "epoch": 0.02, "grad_norm": 6.098165282053863, "learning_rate": 1.999886091401552e-06, "loss": 1.4478, "step": 2854 }, { "epoch": 0.02, "grad_norm": 6.297976415034525, "learning_rate": 1.999886011284491e-06, "loss": 1.3084, "step": 2855 }, { "epoch": 0.02, "grad_norm": 4.60213897121254, "learning_rate": 1.9998859311392675e-06, "loss": 1.3949, "step": 2856 }, { "epoch": 0.02, "grad_norm": 5.128426005267793, "learning_rate": 1.99988585096588e-06, "loss": 1.5439, "step": 2857 }, { "epoch": 0.02, "grad_norm": 4.238279682926694, "learning_rate": 1.999885770764329e-06, "loss": 1.3442, "step": 2858 }, { "epoch": 0.02, "grad_norm": 4.454111710242467, "learning_rate": 1.9998856905346142e-06, "loss": 1.2708, "step": 2859 }, { "epoch": 0.02, "grad_norm": 6.597041686087069, "learning_rate": 1.9998856102767365e-06, "loss": 1.4927, "step": 2860 }, { "epoch": 0.02, "grad_norm": 4.9566004997832325, "learning_rate": 1.9998855299906953e-06, "loss": 1.4864, "step": 2861 }, { "epoch": 0.02, "grad_norm": 4.803022534988008, "learning_rate": 1.9998854496764907e-06, "loss": 1.4479, "step": 2862 }, { "epoch": 0.02, "grad_norm": 5.3284227061764025, "learning_rate": 1.9998853693341226e-06, "loss": 1.3775, "step": 2863 }, { "epoch": 0.02, "grad_norm": 4.449051605434009, "learning_rate": 1.9998852889635907e-06, "loss": 1.3944, "step": 2864 }, { "epoch": 0.02, "grad_norm": 4.524264356204093, "learning_rate": 1.999885208564896e-06, "loss": 1.3925, "step": 2865 }, { "epoch": 0.02, "grad_norm": 4.576738634642325, "learning_rate": 1.9998851281380375e-06, "loss": 1.4023, "step": 2866 }, { "epoch": 0.02, "grad_norm": 4.5128730915629305, "learning_rate": 1.999885047683015e-06, "loss": 1.3308, "step": 2867 }, { "epoch": 0.02, "grad_norm": 5.774177800040769, "learning_rate": 1.99988496719983e-06, "loss": 1.3723, "step": 2868 }, { "epoch": 0.02, "grad_norm": 4.841933746799073, "learning_rate": 1.999884886688481e-06, "loss": 1.4242, "step": 2869 }, { "epoch": 0.02, "grad_norm": 7.190890743181518, "learning_rate": 1.999884806148969e-06, "loss": 1.4457, "step": 2870 }, { "epoch": 0.02, "grad_norm": 4.389488660525391, "learning_rate": 1.999884725581293e-06, "loss": 1.3262, "step": 2871 }, { "epoch": 0.02, "grad_norm": 4.9366007381855725, "learning_rate": 1.999884644985454e-06, "loss": 1.4215, "step": 2872 }, { "epoch": 0.02, "grad_norm": 4.51134223312188, "learning_rate": 1.9998845643614514e-06, "loss": 1.4528, "step": 2873 }, { "epoch": 0.02, "grad_norm": 4.5911234789565345, "learning_rate": 1.9998844837092857e-06, "loss": 1.3725, "step": 2874 }, { "epoch": 0.02, "grad_norm": 4.714425532161108, "learning_rate": 1.9998844030289562e-06, "loss": 1.3326, "step": 2875 }, { "epoch": 0.02, "grad_norm": 6.194761253541817, "learning_rate": 1.9998843223204633e-06, "loss": 1.4908, "step": 2876 }, { "epoch": 0.02, "grad_norm": 4.504642312252081, "learning_rate": 1.999884241583807e-06, "loss": 1.3287, "step": 2877 }, { "epoch": 0.02, "grad_norm": 6.794699180375288, "learning_rate": 1.9998841608189875e-06, "loss": 1.4244, "step": 2878 }, { "epoch": 0.02, "grad_norm": 4.751999842845239, "learning_rate": 1.9998840800260043e-06, "loss": 1.454, "step": 2879 }, { "epoch": 0.02, "grad_norm": 4.726264353641071, "learning_rate": 1.999883999204858e-06, "loss": 1.3897, "step": 2880 }, { "epoch": 0.02, "grad_norm": 4.729803974265776, "learning_rate": 1.999883918355548e-06, "loss": 1.5002, "step": 2881 }, { "epoch": 0.02, "grad_norm": 5.285212933171045, "learning_rate": 1.999883837478075e-06, "loss": 1.6833, "step": 2882 }, { "epoch": 0.02, "grad_norm": 4.596625945889018, "learning_rate": 1.999883756572438e-06, "loss": 1.3805, "step": 2883 }, { "epoch": 0.02, "grad_norm": 5.0059586218503105, "learning_rate": 1.999883675638638e-06, "loss": 1.4069, "step": 2884 }, { "epoch": 0.02, "grad_norm": 4.8719439751839575, "learning_rate": 1.999883594676675e-06, "loss": 1.4273, "step": 2885 }, { "epoch": 0.02, "grad_norm": 4.836717800773741, "learning_rate": 1.999883513686548e-06, "loss": 1.5191, "step": 2886 }, { "epoch": 0.02, "grad_norm": 5.448400261385887, "learning_rate": 1.9998834326682575e-06, "loss": 1.5117, "step": 2887 }, { "epoch": 0.02, "grad_norm": 4.341830112851468, "learning_rate": 1.9998833516218043e-06, "loss": 1.3386, "step": 2888 }, { "epoch": 0.02, "grad_norm": 4.330108245622447, "learning_rate": 1.999883270547187e-06, "loss": 1.3865, "step": 2889 }, { "epoch": 0.02, "grad_norm": 5.161889492402766, "learning_rate": 1.9998831894444064e-06, "loss": 1.4923, "step": 2890 }, { "epoch": 0.02, "grad_norm": 4.583340804795185, "learning_rate": 1.9998831083134626e-06, "loss": 1.4128, "step": 2891 }, { "epoch": 0.02, "grad_norm": 4.475403361495518, "learning_rate": 1.9998830271543557e-06, "loss": 1.2633, "step": 2892 }, { "epoch": 0.02, "grad_norm": 4.429544299978376, "learning_rate": 1.999882945967085e-06, "loss": 1.2271, "step": 2893 }, { "epoch": 0.02, "grad_norm": 4.924573918325059, "learning_rate": 1.999882864751651e-06, "loss": 1.5329, "step": 2894 }, { "epoch": 0.02, "grad_norm": 4.72164854751247, "learning_rate": 1.9998827835080538e-06, "loss": 1.3411, "step": 2895 }, { "epoch": 0.02, "grad_norm": 5.513620466728661, "learning_rate": 1.9998827022362932e-06, "loss": 1.557, "step": 2896 }, { "epoch": 0.02, "grad_norm": 4.893569608594087, "learning_rate": 1.9998826209363693e-06, "loss": 1.4545, "step": 2897 }, { "epoch": 0.02, "grad_norm": 5.59988609368051, "learning_rate": 1.999882539608282e-06, "loss": 1.4207, "step": 2898 }, { "epoch": 0.02, "grad_norm": 6.289259835995451, "learning_rate": 1.9998824582520306e-06, "loss": 1.4633, "step": 2899 }, { "epoch": 0.02, "grad_norm": 4.600672131161404, "learning_rate": 1.999882376867617e-06, "loss": 1.2747, "step": 2900 }, { "epoch": 0.02, "grad_norm": 4.73209077596222, "learning_rate": 1.9998822954550396e-06, "loss": 1.4534, "step": 2901 }, { "epoch": 0.02, "grad_norm": 4.649416419045721, "learning_rate": 1.9998822140142985e-06, "loss": 1.423, "step": 2902 }, { "epoch": 0.02, "grad_norm": 4.815284149190514, "learning_rate": 1.999882132545394e-06, "loss": 1.1985, "step": 2903 }, { "epoch": 0.02, "grad_norm": 4.993787069518763, "learning_rate": 1.999882051048327e-06, "loss": 1.3226, "step": 2904 }, { "epoch": 0.02, "grad_norm": 5.30556271239256, "learning_rate": 1.9998819695230956e-06, "loss": 1.6076, "step": 2905 }, { "epoch": 0.02, "grad_norm": 5.111676758762733, "learning_rate": 1.9998818879697013e-06, "loss": 1.4156, "step": 2906 }, { "epoch": 0.02, "grad_norm": 4.57532368590634, "learning_rate": 1.999881806388144e-06, "loss": 1.5127, "step": 2907 }, { "epoch": 0.02, "grad_norm": 5.097665766053441, "learning_rate": 1.9998817247784227e-06, "loss": 1.2153, "step": 2908 }, { "epoch": 0.02, "grad_norm": 4.7664796065004404, "learning_rate": 1.9998816431405386e-06, "loss": 1.4997, "step": 2909 }, { "epoch": 0.02, "grad_norm": 4.707849161238309, "learning_rate": 1.999881561474491e-06, "loss": 1.3614, "step": 2910 }, { "epoch": 0.02, "grad_norm": 4.715418586973402, "learning_rate": 1.99988147978028e-06, "loss": 1.3648, "step": 2911 }, { "epoch": 0.02, "grad_norm": 4.698559700737574, "learning_rate": 1.9998813980579055e-06, "loss": 1.4627, "step": 2912 }, { "epoch": 0.02, "grad_norm": 4.492256458663923, "learning_rate": 1.999881316307368e-06, "loss": 1.3053, "step": 2913 }, { "epoch": 0.02, "grad_norm": 4.154059898049573, "learning_rate": 1.9998812345286667e-06, "loss": 1.1044, "step": 2914 }, { "epoch": 0.02, "grad_norm": 5.62208125851778, "learning_rate": 1.999881152721803e-06, "loss": 1.3578, "step": 2915 }, { "epoch": 0.02, "grad_norm": 4.66847610055548, "learning_rate": 1.999881070886775e-06, "loss": 1.5454, "step": 2916 }, { "epoch": 0.02, "grad_norm": 4.317475104898607, "learning_rate": 1.999880989023584e-06, "loss": 1.3935, "step": 2917 }, { "epoch": 0.02, "grad_norm": 4.527930294875366, "learning_rate": 1.99988090713223e-06, "loss": 1.4274, "step": 2918 }, { "epoch": 0.02, "grad_norm": 4.416272106126581, "learning_rate": 1.9998808252127123e-06, "loss": 1.3224, "step": 2919 }, { "epoch": 0.02, "grad_norm": 5.316728315979006, "learning_rate": 1.9998807432650313e-06, "loss": 1.5215, "step": 2920 }, { "epoch": 0.02, "eval_loss": 1.6199383735656738, "eval_runtime": 4.6314, "eval_samples_per_second": 1.943, "eval_steps_per_second": 1.08, "step": 2920 }, { "epoch": 0.02, "grad_norm": 4.490493646901724, "learning_rate": 1.999880661289187e-06, "loss": 1.4416, "step": 2921 }, { "epoch": 0.02, "grad_norm": 4.624580242853857, "learning_rate": 1.9998805792851795e-06, "loss": 1.3179, "step": 2922 }, { "epoch": 0.02, "grad_norm": 8.466914354748946, "learning_rate": 1.9998804972530087e-06, "loss": 1.6161, "step": 2923 }, { "epoch": 0.02, "grad_norm": 4.764398192424773, "learning_rate": 1.9998804151926745e-06, "loss": 1.5612, "step": 2924 }, { "epoch": 0.02, "grad_norm": 6.270011750308185, "learning_rate": 1.9998803331041772e-06, "loss": 1.3574, "step": 2925 }, { "epoch": 0.02, "grad_norm": 5.688420881807707, "learning_rate": 1.999880250987516e-06, "loss": 1.4334, "step": 2926 }, { "epoch": 0.02, "grad_norm": 4.649454207981505, "learning_rate": 1.999880168842692e-06, "loss": 1.3212, "step": 2927 }, { "epoch": 0.02, "grad_norm": 5.156111274716404, "learning_rate": 1.9998800866697046e-06, "loss": 1.1788, "step": 2928 }, { "epoch": 0.02, "grad_norm": 5.189510589727771, "learning_rate": 1.999880004468554e-06, "loss": 1.5181, "step": 2929 }, { "epoch": 0.02, "grad_norm": 5.20778314788957, "learning_rate": 1.99987992223924e-06, "loss": 1.4894, "step": 2930 }, { "epoch": 0.02, "grad_norm": 4.613692977472925, "learning_rate": 1.9998798399817628e-06, "loss": 1.3359, "step": 2931 }, { "epoch": 0.02, "grad_norm": 4.45441252637321, "learning_rate": 1.999879757696122e-06, "loss": 1.3484, "step": 2932 }, { "epoch": 0.02, "grad_norm": 5.84796908759116, "learning_rate": 1.9998796753823182e-06, "loss": 1.3102, "step": 2933 }, { "epoch": 0.02, "grad_norm": 5.057071265216301, "learning_rate": 1.9998795930403515e-06, "loss": 1.4964, "step": 2934 }, { "epoch": 0.02, "grad_norm": 4.65737240395791, "learning_rate": 1.9998795106702213e-06, "loss": 1.3021, "step": 2935 }, { "epoch": 0.02, "grad_norm": 4.466035985587055, "learning_rate": 1.9998794282719273e-06, "loss": 1.5032, "step": 2936 }, { "epoch": 0.02, "grad_norm": 5.856160311135987, "learning_rate": 1.9998793458454702e-06, "loss": 1.5538, "step": 2937 }, { "epoch": 0.02, "grad_norm": 4.610346348605172, "learning_rate": 1.99987926339085e-06, "loss": 1.462, "step": 2938 }, { "epoch": 0.02, "grad_norm": 4.928466235921176, "learning_rate": 1.9998791809080667e-06, "loss": 1.3877, "step": 2939 }, { "epoch": 0.02, "grad_norm": 4.973945408707448, "learning_rate": 1.99987909839712e-06, "loss": 1.3815, "step": 2940 }, { "epoch": 0.02, "grad_norm": 5.625537540163681, "learning_rate": 1.99987901585801e-06, "loss": 1.2582, "step": 2941 }, { "epoch": 0.02, "grad_norm": 4.477355848214243, "learning_rate": 1.9998789332907363e-06, "loss": 1.236, "step": 2942 }, { "epoch": 0.02, "grad_norm": 4.492618763572616, "learning_rate": 1.9998788506953e-06, "loss": 1.3155, "step": 2943 }, { "epoch": 0.02, "grad_norm": 4.602003607577262, "learning_rate": 1.9998787680717002e-06, "loss": 1.2852, "step": 2944 }, { "epoch": 0.02, "grad_norm": 4.567502332448367, "learning_rate": 1.999878685419937e-06, "loss": 1.4841, "step": 2945 }, { "epoch": 0.02, "grad_norm": 4.9405542923073815, "learning_rate": 1.9998786027400105e-06, "loss": 1.3458, "step": 2946 }, { "epoch": 0.02, "grad_norm": 4.6929594602807025, "learning_rate": 1.999878520031921e-06, "loss": 1.4189, "step": 2947 }, { "epoch": 0.02, "grad_norm": 5.197148859419166, "learning_rate": 1.9998784372956684e-06, "loss": 1.3436, "step": 2948 }, { "epoch": 0.02, "grad_norm": 6.599441454159472, "learning_rate": 1.999878354531252e-06, "loss": 1.3806, "step": 2949 }, { "epoch": 0.02, "grad_norm": 4.615989461212095, "learning_rate": 1.9998782717386726e-06, "loss": 1.3673, "step": 2950 }, { "epoch": 0.02, "grad_norm": 4.9156780589895295, "learning_rate": 1.9998781889179302e-06, "loss": 1.3702, "step": 2951 }, { "epoch": 0.02, "grad_norm": 4.29684223795888, "learning_rate": 1.9998781060690244e-06, "loss": 1.3308, "step": 2952 }, { "epoch": 0.02, "grad_norm": 4.517575497946087, "learning_rate": 1.999878023191955e-06, "loss": 1.3706, "step": 2953 }, { "epoch": 0.02, "grad_norm": 4.767296353617181, "learning_rate": 1.999877940286723e-06, "loss": 1.5224, "step": 2954 }, { "epoch": 0.02, "grad_norm": 5.751314082046642, "learning_rate": 1.9998778573533272e-06, "loss": 1.3569, "step": 2955 }, { "epoch": 0.02, "grad_norm": 4.40734821128334, "learning_rate": 1.9998777743917686e-06, "loss": 1.4663, "step": 2956 }, { "epoch": 0.02, "grad_norm": 4.350543171262743, "learning_rate": 1.9998776914020465e-06, "loss": 1.2895, "step": 2957 }, { "epoch": 0.02, "grad_norm": 4.6281115386375395, "learning_rate": 1.9998776083841614e-06, "loss": 1.5124, "step": 2958 }, { "epoch": 0.02, "grad_norm": 5.2232144573948185, "learning_rate": 1.9998775253381125e-06, "loss": 1.2376, "step": 2959 }, { "epoch": 0.02, "grad_norm": 5.689282233407712, "learning_rate": 1.999877442263901e-06, "loss": 1.5472, "step": 2960 }, { "epoch": 0.02, "grad_norm": 4.917409633814714, "learning_rate": 1.999877359161526e-06, "loss": 1.4326, "step": 2961 }, { "epoch": 0.02, "grad_norm": 4.994380191725462, "learning_rate": 1.9998772760309877e-06, "loss": 1.3549, "step": 2962 }, { "epoch": 0.02, "grad_norm": 14.47259236356199, "learning_rate": 1.9998771928722868e-06, "loss": 1.4232, "step": 2963 }, { "epoch": 0.02, "grad_norm": 4.656592137584895, "learning_rate": 1.999877109685422e-06, "loss": 1.3973, "step": 2964 }, { "epoch": 0.02, "grad_norm": 5.6307964918603615, "learning_rate": 1.9998770264703942e-06, "loss": 1.3096, "step": 2965 }, { "epoch": 0.02, "grad_norm": 4.5164633394118825, "learning_rate": 1.9998769432272035e-06, "loss": 1.3215, "step": 2966 }, { "epoch": 0.02, "grad_norm": 4.5103839413771185, "learning_rate": 1.9998768599558493e-06, "loss": 1.3789, "step": 2967 }, { "epoch": 0.02, "grad_norm": 4.2216327904598385, "learning_rate": 1.9998767766563317e-06, "loss": 1.3297, "step": 2968 }, { "epoch": 0.02, "grad_norm": 5.059047029865863, "learning_rate": 1.999876693328651e-06, "loss": 1.3984, "step": 2969 }, { "epoch": 0.02, "grad_norm": 4.700581236103446, "learning_rate": 1.9998766099728075e-06, "loss": 1.5166, "step": 2970 }, { "epoch": 0.02, "grad_norm": 4.854212931369697, "learning_rate": 1.9998765265888e-06, "loss": 1.4143, "step": 2971 }, { "epoch": 0.02, "grad_norm": 7.725119262001126, "learning_rate": 1.99987644317663e-06, "loss": 1.178, "step": 2972 }, { "epoch": 0.02, "grad_norm": 4.370144258521158, "learning_rate": 1.9998763597362965e-06, "loss": 1.3893, "step": 2973 }, { "epoch": 0.02, "grad_norm": 4.638058498542562, "learning_rate": 1.9998762762678e-06, "loss": 1.4689, "step": 2974 }, { "epoch": 0.02, "grad_norm": 5.213106554526003, "learning_rate": 1.9998761927711402e-06, "loss": 1.3459, "step": 2975 }, { "epoch": 0.02, "grad_norm": 4.858477665120681, "learning_rate": 1.9998761092463173e-06, "loss": 1.4884, "step": 2976 }, { "epoch": 0.02, "grad_norm": 4.570024440771522, "learning_rate": 1.999876025693331e-06, "loss": 1.469, "step": 2977 }, { "epoch": 0.02, "grad_norm": 8.16955091971151, "learning_rate": 1.9998759421121818e-06, "loss": 1.4388, "step": 2978 }, { "epoch": 0.02, "grad_norm": 4.903458987120334, "learning_rate": 1.999875858502869e-06, "loss": 1.3786, "step": 2979 }, { "epoch": 0.02, "grad_norm": 4.529203011610496, "learning_rate": 1.9998757748653933e-06, "loss": 1.2684, "step": 2980 }, { "epoch": 0.02, "grad_norm": 4.729523457919252, "learning_rate": 1.9998756911997546e-06, "loss": 1.5905, "step": 2981 }, { "epoch": 0.02, "grad_norm": 5.582684926177018, "learning_rate": 1.9998756075059525e-06, "loss": 1.598, "step": 2982 }, { "epoch": 0.02, "grad_norm": 4.659482522086951, "learning_rate": 1.9998755237839874e-06, "loss": 1.3603, "step": 2983 }, { "epoch": 0.02, "grad_norm": 5.628018340481434, "learning_rate": 1.999875440033859e-06, "loss": 1.5392, "step": 2984 }, { "epoch": 0.02, "grad_norm": 5.03947825852203, "learning_rate": 1.9998753562555673e-06, "loss": 1.47, "step": 2985 }, { "epoch": 0.02, "grad_norm": 4.908146139932012, "learning_rate": 1.9998752724491127e-06, "loss": 1.5795, "step": 2986 }, { "epoch": 0.02, "grad_norm": 4.642150400889997, "learning_rate": 1.9998751886144948e-06, "loss": 1.4156, "step": 2987 }, { "epoch": 0.02, "grad_norm": 5.580016701743678, "learning_rate": 1.999875104751714e-06, "loss": 1.3989, "step": 2988 }, { "epoch": 0.02, "grad_norm": 4.60906478003677, "learning_rate": 1.99987502086077e-06, "loss": 1.4226, "step": 2989 }, { "epoch": 0.02, "grad_norm": 4.563599437751002, "learning_rate": 1.9998749369416624e-06, "loss": 1.5099, "step": 2990 }, { "epoch": 0.02, "grad_norm": 4.223270236357775, "learning_rate": 1.9998748529943916e-06, "loss": 1.3115, "step": 2991 }, { "epoch": 0.02, "grad_norm": 5.633656419315019, "learning_rate": 1.9998747690189583e-06, "loss": 1.4029, "step": 2992 }, { "epoch": 0.02, "grad_norm": 4.630285775575748, "learning_rate": 1.9998746850153614e-06, "loss": 1.3736, "step": 2993 }, { "epoch": 0.02, "eval_loss": 1.6208245754241943, "eval_runtime": 4.6165, "eval_samples_per_second": 1.95, "eval_steps_per_second": 1.083, "step": 2993 }, { "epoch": 0.02, "grad_norm": 4.936246851895618, "learning_rate": 1.9998746009836016e-06, "loss": 1.406, "step": 2994 }, { "epoch": 0.02, "grad_norm": 5.149643170506049, "learning_rate": 1.9998745169236784e-06, "loss": 1.3712, "step": 2995 }, { "epoch": 0.02, "grad_norm": 4.788568030128947, "learning_rate": 1.999874432835592e-06, "loss": 1.4386, "step": 2996 }, { "epoch": 0.02, "grad_norm": 4.578352571887644, "learning_rate": 1.999874348719343e-06, "loss": 1.3946, "step": 2997 }, { "epoch": 0.02, "grad_norm": 5.124704832034737, "learning_rate": 1.9998742645749303e-06, "loss": 1.301, "step": 2998 }, { "epoch": 0.02, "grad_norm": 4.405037637799255, "learning_rate": 1.999874180402355e-06, "loss": 1.4424, "step": 2999 }, { "epoch": 0.02, "grad_norm": 4.6389112469188, "learning_rate": 1.999874096201616e-06, "loss": 1.4466, "step": 3000 }, { "epoch": 0.02, "grad_norm": 5.19152230930798, "learning_rate": 1.9998740119727144e-06, "loss": 1.4456, "step": 3001 }, { "epoch": 0.02, "grad_norm": 5.121951342555157, "learning_rate": 1.999873927715649e-06, "loss": 1.3808, "step": 3002 }, { "epoch": 0.02, "grad_norm": 4.607444674665859, "learning_rate": 1.999873843430421e-06, "loss": 1.4636, "step": 3003 }, { "epoch": 0.02, "grad_norm": 4.46848570773085, "learning_rate": 1.9998737591170298e-06, "loss": 1.3631, "step": 3004 }, { "epoch": 0.02, "grad_norm": 4.330823996962009, "learning_rate": 1.9998736747754753e-06, "loss": 1.3945, "step": 3005 }, { "epoch": 0.02, "grad_norm": 5.168688715915758, "learning_rate": 1.999873590405758e-06, "loss": 1.265, "step": 3006 }, { "epoch": 0.02, "grad_norm": 5.21101121326567, "learning_rate": 1.9998735060078773e-06, "loss": 1.4035, "step": 3007 }, { "epoch": 0.02, "grad_norm": 4.9325677986365335, "learning_rate": 1.999873421581834e-06, "loss": 1.5848, "step": 3008 }, { "epoch": 0.02, "grad_norm": 4.726896704178509, "learning_rate": 1.999873337127627e-06, "loss": 1.3209, "step": 3009 }, { "epoch": 0.02, "grad_norm": 5.923881575075372, "learning_rate": 1.999873252645257e-06, "loss": 1.3872, "step": 3010 }, { "epoch": 0.02, "grad_norm": 4.405371972429703, "learning_rate": 1.999873168134724e-06, "loss": 1.3634, "step": 3011 }, { "epoch": 0.02, "grad_norm": 4.953407177301594, "learning_rate": 1.999873083596028e-06, "loss": 1.4459, "step": 3012 }, { "epoch": 0.02, "grad_norm": 4.432312541640287, "learning_rate": 1.9998729990291685e-06, "loss": 1.36, "step": 3013 }, { "epoch": 0.02, "grad_norm": 4.6106671367098855, "learning_rate": 1.9998729144341462e-06, "loss": 1.3693, "step": 3014 }, { "epoch": 0.02, "grad_norm": 4.363880892273669, "learning_rate": 1.999872829810961e-06, "loss": 1.4125, "step": 3015 }, { "epoch": 0.02, "grad_norm": 4.671858356354371, "learning_rate": 1.9998727451596126e-06, "loss": 1.4628, "step": 3016 }, { "epoch": 0.02, "grad_norm": 5.1327904162947835, "learning_rate": 1.999872660480101e-06, "loss": 1.484, "step": 3017 }, { "epoch": 0.02, "grad_norm": 6.726123143448659, "learning_rate": 1.999872575772426e-06, "loss": 1.3706, "step": 3018 }, { "epoch": 0.02, "grad_norm": 4.6201517878177984, "learning_rate": 1.9998724910365884e-06, "loss": 1.3531, "step": 3019 }, { "epoch": 0.02, "grad_norm": 4.7617011200342585, "learning_rate": 1.9998724062725877e-06, "loss": 1.292, "step": 3020 }, { "epoch": 0.02, "grad_norm": 4.684197716362908, "learning_rate": 1.999872321480424e-06, "loss": 1.4344, "step": 3021 }, { "epoch": 0.02, "grad_norm": 4.683892557856816, "learning_rate": 1.9998722366600968e-06, "loss": 1.5146, "step": 3022 }, { "epoch": 0.02, "grad_norm": 4.659566302966707, "learning_rate": 1.9998721518116066e-06, "loss": 1.4766, "step": 3023 }, { "epoch": 0.02, "grad_norm": 4.56223709999532, "learning_rate": 1.9998720669349535e-06, "loss": 1.4421, "step": 3024 }, { "epoch": 0.02, "grad_norm": 6.040663438792937, "learning_rate": 1.9998719820301373e-06, "loss": 1.4359, "step": 3025 }, { "epoch": 0.02, "grad_norm": 4.513110115307069, "learning_rate": 1.9998718970971578e-06, "loss": 1.4452, "step": 3026 }, { "epoch": 0.02, "grad_norm": 4.912192039188993, "learning_rate": 1.9998718121360156e-06, "loss": 1.4882, "step": 3027 }, { "epoch": 0.02, "grad_norm": 4.525564198825131, "learning_rate": 1.99987172714671e-06, "loss": 1.2955, "step": 3028 }, { "epoch": 0.02, "grad_norm": 5.369450073058397, "learning_rate": 1.999871642129242e-06, "loss": 1.4109, "step": 3029 }, { "epoch": 0.02, "grad_norm": 6.399069353447917, "learning_rate": 1.9998715570836104e-06, "loss": 1.5341, "step": 3030 }, { "epoch": 0.02, "grad_norm": 4.863376705625461, "learning_rate": 1.999871472009816e-06, "loss": 1.3178, "step": 3031 }, { "epoch": 0.02, "grad_norm": 5.999004673955154, "learning_rate": 1.999871386907858e-06, "loss": 1.358, "step": 3032 }, { "epoch": 0.02, "grad_norm": 4.667839262748732, "learning_rate": 1.9998713017777373e-06, "loss": 1.4091, "step": 3033 }, { "epoch": 0.02, "grad_norm": 4.5743647889890475, "learning_rate": 1.9998712166194537e-06, "loss": 1.3385, "step": 3034 }, { "epoch": 0.02, "grad_norm": 4.975891971893521, "learning_rate": 1.9998711314330067e-06, "loss": 1.4072, "step": 3035 }, { "epoch": 0.02, "grad_norm": 4.830200195962722, "learning_rate": 1.999871046218397e-06, "loss": 1.4913, "step": 3036 }, { "epoch": 0.02, "grad_norm": 4.805815436884879, "learning_rate": 1.9998709609756238e-06, "loss": 1.4484, "step": 3037 }, { "epoch": 0.02, "grad_norm": 4.600176911297759, "learning_rate": 1.9998708757046882e-06, "loss": 1.2973, "step": 3038 }, { "epoch": 0.02, "grad_norm": 5.060093592105702, "learning_rate": 1.9998707904055893e-06, "loss": 1.1942, "step": 3039 }, { "epoch": 0.02, "grad_norm": 4.802674568723147, "learning_rate": 1.9998707050783273e-06, "loss": 1.5334, "step": 3040 }, { "epoch": 0.02, "grad_norm": 5.426696065645248, "learning_rate": 1.9998706197229023e-06, "loss": 1.4872, "step": 3041 }, { "epoch": 0.02, "grad_norm": 5.898642916611915, "learning_rate": 1.999870534339314e-06, "loss": 1.5561, "step": 3042 }, { "epoch": 0.02, "grad_norm": 4.5979860791423635, "learning_rate": 1.999870448927563e-06, "loss": 1.5231, "step": 3043 }, { "epoch": 0.02, "grad_norm": 4.535954226116485, "learning_rate": 1.999870363487649e-06, "loss": 1.3058, "step": 3044 }, { "epoch": 0.02, "grad_norm": 4.745795440686125, "learning_rate": 1.9998702780195716e-06, "loss": 1.4387, "step": 3045 }, { "epoch": 0.02, "grad_norm": 4.635960415984835, "learning_rate": 1.999870192523332e-06, "loss": 1.4911, "step": 3046 }, { "epoch": 0.02, "grad_norm": 4.759909878239948, "learning_rate": 1.9998701069989287e-06, "loss": 1.4024, "step": 3047 }, { "epoch": 0.02, "grad_norm": 5.802542836315548, "learning_rate": 1.9998700214463623e-06, "loss": 1.4512, "step": 3048 }, { "epoch": 0.02, "grad_norm": 6.378746183379735, "learning_rate": 1.9998699358656334e-06, "loss": 1.5516, "step": 3049 }, { "epoch": 0.02, "grad_norm": 4.618905225926722, "learning_rate": 1.999869850256741e-06, "loss": 1.3609, "step": 3050 }, { "epoch": 0.02, "grad_norm": 4.905191662886102, "learning_rate": 1.9998697646196856e-06, "loss": 1.5205, "step": 3051 }, { "epoch": 0.02, "grad_norm": 4.391956056937744, "learning_rate": 1.9998696789544677e-06, "loss": 1.3854, "step": 3052 }, { "epoch": 0.02, "grad_norm": 5.57834379820915, "learning_rate": 1.9998695932610863e-06, "loss": 1.4971, "step": 3053 }, { "epoch": 0.02, "grad_norm": 4.9358795449257, "learning_rate": 1.9998695075395423e-06, "loss": 1.3831, "step": 3054 }, { "epoch": 0.02, "grad_norm": 5.168823055094264, "learning_rate": 1.999869421789835e-06, "loss": 1.2735, "step": 3055 }, { "epoch": 0.02, "grad_norm": 5.134532360101292, "learning_rate": 1.9998693360119646e-06, "loss": 1.3684, "step": 3056 }, { "epoch": 0.02, "grad_norm": 5.281772250542135, "learning_rate": 1.9998692502059317e-06, "loss": 1.25, "step": 3057 }, { "epoch": 0.02, "grad_norm": 4.862624797364669, "learning_rate": 1.9998691643717353e-06, "loss": 1.4888, "step": 3058 }, { "epoch": 0.02, "grad_norm": 4.5369738915051965, "learning_rate": 1.999869078509376e-06, "loss": 1.3752, "step": 3059 }, { "epoch": 0.02, "grad_norm": 5.264457712237391, "learning_rate": 1.999868992618854e-06, "loss": 1.3389, "step": 3060 }, { "epoch": 0.02, "grad_norm": 5.148997720929055, "learning_rate": 1.9998689067001686e-06, "loss": 1.4634, "step": 3061 }, { "epoch": 0.02, "grad_norm": 5.69514649110256, "learning_rate": 1.9998688207533207e-06, "loss": 1.4781, "step": 3062 }, { "epoch": 0.02, "grad_norm": 6.906133670567751, "learning_rate": 1.9998687347783098e-06, "loss": 1.2841, "step": 3063 }, { "epoch": 0.02, "grad_norm": 4.450305903996531, "learning_rate": 1.9998686487751354e-06, "loss": 1.3008, "step": 3064 }, { "epoch": 0.02, "grad_norm": 4.9761783730067926, "learning_rate": 1.9998685627437985e-06, "loss": 1.1917, "step": 3065 }, { "epoch": 0.02, "grad_norm": 4.90862469089785, "learning_rate": 1.9998684766842985e-06, "loss": 1.3536, "step": 3066 }, { "epoch": 0.02, "eval_loss": 1.614479660987854, "eval_runtime": 4.6281, "eval_samples_per_second": 1.945, "eval_steps_per_second": 1.08, "step": 3066 }, { "epoch": 0.02, "grad_norm": 4.514246443648035, "learning_rate": 1.999868390596635e-06, "loss": 1.46, "step": 3067 }, { "epoch": 0.02, "grad_norm": 4.795614462660274, "learning_rate": 1.9998683044808097e-06, "loss": 1.4595, "step": 3068 }, { "epoch": 0.02, "grad_norm": 5.018533958527286, "learning_rate": 1.9998682183368203e-06, "loss": 1.3904, "step": 3069 }, { "epoch": 0.02, "grad_norm": 4.942520732496524, "learning_rate": 1.999868132164669e-06, "loss": 1.4983, "step": 3070 }, { "epoch": 0.02, "grad_norm": 4.777495716431802, "learning_rate": 1.999868045964354e-06, "loss": 1.4747, "step": 3071 }, { "epoch": 0.02, "grad_norm": 4.426971447313713, "learning_rate": 1.999867959735876e-06, "loss": 1.2999, "step": 3072 }, { "epoch": 0.02, "grad_norm": 6.039700517061406, "learning_rate": 1.999867873479235e-06, "loss": 1.3682, "step": 3073 }, { "epoch": 0.02, "grad_norm": 7.018072800299674, "learning_rate": 1.9998677871944316e-06, "loss": 1.4626, "step": 3074 }, { "epoch": 0.02, "grad_norm": 4.620448908678396, "learning_rate": 1.9998677008814647e-06, "loss": 1.3042, "step": 3075 }, { "epoch": 0.02, "grad_norm": 4.309072084405069, "learning_rate": 1.999867614540335e-06, "loss": 1.3497, "step": 3076 }, { "epoch": 0.02, "grad_norm": 6.1018581186592025, "learning_rate": 1.9998675281710427e-06, "loss": 1.333, "step": 3077 }, { "epoch": 0.02, "grad_norm": 4.945567567355229, "learning_rate": 1.999867441773587e-06, "loss": 1.2968, "step": 3078 }, { "epoch": 0.02, "grad_norm": 5.406409232689529, "learning_rate": 1.9998673553479687e-06, "loss": 1.4256, "step": 3079 }, { "epoch": 0.02, "grad_norm": 7.218194975860625, "learning_rate": 1.9998672688941872e-06, "loss": 1.4264, "step": 3080 }, { "epoch": 0.02, "grad_norm": 4.739664546488815, "learning_rate": 1.999867182412243e-06, "loss": 1.355, "step": 3081 }, { "epoch": 0.02, "grad_norm": 4.91149132142301, "learning_rate": 1.9998670959021357e-06, "loss": 1.524, "step": 3082 }, { "epoch": 0.02, "grad_norm": 4.432247494986651, "learning_rate": 1.9998670093638656e-06, "loss": 1.2699, "step": 3083 }, { "epoch": 0.02, "grad_norm": 5.31888360981259, "learning_rate": 1.9998669227974326e-06, "loss": 1.3485, "step": 3084 }, { "epoch": 0.02, "grad_norm": 4.766908478434986, "learning_rate": 1.9998668362028365e-06, "loss": 1.3453, "step": 3085 }, { "epoch": 0.02, "grad_norm": 4.684318593661933, "learning_rate": 1.9998667495800775e-06, "loss": 1.3912, "step": 3086 }, { "epoch": 0.02, "grad_norm": 4.707084972080545, "learning_rate": 1.9998666629291554e-06, "loss": 1.4999, "step": 3087 }, { "epoch": 0.02, "grad_norm": 4.658271583331655, "learning_rate": 1.999866576250071e-06, "loss": 1.3584, "step": 3088 }, { "epoch": 0.02, "grad_norm": 5.8632287556229645, "learning_rate": 1.999866489542823e-06, "loss": 1.4268, "step": 3089 }, { "epoch": 0.02, "grad_norm": 5.305841703345282, "learning_rate": 1.9998664028074126e-06, "loss": 1.425, "step": 3090 }, { "epoch": 0.02, "grad_norm": 4.537852499872765, "learning_rate": 1.999866316043839e-06, "loss": 1.3343, "step": 3091 }, { "epoch": 0.02, "grad_norm": 4.525709370028815, "learning_rate": 1.9998662292521023e-06, "loss": 1.4023, "step": 3092 }, { "epoch": 0.02, "grad_norm": 4.566090943972325, "learning_rate": 1.999866142432203e-06, "loss": 1.4643, "step": 3093 }, { "epoch": 0.02, "grad_norm": 4.2695795542821875, "learning_rate": 1.9998660555841405e-06, "loss": 1.2825, "step": 3094 }, { "epoch": 0.02, "grad_norm": 4.396645953705008, "learning_rate": 1.9998659687079157e-06, "loss": 1.2208, "step": 3095 }, { "epoch": 0.02, "grad_norm": 4.585920026446324, "learning_rate": 1.9998658818035275e-06, "loss": 1.3192, "step": 3096 }, { "epoch": 0.02, "grad_norm": 4.724254453766402, "learning_rate": 1.9998657948709768e-06, "loss": 1.3259, "step": 3097 }, { "epoch": 0.02, "grad_norm": 4.546497072265447, "learning_rate": 1.9998657079102626e-06, "loss": 1.4462, "step": 3098 }, { "epoch": 0.02, "grad_norm": 5.067058100414485, "learning_rate": 1.999865620921386e-06, "loss": 1.371, "step": 3099 }, { "epoch": 0.02, "grad_norm": 4.680817047547335, "learning_rate": 1.9998655339043465e-06, "loss": 1.382, "step": 3100 }, { "epoch": 0.02, "grad_norm": 4.684807070794312, "learning_rate": 1.999865446859144e-06, "loss": 1.3792, "step": 3101 }, { "epoch": 0.02, "grad_norm": 5.108394614810223, "learning_rate": 1.999865359785779e-06, "loss": 1.3695, "step": 3102 }, { "epoch": 0.02, "grad_norm": 4.678775301437812, "learning_rate": 1.99986527268425e-06, "loss": 1.3234, "step": 3103 }, { "epoch": 0.02, "grad_norm": 7.198154854127655, "learning_rate": 1.999865185554559e-06, "loss": 1.3228, "step": 3104 }, { "epoch": 0.02, "grad_norm": 4.954029857664215, "learning_rate": 1.9998650983967053e-06, "loss": 1.2434, "step": 3105 }, { "epoch": 0.02, "grad_norm": 4.716268546944123, "learning_rate": 1.999865011210688e-06, "loss": 1.3835, "step": 3106 }, { "epoch": 0.02, "grad_norm": 4.706544890974926, "learning_rate": 1.9998649239965085e-06, "loss": 1.4839, "step": 3107 }, { "epoch": 0.02, "grad_norm": 5.0878170468870545, "learning_rate": 1.9998648367541656e-06, "loss": 1.5127, "step": 3108 }, { "epoch": 0.02, "grad_norm": 4.546409694838889, "learning_rate": 1.99986474948366e-06, "loss": 1.4668, "step": 3109 }, { "epoch": 0.02, "grad_norm": 4.262790272972708, "learning_rate": 1.9998646621849916e-06, "loss": 1.3487, "step": 3110 }, { "epoch": 0.02, "grad_norm": 5.437228037164254, "learning_rate": 1.9998645748581606e-06, "loss": 1.4362, "step": 3111 }, { "epoch": 0.02, "grad_norm": 4.591524369547653, "learning_rate": 1.9998644875031665e-06, "loss": 1.3518, "step": 3112 }, { "epoch": 0.02, "grad_norm": 4.943081436316897, "learning_rate": 1.9998644001200095e-06, "loss": 1.2977, "step": 3113 }, { "epoch": 0.02, "grad_norm": 4.745398436424019, "learning_rate": 1.9998643127086894e-06, "loss": 1.2858, "step": 3114 }, { "epoch": 0.02, "grad_norm": 4.451084562147302, "learning_rate": 1.9998642252692072e-06, "loss": 1.4795, "step": 3115 }, { "epoch": 0.02, "grad_norm": 4.642520886807483, "learning_rate": 1.9998641378015616e-06, "loss": 1.5245, "step": 3116 }, { "epoch": 0.02, "grad_norm": 4.829976676988873, "learning_rate": 1.9998640503057534e-06, "loss": 1.3036, "step": 3117 }, { "epoch": 0.02, "grad_norm": 4.272610276691716, "learning_rate": 1.9998639627817822e-06, "loss": 1.4307, "step": 3118 }, { "epoch": 0.02, "grad_norm": 4.279012748336821, "learning_rate": 1.999863875229648e-06, "loss": 1.3246, "step": 3119 }, { "epoch": 0.02, "grad_norm": 5.017262661372447, "learning_rate": 1.999863787649351e-06, "loss": 1.4561, "step": 3120 }, { "epoch": 0.02, "grad_norm": 7.159770574845129, "learning_rate": 1.9998637000408915e-06, "loss": 1.4325, "step": 3121 }, { "epoch": 0.02, "grad_norm": 5.138809355786813, "learning_rate": 1.9998636124042688e-06, "loss": 1.3595, "step": 3122 }, { "epoch": 0.02, "grad_norm": 7.411909404561028, "learning_rate": 1.9998635247394834e-06, "loss": 1.2877, "step": 3123 }, { "epoch": 0.02, "grad_norm": 4.8356590420018675, "learning_rate": 1.999863437046535e-06, "loss": 1.382, "step": 3124 }, { "epoch": 0.02, "grad_norm": 5.438539228016791, "learning_rate": 1.999863349325424e-06, "loss": 1.2727, "step": 3125 }, { "epoch": 0.02, "grad_norm": 5.290518205135856, "learning_rate": 1.9998632615761503e-06, "loss": 1.2096, "step": 3126 }, { "epoch": 0.02, "grad_norm": 4.869806082110767, "learning_rate": 1.9998631737987134e-06, "loss": 1.3335, "step": 3127 }, { "epoch": 0.02, "grad_norm": 5.001272070760005, "learning_rate": 1.999863085993114e-06, "loss": 1.48, "step": 3128 }, { "epoch": 0.02, "grad_norm": 4.588174640600946, "learning_rate": 1.999862998159352e-06, "loss": 1.3918, "step": 3129 }, { "epoch": 0.02, "grad_norm": 4.48282878680433, "learning_rate": 1.9998629102974263e-06, "loss": 1.3621, "step": 3130 }, { "epoch": 0.02, "grad_norm": 4.327695042907354, "learning_rate": 1.9998628224073387e-06, "loss": 1.1881, "step": 3131 }, { "epoch": 0.02, "grad_norm": 4.935240520013496, "learning_rate": 1.999862734489088e-06, "loss": 1.4896, "step": 3132 }, { "epoch": 0.02, "grad_norm": 4.762415048958248, "learning_rate": 1.999862646542674e-06, "loss": 1.3664, "step": 3133 }, { "epoch": 0.02, "grad_norm": 4.514983890100395, "learning_rate": 1.999862558568098e-06, "loss": 1.3976, "step": 3134 }, { "epoch": 0.02, "grad_norm": 4.318544973311922, "learning_rate": 1.9998624705653586e-06, "loss": 1.3479, "step": 3135 }, { "epoch": 0.02, "grad_norm": 4.831718076957779, "learning_rate": 1.999862382534457e-06, "loss": 1.4235, "step": 3136 }, { "epoch": 0.02, "grad_norm": 5.289805977486961, "learning_rate": 1.9998622944753917e-06, "loss": 1.2588, "step": 3137 }, { "epoch": 0.02, "grad_norm": 4.49073990213304, "learning_rate": 1.9998622063881643e-06, "loss": 1.4302, "step": 3138 }, { "epoch": 0.02, "grad_norm": 9.548814969360347, "learning_rate": 1.999862118272774e-06, "loss": 1.3936, "step": 3139 }, { "epoch": 0.02, "eval_loss": 1.6162147521972656, "eval_runtime": 4.6224, "eval_samples_per_second": 1.947, "eval_steps_per_second": 1.082, "step": 3139 }, { "epoch": 0.02, "grad_norm": 5.563259529239899, "learning_rate": 1.999862030129221e-06, "loss": 1.4137, "step": 3140 }, { "epoch": 0.02, "grad_norm": 4.64213902825873, "learning_rate": 1.999861941957505e-06, "loss": 1.5853, "step": 3141 }, { "epoch": 0.02, "grad_norm": 5.394890330300038, "learning_rate": 1.999861853757626e-06, "loss": 1.4709, "step": 3142 }, { "epoch": 0.02, "grad_norm": 4.911886598072668, "learning_rate": 1.9998617655295847e-06, "loss": 1.4397, "step": 3143 }, { "epoch": 0.02, "grad_norm": 13.65470961214868, "learning_rate": 1.9998616772733802e-06, "loss": 1.199, "step": 3144 }, { "epoch": 0.02, "grad_norm": 4.636379664598463, "learning_rate": 1.999861588989013e-06, "loss": 1.2613, "step": 3145 }, { "epoch": 0.02, "grad_norm": 4.9044581943490275, "learning_rate": 1.9998615006764835e-06, "loss": 1.4446, "step": 3146 }, { "epoch": 0.02, "grad_norm": 4.966547861520061, "learning_rate": 1.999861412335791e-06, "loss": 1.3623, "step": 3147 }, { "epoch": 0.02, "grad_norm": 5.124726053985599, "learning_rate": 1.9998613239669356e-06, "loss": 1.3474, "step": 3148 }, { "epoch": 0.02, "grad_norm": 4.572636835170383, "learning_rate": 1.9998612355699174e-06, "loss": 1.3531, "step": 3149 }, { "epoch": 0.02, "grad_norm": 4.618987542008811, "learning_rate": 1.9998611471447362e-06, "loss": 1.4206, "step": 3150 }, { "epoch": 0.02, "grad_norm": 4.3621396822779746, "learning_rate": 1.999861058691393e-06, "loss": 1.4164, "step": 3151 }, { "epoch": 0.02, "grad_norm": 5.341999064682612, "learning_rate": 1.999860970209886e-06, "loss": 1.5216, "step": 3152 }, { "epoch": 0.02, "grad_norm": 16.061509297130684, "learning_rate": 1.999860881700217e-06, "loss": 1.3329, "step": 3153 }, { "epoch": 0.02, "grad_norm": 4.808958846677696, "learning_rate": 1.999860793162385e-06, "loss": 1.483, "step": 3154 }, { "epoch": 0.02, "grad_norm": 5.114117981524813, "learning_rate": 1.9998607045963903e-06, "loss": 1.5337, "step": 3155 }, { "epoch": 0.02, "grad_norm": 5.295040011290924, "learning_rate": 1.9998606160022328e-06, "loss": 1.3669, "step": 3156 }, { "epoch": 0.02, "grad_norm": 4.704314977626543, "learning_rate": 1.9998605273799127e-06, "loss": 1.3311, "step": 3157 }, { "epoch": 0.02, "grad_norm": 4.488680014172767, "learning_rate": 1.9998604387294296e-06, "loss": 1.4133, "step": 3158 }, { "epoch": 0.02, "grad_norm": 5.104286348143494, "learning_rate": 1.999860350050784e-06, "loss": 1.5682, "step": 3159 }, { "epoch": 0.02, "grad_norm": 4.485660767564034, "learning_rate": 1.9998602613439757e-06, "loss": 1.5762, "step": 3160 }, { "epoch": 0.02, "grad_norm": 4.939741107373614, "learning_rate": 1.9998601726090045e-06, "loss": 1.4258, "step": 3161 }, { "epoch": 0.02, "grad_norm": 4.881029948420793, "learning_rate": 1.9998600838458707e-06, "loss": 1.4878, "step": 3162 }, { "epoch": 0.02, "grad_norm": 5.300753949774209, "learning_rate": 1.999859995054574e-06, "loss": 1.5591, "step": 3163 }, { "epoch": 0.02, "grad_norm": 4.919255455770104, "learning_rate": 1.999859906235114e-06, "loss": 1.5089, "step": 3164 }, { "epoch": 0.02, "grad_norm": 4.903957763395779, "learning_rate": 1.9998598173874925e-06, "loss": 1.3316, "step": 3165 }, { "epoch": 0.02, "grad_norm": 4.786877472597078, "learning_rate": 1.9998597285117076e-06, "loss": 1.4113, "step": 3166 }, { "epoch": 0.02, "grad_norm": 4.642407926930237, "learning_rate": 1.99985963960776e-06, "loss": 1.4368, "step": 3167 }, { "epoch": 0.02, "grad_norm": 4.9722903457303715, "learning_rate": 1.9998595506756495e-06, "loss": 1.5279, "step": 3168 }, { "epoch": 0.02, "grad_norm": 4.735779629402262, "learning_rate": 1.999859461715377e-06, "loss": 1.4509, "step": 3169 }, { "epoch": 0.02, "grad_norm": 6.012513337480798, "learning_rate": 1.999859372726941e-06, "loss": 1.2937, "step": 3170 }, { "epoch": 0.02, "grad_norm": 4.373925924257122, "learning_rate": 1.9998592837103425e-06, "loss": 1.3488, "step": 3171 }, { "epoch": 0.02, "grad_norm": 4.813452991567045, "learning_rate": 1.9998591946655812e-06, "loss": 1.4029, "step": 3172 }, { "epoch": 0.02, "grad_norm": 4.354945741505938, "learning_rate": 1.9998591055926574e-06, "loss": 1.2901, "step": 3173 }, { "epoch": 0.02, "grad_norm": 4.4499436163682775, "learning_rate": 1.999859016491571e-06, "loss": 1.3946, "step": 3174 }, { "epoch": 0.02, "grad_norm": 5.212636696076956, "learning_rate": 1.9998589273623216e-06, "loss": 1.4967, "step": 3175 }, { "epoch": 0.02, "grad_norm": 6.480145318055372, "learning_rate": 1.99985883820491e-06, "loss": 1.3128, "step": 3176 }, { "epoch": 0.02, "grad_norm": 5.7962291969592865, "learning_rate": 1.999858749019335e-06, "loss": 1.3714, "step": 3177 }, { "epoch": 0.02, "grad_norm": 4.699063147883259, "learning_rate": 1.999858659805598e-06, "loss": 1.3319, "step": 3178 }, { "epoch": 0.02, "grad_norm": 5.026292120171505, "learning_rate": 1.999858570563698e-06, "loss": 1.2571, "step": 3179 }, { "epoch": 0.02, "grad_norm": 4.409840960626129, "learning_rate": 1.999858481293635e-06, "loss": 1.2903, "step": 3180 }, { "epoch": 0.02, "grad_norm": 4.625128875585861, "learning_rate": 1.9998583919954095e-06, "loss": 1.3866, "step": 3181 }, { "epoch": 0.02, "grad_norm": 4.5097657585652895, "learning_rate": 1.9998583026690216e-06, "loss": 1.3992, "step": 3182 }, { "epoch": 0.02, "grad_norm": 4.614507760819113, "learning_rate": 1.9998582133144708e-06, "loss": 1.4418, "step": 3183 }, { "epoch": 0.02, "grad_norm": 4.5439263422269685, "learning_rate": 1.9998581239317573e-06, "loss": 1.4107, "step": 3184 }, { "epoch": 0.02, "grad_norm": 5.919848478106863, "learning_rate": 1.9998580345208814e-06, "loss": 1.44, "step": 3185 }, { "epoch": 0.02, "grad_norm": 5.270625301228678, "learning_rate": 1.9998579450818424e-06, "loss": 1.3487, "step": 3186 }, { "epoch": 0.02, "grad_norm": 12.770245132363083, "learning_rate": 1.999857855614641e-06, "loss": 1.2209, "step": 3187 }, { "epoch": 0.02, "grad_norm": 4.851859510913612, "learning_rate": 1.9998577661192766e-06, "loss": 1.5614, "step": 3188 }, { "epoch": 0.02, "grad_norm": 4.675470634925794, "learning_rate": 1.99985767659575e-06, "loss": 1.3947, "step": 3189 }, { "epoch": 0.02, "grad_norm": 7.468219419933821, "learning_rate": 1.9998575870440602e-06, "loss": 1.3805, "step": 3190 }, { "epoch": 0.02, "grad_norm": 4.966019364108259, "learning_rate": 1.9998574974642083e-06, "loss": 1.5527, "step": 3191 }, { "epoch": 0.02, "grad_norm": 9.403779222001704, "learning_rate": 1.9998574078561935e-06, "loss": 1.4134, "step": 3192 }, { "epoch": 0.02, "grad_norm": 6.913592743917321, "learning_rate": 1.999857318220016e-06, "loss": 1.3894, "step": 3193 }, { "epoch": 0.02, "grad_norm": 4.977596675633243, "learning_rate": 1.999857228555676e-06, "loss": 1.5042, "step": 3194 }, { "epoch": 0.02, "grad_norm": 5.206567891787514, "learning_rate": 1.999857138863173e-06, "loss": 1.4468, "step": 3195 }, { "epoch": 0.02, "grad_norm": 5.04153279402594, "learning_rate": 1.999857049142508e-06, "loss": 1.3358, "step": 3196 }, { "epoch": 0.02, "grad_norm": 5.061753008719967, "learning_rate": 1.9998569593936797e-06, "loss": 1.2138, "step": 3197 }, { "epoch": 0.02, "grad_norm": 4.5006757680467615, "learning_rate": 1.999856869616689e-06, "loss": 1.4535, "step": 3198 }, { "epoch": 0.02, "grad_norm": 4.783752399409675, "learning_rate": 1.9998567798115356e-06, "loss": 1.371, "step": 3199 }, { "epoch": 0.02, "grad_norm": 4.316562075427887, "learning_rate": 1.9998566899782197e-06, "loss": 1.242, "step": 3200 }, { "epoch": 0.02, "grad_norm": 5.77063522200799, "learning_rate": 1.999856600116741e-06, "loss": 1.3504, "step": 3201 }, { "epoch": 0.02, "grad_norm": 5.811809642589763, "learning_rate": 1.9998565102271e-06, "loss": 1.244, "step": 3202 }, { "epoch": 0.02, "grad_norm": 4.246394909798491, "learning_rate": 1.9998564203092958e-06, "loss": 1.2308, "step": 3203 }, { "epoch": 0.02, "grad_norm": 5.515471870625349, "learning_rate": 1.999856330363329e-06, "loss": 1.4938, "step": 3204 }, { "epoch": 0.02, "grad_norm": 7.4680360822544944, "learning_rate": 1.9998562403892e-06, "loss": 1.456, "step": 3205 }, { "epoch": 0.02, "grad_norm": 4.505670670091151, "learning_rate": 1.999856150386908e-06, "loss": 1.3296, "step": 3206 }, { "epoch": 0.02, "grad_norm": 4.709421641146748, "learning_rate": 1.999856060356454e-06, "loss": 1.5864, "step": 3207 }, { "epoch": 0.02, "grad_norm": 5.147973376306806, "learning_rate": 1.999855970297837e-06, "loss": 1.5874, "step": 3208 }, { "epoch": 0.02, "grad_norm": 4.382675576687693, "learning_rate": 1.999855880211057e-06, "loss": 1.3383, "step": 3209 }, { "epoch": 0.02, "grad_norm": 5.6100363440541745, "learning_rate": 1.999855790096115e-06, "loss": 1.354, "step": 3210 }, { "epoch": 0.02, "grad_norm": 4.6842429101328005, "learning_rate": 1.99985569995301e-06, "loss": 1.2987, "step": 3211 }, { "epoch": 0.02, "grad_norm": 4.40109444884483, "learning_rate": 1.9998556097817423e-06, "loss": 1.2192, "step": 3212 }, { "epoch": 0.02, "eval_loss": 1.6133770942687988, "eval_runtime": 4.6291, "eval_samples_per_second": 1.944, "eval_steps_per_second": 1.08, "step": 3212 }, { "epoch": 0.02, "grad_norm": 4.686660224921117, "learning_rate": 1.9998555195823125e-06, "loss": 1.3872, "step": 3213 }, { "epoch": 0.02, "grad_norm": 5.0508756090986076, "learning_rate": 1.9998554293547197e-06, "loss": 1.3356, "step": 3214 }, { "epoch": 0.02, "grad_norm": 4.52397968155206, "learning_rate": 1.9998553390989644e-06, "loss": 1.4718, "step": 3215 }, { "epoch": 0.02, "grad_norm": 4.345795773897772, "learning_rate": 1.9998552488150464e-06, "loss": 1.3888, "step": 3216 }, { "epoch": 0.02, "grad_norm": 4.4749772284400064, "learning_rate": 1.9998551585029663e-06, "loss": 1.4003, "step": 3217 }, { "epoch": 0.02, "grad_norm": 4.894626866747002, "learning_rate": 1.9998550681627233e-06, "loss": 1.3879, "step": 3218 }, { "epoch": 0.02, "grad_norm": 6.358238813293754, "learning_rate": 1.9998549777943176e-06, "loss": 1.3516, "step": 3219 }, { "epoch": 0.02, "grad_norm": 10.113707594899338, "learning_rate": 1.999854887397749e-06, "loss": 1.4023, "step": 3220 }, { "epoch": 0.02, "grad_norm": 5.254325564929292, "learning_rate": 1.999854796973018e-06, "loss": 1.39, "step": 3221 }, { "epoch": 0.02, "grad_norm": 4.960705725899118, "learning_rate": 1.9998547065201247e-06, "loss": 1.5043, "step": 3222 }, { "epoch": 0.02, "grad_norm": 4.358280513258368, "learning_rate": 1.9998546160390688e-06, "loss": 1.334, "step": 3223 }, { "epoch": 0.02, "grad_norm": 5.374786742690436, "learning_rate": 1.9998545255298502e-06, "loss": 1.4208, "step": 3224 }, { "epoch": 0.02, "grad_norm": 5.11312037967647, "learning_rate": 1.999854434992469e-06, "loss": 1.3896, "step": 3225 }, { "epoch": 0.02, "grad_norm": 5.056956101904353, "learning_rate": 1.9998543444269254e-06, "loss": 1.6125, "step": 3226 }, { "epoch": 0.02, "grad_norm": 5.4444491440324345, "learning_rate": 1.999854253833219e-06, "loss": 1.4266, "step": 3227 }, { "epoch": 0.02, "grad_norm": 4.562601418248393, "learning_rate": 1.99985416321135e-06, "loss": 1.3555, "step": 3228 }, { "epoch": 0.02, "grad_norm": 4.494814980020431, "learning_rate": 1.9998540725613185e-06, "loss": 1.2719, "step": 3229 }, { "epoch": 0.02, "grad_norm": 4.463077255851529, "learning_rate": 1.9998539818831245e-06, "loss": 1.4654, "step": 3230 }, { "epoch": 0.02, "grad_norm": 4.6659227436502615, "learning_rate": 1.999853891176768e-06, "loss": 1.4081, "step": 3231 }, { "epoch": 0.02, "grad_norm": 5.27576441173752, "learning_rate": 1.999853800442249e-06, "loss": 1.3004, "step": 3232 }, { "epoch": 0.02, "grad_norm": 7.132500624849133, "learning_rate": 1.9998537096795676e-06, "loss": 1.4915, "step": 3233 }, { "epoch": 0.02, "grad_norm": 5.3773789294030445, "learning_rate": 1.9998536188887233e-06, "loss": 1.2791, "step": 3234 }, { "epoch": 0.02, "grad_norm": 4.608523278365858, "learning_rate": 1.999853528069716e-06, "loss": 1.5425, "step": 3235 }, { "epoch": 0.02, "grad_norm": 5.707153438089712, "learning_rate": 1.999853437222547e-06, "loss": 1.46, "step": 3236 }, { "epoch": 0.02, "grad_norm": 4.354798489424228, "learning_rate": 1.999853346347215e-06, "loss": 1.3055, "step": 3237 }, { "epoch": 0.02, "grad_norm": 4.717858890133739, "learning_rate": 1.999853255443721e-06, "loss": 1.4242, "step": 3238 }, { "epoch": 0.02, "grad_norm": 4.881285175892339, "learning_rate": 1.9998531645120637e-06, "loss": 1.3225, "step": 3239 }, { "epoch": 0.02, "grad_norm": 4.351033621024971, "learning_rate": 1.999853073552244e-06, "loss": 1.4343, "step": 3240 }, { "epoch": 0.02, "grad_norm": 6.092636219063135, "learning_rate": 1.999852982564262e-06, "loss": 1.4519, "step": 3241 }, { "epoch": 0.02, "grad_norm": 4.523318551270846, "learning_rate": 1.9998528915481176e-06, "loss": 1.3111, "step": 3242 }, { "epoch": 0.02, "grad_norm": 5.12499436718178, "learning_rate": 1.9998528005038105e-06, "loss": 1.4857, "step": 3243 }, { "epoch": 0.02, "grad_norm": 4.379826186575379, "learning_rate": 1.999852709431341e-06, "loss": 1.5748, "step": 3244 }, { "epoch": 0.02, "grad_norm": 4.845476579476725, "learning_rate": 1.9998526183307083e-06, "loss": 1.5415, "step": 3245 }, { "epoch": 0.02, "grad_norm": 4.5232667716691735, "learning_rate": 1.999852527201914e-06, "loss": 1.4959, "step": 3246 }, { "epoch": 0.02, "grad_norm": 5.0382078094923335, "learning_rate": 1.9998524360449566e-06, "loss": 1.3551, "step": 3247 }, { "epoch": 0.02, "grad_norm": 7.089730656857905, "learning_rate": 1.999852344859837e-06, "loss": 1.4391, "step": 3248 }, { "epoch": 0.02, "grad_norm": 4.233017204929559, "learning_rate": 1.9998522536465547e-06, "loss": 1.3869, "step": 3249 }, { "epoch": 0.02, "grad_norm": 4.927879269649831, "learning_rate": 1.99985216240511e-06, "loss": 1.3307, "step": 3250 }, { "epoch": 0.02, "grad_norm": 4.8097509464845585, "learning_rate": 1.999852071135503e-06, "loss": 1.5221, "step": 3251 }, { "epoch": 0.02, "grad_norm": 4.199061944873702, "learning_rate": 1.999851979837733e-06, "loss": 1.0908, "step": 3252 }, { "epoch": 0.02, "grad_norm": 4.531291994897648, "learning_rate": 1.9998518885118007e-06, "loss": 1.3583, "step": 3253 }, { "epoch": 0.02, "grad_norm": 4.903810401327965, "learning_rate": 1.999851797157706e-06, "loss": 1.4129, "step": 3254 }, { "epoch": 0.02, "grad_norm": 7.132170562144993, "learning_rate": 1.9998517057754487e-06, "loss": 1.2676, "step": 3255 }, { "epoch": 0.02, "grad_norm": 4.252451485945251, "learning_rate": 1.999851614365029e-06, "loss": 1.3305, "step": 3256 }, { "epoch": 0.02, "grad_norm": 9.063511090876979, "learning_rate": 1.9998515229264468e-06, "loss": 1.5813, "step": 3257 }, { "epoch": 0.02, "grad_norm": 5.235824119971203, "learning_rate": 1.999851431459702e-06, "loss": 1.186, "step": 3258 }, { "epoch": 0.02, "grad_norm": 4.324167091287429, "learning_rate": 1.9998513399647946e-06, "loss": 1.3319, "step": 3259 }, { "epoch": 0.02, "grad_norm": 4.418031249539854, "learning_rate": 1.999851248441725e-06, "loss": 1.3418, "step": 3260 }, { "epoch": 0.02, "grad_norm": 4.669234391667805, "learning_rate": 1.9998511568904925e-06, "loss": 1.4365, "step": 3261 }, { "epoch": 0.02, "grad_norm": 4.883978731284588, "learning_rate": 1.999851065311098e-06, "loss": 1.2247, "step": 3262 }, { "epoch": 0.02, "grad_norm": 4.3548479130386, "learning_rate": 1.9998509737035405e-06, "loss": 1.3808, "step": 3263 }, { "epoch": 0.02, "grad_norm": 4.5380211493875, "learning_rate": 1.999850882067821e-06, "loss": 1.4246, "step": 3264 }, { "epoch": 0.02, "grad_norm": 5.00105285779209, "learning_rate": 1.999850790403939e-06, "loss": 1.1821, "step": 3265 }, { "epoch": 0.02, "grad_norm": 4.375788203793615, "learning_rate": 1.9998506987118946e-06, "loss": 1.2683, "step": 3266 }, { "epoch": 0.02, "grad_norm": 4.435134668590281, "learning_rate": 1.9998506069916874e-06, "loss": 1.3995, "step": 3267 }, { "epoch": 0.02, "grad_norm": 5.011365587056869, "learning_rate": 1.9998505152433177e-06, "loss": 1.4123, "step": 3268 }, { "epoch": 0.02, "grad_norm": 5.3076628146390945, "learning_rate": 1.999850423466786e-06, "loss": 1.5271, "step": 3269 }, { "epoch": 0.02, "grad_norm": 6.518196335665455, "learning_rate": 1.9998503316620914e-06, "loss": 1.4203, "step": 3270 }, { "epoch": 0.02, "grad_norm": 4.862304791483059, "learning_rate": 1.9998502398292344e-06, "loss": 1.3987, "step": 3271 }, { "epoch": 0.02, "grad_norm": 4.78170745299268, "learning_rate": 1.999850147968215e-06, "loss": 1.3068, "step": 3272 }, { "epoch": 0.02, "grad_norm": 7.178380281277487, "learning_rate": 1.999850056079033e-06, "loss": 1.2724, "step": 3273 }, { "epoch": 0.02, "grad_norm": 5.433887746386324, "learning_rate": 1.999849964161689e-06, "loss": 1.1814, "step": 3274 }, { "epoch": 0.02, "grad_norm": 4.528452236866192, "learning_rate": 1.9998498722161823e-06, "loss": 1.3414, "step": 3275 }, { "epoch": 0.02, "grad_norm": 5.177232219836514, "learning_rate": 1.999849780242513e-06, "loss": 1.4879, "step": 3276 }, { "epoch": 0.02, "grad_norm": 7.066602473545536, "learning_rate": 1.9998496882406816e-06, "loss": 1.5043, "step": 3277 }, { "epoch": 0.02, "grad_norm": 4.589177379962832, "learning_rate": 1.9998495962106874e-06, "loss": 1.4113, "step": 3278 }, { "epoch": 0.02, "grad_norm": 5.298228051116629, "learning_rate": 1.999849504152531e-06, "loss": 1.5326, "step": 3279 }, { "epoch": 0.02, "grad_norm": 4.762978250317543, "learning_rate": 1.999849412066212e-06, "loss": 1.3229, "step": 3280 }, { "epoch": 0.02, "grad_norm": 4.719035320919961, "learning_rate": 1.9998493199517306e-06, "loss": 1.495, "step": 3281 }, { "epoch": 0.02, "grad_norm": 4.522910759102749, "learning_rate": 1.999849227809087e-06, "loss": 1.4205, "step": 3282 }, { "epoch": 0.02, "grad_norm": 4.837186220843632, "learning_rate": 1.9998491356382808e-06, "loss": 1.4704, "step": 3283 }, { "epoch": 0.02, "grad_norm": 4.817799074894965, "learning_rate": 1.9998490434393124e-06, "loss": 1.3439, "step": 3284 }, { "epoch": 0.02, "grad_norm": 4.763809137445679, "learning_rate": 1.999848951212181e-06, "loss": 1.3933, "step": 3285 }, { "epoch": 0.02, "eval_loss": 1.6162359714508057, "eval_runtime": 4.6209, "eval_samples_per_second": 1.948, "eval_steps_per_second": 1.082, "step": 3285 }, { "epoch": 0.02, "grad_norm": 4.2310982403645685, "learning_rate": 1.9998488589568875e-06, "loss": 1.3159, "step": 3286 }, { "epoch": 0.02, "grad_norm": 4.751228665002726, "learning_rate": 1.999848766673432e-06, "loss": 1.508, "step": 3287 }, { "epoch": 0.02, "grad_norm": 5.287367204511815, "learning_rate": 1.9998486743618136e-06, "loss": 1.2603, "step": 3288 }, { "epoch": 0.02, "grad_norm": 6.191777133783223, "learning_rate": 1.9998485820220328e-06, "loss": 1.4435, "step": 3289 }, { "epoch": 0.02, "grad_norm": 4.963362551339361, "learning_rate": 1.9998484896540898e-06, "loss": 1.6377, "step": 3290 }, { "epoch": 0.02, "grad_norm": 5.380914773426925, "learning_rate": 1.9998483972579842e-06, "loss": 1.3618, "step": 3291 }, { "epoch": 0.02, "grad_norm": 4.881974168253751, "learning_rate": 1.999848304833716e-06, "loss": 1.4793, "step": 3292 }, { "epoch": 0.02, "grad_norm": 4.35338471574215, "learning_rate": 1.999848212381286e-06, "loss": 1.3686, "step": 3293 }, { "epoch": 0.02, "grad_norm": 4.647483695518025, "learning_rate": 1.999848119900693e-06, "loss": 1.3954, "step": 3294 }, { "epoch": 0.02, "grad_norm": 4.374460008441114, "learning_rate": 1.999848027391938e-06, "loss": 1.3345, "step": 3295 }, { "epoch": 0.02, "grad_norm": 5.811138428730455, "learning_rate": 1.9998479348550204e-06, "loss": 1.5838, "step": 3296 }, { "epoch": 0.02, "grad_norm": 4.982938535210265, "learning_rate": 1.9998478422899407e-06, "loss": 1.3415, "step": 3297 }, { "epoch": 0.02, "grad_norm": 5.428729068613124, "learning_rate": 1.9998477496966984e-06, "loss": 1.5737, "step": 3298 }, { "epoch": 0.02, "grad_norm": 5.419973685315894, "learning_rate": 1.9998476570752935e-06, "loss": 1.4732, "step": 3299 }, { "epoch": 0.02, "grad_norm": 4.763708028938937, "learning_rate": 1.9998475644257264e-06, "loss": 1.3344, "step": 3300 }, { "epoch": 0.02, "grad_norm": 4.49987874526447, "learning_rate": 1.999847471747997e-06, "loss": 1.3842, "step": 3301 }, { "epoch": 0.02, "grad_norm": 6.059868654394915, "learning_rate": 1.999847379042105e-06, "loss": 1.5563, "step": 3302 }, { "epoch": 0.02, "grad_norm": 5.101031294475946, "learning_rate": 1.9998472863080507e-06, "loss": 1.3156, "step": 3303 }, { "epoch": 0.02, "grad_norm": 5.04036355932879, "learning_rate": 1.9998471935458343e-06, "loss": 1.4731, "step": 3304 }, { "epoch": 0.02, "grad_norm": 7.42245956153174, "learning_rate": 1.999847100755455e-06, "loss": 1.3902, "step": 3305 }, { "epoch": 0.02, "grad_norm": 4.774733280681285, "learning_rate": 1.999847007936914e-06, "loss": 1.4806, "step": 3306 }, { "epoch": 0.02, "grad_norm": 5.071391799173309, "learning_rate": 1.99984691509021e-06, "loss": 1.3596, "step": 3307 }, { "epoch": 0.02, "grad_norm": 5.2995003638028235, "learning_rate": 1.9998468222153443e-06, "loss": 1.4474, "step": 3308 }, { "epoch": 0.02, "grad_norm": 5.3213954454836445, "learning_rate": 1.999846729312316e-06, "loss": 1.4512, "step": 3309 }, { "epoch": 0.02, "grad_norm": 5.334198218103061, "learning_rate": 1.999846636381125e-06, "loss": 1.5658, "step": 3310 }, { "epoch": 0.02, "grad_norm": 4.974385468276713, "learning_rate": 1.9998465434217723e-06, "loss": 1.4068, "step": 3311 }, { "epoch": 0.02, "grad_norm": 4.891182503439006, "learning_rate": 1.999846450434257e-06, "loss": 1.321, "step": 3312 }, { "epoch": 0.02, "grad_norm": 5.359459633381728, "learning_rate": 1.999846357418579e-06, "loss": 1.428, "step": 3313 }, { "epoch": 0.02, "grad_norm": 6.035895559789005, "learning_rate": 1.999846264374739e-06, "loss": 1.2147, "step": 3314 }, { "epoch": 0.02, "grad_norm": 4.397355468592714, "learning_rate": 1.999846171302736e-06, "loss": 1.3722, "step": 3315 }, { "epoch": 0.02, "grad_norm": 7.155669309774268, "learning_rate": 1.9998460782025713e-06, "loss": 1.3978, "step": 3316 }, { "epoch": 0.02, "grad_norm": 4.757021430848576, "learning_rate": 1.9998459850742443e-06, "loss": 1.5465, "step": 3317 }, { "epoch": 0.02, "grad_norm": 4.786149548595665, "learning_rate": 1.9998458919177547e-06, "loss": 1.41, "step": 3318 }, { "epoch": 0.02, "grad_norm": 4.289680760432469, "learning_rate": 1.9998457987331026e-06, "loss": 1.3994, "step": 3319 }, { "epoch": 0.02, "grad_norm": 8.633968114607555, "learning_rate": 1.9998457055202887e-06, "loss": 1.5452, "step": 3320 }, { "epoch": 0.02, "grad_norm": 4.513494336305803, "learning_rate": 1.9998456122793123e-06, "loss": 1.3446, "step": 3321 }, { "epoch": 0.02, "grad_norm": 5.396311638031696, "learning_rate": 1.9998455190101732e-06, "loss": 1.4307, "step": 3322 }, { "epoch": 0.02, "grad_norm": 12.224345682379019, "learning_rate": 1.999845425712872e-06, "loss": 1.3998, "step": 3323 }, { "epoch": 0.02, "grad_norm": 5.037124480361044, "learning_rate": 1.9998453323874087e-06, "loss": 1.3464, "step": 3324 }, { "epoch": 0.02, "grad_norm": 4.619878980217735, "learning_rate": 1.999845239033783e-06, "loss": 1.3604, "step": 3325 }, { "epoch": 0.02, "grad_norm": 4.480008801845805, "learning_rate": 1.999845145651995e-06, "loss": 1.3031, "step": 3326 }, { "epoch": 0.02, "grad_norm": 4.454286328620764, "learning_rate": 1.9998450522420446e-06, "loss": 1.3064, "step": 3327 }, { "epoch": 0.02, "grad_norm": 4.57634704126347, "learning_rate": 1.999844958803932e-06, "loss": 1.4419, "step": 3328 }, { "epoch": 0.02, "grad_norm": 4.525977006177297, "learning_rate": 1.9998448653376565e-06, "loss": 1.4284, "step": 3329 }, { "epoch": 0.02, "grad_norm": 4.871951894152188, "learning_rate": 1.9998447718432194e-06, "loss": 1.5416, "step": 3330 }, { "epoch": 0.02, "grad_norm": 5.355722208034374, "learning_rate": 1.9998446783206197e-06, "loss": 1.3012, "step": 3331 }, { "epoch": 0.02, "grad_norm": 4.768559786700981, "learning_rate": 1.999844584769858e-06, "loss": 1.3515, "step": 3332 }, { "epoch": 0.02, "grad_norm": 4.700472874326849, "learning_rate": 1.9998444911909335e-06, "loss": 1.406, "step": 3333 }, { "epoch": 0.02, "grad_norm": 4.6108709253998414, "learning_rate": 1.999844397583847e-06, "loss": 1.2965, "step": 3334 }, { "epoch": 0.02, "grad_norm": 4.119228316339279, "learning_rate": 1.9998443039485983e-06, "loss": 1.2358, "step": 3335 }, { "epoch": 0.02, "grad_norm": 4.6431919347398525, "learning_rate": 1.999844210285187e-06, "loss": 1.3773, "step": 3336 }, { "epoch": 0.02, "grad_norm": 5.036505345315098, "learning_rate": 1.9998441165936137e-06, "loss": 1.426, "step": 3337 }, { "epoch": 0.02, "grad_norm": 4.916331951067264, "learning_rate": 1.999844022873878e-06, "loss": 1.4319, "step": 3338 }, { "epoch": 0.02, "grad_norm": 4.502760853285271, "learning_rate": 1.99984392912598e-06, "loss": 1.3404, "step": 3339 }, { "epoch": 0.02, "grad_norm": 4.674212137830531, "learning_rate": 1.9998438353499197e-06, "loss": 1.3524, "step": 3340 }, { "epoch": 0.02, "grad_norm": 4.969002536880517, "learning_rate": 1.9998437415456972e-06, "loss": 1.4791, "step": 3341 }, { "epoch": 0.02, "grad_norm": 5.336062177765582, "learning_rate": 1.9998436477133126e-06, "loss": 1.5062, "step": 3342 }, { "epoch": 0.02, "grad_norm": 6.803443625291179, "learning_rate": 1.9998435538527655e-06, "loss": 1.5203, "step": 3343 }, { "epoch": 0.02, "grad_norm": 4.519808932086315, "learning_rate": 1.999843459964056e-06, "loss": 1.3434, "step": 3344 }, { "epoch": 0.02, "grad_norm": 4.916582018340864, "learning_rate": 1.9998433660471847e-06, "loss": 1.2779, "step": 3345 }, { "epoch": 0.02, "grad_norm": 4.597694435049481, "learning_rate": 1.9998432721021506e-06, "loss": 1.4044, "step": 3346 }, { "epoch": 0.02, "grad_norm": 4.780005071775639, "learning_rate": 1.9998431781289544e-06, "loss": 1.1947, "step": 3347 }, { "epoch": 0.02, "grad_norm": 4.419392700972962, "learning_rate": 1.999843084127596e-06, "loss": 1.3743, "step": 3348 }, { "epoch": 0.02, "grad_norm": 4.791430996135701, "learning_rate": 1.9998429900980756e-06, "loss": 1.2246, "step": 3349 }, { "epoch": 0.02, "grad_norm": 4.482748079898622, "learning_rate": 1.9998428960403925e-06, "loss": 1.2955, "step": 3350 }, { "epoch": 0.02, "grad_norm": 4.958560367082578, "learning_rate": 1.9998428019545477e-06, "loss": 1.3907, "step": 3351 }, { "epoch": 0.02, "grad_norm": 5.686365686539982, "learning_rate": 1.9998427078405404e-06, "loss": 1.4705, "step": 3352 }, { "epoch": 0.02, "grad_norm": 4.865648897158208, "learning_rate": 1.9998426136983704e-06, "loss": 1.4142, "step": 3353 }, { "epoch": 0.02, "grad_norm": 4.720528487925224, "learning_rate": 1.9998425195280387e-06, "loss": 1.3621, "step": 3354 }, { "epoch": 0.02, "grad_norm": 6.23799082246452, "learning_rate": 1.999842425329545e-06, "loss": 1.4965, "step": 3355 }, { "epoch": 0.02, "grad_norm": 4.338394874264906, "learning_rate": 1.999842331102888e-06, "loss": 1.4025, "step": 3356 }, { "epoch": 0.02, "grad_norm": 22.872377507742062, "learning_rate": 1.99984223684807e-06, "loss": 1.2324, "step": 3357 }, { "epoch": 0.02, "grad_norm": 5.052623717517635, "learning_rate": 1.9998421425650892e-06, "loss": 1.1698, "step": 3358 }, { "epoch": 0.02, "eval_loss": 1.615498661994934, "eval_runtime": 4.6186, "eval_samples_per_second": 1.949, "eval_steps_per_second": 1.083, "step": 3358 }, { "epoch": 0.02, "grad_norm": 4.403452284708548, "learning_rate": 1.999842048253946e-06, "loss": 1.4041, "step": 3359 }, { "epoch": 0.02, "grad_norm": 5.612691456341564, "learning_rate": 1.999841953914641e-06, "loss": 1.4215, "step": 3360 }, { "epoch": 0.02, "grad_norm": 4.5286749608866055, "learning_rate": 1.9998418595471733e-06, "loss": 1.2596, "step": 3361 }, { "epoch": 0.02, "grad_norm": 4.675637414315521, "learning_rate": 1.9998417651515436e-06, "loss": 1.4302, "step": 3362 }, { "epoch": 0.02, "grad_norm": 4.99356723093498, "learning_rate": 1.9998416707277517e-06, "loss": 1.5043, "step": 3363 }, { "epoch": 0.02, "grad_norm": 5.346376765762083, "learning_rate": 1.9998415762757977e-06, "loss": 1.4411, "step": 3364 }, { "epoch": 0.02, "grad_norm": 4.465445688844873, "learning_rate": 1.9998414817956815e-06, "loss": 1.3665, "step": 3365 }, { "epoch": 0.02, "grad_norm": 4.947373911092748, "learning_rate": 1.9998413872874027e-06, "loss": 1.4684, "step": 3366 }, { "epoch": 0.02, "grad_norm": 5.336576397977949, "learning_rate": 1.9998412927509622e-06, "loss": 1.4559, "step": 3367 }, { "epoch": 0.02, "grad_norm": 5.554379287052272, "learning_rate": 1.999841198186359e-06, "loss": 1.3741, "step": 3368 }, { "epoch": 0.02, "grad_norm": 4.691693057261446, "learning_rate": 1.999841103593594e-06, "loss": 1.3809, "step": 3369 }, { "epoch": 0.02, "grad_norm": 4.841766853561151, "learning_rate": 1.9998410089726666e-06, "loss": 1.4358, "step": 3370 }, { "epoch": 0.02, "grad_norm": 4.445225429854286, "learning_rate": 1.999840914323577e-06, "loss": 1.2202, "step": 3371 }, { "epoch": 0.02, "grad_norm": 4.587434031150207, "learning_rate": 1.9998408196463254e-06, "loss": 1.3254, "step": 3372 }, { "epoch": 0.02, "grad_norm": 4.635518865319935, "learning_rate": 1.9998407249409115e-06, "loss": 1.4297, "step": 3373 }, { "epoch": 0.02, "grad_norm": 4.693662361831059, "learning_rate": 1.9998406302073356e-06, "loss": 1.5567, "step": 3374 }, { "epoch": 0.02, "grad_norm": 4.86578821971193, "learning_rate": 1.999840535445597e-06, "loss": 1.5298, "step": 3375 }, { "epoch": 0.02, "grad_norm": 5.196254708579967, "learning_rate": 1.9998404406556967e-06, "loss": 1.4366, "step": 3376 }, { "epoch": 0.02, "grad_norm": 5.704264956948557, "learning_rate": 1.999840345837634e-06, "loss": 1.2585, "step": 3377 }, { "epoch": 0.02, "grad_norm": 7.123480430806445, "learning_rate": 1.9998402509914093e-06, "loss": 1.4561, "step": 3378 }, { "epoch": 0.02, "grad_norm": 4.651652795610573, "learning_rate": 1.999840156117022e-06, "loss": 1.422, "step": 3379 }, { "epoch": 0.02, "grad_norm": 4.895531346419424, "learning_rate": 1.999840061214473e-06, "loss": 1.39, "step": 3380 }, { "epoch": 0.02, "grad_norm": 4.431768446645135, "learning_rate": 1.9998399662837614e-06, "loss": 1.2118, "step": 3381 }, { "epoch": 0.02, "grad_norm": 4.73997458177594, "learning_rate": 1.999839871324888e-06, "loss": 1.3665, "step": 3382 }, { "epoch": 0.02, "grad_norm": 4.995671187040337, "learning_rate": 1.9998397763378524e-06, "loss": 1.3142, "step": 3383 }, { "epoch": 0.02, "grad_norm": 5.272798024995659, "learning_rate": 1.9998396813226545e-06, "loss": 1.3689, "step": 3384 }, { "epoch": 0.02, "grad_norm": 4.4670178221368655, "learning_rate": 1.9998395862792944e-06, "loss": 1.4755, "step": 3385 }, { "epoch": 0.02, "grad_norm": 4.832806175109676, "learning_rate": 1.999839491207772e-06, "loss": 1.4044, "step": 3386 }, { "epoch": 0.02, "grad_norm": 4.27660795435882, "learning_rate": 1.9998393961080883e-06, "loss": 1.3548, "step": 3387 }, { "epoch": 0.02, "grad_norm": 4.819442397568239, "learning_rate": 1.9998393009802417e-06, "loss": 1.4295, "step": 3388 }, { "epoch": 0.02, "grad_norm": 4.911584590161657, "learning_rate": 1.999839205824233e-06, "loss": 1.5393, "step": 3389 }, { "epoch": 0.02, "grad_norm": 5.255900800064722, "learning_rate": 1.9998391106400622e-06, "loss": 1.4227, "step": 3390 }, { "epoch": 0.02, "grad_norm": 4.46927240762758, "learning_rate": 1.9998390154277293e-06, "loss": 1.3794, "step": 3391 }, { "epoch": 0.02, "grad_norm": 4.445886168494522, "learning_rate": 1.999838920187234e-06, "loss": 1.3245, "step": 3392 }, { "epoch": 0.02, "grad_norm": 4.605358505461649, "learning_rate": 1.9998388249185773e-06, "loss": 1.4412, "step": 3393 }, { "epoch": 0.02, "grad_norm": 4.535610642306297, "learning_rate": 1.999838729621758e-06, "loss": 1.5244, "step": 3394 }, { "epoch": 0.02, "grad_norm": 7.61293667721256, "learning_rate": 1.9998386342967767e-06, "loss": 1.3762, "step": 3395 }, { "epoch": 0.02, "grad_norm": 5.576615477919866, "learning_rate": 1.999838538943633e-06, "loss": 1.1744, "step": 3396 }, { "epoch": 0.02, "grad_norm": 4.982158065266377, "learning_rate": 1.999838443562327e-06, "loss": 1.4918, "step": 3397 }, { "epoch": 0.02, "grad_norm": 4.380167900793968, "learning_rate": 1.9998383481528595e-06, "loss": 1.4074, "step": 3398 }, { "epoch": 0.02, "grad_norm": 4.950258411072331, "learning_rate": 1.9998382527152297e-06, "loss": 1.4923, "step": 3399 }, { "epoch": 0.02, "grad_norm": 4.592714907688114, "learning_rate": 1.9998381572494374e-06, "loss": 1.3628, "step": 3400 }, { "epoch": 0.02, "grad_norm": 4.424967211816432, "learning_rate": 1.9998380617554833e-06, "loss": 1.4329, "step": 3401 }, { "epoch": 0.02, "grad_norm": 4.525704926126062, "learning_rate": 1.999837966233367e-06, "loss": 1.412, "step": 3402 }, { "epoch": 0.02, "grad_norm": 4.7381010367112735, "learning_rate": 1.9998378706830887e-06, "loss": 1.4563, "step": 3403 }, { "epoch": 0.02, "grad_norm": 5.357820102817708, "learning_rate": 1.999837775104648e-06, "loss": 1.392, "step": 3404 }, { "epoch": 0.02, "grad_norm": 4.877459475025764, "learning_rate": 1.9998376794980455e-06, "loss": 1.446, "step": 3405 }, { "epoch": 0.02, "grad_norm": 4.736389873828967, "learning_rate": 1.999837583863281e-06, "loss": 1.3899, "step": 3406 }, { "epoch": 0.02, "grad_norm": 4.6136258426518655, "learning_rate": 1.999837488200354e-06, "loss": 1.4183, "step": 3407 }, { "epoch": 0.02, "grad_norm": 4.573629869371029, "learning_rate": 1.9998373925092654e-06, "loss": 1.4148, "step": 3408 }, { "epoch": 0.02, "grad_norm": 5.331117554718104, "learning_rate": 1.999837296790014e-06, "loss": 1.4761, "step": 3409 }, { "epoch": 0.02, "grad_norm": 6.167559123390286, "learning_rate": 1.999837201042601e-06, "loss": 1.374, "step": 3410 }, { "epoch": 0.02, "grad_norm": 5.348496307914284, "learning_rate": 1.999837105267026e-06, "loss": 1.3786, "step": 3411 }, { "epoch": 0.02, "grad_norm": 5.130052691622517, "learning_rate": 1.9998370094632887e-06, "loss": 1.4301, "step": 3412 }, { "epoch": 0.02, "grad_norm": 4.497336313396663, "learning_rate": 1.9998369136313892e-06, "loss": 1.24, "step": 3413 }, { "epoch": 0.02, "grad_norm": 5.813744620601688, "learning_rate": 1.999836817771328e-06, "loss": 1.4909, "step": 3414 }, { "epoch": 0.02, "grad_norm": 4.591174887700467, "learning_rate": 1.9998367218831043e-06, "loss": 1.4375, "step": 3415 }, { "epoch": 0.02, "grad_norm": 4.522216046172172, "learning_rate": 1.9998366259667188e-06, "loss": 1.3572, "step": 3416 }, { "epoch": 0.02, "grad_norm": 4.717345593583763, "learning_rate": 1.999836530022171e-06, "loss": 1.4897, "step": 3417 }, { "epoch": 0.02, "grad_norm": 5.321027044092839, "learning_rate": 1.9998364340494614e-06, "loss": 1.4912, "step": 3418 }, { "epoch": 0.02, "grad_norm": 4.7605877409336195, "learning_rate": 1.99983633804859e-06, "loss": 1.4961, "step": 3419 }, { "epoch": 0.02, "grad_norm": 4.4178839583225775, "learning_rate": 1.999836242019556e-06, "loss": 1.2966, "step": 3420 }, { "epoch": 0.02, "grad_norm": 5.056171291505301, "learning_rate": 1.99983614596236e-06, "loss": 1.3807, "step": 3421 }, { "epoch": 0.02, "grad_norm": 5.4340753688988075, "learning_rate": 1.9998360498770024e-06, "loss": 1.3203, "step": 3422 }, { "epoch": 0.02, "grad_norm": 5.057972934945415, "learning_rate": 1.9998359537634822e-06, "loss": 1.4758, "step": 3423 }, { "epoch": 0.02, "grad_norm": 5.829632088605847, "learning_rate": 1.9998358576218004e-06, "loss": 1.517, "step": 3424 }, { "epoch": 0.02, "grad_norm": 4.85482687780751, "learning_rate": 1.999835761451956e-06, "loss": 1.4677, "step": 3425 }, { "epoch": 0.02, "grad_norm": 5.922500655311902, "learning_rate": 1.9998356652539498e-06, "loss": 1.3043, "step": 3426 }, { "epoch": 0.02, "grad_norm": 4.588250525378909, "learning_rate": 1.999835569027782e-06, "loss": 1.3587, "step": 3427 }, { "epoch": 0.02, "grad_norm": 4.382966019888421, "learning_rate": 1.9998354727734514e-06, "loss": 1.255, "step": 3428 }, { "epoch": 0.02, "grad_norm": 4.920889632741727, "learning_rate": 1.999835376490959e-06, "loss": 1.3713, "step": 3429 }, { "epoch": 0.02, "grad_norm": 4.80574963169274, "learning_rate": 1.9998352801803045e-06, "loss": 1.6275, "step": 3430 }, { "epoch": 0.02, "grad_norm": 5.028070166963233, "learning_rate": 1.999835183841488e-06, "loss": 1.4875, "step": 3431 }, { "epoch": 0.02, "eval_loss": 1.6133036613464355, "eval_runtime": 4.6357, "eval_samples_per_second": 1.941, "eval_steps_per_second": 1.079, "step": 3431 }, { "epoch": 0.02, "grad_norm": 4.434318252171852, "learning_rate": 1.9998350874745098e-06, "loss": 1.363, "step": 3432 }, { "epoch": 0.02, "grad_norm": 6.623393165669105, "learning_rate": 1.9998349910793694e-06, "loss": 1.5814, "step": 3433 }, { "epoch": 0.02, "grad_norm": 5.456113027521206, "learning_rate": 1.999834894656067e-06, "loss": 1.2424, "step": 3434 }, { "epoch": 0.02, "grad_norm": 5.519733967210898, "learning_rate": 1.9998347982046026e-06, "loss": 1.4818, "step": 3435 }, { "epoch": 0.02, "grad_norm": 4.648419524976711, "learning_rate": 1.9998347017249762e-06, "loss": 1.2958, "step": 3436 }, { "epoch": 0.02, "grad_norm": 4.479293176004785, "learning_rate": 1.9998346052171877e-06, "loss": 1.3048, "step": 3437 }, { "epoch": 0.02, "grad_norm": 4.723612084510464, "learning_rate": 1.999834508681237e-06, "loss": 1.4803, "step": 3438 }, { "epoch": 0.02, "grad_norm": 4.379602061766586, "learning_rate": 1.9998344121171245e-06, "loss": 1.3367, "step": 3439 }, { "epoch": 0.02, "grad_norm": 4.763559064357727, "learning_rate": 1.99983431552485e-06, "loss": 1.5105, "step": 3440 }, { "epoch": 0.02, "grad_norm": 7.923117534883834, "learning_rate": 1.9998342189044136e-06, "loss": 1.4361, "step": 3441 }, { "epoch": 0.02, "grad_norm": 4.714745220308912, "learning_rate": 1.9998341222558148e-06, "loss": 1.5172, "step": 3442 }, { "epoch": 0.02, "grad_norm": 7.225844294787685, "learning_rate": 1.999834025579054e-06, "loss": 1.4366, "step": 3443 }, { "epoch": 0.02, "grad_norm": 4.3720897698041625, "learning_rate": 1.999833928874132e-06, "loss": 1.3114, "step": 3444 }, { "epoch": 0.02, "grad_norm": 5.468815321569939, "learning_rate": 1.9998338321410473e-06, "loss": 1.3577, "step": 3445 }, { "epoch": 0.02, "grad_norm": 5.369771276792973, "learning_rate": 1.9998337353798007e-06, "loss": 1.4633, "step": 3446 }, { "epoch": 0.02, "grad_norm": 5.373295136247841, "learning_rate": 1.9998336385903923e-06, "loss": 1.3569, "step": 3447 }, { "epoch": 0.02, "grad_norm": 4.775224501462429, "learning_rate": 1.9998335417728218e-06, "loss": 1.3241, "step": 3448 }, { "epoch": 0.02, "grad_norm": 5.063351567591072, "learning_rate": 1.999833444927089e-06, "loss": 1.4984, "step": 3449 }, { "epoch": 0.02, "grad_norm": 5.460324129913872, "learning_rate": 1.9998333480531947e-06, "loss": 1.4443, "step": 3450 }, { "epoch": 0.02, "grad_norm": 4.49405386456319, "learning_rate": 1.999833251151138e-06, "loss": 1.3851, "step": 3451 }, { "epoch": 0.02, "grad_norm": 4.561632118793946, "learning_rate": 1.99983315422092e-06, "loss": 1.3756, "step": 3452 }, { "epoch": 0.02, "grad_norm": 5.787428115911711, "learning_rate": 1.9998330572625394e-06, "loss": 1.2516, "step": 3453 }, { "epoch": 0.02, "grad_norm": 4.2654374946244955, "learning_rate": 1.999832960275997e-06, "loss": 1.3323, "step": 3454 }, { "epoch": 0.02, "grad_norm": 4.943112051244451, "learning_rate": 1.9998328632612925e-06, "loss": 1.433, "step": 3455 }, { "epoch": 0.02, "grad_norm": 6.64758931877515, "learning_rate": 1.999832766218426e-06, "loss": 1.3929, "step": 3456 }, { "epoch": 0.02, "grad_norm": 5.004313093362644, "learning_rate": 1.999832669147398e-06, "loss": 1.3248, "step": 3457 }, { "epoch": 0.02, "grad_norm": 5.974142973723598, "learning_rate": 1.9998325720482075e-06, "loss": 1.4423, "step": 3458 }, { "epoch": 0.02, "grad_norm": 6.115923629555795, "learning_rate": 1.9998324749208554e-06, "loss": 1.3141, "step": 3459 }, { "epoch": 0.02, "grad_norm": 5.29889351882071, "learning_rate": 1.999832377765341e-06, "loss": 1.5085, "step": 3460 }, { "epoch": 0.02, "grad_norm": 4.933579769870536, "learning_rate": 1.999832280581665e-06, "loss": 1.3622, "step": 3461 }, { "epoch": 0.02, "grad_norm": 4.54785822775587, "learning_rate": 1.9998321833698267e-06, "loss": 1.4151, "step": 3462 }, { "epoch": 0.02, "grad_norm": 4.464340495701981, "learning_rate": 1.999832086129827e-06, "loss": 1.3599, "step": 3463 }, { "epoch": 0.02, "grad_norm": 5.73588324623857, "learning_rate": 1.999831988861665e-06, "loss": 1.6014, "step": 3464 }, { "epoch": 0.02, "grad_norm": 4.4591831342702966, "learning_rate": 1.9998318915653408e-06, "loss": 1.322, "step": 3465 }, { "epoch": 0.02, "grad_norm": 4.723127658638915, "learning_rate": 1.9998317942408553e-06, "loss": 1.5451, "step": 3466 }, { "epoch": 0.02, "grad_norm": 4.852467128721474, "learning_rate": 1.9998316968882073e-06, "loss": 1.301, "step": 3467 }, { "epoch": 0.02, "grad_norm": 4.810154485536803, "learning_rate": 1.9998315995073976e-06, "loss": 1.421, "step": 3468 }, { "epoch": 0.02, "grad_norm": 4.355440555741922, "learning_rate": 1.999831502098426e-06, "loss": 1.3608, "step": 3469 }, { "epoch": 0.02, "grad_norm": 4.2600779975771434, "learning_rate": 1.9998314046612925e-06, "loss": 1.3282, "step": 3470 }, { "epoch": 0.02, "grad_norm": 4.713840798819233, "learning_rate": 1.9998313071959967e-06, "loss": 1.469, "step": 3471 }, { "epoch": 0.02, "grad_norm": 4.40703476488466, "learning_rate": 1.9998312097025396e-06, "loss": 1.195, "step": 3472 }, { "epoch": 0.02, "grad_norm": 4.707652605308421, "learning_rate": 1.99983111218092e-06, "loss": 1.401, "step": 3473 }, { "epoch": 0.02, "grad_norm": 4.9351410023359135, "learning_rate": 1.9998310146311386e-06, "loss": 1.5518, "step": 3474 }, { "epoch": 0.02, "grad_norm": 7.835268364237103, "learning_rate": 1.999830917053196e-06, "loss": 1.4963, "step": 3475 }, { "epoch": 0.02, "grad_norm": 4.461769170842469, "learning_rate": 1.9998308194470907e-06, "loss": 1.3913, "step": 3476 }, { "epoch": 0.02, "grad_norm": 4.986732482048744, "learning_rate": 1.9998307218128237e-06, "loss": 1.4655, "step": 3477 }, { "epoch": 0.02, "grad_norm": 5.103030642318946, "learning_rate": 1.999830624150395e-06, "loss": 1.4183, "step": 3478 }, { "epoch": 0.02, "grad_norm": 5.347945323080289, "learning_rate": 1.999830526459804e-06, "loss": 1.5263, "step": 3479 }, { "epoch": 0.02, "grad_norm": 4.836255099815674, "learning_rate": 1.999830428741051e-06, "loss": 1.3183, "step": 3480 }, { "epoch": 0.02, "grad_norm": 4.36280200083679, "learning_rate": 1.9998303309941367e-06, "loss": 1.4045, "step": 3481 }, { "epoch": 0.02, "grad_norm": 4.277029361572978, "learning_rate": 1.9998302332190603e-06, "loss": 1.2107, "step": 3482 }, { "epoch": 0.02, "grad_norm": 4.367758928198617, "learning_rate": 1.9998301354158217e-06, "loss": 1.3907, "step": 3483 }, { "epoch": 0.02, "grad_norm": 4.39073373168116, "learning_rate": 1.9998300375844213e-06, "loss": 1.2528, "step": 3484 }, { "epoch": 0.02, "grad_norm": 4.626910873986676, "learning_rate": 1.9998299397248592e-06, "loss": 1.3984, "step": 3485 }, { "epoch": 0.02, "grad_norm": 4.690894198671245, "learning_rate": 1.9998298418371354e-06, "loss": 1.463, "step": 3486 }, { "epoch": 0.02, "grad_norm": 4.2521261877601555, "learning_rate": 1.9998297439212494e-06, "loss": 1.2599, "step": 3487 }, { "epoch": 0.02, "grad_norm": 9.954516216479742, "learning_rate": 1.9998296459772013e-06, "loss": 1.3263, "step": 3488 }, { "epoch": 0.02, "grad_norm": 4.64293902683026, "learning_rate": 1.999829548004992e-06, "loss": 1.4007, "step": 3489 }, { "epoch": 0.02, "grad_norm": 4.7871583563034035, "learning_rate": 1.9998294500046204e-06, "loss": 1.4429, "step": 3490 }, { "epoch": 0.02, "grad_norm": 4.615463851562626, "learning_rate": 1.999829351976087e-06, "loss": 1.5247, "step": 3491 }, { "epoch": 0.02, "grad_norm": 4.704943808755736, "learning_rate": 1.9998292539193916e-06, "loss": 1.576, "step": 3492 }, { "epoch": 0.02, "grad_norm": 5.7783079479655655, "learning_rate": 1.9998291558345344e-06, "loss": 1.4055, "step": 3493 }, { "epoch": 0.02, "grad_norm": 5.441145271123304, "learning_rate": 1.9998290577215155e-06, "loss": 1.5482, "step": 3494 }, { "epoch": 0.02, "grad_norm": 9.36043045218442, "learning_rate": 1.9998289595803345e-06, "loss": 1.3262, "step": 3495 }, { "epoch": 0.02, "grad_norm": 4.745604478290225, "learning_rate": 1.9998288614109917e-06, "loss": 1.5021, "step": 3496 }, { "epoch": 0.02, "grad_norm": 4.4255945297536385, "learning_rate": 1.999828763213487e-06, "loss": 1.4826, "step": 3497 }, { "epoch": 0.02, "grad_norm": 4.268280892701323, "learning_rate": 1.999828664987821e-06, "loss": 1.3277, "step": 3498 }, { "epoch": 0.02, "grad_norm": 4.925308919808803, "learning_rate": 1.9998285667339926e-06, "loss": 1.3895, "step": 3499 }, { "epoch": 0.02, "grad_norm": 4.388627815177559, "learning_rate": 1.9998284684520024e-06, "loss": 1.1406, "step": 3500 }, { "epoch": 0.02, "grad_norm": 4.7990704395075205, "learning_rate": 1.99982837014185e-06, "loss": 1.5664, "step": 3501 }, { "epoch": 0.02, "grad_norm": 5.069150584999445, "learning_rate": 1.9998282718035366e-06, "loss": 1.5177, "step": 3502 }, { "epoch": 0.02, "grad_norm": 4.792701471795762, "learning_rate": 1.999828173437061e-06, "loss": 1.4161, "step": 3503 }, { "epoch": 0.02, "grad_norm": 4.535667453487885, "learning_rate": 1.999828075042424e-06, "loss": 1.3286, "step": 3504 }, { "epoch": 0.02, "eval_loss": 1.612628698348999, "eval_runtime": 4.6365, "eval_samples_per_second": 1.941, "eval_steps_per_second": 1.078, "step": 3504 }, { "epoch": 0.02, "grad_norm": 4.836032937210056, "learning_rate": 1.9998279766196242e-06, "loss": 1.4098, "step": 3505 }, { "epoch": 0.02, "grad_norm": 4.780070229048976, "learning_rate": 1.999827878168663e-06, "loss": 1.4433, "step": 3506 }, { "epoch": 0.02, "grad_norm": 5.2390642232027655, "learning_rate": 1.9998277796895403e-06, "loss": 1.3572, "step": 3507 }, { "epoch": 0.02, "grad_norm": 4.973359117911899, "learning_rate": 1.999827681182255e-06, "loss": 1.3459, "step": 3508 }, { "epoch": 0.02, "grad_norm": 4.378947132862404, "learning_rate": 1.9998275826468085e-06, "loss": 1.4072, "step": 3509 }, { "epoch": 0.02, "grad_norm": 5.58519122967338, "learning_rate": 1.9998274840832003e-06, "loss": 1.4547, "step": 3510 }, { "epoch": 0.02, "grad_norm": 4.718146444105029, "learning_rate": 1.99982738549143e-06, "loss": 1.3933, "step": 3511 }, { "epoch": 0.02, "grad_norm": 4.433816849751808, "learning_rate": 1.9998272868714978e-06, "loss": 1.5081, "step": 3512 }, { "epoch": 0.02, "grad_norm": 4.668728068958253, "learning_rate": 1.999827188223404e-06, "loss": 1.4535, "step": 3513 }, { "epoch": 0.02, "grad_norm": 5.205098278485999, "learning_rate": 1.9998270895471484e-06, "loss": 1.3971, "step": 3514 }, { "epoch": 0.02, "grad_norm": 4.2812240461470905, "learning_rate": 1.999826990842731e-06, "loss": 1.2847, "step": 3515 }, { "epoch": 0.02, "grad_norm": 4.421468940513477, "learning_rate": 1.9998268921101516e-06, "loss": 1.2344, "step": 3516 }, { "epoch": 0.02, "grad_norm": 5.240697700729821, "learning_rate": 1.999826793349411e-06, "loss": 1.3846, "step": 3517 }, { "epoch": 0.02, "grad_norm": 4.624113047494641, "learning_rate": 1.999826694560508e-06, "loss": 1.5729, "step": 3518 }, { "epoch": 0.02, "grad_norm": 4.125813300538199, "learning_rate": 1.999826595743443e-06, "loss": 1.3067, "step": 3519 }, { "epoch": 0.02, "grad_norm": 4.72485279895408, "learning_rate": 1.9998264968982165e-06, "loss": 1.4624, "step": 3520 }, { "epoch": 0.02, "grad_norm": 7.440580236377528, "learning_rate": 1.9998263980248284e-06, "loss": 1.3921, "step": 3521 }, { "epoch": 0.02, "grad_norm": 5.506465574269989, "learning_rate": 1.9998262991232786e-06, "loss": 1.3429, "step": 3522 }, { "epoch": 0.02, "grad_norm": 4.892687291895052, "learning_rate": 1.9998262001935666e-06, "loss": 1.4849, "step": 3523 }, { "epoch": 0.02, "grad_norm": 4.714503092123628, "learning_rate": 1.999826101235693e-06, "loss": 1.3495, "step": 3524 }, { "epoch": 0.02, "grad_norm": 6.080297383574053, "learning_rate": 1.9998260022496574e-06, "loss": 1.4098, "step": 3525 }, { "epoch": 0.02, "grad_norm": 4.672363930477558, "learning_rate": 1.9998259032354607e-06, "loss": 1.1808, "step": 3526 }, { "epoch": 0.02, "grad_norm": 4.762254121900801, "learning_rate": 1.999825804193102e-06, "loss": 1.2784, "step": 3527 }, { "epoch": 0.02, "grad_norm": 5.0090886444132465, "learning_rate": 1.999825705122581e-06, "loss": 1.3983, "step": 3528 }, { "epoch": 0.02, "grad_norm": 4.541188950258432, "learning_rate": 1.9998256060238984e-06, "loss": 1.4821, "step": 3529 }, { "epoch": 0.02, "grad_norm": 4.508252601462831, "learning_rate": 1.9998255068970544e-06, "loss": 1.4049, "step": 3530 }, { "epoch": 0.02, "grad_norm": 4.384733739344799, "learning_rate": 1.9998254077420486e-06, "loss": 1.3762, "step": 3531 }, { "epoch": 0.02, "grad_norm": 4.91582726748242, "learning_rate": 1.9998253085588806e-06, "loss": 1.4418, "step": 3532 }, { "epoch": 0.02, "grad_norm": 4.228789208224146, "learning_rate": 1.9998252093475513e-06, "loss": 1.2963, "step": 3533 }, { "epoch": 0.02, "grad_norm": 4.697321495816711, "learning_rate": 1.99982511010806e-06, "loss": 1.4528, "step": 3534 }, { "epoch": 0.02, "grad_norm": 4.803901064568725, "learning_rate": 1.9998250108404072e-06, "loss": 1.569, "step": 3535 }, { "epoch": 0.02, "grad_norm": 4.353829814405607, "learning_rate": 1.9998249115445924e-06, "loss": 1.3503, "step": 3536 }, { "epoch": 0.02, "grad_norm": 4.469444434313929, "learning_rate": 1.9998248122206158e-06, "loss": 1.485, "step": 3537 }, { "epoch": 0.02, "grad_norm": 4.135382837905591, "learning_rate": 1.999824712868478e-06, "loss": 1.3355, "step": 3538 }, { "epoch": 0.02, "grad_norm": 5.198880866777614, "learning_rate": 1.999824613488178e-06, "loss": 1.469, "step": 3539 }, { "epoch": 0.02, "grad_norm": 5.2331303310285415, "learning_rate": 1.999824514079716e-06, "loss": 1.3518, "step": 3540 }, { "epoch": 0.02, "grad_norm": 4.8429386253121125, "learning_rate": 1.9998244146430926e-06, "loss": 1.4942, "step": 3541 }, { "epoch": 0.02, "grad_norm": 4.574611939532659, "learning_rate": 1.9998243151783077e-06, "loss": 1.4268, "step": 3542 }, { "epoch": 0.02, "grad_norm": 4.94932612125061, "learning_rate": 1.9998242156853608e-06, "loss": 1.6209, "step": 3543 }, { "epoch": 0.02, "grad_norm": 6.006103951828245, "learning_rate": 1.999824116164252e-06, "loss": 1.2351, "step": 3544 }, { "epoch": 0.02, "grad_norm": 4.166042167548124, "learning_rate": 1.999824016614982e-06, "loss": 1.142, "step": 3545 }, { "epoch": 0.02, "grad_norm": 5.0620007916305925, "learning_rate": 1.99982391703755e-06, "loss": 1.4374, "step": 3546 }, { "epoch": 0.02, "grad_norm": 4.971313439844006, "learning_rate": 1.999823817431956e-06, "loss": 1.4932, "step": 3547 }, { "epoch": 0.02, "grad_norm": 4.948526977664145, "learning_rate": 1.999823717798201e-06, "loss": 1.3942, "step": 3548 }, { "epoch": 0.02, "grad_norm": 6.433000571492981, "learning_rate": 1.9998236181362836e-06, "loss": 1.6458, "step": 3549 }, { "epoch": 0.02, "grad_norm": 5.7172630835592475, "learning_rate": 1.9998235184462045e-06, "loss": 1.3787, "step": 3550 }, { "epoch": 0.02, "grad_norm": 4.835524125983266, "learning_rate": 1.999823418727964e-06, "loss": 1.4862, "step": 3551 }, { "epoch": 0.02, "grad_norm": 4.538989738684982, "learning_rate": 1.999823318981562e-06, "loss": 1.2833, "step": 3552 }, { "epoch": 0.02, "grad_norm": 4.900851370893367, "learning_rate": 1.999823219206998e-06, "loss": 1.4289, "step": 3553 }, { "epoch": 0.02, "grad_norm": 4.742076041128345, "learning_rate": 1.9998231194042723e-06, "loss": 1.5084, "step": 3554 }, { "epoch": 0.02, "grad_norm": 5.130400288886862, "learning_rate": 1.999823019573385e-06, "loss": 1.251, "step": 3555 }, { "epoch": 0.02, "grad_norm": 5.2268449306817, "learning_rate": 1.999822919714336e-06, "loss": 1.2141, "step": 3556 }, { "epoch": 0.02, "grad_norm": 4.5252607523373936, "learning_rate": 1.9998228198271253e-06, "loss": 1.3066, "step": 3557 }, { "epoch": 0.02, "grad_norm": 7.351860568361683, "learning_rate": 1.999822719911753e-06, "loss": 1.3784, "step": 3558 }, { "epoch": 0.02, "grad_norm": 4.3402345801839415, "learning_rate": 1.9998226199682187e-06, "loss": 1.375, "step": 3559 }, { "epoch": 0.02, "grad_norm": 6.810672354782998, "learning_rate": 1.999822519996523e-06, "loss": 1.4548, "step": 3560 }, { "epoch": 0.02, "grad_norm": 4.856779921376611, "learning_rate": 1.999822419996666e-06, "loss": 1.3662, "step": 3561 }, { "epoch": 0.02, "grad_norm": 4.495518456829636, "learning_rate": 1.9998223199686466e-06, "loss": 1.3658, "step": 3562 }, { "epoch": 0.02, "grad_norm": 4.972175534549607, "learning_rate": 1.999822219912466e-06, "loss": 1.4305, "step": 3563 }, { "epoch": 0.02, "grad_norm": 4.5993830494584484, "learning_rate": 1.9998221198281236e-06, "loss": 1.3633, "step": 3564 }, { "epoch": 0.02, "grad_norm": 4.464312906911071, "learning_rate": 1.9998220197156194e-06, "loss": 1.3512, "step": 3565 }, { "epoch": 0.02, "grad_norm": 4.666030982227595, "learning_rate": 1.9998219195749536e-06, "loss": 1.4271, "step": 3566 }, { "epoch": 0.02, "grad_norm": 4.5017841565719365, "learning_rate": 1.9998218194061264e-06, "loss": 1.2379, "step": 3567 }, { "epoch": 0.02, "grad_norm": 5.160263531672295, "learning_rate": 1.9998217192091375e-06, "loss": 1.4269, "step": 3568 }, { "epoch": 0.02, "grad_norm": 5.394350494166971, "learning_rate": 1.9998216189839865e-06, "loss": 1.5847, "step": 3569 }, { "epoch": 0.02, "grad_norm": 4.543610113197201, "learning_rate": 1.999821518730674e-06, "loss": 1.5249, "step": 3570 }, { "epoch": 0.02, "grad_norm": 4.524606361318028, "learning_rate": 1.9998214184492e-06, "loss": 1.4545, "step": 3571 }, { "epoch": 0.02, "grad_norm": 4.8729914641558425, "learning_rate": 1.9998213181395643e-06, "loss": 1.4856, "step": 3572 }, { "epoch": 0.02, "grad_norm": 6.712818698671254, "learning_rate": 1.999821217801767e-06, "loss": 1.4254, "step": 3573 }, { "epoch": 0.02, "grad_norm": 7.129380023262798, "learning_rate": 1.9998211174358083e-06, "loss": 1.5263, "step": 3574 }, { "epoch": 0.02, "grad_norm": 5.632118655345206, "learning_rate": 1.999821017041688e-06, "loss": 1.2905, "step": 3575 }, { "epoch": 0.02, "grad_norm": 4.713339231139657, "learning_rate": 1.9998209166194055e-06, "loss": 1.4079, "step": 3576 }, { "epoch": 0.02, "grad_norm": 4.761666895797936, "learning_rate": 1.9998208161689615e-06, "loss": 1.348, "step": 3577 }, { "epoch": 0.02, "eval_loss": 1.6088365316390991, "eval_runtime": 4.6298, "eval_samples_per_second": 1.944, "eval_steps_per_second": 1.08, "step": 3577 }, { "epoch": 0.02, "grad_norm": 5.053550759411962, "learning_rate": 1.999820715690356e-06, "loss": 1.4378, "step": 3578 }, { "epoch": 0.02, "grad_norm": 4.656417963233415, "learning_rate": 1.999820615183589e-06, "loss": 1.4196, "step": 3579 }, { "epoch": 0.02, "grad_norm": 4.720114969577187, "learning_rate": 1.9998205146486604e-06, "loss": 1.4177, "step": 3580 }, { "epoch": 0.02, "grad_norm": 5.1590061020053435, "learning_rate": 1.99982041408557e-06, "loss": 1.3961, "step": 3581 }, { "epoch": 0.02, "grad_norm": 4.609450714516466, "learning_rate": 1.999820313494318e-06, "loss": 1.3561, "step": 3582 }, { "epoch": 0.02, "grad_norm": 6.864818427122029, "learning_rate": 1.9998202128749045e-06, "loss": 1.3521, "step": 3583 }, { "epoch": 0.02, "grad_norm": 4.532234217131842, "learning_rate": 1.9998201122273292e-06, "loss": 1.2275, "step": 3584 }, { "epoch": 0.02, "grad_norm": 5.083685514923316, "learning_rate": 1.9998200115515927e-06, "loss": 1.7183, "step": 3585 }, { "epoch": 0.02, "grad_norm": 5.1648857831860004, "learning_rate": 1.9998199108476944e-06, "loss": 1.4292, "step": 3586 }, { "epoch": 0.02, "grad_norm": 5.404570069932921, "learning_rate": 1.9998198101156344e-06, "loss": 1.4524, "step": 3587 }, { "epoch": 0.02, "grad_norm": 4.4636298910229435, "learning_rate": 1.9998197093554126e-06, "loss": 1.3022, "step": 3588 }, { "epoch": 0.02, "grad_norm": 4.698700968868405, "learning_rate": 1.9998196085670296e-06, "loss": 1.3204, "step": 3589 }, { "epoch": 0.02, "grad_norm": 4.210913435197878, "learning_rate": 1.9998195077504848e-06, "loss": 1.3195, "step": 3590 }, { "epoch": 0.02, "grad_norm": 6.072601657781737, "learning_rate": 1.9998194069057783e-06, "loss": 1.4633, "step": 3591 }, { "epoch": 0.02, "grad_norm": 4.947711709512535, "learning_rate": 1.9998193060329105e-06, "loss": 1.3934, "step": 3592 }, { "epoch": 0.02, "grad_norm": 4.489367907366415, "learning_rate": 1.999819205131881e-06, "loss": 1.3327, "step": 3593 }, { "epoch": 0.02, "grad_norm": 5.046442232090646, "learning_rate": 1.99981910420269e-06, "loss": 1.519, "step": 3594 }, { "epoch": 0.02, "grad_norm": 4.4848572461168015, "learning_rate": 1.999819003245337e-06, "loss": 1.4837, "step": 3595 }, { "epoch": 0.02, "grad_norm": 6.189445025381679, "learning_rate": 1.999818902259823e-06, "loss": 1.322, "step": 3596 }, { "epoch": 0.02, "grad_norm": 5.052080974977652, "learning_rate": 1.999818801246147e-06, "loss": 1.4127, "step": 3597 }, { "epoch": 0.02, "grad_norm": 5.364751238319127, "learning_rate": 1.99981870020431e-06, "loss": 1.2812, "step": 3598 }, { "epoch": 0.02, "grad_norm": 6.0230864070326104, "learning_rate": 1.9998185991343108e-06, "loss": 1.1783, "step": 3599 }, { "epoch": 0.02, "grad_norm": 5.459589611896564, "learning_rate": 1.9998184980361504e-06, "loss": 1.2804, "step": 3600 }, { "epoch": 0.02, "grad_norm": 6.424443515802799, "learning_rate": 1.9998183969098283e-06, "loss": 1.2931, "step": 3601 }, { "epoch": 0.02, "grad_norm": 4.769365934369103, "learning_rate": 1.9998182957553445e-06, "loss": 1.3411, "step": 3602 }, { "epoch": 0.02, "grad_norm": 4.944616559405836, "learning_rate": 1.9998181945726993e-06, "loss": 1.2028, "step": 3603 }, { "epoch": 0.02, "grad_norm": 4.852238644907209, "learning_rate": 1.9998180933618925e-06, "loss": 1.3328, "step": 3604 }, { "epoch": 0.02, "grad_norm": 4.822059635157921, "learning_rate": 1.9998179921229243e-06, "loss": 1.2309, "step": 3605 }, { "epoch": 0.02, "grad_norm": 4.495892687998826, "learning_rate": 1.9998178908557944e-06, "loss": 1.4956, "step": 3606 }, { "epoch": 0.02, "grad_norm": 4.355235617835983, "learning_rate": 1.999817789560503e-06, "loss": 1.181, "step": 3607 }, { "epoch": 0.02, "grad_norm": 4.220700935420473, "learning_rate": 1.99981768823705e-06, "loss": 1.2949, "step": 3608 }, { "epoch": 0.02, "grad_norm": 4.93767906248555, "learning_rate": 1.9998175868854353e-06, "loss": 1.4398, "step": 3609 }, { "epoch": 0.02, "grad_norm": 4.976911864648587, "learning_rate": 1.9998174855056593e-06, "loss": 1.3841, "step": 3610 }, { "epoch": 0.02, "grad_norm": 4.915768062533412, "learning_rate": 1.999817384097722e-06, "loss": 1.3776, "step": 3611 }, { "epoch": 0.02, "grad_norm": 5.400452386799031, "learning_rate": 1.999817282661623e-06, "loss": 1.5242, "step": 3612 }, { "epoch": 0.02, "grad_norm": 9.596731304469015, "learning_rate": 1.999817181197362e-06, "loss": 1.4432, "step": 3613 }, { "epoch": 0.02, "grad_norm": 4.417819290463723, "learning_rate": 1.99981707970494e-06, "loss": 1.3012, "step": 3614 }, { "epoch": 0.02, "grad_norm": 4.908570364549861, "learning_rate": 1.9998169781843566e-06, "loss": 1.4815, "step": 3615 }, { "epoch": 0.02, "grad_norm": 4.538384996779538, "learning_rate": 1.999816876635611e-06, "loss": 1.4016, "step": 3616 }, { "epoch": 0.02, "grad_norm": 4.527777934939574, "learning_rate": 1.9998167750587048e-06, "loss": 1.3748, "step": 3617 }, { "epoch": 0.02, "grad_norm": 5.9438933831368645, "learning_rate": 1.9998166734536362e-06, "loss": 1.3516, "step": 3618 }, { "epoch": 0.02, "grad_norm": 5.111197603829205, "learning_rate": 1.9998165718204064e-06, "loss": 1.4535, "step": 3619 }, { "epoch": 0.02, "grad_norm": 4.3741990248604035, "learning_rate": 1.9998164701590153e-06, "loss": 1.3127, "step": 3620 }, { "epoch": 0.02, "grad_norm": 4.416015259175785, "learning_rate": 1.999816368469463e-06, "loss": 1.2723, "step": 3621 }, { "epoch": 0.02, "grad_norm": 5.045256087161643, "learning_rate": 1.9998162667517488e-06, "loss": 1.4302, "step": 3622 }, { "epoch": 0.02, "grad_norm": 5.026141735704103, "learning_rate": 1.999816165005873e-06, "loss": 1.5235, "step": 3623 }, { "epoch": 0.02, "grad_norm": 4.817827993597373, "learning_rate": 1.9998160632318357e-06, "loss": 1.3734, "step": 3624 }, { "epoch": 0.02, "grad_norm": 5.13290587555347, "learning_rate": 1.999815961429637e-06, "loss": 1.6108, "step": 3625 }, { "epoch": 0.02, "grad_norm": 3.988580619133651, "learning_rate": 1.9998158595992766e-06, "loss": 1.1573, "step": 3626 }, { "epoch": 0.02, "grad_norm": 4.66265731554959, "learning_rate": 1.9998157577407546e-06, "loss": 1.1933, "step": 3627 }, { "epoch": 0.02, "grad_norm": 4.648371974483903, "learning_rate": 1.999815655854072e-06, "loss": 1.4983, "step": 3628 }, { "epoch": 0.02, "grad_norm": 4.62998311734592, "learning_rate": 1.9998155539392273e-06, "loss": 1.3605, "step": 3629 }, { "epoch": 0.02, "grad_norm": 4.650232836970276, "learning_rate": 1.999815451996221e-06, "loss": 1.4764, "step": 3630 }, { "epoch": 0.02, "grad_norm": 4.871641037949958, "learning_rate": 1.9998153500250534e-06, "loss": 1.2777, "step": 3631 }, { "epoch": 0.02, "grad_norm": 4.617802934649518, "learning_rate": 1.999815248025724e-06, "loss": 1.4119, "step": 3632 }, { "epoch": 0.02, "grad_norm": 4.411468513060949, "learning_rate": 1.999815145998234e-06, "loss": 1.2856, "step": 3633 }, { "epoch": 0.02, "grad_norm": 4.4136554607658995, "learning_rate": 1.999815043942582e-06, "loss": 1.3036, "step": 3634 }, { "epoch": 0.02, "grad_norm": 4.696213561907659, "learning_rate": 1.9998149418587684e-06, "loss": 1.4445, "step": 3635 }, { "epoch": 0.02, "grad_norm": 4.707575975813092, "learning_rate": 1.9998148397467934e-06, "loss": 1.4956, "step": 3636 }, { "epoch": 0.02, "grad_norm": 4.477661718215679, "learning_rate": 1.9998147376066567e-06, "loss": 1.4149, "step": 3637 }, { "epoch": 0.02, "grad_norm": 5.242955704509717, "learning_rate": 1.9998146354383588e-06, "loss": 1.3746, "step": 3638 }, { "epoch": 0.02, "grad_norm": 4.407436531750599, "learning_rate": 1.9998145332418995e-06, "loss": 1.3222, "step": 3639 }, { "epoch": 0.02, "grad_norm": 5.4015116251774575, "learning_rate": 1.999814431017279e-06, "loss": 1.4696, "step": 3640 }, { "epoch": 0.02, "grad_norm": 4.230060461672011, "learning_rate": 1.9998143287644966e-06, "loss": 1.1936, "step": 3641 }, { "epoch": 0.02, "grad_norm": 4.832174281020922, "learning_rate": 1.999814226483553e-06, "loss": 1.5139, "step": 3642 }, { "epoch": 0.02, "grad_norm": 4.458935294035869, "learning_rate": 1.999814124174448e-06, "loss": 1.366, "step": 3643 }, { "epoch": 0.02, "grad_norm": 4.858967822250987, "learning_rate": 1.9998140218371814e-06, "loss": 1.4256, "step": 3644 }, { "epoch": 0.02, "grad_norm": 5.384171006945276, "learning_rate": 1.9998139194717534e-06, "loss": 1.5396, "step": 3645 }, { "epoch": 0.02, "grad_norm": 5.362217816203799, "learning_rate": 1.999813817078164e-06, "loss": 1.4846, "step": 3646 }, { "epoch": 0.02, "grad_norm": 4.829886703693458, "learning_rate": 1.999813714656413e-06, "loss": 1.465, "step": 3647 }, { "epoch": 0.02, "grad_norm": 4.300928121079802, "learning_rate": 1.999813612206501e-06, "loss": 1.3776, "step": 3648 }, { "epoch": 0.02, "grad_norm": 4.186443596966576, "learning_rate": 1.9998135097284272e-06, "loss": 1.3409, "step": 3649 }, { "epoch": 0.02, "grad_norm": 4.883455465101393, "learning_rate": 1.9998134072221923e-06, "loss": 1.41, "step": 3650 }, { "epoch": 0.02, "eval_loss": 1.6074836254119873, "eval_runtime": 4.6299, "eval_samples_per_second": 1.944, "eval_steps_per_second": 1.08, "step": 3650 }, { "epoch": 0.02, "grad_norm": 5.160544967304233, "learning_rate": 1.9998133046877957e-06, "loss": 1.4479, "step": 3651 }, { "epoch": 0.02, "grad_norm": 4.591999173432497, "learning_rate": 1.9998132021252378e-06, "loss": 1.424, "step": 3652 }, { "epoch": 0.02, "grad_norm": 4.334725366745728, "learning_rate": 1.9998130995345185e-06, "loss": 1.3303, "step": 3653 }, { "epoch": 0.02, "grad_norm": 4.667679588938856, "learning_rate": 1.9998129969156376e-06, "loss": 1.3497, "step": 3654 }, { "epoch": 0.02, "grad_norm": 4.648593704664988, "learning_rate": 1.9998128942685953e-06, "loss": 1.5196, "step": 3655 }, { "epoch": 0.02, "grad_norm": 5.228106530732631, "learning_rate": 1.999812791593392e-06, "loss": 1.5034, "step": 3656 }, { "epoch": 0.02, "grad_norm": 4.825079154423852, "learning_rate": 1.9998126888900272e-06, "loss": 1.5114, "step": 3657 }, { "epoch": 0.02, "grad_norm": 5.588839443240046, "learning_rate": 1.9998125861585006e-06, "loss": 1.224, "step": 3658 }, { "epoch": 0.02, "grad_norm": 11.115206705835469, "learning_rate": 1.9998124833988127e-06, "loss": 1.402, "step": 3659 }, { "epoch": 0.02, "grad_norm": 5.028407264356555, "learning_rate": 1.9998123806109635e-06, "loss": 1.3366, "step": 3660 }, { "epoch": 0.02, "grad_norm": 4.883233407550293, "learning_rate": 1.999812277794953e-06, "loss": 1.3908, "step": 3661 }, { "epoch": 0.02, "grad_norm": 4.849560264366864, "learning_rate": 1.999812174950781e-06, "loss": 1.4642, "step": 3662 }, { "epoch": 0.02, "grad_norm": 5.134509693028477, "learning_rate": 1.999812072078448e-06, "loss": 1.338, "step": 3663 }, { "epoch": 0.02, "grad_norm": 4.64736551563639, "learning_rate": 1.999811969177953e-06, "loss": 1.3188, "step": 3664 }, { "epoch": 0.02, "grad_norm": 5.202295612938597, "learning_rate": 1.999811866249297e-06, "loss": 1.5156, "step": 3665 }, { "epoch": 0.02, "grad_norm": 4.864330255262217, "learning_rate": 1.9998117632924795e-06, "loss": 1.4044, "step": 3666 }, { "epoch": 0.02, "grad_norm": 5.095975352302671, "learning_rate": 1.9998116603075008e-06, "loss": 1.4834, "step": 3667 }, { "epoch": 0.02, "grad_norm": 4.401897149251593, "learning_rate": 1.9998115572943607e-06, "loss": 1.3598, "step": 3668 }, { "epoch": 0.02, "grad_norm": 4.516321121878343, "learning_rate": 1.9998114542530593e-06, "loss": 1.3812, "step": 3669 }, { "epoch": 0.02, "grad_norm": 4.582276547112316, "learning_rate": 1.999811351183596e-06, "loss": 1.4511, "step": 3670 }, { "epoch": 0.02, "grad_norm": 4.268406465536788, "learning_rate": 1.9998112480859718e-06, "loss": 1.3777, "step": 3671 }, { "epoch": 0.02, "grad_norm": 5.414725079724578, "learning_rate": 1.999811144960186e-06, "loss": 1.3594, "step": 3672 }, { "epoch": 0.02, "grad_norm": 5.036395550915229, "learning_rate": 1.9998110418062394e-06, "loss": 1.5366, "step": 3673 }, { "epoch": 0.02, "grad_norm": 4.811922967477642, "learning_rate": 1.9998109386241307e-06, "loss": 1.489, "step": 3674 }, { "epoch": 0.02, "grad_norm": 4.909204659000984, "learning_rate": 1.999810835413861e-06, "loss": 1.4438, "step": 3675 }, { "epoch": 0.02, "grad_norm": 4.933064342481729, "learning_rate": 1.99981073217543e-06, "loss": 1.3926, "step": 3676 }, { "epoch": 0.02, "grad_norm": 5.172865172666342, "learning_rate": 1.999810628908838e-06, "loss": 1.3739, "step": 3677 }, { "epoch": 0.02, "grad_norm": 4.581844754398152, "learning_rate": 1.999810525614084e-06, "loss": 1.3825, "step": 3678 }, { "epoch": 0.02, "grad_norm": 4.002706815163878, "learning_rate": 1.999810422291169e-06, "loss": 1.2023, "step": 3679 }, { "epoch": 0.02, "grad_norm": 4.529339089739914, "learning_rate": 1.9998103189400925e-06, "loss": 1.3956, "step": 3680 }, { "epoch": 0.02, "grad_norm": 4.371778302872117, "learning_rate": 1.999810215560855e-06, "loss": 1.0882, "step": 3681 }, { "epoch": 0.02, "grad_norm": 4.604473618690761, "learning_rate": 1.999810112153456e-06, "loss": 1.3797, "step": 3682 }, { "epoch": 0.02, "grad_norm": 4.229913176526649, "learning_rate": 1.9998100087178954e-06, "loss": 1.3024, "step": 3683 }, { "epoch": 0.02, "grad_norm": 4.892279631388166, "learning_rate": 1.9998099052541736e-06, "loss": 1.4261, "step": 3684 }, { "epoch": 0.02, "grad_norm": 4.375988978498727, "learning_rate": 1.9998098017622905e-06, "loss": 1.426, "step": 3685 }, { "epoch": 0.02, "grad_norm": 4.517183513297791, "learning_rate": 1.9998096982422465e-06, "loss": 1.4715, "step": 3686 }, { "epoch": 0.02, "grad_norm": 4.715497920821096, "learning_rate": 1.9998095946940408e-06, "loss": 1.2492, "step": 3687 }, { "epoch": 0.02, "grad_norm": 4.4409337214742814, "learning_rate": 1.9998094911176738e-06, "loss": 1.3908, "step": 3688 }, { "epoch": 0.02, "grad_norm": 4.768774973994264, "learning_rate": 1.9998093875131454e-06, "loss": 1.3043, "step": 3689 }, { "epoch": 0.02, "grad_norm": 4.488443359293641, "learning_rate": 1.999809283880456e-06, "loss": 1.4254, "step": 3690 }, { "epoch": 0.02, "grad_norm": 4.946907576554227, "learning_rate": 1.999809180219605e-06, "loss": 1.3815, "step": 3691 }, { "epoch": 0.02, "grad_norm": 5.024643467089974, "learning_rate": 1.9998090765305927e-06, "loss": 1.3476, "step": 3692 }, { "epoch": 0.02, "grad_norm": 5.375622450304601, "learning_rate": 1.9998089728134195e-06, "loss": 1.575, "step": 3693 }, { "epoch": 0.02, "grad_norm": 4.7741580232890755, "learning_rate": 1.9998088690680847e-06, "loss": 1.3576, "step": 3694 }, { "epoch": 0.02, "grad_norm": 4.690371879763435, "learning_rate": 1.9998087652945886e-06, "loss": 1.3921, "step": 3695 }, { "epoch": 0.02, "grad_norm": 4.895129039414259, "learning_rate": 1.999808661492931e-06, "loss": 1.5504, "step": 3696 }, { "epoch": 0.02, "grad_norm": 4.841292889636168, "learning_rate": 1.9998085576631128e-06, "loss": 1.6215, "step": 3697 }, { "epoch": 0.02, "grad_norm": 4.335373503405417, "learning_rate": 1.9998084538051327e-06, "loss": 1.4034, "step": 3698 }, { "epoch": 0.02, "grad_norm": 4.892579988645074, "learning_rate": 1.999808349918992e-06, "loss": 1.4615, "step": 3699 }, { "epoch": 0.03, "grad_norm": 4.80405901674261, "learning_rate": 1.999808246004689e-06, "loss": 1.3834, "step": 3700 }, { "epoch": 0.03, "grad_norm": 5.015073387547917, "learning_rate": 1.9998081420622256e-06, "loss": 1.5382, "step": 3701 }, { "epoch": 0.03, "grad_norm": 5.340605580327248, "learning_rate": 1.9998080380916003e-06, "loss": 1.5587, "step": 3702 }, { "epoch": 0.03, "grad_norm": 4.6548885345585145, "learning_rate": 1.999807934092814e-06, "loss": 1.4507, "step": 3703 }, { "epoch": 0.03, "grad_norm": 7.988246529171995, "learning_rate": 1.9998078300658667e-06, "loss": 1.2863, "step": 3704 }, { "epoch": 0.03, "grad_norm": 4.366479726955373, "learning_rate": 1.999807726010758e-06, "loss": 1.3532, "step": 3705 }, { "epoch": 0.03, "grad_norm": 4.954787351869969, "learning_rate": 1.999807621927488e-06, "loss": 1.4679, "step": 3706 }, { "epoch": 0.03, "grad_norm": 5.106594076376741, "learning_rate": 1.9998075178160565e-06, "loss": 1.4912, "step": 3707 }, { "epoch": 0.03, "grad_norm": 5.53644272240156, "learning_rate": 1.999807413676464e-06, "loss": 1.3928, "step": 3708 }, { "epoch": 0.03, "grad_norm": 4.3898086275317265, "learning_rate": 1.9998073095087102e-06, "loss": 1.2247, "step": 3709 }, { "epoch": 0.03, "grad_norm": 6.129858409048548, "learning_rate": 1.9998072053127954e-06, "loss": 1.2888, "step": 3710 }, { "epoch": 0.03, "grad_norm": 6.411528618345709, "learning_rate": 1.999807101088719e-06, "loss": 1.4598, "step": 3711 }, { "epoch": 0.03, "grad_norm": 5.245071088827053, "learning_rate": 1.9998069968364813e-06, "loss": 1.3139, "step": 3712 }, { "epoch": 0.03, "grad_norm": 5.001994498250823, "learning_rate": 1.9998068925560825e-06, "loss": 1.4319, "step": 3713 }, { "epoch": 0.03, "grad_norm": 4.5711686332255725, "learning_rate": 1.9998067882475225e-06, "loss": 1.4286, "step": 3714 }, { "epoch": 0.03, "grad_norm": 4.494680754322679, "learning_rate": 1.999806683910801e-06, "loss": 1.3624, "step": 3715 }, { "epoch": 0.03, "grad_norm": 4.920443475726351, "learning_rate": 1.999806579545919e-06, "loss": 1.4834, "step": 3716 }, { "epoch": 0.03, "grad_norm": 9.09972634364239, "learning_rate": 1.9998064751528752e-06, "loss": 1.3838, "step": 3717 }, { "epoch": 0.03, "grad_norm": 4.696129650401296, "learning_rate": 1.99980637073167e-06, "loss": 1.5004, "step": 3718 }, { "epoch": 0.03, "grad_norm": 5.3037260313404415, "learning_rate": 1.999806266282304e-06, "loss": 1.5459, "step": 3719 }, { "epoch": 0.03, "grad_norm": 4.88221476190435, "learning_rate": 1.9998061618047767e-06, "loss": 1.4734, "step": 3720 }, { "epoch": 0.03, "grad_norm": 4.956709505619386, "learning_rate": 1.999806057299088e-06, "loss": 1.3683, "step": 3721 }, { "epoch": 0.03, "grad_norm": 5.1834233457849495, "learning_rate": 1.9998059527652382e-06, "loss": 1.3125, "step": 3722 }, { "epoch": 0.03, "grad_norm": 4.492850793982879, "learning_rate": 1.9998058482032273e-06, "loss": 1.2745, "step": 3723 }, { "epoch": 0.03, "eval_loss": 1.6084189414978027, "eval_runtime": 4.642, "eval_samples_per_second": 1.939, "eval_steps_per_second": 1.077, "step": 3723 }, { "epoch": 0.03, "grad_norm": 4.713923973024249, "learning_rate": 1.999805743613055e-06, "loss": 1.4203, "step": 3724 }, { "epoch": 0.03, "grad_norm": 4.441921382155817, "learning_rate": 1.999805638994722e-06, "loss": 1.4163, "step": 3725 }, { "epoch": 0.03, "grad_norm": 5.164638367038176, "learning_rate": 1.9998055343482274e-06, "loss": 1.4703, "step": 3726 }, { "epoch": 0.03, "grad_norm": 4.707717776966889, "learning_rate": 1.999805429673571e-06, "loss": 1.4389, "step": 3727 }, { "epoch": 0.03, "grad_norm": 5.404527912836125, "learning_rate": 1.9998053249707545e-06, "loss": 1.4697, "step": 3728 }, { "epoch": 0.03, "grad_norm": 4.582127757876631, "learning_rate": 1.999805220239776e-06, "loss": 1.4003, "step": 3729 }, { "epoch": 0.03, "grad_norm": 4.82572183924319, "learning_rate": 1.999805115480637e-06, "loss": 1.3832, "step": 3730 }, { "epoch": 0.03, "grad_norm": 5.099262606984633, "learning_rate": 1.9998050106933363e-06, "loss": 1.4593, "step": 3731 }, { "epoch": 0.03, "grad_norm": 4.705269403296674, "learning_rate": 1.9998049058778745e-06, "loss": 1.4539, "step": 3732 }, { "epoch": 0.03, "grad_norm": 5.343650845199192, "learning_rate": 1.9998048010342517e-06, "loss": 1.4299, "step": 3733 }, { "epoch": 0.03, "grad_norm": 4.761090749890984, "learning_rate": 1.9998046961624677e-06, "loss": 1.376, "step": 3734 }, { "epoch": 0.03, "grad_norm": 4.47848246597197, "learning_rate": 1.9998045912625223e-06, "loss": 1.3492, "step": 3735 }, { "epoch": 0.03, "grad_norm": 4.553841135265006, "learning_rate": 1.999804486334416e-06, "loss": 1.4707, "step": 3736 }, { "epoch": 0.03, "grad_norm": 5.112011524717292, "learning_rate": 1.999804381378148e-06, "loss": 1.3998, "step": 3737 }, { "epoch": 0.03, "grad_norm": 5.053186099424353, "learning_rate": 1.9998042763937197e-06, "loss": 1.3719, "step": 3738 }, { "epoch": 0.03, "grad_norm": 4.9280058328899665, "learning_rate": 1.9998041713811296e-06, "loss": 1.4414, "step": 3739 }, { "epoch": 0.03, "grad_norm": 4.927412166137148, "learning_rate": 1.9998040663403785e-06, "loss": 1.3497, "step": 3740 }, { "epoch": 0.03, "grad_norm": 4.868129380898875, "learning_rate": 1.999803961271466e-06, "loss": 1.4854, "step": 3741 }, { "epoch": 0.03, "grad_norm": 5.083726937764924, "learning_rate": 1.999803856174393e-06, "loss": 1.3262, "step": 3742 }, { "epoch": 0.03, "grad_norm": 4.680945628502999, "learning_rate": 1.9998037510491585e-06, "loss": 1.4529, "step": 3743 }, { "epoch": 0.03, "grad_norm": 9.213903521376912, "learning_rate": 1.9998036458957626e-06, "loss": 1.5463, "step": 3744 }, { "epoch": 0.03, "grad_norm": 4.604742089204025, "learning_rate": 1.999803540714206e-06, "loss": 1.4577, "step": 3745 }, { "epoch": 0.03, "grad_norm": 5.36229884657475, "learning_rate": 1.999803435504488e-06, "loss": 1.4056, "step": 3746 }, { "epoch": 0.03, "grad_norm": 4.600285403993327, "learning_rate": 1.9998033302666086e-06, "loss": 1.4862, "step": 3747 }, { "epoch": 0.03, "grad_norm": 5.265400100794315, "learning_rate": 1.9998032250005684e-06, "loss": 1.5089, "step": 3748 }, { "epoch": 0.03, "grad_norm": 5.1969214189492305, "learning_rate": 1.9998031197063673e-06, "loss": 1.4962, "step": 3749 }, { "epoch": 0.03, "grad_norm": 4.320190989350664, "learning_rate": 1.9998030143840045e-06, "loss": 1.3034, "step": 3750 }, { "epoch": 0.03, "grad_norm": 4.579075184163126, "learning_rate": 1.9998029090334813e-06, "loss": 1.4736, "step": 3751 }, { "epoch": 0.03, "grad_norm": 5.007083829002728, "learning_rate": 1.9998028036547963e-06, "loss": 1.3576, "step": 3752 }, { "epoch": 0.03, "grad_norm": 5.20578658851182, "learning_rate": 1.9998026982479504e-06, "loss": 1.3413, "step": 3753 }, { "epoch": 0.03, "grad_norm": 4.940850610449721, "learning_rate": 1.9998025928129432e-06, "loss": 1.5133, "step": 3754 }, { "epoch": 0.03, "grad_norm": 4.822761183175982, "learning_rate": 1.9998024873497756e-06, "loss": 1.2899, "step": 3755 }, { "epoch": 0.03, "grad_norm": 4.545241760698208, "learning_rate": 1.9998023818584462e-06, "loss": 1.3797, "step": 3756 }, { "epoch": 0.03, "grad_norm": 4.742084979843826, "learning_rate": 1.999802276338956e-06, "loss": 1.4692, "step": 3757 }, { "epoch": 0.03, "grad_norm": 4.658419562131754, "learning_rate": 1.9998021707913045e-06, "loss": 1.3793, "step": 3758 }, { "epoch": 0.03, "grad_norm": 5.183021678979285, "learning_rate": 1.999802065215492e-06, "loss": 1.4525, "step": 3759 }, { "epoch": 0.03, "grad_norm": 5.736088379622611, "learning_rate": 1.9998019596115183e-06, "loss": 1.2959, "step": 3760 }, { "epoch": 0.03, "grad_norm": 4.656062094359612, "learning_rate": 1.9998018539793837e-06, "loss": 1.3658, "step": 3761 }, { "epoch": 0.03, "grad_norm": 4.616745476907714, "learning_rate": 1.9998017483190878e-06, "loss": 1.4136, "step": 3762 }, { "epoch": 0.03, "grad_norm": 7.731566326102406, "learning_rate": 1.999801642630631e-06, "loss": 1.3751, "step": 3763 }, { "epoch": 0.03, "grad_norm": 5.432514416044134, "learning_rate": 1.999801536914013e-06, "loss": 1.3943, "step": 3764 }, { "epoch": 0.03, "grad_norm": 4.866033462628277, "learning_rate": 1.999801431169234e-06, "loss": 1.3415, "step": 3765 }, { "epoch": 0.03, "grad_norm": 5.468340002269182, "learning_rate": 1.9998013253962936e-06, "loss": 1.3156, "step": 3766 }, { "epoch": 0.03, "grad_norm": 4.647820771341247, "learning_rate": 1.9998012195951925e-06, "loss": 1.3947, "step": 3767 }, { "epoch": 0.03, "grad_norm": 4.657410377037526, "learning_rate": 1.99980111376593e-06, "loss": 1.4256, "step": 3768 }, { "epoch": 0.03, "grad_norm": 4.8464614615398505, "learning_rate": 1.9998010079085066e-06, "loss": 1.3297, "step": 3769 }, { "epoch": 0.03, "grad_norm": 4.332318170076133, "learning_rate": 1.9998009020229224e-06, "loss": 1.2867, "step": 3770 }, { "epoch": 0.03, "grad_norm": 4.878338350222502, "learning_rate": 1.999800796109177e-06, "loss": 1.38, "step": 3771 }, { "epoch": 0.03, "grad_norm": 4.493111995241049, "learning_rate": 1.9998006901672704e-06, "loss": 1.4075, "step": 3772 }, { "epoch": 0.03, "grad_norm": 4.469116344826484, "learning_rate": 1.9998005841972027e-06, "loss": 1.4129, "step": 3773 }, { "epoch": 0.03, "grad_norm": 4.5213045347853535, "learning_rate": 1.999800478198974e-06, "loss": 1.3233, "step": 3774 }, { "epoch": 0.03, "grad_norm": 4.65101410292212, "learning_rate": 1.999800372172584e-06, "loss": 1.3441, "step": 3775 }, { "epoch": 0.03, "grad_norm": 8.216969674240255, "learning_rate": 1.9998002661180334e-06, "loss": 1.2946, "step": 3776 }, { "epoch": 0.03, "grad_norm": 5.3767982950299755, "learning_rate": 1.9998001600353217e-06, "loss": 1.4547, "step": 3777 }, { "epoch": 0.03, "grad_norm": 6.118813225734277, "learning_rate": 1.9998000539244488e-06, "loss": 1.4563, "step": 3778 }, { "epoch": 0.03, "grad_norm": 4.720418776141597, "learning_rate": 1.999799947785415e-06, "loss": 1.305, "step": 3779 }, { "epoch": 0.03, "grad_norm": 4.516048109136921, "learning_rate": 1.9997998416182198e-06, "loss": 1.3212, "step": 3780 }, { "epoch": 0.03, "grad_norm": 4.62003964568539, "learning_rate": 1.9997997354228637e-06, "loss": 1.3838, "step": 3781 }, { "epoch": 0.03, "grad_norm": 5.0358945130998976, "learning_rate": 1.999799629199347e-06, "loss": 1.4947, "step": 3782 }, { "epoch": 0.03, "grad_norm": 4.5824414800490185, "learning_rate": 1.9997995229476686e-06, "loss": 1.3435, "step": 3783 }, { "epoch": 0.03, "grad_norm": 4.75725254853182, "learning_rate": 1.99979941666783e-06, "loss": 1.3627, "step": 3784 }, { "epoch": 0.03, "grad_norm": 5.14007088469601, "learning_rate": 1.9997993103598295e-06, "loss": 1.4974, "step": 3785 }, { "epoch": 0.03, "grad_norm": 4.60496266899906, "learning_rate": 1.9997992040236686e-06, "loss": 1.4381, "step": 3786 }, { "epoch": 0.03, "grad_norm": 4.287067707408165, "learning_rate": 1.999799097659346e-06, "loss": 1.4361, "step": 3787 }, { "epoch": 0.03, "grad_norm": 4.864411077361251, "learning_rate": 1.999798991266863e-06, "loss": 1.4978, "step": 3788 }, { "epoch": 0.03, "grad_norm": 6.476884055738115, "learning_rate": 1.999798884846219e-06, "loss": 1.5751, "step": 3789 }, { "epoch": 0.03, "grad_norm": 4.607204170305541, "learning_rate": 1.999798778397414e-06, "loss": 1.2613, "step": 3790 }, { "epoch": 0.03, "grad_norm": 4.513574436603101, "learning_rate": 1.9997986719204477e-06, "loss": 1.2507, "step": 3791 }, { "epoch": 0.03, "grad_norm": 4.397374539254503, "learning_rate": 1.9997985654153202e-06, "loss": 1.3785, "step": 3792 }, { "epoch": 0.03, "grad_norm": 4.7647312609589285, "learning_rate": 1.999798458882032e-06, "loss": 1.398, "step": 3793 }, { "epoch": 0.03, "grad_norm": 5.532986119411458, "learning_rate": 1.999798352320583e-06, "loss": 1.473, "step": 3794 }, { "epoch": 0.03, "grad_norm": 4.289540776605135, "learning_rate": 1.9997982457309727e-06, "loss": 1.254, "step": 3795 }, { "epoch": 0.03, "grad_norm": 4.650435124831928, "learning_rate": 1.9997981391132013e-06, "loss": 1.3785, "step": 3796 }, { "epoch": 0.03, "eval_loss": 1.6124725341796875, "eval_runtime": 4.6361, "eval_samples_per_second": 1.941, "eval_steps_per_second": 1.078, "step": 3796 }, { "epoch": 0.03, "grad_norm": 4.606995994308827, "learning_rate": 1.9997980324672695e-06, "loss": 1.4598, "step": 3797 }, { "epoch": 0.03, "grad_norm": 4.628886714284138, "learning_rate": 1.9997979257931764e-06, "loss": 1.4204, "step": 3798 }, { "epoch": 0.03, "grad_norm": 4.472096608209409, "learning_rate": 1.9997978190909223e-06, "loss": 1.2646, "step": 3799 }, { "epoch": 0.03, "grad_norm": 6.899108769896611, "learning_rate": 1.999797712360507e-06, "loss": 1.7302, "step": 3800 }, { "epoch": 0.03, "grad_norm": 11.698902110094348, "learning_rate": 1.999797605601931e-06, "loss": 1.2991, "step": 3801 }, { "epoch": 0.03, "grad_norm": 4.678694191733831, "learning_rate": 1.999797498815194e-06, "loss": 1.3871, "step": 3802 }, { "epoch": 0.03, "grad_norm": 4.623073739366768, "learning_rate": 1.999797392000296e-06, "loss": 1.4762, "step": 3803 }, { "epoch": 0.03, "grad_norm": 4.528916340724794, "learning_rate": 1.999797285157237e-06, "loss": 1.2994, "step": 3804 }, { "epoch": 0.03, "grad_norm": 5.120009553724364, "learning_rate": 1.9997971782860172e-06, "loss": 1.4481, "step": 3805 }, { "epoch": 0.03, "grad_norm": 4.646312172610378, "learning_rate": 1.9997970713866358e-06, "loss": 1.4237, "step": 3806 }, { "epoch": 0.03, "grad_norm": 4.824187645447148, "learning_rate": 1.9997969644590943e-06, "loss": 1.3742, "step": 3807 }, { "epoch": 0.03, "grad_norm": 4.707453765528019, "learning_rate": 1.999796857503391e-06, "loss": 1.358, "step": 3808 }, { "epoch": 0.03, "grad_norm": 4.874962378800376, "learning_rate": 1.9997967505195274e-06, "loss": 1.4034, "step": 3809 }, { "epoch": 0.03, "grad_norm": 5.728054711081159, "learning_rate": 1.999796643507503e-06, "loss": 1.4522, "step": 3810 }, { "epoch": 0.03, "grad_norm": 4.488435565656918, "learning_rate": 1.999796536467317e-06, "loss": 1.3188, "step": 3811 }, { "epoch": 0.03, "grad_norm": 5.309632018627234, "learning_rate": 1.9997964293989703e-06, "loss": 1.4794, "step": 3812 }, { "epoch": 0.03, "grad_norm": 5.21029005693669, "learning_rate": 1.9997963223024627e-06, "loss": 1.4637, "step": 3813 }, { "epoch": 0.03, "grad_norm": 4.719975810384763, "learning_rate": 1.999796215177794e-06, "loss": 1.4294, "step": 3814 }, { "epoch": 0.03, "grad_norm": 4.947061511680672, "learning_rate": 1.9997961080249648e-06, "loss": 1.2268, "step": 3815 }, { "epoch": 0.03, "grad_norm": 5.717301549581428, "learning_rate": 1.9997960008439745e-06, "loss": 1.5564, "step": 3816 }, { "epoch": 0.03, "grad_norm": 4.337568360997204, "learning_rate": 1.999795893634823e-06, "loss": 1.3617, "step": 3817 }, { "epoch": 0.03, "grad_norm": 4.714909910098067, "learning_rate": 1.999795786397511e-06, "loss": 1.3707, "step": 3818 }, { "epoch": 0.03, "grad_norm": 4.863348635773201, "learning_rate": 1.999795679132038e-06, "loss": 1.5601, "step": 3819 }, { "epoch": 0.03, "grad_norm": 4.725409416597695, "learning_rate": 1.999795571838404e-06, "loss": 1.5155, "step": 3820 }, { "epoch": 0.03, "grad_norm": 4.390907850020624, "learning_rate": 1.9997954645166087e-06, "loss": 1.3048, "step": 3821 }, { "epoch": 0.03, "grad_norm": 5.204362569456592, "learning_rate": 1.9997953571666528e-06, "loss": 1.4248, "step": 3822 }, { "epoch": 0.03, "grad_norm": 4.373634118732567, "learning_rate": 1.9997952497885363e-06, "loss": 1.4316, "step": 3823 }, { "epoch": 0.03, "grad_norm": 4.830427217746328, "learning_rate": 1.999795142382258e-06, "loss": 1.5032, "step": 3824 }, { "epoch": 0.03, "grad_norm": 5.35064585778059, "learning_rate": 1.99979503494782e-06, "loss": 1.3917, "step": 3825 }, { "epoch": 0.03, "grad_norm": 4.621090262533992, "learning_rate": 1.99979492748522e-06, "loss": 1.4374, "step": 3826 }, { "epoch": 0.03, "grad_norm": 4.486560370946347, "learning_rate": 1.9997948199944597e-06, "loss": 1.3844, "step": 3827 }, { "epoch": 0.03, "grad_norm": 4.89173032023379, "learning_rate": 1.9997947124755385e-06, "loss": 1.4796, "step": 3828 }, { "epoch": 0.03, "grad_norm": 4.968544330306352, "learning_rate": 1.9997946049284563e-06, "loss": 1.5391, "step": 3829 }, { "epoch": 0.03, "grad_norm": 5.432121217331672, "learning_rate": 1.9997944973532133e-06, "loss": 1.3526, "step": 3830 }, { "epoch": 0.03, "grad_norm": 4.433372309371036, "learning_rate": 1.9997943897498094e-06, "loss": 1.3612, "step": 3831 }, { "epoch": 0.03, "grad_norm": 4.388748973282565, "learning_rate": 1.9997942821182442e-06, "loss": 1.2879, "step": 3832 }, { "epoch": 0.03, "grad_norm": 5.522001370681306, "learning_rate": 1.9997941744585186e-06, "loss": 1.3845, "step": 3833 }, { "epoch": 0.03, "grad_norm": 5.064642846647261, "learning_rate": 1.999794066770632e-06, "loss": 1.4568, "step": 3834 }, { "epoch": 0.03, "grad_norm": 4.99394851245948, "learning_rate": 1.9997939590545846e-06, "loss": 1.3925, "step": 3835 }, { "epoch": 0.03, "grad_norm": 6.180590276962394, "learning_rate": 1.9997938513103763e-06, "loss": 1.4198, "step": 3836 }, { "epoch": 0.03, "grad_norm": 4.625626685100316, "learning_rate": 1.999793743538007e-06, "loss": 1.4152, "step": 3837 }, { "epoch": 0.03, "grad_norm": 5.7309361359320725, "learning_rate": 1.999793635737477e-06, "loss": 1.448, "step": 3838 }, { "epoch": 0.03, "grad_norm": 4.700706207946969, "learning_rate": 1.9997935279087857e-06, "loss": 1.4112, "step": 3839 }, { "epoch": 0.03, "grad_norm": 4.344275675891425, "learning_rate": 1.9997934200519343e-06, "loss": 1.3493, "step": 3840 }, { "epoch": 0.03, "grad_norm": 5.112163685600387, "learning_rate": 1.9997933121669216e-06, "loss": 1.4048, "step": 3841 }, { "epoch": 0.03, "grad_norm": 4.620444881119614, "learning_rate": 1.999793204253748e-06, "loss": 1.48, "step": 3842 }, { "epoch": 0.03, "grad_norm": 4.318406868845656, "learning_rate": 1.9997930963124135e-06, "loss": 1.32, "step": 3843 }, { "epoch": 0.03, "grad_norm": 4.786834196466122, "learning_rate": 1.999792988342918e-06, "loss": 1.4818, "step": 3844 }, { "epoch": 0.03, "grad_norm": 4.616685881971367, "learning_rate": 1.9997928803452624e-06, "loss": 1.3424, "step": 3845 }, { "epoch": 0.03, "grad_norm": 10.976292065630325, "learning_rate": 1.9997927723194453e-06, "loss": 1.529, "step": 3846 }, { "epoch": 0.03, "grad_norm": 5.479056535258812, "learning_rate": 1.9997926642654677e-06, "loss": 1.3713, "step": 3847 }, { "epoch": 0.03, "grad_norm": 5.29815155050577, "learning_rate": 1.9997925561833293e-06, "loss": 1.4513, "step": 3848 }, { "epoch": 0.03, "grad_norm": 4.642297981539935, "learning_rate": 1.99979244807303e-06, "loss": 1.2959, "step": 3849 }, { "epoch": 0.03, "grad_norm": 5.1046013534719465, "learning_rate": 1.9997923399345697e-06, "loss": 1.4979, "step": 3850 }, { "epoch": 0.03, "grad_norm": 4.646559943022004, "learning_rate": 1.9997922317679486e-06, "loss": 1.444, "step": 3851 }, { "epoch": 0.03, "grad_norm": 5.21849530959606, "learning_rate": 1.9997921235731667e-06, "loss": 1.5293, "step": 3852 }, { "epoch": 0.03, "grad_norm": 5.091788114990917, "learning_rate": 1.999792015350224e-06, "loss": 1.3336, "step": 3853 }, { "epoch": 0.03, "grad_norm": 5.027116597468565, "learning_rate": 1.9997919070991205e-06, "loss": 1.4953, "step": 3854 }, { "epoch": 0.03, "grad_norm": 4.351333006720373, "learning_rate": 1.999791798819856e-06, "loss": 1.2278, "step": 3855 }, { "epoch": 0.03, "grad_norm": 5.065186193436848, "learning_rate": 1.999791690512431e-06, "loss": 1.5342, "step": 3856 }, { "epoch": 0.03, "grad_norm": 5.055266043001361, "learning_rate": 1.9997915821768453e-06, "loss": 1.4839, "step": 3857 }, { "epoch": 0.03, "grad_norm": 4.4172809483128495, "learning_rate": 1.9997914738130985e-06, "loss": 1.3977, "step": 3858 }, { "epoch": 0.03, "grad_norm": 5.314641655648586, "learning_rate": 1.9997913654211908e-06, "loss": 1.452, "step": 3859 }, { "epoch": 0.03, "grad_norm": 5.308280921675756, "learning_rate": 1.9997912570011226e-06, "loss": 1.4482, "step": 3860 }, { "epoch": 0.03, "grad_norm": 4.735078171685458, "learning_rate": 1.9997911485528935e-06, "loss": 1.3584, "step": 3861 }, { "epoch": 0.03, "grad_norm": 4.8743091116457995, "learning_rate": 1.9997910400765036e-06, "loss": 1.4825, "step": 3862 }, { "epoch": 0.03, "grad_norm": 4.46078710955025, "learning_rate": 1.9997909315719528e-06, "loss": 1.2225, "step": 3863 }, { "epoch": 0.03, "grad_norm": 5.185001019529193, "learning_rate": 1.999790823039241e-06, "loss": 1.1979, "step": 3864 }, { "epoch": 0.03, "grad_norm": 5.476108668212412, "learning_rate": 1.999790714478369e-06, "loss": 1.5096, "step": 3865 }, { "epoch": 0.03, "grad_norm": 5.333965154940147, "learning_rate": 1.999790605889336e-06, "loss": 1.4091, "step": 3866 }, { "epoch": 0.03, "grad_norm": 4.971115470731839, "learning_rate": 1.999790497272142e-06, "loss": 1.3178, "step": 3867 }, { "epoch": 0.03, "grad_norm": 4.595571343395057, "learning_rate": 1.9997903886267876e-06, "loss": 1.4777, "step": 3868 }, { "epoch": 0.03, "grad_norm": 6.804248226375533, "learning_rate": 1.9997902799532724e-06, "loss": 1.5475, "step": 3869 }, { "epoch": 0.03, "eval_loss": 1.609044075012207, "eval_runtime": 4.6325, "eval_samples_per_second": 1.943, "eval_steps_per_second": 1.079, "step": 3869 }, { "epoch": 0.03, "grad_norm": 6.3443955811136865, "learning_rate": 1.9997901712515962e-06, "loss": 1.4244, "step": 3870 }, { "epoch": 0.03, "grad_norm": 4.839983959655444, "learning_rate": 1.9997900625217592e-06, "loss": 1.5665, "step": 3871 }, { "epoch": 0.03, "grad_norm": 4.654490084904816, "learning_rate": 1.9997899537637613e-06, "loss": 1.3532, "step": 3872 }, { "epoch": 0.03, "grad_norm": 4.615637710700499, "learning_rate": 1.999789844977603e-06, "loss": 1.4159, "step": 3873 }, { "epoch": 0.03, "grad_norm": 4.633952767212308, "learning_rate": 1.9997897361632837e-06, "loss": 1.1975, "step": 3874 }, { "epoch": 0.03, "grad_norm": 4.411171413065695, "learning_rate": 1.999789627320804e-06, "loss": 1.3549, "step": 3875 }, { "epoch": 0.03, "grad_norm": 5.134244853079362, "learning_rate": 1.9997895184501635e-06, "loss": 1.1633, "step": 3876 }, { "epoch": 0.03, "grad_norm": 4.85830290947824, "learning_rate": 1.999789409551362e-06, "loss": 1.4764, "step": 3877 }, { "epoch": 0.03, "grad_norm": 5.299102203588709, "learning_rate": 1.9997893006243997e-06, "loss": 1.4194, "step": 3878 }, { "epoch": 0.03, "grad_norm": 4.28950336109775, "learning_rate": 1.999789191669277e-06, "loss": 1.3345, "step": 3879 }, { "epoch": 0.03, "grad_norm": 5.104468113222967, "learning_rate": 1.9997890826859932e-06, "loss": 1.4397, "step": 3880 }, { "epoch": 0.03, "grad_norm": 4.525314572585726, "learning_rate": 1.9997889736745487e-06, "loss": 1.3641, "step": 3881 }, { "epoch": 0.03, "grad_norm": 4.80604323906331, "learning_rate": 1.9997888646349437e-06, "loss": 1.5621, "step": 3882 }, { "epoch": 0.03, "grad_norm": 4.986489589551769, "learning_rate": 1.9997887555671778e-06, "loss": 1.4912, "step": 3883 }, { "epoch": 0.03, "grad_norm": 4.763610048296777, "learning_rate": 1.9997886464712514e-06, "loss": 1.3143, "step": 3884 }, { "epoch": 0.03, "grad_norm": 4.603312739640314, "learning_rate": 1.999788537347164e-06, "loss": 1.3087, "step": 3885 }, { "epoch": 0.03, "grad_norm": 5.860926703214441, "learning_rate": 1.999788428194916e-06, "loss": 1.6144, "step": 3886 }, { "epoch": 0.03, "grad_norm": 4.379271845669785, "learning_rate": 1.999788319014507e-06, "loss": 1.4011, "step": 3887 }, { "epoch": 0.03, "grad_norm": 4.9033106276724725, "learning_rate": 1.999788209805938e-06, "loss": 1.4461, "step": 3888 }, { "epoch": 0.03, "grad_norm": 4.392198784426966, "learning_rate": 1.9997881005692078e-06, "loss": 1.4313, "step": 3889 }, { "epoch": 0.03, "grad_norm": 4.716333770686667, "learning_rate": 1.9997879913043174e-06, "loss": 1.3279, "step": 3890 }, { "epoch": 0.03, "grad_norm": 4.544317756727553, "learning_rate": 1.9997878820112657e-06, "loss": 1.5095, "step": 3891 }, { "epoch": 0.03, "grad_norm": 4.973078920371229, "learning_rate": 1.999787772690053e-06, "loss": 1.4075, "step": 3892 }, { "epoch": 0.03, "grad_norm": 6.727864529523832, "learning_rate": 1.9997876633406806e-06, "loss": 1.335, "step": 3893 }, { "epoch": 0.03, "grad_norm": 4.290783889756272, "learning_rate": 1.9997875539631467e-06, "loss": 1.2755, "step": 3894 }, { "epoch": 0.03, "grad_norm": 4.945830139530306, "learning_rate": 1.9997874445574524e-06, "loss": 1.4965, "step": 3895 }, { "epoch": 0.03, "grad_norm": 4.587699772861401, "learning_rate": 1.9997873351235976e-06, "loss": 1.4156, "step": 3896 }, { "epoch": 0.03, "grad_norm": 4.5866138092121655, "learning_rate": 1.999787225661582e-06, "loss": 1.3592, "step": 3897 }, { "epoch": 0.03, "grad_norm": 5.379860127454544, "learning_rate": 1.9997871161714053e-06, "loss": 1.549, "step": 3898 }, { "epoch": 0.03, "grad_norm": 4.702300945376333, "learning_rate": 1.9997870066530683e-06, "loss": 1.3092, "step": 3899 }, { "epoch": 0.03, "grad_norm": 4.380505561726293, "learning_rate": 1.9997868971065704e-06, "loss": 1.4049, "step": 3900 }, { "epoch": 0.03, "grad_norm": 5.2285487381602485, "learning_rate": 1.9997867875319125e-06, "loss": 1.474, "step": 3901 }, { "epoch": 0.03, "grad_norm": 5.021070133158655, "learning_rate": 1.9997866779290932e-06, "loss": 1.3639, "step": 3902 }, { "epoch": 0.03, "grad_norm": 4.329776684621402, "learning_rate": 1.9997865682981135e-06, "loss": 1.3514, "step": 3903 }, { "epoch": 0.03, "grad_norm": 4.394962775087942, "learning_rate": 1.9997864586389734e-06, "loss": 1.334, "step": 3904 }, { "epoch": 0.03, "grad_norm": 4.786666533051566, "learning_rate": 1.9997863489516724e-06, "loss": 1.3591, "step": 3905 }, { "epoch": 0.03, "grad_norm": 4.735286968414297, "learning_rate": 1.9997862392362104e-06, "loss": 1.407, "step": 3906 }, { "epoch": 0.03, "grad_norm": 4.267518003297955, "learning_rate": 1.999786129492588e-06, "loss": 1.1932, "step": 3907 }, { "epoch": 0.03, "grad_norm": 4.515666789178671, "learning_rate": 1.9997860197208052e-06, "loss": 1.3637, "step": 3908 }, { "epoch": 0.03, "grad_norm": 4.9980776201015225, "learning_rate": 1.9997859099208615e-06, "loss": 1.4577, "step": 3909 }, { "epoch": 0.03, "grad_norm": 4.211434821041643, "learning_rate": 1.9997858000927574e-06, "loss": 1.1688, "step": 3910 }, { "epoch": 0.03, "grad_norm": 7.888132205330383, "learning_rate": 1.9997856902364923e-06, "loss": 1.3159, "step": 3911 }, { "epoch": 0.03, "grad_norm": 5.941494171099467, "learning_rate": 1.999785580352067e-06, "loss": 1.425, "step": 3912 }, { "epoch": 0.03, "grad_norm": 4.869224817390615, "learning_rate": 1.9997854704394804e-06, "loss": 1.438, "step": 3913 }, { "epoch": 0.03, "grad_norm": 5.297649530067625, "learning_rate": 1.9997853604987336e-06, "loss": 1.4902, "step": 3914 }, { "epoch": 0.03, "grad_norm": 4.992406399386439, "learning_rate": 1.9997852505298263e-06, "loss": 1.5681, "step": 3915 }, { "epoch": 0.03, "grad_norm": 4.813668485681865, "learning_rate": 1.999785140532758e-06, "loss": 1.3887, "step": 3916 }, { "epoch": 0.03, "grad_norm": 5.317508995070949, "learning_rate": 1.9997850305075294e-06, "loss": 1.4104, "step": 3917 }, { "epoch": 0.03, "grad_norm": 5.054561762537068, "learning_rate": 1.99978492045414e-06, "loss": 1.4745, "step": 3918 }, { "epoch": 0.03, "grad_norm": 5.188130409149444, "learning_rate": 1.9997848103725904e-06, "loss": 1.428, "step": 3919 }, { "epoch": 0.03, "grad_norm": 5.10362490199844, "learning_rate": 1.9997847002628795e-06, "loss": 1.3174, "step": 3920 }, { "epoch": 0.03, "grad_norm": 5.006496992647117, "learning_rate": 1.999784590125008e-06, "loss": 1.3294, "step": 3921 }, { "epoch": 0.03, "grad_norm": 4.704361775147659, "learning_rate": 1.999784479958977e-06, "loss": 1.4846, "step": 3922 }, { "epoch": 0.03, "grad_norm": 4.858956613742092, "learning_rate": 1.999784369764784e-06, "loss": 1.373, "step": 3923 }, { "epoch": 0.03, "grad_norm": 4.3817809669340875, "learning_rate": 1.9997842595424315e-06, "loss": 1.3252, "step": 3924 }, { "epoch": 0.03, "grad_norm": 8.61682900236818, "learning_rate": 1.999784149291918e-06, "loss": 1.4753, "step": 3925 }, { "epoch": 0.03, "grad_norm": 4.268471550971707, "learning_rate": 1.9997840390132435e-06, "loss": 1.2907, "step": 3926 }, { "epoch": 0.03, "grad_norm": 5.435472253254623, "learning_rate": 1.999783928706409e-06, "loss": 1.3373, "step": 3927 }, { "epoch": 0.03, "grad_norm": 4.8016056109497365, "learning_rate": 1.9997838183714136e-06, "loss": 1.3735, "step": 3928 }, { "epoch": 0.03, "grad_norm": 4.496787612559162, "learning_rate": 1.9997837080082574e-06, "loss": 1.354, "step": 3929 }, { "epoch": 0.03, "grad_norm": 4.882937080001829, "learning_rate": 1.999783597616941e-06, "loss": 1.4851, "step": 3930 }, { "epoch": 0.03, "grad_norm": 4.5668329007476745, "learning_rate": 1.999783487197464e-06, "loss": 1.3022, "step": 3931 }, { "epoch": 0.03, "grad_norm": 4.455114251929893, "learning_rate": 1.9997833767498263e-06, "loss": 1.3397, "step": 3932 }, { "epoch": 0.03, "grad_norm": 5.649865875669786, "learning_rate": 1.999783266274028e-06, "loss": 1.4842, "step": 3933 }, { "epoch": 0.03, "grad_norm": 4.845876493389327, "learning_rate": 1.9997831557700693e-06, "loss": 1.3143, "step": 3934 }, { "epoch": 0.03, "grad_norm": 5.305138874854907, "learning_rate": 1.99978304523795e-06, "loss": 1.2055, "step": 3935 }, { "epoch": 0.03, "grad_norm": 5.454117689922595, "learning_rate": 1.99978293467767e-06, "loss": 1.3859, "step": 3936 }, { "epoch": 0.03, "grad_norm": 4.579488331559357, "learning_rate": 1.9997828240892293e-06, "loss": 1.4162, "step": 3937 }, { "epoch": 0.03, "grad_norm": 4.254227326782326, "learning_rate": 1.9997827134726285e-06, "loss": 1.2983, "step": 3938 }, { "epoch": 0.03, "grad_norm": 5.061593250958671, "learning_rate": 1.999782602827867e-06, "loss": 1.3513, "step": 3939 }, { "epoch": 0.03, "grad_norm": 5.029857391932401, "learning_rate": 1.9997824921549447e-06, "loss": 1.5465, "step": 3940 }, { "epoch": 0.03, "grad_norm": 4.960537566076112, "learning_rate": 1.999782381453862e-06, "loss": 1.404, "step": 3941 }, { "epoch": 0.03, "grad_norm": 4.556948603808782, "learning_rate": 1.9997822707246186e-06, "loss": 1.4234, "step": 3942 }, { "epoch": 0.03, "eval_loss": 1.608933448791504, "eval_runtime": 4.6101, "eval_samples_per_second": 1.952, "eval_steps_per_second": 1.085, "step": 3942 }, { "epoch": 0.03, "grad_norm": 4.580049744581183, "learning_rate": 1.999782159967215e-06, "loss": 1.3563, "step": 3943 }, { "epoch": 0.03, "grad_norm": 5.193426372026157, "learning_rate": 1.9997820491816508e-06, "loss": 1.3179, "step": 3944 }, { "epoch": 0.03, "grad_norm": 4.728904733040104, "learning_rate": 1.999781938367926e-06, "loss": 1.4606, "step": 3945 }, { "epoch": 0.03, "grad_norm": 4.667975939477055, "learning_rate": 1.9997818275260406e-06, "loss": 1.5067, "step": 3946 }, { "epoch": 0.03, "grad_norm": 4.581299171603196, "learning_rate": 1.9997817166559945e-06, "loss": 1.3725, "step": 3947 }, { "epoch": 0.03, "grad_norm": 5.191068787238222, "learning_rate": 1.999781605757788e-06, "loss": 1.4962, "step": 3948 }, { "epoch": 0.03, "grad_norm": 5.084539932740616, "learning_rate": 1.999781494831421e-06, "loss": 1.5835, "step": 3949 }, { "epoch": 0.03, "grad_norm": 4.796027609690265, "learning_rate": 1.9997813838768936e-06, "loss": 1.5041, "step": 3950 }, { "epoch": 0.03, "grad_norm": 4.976392472286276, "learning_rate": 1.9997812728942056e-06, "loss": 1.4992, "step": 3951 }, { "epoch": 0.03, "grad_norm": 5.559457510155157, "learning_rate": 1.999781161883357e-06, "loss": 1.4291, "step": 3952 }, { "epoch": 0.03, "grad_norm": 4.605119207455393, "learning_rate": 1.999781050844348e-06, "loss": 1.4741, "step": 3953 }, { "epoch": 0.03, "grad_norm": 4.808301488895371, "learning_rate": 1.9997809397771785e-06, "loss": 1.4037, "step": 3954 }, { "epoch": 0.03, "grad_norm": 6.4411593111650065, "learning_rate": 1.9997808286818486e-06, "loss": 1.4531, "step": 3955 }, { "epoch": 0.03, "grad_norm": 4.740355649812272, "learning_rate": 1.999780717558358e-06, "loss": 1.4348, "step": 3956 }, { "epoch": 0.03, "grad_norm": 4.972282316537102, "learning_rate": 1.9997806064067067e-06, "loss": 1.4094, "step": 3957 }, { "epoch": 0.03, "grad_norm": 5.905784470635161, "learning_rate": 1.999780495226895e-06, "loss": 1.3617, "step": 3958 }, { "epoch": 0.03, "grad_norm": 4.796865007354928, "learning_rate": 1.9997803840189235e-06, "loss": 1.3523, "step": 3959 }, { "epoch": 0.03, "grad_norm": 4.536181944864147, "learning_rate": 1.999780272782791e-06, "loss": 1.5046, "step": 3960 }, { "epoch": 0.03, "grad_norm": 4.6749505654603345, "learning_rate": 1.999780161518498e-06, "loss": 1.384, "step": 3961 }, { "epoch": 0.03, "grad_norm": 4.903052596720311, "learning_rate": 1.9997800502260445e-06, "loss": 1.5826, "step": 3962 }, { "epoch": 0.03, "grad_norm": 4.796271062985311, "learning_rate": 1.9997799389054305e-06, "loss": 1.4059, "step": 3963 }, { "epoch": 0.03, "grad_norm": 5.2515946784816405, "learning_rate": 1.999779827556656e-06, "loss": 1.2894, "step": 3964 }, { "epoch": 0.03, "grad_norm": 4.225795228441177, "learning_rate": 1.999779716179721e-06, "loss": 1.3054, "step": 3965 }, { "epoch": 0.03, "grad_norm": 4.559518911975675, "learning_rate": 1.9997796047746256e-06, "loss": 1.4068, "step": 3966 }, { "epoch": 0.03, "grad_norm": 4.634714711026586, "learning_rate": 1.99977949334137e-06, "loss": 1.4012, "step": 3967 }, { "epoch": 0.03, "grad_norm": 4.584694799901643, "learning_rate": 1.9997793818799537e-06, "loss": 1.4366, "step": 3968 }, { "epoch": 0.03, "grad_norm": 4.693090668823183, "learning_rate": 1.999779270390377e-06, "loss": 1.4605, "step": 3969 }, { "epoch": 0.03, "grad_norm": 5.086855314852408, "learning_rate": 1.9997791588726395e-06, "loss": 1.5038, "step": 3970 }, { "epoch": 0.03, "grad_norm": 4.39443386405979, "learning_rate": 1.999779047326742e-06, "loss": 1.4719, "step": 3971 }, { "epoch": 0.03, "grad_norm": 4.706233349183156, "learning_rate": 1.999778935752684e-06, "loss": 1.6196, "step": 3972 }, { "epoch": 0.03, "grad_norm": 4.820647267957313, "learning_rate": 1.999778824150465e-06, "loss": 1.2734, "step": 3973 }, { "epoch": 0.03, "grad_norm": 4.934592565312224, "learning_rate": 1.999778712520086e-06, "loss": 1.4111, "step": 3974 }, { "epoch": 0.03, "grad_norm": 4.799879242679965, "learning_rate": 1.9997786008615466e-06, "loss": 1.34, "step": 3975 }, { "epoch": 0.03, "grad_norm": 4.585473450773626, "learning_rate": 1.9997784891748468e-06, "loss": 1.2118, "step": 3976 }, { "epoch": 0.03, "grad_norm": 4.702753028411549, "learning_rate": 1.9997783774599864e-06, "loss": 1.4267, "step": 3977 }, { "epoch": 0.03, "grad_norm": 4.570246768461574, "learning_rate": 1.9997782657169657e-06, "loss": 1.2998, "step": 3978 }, { "epoch": 0.03, "grad_norm": 4.437431745240078, "learning_rate": 1.9997781539457844e-06, "loss": 1.3523, "step": 3979 }, { "epoch": 0.03, "grad_norm": 4.556007501661391, "learning_rate": 1.999778042146443e-06, "loss": 1.1876, "step": 3980 }, { "epoch": 0.03, "grad_norm": 4.68235134801429, "learning_rate": 1.999777930318941e-06, "loss": 1.427, "step": 3981 }, { "epoch": 0.03, "grad_norm": 5.2477374067298985, "learning_rate": 1.999777818463279e-06, "loss": 1.3294, "step": 3982 }, { "epoch": 0.03, "grad_norm": 4.969973240210854, "learning_rate": 1.999777706579456e-06, "loss": 1.445, "step": 3983 }, { "epoch": 0.03, "grad_norm": 6.475255051378398, "learning_rate": 1.9997775946674723e-06, "loss": 1.4502, "step": 3984 }, { "epoch": 0.03, "grad_norm": 4.7569190490790785, "learning_rate": 1.9997774827273287e-06, "loss": 1.323, "step": 3985 }, { "epoch": 0.03, "grad_norm": 10.463910544824667, "learning_rate": 1.9997773707590247e-06, "loss": 1.402, "step": 3986 }, { "epoch": 0.03, "grad_norm": 4.911192150564717, "learning_rate": 1.9997772587625602e-06, "loss": 1.5394, "step": 3987 }, { "epoch": 0.03, "grad_norm": 5.0936779839536355, "learning_rate": 1.9997771467379353e-06, "loss": 1.3797, "step": 3988 }, { "epoch": 0.03, "grad_norm": 4.327450789194284, "learning_rate": 1.99977703468515e-06, "loss": 1.3015, "step": 3989 }, { "epoch": 0.03, "grad_norm": 5.711022700492837, "learning_rate": 1.999776922604204e-06, "loss": 1.3313, "step": 3990 }, { "epoch": 0.03, "grad_norm": 4.507993646496012, "learning_rate": 1.9997768104950982e-06, "loss": 1.327, "step": 3991 }, { "epoch": 0.03, "grad_norm": 6.8173266094513165, "learning_rate": 1.999776698357832e-06, "loss": 1.3506, "step": 3992 }, { "epoch": 0.03, "grad_norm": 4.443999046130702, "learning_rate": 1.999776586192405e-06, "loss": 1.4122, "step": 3993 }, { "epoch": 0.03, "grad_norm": 6.275925888937289, "learning_rate": 1.999776473998818e-06, "loss": 1.2967, "step": 3994 }, { "epoch": 0.03, "grad_norm": 5.588704321059442, "learning_rate": 1.99977636177707e-06, "loss": 1.5474, "step": 3995 }, { "epoch": 0.03, "grad_norm": 4.361090508428173, "learning_rate": 1.999776249527162e-06, "loss": 1.3251, "step": 3996 }, { "epoch": 0.03, "grad_norm": 4.536710047638297, "learning_rate": 1.999776137249094e-06, "loss": 1.4165, "step": 3997 }, { "epoch": 0.03, "grad_norm": 4.879124345953217, "learning_rate": 1.999776024942865e-06, "loss": 1.5919, "step": 3998 }, { "epoch": 0.03, "grad_norm": 4.679159468660342, "learning_rate": 1.999775912608476e-06, "loss": 1.3357, "step": 3999 }, { "epoch": 0.03, "grad_norm": 7.831562575301233, "learning_rate": 1.9997758002459265e-06, "loss": 1.3025, "step": 4000 }, { "epoch": 0.03, "grad_norm": 4.7864056988022545, "learning_rate": 1.999775687855217e-06, "loss": 1.2995, "step": 4001 }, { "epoch": 0.03, "grad_norm": 5.4227260202558725, "learning_rate": 1.9997755754363464e-06, "loss": 1.3219, "step": 4002 }, { "epoch": 0.03, "grad_norm": 4.969825322270051, "learning_rate": 1.999775462989316e-06, "loss": 1.4753, "step": 4003 }, { "epoch": 0.03, "grad_norm": 4.532631194521758, "learning_rate": 1.9997753505141253e-06, "loss": 1.3181, "step": 4004 }, { "epoch": 0.03, "grad_norm": 4.679272810032164, "learning_rate": 1.999775238010774e-06, "loss": 1.631, "step": 4005 }, { "epoch": 0.03, "grad_norm": 5.182029465563357, "learning_rate": 1.9997751254792624e-06, "loss": 1.4744, "step": 4006 }, { "epoch": 0.03, "grad_norm": 4.411593057957463, "learning_rate": 1.9997750129195905e-06, "loss": 1.1997, "step": 4007 }, { "epoch": 0.03, "grad_norm": 5.033161434093656, "learning_rate": 1.999774900331758e-06, "loss": 1.4425, "step": 4008 }, { "epoch": 0.03, "grad_norm": 4.408013600366345, "learning_rate": 1.9997747877157657e-06, "loss": 1.3365, "step": 4009 }, { "epoch": 0.03, "grad_norm": 4.363187234933129, "learning_rate": 1.999774675071613e-06, "loss": 1.4402, "step": 4010 }, { "epoch": 0.03, "grad_norm": 4.169462839627568, "learning_rate": 1.9997745623992995e-06, "loss": 1.1835, "step": 4011 }, { "epoch": 0.03, "grad_norm": 4.882548943942266, "learning_rate": 1.999774449698826e-06, "loss": 1.4599, "step": 4012 }, { "epoch": 0.03, "grad_norm": 4.443178167442118, "learning_rate": 1.999774336970192e-06, "loss": 1.4095, "step": 4013 }, { "epoch": 0.03, "grad_norm": 4.936351614197963, "learning_rate": 1.999774224213398e-06, "loss": 1.3369, "step": 4014 }, { "epoch": 0.03, "grad_norm": 4.376647493775191, "learning_rate": 1.9997741114284433e-06, "loss": 1.2785, "step": 4015 }, { "epoch": 0.03, "eval_loss": 1.605502724647522, "eval_runtime": 4.6011, "eval_samples_per_second": 1.956, "eval_steps_per_second": 1.087, "step": 4015 }, { "epoch": 0.03, "grad_norm": 5.81702014427679, "learning_rate": 1.999773998615329e-06, "loss": 1.2795, "step": 4016 }, { "epoch": 0.03, "grad_norm": 4.439799757446666, "learning_rate": 1.9997738857740534e-06, "loss": 1.3988, "step": 4017 }, { "epoch": 0.03, "grad_norm": 4.5217208816813965, "learning_rate": 1.999773772904618e-06, "loss": 1.3703, "step": 4018 }, { "epoch": 0.03, "grad_norm": 4.411478052834264, "learning_rate": 1.9997736600070224e-06, "loss": 1.3704, "step": 4019 }, { "epoch": 0.03, "grad_norm": 5.015421910792882, "learning_rate": 1.9997735470812662e-06, "loss": 1.4253, "step": 4020 }, { "epoch": 0.03, "grad_norm": 4.649695545573731, "learning_rate": 1.99977343412735e-06, "loss": 1.2163, "step": 4021 }, { "epoch": 0.03, "grad_norm": 5.447350413974432, "learning_rate": 1.9997733211452733e-06, "loss": 1.4804, "step": 4022 }, { "epoch": 0.03, "grad_norm": 6.597818567857608, "learning_rate": 1.9997732081350366e-06, "loss": 1.6808, "step": 4023 }, { "epoch": 0.03, "grad_norm": 5.218633549402363, "learning_rate": 1.9997730950966395e-06, "loss": 1.4144, "step": 4024 }, { "epoch": 0.03, "grad_norm": 5.030621850118808, "learning_rate": 1.999772982030082e-06, "loss": 1.3481, "step": 4025 }, { "epoch": 0.03, "grad_norm": 5.31776062764359, "learning_rate": 1.9997728689353642e-06, "loss": 1.4274, "step": 4026 }, { "epoch": 0.03, "grad_norm": 4.876598254594031, "learning_rate": 1.999772755812486e-06, "loss": 1.3901, "step": 4027 }, { "epoch": 0.03, "grad_norm": 7.952739929942838, "learning_rate": 1.999772642661448e-06, "loss": 1.3196, "step": 4028 }, { "epoch": 0.03, "grad_norm": 4.874045250987033, "learning_rate": 1.999772529482249e-06, "loss": 1.4525, "step": 4029 }, { "epoch": 0.03, "grad_norm": 4.72947836940623, "learning_rate": 1.9997724162748904e-06, "loss": 1.5113, "step": 4030 }, { "epoch": 0.03, "grad_norm": 4.711468232289795, "learning_rate": 1.9997723030393713e-06, "loss": 1.293, "step": 4031 }, { "epoch": 0.03, "grad_norm": 5.70777791026347, "learning_rate": 1.999772189775692e-06, "loss": 1.4066, "step": 4032 }, { "epoch": 0.03, "grad_norm": 5.179985782882995, "learning_rate": 1.999772076483852e-06, "loss": 1.2818, "step": 4033 }, { "epoch": 0.03, "grad_norm": 4.482145322299973, "learning_rate": 1.9997719631638525e-06, "loss": 1.3937, "step": 4034 }, { "epoch": 0.03, "grad_norm": 4.865100926594526, "learning_rate": 1.9997718498156925e-06, "loss": 1.4866, "step": 4035 }, { "epoch": 0.03, "grad_norm": 5.319505921788666, "learning_rate": 1.999771736439372e-06, "loss": 1.2136, "step": 4036 }, { "epoch": 0.03, "grad_norm": 5.024642816718735, "learning_rate": 1.9997716230348914e-06, "loss": 1.4371, "step": 4037 }, { "epoch": 0.03, "grad_norm": 4.5752971681817405, "learning_rate": 1.9997715096022504e-06, "loss": 1.4228, "step": 4038 }, { "epoch": 0.03, "grad_norm": 4.785779724868782, "learning_rate": 1.9997713961414493e-06, "loss": 1.5482, "step": 4039 }, { "epoch": 0.03, "grad_norm": 4.877259192009734, "learning_rate": 1.999771282652488e-06, "loss": 1.3691, "step": 4040 }, { "epoch": 0.03, "grad_norm": 4.6319508420832305, "learning_rate": 1.9997711691353663e-06, "loss": 1.2074, "step": 4041 }, { "epoch": 0.03, "grad_norm": 4.163212300782814, "learning_rate": 1.9997710555900847e-06, "loss": 1.2008, "step": 4042 }, { "epoch": 0.03, "grad_norm": 4.950105617648507, "learning_rate": 1.9997709420166427e-06, "loss": 1.5649, "step": 4043 }, { "epoch": 0.03, "grad_norm": 4.442018378058149, "learning_rate": 1.9997708284150406e-06, "loss": 1.3645, "step": 4044 }, { "epoch": 0.03, "grad_norm": 4.70018315449476, "learning_rate": 1.999770714785278e-06, "loss": 1.2346, "step": 4045 }, { "epoch": 0.03, "grad_norm": 4.530758072659238, "learning_rate": 1.9997706011273555e-06, "loss": 1.4645, "step": 4046 }, { "epoch": 0.03, "grad_norm": 5.323004667459915, "learning_rate": 1.9997704874412725e-06, "loss": 1.3723, "step": 4047 }, { "epoch": 0.03, "grad_norm": 5.060895357255538, "learning_rate": 1.9997703737270294e-06, "loss": 1.5739, "step": 4048 }, { "epoch": 0.03, "grad_norm": 4.787970121653534, "learning_rate": 1.999770259984626e-06, "loss": 1.4104, "step": 4049 }, { "epoch": 0.03, "grad_norm": 12.40177745756567, "learning_rate": 1.9997701462140624e-06, "loss": 1.3771, "step": 4050 }, { "epoch": 0.03, "grad_norm": 4.320482435712474, "learning_rate": 1.9997700324153392e-06, "loss": 1.4717, "step": 4051 }, { "epoch": 0.03, "grad_norm": 5.2794455358094, "learning_rate": 1.9997699185884552e-06, "loss": 1.3222, "step": 4052 }, { "epoch": 0.03, "grad_norm": 5.730462111184971, "learning_rate": 1.999769804733411e-06, "loss": 1.1816, "step": 4053 }, { "epoch": 0.03, "grad_norm": 5.4716075048747275, "learning_rate": 1.9997696908502066e-06, "loss": 1.4838, "step": 4054 }, { "epoch": 0.03, "grad_norm": 5.268128589727725, "learning_rate": 1.999769576938842e-06, "loss": 1.4901, "step": 4055 }, { "epoch": 0.03, "grad_norm": 4.641299576222133, "learning_rate": 1.9997694629993175e-06, "loss": 1.3331, "step": 4056 }, { "epoch": 0.03, "grad_norm": 4.538353714479188, "learning_rate": 1.9997693490316324e-06, "loss": 1.5125, "step": 4057 }, { "epoch": 0.03, "grad_norm": 4.643445171399679, "learning_rate": 1.9997692350357878e-06, "loss": 1.514, "step": 4058 }, { "epoch": 0.03, "grad_norm": 4.488777963568965, "learning_rate": 1.9997691210117822e-06, "loss": 1.4551, "step": 4059 }, { "epoch": 0.03, "grad_norm": 4.987986794015922, "learning_rate": 1.999769006959617e-06, "loss": 1.3485, "step": 4060 }, { "epoch": 0.03, "grad_norm": 4.537368928551477, "learning_rate": 1.999768892879292e-06, "loss": 1.2985, "step": 4061 }, { "epoch": 0.03, "grad_norm": 4.581573228768927, "learning_rate": 1.999768778770806e-06, "loss": 1.4023, "step": 4062 }, { "epoch": 0.03, "grad_norm": 4.642888622590522, "learning_rate": 1.99976866463416e-06, "loss": 1.2889, "step": 4063 }, { "epoch": 0.03, "grad_norm": 4.49138054428745, "learning_rate": 1.999768550469354e-06, "loss": 1.3256, "step": 4064 }, { "epoch": 0.03, "grad_norm": 5.93881560386753, "learning_rate": 1.999768436276388e-06, "loss": 1.4778, "step": 4065 }, { "epoch": 0.03, "grad_norm": 7.093363337367287, "learning_rate": 1.9997683220552617e-06, "loss": 1.7425, "step": 4066 }, { "epoch": 0.03, "grad_norm": 4.899836569710874, "learning_rate": 1.999768207805975e-06, "loss": 1.4224, "step": 4067 }, { "epoch": 0.03, "grad_norm": 4.731944790670232, "learning_rate": 1.9997680935285287e-06, "loss": 1.3661, "step": 4068 }, { "epoch": 0.03, "grad_norm": 6.195312360802836, "learning_rate": 1.999767979222922e-06, "loss": 1.7585, "step": 4069 }, { "epoch": 0.03, "grad_norm": 5.572812404807666, "learning_rate": 1.999767864889155e-06, "loss": 1.3825, "step": 4070 }, { "epoch": 0.03, "grad_norm": 4.475306164864985, "learning_rate": 1.999767750527228e-06, "loss": 1.4231, "step": 4071 }, { "epoch": 0.03, "grad_norm": 4.587683114934448, "learning_rate": 1.999767636137141e-06, "loss": 1.4282, "step": 4072 }, { "epoch": 0.03, "grad_norm": 4.592435342857987, "learning_rate": 1.9997675217188935e-06, "loss": 1.3084, "step": 4073 }, { "epoch": 0.03, "grad_norm": 5.965298595979578, "learning_rate": 1.999767407272486e-06, "loss": 1.5104, "step": 4074 }, { "epoch": 0.03, "grad_norm": 7.8149677606284405, "learning_rate": 1.9997672927979184e-06, "loss": 1.0865, "step": 4075 }, { "epoch": 0.03, "grad_norm": 5.302380168510148, "learning_rate": 1.9997671782951906e-06, "loss": 1.4832, "step": 4076 }, { "epoch": 0.03, "grad_norm": 6.822340289927441, "learning_rate": 1.999767063764303e-06, "loss": 1.2806, "step": 4077 }, { "epoch": 0.03, "grad_norm": 4.7306444489547586, "learning_rate": 1.9997669492052553e-06, "loss": 1.5743, "step": 4078 }, { "epoch": 0.03, "grad_norm": 4.772577525233655, "learning_rate": 1.999766834618047e-06, "loss": 1.4067, "step": 4079 }, { "epoch": 0.03, "grad_norm": 4.995578016089388, "learning_rate": 1.9997667200026795e-06, "loss": 1.5044, "step": 4080 }, { "epoch": 0.03, "grad_norm": 6.540585852473317, "learning_rate": 1.999766605359151e-06, "loss": 1.3988, "step": 4081 }, { "epoch": 0.03, "grad_norm": 4.9242613191224045, "learning_rate": 1.9997664906874626e-06, "loss": 1.4696, "step": 4082 }, { "epoch": 0.03, "grad_norm": 7.773277695675728, "learning_rate": 1.9997663759876146e-06, "loss": 1.4097, "step": 4083 }, { "epoch": 0.03, "grad_norm": 5.282637675158807, "learning_rate": 1.9997662612596057e-06, "loss": 1.3417, "step": 4084 }, { "epoch": 0.03, "grad_norm": 4.265435799372886, "learning_rate": 1.999766146503437e-06, "loss": 1.4152, "step": 4085 }, { "epoch": 0.03, "grad_norm": 4.574823733091198, "learning_rate": 1.9997660317191085e-06, "loss": 1.3375, "step": 4086 }, { "epoch": 0.03, "grad_norm": 5.216863286965063, "learning_rate": 1.99976591690662e-06, "loss": 1.4868, "step": 4087 }, { "epoch": 0.03, "grad_norm": 6.157100168223549, "learning_rate": 1.999765802065971e-06, "loss": 1.5008, "step": 4088 }, { "epoch": 0.03, "eval_loss": 1.6094391345977783, "eval_runtime": 4.5946, "eval_samples_per_second": 1.959, "eval_steps_per_second": 1.088, "step": 4088 }, { "epoch": 0.03, "grad_norm": 4.660818048746123, "learning_rate": 1.999765687197162e-06, "loss": 1.3543, "step": 4089 }, { "epoch": 0.03, "grad_norm": 5.173107212430158, "learning_rate": 1.999765572300193e-06, "loss": 1.3567, "step": 4090 }, { "epoch": 0.03, "grad_norm": 5.61355918749296, "learning_rate": 1.999765457375064e-06, "loss": 1.3648, "step": 4091 }, { "epoch": 0.03, "grad_norm": 4.351935830906952, "learning_rate": 1.9997653424217747e-06, "loss": 1.3376, "step": 4092 }, { "epoch": 0.03, "grad_norm": 5.457313247573029, "learning_rate": 1.999765227440326e-06, "loss": 1.6406, "step": 4093 }, { "epoch": 0.03, "grad_norm": 5.795766409743238, "learning_rate": 1.9997651124307166e-06, "loss": 1.5296, "step": 4094 }, { "epoch": 0.03, "grad_norm": 4.498730958873877, "learning_rate": 1.999764997392947e-06, "loss": 1.3351, "step": 4095 }, { "epoch": 0.03, "grad_norm": 5.274407809362513, "learning_rate": 1.999764882327018e-06, "loss": 1.4761, "step": 4096 }, { "epoch": 0.03, "grad_norm": 4.727392248230539, "learning_rate": 1.9997647672329286e-06, "loss": 1.3203, "step": 4097 }, { "epoch": 0.03, "grad_norm": 4.810818156616844, "learning_rate": 1.999764652110679e-06, "loss": 1.362, "step": 4098 }, { "epoch": 0.03, "grad_norm": 6.1412822526536734, "learning_rate": 1.9997645369602694e-06, "loss": 1.6179, "step": 4099 }, { "epoch": 0.03, "grad_norm": 4.412358584525045, "learning_rate": 1.9997644217817e-06, "loss": 1.3839, "step": 4100 }, { "epoch": 0.03, "grad_norm": 4.625604305774711, "learning_rate": 1.9997643065749704e-06, "loss": 1.4076, "step": 4101 }, { "epoch": 0.03, "grad_norm": 4.509090597845766, "learning_rate": 1.999764191340081e-06, "loss": 1.3628, "step": 4102 }, { "epoch": 0.03, "grad_norm": 5.543950159841648, "learning_rate": 1.9997640760770313e-06, "loss": 1.2521, "step": 4103 }, { "epoch": 0.03, "grad_norm": 4.60978720247385, "learning_rate": 1.9997639607858217e-06, "loss": 1.4029, "step": 4104 }, { "epoch": 0.03, "grad_norm": 4.5554718881478, "learning_rate": 1.9997638454664517e-06, "loss": 1.392, "step": 4105 }, { "epoch": 0.03, "grad_norm": 5.350967712018642, "learning_rate": 1.999763730118922e-06, "loss": 1.2342, "step": 4106 }, { "epoch": 0.03, "grad_norm": 4.898985757492, "learning_rate": 1.9997636147432323e-06, "loss": 1.4771, "step": 4107 }, { "epoch": 0.03, "grad_norm": 5.054800157005661, "learning_rate": 1.9997634993393825e-06, "loss": 1.4479, "step": 4108 }, { "epoch": 0.03, "grad_norm": 4.389617234774308, "learning_rate": 1.9997633839073728e-06, "loss": 1.3288, "step": 4109 }, { "epoch": 0.03, "grad_norm": 4.809816788091109, "learning_rate": 1.9997632684472034e-06, "loss": 1.488, "step": 4110 }, { "epoch": 0.03, "grad_norm": 5.2360227612628885, "learning_rate": 1.999763152958873e-06, "loss": 1.2076, "step": 4111 }, { "epoch": 0.03, "grad_norm": 14.706237784689693, "learning_rate": 1.9997630374423837e-06, "loss": 1.4365, "step": 4112 }, { "epoch": 0.03, "grad_norm": 4.744371788103204, "learning_rate": 1.9997629218977338e-06, "loss": 1.246, "step": 4113 }, { "epoch": 0.03, "grad_norm": 5.254304156080211, "learning_rate": 1.999762806324924e-06, "loss": 1.4989, "step": 4114 }, { "epoch": 0.03, "grad_norm": 4.519337306080607, "learning_rate": 1.9997626907239543e-06, "loss": 1.304, "step": 4115 }, { "epoch": 0.03, "grad_norm": 5.031598386685862, "learning_rate": 1.9997625750948243e-06, "loss": 1.4555, "step": 4116 }, { "epoch": 0.03, "grad_norm": 4.565192400135453, "learning_rate": 1.9997624594375346e-06, "loss": 1.3196, "step": 4117 }, { "epoch": 0.03, "grad_norm": 5.450915486662151, "learning_rate": 1.999762343752085e-06, "loss": 1.3517, "step": 4118 }, { "epoch": 0.03, "grad_norm": 4.960106285245195, "learning_rate": 1.9997622280384753e-06, "loss": 1.5341, "step": 4119 }, { "epoch": 0.03, "grad_norm": 5.545318219414653, "learning_rate": 1.9997621122967056e-06, "loss": 1.513, "step": 4120 }, { "epoch": 0.03, "grad_norm": 4.799367583447145, "learning_rate": 1.999761996526776e-06, "loss": 1.4613, "step": 4121 }, { "epoch": 0.03, "grad_norm": 5.236043014076285, "learning_rate": 1.999761880728686e-06, "loss": 1.5529, "step": 4122 }, { "epoch": 0.03, "grad_norm": 4.5854393616259195, "learning_rate": 1.9997617649024366e-06, "loss": 1.3302, "step": 4123 }, { "epoch": 0.03, "grad_norm": 4.534933565989185, "learning_rate": 1.999761649048027e-06, "loss": 1.4311, "step": 4124 }, { "epoch": 0.03, "grad_norm": 4.603501982995031, "learning_rate": 1.9997615331654573e-06, "loss": 1.4373, "step": 4125 }, { "epoch": 0.03, "grad_norm": 7.700335059679705, "learning_rate": 1.999761417254728e-06, "loss": 1.2993, "step": 4126 }, { "epoch": 0.03, "grad_norm": 5.274308323786785, "learning_rate": 1.9997613013158387e-06, "loss": 1.4238, "step": 4127 }, { "epoch": 0.03, "grad_norm": 4.7616010749136, "learning_rate": 1.999761185348789e-06, "loss": 1.3236, "step": 4128 }, { "epoch": 0.03, "grad_norm": 5.226977950686807, "learning_rate": 1.99976106935358e-06, "loss": 1.4897, "step": 4129 }, { "epoch": 0.03, "grad_norm": 4.847969653368823, "learning_rate": 1.9997609533302103e-06, "loss": 1.2738, "step": 4130 }, { "epoch": 0.03, "grad_norm": 4.578242022149572, "learning_rate": 1.9997608372786815e-06, "loss": 1.3914, "step": 4131 }, { "epoch": 0.03, "grad_norm": 4.872166781535631, "learning_rate": 1.999760721198992e-06, "loss": 1.3268, "step": 4132 }, { "epoch": 0.03, "grad_norm": 5.446930550381257, "learning_rate": 1.999760605091143e-06, "loss": 1.302, "step": 4133 }, { "epoch": 0.03, "grad_norm": 4.335950490647596, "learning_rate": 1.9997604889551344e-06, "loss": 1.1532, "step": 4134 }, { "epoch": 0.03, "grad_norm": 4.610972230760081, "learning_rate": 1.9997603727909654e-06, "loss": 1.274, "step": 4135 }, { "epoch": 0.03, "grad_norm": 4.709732156901238, "learning_rate": 1.999760256598636e-06, "loss": 1.3896, "step": 4136 }, { "epoch": 0.03, "grad_norm": 6.761050711070873, "learning_rate": 1.9997601403781474e-06, "loss": 1.2895, "step": 4137 }, { "epoch": 0.03, "grad_norm": 4.575405619876242, "learning_rate": 1.9997600241294988e-06, "loss": 1.4106, "step": 4138 }, { "epoch": 0.03, "grad_norm": 4.80841416381284, "learning_rate": 1.9997599078526905e-06, "loss": 1.3987, "step": 4139 }, { "epoch": 0.03, "grad_norm": 4.785911708746512, "learning_rate": 1.9997597915477214e-06, "loss": 1.2118, "step": 4140 }, { "epoch": 0.03, "grad_norm": 4.468887881734601, "learning_rate": 1.999759675214593e-06, "loss": 1.3385, "step": 4141 }, { "epoch": 0.03, "grad_norm": 4.772032285750059, "learning_rate": 1.9997595588533047e-06, "loss": 1.4772, "step": 4142 }, { "epoch": 0.03, "grad_norm": 4.723302125863515, "learning_rate": 1.9997594424638568e-06, "loss": 1.5156, "step": 4143 }, { "epoch": 0.03, "grad_norm": 4.9531333405724025, "learning_rate": 1.9997593260462484e-06, "loss": 1.1532, "step": 4144 }, { "epoch": 0.03, "grad_norm": 4.621563863553203, "learning_rate": 1.9997592096004803e-06, "loss": 1.4829, "step": 4145 }, { "epoch": 0.03, "grad_norm": 4.593571912036547, "learning_rate": 1.9997590931265523e-06, "loss": 1.3882, "step": 4146 }, { "epoch": 0.03, "grad_norm": 4.806925133387347, "learning_rate": 1.9997589766244646e-06, "loss": 1.4586, "step": 4147 }, { "epoch": 0.03, "grad_norm": 5.7110168635326035, "learning_rate": 1.999758860094217e-06, "loss": 1.2612, "step": 4148 }, { "epoch": 0.03, "grad_norm": 5.217842543743607, "learning_rate": 1.999758743535809e-06, "loss": 1.4013, "step": 4149 }, { "epoch": 0.03, "grad_norm": 4.717089122414292, "learning_rate": 1.999758626949242e-06, "loss": 1.2974, "step": 4150 }, { "epoch": 0.03, "grad_norm": 4.460997143690698, "learning_rate": 1.9997585103345144e-06, "loss": 1.4959, "step": 4151 }, { "epoch": 0.03, "grad_norm": 5.132835552607995, "learning_rate": 1.9997583936916274e-06, "loss": 1.3628, "step": 4152 }, { "epoch": 0.03, "grad_norm": 4.742280690702636, "learning_rate": 1.99975827702058e-06, "loss": 1.4159, "step": 4153 }, { "epoch": 0.03, "grad_norm": 4.568277599776101, "learning_rate": 1.9997581603213734e-06, "loss": 1.4161, "step": 4154 }, { "epoch": 0.03, "grad_norm": 4.917976173785172, "learning_rate": 1.9997580435940063e-06, "loss": 1.3555, "step": 4155 }, { "epoch": 0.03, "grad_norm": 4.559613125971704, "learning_rate": 1.99975792683848e-06, "loss": 1.3502, "step": 4156 }, { "epoch": 0.03, "grad_norm": 5.856554274725562, "learning_rate": 1.9997578100547932e-06, "loss": 1.5632, "step": 4157 }, { "epoch": 0.03, "grad_norm": 4.876646924043157, "learning_rate": 1.999757693242947e-06, "loss": 1.3472, "step": 4158 }, { "epoch": 0.03, "grad_norm": 5.056290835014417, "learning_rate": 1.9997575764029405e-06, "loss": 1.3679, "step": 4159 }, { "epoch": 0.03, "grad_norm": 4.428029708995, "learning_rate": 1.9997574595347745e-06, "loss": 1.2782, "step": 4160 }, { "epoch": 0.03, "grad_norm": 4.866493224404937, "learning_rate": 1.999757342638449e-06, "loss": 1.3243, "step": 4161 }, { "epoch": 0.03, "eval_loss": 1.605318307876587, "eval_runtime": 4.5957, "eval_samples_per_second": 1.958, "eval_steps_per_second": 1.088, "step": 4161 }, { "epoch": 0.03, "grad_norm": 4.778523456073136, "learning_rate": 1.999757225713963e-06, "loss": 1.4175, "step": 4162 }, { "epoch": 0.03, "grad_norm": 5.083503925951187, "learning_rate": 1.9997571087613176e-06, "loss": 1.3638, "step": 4163 }, { "epoch": 0.03, "grad_norm": 4.812126559910803, "learning_rate": 1.999756991780512e-06, "loss": 1.4237, "step": 4164 }, { "epoch": 0.03, "grad_norm": 4.547598587659892, "learning_rate": 1.9997568747715466e-06, "loss": 1.2708, "step": 4165 }, { "epoch": 0.03, "grad_norm": 6.859300660479676, "learning_rate": 1.9997567577344217e-06, "loss": 1.5507, "step": 4166 }, { "epoch": 0.03, "grad_norm": 5.433564857959006, "learning_rate": 1.9997566406691367e-06, "loss": 1.2973, "step": 4167 }, { "epoch": 0.03, "grad_norm": 7.713131837774937, "learning_rate": 1.999756523575692e-06, "loss": 1.3117, "step": 4168 }, { "epoch": 0.03, "grad_norm": 4.604610496662957, "learning_rate": 1.9997564064540875e-06, "loss": 1.4579, "step": 4169 }, { "epoch": 0.03, "grad_norm": 5.414408527970808, "learning_rate": 1.9997562893043233e-06, "loss": 1.3933, "step": 4170 }, { "epoch": 0.03, "grad_norm": 4.368761255457766, "learning_rate": 1.999756172126399e-06, "loss": 1.3328, "step": 4171 }, { "epoch": 0.03, "grad_norm": 4.676989725607843, "learning_rate": 1.9997560549203148e-06, "loss": 1.4446, "step": 4172 }, { "epoch": 0.03, "grad_norm": 4.215126493130506, "learning_rate": 1.9997559376860713e-06, "loss": 1.3812, "step": 4173 }, { "epoch": 0.03, "grad_norm": 5.209524224356203, "learning_rate": 1.9997558204236674e-06, "loss": 1.4541, "step": 4174 }, { "epoch": 0.03, "grad_norm": 6.212735909822387, "learning_rate": 1.9997557031331043e-06, "loss": 1.2672, "step": 4175 }, { "epoch": 0.03, "grad_norm": 5.0232431720701705, "learning_rate": 1.999755585814381e-06, "loss": 1.3827, "step": 4176 }, { "epoch": 0.03, "grad_norm": 4.477608594481957, "learning_rate": 1.9997554684674984e-06, "loss": 1.3674, "step": 4177 }, { "epoch": 0.03, "grad_norm": 4.968947153983377, "learning_rate": 1.9997553510924556e-06, "loss": 1.5877, "step": 4178 }, { "epoch": 0.03, "grad_norm": 4.4631320160645895, "learning_rate": 1.999755233689253e-06, "loss": 1.3394, "step": 4179 }, { "epoch": 0.03, "grad_norm": 4.981123483965869, "learning_rate": 1.9997551162578907e-06, "loss": 1.3412, "step": 4180 }, { "epoch": 0.03, "grad_norm": 5.255520313895866, "learning_rate": 1.9997549987983687e-06, "loss": 1.4561, "step": 4181 }, { "epoch": 0.03, "grad_norm": 4.758633230533075, "learning_rate": 1.9997548813106866e-06, "loss": 1.4496, "step": 4182 }, { "epoch": 0.03, "grad_norm": 4.461564864709629, "learning_rate": 1.9997547637948454e-06, "loss": 1.3857, "step": 4183 }, { "epoch": 0.03, "grad_norm": 4.644894181581171, "learning_rate": 1.9997546462508436e-06, "loss": 1.423, "step": 4184 }, { "epoch": 0.03, "grad_norm": 8.477077286868623, "learning_rate": 1.9997545286786827e-06, "loss": 1.6138, "step": 4185 }, { "epoch": 0.03, "grad_norm": 5.283875993196426, "learning_rate": 1.9997544110783618e-06, "loss": 1.3827, "step": 4186 }, { "epoch": 0.03, "grad_norm": 4.4979172950587785, "learning_rate": 1.999754293449881e-06, "loss": 1.3568, "step": 4187 }, { "epoch": 0.03, "grad_norm": 4.6655463149908165, "learning_rate": 1.9997541757932406e-06, "loss": 1.474, "step": 4188 }, { "epoch": 0.03, "grad_norm": 5.040500443762113, "learning_rate": 1.9997540581084404e-06, "loss": 1.2304, "step": 4189 }, { "epoch": 0.03, "grad_norm": 5.557090195006884, "learning_rate": 1.9997539403954806e-06, "loss": 1.54, "step": 4190 }, { "epoch": 0.03, "grad_norm": 4.624816212082906, "learning_rate": 1.9997538226543612e-06, "loss": 1.3855, "step": 4191 }, { "epoch": 0.03, "grad_norm": 4.964022659979495, "learning_rate": 1.9997537048850813e-06, "loss": 1.2982, "step": 4192 }, { "epoch": 0.03, "grad_norm": 7.541997754911193, "learning_rate": 1.9997535870876427e-06, "loss": 1.5706, "step": 4193 }, { "epoch": 0.03, "grad_norm": 5.083291429876115, "learning_rate": 1.9997534692620436e-06, "loss": 1.4258, "step": 4194 }, { "epoch": 0.03, "grad_norm": 5.895588814485954, "learning_rate": 1.9997533514082853e-06, "loss": 1.4846, "step": 4195 }, { "epoch": 0.03, "grad_norm": 4.466537159162781, "learning_rate": 1.9997532335263666e-06, "loss": 1.3688, "step": 4196 }, { "epoch": 0.03, "grad_norm": 4.885970679568281, "learning_rate": 1.9997531156162886e-06, "loss": 1.3961, "step": 4197 }, { "epoch": 0.03, "grad_norm": 4.5741866177791, "learning_rate": 1.999752997678051e-06, "loss": 1.3459, "step": 4198 }, { "epoch": 0.03, "grad_norm": 5.924564344076677, "learning_rate": 1.9997528797116535e-06, "loss": 1.4766, "step": 4199 }, { "epoch": 0.03, "grad_norm": 4.424343534513083, "learning_rate": 1.9997527617170964e-06, "loss": 1.3479, "step": 4200 }, { "epoch": 0.03, "grad_norm": 5.444463225470948, "learning_rate": 1.9997526436943796e-06, "loss": 1.4379, "step": 4201 }, { "epoch": 0.03, "grad_norm": 5.328598798186751, "learning_rate": 1.9997525256435027e-06, "loss": 1.5127, "step": 4202 }, { "epoch": 0.03, "grad_norm": 6.497152155755689, "learning_rate": 1.9997524075644663e-06, "loss": 1.4933, "step": 4203 }, { "epoch": 0.03, "grad_norm": 4.729974316974081, "learning_rate": 1.9997522894572707e-06, "loss": 1.5687, "step": 4204 }, { "epoch": 0.03, "grad_norm": 4.793359713349888, "learning_rate": 1.9997521713219146e-06, "loss": 1.4321, "step": 4205 }, { "epoch": 0.03, "grad_norm": 4.894274186268635, "learning_rate": 1.9997520531583993e-06, "loss": 1.3873, "step": 4206 }, { "epoch": 0.03, "grad_norm": 5.238111785237876, "learning_rate": 1.999751934966724e-06, "loss": 1.3967, "step": 4207 }, { "epoch": 0.03, "grad_norm": 7.193367848990965, "learning_rate": 1.9997518167468895e-06, "loss": 1.5156, "step": 4208 }, { "epoch": 0.03, "grad_norm": 4.667781403081681, "learning_rate": 1.999751698498895e-06, "loss": 1.4265, "step": 4209 }, { "epoch": 0.03, "grad_norm": 5.1243003298062195, "learning_rate": 1.999751580222741e-06, "loss": 1.3233, "step": 4210 }, { "epoch": 0.03, "grad_norm": 4.5249709916197, "learning_rate": 1.999751461918427e-06, "loss": 1.4349, "step": 4211 }, { "epoch": 0.03, "grad_norm": 6.282977119818528, "learning_rate": 1.9997513435859538e-06, "loss": 1.4503, "step": 4212 }, { "epoch": 0.03, "grad_norm": 5.194234225854601, "learning_rate": 1.9997512252253204e-06, "loss": 1.3252, "step": 4213 }, { "epoch": 0.03, "grad_norm": 4.613843200243345, "learning_rate": 1.9997511068365274e-06, "loss": 1.4896, "step": 4214 }, { "epoch": 0.03, "grad_norm": 4.912914392718041, "learning_rate": 1.999750988419575e-06, "loss": 1.3633, "step": 4215 }, { "epoch": 0.03, "grad_norm": 5.679593081472903, "learning_rate": 1.999750869974463e-06, "loss": 1.4339, "step": 4216 }, { "epoch": 0.03, "grad_norm": 5.210918637697213, "learning_rate": 1.999750751501191e-06, "loss": 1.4502, "step": 4217 }, { "epoch": 0.03, "grad_norm": 4.855739662094995, "learning_rate": 1.9997506329997597e-06, "loss": 1.4349, "step": 4218 }, { "epoch": 0.03, "grad_norm": 6.854639447100227, "learning_rate": 1.9997505144701687e-06, "loss": 1.5614, "step": 4219 }, { "epoch": 0.03, "grad_norm": 4.768086414546765, "learning_rate": 1.9997503959124176e-06, "loss": 1.44, "step": 4220 }, { "epoch": 0.03, "grad_norm": 4.885905090995814, "learning_rate": 1.9997502773265077e-06, "loss": 1.4049, "step": 4221 }, { "epoch": 0.03, "grad_norm": 4.69298720355058, "learning_rate": 1.9997501587124374e-06, "loss": 1.4697, "step": 4222 }, { "epoch": 0.03, "grad_norm": 4.5840093309788355, "learning_rate": 1.9997500400702075e-06, "loss": 1.3389, "step": 4223 }, { "epoch": 0.03, "grad_norm": 4.998602737236111, "learning_rate": 1.9997499213998184e-06, "loss": 1.506, "step": 4224 }, { "epoch": 0.03, "grad_norm": 4.726261404096954, "learning_rate": 1.9997498027012693e-06, "loss": 1.4349, "step": 4225 }, { "epoch": 0.03, "grad_norm": 4.478170412001975, "learning_rate": 1.999749683974561e-06, "loss": 1.3608, "step": 4226 }, { "epoch": 0.03, "grad_norm": 4.446580855147208, "learning_rate": 1.9997495652196926e-06, "loss": 1.3775, "step": 4227 }, { "epoch": 0.03, "grad_norm": 5.577641674378644, "learning_rate": 1.999749446436665e-06, "loss": 1.2627, "step": 4228 }, { "epoch": 0.03, "grad_norm": 4.423217408869717, "learning_rate": 1.9997493276254775e-06, "loss": 1.2576, "step": 4229 }, { "epoch": 0.03, "grad_norm": 5.538390534264384, "learning_rate": 1.9997492087861303e-06, "loss": 1.2664, "step": 4230 }, { "epoch": 0.03, "grad_norm": 4.736587950698366, "learning_rate": 1.999749089918624e-06, "loss": 1.4831, "step": 4231 }, { "epoch": 0.03, "grad_norm": 4.47980371360739, "learning_rate": 1.9997489710229575e-06, "loss": 1.3582, "step": 4232 }, { "epoch": 0.03, "grad_norm": 4.395765614873441, "learning_rate": 1.9997488520991315e-06, "loss": 1.3669, "step": 4233 }, { "epoch": 0.03, "grad_norm": 4.968108351724749, "learning_rate": 1.9997487331471463e-06, "loss": 1.6281, "step": 4234 }, { "epoch": 0.03, "eval_loss": 1.6025989055633545, "eval_runtime": 4.6471, "eval_samples_per_second": 1.937, "eval_steps_per_second": 1.076, "step": 4234 }, { "epoch": 0.03, "grad_norm": 4.969149649765911, "learning_rate": 1.999748614167001e-06, "loss": 1.3274, "step": 4235 }, { "epoch": 0.03, "grad_norm": 4.919542573577905, "learning_rate": 1.9997484951586966e-06, "loss": 1.3442, "step": 4236 }, { "epoch": 0.03, "grad_norm": 4.865965174020303, "learning_rate": 1.999748376122232e-06, "loss": 1.3527, "step": 4237 }, { "epoch": 0.03, "grad_norm": 4.615901718507112, "learning_rate": 1.9997482570576085e-06, "loss": 1.4545, "step": 4238 }, { "epoch": 0.03, "grad_norm": 5.59767405042703, "learning_rate": 1.9997481379648253e-06, "loss": 1.4731, "step": 4239 }, { "epoch": 0.03, "grad_norm": 4.775596415272808, "learning_rate": 1.9997480188438824e-06, "loss": 1.3906, "step": 4240 }, { "epoch": 0.03, "grad_norm": 5.646920434393918, "learning_rate": 1.9997478996947795e-06, "loss": 1.3994, "step": 4241 }, { "epoch": 0.03, "grad_norm": 4.685462751040186, "learning_rate": 1.9997477805175174e-06, "loss": 1.4542, "step": 4242 }, { "epoch": 0.03, "grad_norm": 4.778082014139417, "learning_rate": 1.9997476613120957e-06, "loss": 1.4046, "step": 4243 }, { "epoch": 0.03, "grad_norm": 4.761207727766869, "learning_rate": 1.9997475420785144e-06, "loss": 1.3631, "step": 4244 }, { "epoch": 0.03, "grad_norm": 4.472601210599744, "learning_rate": 1.9997474228167735e-06, "loss": 1.407, "step": 4245 }, { "epoch": 0.03, "grad_norm": 4.5421395657557015, "learning_rate": 1.999747303526873e-06, "loss": 1.2928, "step": 4246 }, { "epoch": 0.03, "grad_norm": 5.509096130761806, "learning_rate": 1.9997471842088133e-06, "loss": 1.537, "step": 4247 }, { "epoch": 0.03, "grad_norm": 4.592592930867474, "learning_rate": 1.9997470648625935e-06, "loss": 1.3722, "step": 4248 }, { "epoch": 0.03, "grad_norm": 5.370061327329297, "learning_rate": 1.9997469454882146e-06, "loss": 1.3637, "step": 4249 }, { "epoch": 0.03, "grad_norm": 4.837980437689338, "learning_rate": 1.9997468260856765e-06, "loss": 1.5129, "step": 4250 }, { "epoch": 0.03, "grad_norm": 4.585402193996551, "learning_rate": 1.999746706654978e-06, "loss": 1.2622, "step": 4251 }, { "epoch": 0.03, "grad_norm": 4.818840579081662, "learning_rate": 1.99974658719612e-06, "loss": 1.466, "step": 4252 }, { "epoch": 0.03, "grad_norm": 5.791491902379428, "learning_rate": 1.9997464677091027e-06, "loss": 1.5247, "step": 4253 }, { "epoch": 0.03, "grad_norm": 4.714629003218997, "learning_rate": 1.999746348193926e-06, "loss": 1.3524, "step": 4254 }, { "epoch": 0.03, "grad_norm": 4.8478113133379965, "learning_rate": 1.99974622865059e-06, "loss": 1.5753, "step": 4255 }, { "epoch": 0.03, "grad_norm": 4.892306075830973, "learning_rate": 1.999746109079094e-06, "loss": 1.5105, "step": 4256 }, { "epoch": 0.03, "grad_norm": 4.2873956273261005, "learning_rate": 1.999745989479439e-06, "loss": 1.3837, "step": 4257 }, { "epoch": 0.03, "grad_norm": 6.542590153947477, "learning_rate": 1.9997458698516242e-06, "loss": 1.3869, "step": 4258 }, { "epoch": 0.03, "grad_norm": 4.471949269096139, "learning_rate": 1.99974575019565e-06, "loss": 1.5125, "step": 4259 }, { "epoch": 0.03, "grad_norm": 4.5955918076682325, "learning_rate": 1.999745630511516e-06, "loss": 1.3708, "step": 4260 }, { "epoch": 0.03, "grad_norm": 4.357852850239874, "learning_rate": 1.9997455107992224e-06, "loss": 1.3624, "step": 4261 }, { "epoch": 0.03, "grad_norm": 4.717214115578147, "learning_rate": 1.99974539105877e-06, "loss": 1.4262, "step": 4262 }, { "epoch": 0.03, "grad_norm": 4.945920196271858, "learning_rate": 1.999745271290157e-06, "loss": 1.4823, "step": 4263 }, { "epoch": 0.03, "grad_norm": 6.206453568123609, "learning_rate": 1.9997451514933853e-06, "loss": 1.5297, "step": 4264 }, { "epoch": 0.03, "grad_norm": 4.516196233675721, "learning_rate": 1.999745031668454e-06, "loss": 1.4171, "step": 4265 }, { "epoch": 0.03, "grad_norm": 6.473464905423704, "learning_rate": 1.999744911815363e-06, "loss": 1.4122, "step": 4266 }, { "epoch": 0.03, "grad_norm": 4.546794668744697, "learning_rate": 1.9997447919341126e-06, "loss": 1.4924, "step": 4267 }, { "epoch": 0.03, "grad_norm": 5.336257459345282, "learning_rate": 1.999744672024703e-06, "loss": 1.6183, "step": 4268 }, { "epoch": 0.03, "grad_norm": 4.728766775376159, "learning_rate": 1.9997445520871334e-06, "loss": 1.3718, "step": 4269 }, { "epoch": 0.03, "grad_norm": 4.463755923942121, "learning_rate": 1.9997444321214047e-06, "loss": 1.3763, "step": 4270 }, { "epoch": 0.03, "grad_norm": 4.6543604819355755, "learning_rate": 1.9997443121275165e-06, "loss": 1.3928, "step": 4271 }, { "epoch": 0.03, "grad_norm": 27.24437620174219, "learning_rate": 1.9997441921054686e-06, "loss": 1.6026, "step": 4272 }, { "epoch": 0.03, "grad_norm": 4.3070462879454405, "learning_rate": 1.9997440720552616e-06, "loss": 1.3095, "step": 4273 }, { "epoch": 0.03, "grad_norm": 4.393442785560549, "learning_rate": 1.9997439519768945e-06, "loss": 1.3027, "step": 4274 }, { "epoch": 0.03, "grad_norm": 7.716817726517502, "learning_rate": 1.9997438318703687e-06, "loss": 1.3163, "step": 4275 }, { "epoch": 0.03, "grad_norm": 4.6636141014943195, "learning_rate": 1.999743711735683e-06, "loss": 1.5753, "step": 4276 }, { "epoch": 0.03, "grad_norm": 5.1333680252614915, "learning_rate": 1.9997435915728377e-06, "loss": 1.5675, "step": 4277 }, { "epoch": 0.03, "grad_norm": 4.154083856422058, "learning_rate": 1.9997434713818335e-06, "loss": 1.3517, "step": 4278 }, { "epoch": 0.03, "grad_norm": 4.374842596763697, "learning_rate": 1.999743351162669e-06, "loss": 1.3226, "step": 4279 }, { "epoch": 0.03, "grad_norm": 4.946187535235528, "learning_rate": 1.9997432309153457e-06, "loss": 1.5661, "step": 4280 }, { "epoch": 0.03, "grad_norm": 4.325828396047361, "learning_rate": 1.999743110639863e-06, "loss": 1.3612, "step": 4281 }, { "epoch": 0.03, "grad_norm": 4.949801618620817, "learning_rate": 1.999742990336221e-06, "loss": 1.4937, "step": 4282 }, { "epoch": 0.03, "grad_norm": 4.746324632370678, "learning_rate": 1.999742870004419e-06, "loss": 1.436, "step": 4283 }, { "epoch": 0.03, "grad_norm": 4.849086474371882, "learning_rate": 1.999742749644458e-06, "loss": 1.4615, "step": 4284 }, { "epoch": 0.03, "grad_norm": 5.342703878528549, "learning_rate": 1.999742629256337e-06, "loss": 1.3822, "step": 4285 }, { "epoch": 0.03, "grad_norm": 5.058841068413354, "learning_rate": 1.9997425088400573e-06, "loss": 1.307, "step": 4286 }, { "epoch": 0.03, "grad_norm": 4.643302981728043, "learning_rate": 1.999742388395618e-06, "loss": 1.4409, "step": 4287 }, { "epoch": 0.03, "grad_norm": 5.012758514415535, "learning_rate": 1.999742267923019e-06, "loss": 1.3791, "step": 4288 }, { "epoch": 0.03, "grad_norm": 5.4353063809674245, "learning_rate": 1.9997421474222605e-06, "loss": 1.2868, "step": 4289 }, { "epoch": 0.03, "grad_norm": 5.374799062006198, "learning_rate": 1.999742026893343e-06, "loss": 1.3498, "step": 4290 }, { "epoch": 0.03, "grad_norm": 4.780379286424909, "learning_rate": 1.999741906336266e-06, "loss": 1.3452, "step": 4291 }, { "epoch": 0.03, "grad_norm": 11.487378886208319, "learning_rate": 1.9997417857510293e-06, "loss": 1.3079, "step": 4292 }, { "epoch": 0.03, "grad_norm": 4.642774004664867, "learning_rate": 1.9997416651376335e-06, "loss": 1.406, "step": 4293 }, { "epoch": 0.03, "grad_norm": 4.310811163258382, "learning_rate": 1.999741544496078e-06, "loss": 1.3298, "step": 4294 }, { "epoch": 0.03, "grad_norm": 4.623612823385325, "learning_rate": 1.9997414238263633e-06, "loss": 1.3962, "step": 4295 }, { "epoch": 0.03, "grad_norm": 4.558521112096722, "learning_rate": 1.9997413031284895e-06, "loss": 1.5078, "step": 4296 }, { "epoch": 0.03, "grad_norm": 5.450927650644775, "learning_rate": 1.999741182402456e-06, "loss": 1.4211, "step": 4297 }, { "epoch": 0.03, "grad_norm": 4.526193655726679, "learning_rate": 1.999741061648263e-06, "loss": 1.4408, "step": 4298 }, { "epoch": 0.03, "grad_norm": 5.004772065752105, "learning_rate": 1.999740940865911e-06, "loss": 1.3913, "step": 4299 }, { "epoch": 0.03, "grad_norm": 4.619766434031569, "learning_rate": 1.9997408200553993e-06, "loss": 1.3461, "step": 4300 }, { "epoch": 0.03, "grad_norm": 4.333879366576657, "learning_rate": 1.9997406992167287e-06, "loss": 1.2859, "step": 4301 }, { "epoch": 0.03, "grad_norm": 4.8361764147066015, "learning_rate": 1.999740578349898e-06, "loss": 1.3816, "step": 4302 }, { "epoch": 0.03, "grad_norm": 5.052826654661089, "learning_rate": 1.9997404574549086e-06, "loss": 1.3417, "step": 4303 }, { "epoch": 0.03, "grad_norm": 4.531581716857078, "learning_rate": 1.9997403365317595e-06, "loss": 1.3593, "step": 4304 }, { "epoch": 0.03, "grad_norm": 4.643325504275953, "learning_rate": 1.999740215580451e-06, "loss": 1.4435, "step": 4305 }, { "epoch": 0.03, "grad_norm": 5.030355496702564, "learning_rate": 1.999740094600983e-06, "loss": 1.6323, "step": 4306 }, { "epoch": 0.03, "grad_norm": 4.6505798796984825, "learning_rate": 1.999739973593356e-06, "loss": 1.4138, "step": 4307 }, { "epoch": 0.03, "eval_loss": 1.6011900901794434, "eval_runtime": 4.5989, "eval_samples_per_second": 1.957, "eval_steps_per_second": 1.087, "step": 4307 }, { "epoch": 0.03, "grad_norm": 5.296543713505417, "learning_rate": 1.9997398525575694e-06, "loss": 1.3979, "step": 4308 }, { "epoch": 0.03, "grad_norm": 4.468650291170195, "learning_rate": 1.9997397314936236e-06, "loss": 1.4619, "step": 4309 }, { "epoch": 0.03, "grad_norm": 4.818137755706899, "learning_rate": 1.9997396104015186e-06, "loss": 1.458, "step": 4310 }, { "epoch": 0.03, "grad_norm": 5.463490767786852, "learning_rate": 1.999739489281254e-06, "loss": 1.3772, "step": 4311 }, { "epoch": 0.03, "grad_norm": 4.406236482586799, "learning_rate": 1.99973936813283e-06, "loss": 1.5414, "step": 4312 }, { "epoch": 0.03, "grad_norm": 4.531127476142023, "learning_rate": 1.9997392469562468e-06, "loss": 1.4123, "step": 4313 }, { "epoch": 0.03, "grad_norm": 4.200241377658454, "learning_rate": 1.999739125751504e-06, "loss": 1.3767, "step": 4314 }, { "epoch": 0.03, "grad_norm": 4.386227205256228, "learning_rate": 1.9997390045186024e-06, "loss": 1.2347, "step": 4315 }, { "epoch": 0.03, "grad_norm": 4.656000873657767, "learning_rate": 1.9997388832575414e-06, "loss": 1.3513, "step": 4316 }, { "epoch": 0.03, "grad_norm": 5.4810322108868945, "learning_rate": 1.999738761968321e-06, "loss": 1.3061, "step": 4317 }, { "epoch": 0.03, "grad_norm": 4.658496756603207, "learning_rate": 1.999738640650941e-06, "loss": 1.3777, "step": 4318 }, { "epoch": 0.03, "grad_norm": 4.500436187640294, "learning_rate": 1.999738519305402e-06, "loss": 1.3473, "step": 4319 }, { "epoch": 0.03, "grad_norm": 4.6887519574686065, "learning_rate": 1.999738397931703e-06, "loss": 1.2844, "step": 4320 }, { "epoch": 0.03, "grad_norm": 5.4163669352161055, "learning_rate": 1.999738276529846e-06, "loss": 1.3652, "step": 4321 }, { "epoch": 0.03, "grad_norm": 4.8053008839685525, "learning_rate": 1.999738155099829e-06, "loss": 1.3673, "step": 4322 }, { "epoch": 0.03, "grad_norm": 4.748319942918259, "learning_rate": 1.9997380336416523e-06, "loss": 1.4073, "step": 4323 }, { "epoch": 0.03, "grad_norm": 4.498862051942241, "learning_rate": 1.9997379121553166e-06, "loss": 1.4198, "step": 4324 }, { "epoch": 0.03, "grad_norm": 4.992001971949867, "learning_rate": 1.9997377906408217e-06, "loss": 1.4091, "step": 4325 }, { "epoch": 0.03, "grad_norm": 6.912479498619018, "learning_rate": 1.9997376690981676e-06, "loss": 1.5051, "step": 4326 }, { "epoch": 0.03, "grad_norm": 4.264456785580832, "learning_rate": 1.9997375475273543e-06, "loss": 1.2736, "step": 4327 }, { "epoch": 0.03, "grad_norm": 4.537927830371792, "learning_rate": 1.9997374259283814e-06, "loss": 1.5255, "step": 4328 }, { "epoch": 0.03, "grad_norm": 4.591063426578891, "learning_rate": 1.9997373043012493e-06, "loss": 1.3624, "step": 4329 }, { "epoch": 0.03, "grad_norm": 4.395928848650604, "learning_rate": 1.999737182645958e-06, "loss": 1.3083, "step": 4330 }, { "epoch": 0.03, "grad_norm": 5.120765046932194, "learning_rate": 1.9997370609625075e-06, "loss": 1.4978, "step": 4331 }, { "epoch": 0.03, "grad_norm": 4.54493445469966, "learning_rate": 1.9997369392508975e-06, "loss": 1.3721, "step": 4332 }, { "epoch": 0.03, "grad_norm": 4.468931325081782, "learning_rate": 1.9997368175111286e-06, "loss": 1.4308, "step": 4333 }, { "epoch": 0.03, "grad_norm": 4.6194504343603, "learning_rate": 1.9997366957432e-06, "loss": 1.1578, "step": 4334 }, { "epoch": 0.03, "grad_norm": 4.575477316877093, "learning_rate": 1.9997365739471125e-06, "loss": 1.363, "step": 4335 }, { "epoch": 0.03, "grad_norm": 4.41968250413834, "learning_rate": 1.9997364521228653e-06, "loss": 1.388, "step": 4336 }, { "epoch": 0.03, "grad_norm": 4.512169648425959, "learning_rate": 1.9997363302704593e-06, "loss": 1.3845, "step": 4337 }, { "epoch": 0.03, "grad_norm": 5.541197815911678, "learning_rate": 1.999736208389894e-06, "loss": 1.3792, "step": 4338 }, { "epoch": 0.03, "grad_norm": 5.463834495481971, "learning_rate": 1.9997360864811693e-06, "loss": 1.43, "step": 4339 }, { "epoch": 0.03, "grad_norm": 4.512957041614997, "learning_rate": 1.9997359645442857e-06, "loss": 1.491, "step": 4340 }, { "epoch": 0.03, "grad_norm": 5.009615522190885, "learning_rate": 1.999735842579242e-06, "loss": 1.562, "step": 4341 }, { "epoch": 0.03, "grad_norm": 6.241239017496779, "learning_rate": 1.9997357205860397e-06, "loss": 1.485, "step": 4342 }, { "epoch": 0.03, "grad_norm": 4.694391112562757, "learning_rate": 1.9997355985646786e-06, "loss": 1.485, "step": 4343 }, { "epoch": 0.03, "grad_norm": 5.432825957801096, "learning_rate": 1.9997354765151574e-06, "loss": 1.3394, "step": 4344 }, { "epoch": 0.03, "grad_norm": 4.493593009986748, "learning_rate": 1.9997353544374775e-06, "loss": 1.3063, "step": 4345 }, { "epoch": 0.03, "grad_norm": 4.932388240765814, "learning_rate": 1.999735232331638e-06, "loss": 1.4697, "step": 4346 }, { "epoch": 0.03, "grad_norm": 4.9824785764331825, "learning_rate": 1.9997351101976397e-06, "loss": 1.3837, "step": 4347 }, { "epoch": 0.03, "grad_norm": 5.601476576063595, "learning_rate": 1.9997349880354817e-06, "loss": 1.3557, "step": 4348 }, { "epoch": 0.03, "grad_norm": 6.4510815617501, "learning_rate": 1.999734865845165e-06, "loss": 1.4983, "step": 4349 }, { "epoch": 0.03, "grad_norm": 4.848925917928955, "learning_rate": 1.999734743626689e-06, "loss": 1.4237, "step": 4350 }, { "epoch": 0.03, "grad_norm": 4.434583454754839, "learning_rate": 1.9997346213800537e-06, "loss": 1.2911, "step": 4351 }, { "epoch": 0.03, "grad_norm": 4.892014895935739, "learning_rate": 1.9997344991052595e-06, "loss": 1.4209, "step": 4352 }, { "epoch": 0.03, "grad_norm": 4.852081686860236, "learning_rate": 1.9997343768023057e-06, "loss": 1.4132, "step": 4353 }, { "epoch": 0.03, "grad_norm": 4.45823634125208, "learning_rate": 1.9997342544711926e-06, "loss": 1.348, "step": 4354 }, { "epoch": 0.03, "grad_norm": 4.765945667775744, "learning_rate": 1.9997341321119204e-06, "loss": 1.3664, "step": 4355 }, { "epoch": 0.03, "grad_norm": 4.5299338023062985, "learning_rate": 1.9997340097244894e-06, "loss": 1.4089, "step": 4356 }, { "epoch": 0.03, "grad_norm": 4.876518993595765, "learning_rate": 1.999733887308899e-06, "loss": 1.3758, "step": 4357 }, { "epoch": 0.03, "grad_norm": 4.862541951305569, "learning_rate": 1.9997337648651495e-06, "loss": 1.3556, "step": 4358 }, { "epoch": 0.03, "grad_norm": 4.511534551832159, "learning_rate": 1.9997336423932405e-06, "loss": 1.3286, "step": 4359 }, { "epoch": 0.03, "grad_norm": 4.401748346998763, "learning_rate": 1.9997335198931723e-06, "loss": 1.3822, "step": 4360 }, { "epoch": 0.03, "grad_norm": 4.6191376348378315, "learning_rate": 1.9997333973649454e-06, "loss": 1.2467, "step": 4361 }, { "epoch": 0.03, "grad_norm": 4.436414364217913, "learning_rate": 1.9997332748085593e-06, "loss": 1.4192, "step": 4362 }, { "epoch": 0.03, "grad_norm": 5.619843931696095, "learning_rate": 1.9997331522240136e-06, "loss": 1.4819, "step": 4363 }, { "epoch": 0.03, "grad_norm": 4.4153787910247075, "learning_rate": 1.999733029611309e-06, "loss": 1.2797, "step": 4364 }, { "epoch": 0.03, "grad_norm": 4.229162307796779, "learning_rate": 1.999732906970445e-06, "loss": 1.3664, "step": 4365 }, { "epoch": 0.03, "grad_norm": 5.060199498910178, "learning_rate": 1.999732784301422e-06, "loss": 1.4569, "step": 4366 }, { "epoch": 0.03, "grad_norm": 4.440558056775924, "learning_rate": 1.99973266160424e-06, "loss": 1.4091, "step": 4367 }, { "epoch": 0.03, "grad_norm": 4.9681776240837365, "learning_rate": 1.999732538878899e-06, "loss": 1.2696, "step": 4368 }, { "epoch": 0.03, "grad_norm": 4.4641242753123365, "learning_rate": 1.999732416125399e-06, "loss": 1.2049, "step": 4369 }, { "epoch": 0.03, "grad_norm": 4.256704388098, "learning_rate": 1.9997322933437393e-06, "loss": 1.3302, "step": 4370 }, { "epoch": 0.03, "grad_norm": 4.490925212479531, "learning_rate": 1.9997321705339206e-06, "loss": 1.3916, "step": 4371 }, { "epoch": 0.03, "grad_norm": 4.665075708301038, "learning_rate": 1.999732047695943e-06, "loss": 1.3978, "step": 4372 }, { "epoch": 0.03, "grad_norm": 5.007908273488665, "learning_rate": 1.9997319248298063e-06, "loss": 1.3958, "step": 4373 }, { "epoch": 0.03, "grad_norm": 4.866326973208806, "learning_rate": 1.99973180193551e-06, "loss": 1.3788, "step": 4374 }, { "epoch": 0.03, "grad_norm": 4.494388195184433, "learning_rate": 1.999731679013055e-06, "loss": 1.4384, "step": 4375 }, { "epoch": 0.03, "grad_norm": 4.2392258897699024, "learning_rate": 1.9997315560624405e-06, "loss": 1.1994, "step": 4376 }, { "epoch": 0.03, "grad_norm": 4.782750217365004, "learning_rate": 1.999731433083667e-06, "loss": 1.1272, "step": 4377 }, { "epoch": 0.03, "grad_norm": 5.385810509679439, "learning_rate": 1.999731310076735e-06, "loss": 1.3052, "step": 4378 }, { "epoch": 0.03, "grad_norm": 4.428947438228673, "learning_rate": 1.999731187041643e-06, "loss": 1.462, "step": 4379 }, { "epoch": 0.03, "grad_norm": 4.719744538517375, "learning_rate": 1.9997310639783923e-06, "loss": 1.3086, "step": 4380 }, { "epoch": 0.03, "eval_loss": 1.6017142534255981, "eval_runtime": 4.5899, "eval_samples_per_second": 1.961, "eval_steps_per_second": 1.089, "step": 4380 }, { "epoch": 0.03, "grad_norm": 4.576827556246438, "learning_rate": 1.999730940886983e-06, "loss": 1.4093, "step": 4381 }, { "epoch": 0.03, "grad_norm": 4.659762272632282, "learning_rate": 1.999730817767414e-06, "loss": 1.4571, "step": 4382 }, { "epoch": 0.03, "grad_norm": 4.710057390158991, "learning_rate": 1.999730694619686e-06, "loss": 1.4725, "step": 4383 }, { "epoch": 0.03, "grad_norm": 5.209522358930369, "learning_rate": 1.999730571443799e-06, "loss": 1.4837, "step": 4384 }, { "epoch": 0.03, "grad_norm": 4.705694869173882, "learning_rate": 1.9997304482397527e-06, "loss": 1.355, "step": 4385 }, { "epoch": 0.03, "grad_norm": 5.430543308697818, "learning_rate": 1.9997303250075474e-06, "loss": 1.4277, "step": 4386 }, { "epoch": 0.03, "grad_norm": 5.797189869368812, "learning_rate": 1.9997302017471833e-06, "loss": 1.2403, "step": 4387 }, { "epoch": 0.03, "grad_norm": 4.419252181824527, "learning_rate": 1.99973007845866e-06, "loss": 1.4673, "step": 4388 }, { "epoch": 0.03, "grad_norm": 4.4663145807563485, "learning_rate": 1.9997299551419776e-06, "loss": 1.321, "step": 4389 }, { "epoch": 0.03, "grad_norm": 6.697570138748406, "learning_rate": 1.999729831797136e-06, "loss": 1.3475, "step": 4390 }, { "epoch": 0.03, "grad_norm": 4.844516229638093, "learning_rate": 1.9997297084241355e-06, "loss": 1.2889, "step": 4391 }, { "epoch": 0.03, "grad_norm": 4.525310176237878, "learning_rate": 1.999729585022976e-06, "loss": 1.3671, "step": 4392 }, { "epoch": 0.03, "grad_norm": 4.695283514694347, "learning_rate": 1.999729461593657e-06, "loss": 1.4108, "step": 4393 }, { "epoch": 0.03, "grad_norm": 5.459939541492222, "learning_rate": 1.999729338136179e-06, "loss": 1.3325, "step": 4394 }, { "epoch": 0.03, "grad_norm": 4.543078722340205, "learning_rate": 1.9997292146505425e-06, "loss": 1.4104, "step": 4395 }, { "epoch": 0.03, "grad_norm": 4.33750759803959, "learning_rate": 1.999729091136747e-06, "loss": 1.3292, "step": 4396 }, { "epoch": 0.03, "grad_norm": 4.686653775726854, "learning_rate": 1.999728967594792e-06, "loss": 1.2876, "step": 4397 }, { "epoch": 0.03, "grad_norm": 4.76445340282035, "learning_rate": 1.999728844024678e-06, "loss": 1.5677, "step": 4398 }, { "epoch": 0.03, "grad_norm": 4.713707274998664, "learning_rate": 1.999728720426405e-06, "loss": 1.4404, "step": 4399 }, { "epoch": 0.03, "grad_norm": 4.572369200302076, "learning_rate": 1.999728596799973e-06, "loss": 1.4441, "step": 4400 }, { "epoch": 0.03, "grad_norm": 5.234092253811171, "learning_rate": 1.999728473145382e-06, "loss": 1.4818, "step": 4401 }, { "epoch": 0.03, "grad_norm": 4.382222652549184, "learning_rate": 1.999728349462632e-06, "loss": 1.3823, "step": 4402 }, { "epoch": 0.03, "grad_norm": 5.218666768954624, "learning_rate": 1.999728225751723e-06, "loss": 1.4093, "step": 4403 }, { "epoch": 0.03, "grad_norm": 4.449381238241505, "learning_rate": 1.999728102012655e-06, "loss": 1.3458, "step": 4404 }, { "epoch": 0.03, "grad_norm": 4.296877308334789, "learning_rate": 1.9997279782454275e-06, "loss": 1.3655, "step": 4405 }, { "epoch": 0.03, "grad_norm": 4.502513087285789, "learning_rate": 1.999727854450042e-06, "loss": 1.3795, "step": 4406 }, { "epoch": 0.03, "grad_norm": 4.694823343260584, "learning_rate": 1.9997277306264966e-06, "loss": 1.4231, "step": 4407 }, { "epoch": 0.03, "grad_norm": 4.496740797821412, "learning_rate": 1.9997276067747925e-06, "loss": 1.3807, "step": 4408 }, { "epoch": 0.03, "grad_norm": 4.388211373869601, "learning_rate": 1.9997274828949293e-06, "loss": 1.287, "step": 4409 }, { "epoch": 0.03, "grad_norm": 15.802080808216045, "learning_rate": 1.9997273589869073e-06, "loss": 1.6177, "step": 4410 }, { "epoch": 0.03, "grad_norm": 4.580393104332515, "learning_rate": 1.9997272350507257e-06, "loss": 1.4575, "step": 4411 }, { "epoch": 0.03, "grad_norm": 4.548398896686738, "learning_rate": 1.999727111086386e-06, "loss": 1.3715, "step": 4412 }, { "epoch": 0.03, "grad_norm": 4.447134136763138, "learning_rate": 1.9997269870938867e-06, "loss": 1.4868, "step": 4413 }, { "epoch": 0.03, "grad_norm": 4.284477894304026, "learning_rate": 1.9997268630732288e-06, "loss": 1.3475, "step": 4414 }, { "epoch": 0.03, "grad_norm": 5.427883088594327, "learning_rate": 1.9997267390244117e-06, "loss": 1.2808, "step": 4415 }, { "epoch": 0.03, "grad_norm": 6.507785545378877, "learning_rate": 1.999726614947436e-06, "loss": 1.2896, "step": 4416 }, { "epoch": 0.03, "grad_norm": 4.272289274442121, "learning_rate": 1.9997264908423004e-06, "loss": 1.4883, "step": 4417 }, { "epoch": 0.03, "grad_norm": 6.642370216299524, "learning_rate": 1.9997263667090066e-06, "loss": 1.5029, "step": 4418 }, { "epoch": 0.03, "grad_norm": 4.4000411165478095, "learning_rate": 1.9997262425475536e-06, "loss": 1.3926, "step": 4419 }, { "epoch": 0.03, "grad_norm": 5.182425805793034, "learning_rate": 1.999726118357942e-06, "loss": 1.4843, "step": 4420 }, { "epoch": 0.03, "grad_norm": 4.689272578916729, "learning_rate": 1.999725994140171e-06, "loss": 1.3709, "step": 4421 }, { "epoch": 0.03, "grad_norm": 4.825590011336578, "learning_rate": 1.9997258698942413e-06, "loss": 1.4262, "step": 4422 }, { "epoch": 0.03, "grad_norm": 4.420585720416629, "learning_rate": 1.9997257456201524e-06, "loss": 1.4376, "step": 4423 }, { "epoch": 0.03, "grad_norm": 4.795091319130384, "learning_rate": 1.9997256213179048e-06, "loss": 1.2938, "step": 4424 }, { "epoch": 0.03, "grad_norm": 4.749244138858896, "learning_rate": 1.999725496987498e-06, "loss": 1.3949, "step": 4425 }, { "epoch": 0.03, "grad_norm": 4.679858531369746, "learning_rate": 1.9997253726289324e-06, "loss": 1.2739, "step": 4426 }, { "epoch": 0.03, "grad_norm": 4.254338755385825, "learning_rate": 1.9997252482422076e-06, "loss": 1.3465, "step": 4427 }, { "epoch": 0.03, "grad_norm": 4.821681793117109, "learning_rate": 1.9997251238273245e-06, "loss": 1.1908, "step": 4428 }, { "epoch": 0.03, "grad_norm": 4.378356148447813, "learning_rate": 1.999724999384282e-06, "loss": 1.3987, "step": 4429 }, { "epoch": 0.03, "grad_norm": 4.955922279034343, "learning_rate": 1.9997248749130807e-06, "loss": 1.5207, "step": 4430 }, { "epoch": 0.03, "grad_norm": 4.904415469518878, "learning_rate": 1.9997247504137204e-06, "loss": 1.3792, "step": 4431 }, { "epoch": 0.03, "grad_norm": 4.5197144550998365, "learning_rate": 1.999724625886201e-06, "loss": 1.456, "step": 4432 }, { "epoch": 0.03, "grad_norm": 5.7015053155013415, "learning_rate": 1.9997245013305228e-06, "loss": 1.6208, "step": 4433 }, { "epoch": 0.03, "grad_norm": 4.488003056038032, "learning_rate": 1.999724376746686e-06, "loss": 1.5068, "step": 4434 }, { "epoch": 0.03, "grad_norm": 4.631922551895107, "learning_rate": 1.99972425213469e-06, "loss": 1.4745, "step": 4435 }, { "epoch": 0.03, "grad_norm": 4.383029850158886, "learning_rate": 1.999724127494535e-06, "loss": 1.4446, "step": 4436 }, { "epoch": 0.03, "grad_norm": 5.329574506911482, "learning_rate": 1.9997240028262215e-06, "loss": 1.5673, "step": 4437 }, { "epoch": 0.03, "grad_norm": 5.012721073595145, "learning_rate": 1.9997238781297486e-06, "loss": 1.3343, "step": 4438 }, { "epoch": 0.03, "grad_norm": 4.672739029634206, "learning_rate": 1.9997237534051174e-06, "loss": 1.4372, "step": 4439 }, { "epoch": 0.03, "grad_norm": 4.293694914173145, "learning_rate": 1.999723628652327e-06, "loss": 1.3857, "step": 4440 }, { "epoch": 0.03, "grad_norm": 5.077018949037316, "learning_rate": 1.9997235038713774e-06, "loss": 1.2687, "step": 4441 }, { "epoch": 0.03, "grad_norm": 4.5893189481752, "learning_rate": 1.9997233790622695e-06, "loss": 1.455, "step": 4442 }, { "epoch": 0.03, "grad_norm": 5.501984858473863, "learning_rate": 1.9997232542250024e-06, "loss": 1.5641, "step": 4443 }, { "epoch": 0.03, "grad_norm": 4.777323146646321, "learning_rate": 1.999723129359576e-06, "loss": 1.3306, "step": 4444 }, { "epoch": 0.03, "grad_norm": 4.480658597109748, "learning_rate": 1.9997230044659915e-06, "loss": 1.4156, "step": 4445 }, { "epoch": 0.03, "grad_norm": 9.444202516830364, "learning_rate": 1.9997228795442477e-06, "loss": 1.529, "step": 4446 }, { "epoch": 0.03, "grad_norm": 4.805066254401963, "learning_rate": 1.999722754594345e-06, "loss": 1.5046, "step": 4447 }, { "epoch": 0.03, "grad_norm": 4.620019052045173, "learning_rate": 1.9997226296162837e-06, "loss": 1.4518, "step": 4448 }, { "epoch": 0.03, "grad_norm": 4.466445005326779, "learning_rate": 1.999722504610063e-06, "loss": 1.4152, "step": 4449 }, { "epoch": 0.03, "grad_norm": 4.716870548025645, "learning_rate": 1.9997223795756843e-06, "loss": 1.428, "step": 4450 }, { "epoch": 0.03, "grad_norm": 4.8565447985970716, "learning_rate": 1.9997222545131463e-06, "loss": 1.4168, "step": 4451 }, { "epoch": 0.03, "grad_norm": 4.339439244684547, "learning_rate": 1.9997221294224494e-06, "loss": 1.2661, "step": 4452 }, { "epoch": 0.03, "grad_norm": 4.594867048542358, "learning_rate": 1.9997220043035934e-06, "loss": 1.4999, "step": 4453 }, { "epoch": 0.03, "eval_loss": 1.601130723953247, "eval_runtime": 4.5922, "eval_samples_per_second": 1.96, "eval_steps_per_second": 1.089, "step": 4453 }, { "epoch": 0.03, "grad_norm": 4.590495399544321, "learning_rate": 1.999721879156579e-06, "loss": 1.3184, "step": 4454 }, { "epoch": 0.03, "grad_norm": 4.331894974901062, "learning_rate": 1.9997217539814055e-06, "loss": 1.3255, "step": 4455 }, { "epoch": 0.03, "grad_norm": 4.5986540401054405, "learning_rate": 1.9997216287780732e-06, "loss": 1.3841, "step": 4456 }, { "epoch": 0.03, "grad_norm": 4.724779783508861, "learning_rate": 1.999721503546582e-06, "loss": 1.3599, "step": 4457 }, { "epoch": 0.03, "grad_norm": 7.032279947644534, "learning_rate": 1.9997213782869323e-06, "loss": 1.2311, "step": 4458 }, { "epoch": 0.03, "grad_norm": 4.796545802875709, "learning_rate": 1.9997212529991233e-06, "loss": 1.4356, "step": 4459 }, { "epoch": 0.03, "grad_norm": 4.424204244969529, "learning_rate": 1.999721127683156e-06, "loss": 1.393, "step": 4460 }, { "epoch": 0.03, "grad_norm": 4.415965298687206, "learning_rate": 1.9997210023390294e-06, "loss": 1.0531, "step": 4461 }, { "epoch": 0.03, "grad_norm": 4.715558771558784, "learning_rate": 1.9997208769667446e-06, "loss": 1.3135, "step": 4462 }, { "epoch": 0.03, "grad_norm": 4.971668236449606, "learning_rate": 1.9997207515663005e-06, "loss": 1.4641, "step": 4463 }, { "epoch": 0.03, "grad_norm": 4.492266778925344, "learning_rate": 1.9997206261376977e-06, "loss": 1.4123, "step": 4464 }, { "epoch": 0.03, "grad_norm": 4.769488132959541, "learning_rate": 1.999720500680936e-06, "loss": 1.4587, "step": 4465 }, { "epoch": 0.03, "grad_norm": 4.560061197211145, "learning_rate": 1.9997203751960157e-06, "loss": 1.3184, "step": 4466 }, { "epoch": 0.03, "grad_norm": 4.374203083955434, "learning_rate": 1.9997202496829366e-06, "loss": 1.4043, "step": 4467 }, { "epoch": 0.03, "grad_norm": 5.550601548278648, "learning_rate": 1.9997201241416987e-06, "loss": 1.4825, "step": 4468 }, { "epoch": 0.03, "grad_norm": 4.497413688530989, "learning_rate": 1.9997199985723016e-06, "loss": 1.326, "step": 4469 }, { "epoch": 0.03, "grad_norm": 4.706442672451927, "learning_rate": 1.999719872974746e-06, "loss": 1.2711, "step": 4470 }, { "epoch": 0.03, "grad_norm": 5.948700514400472, "learning_rate": 1.9997197473490316e-06, "loss": 1.4653, "step": 4471 }, { "epoch": 0.03, "grad_norm": 4.712511277605898, "learning_rate": 1.9997196216951587e-06, "loss": 1.4442, "step": 4472 }, { "epoch": 0.03, "grad_norm": 4.918823003644156, "learning_rate": 1.9997194960131266e-06, "loss": 1.3915, "step": 4473 }, { "epoch": 0.03, "grad_norm": 6.2131250071788955, "learning_rate": 1.999719370302936e-06, "loss": 1.2476, "step": 4474 }, { "epoch": 0.03, "grad_norm": 4.8723754573058216, "learning_rate": 1.9997192445645864e-06, "loss": 1.5397, "step": 4475 }, { "epoch": 0.03, "grad_norm": 4.273111893867883, "learning_rate": 1.999719118798078e-06, "loss": 1.3882, "step": 4476 }, { "epoch": 0.03, "grad_norm": 4.544424367465914, "learning_rate": 1.999718993003411e-06, "loss": 1.3518, "step": 4477 }, { "epoch": 0.03, "grad_norm": 4.538748637075479, "learning_rate": 1.9997188671805858e-06, "loss": 1.474, "step": 4478 }, { "epoch": 0.03, "grad_norm": 4.953487250641325, "learning_rate": 1.9997187413296006e-06, "loss": 1.3892, "step": 4479 }, { "epoch": 0.03, "grad_norm": 4.54701858957239, "learning_rate": 1.9997186154504576e-06, "loss": 1.3837, "step": 4480 }, { "epoch": 0.03, "grad_norm": 4.456442398174285, "learning_rate": 1.9997184895431558e-06, "loss": 1.4326, "step": 4481 }, { "epoch": 0.03, "grad_norm": 4.457892792448363, "learning_rate": 1.9997183636076952e-06, "loss": 1.3531, "step": 4482 }, { "epoch": 0.03, "grad_norm": 5.572716988547392, "learning_rate": 1.9997182376440755e-06, "loss": 1.302, "step": 4483 }, { "epoch": 0.03, "grad_norm": 5.416673989481979, "learning_rate": 1.9997181116522974e-06, "loss": 1.4328, "step": 4484 }, { "epoch": 0.03, "grad_norm": 4.561292347302119, "learning_rate": 1.99971798563236e-06, "loss": 1.3784, "step": 4485 }, { "epoch": 0.03, "grad_norm": 5.595348553035903, "learning_rate": 1.9997178595842645e-06, "loss": 1.6083, "step": 4486 }, { "epoch": 0.03, "grad_norm": 4.842063691959445, "learning_rate": 1.99971773350801e-06, "loss": 1.4861, "step": 4487 }, { "epoch": 0.03, "grad_norm": 4.466370506355964, "learning_rate": 1.999717607403597e-06, "loss": 1.4296, "step": 4488 }, { "epoch": 0.03, "grad_norm": 4.6561250376530054, "learning_rate": 1.999717481271025e-06, "loss": 1.438, "step": 4489 }, { "epoch": 0.03, "grad_norm": 4.239150476251425, "learning_rate": 1.9997173551102943e-06, "loss": 1.314, "step": 4490 }, { "epoch": 0.03, "grad_norm": 4.440286225120971, "learning_rate": 1.9997172289214053e-06, "loss": 1.3698, "step": 4491 }, { "epoch": 0.03, "grad_norm": 4.585308139174541, "learning_rate": 1.9997171027043575e-06, "loss": 1.4583, "step": 4492 }, { "epoch": 0.03, "grad_norm": 4.612015956586811, "learning_rate": 1.9997169764591506e-06, "loss": 1.3708, "step": 4493 }, { "epoch": 0.03, "grad_norm": 4.879945529633844, "learning_rate": 1.9997168501857853e-06, "loss": 1.2438, "step": 4494 }, { "epoch": 0.03, "grad_norm": 6.40992026759723, "learning_rate": 1.9997167238842608e-06, "loss": 1.3945, "step": 4495 }, { "epoch": 0.03, "grad_norm": 4.3202306557060615, "learning_rate": 1.9997165975545784e-06, "loss": 1.4014, "step": 4496 }, { "epoch": 0.03, "grad_norm": 5.301677251903743, "learning_rate": 1.9997164711967368e-06, "loss": 1.549, "step": 4497 }, { "epoch": 0.03, "grad_norm": 7.730003067885032, "learning_rate": 1.9997163448107364e-06, "loss": 1.4268, "step": 4498 }, { "epoch": 0.03, "grad_norm": 4.681910470754163, "learning_rate": 1.9997162183965777e-06, "loss": 1.4928, "step": 4499 }, { "epoch": 0.03, "grad_norm": 5.461643071470781, "learning_rate": 1.99971609195426e-06, "loss": 1.4687, "step": 4500 }, { "epoch": 0.03, "grad_norm": 5.080476202468499, "learning_rate": 1.999715965483784e-06, "loss": 1.5461, "step": 4501 }, { "epoch": 0.03, "grad_norm": 4.304004105815959, "learning_rate": 1.999715838985149e-06, "loss": 1.3281, "step": 4502 }, { "epoch": 0.03, "grad_norm": 4.7747733170699656, "learning_rate": 1.9997157124583553e-06, "loss": 1.3785, "step": 4503 }, { "epoch": 0.03, "grad_norm": 5.357918768634274, "learning_rate": 1.999715585903403e-06, "loss": 1.374, "step": 4504 }, { "epoch": 0.03, "grad_norm": 4.581418869283627, "learning_rate": 1.9997154593202923e-06, "loss": 1.599, "step": 4505 }, { "epoch": 0.03, "grad_norm": 5.016971298500958, "learning_rate": 1.9997153327090227e-06, "loss": 1.3056, "step": 4506 }, { "epoch": 0.03, "grad_norm": 5.655844522236134, "learning_rate": 1.9997152060695943e-06, "loss": 1.4539, "step": 4507 }, { "epoch": 0.03, "grad_norm": 4.510115133968695, "learning_rate": 1.9997150794020076e-06, "loss": 1.4212, "step": 4508 }, { "epoch": 0.03, "grad_norm": 5.661307096079274, "learning_rate": 1.999714952706262e-06, "loss": 1.4814, "step": 4509 }, { "epoch": 0.03, "grad_norm": 4.649188647108606, "learning_rate": 1.999714825982358e-06, "loss": 1.3811, "step": 4510 }, { "epoch": 0.03, "grad_norm": 4.274866569739158, "learning_rate": 1.9997146992302953e-06, "loss": 1.2209, "step": 4511 }, { "epoch": 0.03, "grad_norm": 5.7613170829183185, "learning_rate": 1.9997145724500735e-06, "loss": 1.4285, "step": 4512 }, { "epoch": 0.03, "grad_norm": 4.643219560833804, "learning_rate": 1.999714445641694e-06, "loss": 1.4686, "step": 4513 }, { "epoch": 0.03, "grad_norm": 5.944940595282874, "learning_rate": 1.999714318805155e-06, "loss": 1.3333, "step": 4514 }, { "epoch": 0.03, "grad_norm": 4.258582449241286, "learning_rate": 1.9997141919404577e-06, "loss": 1.1356, "step": 4515 }, { "epoch": 0.03, "grad_norm": 4.431739519167114, "learning_rate": 1.9997140650476017e-06, "loss": 1.3535, "step": 4516 }, { "epoch": 0.03, "grad_norm": 4.87147641809397, "learning_rate": 1.999713938126587e-06, "loss": 1.2698, "step": 4517 }, { "epoch": 0.03, "grad_norm": 4.616387510338015, "learning_rate": 1.999713811177414e-06, "loss": 1.2143, "step": 4518 }, { "epoch": 0.03, "grad_norm": 5.559952303453012, "learning_rate": 1.999713684200082e-06, "loss": 1.3221, "step": 4519 }, { "epoch": 0.03, "grad_norm": 5.097624020023589, "learning_rate": 1.9997135571945915e-06, "loss": 1.5807, "step": 4520 }, { "epoch": 0.03, "grad_norm": 4.674184744353376, "learning_rate": 1.9997134301609425e-06, "loss": 1.4574, "step": 4521 }, { "epoch": 0.03, "grad_norm": 4.870754663031117, "learning_rate": 1.999713303099135e-06, "loss": 1.4814, "step": 4522 }, { "epoch": 0.03, "grad_norm": 4.92236017704258, "learning_rate": 1.9997131760091687e-06, "loss": 1.2875, "step": 4523 }, { "epoch": 0.03, "grad_norm": 4.807047148537356, "learning_rate": 1.999713048891044e-06, "loss": 1.4459, "step": 4524 }, { "epoch": 0.03, "grad_norm": 6.393873546044099, "learning_rate": 1.9997129217447608e-06, "loss": 1.4268, "step": 4525 }, { "epoch": 0.03, "grad_norm": 4.610987080112875, "learning_rate": 1.9997127945703184e-06, "loss": 1.4361, "step": 4526 }, { "epoch": 0.03, "eval_loss": 1.5975711345672607, "eval_runtime": 4.6184, "eval_samples_per_second": 1.949, "eval_steps_per_second": 1.083, "step": 4526 }, { "epoch": 0.03, "grad_norm": 4.513663921139003, "learning_rate": 1.999712667367718e-06, "loss": 1.2942, "step": 4527 }, { "epoch": 0.03, "grad_norm": 4.649464666918121, "learning_rate": 1.9997125401369587e-06, "loss": 1.1709, "step": 4528 }, { "epoch": 0.03, "grad_norm": 5.91830422537713, "learning_rate": 1.9997124128780414e-06, "loss": 1.2521, "step": 4529 }, { "epoch": 0.03, "grad_norm": 4.598842420787161, "learning_rate": 1.999712285590965e-06, "loss": 1.3428, "step": 4530 }, { "epoch": 0.03, "grad_norm": 4.534735549347137, "learning_rate": 1.99971215827573e-06, "loss": 1.3675, "step": 4531 }, { "epoch": 0.03, "grad_norm": 4.911355493419071, "learning_rate": 1.9997120309323363e-06, "loss": 1.293, "step": 4532 }, { "epoch": 0.03, "grad_norm": 4.4339945794122935, "learning_rate": 1.9997119035607843e-06, "loss": 1.3888, "step": 4533 }, { "epoch": 0.03, "grad_norm": 5.812519604166528, "learning_rate": 1.999711776161074e-06, "loss": 1.5327, "step": 4534 }, { "epoch": 0.03, "grad_norm": 4.490156850154689, "learning_rate": 1.999711648733205e-06, "loss": 1.4336, "step": 4535 }, { "epoch": 0.03, "grad_norm": 4.650835713904079, "learning_rate": 1.999711521277177e-06, "loss": 1.3437, "step": 4536 }, { "epoch": 0.03, "grad_norm": 4.596808571066155, "learning_rate": 1.9997113937929912e-06, "loss": 1.3622, "step": 4537 }, { "epoch": 0.03, "grad_norm": 4.420665950779566, "learning_rate": 1.9997112662806463e-06, "loss": 1.4369, "step": 4538 }, { "epoch": 0.03, "grad_norm": 5.369270149300852, "learning_rate": 1.999711138740143e-06, "loss": 1.3291, "step": 4539 }, { "epoch": 0.03, "grad_norm": 4.47379643319412, "learning_rate": 1.999711011171481e-06, "loss": 1.3611, "step": 4540 }, { "epoch": 0.03, "grad_norm": 5.014648208059479, "learning_rate": 1.9997108835746605e-06, "loss": 1.2846, "step": 4541 }, { "epoch": 0.03, "grad_norm": 4.633919578829302, "learning_rate": 1.9997107559496817e-06, "loss": 1.4365, "step": 4542 }, { "epoch": 0.03, "grad_norm": 4.608067912239075, "learning_rate": 1.9997106282965442e-06, "loss": 1.4405, "step": 4543 }, { "epoch": 0.03, "grad_norm": 4.72868483490262, "learning_rate": 1.999710500615248e-06, "loss": 1.4297, "step": 4544 }, { "epoch": 0.03, "grad_norm": 10.366763694762389, "learning_rate": 1.9997103729057938e-06, "loss": 1.4042, "step": 4545 }, { "epoch": 0.03, "grad_norm": 4.7423793342135365, "learning_rate": 1.999710245168181e-06, "loss": 1.2583, "step": 4546 }, { "epoch": 0.03, "grad_norm": 4.721059095250116, "learning_rate": 1.9997101174024095e-06, "loss": 1.4249, "step": 4547 }, { "epoch": 0.03, "grad_norm": 4.58455394623487, "learning_rate": 1.999709989608479e-06, "loss": 1.3442, "step": 4548 }, { "epoch": 0.03, "grad_norm": 4.437601341708891, "learning_rate": 1.9997098617863907e-06, "loss": 1.371, "step": 4549 }, { "epoch": 0.03, "grad_norm": 4.498319968364425, "learning_rate": 1.999709733936144e-06, "loss": 1.2153, "step": 4550 }, { "epoch": 0.03, "grad_norm": 4.976580398850894, "learning_rate": 1.9997096060577384e-06, "loss": 1.655, "step": 4551 }, { "epoch": 0.03, "grad_norm": 4.793535828863053, "learning_rate": 1.999709478151174e-06, "loss": 1.3682, "step": 4552 }, { "epoch": 0.03, "grad_norm": 5.086751255098221, "learning_rate": 1.9997093502164516e-06, "loss": 1.5098, "step": 4553 }, { "epoch": 0.03, "grad_norm": 4.682415618625111, "learning_rate": 1.9997092222535706e-06, "loss": 1.4527, "step": 4554 }, { "epoch": 0.03, "grad_norm": 4.884728575829793, "learning_rate": 1.9997090942625314e-06, "loss": 1.4643, "step": 4555 }, { "epoch": 0.03, "grad_norm": 5.550770055177438, "learning_rate": 1.9997089662433333e-06, "loss": 1.4318, "step": 4556 }, { "epoch": 0.03, "grad_norm": 4.5040716686793, "learning_rate": 1.999708838195977e-06, "loss": 1.3393, "step": 4557 }, { "epoch": 0.03, "grad_norm": 4.688281068338207, "learning_rate": 1.9997087101204618e-06, "loss": 1.287, "step": 4558 }, { "epoch": 0.03, "grad_norm": 5.36666364093149, "learning_rate": 1.9997085820167883e-06, "loss": 1.5608, "step": 4559 }, { "epoch": 0.03, "grad_norm": 5.236654223247516, "learning_rate": 1.9997084538849565e-06, "loss": 1.497, "step": 4560 }, { "epoch": 0.03, "grad_norm": 4.5201933838098105, "learning_rate": 1.9997083257249663e-06, "loss": 1.3819, "step": 4561 }, { "epoch": 0.03, "grad_norm": 4.50473770291589, "learning_rate": 1.9997081975368174e-06, "loss": 1.3745, "step": 4562 }, { "epoch": 0.03, "grad_norm": 4.726017984082692, "learning_rate": 1.9997080693205105e-06, "loss": 1.4112, "step": 4563 }, { "epoch": 0.03, "grad_norm": 4.597953826538774, "learning_rate": 1.9997079410760445e-06, "loss": 1.4726, "step": 4564 }, { "epoch": 0.03, "grad_norm": 5.12118254749384, "learning_rate": 1.9997078128034206e-06, "loss": 1.3728, "step": 4565 }, { "epoch": 0.03, "grad_norm": 5.687614687342618, "learning_rate": 1.999707684502638e-06, "loss": 1.1816, "step": 4566 }, { "epoch": 0.03, "grad_norm": 7.507115685316445, "learning_rate": 1.9997075561736972e-06, "loss": 1.6171, "step": 4567 }, { "epoch": 0.03, "grad_norm": 4.5882821347616325, "learning_rate": 1.9997074278165974e-06, "loss": 1.3052, "step": 4568 }, { "epoch": 0.03, "grad_norm": 9.93158968795624, "learning_rate": 1.9997072994313397e-06, "loss": 1.3165, "step": 4569 }, { "epoch": 0.03, "grad_norm": 6.923706913786316, "learning_rate": 1.9997071710179236e-06, "loss": 1.3844, "step": 4570 }, { "epoch": 0.03, "grad_norm": 4.859733857181811, "learning_rate": 1.9997070425763488e-06, "loss": 1.324, "step": 4571 }, { "epoch": 0.03, "grad_norm": 5.11347591852976, "learning_rate": 1.9997069141066156e-06, "loss": 1.4837, "step": 4572 }, { "epoch": 0.03, "grad_norm": 4.38038760249524, "learning_rate": 1.999706785608724e-06, "loss": 1.3372, "step": 4573 }, { "epoch": 0.03, "grad_norm": 4.835893094544579, "learning_rate": 1.9997066570826742e-06, "loss": 1.5453, "step": 4574 }, { "epoch": 0.03, "grad_norm": 4.513435220975756, "learning_rate": 1.9997065285284656e-06, "loss": 1.3463, "step": 4575 }, { "epoch": 0.03, "grad_norm": 4.7051714557491175, "learning_rate": 1.9997063999460987e-06, "loss": 1.1787, "step": 4576 }, { "epoch": 0.03, "grad_norm": 5.424760354583208, "learning_rate": 1.9997062713355738e-06, "loss": 1.2403, "step": 4577 }, { "epoch": 0.03, "grad_norm": 4.525838949191244, "learning_rate": 1.99970614269689e-06, "loss": 1.2993, "step": 4578 }, { "epoch": 0.03, "grad_norm": 6.088760584524558, "learning_rate": 1.9997060140300478e-06, "loss": 1.5282, "step": 4579 }, { "epoch": 0.03, "grad_norm": 4.693182784707592, "learning_rate": 1.9997058853350474e-06, "loss": 1.3566, "step": 4580 }, { "epoch": 0.03, "grad_norm": 4.75489088748738, "learning_rate": 1.9997057566118888e-06, "loss": 1.4, "step": 4581 }, { "epoch": 0.03, "grad_norm": 4.829263152751833, "learning_rate": 1.9997056278605714e-06, "loss": 1.457, "step": 4582 }, { "epoch": 0.03, "grad_norm": 4.6568429200869135, "learning_rate": 1.999705499081096e-06, "loss": 1.2583, "step": 4583 }, { "epoch": 0.03, "grad_norm": 5.515636940175523, "learning_rate": 1.999705370273462e-06, "loss": 1.2338, "step": 4584 }, { "epoch": 0.03, "grad_norm": 5.035655859865824, "learning_rate": 1.99970524143767e-06, "loss": 1.4824, "step": 4585 }, { "epoch": 0.03, "grad_norm": 4.41985533368628, "learning_rate": 1.999705112573719e-06, "loss": 1.2937, "step": 4586 }, { "epoch": 0.03, "grad_norm": 4.4907936182052355, "learning_rate": 1.99970498368161e-06, "loss": 1.3923, "step": 4587 }, { "epoch": 0.03, "grad_norm": 4.588920654596639, "learning_rate": 1.999704854761343e-06, "loss": 1.4349, "step": 4588 }, { "epoch": 0.03, "grad_norm": 4.283405248023199, "learning_rate": 1.9997047258129168e-06, "loss": 1.3651, "step": 4589 }, { "epoch": 0.03, "grad_norm": 4.934504469147916, "learning_rate": 1.999704596836333e-06, "loss": 1.4372, "step": 4590 }, { "epoch": 0.03, "grad_norm": 5.212642290989719, "learning_rate": 1.99970446783159e-06, "loss": 1.3618, "step": 4591 }, { "epoch": 0.03, "grad_norm": 5.016121208695871, "learning_rate": 1.9997043387986893e-06, "loss": 1.4286, "step": 4592 }, { "epoch": 0.03, "grad_norm": 5.032572043558688, "learning_rate": 1.99970420973763e-06, "loss": 1.4306, "step": 4593 }, { "epoch": 0.03, "grad_norm": 5.410382985918965, "learning_rate": 1.9997040806484127e-06, "loss": 1.5223, "step": 4594 }, { "epoch": 0.03, "grad_norm": 4.630820509469735, "learning_rate": 1.999703951531037e-06, "loss": 1.3602, "step": 4595 }, { "epoch": 0.03, "grad_norm": 4.697887388318839, "learning_rate": 1.9997038223855027e-06, "loss": 1.3183, "step": 4596 }, { "epoch": 0.03, "grad_norm": 4.840386034206172, "learning_rate": 1.99970369321181e-06, "loss": 1.281, "step": 4597 }, { "epoch": 0.03, "grad_norm": 4.662134501365582, "learning_rate": 1.9997035640099593e-06, "loss": 1.4583, "step": 4598 }, { "epoch": 0.03, "grad_norm": 4.668835874845229, "learning_rate": 1.99970343477995e-06, "loss": 1.3553, "step": 4599 }, { "epoch": 0.03, "eval_loss": 1.5995750427246094, "eval_runtime": 4.6087, "eval_samples_per_second": 1.953, "eval_steps_per_second": 1.085, "step": 4599 }, { "epoch": 0.03, "grad_norm": 4.607251133930972, "learning_rate": 1.9997033055217826e-06, "loss": 1.1421, "step": 4600 }, { "epoch": 0.03, "grad_norm": 4.911545096497189, "learning_rate": 1.999703176235457e-06, "loss": 1.4066, "step": 4601 }, { "epoch": 0.03, "grad_norm": 4.486251688436454, "learning_rate": 1.999703046920973e-06, "loss": 1.2592, "step": 4602 }, { "epoch": 0.03, "grad_norm": 6.064586228726028, "learning_rate": 1.9997029175783304e-06, "loss": 1.3601, "step": 4603 }, { "epoch": 0.03, "grad_norm": 4.747931550259789, "learning_rate": 1.9997027882075295e-06, "loss": 1.4612, "step": 4604 }, { "epoch": 0.03, "grad_norm": 4.304108613620174, "learning_rate": 1.9997026588085702e-06, "loss": 1.262, "step": 4605 }, { "epoch": 0.03, "grad_norm": 6.1485159718855975, "learning_rate": 1.9997025293814535e-06, "loss": 1.2186, "step": 4606 }, { "epoch": 0.03, "grad_norm": 4.527460239300987, "learning_rate": 1.9997023999261776e-06, "loss": 1.297, "step": 4607 }, { "epoch": 0.03, "grad_norm": 4.677223756893461, "learning_rate": 1.9997022704427438e-06, "loss": 1.4472, "step": 4608 }, { "epoch": 0.03, "grad_norm": 4.438673665907044, "learning_rate": 1.9997021409311516e-06, "loss": 1.4524, "step": 4609 }, { "epoch": 0.03, "grad_norm": 4.867993298746743, "learning_rate": 1.999702011391401e-06, "loss": 1.4453, "step": 4610 }, { "epoch": 0.03, "grad_norm": 4.439838889473788, "learning_rate": 1.9997018818234922e-06, "loss": 1.3608, "step": 4611 }, { "epoch": 0.03, "grad_norm": 4.73645165853954, "learning_rate": 1.999701752227425e-06, "loss": 1.3776, "step": 4612 }, { "epoch": 0.03, "grad_norm": 4.216976641700966, "learning_rate": 1.9997016226032e-06, "loss": 1.3617, "step": 4613 }, { "epoch": 0.03, "grad_norm": 5.145123326915574, "learning_rate": 1.999701492950816e-06, "loss": 1.4343, "step": 4614 }, { "epoch": 0.03, "grad_norm": 5.68640188999334, "learning_rate": 1.9997013632702743e-06, "loss": 1.4184, "step": 4615 }, { "epoch": 0.03, "grad_norm": 4.783000145392253, "learning_rate": 1.9997012335615738e-06, "loss": 1.3918, "step": 4616 }, { "epoch": 0.03, "grad_norm": 4.454709689095027, "learning_rate": 1.9997011038247157e-06, "loss": 1.4723, "step": 4617 }, { "epoch": 0.03, "grad_norm": 4.913421443143604, "learning_rate": 1.999700974059699e-06, "loss": 1.4252, "step": 4618 }, { "epoch": 0.03, "grad_norm": 4.294355861160439, "learning_rate": 1.999700844266524e-06, "loss": 1.3352, "step": 4619 }, { "epoch": 0.03, "grad_norm": 4.593349518768013, "learning_rate": 1.999700714445191e-06, "loss": 1.3322, "step": 4620 }, { "epoch": 0.03, "grad_norm": 4.467025036004331, "learning_rate": 1.9997005845956997e-06, "loss": 1.3836, "step": 4621 }, { "epoch": 0.03, "grad_norm": 5.831810745895809, "learning_rate": 1.99970045471805e-06, "loss": 1.3067, "step": 4622 }, { "epoch": 0.03, "grad_norm": 4.765058814088863, "learning_rate": 1.9997003248122423e-06, "loss": 1.528, "step": 4623 }, { "epoch": 0.03, "grad_norm": 4.291059587728319, "learning_rate": 1.999700194878276e-06, "loss": 1.4166, "step": 4624 }, { "epoch": 0.03, "grad_norm": 5.3614253477745395, "learning_rate": 1.9997000649161516e-06, "loss": 1.3495, "step": 4625 }, { "epoch": 0.03, "grad_norm": 6.971076849430029, "learning_rate": 1.9996999349258693e-06, "loss": 1.4404, "step": 4626 }, { "epoch": 0.03, "grad_norm": 5.9111967526579585, "learning_rate": 1.9996998049074283e-06, "loss": 1.3588, "step": 4627 }, { "epoch": 0.03, "grad_norm": 4.384797170507827, "learning_rate": 1.9996996748608294e-06, "loss": 1.4556, "step": 4628 }, { "epoch": 0.03, "grad_norm": 4.528174486505923, "learning_rate": 1.999699544786072e-06, "loss": 1.5543, "step": 4629 }, { "epoch": 0.03, "grad_norm": 4.967245318414694, "learning_rate": 1.999699414683157e-06, "loss": 1.2323, "step": 4630 }, { "epoch": 0.03, "grad_norm": 4.6585596074181606, "learning_rate": 1.9996992845520834e-06, "loss": 1.3764, "step": 4631 }, { "epoch": 0.03, "grad_norm": 9.205323888151595, "learning_rate": 1.9996991543928515e-06, "loss": 1.4263, "step": 4632 }, { "epoch": 0.03, "grad_norm": 6.216285341585164, "learning_rate": 1.9996990242054613e-06, "loss": 1.509, "step": 4633 }, { "epoch": 0.03, "grad_norm": 5.413306333339003, "learning_rate": 1.9996988939899132e-06, "loss": 1.4901, "step": 4634 }, { "epoch": 0.03, "grad_norm": 5.252384307076165, "learning_rate": 1.9996987637462068e-06, "loss": 1.3979, "step": 4635 }, { "epoch": 0.03, "grad_norm": 4.578249747436371, "learning_rate": 1.9996986334743424e-06, "loss": 1.3796, "step": 4636 }, { "epoch": 0.03, "grad_norm": 5.545447215381461, "learning_rate": 1.9996985031743192e-06, "loss": 1.5025, "step": 4637 }, { "epoch": 0.03, "grad_norm": 4.795561191465646, "learning_rate": 1.999698372846138e-06, "loss": 1.4356, "step": 4638 }, { "epoch": 0.03, "grad_norm": 4.941565665202962, "learning_rate": 1.999698242489799e-06, "loss": 1.5655, "step": 4639 }, { "epoch": 0.03, "grad_norm": 4.903731710793547, "learning_rate": 1.999698112105302e-06, "loss": 1.3591, "step": 4640 }, { "epoch": 0.03, "grad_norm": 4.405630319029594, "learning_rate": 1.9996979816926462e-06, "loss": 1.3297, "step": 4641 }, { "epoch": 0.03, "grad_norm": 4.86642927252858, "learning_rate": 1.9996978512518327e-06, "loss": 1.3234, "step": 4642 }, { "epoch": 0.03, "grad_norm": 4.380162512649266, "learning_rate": 1.9996977207828608e-06, "loss": 1.4026, "step": 4643 }, { "epoch": 0.03, "grad_norm": 4.372286095963769, "learning_rate": 1.9996975902857305e-06, "loss": 1.4255, "step": 4644 }, { "epoch": 0.03, "grad_norm": 5.0012548376405, "learning_rate": 1.9996974597604428e-06, "loss": 1.4769, "step": 4645 }, { "epoch": 0.03, "grad_norm": 4.412482449289567, "learning_rate": 1.9996973292069967e-06, "loss": 1.4322, "step": 4646 }, { "epoch": 0.03, "grad_norm": 5.1900603978578195, "learning_rate": 1.999697198625392e-06, "loss": 1.3164, "step": 4647 }, { "epoch": 0.03, "grad_norm": 4.630007869843539, "learning_rate": 1.9996970680156295e-06, "loss": 1.4206, "step": 4648 }, { "epoch": 0.03, "grad_norm": 4.262042107480514, "learning_rate": 1.999696937377709e-06, "loss": 1.3071, "step": 4649 }, { "epoch": 0.03, "grad_norm": 4.767505368948058, "learning_rate": 1.99969680671163e-06, "loss": 1.4763, "step": 4650 }, { "epoch": 0.03, "grad_norm": 4.919164294998061, "learning_rate": 1.999696676017393e-06, "loss": 1.4472, "step": 4651 }, { "epoch": 0.03, "grad_norm": 4.718605752392131, "learning_rate": 1.9996965452949976e-06, "loss": 1.4399, "step": 4652 }, { "epoch": 0.03, "grad_norm": 18.337148389492913, "learning_rate": 1.9996964145444444e-06, "loss": 1.3977, "step": 4653 }, { "epoch": 0.03, "grad_norm": 4.847516040947428, "learning_rate": 1.999696283765733e-06, "loss": 1.4986, "step": 4654 }, { "epoch": 0.03, "grad_norm": 4.209970480673567, "learning_rate": 1.9996961529588634e-06, "loss": 1.3949, "step": 4655 }, { "epoch": 0.03, "grad_norm": 4.861989064904023, "learning_rate": 1.9996960221238356e-06, "loss": 1.3597, "step": 4656 }, { "epoch": 0.03, "grad_norm": 6.370415858300467, "learning_rate": 1.99969589126065e-06, "loss": 1.4715, "step": 4657 }, { "epoch": 0.03, "grad_norm": 6.5979683672051905, "learning_rate": 1.9996957603693063e-06, "loss": 1.2932, "step": 4658 }, { "epoch": 0.03, "grad_norm": 4.884492629660198, "learning_rate": 1.9996956294498044e-06, "loss": 1.4524, "step": 4659 }, { "epoch": 0.03, "grad_norm": 4.364265683873123, "learning_rate": 1.9996954985021445e-06, "loss": 1.3084, "step": 4660 }, { "epoch": 0.03, "grad_norm": 4.350524480239627, "learning_rate": 1.9996953675263263e-06, "loss": 1.3307, "step": 4661 }, { "epoch": 0.03, "grad_norm": 4.606305555165247, "learning_rate": 1.99969523652235e-06, "loss": 1.4614, "step": 4662 }, { "epoch": 0.03, "grad_norm": 4.813773039596495, "learning_rate": 1.9996951054902157e-06, "loss": 1.4229, "step": 4663 }, { "epoch": 0.03, "grad_norm": 5.080022495744986, "learning_rate": 1.9996949744299233e-06, "loss": 1.4522, "step": 4664 }, { "epoch": 0.03, "grad_norm": 4.44418023707076, "learning_rate": 1.9996948433414726e-06, "loss": 1.4778, "step": 4665 }, { "epoch": 0.03, "grad_norm": 4.391710584536705, "learning_rate": 1.9996947122248644e-06, "loss": 1.3318, "step": 4666 }, { "epoch": 0.03, "grad_norm": 5.733588857333391, "learning_rate": 1.999694581080098e-06, "loss": 1.5187, "step": 4667 }, { "epoch": 0.03, "grad_norm": 4.297792542617826, "learning_rate": 1.9996944499071725e-06, "loss": 1.3352, "step": 4668 }, { "epoch": 0.03, "grad_norm": 4.6541106783951856, "learning_rate": 1.99969431870609e-06, "loss": 1.5422, "step": 4669 }, { "epoch": 0.03, "grad_norm": 4.762312235712126, "learning_rate": 1.9996941874768493e-06, "loss": 1.4161, "step": 4670 }, { "epoch": 0.03, "grad_norm": 4.8344220522249035, "learning_rate": 1.9996940562194503e-06, "loss": 1.4553, "step": 4671 }, { "epoch": 0.03, "grad_norm": 4.969429267760363, "learning_rate": 1.9996939249338933e-06, "loss": 1.4402, "step": 4672 }, { "epoch": 0.03, "eval_loss": 1.601548671722412, "eval_runtime": 4.6055, "eval_samples_per_second": 1.954, "eval_steps_per_second": 1.086, "step": 4672 }, { "epoch": 0.03, "grad_norm": 4.606387196659041, "learning_rate": 1.999693793620178e-06, "loss": 1.3363, "step": 4673 }, { "epoch": 0.03, "grad_norm": 4.953540662707658, "learning_rate": 1.999693662278305e-06, "loss": 1.4435, "step": 4674 }, { "epoch": 0.03, "grad_norm": 4.379543320528708, "learning_rate": 1.999693530908274e-06, "loss": 1.4231, "step": 4675 }, { "epoch": 0.03, "grad_norm": 4.229681987448923, "learning_rate": 1.999693399510085e-06, "loss": 1.2745, "step": 4676 }, { "epoch": 0.03, "grad_norm": 6.133620721263649, "learning_rate": 1.999693268083738e-06, "loss": 1.3584, "step": 4677 }, { "epoch": 0.03, "grad_norm": 4.949342227321623, "learning_rate": 1.9996931366292326e-06, "loss": 1.4218, "step": 4678 }, { "epoch": 0.03, "grad_norm": 4.542261647931501, "learning_rate": 1.9996930051465693e-06, "loss": 1.3741, "step": 4679 }, { "epoch": 0.03, "grad_norm": 4.248604436360381, "learning_rate": 1.999692873635748e-06, "loss": 1.3355, "step": 4680 }, { "epoch": 0.03, "grad_norm": 4.863418083132129, "learning_rate": 1.9996927420967687e-06, "loss": 1.4312, "step": 4681 }, { "epoch": 0.03, "grad_norm": 4.528406574995225, "learning_rate": 1.9996926105296313e-06, "loss": 1.403, "step": 4682 }, { "epoch": 0.03, "grad_norm": 4.377414560134764, "learning_rate": 1.999692478934336e-06, "loss": 1.3647, "step": 4683 }, { "epoch": 0.03, "grad_norm": 6.271390333678702, "learning_rate": 1.9996923473108827e-06, "loss": 1.4911, "step": 4684 }, { "epoch": 0.03, "grad_norm": 4.560061007218579, "learning_rate": 1.999692215659271e-06, "loss": 1.4399, "step": 4685 }, { "epoch": 0.03, "grad_norm": 4.971520233276469, "learning_rate": 1.999692083979502e-06, "loss": 1.3748, "step": 4686 }, { "epoch": 0.03, "grad_norm": 4.304368932651768, "learning_rate": 1.9996919522715746e-06, "loss": 1.2535, "step": 4687 }, { "epoch": 0.03, "grad_norm": 5.213270394030907, "learning_rate": 1.9996918205354892e-06, "loss": 1.2906, "step": 4688 }, { "epoch": 0.03, "grad_norm": 6.8513744166695885, "learning_rate": 1.999691688771246e-06, "loss": 1.4525, "step": 4689 }, { "epoch": 0.03, "grad_norm": 4.900000707651183, "learning_rate": 1.9996915569788448e-06, "loss": 1.376, "step": 4690 }, { "epoch": 0.03, "grad_norm": 4.30613670773767, "learning_rate": 1.9996914251582853e-06, "loss": 1.2873, "step": 4691 }, { "epoch": 0.03, "grad_norm": 4.627526933202667, "learning_rate": 1.999691293309568e-06, "loss": 1.4344, "step": 4692 }, { "epoch": 0.03, "grad_norm": 5.305292206086056, "learning_rate": 1.9996911614326925e-06, "loss": 1.4648, "step": 4693 }, { "epoch": 0.03, "grad_norm": 4.807815153540651, "learning_rate": 1.9996910295276592e-06, "loss": 1.2935, "step": 4694 }, { "epoch": 0.03, "grad_norm": 3.9896120129378314, "learning_rate": 1.999690897594468e-06, "loss": 1.0908, "step": 4695 }, { "epoch": 0.03, "grad_norm": 4.593265001115877, "learning_rate": 1.999690765633119e-06, "loss": 1.2241, "step": 4696 }, { "epoch": 0.03, "grad_norm": 4.251732526739906, "learning_rate": 1.9996906336436115e-06, "loss": 1.3555, "step": 4697 }, { "epoch": 0.03, "grad_norm": 4.609627019089599, "learning_rate": 1.9996905016259466e-06, "loss": 1.4551, "step": 4698 }, { "epoch": 0.03, "grad_norm": 4.951205870586321, "learning_rate": 1.9996903695801233e-06, "loss": 1.3807, "step": 4699 }, { "epoch": 0.03, "grad_norm": 4.375791564143481, "learning_rate": 1.9996902375061426e-06, "loss": 1.2615, "step": 4700 }, { "epoch": 0.03, "grad_norm": 4.430298894427687, "learning_rate": 1.9996901054040035e-06, "loss": 1.2341, "step": 4701 }, { "epoch": 0.03, "grad_norm": 4.429158212296167, "learning_rate": 1.9996899732737065e-06, "loss": 1.3283, "step": 4702 }, { "epoch": 0.03, "grad_norm": 4.382997833547272, "learning_rate": 1.9996898411152515e-06, "loss": 1.4111, "step": 4703 }, { "epoch": 0.03, "grad_norm": 4.917328027502967, "learning_rate": 1.9996897089286387e-06, "loss": 1.3069, "step": 4704 }, { "epoch": 0.03, "grad_norm": 5.00875376637453, "learning_rate": 1.9996895767138683e-06, "loss": 1.5968, "step": 4705 }, { "epoch": 0.03, "grad_norm": 6.181909390189377, "learning_rate": 1.9996894444709392e-06, "loss": 1.5782, "step": 4706 }, { "epoch": 0.03, "grad_norm": 4.471687107486269, "learning_rate": 1.9996893121998526e-06, "loss": 1.36, "step": 4707 }, { "epoch": 0.03, "grad_norm": 5.5917579094463585, "learning_rate": 1.999689179900608e-06, "loss": 1.432, "step": 4708 }, { "epoch": 0.03, "grad_norm": 4.5682274519264086, "learning_rate": 1.9996890475732057e-06, "loss": 1.433, "step": 4709 }, { "epoch": 0.03, "grad_norm": 6.004183648761494, "learning_rate": 1.9996889152176453e-06, "loss": 1.5683, "step": 4710 }, { "epoch": 0.03, "grad_norm": 4.511109463967205, "learning_rate": 1.999688782833927e-06, "loss": 1.382, "step": 4711 }, { "epoch": 0.03, "grad_norm": 4.868479643513265, "learning_rate": 1.9996886504220513e-06, "loss": 1.4416, "step": 4712 }, { "epoch": 0.03, "grad_norm": 4.609740449703529, "learning_rate": 1.999688517982017e-06, "loss": 1.3855, "step": 4713 }, { "epoch": 0.03, "grad_norm": 4.553985812537693, "learning_rate": 1.999688385513825e-06, "loss": 1.3525, "step": 4714 }, { "epoch": 0.03, "grad_norm": 4.760949903767553, "learning_rate": 1.999688253017475e-06, "loss": 1.5952, "step": 4715 }, { "epoch": 0.03, "grad_norm": 4.672994858056655, "learning_rate": 1.9996881204929674e-06, "loss": 1.3566, "step": 4716 }, { "epoch": 0.03, "grad_norm": 4.701681460476019, "learning_rate": 1.9996879879403017e-06, "loss": 1.3295, "step": 4717 }, { "epoch": 0.03, "grad_norm": 7.296911425926979, "learning_rate": 1.999687855359478e-06, "loss": 1.4726, "step": 4718 }, { "epoch": 0.03, "grad_norm": 4.415217865891265, "learning_rate": 1.999687722750497e-06, "loss": 1.3247, "step": 4719 }, { "epoch": 0.03, "grad_norm": 4.498513746825456, "learning_rate": 1.9996875901133573e-06, "loss": 1.5196, "step": 4720 }, { "epoch": 0.03, "grad_norm": 4.816061997018404, "learning_rate": 1.99968745744806e-06, "loss": 1.4052, "step": 4721 }, { "epoch": 0.03, "grad_norm": 4.525070676281438, "learning_rate": 1.999687324754605e-06, "loss": 1.439, "step": 4722 }, { "epoch": 0.03, "grad_norm": 5.565343142166182, "learning_rate": 1.999687192032992e-06, "loss": 1.448, "step": 4723 }, { "epoch": 0.03, "grad_norm": 4.456049834281472, "learning_rate": 1.9996870592832214e-06, "loss": 1.3753, "step": 4724 }, { "epoch": 0.03, "grad_norm": 4.20335000279415, "learning_rate": 1.9996869265052927e-06, "loss": 1.3295, "step": 4725 }, { "epoch": 0.03, "grad_norm": 4.698667413687671, "learning_rate": 1.999686793699206e-06, "loss": 1.3766, "step": 4726 }, { "epoch": 0.03, "grad_norm": 4.3921977469782325, "learning_rate": 1.9996866608649617e-06, "loss": 1.5162, "step": 4727 }, { "epoch": 0.03, "grad_norm": 4.745666352066779, "learning_rate": 1.9996865280025597e-06, "loss": 1.4701, "step": 4728 }, { "epoch": 0.03, "grad_norm": 4.106162674673154, "learning_rate": 1.9996863951119993e-06, "loss": 1.383, "step": 4729 }, { "epoch": 0.03, "grad_norm": 4.656060970048848, "learning_rate": 1.999686262193282e-06, "loss": 1.4546, "step": 4730 }, { "epoch": 0.03, "grad_norm": 5.01099105852221, "learning_rate": 1.999686129246406e-06, "loss": 1.3374, "step": 4731 }, { "epoch": 0.03, "grad_norm": 4.429333289185999, "learning_rate": 1.9996859962713725e-06, "loss": 1.439, "step": 4732 }, { "epoch": 0.03, "grad_norm": 6.485225261241643, "learning_rate": 1.9996858632681814e-06, "loss": 1.4019, "step": 4733 }, { "epoch": 0.03, "grad_norm": 4.3602506780828865, "learning_rate": 1.999685730236832e-06, "loss": 1.1456, "step": 4734 }, { "epoch": 0.03, "grad_norm": 5.342513380155669, "learning_rate": 1.999685597177325e-06, "loss": 1.4804, "step": 4735 }, { "epoch": 0.03, "grad_norm": 4.7783974147096995, "learning_rate": 1.99968546408966e-06, "loss": 1.5598, "step": 4736 }, { "epoch": 0.03, "grad_norm": 5.013312236200295, "learning_rate": 1.9996853309738376e-06, "loss": 1.542, "step": 4737 }, { "epoch": 0.03, "grad_norm": 5.0781349638807995, "learning_rate": 1.999685197829857e-06, "loss": 1.3445, "step": 4738 }, { "epoch": 0.03, "grad_norm": 4.9425678942915665, "learning_rate": 1.9996850646577187e-06, "loss": 1.5019, "step": 4739 }, { "epoch": 0.03, "grad_norm": 4.398005881112347, "learning_rate": 1.9996849314574225e-06, "loss": 1.2824, "step": 4740 }, { "epoch": 0.03, "grad_norm": 4.8604757029897305, "learning_rate": 1.9996847982289685e-06, "loss": 1.2284, "step": 4741 }, { "epoch": 0.03, "grad_norm": 4.505197013907895, "learning_rate": 1.999684664972357e-06, "loss": 1.4373, "step": 4742 }, { "epoch": 0.03, "grad_norm": 5.979441510612392, "learning_rate": 1.9996845316875875e-06, "loss": 1.4752, "step": 4743 }, { "epoch": 0.03, "grad_norm": 4.639381709951698, "learning_rate": 1.99968439837466e-06, "loss": 1.3809, "step": 4744 }, { "epoch": 0.03, "grad_norm": 4.750717087395945, "learning_rate": 1.999684265033575e-06, "loss": 1.2907, "step": 4745 }, { "epoch": 0.03, "eval_loss": 1.596257209777832, "eval_runtime": 4.6086, "eval_samples_per_second": 1.953, "eval_steps_per_second": 1.085, "step": 4745 }, { "epoch": 0.03, "grad_norm": 4.7195788276555, "learning_rate": 1.9996841316643324e-06, "loss": 1.4295, "step": 4746 }, { "epoch": 0.03, "grad_norm": 4.71821455924299, "learning_rate": 1.9996839982669317e-06, "loss": 1.2671, "step": 4747 }, { "epoch": 0.03, "grad_norm": 4.652792943550019, "learning_rate": 1.9996838648413735e-06, "loss": 1.3964, "step": 4748 }, { "epoch": 0.03, "grad_norm": 4.8450747435353145, "learning_rate": 1.999683731387657e-06, "loss": 1.4581, "step": 4749 }, { "epoch": 0.03, "grad_norm": 5.154429025507441, "learning_rate": 1.9996835979057834e-06, "loss": 1.5225, "step": 4750 }, { "epoch": 0.03, "grad_norm": 4.603822055020466, "learning_rate": 1.999683464395752e-06, "loss": 1.4042, "step": 4751 }, { "epoch": 0.03, "grad_norm": 4.789110043256694, "learning_rate": 1.9996833308575624e-06, "loss": 1.4275, "step": 4752 }, { "epoch": 0.03, "grad_norm": 4.4064687043986215, "learning_rate": 1.999683197291215e-06, "loss": 1.4125, "step": 4753 }, { "epoch": 0.03, "grad_norm": 5.588246284940714, "learning_rate": 1.99968306369671e-06, "loss": 1.3828, "step": 4754 }, { "epoch": 0.03, "grad_norm": 4.811399405200881, "learning_rate": 1.9996829300740474e-06, "loss": 1.2623, "step": 4755 }, { "epoch": 0.03, "grad_norm": 5.38170118422687, "learning_rate": 1.999682796423227e-06, "loss": 1.4902, "step": 4756 }, { "epoch": 0.03, "grad_norm": 4.570451688730901, "learning_rate": 1.999682662744249e-06, "loss": 1.384, "step": 4757 }, { "epoch": 0.03, "grad_norm": 4.466080672695972, "learning_rate": 1.999682529037113e-06, "loss": 1.4375, "step": 4758 }, { "epoch": 0.03, "grad_norm": 4.849748958039163, "learning_rate": 1.9996823953018193e-06, "loss": 1.4123, "step": 4759 }, { "epoch": 0.03, "grad_norm": 6.846775230167279, "learning_rate": 1.999682261538368e-06, "loss": 1.4364, "step": 4760 }, { "epoch": 0.03, "grad_norm": 4.511236655253821, "learning_rate": 1.999682127746759e-06, "loss": 1.4086, "step": 4761 }, { "epoch": 0.03, "grad_norm": 4.643944392023116, "learning_rate": 1.999681993926992e-06, "loss": 1.4248, "step": 4762 }, { "epoch": 0.03, "grad_norm": 5.4573597719852325, "learning_rate": 1.9996818600790675e-06, "loss": 1.3153, "step": 4763 }, { "epoch": 0.03, "grad_norm": 5.749795721977581, "learning_rate": 1.9996817262029856e-06, "loss": 1.4663, "step": 4764 }, { "epoch": 0.03, "grad_norm": 4.778560951406055, "learning_rate": 1.9996815922987454e-06, "loss": 1.4312, "step": 4765 }, { "epoch": 0.03, "grad_norm": 4.829170326939204, "learning_rate": 1.9996814583663476e-06, "loss": 1.36, "step": 4766 }, { "epoch": 0.03, "grad_norm": 4.974777069806951, "learning_rate": 1.9996813244057924e-06, "loss": 1.4325, "step": 4767 }, { "epoch": 0.03, "grad_norm": 4.257673765982547, "learning_rate": 1.9996811904170793e-06, "loss": 1.1051, "step": 4768 }, { "epoch": 0.03, "grad_norm": 4.180303252136803, "learning_rate": 1.9996810564002087e-06, "loss": 1.2257, "step": 4769 }, { "epoch": 0.03, "grad_norm": 4.869923413542321, "learning_rate": 1.99968092235518e-06, "loss": 1.3964, "step": 4770 }, { "epoch": 0.03, "grad_norm": 5.070645261080126, "learning_rate": 1.999680788281994e-06, "loss": 1.3191, "step": 4771 }, { "epoch": 0.03, "grad_norm": 4.730964464304645, "learning_rate": 1.99968065418065e-06, "loss": 1.4307, "step": 4772 }, { "epoch": 0.03, "grad_norm": 4.543524537769403, "learning_rate": 1.9996805200511487e-06, "loss": 1.3392, "step": 4773 }, { "epoch": 0.03, "grad_norm": 5.212331331874303, "learning_rate": 1.9996803858934897e-06, "loss": 1.4318, "step": 4774 }, { "epoch": 0.03, "grad_norm": 4.477650829070674, "learning_rate": 1.999680251707673e-06, "loss": 1.547, "step": 4775 }, { "epoch": 0.03, "grad_norm": 6.288964147613757, "learning_rate": 1.9996801174936985e-06, "loss": 1.5658, "step": 4776 }, { "epoch": 0.03, "grad_norm": 5.12318918498003, "learning_rate": 1.9996799832515663e-06, "loss": 1.5263, "step": 4777 }, { "epoch": 0.03, "grad_norm": 4.596802410999469, "learning_rate": 1.999679848981276e-06, "loss": 1.2374, "step": 4778 }, { "epoch": 0.03, "grad_norm": 4.895346838573517, "learning_rate": 1.999679714682829e-06, "loss": 1.3863, "step": 4779 }, { "epoch": 0.03, "grad_norm": 5.610837380686373, "learning_rate": 1.9996795803562237e-06, "loss": 1.4456, "step": 4780 }, { "epoch": 0.03, "grad_norm": 5.337789893393667, "learning_rate": 1.999679446001461e-06, "loss": 1.3759, "step": 4781 }, { "epoch": 0.03, "grad_norm": 4.128624253113952, "learning_rate": 1.999679311618541e-06, "loss": 1.3489, "step": 4782 }, { "epoch": 0.03, "grad_norm": 4.162980714733354, "learning_rate": 1.9996791772074624e-06, "loss": 1.1858, "step": 4783 }, { "epoch": 0.03, "grad_norm": 4.338991986792946, "learning_rate": 1.999679042768227e-06, "loss": 1.2959, "step": 4784 }, { "epoch": 0.03, "grad_norm": 4.568952126171166, "learning_rate": 1.9996789083008333e-06, "loss": 1.3689, "step": 4785 }, { "epoch": 0.03, "grad_norm": 5.0798099145996405, "learning_rate": 1.9996787738052823e-06, "loss": 1.4099, "step": 4786 }, { "epoch": 0.03, "grad_norm": 4.313352380464308, "learning_rate": 1.999678639281574e-06, "loss": 1.3969, "step": 4787 }, { "epoch": 0.03, "grad_norm": 4.905569186985227, "learning_rate": 1.9996785047297075e-06, "loss": 1.3436, "step": 4788 }, { "epoch": 0.03, "grad_norm": 6.031241780487444, "learning_rate": 1.999678370149684e-06, "loss": 1.3618, "step": 4789 }, { "epoch": 0.03, "grad_norm": 4.608152663529866, "learning_rate": 1.9996782355415027e-06, "loss": 1.5202, "step": 4790 }, { "epoch": 0.03, "grad_norm": 4.319054814070204, "learning_rate": 1.9996781009051634e-06, "loss": 1.3644, "step": 4791 }, { "epoch": 0.03, "grad_norm": 5.121781626619037, "learning_rate": 1.9996779662406666e-06, "loss": 1.388, "step": 4792 }, { "epoch": 0.03, "grad_norm": 4.79587290501606, "learning_rate": 1.9996778315480124e-06, "loss": 1.3988, "step": 4793 }, { "epoch": 0.03, "grad_norm": 6.419223423152675, "learning_rate": 1.9996776968272006e-06, "loss": 1.5686, "step": 4794 }, { "epoch": 0.03, "grad_norm": 4.710323142581367, "learning_rate": 1.999677562078231e-06, "loss": 1.318, "step": 4795 }, { "epoch": 0.03, "grad_norm": 4.940563209123242, "learning_rate": 1.999677427301104e-06, "loss": 1.4184, "step": 4796 }, { "epoch": 0.03, "grad_norm": 4.426845682552496, "learning_rate": 1.9996772924958195e-06, "loss": 1.2661, "step": 4797 }, { "epoch": 0.03, "grad_norm": 6.6385246808204945, "learning_rate": 1.9996771576623774e-06, "loss": 1.3476, "step": 4798 }, { "epoch": 0.03, "grad_norm": 4.452735357952455, "learning_rate": 1.9996770228007773e-06, "loss": 1.532, "step": 4799 }, { "epoch": 0.03, "grad_norm": 4.679168770735215, "learning_rate": 1.99967688791102e-06, "loss": 1.4238, "step": 4800 }, { "epoch": 0.03, "grad_norm": 5.970346252753626, "learning_rate": 1.999676752993105e-06, "loss": 1.35, "step": 4801 }, { "epoch": 0.03, "grad_norm": 4.360145682814877, "learning_rate": 1.9996766180470325e-06, "loss": 1.3502, "step": 4802 }, { "epoch": 0.03, "grad_norm": 4.511759660990795, "learning_rate": 1.9996764830728025e-06, "loss": 1.1785, "step": 4803 }, { "epoch": 0.03, "grad_norm": 5.473689422967052, "learning_rate": 1.999676348070415e-06, "loss": 1.5054, "step": 4804 }, { "epoch": 0.03, "grad_norm": 4.411432054336889, "learning_rate": 1.9996762130398695e-06, "loss": 1.3236, "step": 4805 }, { "epoch": 0.03, "grad_norm": 4.414497998053194, "learning_rate": 1.9996760779811666e-06, "loss": 1.3714, "step": 4806 }, { "epoch": 0.03, "grad_norm": 4.417583589084534, "learning_rate": 1.9996759428943066e-06, "loss": 1.3159, "step": 4807 }, { "epoch": 0.03, "grad_norm": 4.719218968253314, "learning_rate": 1.9996758077792886e-06, "loss": 1.5414, "step": 4808 }, { "epoch": 0.03, "grad_norm": 4.48388286101016, "learning_rate": 1.9996756726361132e-06, "loss": 1.2479, "step": 4809 }, { "epoch": 0.03, "grad_norm": 4.630529733959733, "learning_rate": 1.9996755374647803e-06, "loss": 1.4229, "step": 4810 }, { "epoch": 0.03, "grad_norm": 5.094640904326784, "learning_rate": 1.99967540226529e-06, "loss": 1.502, "step": 4811 }, { "epoch": 0.03, "grad_norm": 4.430130434540125, "learning_rate": 1.999675267037642e-06, "loss": 1.2396, "step": 4812 }, { "epoch": 0.03, "grad_norm": 4.550108243726061, "learning_rate": 1.999675131781836e-06, "loss": 1.4852, "step": 4813 }, { "epoch": 0.03, "grad_norm": 4.560023170631079, "learning_rate": 1.9996749964978733e-06, "loss": 1.5171, "step": 4814 }, { "epoch": 0.03, "grad_norm": 4.557689671277744, "learning_rate": 1.999674861185753e-06, "loss": 1.3303, "step": 4815 }, { "epoch": 0.03, "grad_norm": 4.752841142796988, "learning_rate": 1.9996747258454746e-06, "loss": 1.4356, "step": 4816 }, { "epoch": 0.03, "grad_norm": 4.661692127248317, "learning_rate": 1.999674590477039e-06, "loss": 1.3943, "step": 4817 }, { "epoch": 0.03, "grad_norm": 5.023253648433682, "learning_rate": 1.999674455080446e-06, "loss": 1.4331, "step": 4818 }, { "epoch": 0.03, "eval_loss": 1.5894834995269775, "eval_runtime": 4.6098, "eval_samples_per_second": 1.952, "eval_steps_per_second": 1.085, "step": 4818 }, { "epoch": 0.03, "grad_norm": 4.654423018173803, "learning_rate": 1.999674319655695e-06, "loss": 1.4038, "step": 4819 }, { "epoch": 0.03, "grad_norm": 4.373764424066224, "learning_rate": 1.9996741842027874e-06, "loss": 1.2405, "step": 4820 }, { "epoch": 0.03, "grad_norm": 5.552173321540753, "learning_rate": 1.9996740487217216e-06, "loss": 1.4747, "step": 4821 }, { "epoch": 0.03, "grad_norm": 4.818611243181649, "learning_rate": 1.9996739132124984e-06, "loss": 1.3139, "step": 4822 }, { "epoch": 0.03, "grad_norm": 5.8874585640249935, "learning_rate": 1.999673777675118e-06, "loss": 1.5988, "step": 4823 }, { "epoch": 0.03, "grad_norm": 6.199310548561564, "learning_rate": 1.99967364210958e-06, "loss": 1.437, "step": 4824 }, { "epoch": 0.03, "grad_norm": 6.224496928606354, "learning_rate": 1.9996735065158845e-06, "loss": 1.3206, "step": 4825 }, { "epoch": 0.03, "grad_norm": 5.440512696045429, "learning_rate": 1.9996733708940313e-06, "loss": 1.4273, "step": 4826 }, { "epoch": 0.03, "grad_norm": 5.252760070048289, "learning_rate": 1.9996732352440206e-06, "loss": 1.4664, "step": 4827 }, { "epoch": 0.03, "grad_norm": 4.991272074491538, "learning_rate": 1.9996730995658528e-06, "loss": 1.2293, "step": 4828 }, { "epoch": 0.03, "grad_norm": 4.758565770482391, "learning_rate": 1.9996729638595275e-06, "loss": 1.4735, "step": 4829 }, { "epoch": 0.03, "grad_norm": 5.523771964480376, "learning_rate": 1.9996728281250443e-06, "loss": 1.0222, "step": 4830 }, { "epoch": 0.03, "grad_norm": 4.488104080943716, "learning_rate": 1.999672692362404e-06, "loss": 1.2748, "step": 4831 }, { "epoch": 0.03, "grad_norm": 4.733031314813794, "learning_rate": 1.9996725565716063e-06, "loss": 1.4047, "step": 4832 }, { "epoch": 0.03, "grad_norm": 4.944137159941149, "learning_rate": 1.999672420752651e-06, "loss": 1.4206, "step": 4833 }, { "epoch": 0.03, "grad_norm": 6.329267465362595, "learning_rate": 1.999672284905538e-06, "loss": 1.3196, "step": 4834 }, { "epoch": 0.03, "grad_norm": 4.989332004973218, "learning_rate": 1.999672149030268e-06, "loss": 1.3999, "step": 4835 }, { "epoch": 0.03, "grad_norm": 4.278843024541689, "learning_rate": 1.9996720131268403e-06, "loss": 1.3334, "step": 4836 }, { "epoch": 0.03, "grad_norm": 5.480469479836834, "learning_rate": 1.9996718771952555e-06, "loss": 1.1818, "step": 4837 }, { "epoch": 0.03, "grad_norm": 4.464540350452838, "learning_rate": 1.9996717412355133e-06, "loss": 1.4719, "step": 4838 }, { "epoch": 0.03, "grad_norm": 4.9610274031254775, "learning_rate": 1.999671605247613e-06, "loss": 1.3896, "step": 4839 }, { "epoch": 0.03, "grad_norm": 4.43642237136088, "learning_rate": 1.9996714692315558e-06, "loss": 1.3923, "step": 4840 }, { "epoch": 0.03, "grad_norm": 4.81001477009308, "learning_rate": 1.999671333187341e-06, "loss": 1.3506, "step": 4841 }, { "epoch": 0.03, "grad_norm": 5.2087150211997235, "learning_rate": 1.9996711971149687e-06, "loss": 1.354, "step": 4842 }, { "epoch": 0.03, "grad_norm": 8.827492905270116, "learning_rate": 1.9996710610144394e-06, "loss": 1.4963, "step": 4843 }, { "epoch": 0.03, "grad_norm": 4.913344302530046, "learning_rate": 1.9996709248857526e-06, "loss": 1.4538, "step": 4844 }, { "epoch": 0.03, "grad_norm": 4.74753443940213, "learning_rate": 1.999670788728908e-06, "loss": 1.4023, "step": 4845 }, { "epoch": 0.03, "grad_norm": 4.697553782555514, "learning_rate": 1.999670652543906e-06, "loss": 1.3918, "step": 4846 }, { "epoch": 0.03, "grad_norm": 4.965595902050283, "learning_rate": 1.999670516330747e-06, "loss": 1.4924, "step": 4847 }, { "epoch": 0.03, "grad_norm": 4.783845146781693, "learning_rate": 1.9996703800894304e-06, "loss": 1.3205, "step": 4848 }, { "epoch": 0.03, "grad_norm": 4.460876956097227, "learning_rate": 1.9996702438199565e-06, "loss": 1.3555, "step": 4849 }, { "epoch": 0.03, "grad_norm": 4.676499569244951, "learning_rate": 1.999670107522325e-06, "loss": 1.4648, "step": 4850 }, { "epoch": 0.03, "grad_norm": 5.139895427887596, "learning_rate": 1.9996699711965363e-06, "loss": 1.4575, "step": 4851 }, { "epoch": 0.03, "grad_norm": 4.204327491255401, "learning_rate": 1.9996698348425904e-06, "loss": 1.257, "step": 4852 }, { "epoch": 0.03, "grad_norm": 4.423481744843335, "learning_rate": 1.9996696984604866e-06, "loss": 1.3695, "step": 4853 }, { "epoch": 0.03, "grad_norm": 4.605386917108096, "learning_rate": 1.9996695620502257e-06, "loss": 1.3833, "step": 4854 }, { "epoch": 0.03, "grad_norm": 5.526277413807305, "learning_rate": 1.9996694256118077e-06, "loss": 1.3928, "step": 4855 }, { "epoch": 0.03, "grad_norm": 4.37299745351461, "learning_rate": 1.999669289145232e-06, "loss": 1.3407, "step": 4856 }, { "epoch": 0.03, "grad_norm": 5.074171039640281, "learning_rate": 1.9996691526504993e-06, "loss": 1.4996, "step": 4857 }, { "epoch": 0.03, "grad_norm": 4.660491102242868, "learning_rate": 1.999669016127609e-06, "loss": 1.3281, "step": 4858 }, { "epoch": 0.03, "grad_norm": 4.547180967060236, "learning_rate": 1.999668879576561e-06, "loss": 1.3997, "step": 4859 }, { "epoch": 0.03, "grad_norm": 4.875397784753304, "learning_rate": 1.9996687429973563e-06, "loss": 1.2495, "step": 4860 }, { "epoch": 0.03, "grad_norm": 4.527792890552893, "learning_rate": 1.999668606389994e-06, "loss": 1.4757, "step": 4861 }, { "epoch": 0.03, "grad_norm": 4.558839828877586, "learning_rate": 1.9996684697544742e-06, "loss": 1.3041, "step": 4862 }, { "epoch": 0.03, "grad_norm": 4.462134868557263, "learning_rate": 1.999668333090797e-06, "loss": 1.2766, "step": 4863 }, { "epoch": 0.03, "grad_norm": 4.815224453706913, "learning_rate": 1.9996681963989626e-06, "loss": 1.5012, "step": 4864 }, { "epoch": 0.03, "grad_norm": 4.427115255915135, "learning_rate": 1.9996680596789714e-06, "loss": 1.3876, "step": 4865 }, { "epoch": 0.03, "grad_norm": 4.395307343589683, "learning_rate": 1.999667922930822e-06, "loss": 1.3663, "step": 4866 }, { "epoch": 0.03, "grad_norm": 11.504475402831304, "learning_rate": 1.9996677861545157e-06, "loss": 1.3031, "step": 4867 }, { "epoch": 0.03, "grad_norm": 4.691668495267009, "learning_rate": 1.999667649350052e-06, "loss": 1.5103, "step": 4868 }, { "epoch": 0.03, "grad_norm": 4.807011919827541, "learning_rate": 1.9996675125174312e-06, "loss": 1.5035, "step": 4869 }, { "epoch": 0.03, "grad_norm": 4.659866337693316, "learning_rate": 1.999667375656653e-06, "loss": 1.3734, "step": 4870 }, { "epoch": 0.03, "grad_norm": 4.309140742861166, "learning_rate": 1.9996672387677173e-06, "loss": 1.4029, "step": 4871 }, { "epoch": 0.03, "grad_norm": 4.5430207721794655, "learning_rate": 1.9996671018506245e-06, "loss": 1.2461, "step": 4872 }, { "epoch": 0.03, "grad_norm": 4.986224627840002, "learning_rate": 1.999666964905374e-06, "loss": 1.3279, "step": 4873 }, { "epoch": 0.03, "grad_norm": 4.768451752962609, "learning_rate": 1.9996668279319668e-06, "loss": 1.5597, "step": 4874 }, { "epoch": 0.03, "grad_norm": 4.4045185366632005, "learning_rate": 1.999666690930402e-06, "loss": 1.3605, "step": 4875 }, { "epoch": 0.03, "grad_norm": 5.547189338935934, "learning_rate": 1.99966655390068e-06, "loss": 1.4016, "step": 4876 }, { "epoch": 0.03, "grad_norm": 4.800271574017062, "learning_rate": 1.999666416842801e-06, "loss": 1.4097, "step": 4877 }, { "epoch": 0.03, "grad_norm": 4.743358842780424, "learning_rate": 1.999666279756764e-06, "loss": 1.3934, "step": 4878 }, { "epoch": 0.03, "grad_norm": 4.530830556209148, "learning_rate": 1.9996661426425705e-06, "loss": 1.3985, "step": 4879 }, { "epoch": 0.03, "grad_norm": 11.044648990306191, "learning_rate": 1.999666005500219e-06, "loss": 1.53, "step": 4880 }, { "epoch": 0.03, "grad_norm": 7.9978749130735, "learning_rate": 1.999665868329711e-06, "loss": 1.4872, "step": 4881 }, { "epoch": 0.03, "grad_norm": 4.55657722400038, "learning_rate": 1.9996657311310453e-06, "loss": 1.3815, "step": 4882 }, { "epoch": 0.03, "grad_norm": 7.398056586649666, "learning_rate": 1.999665593904222e-06, "loss": 1.4576, "step": 4883 }, { "epoch": 0.03, "grad_norm": 4.928651247720553, "learning_rate": 1.999665456649242e-06, "loss": 1.3804, "step": 4884 }, { "epoch": 0.03, "grad_norm": 5.5124114489765255, "learning_rate": 1.9996653193661048e-06, "loss": 1.4463, "step": 4885 }, { "epoch": 0.03, "grad_norm": 4.753941694249805, "learning_rate": 1.99966518205481e-06, "loss": 1.3844, "step": 4886 }, { "epoch": 0.03, "grad_norm": 4.783030101596149, "learning_rate": 1.9996650447153582e-06, "loss": 1.3934, "step": 4887 }, { "epoch": 0.03, "grad_norm": 4.616949792800396, "learning_rate": 1.999664907347749e-06, "loss": 1.5281, "step": 4888 }, { "epoch": 0.03, "grad_norm": 5.4828373987439525, "learning_rate": 1.9996647699519825e-06, "loss": 1.3051, "step": 4889 }, { "epoch": 0.03, "grad_norm": 4.467158174079821, "learning_rate": 1.9996646325280587e-06, "loss": 1.4541, "step": 4890 }, { "epoch": 0.03, "grad_norm": 4.406824641028305, "learning_rate": 1.999664495075978e-06, "loss": 1.2389, "step": 4891 }, { "epoch": 0.03, "eval_loss": 1.5879747867584229, "eval_runtime": 4.6028, "eval_samples_per_second": 1.955, "eval_steps_per_second": 1.086, "step": 4891 }, { "epoch": 0.03, "grad_norm": 4.505150948797786, "learning_rate": 1.9996643575957397e-06, "loss": 1.3242, "step": 4892 }, { "epoch": 0.03, "grad_norm": 4.452979134833612, "learning_rate": 1.9996642200873447e-06, "loss": 1.3525, "step": 4893 }, { "epoch": 0.03, "grad_norm": 4.769536123734563, "learning_rate": 1.999664082550792e-06, "loss": 1.4308, "step": 4894 }, { "epoch": 0.03, "grad_norm": 6.150893673031577, "learning_rate": 1.9996639449860825e-06, "loss": 1.3133, "step": 4895 }, { "epoch": 0.03, "grad_norm": 5.129749543457461, "learning_rate": 1.9996638073932153e-06, "loss": 1.4119, "step": 4896 }, { "epoch": 0.03, "grad_norm": 4.495210496492481, "learning_rate": 1.9996636697721916e-06, "loss": 1.3255, "step": 4897 }, { "epoch": 0.03, "grad_norm": 4.8419242040224475, "learning_rate": 1.99966353212301e-06, "loss": 1.3193, "step": 4898 }, { "epoch": 0.03, "grad_norm": 4.550861178064907, "learning_rate": 1.9996633944456716e-06, "loss": 1.3279, "step": 4899 }, { "epoch": 0.03, "grad_norm": 5.254026521766827, "learning_rate": 1.9996632567401757e-06, "loss": 1.4014, "step": 4900 }, { "epoch": 0.03, "grad_norm": 4.566784238226731, "learning_rate": 1.999663119006523e-06, "loss": 1.3501, "step": 4901 }, { "epoch": 0.03, "grad_norm": 6.703466983616319, "learning_rate": 1.999662981244713e-06, "loss": 1.2757, "step": 4902 }, { "epoch": 0.03, "grad_norm": 5.0344704209883675, "learning_rate": 1.9996628434547454e-06, "loss": 1.5171, "step": 4903 }, { "epoch": 0.03, "grad_norm": 4.928481649465292, "learning_rate": 1.999662705636621e-06, "loss": 1.4109, "step": 4904 }, { "epoch": 0.03, "grad_norm": 5.7086203678126415, "learning_rate": 1.9996625677903393e-06, "loss": 1.4863, "step": 4905 }, { "epoch": 0.03, "grad_norm": 4.855491302198558, "learning_rate": 1.9996624299159006e-06, "loss": 1.37, "step": 4906 }, { "epoch": 0.03, "grad_norm": 6.635896637659114, "learning_rate": 1.9996622920133045e-06, "loss": 1.489, "step": 4907 }, { "epoch": 0.03, "grad_norm": 4.620910257844721, "learning_rate": 1.9996621540825512e-06, "loss": 1.4575, "step": 4908 }, { "epoch": 0.03, "grad_norm": 4.967278319705271, "learning_rate": 1.9996620161236414e-06, "loss": 1.225, "step": 4909 }, { "epoch": 0.03, "grad_norm": 4.489758706027532, "learning_rate": 1.9996618781365736e-06, "loss": 1.28, "step": 4910 }, { "epoch": 0.03, "grad_norm": 5.018987375416176, "learning_rate": 1.999661740121349e-06, "loss": 1.2599, "step": 4911 }, { "epoch": 0.03, "grad_norm": 4.729877339942227, "learning_rate": 1.9996616020779676e-06, "loss": 1.5163, "step": 4912 }, { "epoch": 0.03, "grad_norm": 5.003590450938485, "learning_rate": 1.9996614640064286e-06, "loss": 1.4623, "step": 4913 }, { "epoch": 0.03, "grad_norm": 4.844257778258036, "learning_rate": 1.9996613259067326e-06, "loss": 1.4209, "step": 4914 }, { "epoch": 0.03, "grad_norm": 4.7817590475056955, "learning_rate": 1.9996611877788794e-06, "loss": 1.4215, "step": 4915 }, { "epoch": 0.03, "grad_norm": 4.819627973974734, "learning_rate": 1.999661049622869e-06, "loss": 1.4199, "step": 4916 }, { "epoch": 0.03, "grad_norm": 4.709077696274618, "learning_rate": 1.999660911438702e-06, "loss": 1.5143, "step": 4917 }, { "epoch": 0.03, "grad_norm": 6.5933862705934665, "learning_rate": 1.999660773226377e-06, "loss": 1.3542, "step": 4918 }, { "epoch": 0.03, "grad_norm": 4.7040974151039725, "learning_rate": 1.9996606349858953e-06, "loss": 1.3125, "step": 4919 }, { "epoch": 0.03, "grad_norm": 4.4287260862014755, "learning_rate": 1.999660496717257e-06, "loss": 1.384, "step": 4920 }, { "epoch": 0.03, "grad_norm": 10.291975310579549, "learning_rate": 1.999660358420461e-06, "loss": 1.3225, "step": 4921 }, { "epoch": 0.03, "grad_norm": 4.505404618356303, "learning_rate": 1.999660220095508e-06, "loss": 1.3542, "step": 4922 }, { "epoch": 0.03, "grad_norm": 5.208540263995069, "learning_rate": 1.9996600817423977e-06, "loss": 1.4667, "step": 4923 }, { "epoch": 0.03, "grad_norm": 4.58032932131421, "learning_rate": 1.9996599433611305e-06, "loss": 1.4817, "step": 4924 }, { "epoch": 0.03, "grad_norm": 4.637987091086839, "learning_rate": 1.9996598049517063e-06, "loss": 1.5098, "step": 4925 }, { "epoch": 0.03, "grad_norm": 4.9727075736014745, "learning_rate": 1.999659666514125e-06, "loss": 1.3691, "step": 4926 }, { "epoch": 0.03, "grad_norm": 4.355402907618825, "learning_rate": 1.9996595280483866e-06, "loss": 1.2775, "step": 4927 }, { "epoch": 0.03, "grad_norm": 4.995770741891165, "learning_rate": 1.999659389554491e-06, "loss": 1.448, "step": 4928 }, { "epoch": 0.03, "grad_norm": 4.65151264882068, "learning_rate": 1.999659251032438e-06, "loss": 1.346, "step": 4929 }, { "epoch": 0.03, "grad_norm": 4.970197895073858, "learning_rate": 1.9996591124822286e-06, "loss": 1.4548, "step": 4930 }, { "epoch": 0.03, "grad_norm": 4.57484270410133, "learning_rate": 1.999658973903862e-06, "loss": 1.5617, "step": 4931 }, { "epoch": 0.03, "grad_norm": 5.966820833543396, "learning_rate": 1.999658835297338e-06, "loss": 1.453, "step": 4932 }, { "epoch": 0.03, "grad_norm": 9.522035163549237, "learning_rate": 1.999658696662657e-06, "loss": 1.3827, "step": 4933 }, { "epoch": 0.03, "grad_norm": 4.473174043677729, "learning_rate": 1.999658557999819e-06, "loss": 1.3531, "step": 4934 }, { "epoch": 0.03, "grad_norm": 4.24299079828901, "learning_rate": 1.999658419308824e-06, "loss": 1.2754, "step": 4935 }, { "epoch": 0.03, "grad_norm": 5.49883711852753, "learning_rate": 1.999658280589672e-06, "loss": 1.3504, "step": 4936 }, { "epoch": 0.03, "grad_norm": 4.81062731056944, "learning_rate": 1.9996581418423626e-06, "loss": 1.3611, "step": 4937 }, { "epoch": 0.03, "grad_norm": 4.847256713252676, "learning_rate": 1.9996580030668964e-06, "loss": 1.4836, "step": 4938 }, { "epoch": 0.03, "grad_norm": 4.636923268714284, "learning_rate": 1.9996578642632737e-06, "loss": 1.2719, "step": 4939 }, { "epoch": 0.03, "grad_norm": 4.468270411166637, "learning_rate": 1.9996577254314934e-06, "loss": 1.3969, "step": 4940 }, { "epoch": 0.03, "grad_norm": 4.9062320489821065, "learning_rate": 1.999657586571556e-06, "loss": 1.5834, "step": 4941 }, { "epoch": 0.03, "grad_norm": 4.353075572335245, "learning_rate": 1.999657447683462e-06, "loss": 1.286, "step": 4942 }, { "epoch": 0.03, "grad_norm": 4.1722421999333825, "learning_rate": 1.9996573087672105e-06, "loss": 1.3064, "step": 4943 }, { "epoch": 0.03, "grad_norm": 4.685184943977218, "learning_rate": 1.999657169822802e-06, "loss": 1.2777, "step": 4944 }, { "epoch": 0.03, "grad_norm": 4.173916537930611, "learning_rate": 1.9996570308502367e-06, "loss": 1.2809, "step": 4945 }, { "epoch": 0.03, "grad_norm": 9.815878816058133, "learning_rate": 1.9996568918495144e-06, "loss": 1.5139, "step": 4946 }, { "epoch": 0.03, "grad_norm": 5.242274615185901, "learning_rate": 1.999656752820635e-06, "loss": 1.2482, "step": 4947 }, { "epoch": 0.03, "grad_norm": 4.628254423139213, "learning_rate": 1.9996566137635982e-06, "loss": 1.4503, "step": 4948 }, { "epoch": 0.03, "grad_norm": 4.785684383511945, "learning_rate": 1.999656474678405e-06, "loss": 1.0241, "step": 4949 }, { "epoch": 0.03, "grad_norm": 4.435258668007332, "learning_rate": 1.9996563355650546e-06, "loss": 1.3078, "step": 4950 }, { "epoch": 0.03, "grad_norm": 5.537869856809554, "learning_rate": 1.9996561964235474e-06, "loss": 1.2117, "step": 4951 }, { "epoch": 0.03, "grad_norm": 5.296415857570484, "learning_rate": 1.9996560572538827e-06, "loss": 1.2693, "step": 4952 }, { "epoch": 0.03, "grad_norm": 6.629284609340128, "learning_rate": 1.9996559180560614e-06, "loss": 1.3939, "step": 4953 }, { "epoch": 0.03, "grad_norm": 4.714726736551369, "learning_rate": 1.9996557788300834e-06, "loss": 1.5352, "step": 4954 }, { "epoch": 0.03, "grad_norm": 4.811921746020366, "learning_rate": 1.999655639575948e-06, "loss": 1.3141, "step": 4955 }, { "epoch": 0.03, "grad_norm": 4.736849502199576, "learning_rate": 1.9996555002936553e-06, "loss": 1.296, "step": 4956 }, { "epoch": 0.03, "grad_norm": 5.025593654215372, "learning_rate": 1.999655360983206e-06, "loss": 1.3401, "step": 4957 }, { "epoch": 0.03, "grad_norm": 4.346141381064583, "learning_rate": 1.9996552216446e-06, "loss": 1.2662, "step": 4958 }, { "epoch": 0.03, "grad_norm": 6.078985637023049, "learning_rate": 1.9996550822778365e-06, "loss": 1.4397, "step": 4959 }, { "epoch": 0.03, "grad_norm": 5.293708684832406, "learning_rate": 1.9996549428829166e-06, "loss": 1.3144, "step": 4960 }, { "epoch": 0.03, "grad_norm": 4.954695796739692, "learning_rate": 1.9996548034598395e-06, "loss": 1.4181, "step": 4961 }, { "epoch": 0.03, "grad_norm": 4.301633925796158, "learning_rate": 1.9996546640086054e-06, "loss": 1.3035, "step": 4962 }, { "epoch": 0.03, "grad_norm": 4.849977614063476, "learning_rate": 1.999654524529214e-06, "loss": 1.5079, "step": 4963 }, { "epoch": 0.03, "grad_norm": 5.002788514765028, "learning_rate": 1.9996543850216664e-06, "loss": 1.4079, "step": 4964 }, { "epoch": 0.03, "eval_loss": 1.5884954929351807, "eval_runtime": 4.6023, "eval_samples_per_second": 1.956, "eval_steps_per_second": 1.086, "step": 4964 }, { "epoch": 0.03, "grad_norm": 5.929779615942267, "learning_rate": 1.9996542454859615e-06, "loss": 1.2237, "step": 4965 }, { "epoch": 0.03, "grad_norm": 19.98003308629139, "learning_rate": 1.9996541059220995e-06, "loss": 1.6444, "step": 4966 }, { "epoch": 0.03, "grad_norm": 4.465636137391666, "learning_rate": 1.999653966330081e-06, "loss": 1.4086, "step": 4967 }, { "epoch": 0.03, "grad_norm": 4.701530835624048, "learning_rate": 1.999653826709905e-06, "loss": 1.2818, "step": 4968 }, { "epoch": 0.03, "grad_norm": 4.301135490058194, "learning_rate": 1.9996536870615724e-06, "loss": 1.3054, "step": 4969 }, { "epoch": 0.03, "grad_norm": 4.9078177423659595, "learning_rate": 1.999653547385083e-06, "loss": 1.4703, "step": 4970 }, { "epoch": 0.03, "grad_norm": 4.037717114633664, "learning_rate": 1.9996534076804366e-06, "loss": 1.2785, "step": 4971 }, { "epoch": 0.03, "grad_norm": 10.49535073651398, "learning_rate": 1.999653267947633e-06, "loss": 1.2535, "step": 4972 }, { "epoch": 0.03, "grad_norm": 4.4152212219806, "learning_rate": 1.9996531281866733e-06, "loss": 1.396, "step": 4973 }, { "epoch": 0.03, "grad_norm": 4.649643790591089, "learning_rate": 1.999652988397556e-06, "loss": 1.2659, "step": 4974 }, { "epoch": 0.03, "grad_norm": 4.749432814107806, "learning_rate": 1.9996528485802817e-06, "loss": 1.5086, "step": 4975 }, { "epoch": 0.03, "grad_norm": 4.6032099809794325, "learning_rate": 1.9996527087348507e-06, "loss": 1.4974, "step": 4976 }, { "epoch": 0.03, "grad_norm": 4.496339482907752, "learning_rate": 1.999652568861263e-06, "loss": 1.3888, "step": 4977 }, { "epoch": 0.03, "grad_norm": 4.379277330878916, "learning_rate": 1.9996524289595184e-06, "loss": 1.3209, "step": 4978 }, { "epoch": 0.03, "grad_norm": 4.508967072877293, "learning_rate": 1.9996522890296162e-06, "loss": 1.3332, "step": 4979 }, { "epoch": 0.03, "grad_norm": 4.717289235032692, "learning_rate": 1.999652149071558e-06, "loss": 1.2594, "step": 4980 }, { "epoch": 0.03, "grad_norm": 4.499625716266565, "learning_rate": 1.9996520090853428e-06, "loss": 1.1879, "step": 4981 }, { "epoch": 0.03, "grad_norm": 4.873543966630928, "learning_rate": 1.9996518690709702e-06, "loss": 1.5932, "step": 4982 }, { "epoch": 0.03, "grad_norm": 4.680017638205766, "learning_rate": 1.999651729028441e-06, "loss": 1.3994, "step": 4983 }, { "epoch": 0.03, "grad_norm": 5.246729807504438, "learning_rate": 1.9996515889577552e-06, "loss": 1.4374, "step": 4984 }, { "epoch": 0.03, "grad_norm": 4.857621717344253, "learning_rate": 1.9996514488589123e-06, "loss": 1.4387, "step": 4985 }, { "epoch": 0.03, "grad_norm": 4.354157706315619, "learning_rate": 1.9996513087319124e-06, "loss": 1.3744, "step": 4986 }, { "epoch": 0.03, "grad_norm": 4.608806360543509, "learning_rate": 1.999651168576756e-06, "loss": 1.5182, "step": 4987 }, { "epoch": 0.03, "grad_norm": 4.488314183962565, "learning_rate": 1.9996510283934425e-06, "loss": 1.3388, "step": 4988 }, { "epoch": 0.03, "grad_norm": 6.799259935023664, "learning_rate": 1.999650888181972e-06, "loss": 1.6192, "step": 4989 }, { "epoch": 0.03, "grad_norm": 4.635620856016286, "learning_rate": 1.999650747942345e-06, "loss": 1.4724, "step": 4990 }, { "epoch": 0.03, "grad_norm": 4.445110599054364, "learning_rate": 1.9996506076745615e-06, "loss": 1.376, "step": 4991 }, { "epoch": 0.03, "grad_norm": 4.516224898247299, "learning_rate": 1.9996504673786204e-06, "loss": 1.322, "step": 4992 }, { "epoch": 0.03, "grad_norm": 4.739723435982058, "learning_rate": 1.9996503270545227e-06, "loss": 1.4548, "step": 4993 }, { "epoch": 0.03, "grad_norm": 4.613520266268725, "learning_rate": 1.9996501867022687e-06, "loss": 1.4638, "step": 4994 }, { "epoch": 0.03, "grad_norm": 4.374530596162222, "learning_rate": 1.9996500463218572e-06, "loss": 1.2745, "step": 4995 }, { "epoch": 0.03, "grad_norm": 4.9192696634571265, "learning_rate": 1.9996499059132895e-06, "loss": 1.3904, "step": 4996 }, { "epoch": 0.03, "grad_norm": 4.550637226039346, "learning_rate": 1.9996497654765643e-06, "loss": 1.5197, "step": 4997 }, { "epoch": 0.03, "grad_norm": 4.3166469589187235, "learning_rate": 1.999649625011683e-06, "loss": 1.3615, "step": 4998 }, { "epoch": 0.03, "grad_norm": 3.9647631282254903, "learning_rate": 1.9996494845186444e-06, "loss": 1.2493, "step": 4999 }, { "epoch": 0.03, "grad_norm": 4.598890837133854, "learning_rate": 1.999649343997449e-06, "loss": 1.3756, "step": 5000 }, { "epoch": 0.03, "grad_norm": 5.1545243798412335, "learning_rate": 1.9996492034480967e-06, "loss": 1.3058, "step": 5001 }, { "epoch": 0.03, "grad_norm": 5.194455953045936, "learning_rate": 1.999649062870588e-06, "loss": 1.3365, "step": 5002 }, { "epoch": 0.03, "grad_norm": 4.492615339765341, "learning_rate": 1.9996489222649224e-06, "loss": 1.3927, "step": 5003 }, { "epoch": 0.03, "grad_norm": 5.748565934709617, "learning_rate": 1.9996487816311e-06, "loss": 1.2491, "step": 5004 }, { "epoch": 0.03, "grad_norm": 4.40480061920351, "learning_rate": 1.9996486409691206e-06, "loss": 1.4601, "step": 5005 }, { "epoch": 0.03, "grad_norm": 4.481709558199589, "learning_rate": 1.9996485002789848e-06, "loss": 1.3711, "step": 5006 }, { "epoch": 0.03, "grad_norm": 6.043031459500611, "learning_rate": 1.999648359560692e-06, "loss": 1.4037, "step": 5007 }, { "epoch": 0.03, "grad_norm": 4.464137226055633, "learning_rate": 1.9996482188142423e-06, "loss": 1.3197, "step": 5008 }, { "epoch": 0.03, "grad_norm": 4.71333260992595, "learning_rate": 1.999648078039636e-06, "loss": 1.2934, "step": 5009 }, { "epoch": 0.03, "grad_norm": 4.6727290961823265, "learning_rate": 1.9996479372368728e-06, "loss": 1.4415, "step": 5010 }, { "epoch": 0.03, "grad_norm": 4.910850901414767, "learning_rate": 1.999647796405953e-06, "loss": 1.3522, "step": 5011 }, { "epoch": 0.03, "grad_norm": 4.131208992516828, "learning_rate": 1.9996476555468767e-06, "loss": 1.1736, "step": 5012 }, { "epoch": 0.03, "grad_norm": 5.00627949468508, "learning_rate": 1.999647514659643e-06, "loss": 1.3759, "step": 5013 }, { "epoch": 0.03, "grad_norm": 4.325045330673293, "learning_rate": 1.9996473737442532e-06, "loss": 1.1789, "step": 5014 }, { "epoch": 0.03, "grad_norm": 4.305491625182342, "learning_rate": 1.9996472328007063e-06, "loss": 1.3465, "step": 5015 }, { "epoch": 0.03, "grad_norm": 4.710281409607687, "learning_rate": 1.9996470918290027e-06, "loss": 1.4342, "step": 5016 }, { "epoch": 0.03, "grad_norm": 4.668861934316613, "learning_rate": 1.999646950829142e-06, "loss": 1.4647, "step": 5017 }, { "epoch": 0.03, "grad_norm": 4.981831107261447, "learning_rate": 1.9996468098011256e-06, "loss": 1.4368, "step": 5018 }, { "epoch": 0.03, "grad_norm": 4.696677434701531, "learning_rate": 1.9996466687449517e-06, "loss": 1.3528, "step": 5019 }, { "epoch": 0.03, "grad_norm": 4.626644136638084, "learning_rate": 1.999646527660621e-06, "loss": 1.3947, "step": 5020 }, { "epoch": 0.03, "grad_norm": 5.336280516663327, "learning_rate": 1.9996463865481343e-06, "loss": 1.3848, "step": 5021 }, { "epoch": 0.03, "grad_norm": 4.204518153810109, "learning_rate": 1.99964624540749e-06, "loss": 1.2134, "step": 5022 }, { "epoch": 0.03, "grad_norm": 4.481850990435741, "learning_rate": 1.9996461042386895e-06, "loss": 1.3728, "step": 5023 }, { "epoch": 0.03, "grad_norm": 4.596116350666009, "learning_rate": 1.9996459630417323e-06, "loss": 1.2481, "step": 5024 }, { "epoch": 0.03, "grad_norm": 6.071200702929735, "learning_rate": 1.999645821816618e-06, "loss": 1.2837, "step": 5025 }, { "epoch": 0.03, "grad_norm": 4.682787458386267, "learning_rate": 1.9996456805633476e-06, "loss": 1.373, "step": 5026 }, { "epoch": 0.03, "grad_norm": 4.465654588073633, "learning_rate": 1.99964553928192e-06, "loss": 1.3399, "step": 5027 }, { "epoch": 0.03, "grad_norm": 4.257427680203682, "learning_rate": 1.999645397972336e-06, "loss": 1.2983, "step": 5028 }, { "epoch": 0.03, "grad_norm": 6.47990486117222, "learning_rate": 1.999645256634595e-06, "loss": 1.4403, "step": 5029 }, { "epoch": 0.03, "grad_norm": 4.395372935794683, "learning_rate": 1.9996451152686976e-06, "loss": 1.2232, "step": 5030 }, { "epoch": 0.03, "grad_norm": 4.500470450876562, "learning_rate": 1.9996449738746435e-06, "loss": 1.4449, "step": 5031 }, { "epoch": 0.03, "grad_norm": 4.387892413032265, "learning_rate": 1.9996448324524328e-06, "loss": 1.3108, "step": 5032 }, { "epoch": 0.03, "grad_norm": 4.312493709388586, "learning_rate": 1.9996446910020654e-06, "loss": 1.347, "step": 5033 }, { "epoch": 0.03, "grad_norm": 4.317136988973174, "learning_rate": 1.999644549523541e-06, "loss": 1.4633, "step": 5034 }, { "epoch": 0.03, "grad_norm": 4.53828222623832, "learning_rate": 1.9996444080168602e-06, "loss": 1.3957, "step": 5035 }, { "epoch": 0.03, "grad_norm": 4.41946763026586, "learning_rate": 1.9996442664820225e-06, "loss": 1.3796, "step": 5036 }, { "epoch": 0.03, "grad_norm": 4.4238340440667665, "learning_rate": 1.9996441249190285e-06, "loss": 1.2988, "step": 5037 }, { "epoch": 0.03, "eval_loss": 1.5884430408477783, "eval_runtime": 4.5994, "eval_samples_per_second": 1.957, "eval_steps_per_second": 1.087, "step": 5037 }, { "epoch": 0.03, "grad_norm": 4.787279185469436, "learning_rate": 1.999643983327878e-06, "loss": 1.4537, "step": 5038 }, { "epoch": 0.03, "grad_norm": 4.6269270235179665, "learning_rate": 1.9996438417085706e-06, "loss": 1.5011, "step": 5039 }, { "epoch": 0.03, "grad_norm": 4.520648400330527, "learning_rate": 1.9996437000611062e-06, "loss": 1.4328, "step": 5040 }, { "epoch": 0.03, "grad_norm": 4.697830269102603, "learning_rate": 1.9996435583854857e-06, "loss": 1.5317, "step": 5041 }, { "epoch": 0.03, "grad_norm": 4.32983204513598, "learning_rate": 1.999643416681708e-06, "loss": 1.3167, "step": 5042 }, { "epoch": 0.03, "grad_norm": 12.445854572605224, "learning_rate": 1.999643274949774e-06, "loss": 1.3766, "step": 5043 }, { "epoch": 0.03, "grad_norm": 14.875042607954068, "learning_rate": 1.9996431331896837e-06, "loss": 1.5167, "step": 5044 }, { "epoch": 0.03, "grad_norm": 4.690403487286438, "learning_rate": 1.9996429914014365e-06, "loss": 1.3579, "step": 5045 }, { "epoch": 0.03, "grad_norm": 4.562533250540788, "learning_rate": 1.9996428495850327e-06, "loss": 1.3749, "step": 5046 }, { "epoch": 0.03, "grad_norm": 4.409660806429923, "learning_rate": 1.999642707740472e-06, "loss": 1.3783, "step": 5047 }, { "epoch": 0.03, "grad_norm": 4.743282761576718, "learning_rate": 1.9996425658677548e-06, "loss": 1.3337, "step": 5048 }, { "epoch": 0.03, "grad_norm": 4.653231436589295, "learning_rate": 1.9996424239668815e-06, "loss": 1.4122, "step": 5049 }, { "epoch": 0.03, "grad_norm": 4.275647302697379, "learning_rate": 1.999642282037851e-06, "loss": 1.2388, "step": 5050 }, { "epoch": 0.03, "grad_norm": 4.71674766293332, "learning_rate": 1.999642140080664e-06, "loss": 1.5939, "step": 5051 }, { "epoch": 0.03, "grad_norm": 4.937829772666525, "learning_rate": 1.999641998095321e-06, "loss": 1.2365, "step": 5052 }, { "epoch": 0.03, "grad_norm": 4.767818163191467, "learning_rate": 1.999641856081821e-06, "loss": 1.1922, "step": 5053 }, { "epoch": 0.03, "grad_norm": 6.373049591623676, "learning_rate": 1.9996417140401644e-06, "loss": 1.5192, "step": 5054 }, { "epoch": 0.03, "grad_norm": 4.442225362706674, "learning_rate": 1.999641571970351e-06, "loss": 1.3672, "step": 5055 }, { "epoch": 0.03, "grad_norm": 5.625776490519675, "learning_rate": 1.999641429872381e-06, "loss": 1.3573, "step": 5056 }, { "epoch": 0.03, "grad_norm": 4.999717882842893, "learning_rate": 1.999641287746255e-06, "loss": 1.3185, "step": 5057 }, { "epoch": 0.03, "grad_norm": 4.455798037442581, "learning_rate": 1.999641145591972e-06, "loss": 1.3481, "step": 5058 }, { "epoch": 0.03, "grad_norm": 4.019127018315487, "learning_rate": 1.999641003409532e-06, "loss": 1.0802, "step": 5059 }, { "epoch": 0.03, "grad_norm": 4.417337897998684, "learning_rate": 1.999640861198936e-06, "loss": 1.4398, "step": 5060 }, { "epoch": 0.03, "grad_norm": 4.768902051082049, "learning_rate": 1.9996407189601834e-06, "loss": 1.4748, "step": 5061 }, { "epoch": 0.03, "grad_norm": 6.343100027269473, "learning_rate": 1.999640576693274e-06, "loss": 1.3709, "step": 5062 }, { "epoch": 0.03, "grad_norm": 4.8008865496503965, "learning_rate": 1.9996404343982086e-06, "loss": 1.446, "step": 5063 }, { "epoch": 0.03, "grad_norm": 4.57188740462178, "learning_rate": 1.999640292074986e-06, "loss": 1.3294, "step": 5064 }, { "epoch": 0.03, "grad_norm": 4.647259026804336, "learning_rate": 1.9996401497236077e-06, "loss": 1.4078, "step": 5065 }, { "epoch": 0.03, "grad_norm": 4.913890463540519, "learning_rate": 1.9996400073440723e-06, "loss": 1.331, "step": 5066 }, { "epoch": 0.03, "grad_norm": 4.413066958492824, "learning_rate": 1.99963986493638e-06, "loss": 1.2172, "step": 5067 }, { "epoch": 0.03, "grad_norm": 4.47097548323965, "learning_rate": 1.999639722500532e-06, "loss": 1.3581, "step": 5068 }, { "epoch": 0.03, "grad_norm": 4.71028185304613, "learning_rate": 1.999639580036527e-06, "loss": 1.329, "step": 5069 }, { "epoch": 0.03, "grad_norm": 5.103536487727267, "learning_rate": 1.9996394375443653e-06, "loss": 1.4207, "step": 5070 }, { "epoch": 0.03, "grad_norm": 4.811508678485072, "learning_rate": 1.9996392950240475e-06, "loss": 1.518, "step": 5071 }, { "epoch": 0.03, "grad_norm": 4.127675276325026, "learning_rate": 1.999639152475573e-06, "loss": 1.2926, "step": 5072 }, { "epoch": 0.03, "grad_norm": 4.557844146271824, "learning_rate": 1.999639009898942e-06, "loss": 1.4426, "step": 5073 }, { "epoch": 0.03, "grad_norm": 4.860046835036374, "learning_rate": 1.9996388672941546e-06, "loss": 1.4809, "step": 5074 }, { "epoch": 0.03, "grad_norm": 4.65753944339768, "learning_rate": 1.9996387246612106e-06, "loss": 1.4068, "step": 5075 }, { "epoch": 0.03, "grad_norm": 4.8706428775934745, "learning_rate": 1.99963858200011e-06, "loss": 1.1294, "step": 5076 }, { "epoch": 0.03, "grad_norm": 4.863523617552779, "learning_rate": 1.999638439310853e-06, "loss": 1.3169, "step": 5077 }, { "epoch": 0.03, "grad_norm": 6.035462602494183, "learning_rate": 1.9996382965934392e-06, "loss": 1.4175, "step": 5078 }, { "epoch": 0.03, "grad_norm": 5.209236934674451, "learning_rate": 1.9996381538478695e-06, "loss": 1.2541, "step": 5079 }, { "epoch": 0.03, "grad_norm": 4.589261805424809, "learning_rate": 1.9996380110741427e-06, "loss": 1.3112, "step": 5080 }, { "epoch": 0.03, "grad_norm": 5.182226709791036, "learning_rate": 1.9996378682722597e-06, "loss": 1.3856, "step": 5081 }, { "epoch": 0.03, "grad_norm": 4.827770797738687, "learning_rate": 1.9996377254422205e-06, "loss": 1.3987, "step": 5082 }, { "epoch": 0.03, "grad_norm": 4.888680288943949, "learning_rate": 1.9996375825840246e-06, "loss": 1.4795, "step": 5083 }, { "epoch": 0.03, "grad_norm": 4.362209274162513, "learning_rate": 1.999637439697672e-06, "loss": 1.2897, "step": 5084 }, { "epoch": 0.03, "grad_norm": 5.579767286468069, "learning_rate": 1.9996372967831633e-06, "loss": 1.486, "step": 5085 }, { "epoch": 0.03, "grad_norm": 5.715927557674843, "learning_rate": 1.999637153840498e-06, "loss": 1.3947, "step": 5086 }, { "epoch": 0.03, "grad_norm": 4.5154657470793005, "learning_rate": 1.9996370108696763e-06, "loss": 1.2339, "step": 5087 }, { "epoch": 0.03, "grad_norm": 4.548517262126829, "learning_rate": 1.999636867870698e-06, "loss": 1.3473, "step": 5088 }, { "epoch": 0.03, "grad_norm": 7.972278232399468, "learning_rate": 1.9996367248435636e-06, "loss": 1.3446, "step": 5089 }, { "epoch": 0.03, "grad_norm": 4.187881227535005, "learning_rate": 1.9996365817882725e-06, "loss": 1.3722, "step": 5090 }, { "epoch": 0.03, "grad_norm": 4.9748319935344725, "learning_rate": 1.999636438704825e-06, "loss": 1.2213, "step": 5091 }, { "epoch": 0.03, "grad_norm": 4.501728350463374, "learning_rate": 1.999636295593221e-06, "loss": 1.2142, "step": 5092 }, { "epoch": 0.03, "grad_norm": 4.64131706776445, "learning_rate": 1.9996361524534605e-06, "loss": 1.3512, "step": 5093 }, { "epoch": 0.03, "grad_norm": 4.652688819908968, "learning_rate": 1.999636009285544e-06, "loss": 1.4951, "step": 5094 }, { "epoch": 0.03, "grad_norm": 5.86680131063485, "learning_rate": 1.9996358660894706e-06, "loss": 1.4418, "step": 5095 }, { "epoch": 0.03, "grad_norm": 5.80990903584315, "learning_rate": 1.999635722865241e-06, "loss": 1.2776, "step": 5096 }, { "epoch": 0.03, "grad_norm": 4.602684135874908, "learning_rate": 1.999635579612855e-06, "loss": 1.3736, "step": 5097 }, { "epoch": 0.03, "grad_norm": 4.488870638353602, "learning_rate": 1.9996354363323123e-06, "loss": 1.4349, "step": 5098 }, { "epoch": 0.03, "grad_norm": 4.416661940445059, "learning_rate": 1.999635293023614e-06, "loss": 1.3378, "step": 5099 }, { "epoch": 0.03, "grad_norm": 4.619789395024554, "learning_rate": 1.9996351496867584e-06, "loss": 1.2565, "step": 5100 }, { "epoch": 0.03, "grad_norm": 5.517088793451248, "learning_rate": 1.9996350063217468e-06, "loss": 1.479, "step": 5101 }, { "epoch": 0.03, "grad_norm": 4.776894590035201, "learning_rate": 1.9996348629285784e-06, "loss": 1.3805, "step": 5102 }, { "epoch": 0.03, "grad_norm": 4.538811156951564, "learning_rate": 1.9996347195072543e-06, "loss": 1.4089, "step": 5103 }, { "epoch": 0.03, "grad_norm": 6.844668443293697, "learning_rate": 1.999634576057773e-06, "loss": 0.8379, "step": 5104 }, { "epoch": 0.03, "grad_norm": 4.2428078733204515, "learning_rate": 1.999634432580136e-06, "loss": 1.3333, "step": 5105 }, { "epoch": 0.03, "grad_norm": 4.675285252647242, "learning_rate": 1.9996342890743425e-06, "loss": 1.4058, "step": 5106 }, { "epoch": 0.03, "grad_norm": 4.8116511907238255, "learning_rate": 1.9996341455403926e-06, "loss": 1.3124, "step": 5107 }, { "epoch": 0.03, "grad_norm": 4.719938689172663, "learning_rate": 1.999634001978286e-06, "loss": 1.3912, "step": 5108 }, { "epoch": 0.03, "grad_norm": 6.661728425638731, "learning_rate": 1.9996338583880238e-06, "loss": 1.5051, "step": 5109 }, { "epoch": 0.03, "grad_norm": 4.351034669346957, "learning_rate": 1.9996337147696044e-06, "loss": 1.3348, "step": 5110 }, { "epoch": 0.03, "eval_loss": 1.5869734287261963, "eval_runtime": 4.6358, "eval_samples_per_second": 1.941, "eval_steps_per_second": 1.079, "step": 5110 }, { "epoch": 0.03, "grad_norm": 4.636008355552436, "learning_rate": 1.999633571123029e-06, "loss": 1.3045, "step": 5111 }, { "epoch": 0.03, "grad_norm": 4.679248658230449, "learning_rate": 1.9996334274482974e-06, "loss": 1.3603, "step": 5112 }, { "epoch": 0.03, "grad_norm": 5.052382024408589, "learning_rate": 1.9996332837454093e-06, "loss": 1.407, "step": 5113 }, { "epoch": 0.03, "grad_norm": 4.669484864021556, "learning_rate": 1.999633140014365e-06, "loss": 1.3878, "step": 5114 }, { "epoch": 0.03, "grad_norm": 5.007725764398314, "learning_rate": 1.999632996255164e-06, "loss": 1.4109, "step": 5115 }, { "epoch": 0.03, "grad_norm": 5.070132417354139, "learning_rate": 1.999632852467807e-06, "loss": 1.3491, "step": 5116 }, { "epoch": 0.03, "grad_norm": 4.874656960212217, "learning_rate": 1.9996327086522937e-06, "loss": 1.4276, "step": 5117 }, { "epoch": 0.03, "grad_norm": 4.278710997155434, "learning_rate": 1.9996325648086237e-06, "loss": 1.2339, "step": 5118 }, { "epoch": 0.03, "grad_norm": 4.858352564069663, "learning_rate": 1.9996324209367974e-06, "loss": 1.4089, "step": 5119 }, { "epoch": 0.03, "grad_norm": 7.883506046721525, "learning_rate": 1.9996322770368154e-06, "loss": 1.5696, "step": 5120 }, { "epoch": 0.03, "grad_norm": 5.274266624575993, "learning_rate": 1.9996321331086763e-06, "loss": 1.5546, "step": 5121 }, { "epoch": 0.03, "grad_norm": 5.995316348086407, "learning_rate": 1.999631989152381e-06, "loss": 1.5866, "step": 5122 }, { "epoch": 0.03, "grad_norm": 4.399677787317921, "learning_rate": 1.99963184516793e-06, "loss": 1.449, "step": 5123 }, { "epoch": 0.03, "grad_norm": 4.43181644880042, "learning_rate": 1.999631701155322e-06, "loss": 1.3492, "step": 5124 }, { "epoch": 0.03, "grad_norm": 5.406313146704244, "learning_rate": 1.999631557114558e-06, "loss": 1.4326, "step": 5125 }, { "epoch": 0.03, "grad_norm": 4.825610570576208, "learning_rate": 1.999631413045638e-06, "loss": 1.4647, "step": 5126 }, { "epoch": 0.03, "grad_norm": 4.711642297100689, "learning_rate": 1.9996312689485615e-06, "loss": 1.3422, "step": 5127 }, { "epoch": 0.03, "grad_norm": 5.220966772074651, "learning_rate": 1.9996311248233284e-06, "loss": 1.5348, "step": 5128 }, { "epoch": 0.03, "grad_norm": 7.516467534550853, "learning_rate": 1.9996309806699396e-06, "loss": 1.4123, "step": 5129 }, { "epoch": 0.03, "grad_norm": 4.4276929917199705, "learning_rate": 1.9996308364883936e-06, "loss": 1.3432, "step": 5130 }, { "epoch": 0.03, "grad_norm": 5.382965422068927, "learning_rate": 1.9996306922786923e-06, "loss": 1.4137, "step": 5131 }, { "epoch": 0.03, "grad_norm": 4.912376117760465, "learning_rate": 1.9996305480408343e-06, "loss": 1.4876, "step": 5132 }, { "epoch": 0.03, "grad_norm": 4.421408662891891, "learning_rate": 1.99963040377482e-06, "loss": 1.3509, "step": 5133 }, { "epoch": 0.03, "grad_norm": 6.149236618830177, "learning_rate": 1.9996302594806498e-06, "loss": 1.4354, "step": 5134 }, { "epoch": 0.03, "grad_norm": 4.2199110080828515, "learning_rate": 1.9996301151583227e-06, "loss": 1.3263, "step": 5135 }, { "epoch": 0.03, "grad_norm": 5.161530510450788, "learning_rate": 1.99962997080784e-06, "loss": 1.3111, "step": 5136 }, { "epoch": 0.03, "grad_norm": 5.094041761942676, "learning_rate": 1.9996298264292008e-06, "loss": 1.3645, "step": 5137 }, { "epoch": 0.03, "grad_norm": 5.7827061620036275, "learning_rate": 1.999629682022405e-06, "loss": 1.6351, "step": 5138 }, { "epoch": 0.03, "grad_norm": 6.29162902189129, "learning_rate": 1.999629537587453e-06, "loss": 1.606, "step": 5139 }, { "epoch": 0.03, "grad_norm": 6.0223533550029344, "learning_rate": 1.9996293931243454e-06, "loss": 1.3115, "step": 5140 }, { "epoch": 0.03, "grad_norm": 5.927611822538678, "learning_rate": 1.999629248633081e-06, "loss": 1.4572, "step": 5141 }, { "epoch": 0.03, "grad_norm": 6.470119751596624, "learning_rate": 1.9996291041136604e-06, "loss": 1.4877, "step": 5142 }, { "epoch": 0.03, "grad_norm": 4.531014859642228, "learning_rate": 1.999628959566084e-06, "loss": 1.3237, "step": 5143 }, { "epoch": 0.03, "grad_norm": 4.680271995505747, "learning_rate": 1.999628814990351e-06, "loss": 1.3022, "step": 5144 }, { "epoch": 0.03, "grad_norm": 4.562220226438999, "learning_rate": 1.9996286703864617e-06, "loss": 1.4874, "step": 5145 }, { "epoch": 0.03, "grad_norm": 16.84560647695084, "learning_rate": 1.9996285257544166e-06, "loss": 1.5773, "step": 5146 }, { "epoch": 0.03, "grad_norm": 4.617978200923056, "learning_rate": 1.999628381094215e-06, "loss": 1.4126, "step": 5147 }, { "epoch": 0.03, "grad_norm": 4.62254388446995, "learning_rate": 1.9996282364058573e-06, "loss": 1.4919, "step": 5148 }, { "epoch": 0.03, "grad_norm": 6.673617739968051, "learning_rate": 1.999628091689343e-06, "loss": 1.4043, "step": 5149 }, { "epoch": 0.03, "grad_norm": 4.38104447243033, "learning_rate": 1.9996279469446732e-06, "loss": 1.3177, "step": 5150 }, { "epoch": 0.03, "grad_norm": 4.49417393255362, "learning_rate": 1.9996278021718466e-06, "loss": 1.4124, "step": 5151 }, { "epoch": 0.03, "grad_norm": 5.571525271653613, "learning_rate": 1.999627657370864e-06, "loss": 1.3623, "step": 5152 }, { "epoch": 0.03, "grad_norm": 4.832834774414256, "learning_rate": 1.9996275125417256e-06, "loss": 1.4513, "step": 5153 }, { "epoch": 0.03, "grad_norm": 4.801863462992774, "learning_rate": 1.9996273676844307e-06, "loss": 1.3901, "step": 5154 }, { "epoch": 0.03, "grad_norm": 6.2161875460342, "learning_rate": 1.9996272227989792e-06, "loss": 1.3981, "step": 5155 }, { "epoch": 0.03, "grad_norm": 4.386845205293793, "learning_rate": 1.999627077885372e-06, "loss": 1.3474, "step": 5156 }, { "epoch": 0.03, "grad_norm": 4.3499826144651745, "learning_rate": 1.9996269329436084e-06, "loss": 1.3741, "step": 5157 }, { "epoch": 0.03, "grad_norm": 5.536254342757492, "learning_rate": 1.999626787973689e-06, "loss": 1.5086, "step": 5158 }, { "epoch": 0.03, "grad_norm": 5.062572055224925, "learning_rate": 1.999626642975613e-06, "loss": 1.4146, "step": 5159 }, { "epoch": 0.03, "grad_norm": 4.7028956756380875, "learning_rate": 1.9996264979493814e-06, "loss": 1.3689, "step": 5160 }, { "epoch": 0.03, "grad_norm": 5.996231607592078, "learning_rate": 1.9996263528949934e-06, "loss": 1.283, "step": 5161 }, { "epoch": 0.03, "grad_norm": 5.890980057604597, "learning_rate": 1.999626207812449e-06, "loss": 1.338, "step": 5162 }, { "epoch": 0.03, "grad_norm": 4.531671081849737, "learning_rate": 1.9996260627017488e-06, "loss": 1.29, "step": 5163 }, { "epoch": 0.03, "grad_norm": 4.841806979184402, "learning_rate": 1.999625917562892e-06, "loss": 1.4412, "step": 5164 }, { "epoch": 0.03, "grad_norm": 4.816641039162274, "learning_rate": 1.9996257723958797e-06, "loss": 1.4289, "step": 5165 }, { "epoch": 0.03, "grad_norm": 4.553539779730702, "learning_rate": 1.999625627200711e-06, "loss": 1.2829, "step": 5166 }, { "epoch": 0.03, "grad_norm": 10.388543673407055, "learning_rate": 1.9996254819773857e-06, "loss": 1.5212, "step": 5167 }, { "epoch": 0.03, "grad_norm": 4.56516947610891, "learning_rate": 1.999625336725905e-06, "loss": 1.3865, "step": 5168 }, { "epoch": 0.03, "grad_norm": 4.669353354961817, "learning_rate": 1.9996251914462677e-06, "loss": 1.4505, "step": 5169 }, { "epoch": 0.03, "grad_norm": 4.735561321628716, "learning_rate": 1.999625046138474e-06, "loss": 1.3516, "step": 5170 }, { "epoch": 0.03, "grad_norm": 4.678134072942781, "learning_rate": 1.999624900802525e-06, "loss": 1.2902, "step": 5171 }, { "epoch": 0.03, "grad_norm": 7.404317471362056, "learning_rate": 1.9996247554384196e-06, "loss": 1.1697, "step": 5172 }, { "epoch": 0.03, "grad_norm": 4.329970767044252, "learning_rate": 1.999624610046158e-06, "loss": 1.224, "step": 5173 }, { "epoch": 0.03, "grad_norm": 5.023659606210608, "learning_rate": 1.99962446462574e-06, "loss": 1.3802, "step": 5174 }, { "epoch": 0.03, "grad_norm": 4.669469238433171, "learning_rate": 1.9996243191771664e-06, "loss": 1.3952, "step": 5175 }, { "epoch": 0.03, "grad_norm": 5.345858738027433, "learning_rate": 1.9996241737004363e-06, "loss": 1.4635, "step": 5176 }, { "epoch": 0.03, "grad_norm": 6.315739233004877, "learning_rate": 1.9996240281955505e-06, "loss": 1.1401, "step": 5177 }, { "epoch": 0.03, "grad_norm": 4.995440918929367, "learning_rate": 1.9996238826625084e-06, "loss": 1.5021, "step": 5178 }, { "epoch": 0.03, "grad_norm": 4.290451951114777, "learning_rate": 1.99962373710131e-06, "loss": 1.3885, "step": 5179 }, { "epoch": 0.04, "grad_norm": 4.975681449519642, "learning_rate": 1.999623591511956e-06, "loss": 1.3062, "step": 5180 }, { "epoch": 0.04, "grad_norm": 5.05583634187816, "learning_rate": 1.9996234458944457e-06, "loss": 1.4514, "step": 5181 }, { "epoch": 0.04, "grad_norm": 5.013460496566615, "learning_rate": 1.9996233002487796e-06, "loss": 1.3443, "step": 5182 }, { "epoch": 0.04, "grad_norm": 4.81713735288533, "learning_rate": 1.9996231545749573e-06, "loss": 1.3603, "step": 5183 }, { "epoch": 0.04, "eval_loss": 1.5902522802352905, "eval_runtime": 4.6294, "eval_samples_per_second": 1.944, "eval_steps_per_second": 1.08, "step": 5183 }, { "epoch": 0.04, "grad_norm": 4.738906332385359, "learning_rate": 1.9996230088729787e-06, "loss": 1.4826, "step": 5184 }, { "epoch": 0.04, "grad_norm": 4.835409290668115, "learning_rate": 1.9996228631428444e-06, "loss": 1.4405, "step": 5185 }, { "epoch": 0.04, "grad_norm": 6.256612091941971, "learning_rate": 1.9996227173845534e-06, "loss": 1.4007, "step": 5186 }, { "epoch": 0.04, "grad_norm": 5.835923171395803, "learning_rate": 1.999622571598107e-06, "loss": 1.3716, "step": 5187 }, { "epoch": 0.04, "grad_norm": 4.733046961539152, "learning_rate": 1.999622425783504e-06, "loss": 1.4846, "step": 5188 }, { "epoch": 0.04, "grad_norm": 4.7127208133688905, "learning_rate": 1.9996222799407456e-06, "loss": 1.5548, "step": 5189 }, { "epoch": 0.04, "grad_norm": 4.512907288384036, "learning_rate": 1.9996221340698306e-06, "loss": 1.3629, "step": 5190 }, { "epoch": 0.04, "grad_norm": 5.000591438003217, "learning_rate": 1.9996219881707597e-06, "loss": 1.4363, "step": 5191 }, { "epoch": 0.04, "grad_norm": 5.306051093935635, "learning_rate": 1.999621842243533e-06, "loss": 1.3033, "step": 5192 }, { "epoch": 0.04, "grad_norm": 5.768258797882892, "learning_rate": 1.9996216962881503e-06, "loss": 1.5043, "step": 5193 }, { "epoch": 0.04, "grad_norm": 4.992359914981791, "learning_rate": 1.9996215503046116e-06, "loss": 1.637, "step": 5194 }, { "epoch": 0.04, "grad_norm": 6.406961308269539, "learning_rate": 1.9996214042929167e-06, "loss": 1.5522, "step": 5195 }, { "epoch": 0.04, "grad_norm": 4.6346831075283665, "learning_rate": 1.9996212582530656e-06, "loss": 1.3284, "step": 5196 }, { "epoch": 0.04, "grad_norm": 4.225303879040243, "learning_rate": 1.9996211121850587e-06, "loss": 1.2964, "step": 5197 }, { "epoch": 0.04, "grad_norm": 4.406985087613287, "learning_rate": 1.999620966088896e-06, "loss": 1.4319, "step": 5198 }, { "epoch": 0.04, "grad_norm": 4.7096902647985095, "learning_rate": 1.999620819964577e-06, "loss": 1.2282, "step": 5199 }, { "epoch": 0.04, "grad_norm": 4.639799493480006, "learning_rate": 1.9996206738121024e-06, "loss": 1.459, "step": 5200 }, { "epoch": 0.04, "grad_norm": 4.680418994469507, "learning_rate": 1.9996205276314714e-06, "loss": 1.4666, "step": 5201 }, { "epoch": 0.04, "grad_norm": 6.084341516962402, "learning_rate": 1.9996203814226847e-06, "loss": 1.3726, "step": 5202 }, { "epoch": 0.04, "grad_norm": 4.4853896976830825, "learning_rate": 1.9996202351857417e-06, "loss": 1.2873, "step": 5203 }, { "epoch": 0.04, "grad_norm": 4.446609785726185, "learning_rate": 1.999620088920643e-06, "loss": 1.2625, "step": 5204 }, { "epoch": 0.04, "grad_norm": 4.26431824832489, "learning_rate": 1.999619942627388e-06, "loss": 1.3075, "step": 5205 }, { "epoch": 0.04, "grad_norm": 8.30092728768285, "learning_rate": 1.9996197963059772e-06, "loss": 1.3076, "step": 5206 }, { "epoch": 0.04, "grad_norm": 5.140645212572101, "learning_rate": 1.9996196499564106e-06, "loss": 1.2314, "step": 5207 }, { "epoch": 0.04, "grad_norm": 4.313626300481425, "learning_rate": 1.999619503578688e-06, "loss": 1.3258, "step": 5208 }, { "epoch": 0.04, "grad_norm": 4.940702539461331, "learning_rate": 1.9996193571728097e-06, "loss": 1.5247, "step": 5209 }, { "epoch": 0.04, "grad_norm": 4.732047254465986, "learning_rate": 1.999619210738775e-06, "loss": 1.3635, "step": 5210 }, { "epoch": 0.04, "grad_norm": 5.098374907583048, "learning_rate": 1.999619064276584e-06, "loss": 1.4533, "step": 5211 }, { "epoch": 0.04, "grad_norm": 4.5755286601837595, "learning_rate": 1.999618917786238e-06, "loss": 1.3437, "step": 5212 }, { "epoch": 0.04, "grad_norm": 4.482281974016032, "learning_rate": 1.999618771267735e-06, "loss": 1.378, "step": 5213 }, { "epoch": 0.04, "grad_norm": 5.046821549959167, "learning_rate": 1.9996186247210767e-06, "loss": 1.3081, "step": 5214 }, { "epoch": 0.04, "grad_norm": 7.112417260824924, "learning_rate": 1.999618478146263e-06, "loss": 1.4034, "step": 5215 }, { "epoch": 0.04, "grad_norm": 4.607442037021984, "learning_rate": 1.9996183315432924e-06, "loss": 1.3559, "step": 5216 }, { "epoch": 0.04, "grad_norm": 4.900340482240163, "learning_rate": 1.999618184912166e-06, "loss": 1.469, "step": 5217 }, { "epoch": 0.04, "grad_norm": 6.236161986662199, "learning_rate": 1.999618038252884e-06, "loss": 1.4448, "step": 5218 }, { "epoch": 0.04, "grad_norm": 7.240219622998569, "learning_rate": 1.9996178915654467e-06, "loss": 1.4633, "step": 5219 }, { "epoch": 0.04, "grad_norm": 5.183092071308639, "learning_rate": 1.9996177448498526e-06, "loss": 1.5244, "step": 5220 }, { "epoch": 0.04, "grad_norm": 5.099584574791853, "learning_rate": 1.9996175981061027e-06, "loss": 1.357, "step": 5221 }, { "epoch": 0.04, "grad_norm": 4.65280332420386, "learning_rate": 1.999617451334197e-06, "loss": 1.4817, "step": 5222 }, { "epoch": 0.04, "grad_norm": 4.609445123861824, "learning_rate": 1.9996173045341356e-06, "loss": 1.3838, "step": 5223 }, { "epoch": 0.04, "grad_norm": 4.48696213107029, "learning_rate": 1.999617157705918e-06, "loss": 1.2707, "step": 5224 }, { "epoch": 0.04, "grad_norm": 4.71491953334768, "learning_rate": 1.9996170108495444e-06, "loss": 1.3875, "step": 5225 }, { "epoch": 0.04, "grad_norm": 4.306972593842409, "learning_rate": 1.9996168639650155e-06, "loss": 1.3385, "step": 5226 }, { "epoch": 0.04, "grad_norm": 4.692476594446908, "learning_rate": 1.9996167170523304e-06, "loss": 1.3951, "step": 5227 }, { "epoch": 0.04, "grad_norm": 4.505668129305747, "learning_rate": 1.999616570111489e-06, "loss": 1.4755, "step": 5228 }, { "epoch": 0.04, "grad_norm": 5.473268169793204, "learning_rate": 1.9996164231424924e-06, "loss": 1.2401, "step": 5229 }, { "epoch": 0.04, "grad_norm": 7.731710957202071, "learning_rate": 1.9996162761453395e-06, "loss": 1.5127, "step": 5230 }, { "epoch": 0.04, "grad_norm": 4.811918942554435, "learning_rate": 1.999616129120031e-06, "loss": 1.4769, "step": 5231 }, { "epoch": 0.04, "grad_norm": 5.373228749243677, "learning_rate": 1.9996159820665667e-06, "loss": 1.271, "step": 5232 }, { "epoch": 0.04, "grad_norm": 4.571262815966328, "learning_rate": 1.999615834984946e-06, "loss": 1.4295, "step": 5233 }, { "epoch": 0.04, "grad_norm": 6.410680065815733, "learning_rate": 1.99961568787517e-06, "loss": 1.3415, "step": 5234 }, { "epoch": 0.04, "grad_norm": 4.42057900039846, "learning_rate": 1.999615540737238e-06, "loss": 1.3367, "step": 5235 }, { "epoch": 0.04, "grad_norm": 4.287952242425009, "learning_rate": 1.99961539357115e-06, "loss": 1.3363, "step": 5236 }, { "epoch": 0.04, "grad_norm": 5.388531260493579, "learning_rate": 1.9996152463769063e-06, "loss": 1.4094, "step": 5237 }, { "epoch": 0.04, "grad_norm": 4.589407684328228, "learning_rate": 1.9996150991545066e-06, "loss": 1.3579, "step": 5238 }, { "epoch": 0.04, "grad_norm": 5.201245553632696, "learning_rate": 1.9996149519039515e-06, "loss": 1.4632, "step": 5239 }, { "epoch": 0.04, "grad_norm": 4.5767057257924995, "learning_rate": 1.9996148046252397e-06, "loss": 1.3726, "step": 5240 }, { "epoch": 0.04, "grad_norm": 4.288311620700227, "learning_rate": 1.999614657318373e-06, "loss": 1.2894, "step": 5241 }, { "epoch": 0.04, "grad_norm": 4.625677992636677, "learning_rate": 1.99961450998335e-06, "loss": 1.488, "step": 5242 }, { "epoch": 0.04, "grad_norm": 4.6427755700445195, "learning_rate": 1.9996143626201714e-06, "loss": 1.3048, "step": 5243 }, { "epoch": 0.04, "grad_norm": 4.458252295180929, "learning_rate": 1.999614215228837e-06, "loss": 1.3213, "step": 5244 }, { "epoch": 0.04, "grad_norm": 4.805044940794729, "learning_rate": 1.9996140678093465e-06, "loss": 1.412, "step": 5245 }, { "epoch": 0.04, "grad_norm": 4.484287068665534, "learning_rate": 1.999613920361701e-06, "loss": 1.3523, "step": 5246 }, { "epoch": 0.04, "grad_norm": 4.510196746543035, "learning_rate": 1.9996137728858985e-06, "loss": 1.3001, "step": 5247 }, { "epoch": 0.04, "grad_norm": 4.66448407133446, "learning_rate": 1.9996136253819408e-06, "loss": 1.4309, "step": 5248 }, { "epoch": 0.04, "grad_norm": 4.767776649292796, "learning_rate": 1.9996134778498273e-06, "loss": 1.3617, "step": 5249 }, { "epoch": 0.04, "grad_norm": 5.009502276860983, "learning_rate": 1.999613330289558e-06, "loss": 1.3175, "step": 5250 }, { "epoch": 0.04, "grad_norm": 5.6102140539489, "learning_rate": 1.999613182701133e-06, "loss": 1.418, "step": 5251 }, { "epoch": 0.04, "grad_norm": 6.1004644400118035, "learning_rate": 1.999613035084552e-06, "loss": 1.3106, "step": 5252 }, { "epoch": 0.04, "grad_norm": 4.134634328066577, "learning_rate": 1.9996128874398157e-06, "loss": 1.1109, "step": 5253 }, { "epoch": 0.04, "grad_norm": 4.251599098003113, "learning_rate": 1.999612739766923e-06, "loss": 1.3869, "step": 5254 }, { "epoch": 0.04, "grad_norm": 4.310963052923367, "learning_rate": 1.999612592065875e-06, "loss": 1.2833, "step": 5255 }, { "epoch": 0.04, "grad_norm": 4.8339055764458285, "learning_rate": 1.999612444336671e-06, "loss": 1.3572, "step": 5256 }, { "epoch": 0.04, "eval_loss": 1.588416576385498, "eval_runtime": 4.6676, "eval_samples_per_second": 1.928, "eval_steps_per_second": 1.071, "step": 5256 }, { "epoch": 0.04, "grad_norm": 4.460064319816236, "learning_rate": 1.9996122965793113e-06, "loss": 1.428, "step": 5257 }, { "epoch": 0.04, "grad_norm": 5.132616360742193, "learning_rate": 1.999612148793796e-06, "loss": 1.4591, "step": 5258 }, { "epoch": 0.04, "grad_norm": 4.789342348637475, "learning_rate": 1.999612000980125e-06, "loss": 1.3685, "step": 5259 }, { "epoch": 0.04, "grad_norm": 4.426287856095415, "learning_rate": 1.999611853138298e-06, "loss": 1.2402, "step": 5260 }, { "epoch": 0.04, "grad_norm": 5.093056146736939, "learning_rate": 1.9996117052683154e-06, "loss": 1.1741, "step": 5261 }, { "epoch": 0.04, "grad_norm": 4.380169374285972, "learning_rate": 1.999611557370177e-06, "loss": 1.4076, "step": 5262 }, { "epoch": 0.04, "grad_norm": 4.385181788057366, "learning_rate": 1.9996114094438826e-06, "loss": 1.3282, "step": 5263 }, { "epoch": 0.04, "grad_norm": 5.143342428858552, "learning_rate": 1.999611261489433e-06, "loss": 1.2734, "step": 5264 }, { "epoch": 0.04, "grad_norm": 5.26082398589258, "learning_rate": 1.999611113506827e-06, "loss": 1.2327, "step": 5265 }, { "epoch": 0.04, "grad_norm": 4.7872359730307465, "learning_rate": 1.999610965496066e-06, "loss": 1.4473, "step": 5266 }, { "epoch": 0.04, "grad_norm": 4.3452439327914485, "learning_rate": 1.999610817457149e-06, "loss": 1.3596, "step": 5267 }, { "epoch": 0.04, "grad_norm": 5.089409324651872, "learning_rate": 1.9996106693900764e-06, "loss": 1.4129, "step": 5268 }, { "epoch": 0.04, "grad_norm": 4.714221362602081, "learning_rate": 1.9996105212948478e-06, "loss": 1.4164, "step": 5269 }, { "epoch": 0.04, "grad_norm": 4.992841821412603, "learning_rate": 1.9996103731714633e-06, "loss": 1.3354, "step": 5270 }, { "epoch": 0.04, "grad_norm": 7.205389949907606, "learning_rate": 1.9996102250199235e-06, "loss": 1.3173, "step": 5271 }, { "epoch": 0.04, "grad_norm": 4.877540484885216, "learning_rate": 1.9996100768402283e-06, "loss": 1.3251, "step": 5272 }, { "epoch": 0.04, "grad_norm": 4.8938893280451525, "learning_rate": 1.999609928632377e-06, "loss": 1.5887, "step": 5273 }, { "epoch": 0.04, "grad_norm": 5.841639172347136, "learning_rate": 1.99960978039637e-06, "loss": 1.2635, "step": 5274 }, { "epoch": 0.04, "grad_norm": 4.889150577066462, "learning_rate": 1.9996096321322075e-06, "loss": 1.3162, "step": 5275 }, { "epoch": 0.04, "grad_norm": 4.708102242269519, "learning_rate": 1.999609483839889e-06, "loss": 1.3477, "step": 5276 }, { "epoch": 0.04, "grad_norm": 4.50995649023718, "learning_rate": 1.9996093355194154e-06, "loss": 1.3798, "step": 5277 }, { "epoch": 0.04, "grad_norm": 4.362202093202189, "learning_rate": 1.9996091871707854e-06, "loss": 1.2651, "step": 5278 }, { "epoch": 0.04, "grad_norm": 5.142670986094631, "learning_rate": 1.9996090387940005e-06, "loss": 1.2936, "step": 5279 }, { "epoch": 0.04, "grad_norm": 4.914582084880499, "learning_rate": 1.9996088903890593e-06, "loss": 1.3712, "step": 5280 }, { "epoch": 0.04, "grad_norm": 4.6223903438811025, "learning_rate": 1.999608741955963e-06, "loss": 1.4532, "step": 5281 }, { "epoch": 0.04, "grad_norm": 5.702280330704291, "learning_rate": 1.9996085934947105e-06, "loss": 1.4272, "step": 5282 }, { "epoch": 0.04, "grad_norm": 4.783515861950863, "learning_rate": 1.9996084450053028e-06, "loss": 1.2471, "step": 5283 }, { "epoch": 0.04, "grad_norm": 5.193532226936051, "learning_rate": 1.999608296487739e-06, "loss": 1.4002, "step": 5284 }, { "epoch": 0.04, "grad_norm": 4.844054594739735, "learning_rate": 1.9996081479420195e-06, "loss": 1.3955, "step": 5285 }, { "epoch": 0.04, "grad_norm": 4.428498197482317, "learning_rate": 1.999607999368145e-06, "loss": 1.381, "step": 5286 }, { "epoch": 0.04, "grad_norm": 4.524776527611566, "learning_rate": 1.9996078507661144e-06, "loss": 1.4025, "step": 5287 }, { "epoch": 0.04, "grad_norm": 4.506899103356581, "learning_rate": 1.9996077021359285e-06, "loss": 1.446, "step": 5288 }, { "epoch": 0.04, "grad_norm": 4.292297159705407, "learning_rate": 1.9996075534775865e-06, "loss": 1.2051, "step": 5289 }, { "epoch": 0.04, "grad_norm": 4.816197128675721, "learning_rate": 1.999607404791089e-06, "loss": 1.5181, "step": 5290 }, { "epoch": 0.04, "grad_norm": 4.65769714406598, "learning_rate": 1.999607256076436e-06, "loss": 1.2641, "step": 5291 }, { "epoch": 0.04, "grad_norm": 4.221554781829679, "learning_rate": 1.999607107333628e-06, "loss": 1.2978, "step": 5292 }, { "epoch": 0.04, "grad_norm": 6.016508085126287, "learning_rate": 1.9996069585626636e-06, "loss": 1.3402, "step": 5293 }, { "epoch": 0.04, "grad_norm": 4.535700872424344, "learning_rate": 1.9996068097635438e-06, "loss": 1.3318, "step": 5294 }, { "epoch": 0.04, "grad_norm": 4.898209854111548, "learning_rate": 1.999606660936268e-06, "loss": 1.3361, "step": 5295 }, { "epoch": 0.04, "grad_norm": 4.8409722230372685, "learning_rate": 1.9996065120808372e-06, "loss": 1.3564, "step": 5296 }, { "epoch": 0.04, "grad_norm": 4.344704256090886, "learning_rate": 1.999606363197251e-06, "loss": 1.4212, "step": 5297 }, { "epoch": 0.04, "grad_norm": 4.338297341267405, "learning_rate": 1.9996062142855087e-06, "loss": 1.3852, "step": 5298 }, { "epoch": 0.04, "grad_norm": 4.899586831180439, "learning_rate": 1.9996060653456112e-06, "loss": 1.3609, "step": 5299 }, { "epoch": 0.04, "grad_norm": 4.635173854429334, "learning_rate": 1.9996059163775575e-06, "loss": 1.4054, "step": 5300 }, { "epoch": 0.04, "grad_norm": 5.87976861185665, "learning_rate": 1.9996057673813488e-06, "loss": 1.508, "step": 5301 }, { "epoch": 0.04, "grad_norm": 5.0425848840249285, "learning_rate": 1.9996056183569843e-06, "loss": 1.3775, "step": 5302 }, { "epoch": 0.04, "grad_norm": 4.8213859471071165, "learning_rate": 1.9996054693044644e-06, "loss": 1.3264, "step": 5303 }, { "epoch": 0.04, "grad_norm": 5.199297410011508, "learning_rate": 1.9996053202237888e-06, "loss": 1.3718, "step": 5304 }, { "epoch": 0.04, "grad_norm": 4.210159701762075, "learning_rate": 1.9996051711149577e-06, "loss": 1.2382, "step": 5305 }, { "epoch": 0.04, "grad_norm": 4.526015401831726, "learning_rate": 1.999605021977971e-06, "loss": 1.2904, "step": 5306 }, { "epoch": 0.04, "grad_norm": 4.393186594597382, "learning_rate": 1.9996048728128287e-06, "loss": 1.3646, "step": 5307 }, { "epoch": 0.04, "grad_norm": 4.410445747605544, "learning_rate": 1.999604723619531e-06, "loss": 1.4108, "step": 5308 }, { "epoch": 0.04, "grad_norm": 4.65064828681169, "learning_rate": 1.9996045743980777e-06, "loss": 1.2894, "step": 5309 }, { "epoch": 0.04, "grad_norm": 4.114539867359308, "learning_rate": 1.9996044251484685e-06, "loss": 1.2473, "step": 5310 }, { "epoch": 0.04, "grad_norm": 6.828289052470705, "learning_rate": 1.9996042758707044e-06, "loss": 1.4132, "step": 5311 }, { "epoch": 0.04, "grad_norm": 5.125778533384964, "learning_rate": 1.9996041265647844e-06, "loss": 1.45, "step": 5312 }, { "epoch": 0.04, "grad_norm": 4.713725872423266, "learning_rate": 1.999603977230709e-06, "loss": 1.404, "step": 5313 }, { "epoch": 0.04, "grad_norm": 5.116872334306885, "learning_rate": 1.999603827868478e-06, "loss": 1.4555, "step": 5314 }, { "epoch": 0.04, "grad_norm": 4.927839087951555, "learning_rate": 1.9996036784780915e-06, "loss": 1.4137, "step": 5315 }, { "epoch": 0.04, "grad_norm": 4.825527138155646, "learning_rate": 1.9996035290595493e-06, "loss": 1.3893, "step": 5316 }, { "epoch": 0.04, "grad_norm": 4.957203965463911, "learning_rate": 1.999603379612852e-06, "loss": 1.3108, "step": 5317 }, { "epoch": 0.04, "grad_norm": 4.3822919612617435, "learning_rate": 1.9996032301379994e-06, "loss": 1.3034, "step": 5318 }, { "epoch": 0.04, "grad_norm": 4.54291819133291, "learning_rate": 1.9996030806349906e-06, "loss": 1.3108, "step": 5319 }, { "epoch": 0.04, "grad_norm": 4.594554217533354, "learning_rate": 1.9996029311038268e-06, "loss": 1.4195, "step": 5320 }, { "epoch": 0.04, "grad_norm": 4.923752649673701, "learning_rate": 1.999602781544507e-06, "loss": 1.4086, "step": 5321 }, { "epoch": 0.04, "grad_norm": 5.133525252167421, "learning_rate": 1.9996026319570322e-06, "loss": 1.3792, "step": 5322 }, { "epoch": 0.04, "grad_norm": 4.83082068680036, "learning_rate": 1.999602482341402e-06, "loss": 1.5031, "step": 5323 }, { "epoch": 0.04, "grad_norm": 4.545744850759469, "learning_rate": 1.9996023326976158e-06, "loss": 1.4818, "step": 5324 }, { "epoch": 0.04, "grad_norm": 4.773007059649542, "learning_rate": 1.9996021830256743e-06, "loss": 1.585, "step": 5325 }, { "epoch": 0.04, "grad_norm": 4.6787208793953985, "learning_rate": 1.999602033325578e-06, "loss": 1.5435, "step": 5326 }, { "epoch": 0.04, "grad_norm": 5.217516292584878, "learning_rate": 1.999601883597325e-06, "loss": 1.5485, "step": 5327 }, { "epoch": 0.04, "grad_norm": 4.782192417399762, "learning_rate": 1.9996017338409175e-06, "loss": 1.4856, "step": 5328 }, { "epoch": 0.04, "grad_norm": 4.659594996196646, "learning_rate": 1.999601584056354e-06, "loss": 1.3872, "step": 5329 }, { "epoch": 0.04, "eval_loss": 1.5914134979248047, "eval_runtime": 4.6192, "eval_samples_per_second": 1.948, "eval_steps_per_second": 1.082, "step": 5329 }, { "epoch": 0.04, "grad_norm": 4.656378751419515, "learning_rate": 1.9996014342436353e-06, "loss": 1.2901, "step": 5330 }, { "epoch": 0.04, "grad_norm": 4.3234477920781655, "learning_rate": 1.9996012844027615e-06, "loss": 1.2679, "step": 5331 }, { "epoch": 0.04, "grad_norm": 6.309595673578597, "learning_rate": 1.999601134533732e-06, "loss": 1.4537, "step": 5332 }, { "epoch": 0.04, "grad_norm": 4.68311196545046, "learning_rate": 1.9996009846365466e-06, "loss": 1.5176, "step": 5333 }, { "epoch": 0.04, "grad_norm": 4.524090611566109, "learning_rate": 1.9996008347112063e-06, "loss": 1.538, "step": 5334 }, { "epoch": 0.04, "grad_norm": 4.470909241018738, "learning_rate": 1.9996006847577106e-06, "loss": 1.2494, "step": 5335 }, { "epoch": 0.04, "grad_norm": 4.714422550696149, "learning_rate": 1.999600534776059e-06, "loss": 1.4338, "step": 5336 }, { "epoch": 0.04, "grad_norm": 4.9565982097751595, "learning_rate": 1.9996003847662523e-06, "loss": 1.5361, "step": 5337 }, { "epoch": 0.04, "grad_norm": 4.8338435382041816, "learning_rate": 1.99960023472829e-06, "loss": 1.3377, "step": 5338 }, { "epoch": 0.04, "grad_norm": 4.794992577439646, "learning_rate": 1.9996000846621724e-06, "loss": 1.389, "step": 5339 }, { "epoch": 0.04, "grad_norm": 5.716459027955684, "learning_rate": 1.9995999345679e-06, "loss": 1.5347, "step": 5340 }, { "epoch": 0.04, "grad_norm": 4.536586009296358, "learning_rate": 1.999599784445471e-06, "loss": 1.3908, "step": 5341 }, { "epoch": 0.04, "grad_norm": 4.918756041310881, "learning_rate": 1.9995996342948874e-06, "loss": 1.5521, "step": 5342 }, { "epoch": 0.04, "grad_norm": 4.432444150659499, "learning_rate": 1.9995994841161482e-06, "loss": 1.339, "step": 5343 }, { "epoch": 0.04, "grad_norm": 5.305902097786757, "learning_rate": 1.9995993339092538e-06, "loss": 1.2672, "step": 5344 }, { "epoch": 0.04, "grad_norm": 4.600283874400767, "learning_rate": 1.9995991836742035e-06, "loss": 1.4089, "step": 5345 }, { "epoch": 0.04, "grad_norm": 4.478673210296926, "learning_rate": 1.9995990334109983e-06, "loss": 1.3002, "step": 5346 }, { "epoch": 0.04, "grad_norm": 4.628833420341199, "learning_rate": 1.9995988831196376e-06, "loss": 1.2925, "step": 5347 }, { "epoch": 0.04, "grad_norm": 4.759712481055288, "learning_rate": 1.9995987328001217e-06, "loss": 1.4287, "step": 5348 }, { "epoch": 0.04, "grad_norm": 4.378467041433537, "learning_rate": 1.99959858245245e-06, "loss": 1.1898, "step": 5349 }, { "epoch": 0.04, "grad_norm": 4.656882036684165, "learning_rate": 1.999598432076623e-06, "loss": 1.4104, "step": 5350 }, { "epoch": 0.04, "grad_norm": 7.58192189709911, "learning_rate": 1.999598281672641e-06, "loss": 1.3428, "step": 5351 }, { "epoch": 0.04, "grad_norm": 5.0235887756128195, "learning_rate": 1.9995981312405036e-06, "loss": 1.3091, "step": 5352 }, { "epoch": 0.04, "grad_norm": 5.531381978298488, "learning_rate": 1.9995979807802103e-06, "loss": 1.4114, "step": 5353 }, { "epoch": 0.04, "grad_norm": 6.376405838049194, "learning_rate": 1.999597830291762e-06, "loss": 1.1761, "step": 5354 }, { "epoch": 0.04, "grad_norm": 4.800774709710386, "learning_rate": 1.9995976797751584e-06, "loss": 1.4089, "step": 5355 }, { "epoch": 0.04, "grad_norm": 4.877492482027732, "learning_rate": 1.9995975292304e-06, "loss": 1.3907, "step": 5356 }, { "epoch": 0.04, "grad_norm": 4.913106564034408, "learning_rate": 1.9995973786574855e-06, "loss": 1.4759, "step": 5357 }, { "epoch": 0.04, "grad_norm": 4.293401188341443, "learning_rate": 1.9995972280564158e-06, "loss": 1.3488, "step": 5358 }, { "epoch": 0.04, "grad_norm": 5.287056937913001, "learning_rate": 1.9995970774271907e-06, "loss": 1.444, "step": 5359 }, { "epoch": 0.04, "grad_norm": 4.252128338091714, "learning_rate": 1.9995969267698106e-06, "loss": 1.2915, "step": 5360 }, { "epoch": 0.04, "grad_norm": 4.349214641497852, "learning_rate": 1.9995967760842747e-06, "loss": 1.4126, "step": 5361 }, { "epoch": 0.04, "grad_norm": 5.290841305378968, "learning_rate": 1.999596625370584e-06, "loss": 1.5357, "step": 5362 }, { "epoch": 0.04, "grad_norm": 4.747130625797225, "learning_rate": 1.9995964746287377e-06, "loss": 1.4004, "step": 5363 }, { "epoch": 0.04, "grad_norm": 4.987646915941917, "learning_rate": 1.999596323858736e-06, "loss": 1.3791, "step": 5364 }, { "epoch": 0.04, "grad_norm": 6.139804081038285, "learning_rate": 1.9995961730605797e-06, "loss": 1.5727, "step": 5365 }, { "epoch": 0.04, "grad_norm": 5.044552578834974, "learning_rate": 1.9995960222342674e-06, "loss": 1.452, "step": 5366 }, { "epoch": 0.04, "grad_norm": 4.438322634168964, "learning_rate": 1.9995958713797997e-06, "loss": 1.3587, "step": 5367 }, { "epoch": 0.04, "grad_norm": 5.081547990873596, "learning_rate": 1.999595720497177e-06, "loss": 1.4186, "step": 5368 }, { "epoch": 0.04, "grad_norm": 11.361463840832428, "learning_rate": 1.999595569586399e-06, "loss": 1.3331, "step": 5369 }, { "epoch": 0.04, "grad_norm": 4.573105264781268, "learning_rate": 1.9995954186474656e-06, "loss": 1.4836, "step": 5370 }, { "epoch": 0.04, "grad_norm": 4.524116657215281, "learning_rate": 1.9995952676803773e-06, "loss": 1.4834, "step": 5371 }, { "epoch": 0.04, "grad_norm": 4.5612899836864775, "learning_rate": 1.999595116685133e-06, "loss": 1.3424, "step": 5372 }, { "epoch": 0.04, "grad_norm": 4.451691037217679, "learning_rate": 1.999594965661734e-06, "loss": 1.4434, "step": 5373 }, { "epoch": 0.04, "grad_norm": 4.598363284266816, "learning_rate": 1.9995948146101796e-06, "loss": 1.186, "step": 5374 }, { "epoch": 0.04, "grad_norm": 6.230138371007278, "learning_rate": 1.99959466353047e-06, "loss": 1.4894, "step": 5375 }, { "epoch": 0.04, "grad_norm": 4.584331580666035, "learning_rate": 1.9995945124226054e-06, "loss": 1.5358, "step": 5376 }, { "epoch": 0.04, "grad_norm": 4.992474117850262, "learning_rate": 1.999594361286585e-06, "loss": 1.5344, "step": 5377 }, { "epoch": 0.04, "grad_norm": 4.615560941154592, "learning_rate": 1.9995942101224097e-06, "loss": 1.4381, "step": 5378 }, { "epoch": 0.04, "grad_norm": 4.762385942281559, "learning_rate": 1.999594058930079e-06, "loss": 1.3786, "step": 5379 }, { "epoch": 0.04, "grad_norm": 4.550711508796249, "learning_rate": 1.9995939077095933e-06, "loss": 1.2633, "step": 5380 }, { "epoch": 0.04, "grad_norm": 4.445867465869956, "learning_rate": 1.999593756460952e-06, "loss": 1.3703, "step": 5381 }, { "epoch": 0.04, "grad_norm": 5.112797286092936, "learning_rate": 1.9995936051841554e-06, "loss": 1.4838, "step": 5382 }, { "epoch": 0.04, "grad_norm": 4.664165035316399, "learning_rate": 1.9995934538792043e-06, "loss": 1.3778, "step": 5383 }, { "epoch": 0.04, "grad_norm": 4.471468476075772, "learning_rate": 1.9995933025460973e-06, "loss": 1.3596, "step": 5384 }, { "epoch": 0.04, "grad_norm": 4.636197592968499, "learning_rate": 1.999593151184835e-06, "loss": 1.3794, "step": 5385 }, { "epoch": 0.04, "grad_norm": 4.4608480022627415, "learning_rate": 1.999592999795418e-06, "loss": 1.293, "step": 5386 }, { "epoch": 0.04, "grad_norm": 4.670187134336065, "learning_rate": 1.9995928483778455e-06, "loss": 1.49, "step": 5387 }, { "epoch": 0.04, "grad_norm": 4.6125249277661515, "learning_rate": 1.999592696932118e-06, "loss": 1.4303, "step": 5388 }, { "epoch": 0.04, "grad_norm": 4.380598718814033, "learning_rate": 1.9995925454582353e-06, "loss": 1.3787, "step": 5389 }, { "epoch": 0.04, "grad_norm": 6.031159486574906, "learning_rate": 1.999592393956197e-06, "loss": 1.3739, "step": 5390 }, { "epoch": 0.04, "grad_norm": 6.586458668128173, "learning_rate": 1.9995922424260037e-06, "loss": 1.3875, "step": 5391 }, { "epoch": 0.04, "grad_norm": 4.469025152290972, "learning_rate": 1.9995920908676554e-06, "loss": 1.2878, "step": 5392 }, { "epoch": 0.04, "grad_norm": 5.057088730535222, "learning_rate": 1.999591939281152e-06, "loss": 1.4496, "step": 5393 }, { "epoch": 0.04, "grad_norm": 4.638746608664549, "learning_rate": 1.999591787666493e-06, "loss": 1.2525, "step": 5394 }, { "epoch": 0.04, "grad_norm": 4.12470530236778, "learning_rate": 1.9995916360236793e-06, "loss": 1.2856, "step": 5395 }, { "epoch": 0.04, "grad_norm": 7.158155987305781, "learning_rate": 1.99959148435271e-06, "loss": 1.3427, "step": 5396 }, { "epoch": 0.04, "grad_norm": 4.768982625950188, "learning_rate": 1.9995913326535856e-06, "loss": 1.3686, "step": 5397 }, { "epoch": 0.04, "grad_norm": 4.999099474489017, "learning_rate": 1.999591180926306e-06, "loss": 1.4405, "step": 5398 }, { "epoch": 0.04, "grad_norm": 4.845559456466601, "learning_rate": 1.9995910291708714e-06, "loss": 1.472, "step": 5399 }, { "epoch": 0.04, "grad_norm": 7.791873087181352, "learning_rate": 1.999590877387282e-06, "loss": 1.4414, "step": 5400 }, { "epoch": 0.04, "grad_norm": 5.83486619235522, "learning_rate": 1.9995907255755365e-06, "loss": 1.1654, "step": 5401 }, { "epoch": 0.04, "grad_norm": 5.566871185758199, "learning_rate": 1.9995905737356366e-06, "loss": 1.3979, "step": 5402 }, { "epoch": 0.04, "eval_loss": 1.5928033590316772, "eval_runtime": 4.6209, "eval_samples_per_second": 1.948, "eval_steps_per_second": 1.082, "step": 5402 }, { "epoch": 0.04, "grad_norm": 11.14198817676052, "learning_rate": 1.9995904218675813e-06, "loss": 1.5248, "step": 5403 }, { "epoch": 0.04, "grad_norm": 4.906619768830342, "learning_rate": 1.999590269971371e-06, "loss": 1.2803, "step": 5404 }, { "epoch": 0.04, "grad_norm": 5.081147090293134, "learning_rate": 1.9995901180470055e-06, "loss": 1.3923, "step": 5405 }, { "epoch": 0.04, "grad_norm": 4.2193281332529615, "learning_rate": 1.999589966094485e-06, "loss": 1.4464, "step": 5406 }, { "epoch": 0.04, "grad_norm": 4.582245210638781, "learning_rate": 1.999589814113809e-06, "loss": 1.4076, "step": 5407 }, { "epoch": 0.04, "grad_norm": 4.619990521265063, "learning_rate": 1.999589662104978e-06, "loss": 1.3721, "step": 5408 }, { "epoch": 0.04, "grad_norm": 4.79591891914449, "learning_rate": 1.9995895100679923e-06, "loss": 1.474, "step": 5409 }, { "epoch": 0.04, "grad_norm": 4.687496243854095, "learning_rate": 1.999589358002851e-06, "loss": 1.4769, "step": 5410 }, { "epoch": 0.04, "grad_norm": 6.413106374843211, "learning_rate": 1.999589205909555e-06, "loss": 1.4996, "step": 5411 }, { "epoch": 0.04, "grad_norm": 4.728265125964608, "learning_rate": 1.9995890537881035e-06, "loss": 1.1853, "step": 5412 }, { "epoch": 0.04, "grad_norm": 4.714549819204181, "learning_rate": 1.9995889016384974e-06, "loss": 1.3339, "step": 5413 }, { "epoch": 0.04, "grad_norm": 5.033083949823077, "learning_rate": 1.9995887494607356e-06, "loss": 1.3914, "step": 5414 }, { "epoch": 0.04, "grad_norm": 4.619312503189621, "learning_rate": 1.999588597254819e-06, "loss": 1.4209, "step": 5415 }, { "epoch": 0.04, "grad_norm": 4.392928302360342, "learning_rate": 1.9995884450207474e-06, "loss": 1.3411, "step": 5416 }, { "epoch": 0.04, "grad_norm": 4.699428525865151, "learning_rate": 1.9995882927585207e-06, "loss": 1.5718, "step": 5417 }, { "epoch": 0.04, "grad_norm": 4.8570811554984035, "learning_rate": 1.999588140468139e-06, "loss": 1.514, "step": 5418 }, { "epoch": 0.04, "grad_norm": 4.592200106158407, "learning_rate": 1.999587988149602e-06, "loss": 1.3471, "step": 5419 }, { "epoch": 0.04, "grad_norm": 5.42332602970698, "learning_rate": 1.99958783580291e-06, "loss": 1.5, "step": 5420 }, { "epoch": 0.04, "grad_norm": 4.5277477741477625, "learning_rate": 1.999587683428063e-06, "loss": 1.4744, "step": 5421 }, { "epoch": 0.04, "grad_norm": 5.846054652382134, "learning_rate": 1.999587531025061e-06, "loss": 1.3867, "step": 5422 }, { "epoch": 0.04, "grad_norm": 6.159749147772707, "learning_rate": 1.9995873785939035e-06, "loss": 1.2877, "step": 5423 }, { "epoch": 0.04, "grad_norm": 4.652069603232177, "learning_rate": 1.9995872261345913e-06, "loss": 1.4, "step": 5424 }, { "epoch": 0.04, "grad_norm": 4.422980786896793, "learning_rate": 1.999587073647124e-06, "loss": 1.3236, "step": 5425 }, { "epoch": 0.04, "grad_norm": 4.555298144729756, "learning_rate": 1.999586921131502e-06, "loss": 1.3565, "step": 5426 }, { "epoch": 0.04, "grad_norm": 4.716686719802506, "learning_rate": 1.9995867685877246e-06, "loss": 1.3191, "step": 5427 }, { "epoch": 0.04, "grad_norm": 5.480573731469923, "learning_rate": 1.999586616015792e-06, "loss": 1.428, "step": 5428 }, { "epoch": 0.04, "grad_norm": 4.578833826486278, "learning_rate": 1.9995864634157047e-06, "loss": 1.3911, "step": 5429 }, { "epoch": 0.04, "grad_norm": 4.6411112797710885, "learning_rate": 1.9995863107874624e-06, "loss": 1.4877, "step": 5430 }, { "epoch": 0.04, "grad_norm": 4.832662742398066, "learning_rate": 1.999586158131065e-06, "loss": 1.2709, "step": 5431 }, { "epoch": 0.04, "grad_norm": 4.785133397663012, "learning_rate": 1.9995860054465124e-06, "loss": 1.4045, "step": 5432 }, { "epoch": 0.04, "grad_norm": 4.694698051145196, "learning_rate": 1.9995858527338048e-06, "loss": 1.4158, "step": 5433 }, { "epoch": 0.04, "grad_norm": 4.5115803661982605, "learning_rate": 1.999585699992942e-06, "loss": 1.3451, "step": 5434 }, { "epoch": 0.04, "grad_norm": 5.333909714802689, "learning_rate": 1.999585547223925e-06, "loss": 1.2256, "step": 5435 }, { "epoch": 0.04, "grad_norm": 4.46634230697757, "learning_rate": 1.999585394426752e-06, "loss": 1.1993, "step": 5436 }, { "epoch": 0.04, "grad_norm": 6.118704393474845, "learning_rate": 1.9995852416014248e-06, "loss": 1.4105, "step": 5437 }, { "epoch": 0.04, "grad_norm": 4.98992274551725, "learning_rate": 1.9995850887479424e-06, "loss": 1.4555, "step": 5438 }, { "epoch": 0.04, "grad_norm": 5.245528611616613, "learning_rate": 1.999584935866305e-06, "loss": 1.2751, "step": 5439 }, { "epoch": 0.04, "grad_norm": 5.34046068419453, "learning_rate": 1.9995847829565123e-06, "loss": 1.5205, "step": 5440 }, { "epoch": 0.04, "grad_norm": 4.334310120840614, "learning_rate": 1.999584630018565e-06, "loss": 1.3701, "step": 5441 }, { "epoch": 0.04, "grad_norm": 4.757983130619233, "learning_rate": 1.999584477052463e-06, "loss": 1.3853, "step": 5442 }, { "epoch": 0.04, "grad_norm": 4.409294399481962, "learning_rate": 1.999584324058205e-06, "loss": 1.4091, "step": 5443 }, { "epoch": 0.04, "grad_norm": 4.826785168098429, "learning_rate": 1.999584171035793e-06, "loss": 1.4281, "step": 5444 }, { "epoch": 0.04, "grad_norm": 4.907065594737622, "learning_rate": 1.9995840179852258e-06, "loss": 1.465, "step": 5445 }, { "epoch": 0.04, "grad_norm": 4.682363436997646, "learning_rate": 1.9995838649065033e-06, "loss": 1.5089, "step": 5446 }, { "epoch": 0.04, "grad_norm": 4.619435687751559, "learning_rate": 1.999583711799626e-06, "loss": 1.2583, "step": 5447 }, { "epoch": 0.04, "grad_norm": 4.672584024572918, "learning_rate": 1.999583558664594e-06, "loss": 1.5137, "step": 5448 }, { "epoch": 0.04, "grad_norm": 6.577355320226652, "learning_rate": 1.999583405501407e-06, "loss": 1.5428, "step": 5449 }, { "epoch": 0.04, "grad_norm": 4.268919602796006, "learning_rate": 1.999583252310065e-06, "loss": 1.2885, "step": 5450 }, { "epoch": 0.04, "grad_norm": 4.118002500403124, "learning_rate": 1.999583099090568e-06, "loss": 1.1582, "step": 5451 }, { "epoch": 0.04, "grad_norm": 4.329875349677499, "learning_rate": 1.9995829458429157e-06, "loss": 1.3135, "step": 5452 }, { "epoch": 0.04, "grad_norm": 4.826994117851692, "learning_rate": 1.999582792567109e-06, "loss": 1.2244, "step": 5453 }, { "epoch": 0.04, "grad_norm": 4.726464604739192, "learning_rate": 1.9995826392631474e-06, "loss": 1.4273, "step": 5454 }, { "epoch": 0.04, "grad_norm": 4.478099061911341, "learning_rate": 1.9995824859310304e-06, "loss": 1.4666, "step": 5455 }, { "epoch": 0.04, "grad_norm": 4.554066739376522, "learning_rate": 1.999582332570759e-06, "loss": 1.3255, "step": 5456 }, { "epoch": 0.04, "grad_norm": 5.016794369400851, "learning_rate": 1.9995821791823325e-06, "loss": 1.4906, "step": 5457 }, { "epoch": 0.04, "grad_norm": 4.313115238611662, "learning_rate": 1.999582025765751e-06, "loss": 1.3535, "step": 5458 }, { "epoch": 0.04, "grad_norm": 7.122750345736439, "learning_rate": 1.999581872321015e-06, "loss": 1.3358, "step": 5459 }, { "epoch": 0.04, "grad_norm": 4.439426933947747, "learning_rate": 1.9995817188481238e-06, "loss": 1.3662, "step": 5460 }, { "epoch": 0.04, "grad_norm": 7.834853083236051, "learning_rate": 1.9995815653470775e-06, "loss": 1.4574, "step": 5461 }, { "epoch": 0.04, "grad_norm": 4.994102616850311, "learning_rate": 1.9995814118178767e-06, "loss": 1.5664, "step": 5462 }, { "epoch": 0.04, "grad_norm": 4.302015767682599, "learning_rate": 1.9995812582605205e-06, "loss": 1.3522, "step": 5463 }, { "epoch": 0.04, "grad_norm": 4.745366269142967, "learning_rate": 1.99958110467501e-06, "loss": 1.4443, "step": 5464 }, { "epoch": 0.04, "grad_norm": 4.602740066615117, "learning_rate": 1.999580951061344e-06, "loss": 1.4097, "step": 5465 }, { "epoch": 0.04, "grad_norm": 4.6276973511434125, "learning_rate": 1.999580797419524e-06, "loss": 1.41, "step": 5466 }, { "epoch": 0.04, "grad_norm": 4.680735663344895, "learning_rate": 1.9995806437495484e-06, "loss": 1.4282, "step": 5467 }, { "epoch": 0.04, "grad_norm": 4.4452243934762885, "learning_rate": 1.9995804900514183e-06, "loss": 1.2724, "step": 5468 }, { "epoch": 0.04, "grad_norm": 5.696734261141821, "learning_rate": 1.9995803363251333e-06, "loss": 1.4732, "step": 5469 }, { "epoch": 0.04, "grad_norm": 4.568261080482452, "learning_rate": 1.999580182570693e-06, "loss": 1.4947, "step": 5470 }, { "epoch": 0.04, "grad_norm": 5.364888905425856, "learning_rate": 1.9995800287880984e-06, "loss": 1.336, "step": 5471 }, { "epoch": 0.04, "grad_norm": 4.437611933898036, "learning_rate": 1.999579874977349e-06, "loss": 1.3896, "step": 5472 }, { "epoch": 0.04, "grad_norm": 4.747125833532376, "learning_rate": 1.999579721138444e-06, "loss": 1.3364, "step": 5473 }, { "epoch": 0.04, "grad_norm": 4.163517058080106, "learning_rate": 1.999579567271385e-06, "loss": 1.2912, "step": 5474 }, { "epoch": 0.04, "grad_norm": 4.37062891833211, "learning_rate": 1.9995794133761708e-06, "loss": 1.2929, "step": 5475 }, { "epoch": 0.04, "eval_loss": 1.5883009433746338, "eval_runtime": 4.6139, "eval_samples_per_second": 1.951, "eval_steps_per_second": 1.084, "step": 5475 }, { "epoch": 0.04, "grad_norm": 4.550895861802117, "learning_rate": 1.999579259452802e-06, "loss": 1.3748, "step": 5476 }, { "epoch": 0.04, "grad_norm": 4.37308320056366, "learning_rate": 1.999579105501278e-06, "loss": 1.3886, "step": 5477 }, { "epoch": 0.04, "grad_norm": 4.371706496814317, "learning_rate": 1.9995789515215994e-06, "loss": 1.2446, "step": 5478 }, { "epoch": 0.04, "grad_norm": 5.234786231819272, "learning_rate": 1.999578797513766e-06, "loss": 1.4633, "step": 5479 }, { "epoch": 0.04, "grad_norm": 4.637392103361315, "learning_rate": 1.999578643477778e-06, "loss": 1.3306, "step": 5480 }, { "epoch": 0.04, "grad_norm": 4.950416319511345, "learning_rate": 1.999578489413635e-06, "loss": 1.3856, "step": 5481 }, { "epoch": 0.04, "grad_norm": 4.612474217929316, "learning_rate": 1.999578335321337e-06, "loss": 1.388, "step": 5482 }, { "epoch": 0.04, "grad_norm": 5.242992295576217, "learning_rate": 1.9995781812008843e-06, "loss": 1.2876, "step": 5483 }, { "epoch": 0.04, "grad_norm": 5.4180641893003365, "learning_rate": 1.9995780270522767e-06, "loss": 1.4297, "step": 5484 }, { "epoch": 0.04, "grad_norm": 5.147408666412258, "learning_rate": 1.999577872875515e-06, "loss": 1.4316, "step": 5485 }, { "epoch": 0.04, "grad_norm": 6.013107863946539, "learning_rate": 1.9995777186705974e-06, "loss": 1.3074, "step": 5486 }, { "epoch": 0.04, "grad_norm": 5.818951050788746, "learning_rate": 1.9995775644375258e-06, "loss": 1.3686, "step": 5487 }, { "epoch": 0.04, "grad_norm": 4.411525670201678, "learning_rate": 1.9995774101762996e-06, "loss": 1.3503, "step": 5488 }, { "epoch": 0.04, "grad_norm": 4.375775538757313, "learning_rate": 1.999577255886918e-06, "loss": 1.4638, "step": 5489 }, { "epoch": 0.04, "grad_norm": 4.825543291017702, "learning_rate": 1.999577101569382e-06, "loss": 1.438, "step": 5490 }, { "epoch": 0.04, "grad_norm": 5.4557224337185755, "learning_rate": 1.999576947223691e-06, "loss": 1.6316, "step": 5491 }, { "epoch": 0.04, "grad_norm": 5.200112080059826, "learning_rate": 1.9995767928498455e-06, "loss": 1.6251, "step": 5492 }, { "epoch": 0.04, "grad_norm": 5.111364806210548, "learning_rate": 1.999576638447845e-06, "loss": 1.5281, "step": 5493 }, { "epoch": 0.04, "grad_norm": 4.425794066467563, "learning_rate": 1.99957648401769e-06, "loss": 1.4127, "step": 5494 }, { "epoch": 0.04, "grad_norm": 5.447327709941523, "learning_rate": 1.99957632955938e-06, "loss": 1.3892, "step": 5495 }, { "epoch": 0.04, "grad_norm": 6.114053358822904, "learning_rate": 1.9995761750729155e-06, "loss": 1.1889, "step": 5496 }, { "epoch": 0.04, "grad_norm": 4.577071940487855, "learning_rate": 1.999576020558296e-06, "loss": 1.5832, "step": 5497 }, { "epoch": 0.04, "grad_norm": 4.446815310121416, "learning_rate": 1.999575866015522e-06, "loss": 1.4318, "step": 5498 }, { "epoch": 0.04, "grad_norm": 4.436184684005386, "learning_rate": 1.9995757114445936e-06, "loss": 1.1015, "step": 5499 }, { "epoch": 0.04, "grad_norm": 4.85939396426341, "learning_rate": 1.9995755568455097e-06, "loss": 1.4752, "step": 5500 }, { "epoch": 0.04, "grad_norm": 4.586830592099405, "learning_rate": 1.9995754022182717e-06, "loss": 1.4497, "step": 5501 }, { "epoch": 0.04, "grad_norm": 8.088570891832045, "learning_rate": 1.9995752475628788e-06, "loss": 1.1334, "step": 5502 }, { "epoch": 0.04, "grad_norm": 4.414002657892107, "learning_rate": 1.999575092879331e-06, "loss": 1.3261, "step": 5503 }, { "epoch": 0.04, "grad_norm": 4.192161311679393, "learning_rate": 1.9995749381676285e-06, "loss": 1.2831, "step": 5504 }, { "epoch": 0.04, "grad_norm": 5.32054082484598, "learning_rate": 1.9995747834277715e-06, "loss": 1.3378, "step": 5505 }, { "epoch": 0.04, "grad_norm": 4.511364512843204, "learning_rate": 1.9995746286597596e-06, "loss": 1.4351, "step": 5506 }, { "epoch": 0.04, "grad_norm": 4.738334255209205, "learning_rate": 1.9995744738635932e-06, "loss": 1.3697, "step": 5507 }, { "epoch": 0.04, "grad_norm": 4.450904275724931, "learning_rate": 1.9995743190392723e-06, "loss": 1.4477, "step": 5508 }, { "epoch": 0.04, "grad_norm": 4.663790974556768, "learning_rate": 1.9995741641867964e-06, "loss": 1.4581, "step": 5509 }, { "epoch": 0.04, "grad_norm": 4.732086985431901, "learning_rate": 1.999574009306166e-06, "loss": 1.4154, "step": 5510 }, { "epoch": 0.04, "grad_norm": 4.573615220017464, "learning_rate": 1.9995738543973805e-06, "loss": 1.4402, "step": 5511 }, { "epoch": 0.04, "grad_norm": 4.696226211308651, "learning_rate": 1.999573699460441e-06, "loss": 1.4405, "step": 5512 }, { "epoch": 0.04, "grad_norm": 5.040428550310928, "learning_rate": 1.999573544495346e-06, "loss": 1.2629, "step": 5513 }, { "epoch": 0.04, "grad_norm": 5.572325263596189, "learning_rate": 1.999573389502097e-06, "loss": 1.4112, "step": 5514 }, { "epoch": 0.04, "grad_norm": 5.732902419280129, "learning_rate": 1.9995732344806933e-06, "loss": 1.4068, "step": 5515 }, { "epoch": 0.04, "grad_norm": 4.638254706300692, "learning_rate": 1.999573079431135e-06, "loss": 1.3419, "step": 5516 }, { "epoch": 0.04, "grad_norm": 4.402178857446612, "learning_rate": 1.9995729243534214e-06, "loss": 1.3603, "step": 5517 }, { "epoch": 0.04, "grad_norm": 4.443664547197061, "learning_rate": 1.9995727692475535e-06, "loss": 1.3077, "step": 5518 }, { "epoch": 0.04, "grad_norm": 4.776565439497071, "learning_rate": 1.9995726141135315e-06, "loss": 1.4059, "step": 5519 }, { "epoch": 0.04, "grad_norm": 4.840236194996271, "learning_rate": 1.999572458951354e-06, "loss": 1.3285, "step": 5520 }, { "epoch": 0.04, "grad_norm": 4.601009240176879, "learning_rate": 1.9995723037610226e-06, "loss": 1.2622, "step": 5521 }, { "epoch": 0.04, "grad_norm": 4.689399660195274, "learning_rate": 1.999572148542536e-06, "loss": 1.3786, "step": 5522 }, { "epoch": 0.04, "grad_norm": 4.879936128522637, "learning_rate": 1.9995719932958955e-06, "loss": 1.4304, "step": 5523 }, { "epoch": 0.04, "grad_norm": 5.053461764142007, "learning_rate": 1.9995718380211e-06, "loss": 1.103, "step": 5524 }, { "epoch": 0.04, "grad_norm": 10.212597639229678, "learning_rate": 1.9995716827181495e-06, "loss": 1.2929, "step": 5525 }, { "epoch": 0.04, "grad_norm": 4.800774005016834, "learning_rate": 1.999571527387045e-06, "loss": 1.4353, "step": 5526 }, { "epoch": 0.04, "grad_norm": 4.83357838863239, "learning_rate": 1.9995713720277854e-06, "loss": 1.5217, "step": 5527 }, { "epoch": 0.04, "grad_norm": 4.448088275358243, "learning_rate": 1.9995712166403714e-06, "loss": 1.3385, "step": 5528 }, { "epoch": 0.04, "grad_norm": 4.814258917353907, "learning_rate": 1.999571061224803e-06, "loss": 1.4251, "step": 5529 }, { "epoch": 0.04, "grad_norm": 5.48175977378777, "learning_rate": 1.9995709057810797e-06, "loss": 1.2553, "step": 5530 }, { "epoch": 0.04, "grad_norm": 6.995449812868445, "learning_rate": 1.999570750309202e-06, "loss": 1.503, "step": 5531 }, { "epoch": 0.04, "grad_norm": 4.505111906000265, "learning_rate": 1.99957059480917e-06, "loss": 1.3737, "step": 5532 }, { "epoch": 0.04, "grad_norm": 4.470732663254423, "learning_rate": 1.9995704392809828e-06, "loss": 1.3549, "step": 5533 }, { "epoch": 0.04, "grad_norm": 4.390457037465304, "learning_rate": 1.999570283724641e-06, "loss": 1.4678, "step": 5534 }, { "epoch": 0.04, "grad_norm": 4.497486405508152, "learning_rate": 1.999570128140145e-06, "loss": 1.3086, "step": 5535 }, { "epoch": 0.04, "grad_norm": 4.227297933627742, "learning_rate": 1.9995699725274946e-06, "loss": 1.3404, "step": 5536 }, { "epoch": 0.04, "grad_norm": 4.294243051642116, "learning_rate": 1.9995698168866894e-06, "loss": 1.3524, "step": 5537 }, { "epoch": 0.04, "grad_norm": 5.322836751875245, "learning_rate": 1.9995696612177297e-06, "loss": 1.4174, "step": 5538 }, { "epoch": 0.04, "grad_norm": 4.805565567392629, "learning_rate": 1.9995695055206154e-06, "loss": 1.562, "step": 5539 }, { "epoch": 0.04, "grad_norm": 4.675051676525897, "learning_rate": 1.999569349795346e-06, "loss": 1.4581, "step": 5540 }, { "epoch": 0.04, "grad_norm": 4.591972717502866, "learning_rate": 1.999569194041923e-06, "loss": 1.4726, "step": 5541 }, { "epoch": 0.04, "grad_norm": 5.491667643983784, "learning_rate": 1.999569038260345e-06, "loss": 1.3701, "step": 5542 }, { "epoch": 0.04, "grad_norm": 4.4113218104795555, "learning_rate": 1.9995688824506125e-06, "loss": 1.2901, "step": 5543 }, { "epoch": 0.04, "grad_norm": 4.350529664333452, "learning_rate": 1.9995687266127256e-06, "loss": 1.366, "step": 5544 }, { "epoch": 0.04, "grad_norm": 4.735714707913643, "learning_rate": 1.999568570746684e-06, "loss": 1.5362, "step": 5545 }, { "epoch": 0.04, "grad_norm": 6.553728883193815, "learning_rate": 1.9995684148524877e-06, "loss": 1.3676, "step": 5546 }, { "epoch": 0.04, "grad_norm": 5.954887712609845, "learning_rate": 1.999568258930137e-06, "loss": 1.6431, "step": 5547 }, { "epoch": 0.04, "grad_norm": 4.8138831207337525, "learning_rate": 1.999568102979632e-06, "loss": 1.4303, "step": 5548 }, { "epoch": 0.04, "eval_loss": 1.5862023830413818, "eval_runtime": 4.6174, "eval_samples_per_second": 1.949, "eval_steps_per_second": 1.083, "step": 5548 }, { "epoch": 0.04, "grad_norm": 5.088485142826955, "learning_rate": 1.9995679470009726e-06, "loss": 1.263, "step": 5549 }, { "epoch": 0.04, "grad_norm": 5.136917135049553, "learning_rate": 1.9995677909941585e-06, "loss": 1.3881, "step": 5550 }, { "epoch": 0.04, "grad_norm": 4.689584022406284, "learning_rate": 1.99956763495919e-06, "loss": 1.3607, "step": 5551 }, { "epoch": 0.04, "grad_norm": 4.630153123050387, "learning_rate": 1.9995674788960667e-06, "loss": 1.235, "step": 5552 }, { "epoch": 0.04, "grad_norm": 4.654011363113006, "learning_rate": 1.9995673228047894e-06, "loss": 1.329, "step": 5553 }, { "epoch": 0.04, "grad_norm": 4.734626395667247, "learning_rate": 1.9995671666853572e-06, "loss": 1.3453, "step": 5554 }, { "epoch": 0.04, "grad_norm": 4.425558315892681, "learning_rate": 1.9995670105377705e-06, "loss": 1.3351, "step": 5555 }, { "epoch": 0.04, "grad_norm": 4.79020442325916, "learning_rate": 1.9995668543620296e-06, "loss": 1.4056, "step": 5556 }, { "epoch": 0.04, "grad_norm": 4.282262556844571, "learning_rate": 1.9995666981581343e-06, "loss": 1.3624, "step": 5557 }, { "epoch": 0.04, "grad_norm": 4.870903993614832, "learning_rate": 1.999566541926084e-06, "loss": 1.4587, "step": 5558 }, { "epoch": 0.04, "grad_norm": 4.32130291101233, "learning_rate": 1.99956638566588e-06, "loss": 1.3842, "step": 5559 }, { "epoch": 0.04, "grad_norm": 4.271077140575839, "learning_rate": 1.9995662293775206e-06, "loss": 1.3866, "step": 5560 }, { "epoch": 0.04, "grad_norm": 5.102785701532075, "learning_rate": 1.9995660730610075e-06, "loss": 1.6478, "step": 5561 }, { "epoch": 0.04, "grad_norm": 4.509569913984659, "learning_rate": 1.9995659167163395e-06, "loss": 1.3883, "step": 5562 }, { "epoch": 0.04, "grad_norm": 4.23369987113473, "learning_rate": 1.9995657603435174e-06, "loss": 1.3531, "step": 5563 }, { "epoch": 0.04, "grad_norm": 4.342423726866366, "learning_rate": 1.9995656039425407e-06, "loss": 1.3863, "step": 5564 }, { "epoch": 0.04, "grad_norm": 4.9390625535685775, "learning_rate": 1.9995654475134095e-06, "loss": 1.3906, "step": 5565 }, { "epoch": 0.04, "grad_norm": 6.781014973266148, "learning_rate": 1.999565291056124e-06, "loss": 1.2984, "step": 5566 }, { "epoch": 0.04, "grad_norm": 4.339451618057539, "learning_rate": 1.999565134570684e-06, "loss": 1.2809, "step": 5567 }, { "epoch": 0.04, "grad_norm": 7.133494016914502, "learning_rate": 1.9995649780570893e-06, "loss": 1.3077, "step": 5568 }, { "epoch": 0.04, "grad_norm": 5.201323967642531, "learning_rate": 1.9995648215153404e-06, "loss": 1.4337, "step": 5569 }, { "epoch": 0.04, "grad_norm": 4.444480806170569, "learning_rate": 1.999564664945437e-06, "loss": 1.4624, "step": 5570 }, { "epoch": 0.04, "grad_norm": 4.555423359158323, "learning_rate": 1.9995645083473795e-06, "loss": 1.3336, "step": 5571 }, { "epoch": 0.04, "grad_norm": 7.519785915619404, "learning_rate": 1.9995643517211675e-06, "loss": 1.4261, "step": 5572 }, { "epoch": 0.04, "grad_norm": 11.284768895349025, "learning_rate": 1.9995641950668014e-06, "loss": 1.5399, "step": 5573 }, { "epoch": 0.04, "grad_norm": 4.81065786324385, "learning_rate": 1.9995640383842803e-06, "loss": 1.3493, "step": 5574 }, { "epoch": 0.04, "grad_norm": 4.791283091997439, "learning_rate": 1.999563881673605e-06, "loss": 1.3407, "step": 5575 }, { "epoch": 0.04, "grad_norm": 4.884467553801634, "learning_rate": 1.9995637249347754e-06, "loss": 1.4085, "step": 5576 }, { "epoch": 0.04, "grad_norm": 4.611343535990165, "learning_rate": 1.999563568167791e-06, "loss": 1.3765, "step": 5577 }, { "epoch": 0.04, "grad_norm": 4.360971884402026, "learning_rate": 1.999563411372653e-06, "loss": 1.4279, "step": 5578 }, { "epoch": 0.04, "grad_norm": 4.7541136022774975, "learning_rate": 1.99956325454936e-06, "loss": 1.2952, "step": 5579 }, { "epoch": 0.04, "grad_norm": 4.446874249294563, "learning_rate": 1.999563097697913e-06, "loss": 1.4323, "step": 5580 }, { "epoch": 0.04, "grad_norm": 5.148047760255938, "learning_rate": 1.999562940818311e-06, "loss": 1.38, "step": 5581 }, { "epoch": 0.04, "grad_norm": 5.130414233406137, "learning_rate": 1.9995627839105554e-06, "loss": 1.1946, "step": 5582 }, { "epoch": 0.04, "grad_norm": 5.15608745415354, "learning_rate": 1.999562626974645e-06, "loss": 1.494, "step": 5583 }, { "epoch": 0.04, "grad_norm": 6.449247882258043, "learning_rate": 1.9995624700105806e-06, "loss": 1.5227, "step": 5584 }, { "epoch": 0.04, "grad_norm": 4.378311167390088, "learning_rate": 1.9995623130183614e-06, "loss": 1.5961, "step": 5585 }, { "epoch": 0.04, "grad_norm": 4.7081457653466705, "learning_rate": 1.999562155997988e-06, "loss": 1.1838, "step": 5586 }, { "epoch": 0.04, "grad_norm": 4.530192591657961, "learning_rate": 1.9995619989494603e-06, "loss": 1.4848, "step": 5587 }, { "epoch": 0.04, "grad_norm": 4.59486446111991, "learning_rate": 1.9995618418727784e-06, "loss": 1.4088, "step": 5588 }, { "epoch": 0.04, "grad_norm": 4.712227196494009, "learning_rate": 1.9995616847679423e-06, "loss": 1.2953, "step": 5589 }, { "epoch": 0.04, "grad_norm": 4.680357383394461, "learning_rate": 1.9995615276349518e-06, "loss": 1.4393, "step": 5590 }, { "epoch": 0.04, "grad_norm": 4.7959347858909185, "learning_rate": 1.9995613704738067e-06, "loss": 1.4212, "step": 5591 }, { "epoch": 0.04, "grad_norm": 4.7581613571455454, "learning_rate": 1.999561213284507e-06, "loss": 1.4562, "step": 5592 }, { "epoch": 0.04, "grad_norm": 5.022913490150194, "learning_rate": 1.9995610560670533e-06, "loss": 1.5075, "step": 5593 }, { "epoch": 0.04, "grad_norm": 12.2657627134323, "learning_rate": 1.9995608988214455e-06, "loss": 1.5203, "step": 5594 }, { "epoch": 0.04, "grad_norm": 5.155211433282628, "learning_rate": 1.9995607415476835e-06, "loss": 1.1795, "step": 5595 }, { "epoch": 0.04, "grad_norm": 4.3793506299853, "learning_rate": 1.9995605842457667e-06, "loss": 1.3215, "step": 5596 }, { "epoch": 0.04, "grad_norm": 5.505774423763207, "learning_rate": 1.9995604269156957e-06, "loss": 1.2559, "step": 5597 }, { "epoch": 0.04, "grad_norm": 4.979164280112272, "learning_rate": 1.9995602695574706e-06, "loss": 1.4512, "step": 5598 }, { "epoch": 0.04, "grad_norm": 4.773244568700788, "learning_rate": 1.9995601121710914e-06, "loss": 1.371, "step": 5599 }, { "epoch": 0.04, "grad_norm": 4.422166109338598, "learning_rate": 1.9995599547565576e-06, "loss": 1.3316, "step": 5600 }, { "epoch": 0.04, "grad_norm": 4.530677656771057, "learning_rate": 1.9995597973138694e-06, "loss": 1.2827, "step": 5601 }, { "epoch": 0.04, "grad_norm": 4.917347789988938, "learning_rate": 1.9995596398430274e-06, "loss": 1.3733, "step": 5602 }, { "epoch": 0.04, "grad_norm": 4.663234532672557, "learning_rate": 1.999559482344031e-06, "loss": 1.4578, "step": 5603 }, { "epoch": 0.04, "grad_norm": 5.2060404919578716, "learning_rate": 1.99955932481688e-06, "loss": 1.3444, "step": 5604 }, { "epoch": 0.04, "grad_norm": 6.010016046658611, "learning_rate": 1.9995591672615753e-06, "loss": 1.2481, "step": 5605 }, { "epoch": 0.04, "grad_norm": 4.7232928123905245, "learning_rate": 1.9995590096781157e-06, "loss": 1.4256, "step": 5606 }, { "epoch": 0.04, "grad_norm": 4.3444974061605866, "learning_rate": 1.999558852066502e-06, "loss": 1.0913, "step": 5607 }, { "epoch": 0.04, "grad_norm": 4.97817790581386, "learning_rate": 1.9995586944267345e-06, "loss": 1.3017, "step": 5608 }, { "epoch": 0.04, "grad_norm": 6.78159625789784, "learning_rate": 1.999558536758812e-06, "loss": 1.4997, "step": 5609 }, { "epoch": 0.04, "grad_norm": 4.903446673313751, "learning_rate": 1.999558379062736e-06, "loss": 1.3845, "step": 5610 }, { "epoch": 0.04, "grad_norm": 4.5949315510231195, "learning_rate": 1.9995582213385055e-06, "loss": 1.2597, "step": 5611 }, { "epoch": 0.04, "grad_norm": 5.049733221702976, "learning_rate": 1.999558063586121e-06, "loss": 1.261, "step": 5612 }, { "epoch": 0.04, "grad_norm": 4.312583231142593, "learning_rate": 1.9995579058055817e-06, "loss": 1.413, "step": 5613 }, { "epoch": 0.04, "grad_norm": 4.980442472866436, "learning_rate": 1.9995577479968888e-06, "loss": 1.5361, "step": 5614 }, { "epoch": 0.04, "grad_norm": 6.371335244699003, "learning_rate": 1.9995575901600414e-06, "loss": 1.6197, "step": 5615 }, { "epoch": 0.04, "grad_norm": 4.95301867568782, "learning_rate": 1.9995574322950394e-06, "loss": 1.3604, "step": 5616 }, { "epoch": 0.04, "grad_norm": 4.461406514420875, "learning_rate": 1.9995572744018838e-06, "loss": 1.3329, "step": 5617 }, { "epoch": 0.04, "grad_norm": 4.227224042989711, "learning_rate": 1.9995571164805736e-06, "loss": 1.3427, "step": 5618 }, { "epoch": 0.04, "grad_norm": 4.4984290689551765, "learning_rate": 1.9995569585311094e-06, "loss": 1.3966, "step": 5619 }, { "epoch": 0.04, "grad_norm": 4.648814435305461, "learning_rate": 1.999556800553491e-06, "loss": 1.3188, "step": 5620 }, { "epoch": 0.04, "grad_norm": 4.515830330320723, "learning_rate": 1.9995566425477185e-06, "loss": 1.3622, "step": 5621 }, { "epoch": 0.04, "eval_loss": 1.5868887901306152, "eval_runtime": 4.6275, "eval_samples_per_second": 1.945, "eval_steps_per_second": 1.081, "step": 5621 }, { "epoch": 0.04, "grad_norm": 4.869599644879368, "learning_rate": 1.9995564845137915e-06, "loss": 1.3912, "step": 5622 }, { "epoch": 0.04, "grad_norm": 4.737237773008552, "learning_rate": 1.9995563264517104e-06, "loss": 1.4213, "step": 5623 }, { "epoch": 0.04, "grad_norm": 5.625671969676264, "learning_rate": 1.9995561683614752e-06, "loss": 1.4061, "step": 5624 }, { "epoch": 0.04, "grad_norm": 4.683348490886964, "learning_rate": 1.999556010243086e-06, "loss": 1.4692, "step": 5625 }, { "epoch": 0.04, "grad_norm": 4.967188858077755, "learning_rate": 1.9995558520965425e-06, "loss": 1.4937, "step": 5626 }, { "epoch": 0.04, "grad_norm": 4.664387983162651, "learning_rate": 1.9995556939218446e-06, "loss": 1.3118, "step": 5627 }, { "epoch": 0.04, "grad_norm": 4.385289469937267, "learning_rate": 1.999555535718993e-06, "loss": 1.3995, "step": 5628 }, { "epoch": 0.04, "grad_norm": 4.667301704665077, "learning_rate": 1.999555377487987e-06, "loss": 1.4365, "step": 5629 }, { "epoch": 0.04, "grad_norm": 5.108446130650856, "learning_rate": 1.999555219228827e-06, "loss": 1.4741, "step": 5630 }, { "epoch": 0.04, "grad_norm": 4.730356360194432, "learning_rate": 1.9995550609415126e-06, "loss": 1.4388, "step": 5631 }, { "epoch": 0.04, "grad_norm": 4.451749579380501, "learning_rate": 1.999554902626044e-06, "loss": 1.3369, "step": 5632 }, { "epoch": 0.04, "grad_norm": 7.2784971537151915, "learning_rate": 1.9995547442824216e-06, "loss": 1.4168, "step": 5633 }, { "epoch": 0.04, "grad_norm": 5.477690468289151, "learning_rate": 1.999554585910645e-06, "loss": 1.3588, "step": 5634 }, { "epoch": 0.04, "grad_norm": 5.318854593942667, "learning_rate": 1.999554427510714e-06, "loss": 1.4078, "step": 5635 }, { "epoch": 0.04, "grad_norm": 4.543252817742803, "learning_rate": 1.9995542690826292e-06, "loss": 1.3451, "step": 5636 }, { "epoch": 0.04, "grad_norm": 4.399067840583918, "learning_rate": 1.99955411062639e-06, "loss": 1.4051, "step": 5637 }, { "epoch": 0.04, "grad_norm": 4.588332765426616, "learning_rate": 1.9995539521419967e-06, "loss": 1.3247, "step": 5638 }, { "epoch": 0.04, "grad_norm": 5.270567225317389, "learning_rate": 1.9995537936294495e-06, "loss": 1.3493, "step": 5639 }, { "epoch": 0.04, "grad_norm": 5.163734160160999, "learning_rate": 1.9995536350887482e-06, "loss": 1.4248, "step": 5640 }, { "epoch": 0.04, "grad_norm": 5.172045046480884, "learning_rate": 1.9995534765198924e-06, "loss": 1.4766, "step": 5641 }, { "epoch": 0.04, "grad_norm": 4.633448534229108, "learning_rate": 1.999553317922883e-06, "loss": 1.3671, "step": 5642 }, { "epoch": 0.04, "grad_norm": 5.076757542500624, "learning_rate": 1.9995531592977193e-06, "loss": 1.2867, "step": 5643 }, { "epoch": 0.04, "grad_norm": 4.577296783864219, "learning_rate": 1.9995530006444016e-06, "loss": 1.2291, "step": 5644 }, { "epoch": 0.04, "grad_norm": 4.866953645691457, "learning_rate": 1.9995528419629293e-06, "loss": 1.4896, "step": 5645 }, { "epoch": 0.04, "grad_norm": 4.44592777181211, "learning_rate": 1.999552683253304e-06, "loss": 1.329, "step": 5646 }, { "epoch": 0.04, "grad_norm": 4.5988910317673675, "learning_rate": 1.9995525245155238e-06, "loss": 1.3731, "step": 5647 }, { "epoch": 0.04, "grad_norm": 5.186910860497609, "learning_rate": 1.9995523657495896e-06, "loss": 1.5872, "step": 5648 }, { "epoch": 0.04, "grad_norm": 4.4930893765611, "learning_rate": 1.9995522069555014e-06, "loss": 1.418, "step": 5649 }, { "epoch": 0.04, "grad_norm": 4.3118414791290025, "learning_rate": 1.9995520481332595e-06, "loss": 1.3141, "step": 5650 }, { "epoch": 0.04, "grad_norm": 4.631298874689587, "learning_rate": 1.999551889282863e-06, "loss": 1.4092, "step": 5651 }, { "epoch": 0.04, "grad_norm": 4.3608149186859535, "learning_rate": 1.999551730404313e-06, "loss": 1.4468, "step": 5652 }, { "epoch": 0.04, "grad_norm": 4.39189465900908, "learning_rate": 1.999551571497608e-06, "loss": 1.4773, "step": 5653 }, { "epoch": 0.04, "grad_norm": 4.643874225644158, "learning_rate": 1.99955141256275e-06, "loss": 1.4807, "step": 5654 }, { "epoch": 0.04, "grad_norm": 6.353914283291529, "learning_rate": 1.9995512535997374e-06, "loss": 1.2688, "step": 5655 }, { "epoch": 0.04, "grad_norm": 4.63512822711736, "learning_rate": 1.999551094608571e-06, "loss": 1.2541, "step": 5656 }, { "epoch": 0.04, "grad_norm": 4.969782336157194, "learning_rate": 1.99955093558925e-06, "loss": 1.4253, "step": 5657 }, { "epoch": 0.04, "grad_norm": 5.686572888601756, "learning_rate": 1.999550776541776e-06, "loss": 1.3629, "step": 5658 }, { "epoch": 0.04, "grad_norm": 7.568783371380138, "learning_rate": 1.999550617466147e-06, "loss": 1.4152, "step": 5659 }, { "epoch": 0.04, "grad_norm": 4.49346479229489, "learning_rate": 1.9995504583623645e-06, "loss": 1.4009, "step": 5660 }, { "epoch": 0.04, "grad_norm": 4.7788291574518365, "learning_rate": 1.9995502992304282e-06, "loss": 1.2063, "step": 5661 }, { "epoch": 0.04, "grad_norm": 4.6799478183470065, "learning_rate": 1.9995501400703375e-06, "loss": 1.4644, "step": 5662 }, { "epoch": 0.04, "grad_norm": 4.178668347448705, "learning_rate": 1.9995499808820926e-06, "loss": 1.2718, "step": 5663 }, { "epoch": 0.04, "grad_norm": 4.136450545936772, "learning_rate": 1.9995498216656945e-06, "loss": 1.3599, "step": 5664 }, { "epoch": 0.04, "grad_norm": 5.281692043880912, "learning_rate": 1.999549662421142e-06, "loss": 1.3372, "step": 5665 }, { "epoch": 0.04, "grad_norm": 4.927951515954843, "learning_rate": 1.999549503148435e-06, "loss": 1.427, "step": 5666 }, { "epoch": 0.04, "grad_norm": 4.927599550650963, "learning_rate": 1.9995493438475747e-06, "loss": 1.4525, "step": 5667 }, { "epoch": 0.04, "grad_norm": 4.573326002309609, "learning_rate": 1.99954918451856e-06, "loss": 1.3685, "step": 5668 }, { "epoch": 0.04, "grad_norm": 4.688392665339077, "learning_rate": 1.9995490251613915e-06, "loss": 1.1976, "step": 5669 }, { "epoch": 0.04, "grad_norm": 5.240017468538618, "learning_rate": 1.999548865776069e-06, "loss": 1.3647, "step": 5670 }, { "epoch": 0.04, "grad_norm": 4.992270603040266, "learning_rate": 1.9995487063625927e-06, "loss": 1.1956, "step": 5671 }, { "epoch": 0.04, "grad_norm": 4.809629957171013, "learning_rate": 1.999548546920962e-06, "loss": 1.3547, "step": 5672 }, { "epoch": 0.04, "grad_norm": 4.521525388493329, "learning_rate": 1.999548387451178e-06, "loss": 1.4112, "step": 5673 }, { "epoch": 0.04, "grad_norm": 4.482961296873273, "learning_rate": 1.9995482279532392e-06, "loss": 1.29, "step": 5674 }, { "epoch": 0.04, "grad_norm": 4.625547374032523, "learning_rate": 1.9995480684271472e-06, "loss": 1.4291, "step": 5675 }, { "epoch": 0.04, "grad_norm": 4.690000276692352, "learning_rate": 1.999547908872901e-06, "loss": 1.3442, "step": 5676 }, { "epoch": 0.04, "grad_norm": 4.430105772252919, "learning_rate": 1.999547749290501e-06, "loss": 1.2864, "step": 5677 }, { "epoch": 0.04, "grad_norm": 4.32930737729252, "learning_rate": 1.9995475896799466e-06, "loss": 1.397, "step": 5678 }, { "epoch": 0.04, "grad_norm": 4.572579379821591, "learning_rate": 1.9995474300412386e-06, "loss": 1.2977, "step": 5679 }, { "epoch": 0.04, "grad_norm": 4.435467085262868, "learning_rate": 1.999547270374377e-06, "loss": 1.3668, "step": 5680 }, { "epoch": 0.04, "grad_norm": 4.594957371726952, "learning_rate": 1.9995471106793607e-06, "loss": 1.4749, "step": 5681 }, { "epoch": 0.04, "grad_norm": 4.820132843336234, "learning_rate": 1.9995469509561913e-06, "loss": 1.5195, "step": 5682 }, { "epoch": 0.04, "grad_norm": 6.301070201595898, "learning_rate": 1.9995467912048673e-06, "loss": 1.3769, "step": 5683 }, { "epoch": 0.04, "grad_norm": 4.447793300024681, "learning_rate": 1.9995466314253896e-06, "loss": 1.3186, "step": 5684 }, { "epoch": 0.04, "grad_norm": 4.4565020751542805, "learning_rate": 1.9995464716177583e-06, "loss": 1.4532, "step": 5685 }, { "epoch": 0.04, "grad_norm": 4.449572046622291, "learning_rate": 1.999546311781973e-06, "loss": 1.2427, "step": 5686 }, { "epoch": 0.04, "grad_norm": 4.257749591036685, "learning_rate": 1.9995461519180333e-06, "loss": 1.0846, "step": 5687 }, { "epoch": 0.04, "grad_norm": 5.372656779153846, "learning_rate": 1.9995459920259404e-06, "loss": 1.3518, "step": 5688 }, { "epoch": 0.04, "grad_norm": 6.965077682588342, "learning_rate": 1.999545832105693e-06, "loss": 1.2883, "step": 5689 }, { "epoch": 0.04, "grad_norm": 4.974300099442155, "learning_rate": 1.999545672157292e-06, "loss": 1.4174, "step": 5690 }, { "epoch": 0.04, "grad_norm": 4.914011850560827, "learning_rate": 1.9995455121807374e-06, "loss": 1.504, "step": 5691 }, { "epoch": 0.04, "grad_norm": 4.720572316317237, "learning_rate": 1.9995453521760285e-06, "loss": 1.2646, "step": 5692 }, { "epoch": 0.04, "grad_norm": 4.4428759249096705, "learning_rate": 1.999545192143166e-06, "loss": 1.3852, "step": 5693 }, { "epoch": 0.04, "grad_norm": 5.499431056140739, "learning_rate": 1.9995450320821495e-06, "loss": 1.3161, "step": 5694 }, { "epoch": 0.04, "eval_loss": 1.5865055322647095, "eval_runtime": 4.6272, "eval_samples_per_second": 1.945, "eval_steps_per_second": 1.081, "step": 5694 }, { "epoch": 0.04, "grad_norm": 4.73088185280725, "learning_rate": 1.999544871992979e-06, "loss": 1.3606, "step": 5695 }, { "epoch": 0.04, "grad_norm": 5.508127774818568, "learning_rate": 1.9995447118756548e-06, "loss": 1.2525, "step": 5696 }, { "epoch": 0.04, "grad_norm": 4.973808574199772, "learning_rate": 1.999544551730177e-06, "loss": 1.4907, "step": 5697 }, { "epoch": 0.04, "grad_norm": 4.91844193841649, "learning_rate": 1.999544391556545e-06, "loss": 1.4116, "step": 5698 }, { "epoch": 0.04, "grad_norm": 4.377376941415667, "learning_rate": 1.9995442313547595e-06, "loss": 1.3167, "step": 5699 }, { "epoch": 0.04, "grad_norm": 4.330163726835295, "learning_rate": 1.99954407112482e-06, "loss": 1.3374, "step": 5700 }, { "epoch": 0.04, "grad_norm": 4.340568913577446, "learning_rate": 1.9995439108667264e-06, "loss": 1.3095, "step": 5701 }, { "epoch": 0.04, "grad_norm": 4.299547537666422, "learning_rate": 1.9995437505804795e-06, "loss": 1.3081, "step": 5702 }, { "epoch": 0.04, "grad_norm": 8.871930724341007, "learning_rate": 1.999543590266078e-06, "loss": 1.4336, "step": 5703 }, { "epoch": 0.04, "grad_norm": 4.598652081437408, "learning_rate": 1.9995434299235234e-06, "loss": 1.3376, "step": 5704 }, { "epoch": 0.04, "grad_norm": 4.9855800460741655, "learning_rate": 1.9995432695528146e-06, "loss": 1.1195, "step": 5705 }, { "epoch": 0.04, "grad_norm": 4.658156043367133, "learning_rate": 1.999543109153952e-06, "loss": 1.3097, "step": 5706 }, { "epoch": 0.04, "grad_norm": 5.787652588488233, "learning_rate": 1.999542948726936e-06, "loss": 1.2784, "step": 5707 }, { "epoch": 0.04, "grad_norm": 4.873810704743903, "learning_rate": 1.9995427882717657e-06, "loss": 1.4955, "step": 5708 }, { "epoch": 0.04, "grad_norm": 4.441694496951607, "learning_rate": 1.999542627788442e-06, "loss": 1.6009, "step": 5709 }, { "epoch": 0.04, "grad_norm": 4.239927270788598, "learning_rate": 1.9995424672769646e-06, "loss": 1.348, "step": 5710 }, { "epoch": 0.04, "grad_norm": 4.669791885900121, "learning_rate": 1.999542306737333e-06, "loss": 1.3065, "step": 5711 }, { "epoch": 0.04, "grad_norm": 5.0664063819839305, "learning_rate": 1.999542146169548e-06, "loss": 1.3004, "step": 5712 }, { "epoch": 0.04, "grad_norm": 6.972967726760133, "learning_rate": 1.999541985573609e-06, "loss": 1.3796, "step": 5713 }, { "epoch": 0.04, "grad_norm": 4.290732728175969, "learning_rate": 1.999541824949516e-06, "loss": 1.3084, "step": 5714 }, { "epoch": 0.04, "grad_norm": 5.053494331833529, "learning_rate": 1.999541664297269e-06, "loss": 1.4072, "step": 5715 }, { "epoch": 0.04, "grad_norm": 6.067027473836074, "learning_rate": 1.999541503616869e-06, "loss": 1.3401, "step": 5716 }, { "epoch": 0.04, "grad_norm": 5.874062400021865, "learning_rate": 1.999541342908315e-06, "loss": 1.3251, "step": 5717 }, { "epoch": 0.04, "grad_norm": 4.85109654944957, "learning_rate": 1.9995411821716074e-06, "loss": 1.4475, "step": 5718 }, { "epoch": 0.04, "grad_norm": 4.157375042943867, "learning_rate": 1.9995410214067458e-06, "loss": 1.3019, "step": 5719 }, { "epoch": 0.04, "grad_norm": 4.408576186147239, "learning_rate": 1.99954086061373e-06, "loss": 1.3997, "step": 5720 }, { "epoch": 0.04, "grad_norm": 5.4580097612881735, "learning_rate": 1.999540699792561e-06, "loss": 1.2468, "step": 5721 }, { "epoch": 0.04, "grad_norm": 7.573657227573233, "learning_rate": 1.9995405389432385e-06, "loss": 1.1638, "step": 5722 }, { "epoch": 0.04, "grad_norm": 5.0681026633933906, "learning_rate": 1.9995403780657618e-06, "loss": 1.3135, "step": 5723 }, { "epoch": 0.04, "grad_norm": 4.493283194431849, "learning_rate": 1.9995402171601318e-06, "loss": 1.4199, "step": 5724 }, { "epoch": 0.04, "grad_norm": 4.704795629608404, "learning_rate": 1.9995400562263477e-06, "loss": 1.4435, "step": 5725 }, { "epoch": 0.04, "grad_norm": 4.467729234021108, "learning_rate": 1.99953989526441e-06, "loss": 1.3044, "step": 5726 }, { "epoch": 0.04, "grad_norm": 4.5435378872649865, "learning_rate": 1.9995397342743184e-06, "loss": 1.3838, "step": 5727 }, { "epoch": 0.04, "grad_norm": 4.644203319206795, "learning_rate": 1.9995395732560733e-06, "loss": 1.4107, "step": 5728 }, { "epoch": 0.04, "grad_norm": 6.0131703225613204, "learning_rate": 1.999539412209675e-06, "loss": 1.4552, "step": 5729 }, { "epoch": 0.04, "grad_norm": 4.606935611580208, "learning_rate": 1.999539251135122e-06, "loss": 1.3144, "step": 5730 }, { "epoch": 0.04, "grad_norm": 5.233961317272037, "learning_rate": 1.9995390900324157e-06, "loss": 1.3378, "step": 5731 }, { "epoch": 0.04, "grad_norm": 5.19227681040022, "learning_rate": 1.999538928901556e-06, "loss": 1.5631, "step": 5732 }, { "epoch": 0.04, "grad_norm": 4.401469350616104, "learning_rate": 1.9995387677425423e-06, "loss": 1.3457, "step": 5733 }, { "epoch": 0.04, "grad_norm": 6.110661891574637, "learning_rate": 1.999538606555375e-06, "loss": 1.6455, "step": 5734 }, { "epoch": 0.04, "grad_norm": 4.617055151536763, "learning_rate": 1.9995384453400538e-06, "loss": 1.3457, "step": 5735 }, { "epoch": 0.04, "grad_norm": 4.511076195763694, "learning_rate": 1.999538284096579e-06, "loss": 1.3699, "step": 5736 }, { "epoch": 0.04, "grad_norm": 4.409702422136076, "learning_rate": 1.999538122824951e-06, "loss": 1.464, "step": 5737 }, { "epoch": 0.04, "grad_norm": 5.088054488103913, "learning_rate": 1.999537961525169e-06, "loss": 1.2416, "step": 5738 }, { "epoch": 0.04, "grad_norm": 4.717594101337755, "learning_rate": 1.9995378001972333e-06, "loss": 1.3881, "step": 5739 }, { "epoch": 0.04, "grad_norm": 6.0358670990552525, "learning_rate": 1.999537638841144e-06, "loss": 1.3765, "step": 5740 }, { "epoch": 0.04, "grad_norm": 4.6025152358498, "learning_rate": 1.999537477456901e-06, "loss": 1.4732, "step": 5741 }, { "epoch": 0.04, "grad_norm": 4.783595174371466, "learning_rate": 1.9995373160445047e-06, "loss": 1.3128, "step": 5742 }, { "epoch": 0.04, "grad_norm": 4.228426320629174, "learning_rate": 1.9995371546039544e-06, "loss": 1.2504, "step": 5743 }, { "epoch": 0.04, "grad_norm": 4.336185393717913, "learning_rate": 1.9995369931352507e-06, "loss": 1.3723, "step": 5744 }, { "epoch": 0.04, "grad_norm": 4.387910434673658, "learning_rate": 1.999536831638393e-06, "loss": 1.3952, "step": 5745 }, { "epoch": 0.04, "grad_norm": 4.2733963111213304, "learning_rate": 1.999536670113382e-06, "loss": 1.3632, "step": 5746 }, { "epoch": 0.04, "grad_norm": 4.8402697055323385, "learning_rate": 1.999536508560217e-06, "loss": 1.4165, "step": 5747 }, { "epoch": 0.04, "grad_norm": 4.269179880814298, "learning_rate": 1.999536346978899e-06, "loss": 1.1972, "step": 5748 }, { "epoch": 0.04, "grad_norm": 4.645323211920292, "learning_rate": 1.999536185369427e-06, "loss": 1.2998, "step": 5749 }, { "epoch": 0.04, "grad_norm": 6.684987629696235, "learning_rate": 1.9995360237318013e-06, "loss": 1.3018, "step": 5750 }, { "epoch": 0.04, "grad_norm": 4.345883481911031, "learning_rate": 1.999535862066022e-06, "loss": 1.3886, "step": 5751 }, { "epoch": 0.04, "grad_norm": 4.492817284423613, "learning_rate": 1.9995357003720893e-06, "loss": 1.2824, "step": 5752 }, { "epoch": 0.04, "grad_norm": 4.897598158396424, "learning_rate": 1.999535538650003e-06, "loss": 1.4897, "step": 5753 }, { "epoch": 0.04, "grad_norm": 4.977590742149847, "learning_rate": 1.9995353768997625e-06, "loss": 1.3561, "step": 5754 }, { "epoch": 0.04, "grad_norm": 4.288270785453842, "learning_rate": 1.9995352151213693e-06, "loss": 1.2578, "step": 5755 }, { "epoch": 0.04, "grad_norm": 4.272819507244013, "learning_rate": 1.999535053314822e-06, "loss": 1.3456, "step": 5756 }, { "epoch": 0.04, "grad_norm": 4.633816678114851, "learning_rate": 1.9995348914801213e-06, "loss": 1.335, "step": 5757 }, { "epoch": 0.04, "grad_norm": 4.743534936247921, "learning_rate": 1.999534729617267e-06, "loss": 1.2954, "step": 5758 }, { "epoch": 0.04, "grad_norm": 4.819758384279413, "learning_rate": 1.999534567726259e-06, "loss": 1.4341, "step": 5759 }, { "epoch": 0.04, "grad_norm": 4.632473536121055, "learning_rate": 1.9995344058070977e-06, "loss": 1.3987, "step": 5760 }, { "epoch": 0.04, "grad_norm": 4.846981580867686, "learning_rate": 1.999534243859783e-06, "loss": 1.573, "step": 5761 }, { "epoch": 0.04, "grad_norm": 4.802635307215445, "learning_rate": 1.999534081884314e-06, "loss": 1.4517, "step": 5762 }, { "epoch": 0.04, "grad_norm": 7.424376531384478, "learning_rate": 1.999533919880692e-06, "loss": 1.3099, "step": 5763 }, { "epoch": 0.04, "grad_norm": 4.811963844660745, "learning_rate": 1.9995337578489164e-06, "loss": 1.4261, "step": 5764 }, { "epoch": 0.04, "grad_norm": 4.554998602306021, "learning_rate": 1.9995335957889876e-06, "loss": 1.2917, "step": 5765 }, { "epoch": 0.04, "grad_norm": 4.20222297637741, "learning_rate": 1.9995334337009046e-06, "loss": 1.2525, "step": 5766 }, { "epoch": 0.04, "grad_norm": 4.816462591045214, "learning_rate": 1.9995332715846685e-06, "loss": 1.418, "step": 5767 }, { "epoch": 0.04, "eval_loss": 1.5874297618865967, "eval_runtime": 4.6418, "eval_samples_per_second": 1.939, "eval_steps_per_second": 1.077, "step": 5767 }, { "epoch": 0.04, "grad_norm": 4.775791751904938, "learning_rate": 1.9995331094402786e-06, "loss": 1.3892, "step": 5768 }, { "epoch": 0.04, "grad_norm": 5.169355928565239, "learning_rate": 1.999532947267735e-06, "loss": 1.3716, "step": 5769 }, { "epoch": 0.04, "grad_norm": 4.426889350331657, "learning_rate": 1.9995327850670383e-06, "loss": 1.3519, "step": 5770 }, { "epoch": 0.04, "grad_norm": 9.665299580620445, "learning_rate": 1.999532622838188e-06, "loss": 1.5072, "step": 5771 }, { "epoch": 0.04, "grad_norm": 5.561353963698622, "learning_rate": 1.999532460581184e-06, "loss": 1.3748, "step": 5772 }, { "epoch": 0.04, "grad_norm": 4.542305851527574, "learning_rate": 1.9995322982960267e-06, "loss": 1.4879, "step": 5773 }, { "epoch": 0.04, "grad_norm": 4.262762496961736, "learning_rate": 1.999532135982716e-06, "loss": 1.2253, "step": 5774 }, { "epoch": 0.04, "grad_norm": 4.268454755154603, "learning_rate": 1.9995319736412517e-06, "loss": 1.4535, "step": 5775 }, { "epoch": 0.04, "grad_norm": 4.468078092931769, "learning_rate": 1.9995318112716337e-06, "loss": 1.4572, "step": 5776 }, { "epoch": 0.04, "grad_norm": 4.664680993071104, "learning_rate": 1.9995316488738624e-06, "loss": 1.4317, "step": 5777 }, { "epoch": 0.04, "grad_norm": 4.802220438129624, "learning_rate": 1.9995314864479374e-06, "loss": 1.4448, "step": 5778 }, { "epoch": 0.04, "grad_norm": 5.104733298886027, "learning_rate": 1.9995313239938592e-06, "loss": 1.5477, "step": 5779 }, { "epoch": 0.04, "grad_norm": 4.222802494402566, "learning_rate": 1.9995311615116277e-06, "loss": 1.2637, "step": 5780 }, { "epoch": 0.04, "grad_norm": 4.351452776632753, "learning_rate": 1.999530999001242e-06, "loss": 1.2483, "step": 5781 }, { "epoch": 0.04, "grad_norm": 4.461436333851411, "learning_rate": 1.9995308364627033e-06, "loss": 1.4313, "step": 5782 }, { "epoch": 0.04, "grad_norm": 4.725480261441406, "learning_rate": 1.9995306738960112e-06, "loss": 1.4701, "step": 5783 }, { "epoch": 0.04, "grad_norm": 6.396245401402814, "learning_rate": 1.999530511301166e-06, "loss": 1.5903, "step": 5784 }, { "epoch": 0.04, "grad_norm": 4.417511195064635, "learning_rate": 1.999530348678167e-06, "loss": 1.3267, "step": 5785 }, { "epoch": 0.04, "grad_norm": 4.450060878281205, "learning_rate": 1.999530186027014e-06, "loss": 1.4219, "step": 5786 }, { "epoch": 0.04, "grad_norm": 4.678927499848172, "learning_rate": 1.9995300233477085e-06, "loss": 1.359, "step": 5787 }, { "epoch": 0.04, "grad_norm": 6.016906674436256, "learning_rate": 1.999529860640249e-06, "loss": 1.3466, "step": 5788 }, { "epoch": 0.04, "grad_norm": 5.250686282897962, "learning_rate": 1.999529697904636e-06, "loss": 1.4514, "step": 5789 }, { "epoch": 0.04, "grad_norm": 4.509277493069645, "learning_rate": 1.9995295351408702e-06, "loss": 1.3902, "step": 5790 }, { "epoch": 0.04, "grad_norm": 4.974650874434231, "learning_rate": 1.99952937234895e-06, "loss": 1.4315, "step": 5791 }, { "epoch": 0.04, "grad_norm": 4.617908737378824, "learning_rate": 1.9995292095288773e-06, "loss": 1.3101, "step": 5792 }, { "epoch": 0.04, "grad_norm": 4.674486134731671, "learning_rate": 1.999529046680651e-06, "loss": 1.5692, "step": 5793 }, { "epoch": 0.04, "grad_norm": 4.946964343169512, "learning_rate": 1.999528883804271e-06, "loss": 1.3259, "step": 5794 }, { "epoch": 0.04, "grad_norm": 4.349582653549313, "learning_rate": 1.9995287208997375e-06, "loss": 1.4015, "step": 5795 }, { "epoch": 0.04, "grad_norm": 5.401678625346844, "learning_rate": 1.999528557967051e-06, "loss": 1.5225, "step": 5796 }, { "epoch": 0.04, "grad_norm": 4.839082863394238, "learning_rate": 1.9995283950062107e-06, "loss": 1.3982, "step": 5797 }, { "epoch": 0.04, "grad_norm": 4.915980656992042, "learning_rate": 1.9995282320172172e-06, "loss": 1.4281, "step": 5798 }, { "epoch": 0.04, "grad_norm": 4.859026808530354, "learning_rate": 1.99952806900007e-06, "loss": 1.277, "step": 5799 }, { "epoch": 0.04, "grad_norm": 8.565115816305175, "learning_rate": 1.99952790595477e-06, "loss": 1.5037, "step": 5800 }, { "epoch": 0.04, "grad_norm": 4.199521156380893, "learning_rate": 1.9995277428813163e-06, "loss": 1.3223, "step": 5801 }, { "epoch": 0.04, "grad_norm": 4.300210871749029, "learning_rate": 1.9995275797797094e-06, "loss": 1.2914, "step": 5802 }, { "epoch": 0.04, "grad_norm": 4.756408378531319, "learning_rate": 1.999527416649949e-06, "loss": 1.3838, "step": 5803 }, { "epoch": 0.04, "grad_norm": 5.728205777896226, "learning_rate": 1.9995272534920352e-06, "loss": 1.3844, "step": 5804 }, { "epoch": 0.04, "grad_norm": 4.645678523665286, "learning_rate": 1.999527090305968e-06, "loss": 1.2477, "step": 5805 }, { "epoch": 0.04, "grad_norm": 4.732478522445746, "learning_rate": 1.9995269270917477e-06, "loss": 1.452, "step": 5806 }, { "epoch": 0.04, "grad_norm": 4.681745443595061, "learning_rate": 1.999526763849374e-06, "loss": 1.3097, "step": 5807 }, { "epoch": 0.04, "grad_norm": 4.463478111887422, "learning_rate": 1.999526600578847e-06, "loss": 1.359, "step": 5808 }, { "epoch": 0.04, "grad_norm": 4.421691545548865, "learning_rate": 1.9995264372801665e-06, "loss": 1.3669, "step": 5809 }, { "epoch": 0.04, "grad_norm": 4.8188606392810795, "learning_rate": 1.9995262739533326e-06, "loss": 1.3531, "step": 5810 }, { "epoch": 0.04, "grad_norm": 4.599822292212119, "learning_rate": 1.9995261105983455e-06, "loss": 1.3967, "step": 5811 }, { "epoch": 0.04, "grad_norm": 5.493973191760622, "learning_rate": 1.999525947215205e-06, "loss": 1.4455, "step": 5812 }, { "epoch": 0.04, "grad_norm": 4.780534002487309, "learning_rate": 1.999525783803911e-06, "loss": 1.4577, "step": 5813 }, { "epoch": 0.04, "grad_norm": 4.900091356819903, "learning_rate": 1.9995256203644637e-06, "loss": 1.4318, "step": 5814 }, { "epoch": 0.04, "grad_norm": 5.133514131011401, "learning_rate": 1.9995254568968636e-06, "loss": 1.434, "step": 5815 }, { "epoch": 0.04, "grad_norm": 4.36705674401036, "learning_rate": 1.9995252934011097e-06, "loss": 1.2738, "step": 5816 }, { "epoch": 0.04, "grad_norm": 5.17524501079888, "learning_rate": 1.999525129877203e-06, "loss": 1.2447, "step": 5817 }, { "epoch": 0.04, "grad_norm": 5.356042815511578, "learning_rate": 1.9995249663251427e-06, "loss": 1.2405, "step": 5818 }, { "epoch": 0.04, "grad_norm": 6.314495850541795, "learning_rate": 1.999524802744929e-06, "loss": 1.4544, "step": 5819 }, { "epoch": 0.04, "grad_norm": 6.389014689793489, "learning_rate": 1.999524639136562e-06, "loss": 1.6164, "step": 5820 }, { "epoch": 0.04, "grad_norm": 6.211023582977219, "learning_rate": 1.9995244755000417e-06, "loss": 1.4243, "step": 5821 }, { "epoch": 0.04, "grad_norm": 6.216808687720381, "learning_rate": 1.9995243118353683e-06, "loss": 1.297, "step": 5822 }, { "epoch": 0.04, "grad_norm": 4.255863232988317, "learning_rate": 1.9995241481425417e-06, "loss": 1.3636, "step": 5823 }, { "epoch": 0.04, "grad_norm": 4.405161121173103, "learning_rate": 1.9995239844215614e-06, "loss": 1.3062, "step": 5824 }, { "epoch": 0.04, "grad_norm": 4.685015947978792, "learning_rate": 1.999523820672428e-06, "loss": 1.4763, "step": 5825 }, { "epoch": 0.04, "grad_norm": 4.304887686625303, "learning_rate": 1.999523656895142e-06, "loss": 1.3243, "step": 5826 }, { "epoch": 0.04, "grad_norm": 4.412314277050078, "learning_rate": 1.9995234930897017e-06, "loss": 1.3325, "step": 5827 }, { "epoch": 0.04, "grad_norm": 4.590666944501228, "learning_rate": 1.9995233292561088e-06, "loss": 1.3308, "step": 5828 }, { "epoch": 0.04, "grad_norm": 4.950593516611324, "learning_rate": 1.9995231653943626e-06, "loss": 1.5258, "step": 5829 }, { "epoch": 0.04, "grad_norm": 5.005777348950392, "learning_rate": 1.999523001504463e-06, "loss": 1.3438, "step": 5830 }, { "epoch": 0.04, "grad_norm": 4.169881511000552, "learning_rate": 1.99952283758641e-06, "loss": 1.2706, "step": 5831 }, { "epoch": 0.04, "grad_norm": 4.68270791493962, "learning_rate": 1.999522673640204e-06, "loss": 1.4023, "step": 5832 }, { "epoch": 0.04, "grad_norm": 4.852824929822737, "learning_rate": 1.999522509665845e-06, "loss": 1.5146, "step": 5833 }, { "epoch": 0.04, "grad_norm": 4.378373654531029, "learning_rate": 1.9995223456633324e-06, "loss": 1.2745, "step": 5834 }, { "epoch": 0.04, "grad_norm": 5.3017044341222705, "learning_rate": 1.9995221816326667e-06, "loss": 1.4925, "step": 5835 }, { "epoch": 0.04, "grad_norm": 4.660967641045321, "learning_rate": 1.9995220175738477e-06, "loss": 1.3053, "step": 5836 }, { "epoch": 0.04, "grad_norm": 4.65699683886265, "learning_rate": 1.999521853486876e-06, "loss": 1.3504, "step": 5837 }, { "epoch": 0.04, "grad_norm": 5.687828356781194, "learning_rate": 1.9995216893717504e-06, "loss": 1.4364, "step": 5838 }, { "epoch": 0.04, "grad_norm": 4.441765156548666, "learning_rate": 1.999521525228472e-06, "loss": 1.5076, "step": 5839 }, { "epoch": 0.04, "grad_norm": 4.767094251866784, "learning_rate": 1.9995213610570404e-06, "loss": 1.4118, "step": 5840 }, { "epoch": 0.04, "eval_loss": 1.584568738937378, "eval_runtime": 4.6307, "eval_samples_per_second": 1.944, "eval_steps_per_second": 1.08, "step": 5840 }, { "epoch": 0.04, "grad_norm": 4.642642879580674, "learning_rate": 1.999521196857455e-06, "loss": 1.2914, "step": 5841 }, { "epoch": 0.04, "grad_norm": 5.429180208308674, "learning_rate": 1.999521032629717e-06, "loss": 1.2628, "step": 5842 }, { "epoch": 0.04, "grad_norm": 4.53057024965861, "learning_rate": 1.9995208683738257e-06, "loss": 1.3625, "step": 5843 }, { "epoch": 0.04, "grad_norm": 5.996324408935311, "learning_rate": 1.9995207040897815e-06, "loss": 1.3837, "step": 5844 }, { "epoch": 0.04, "grad_norm": 5.097365660801794, "learning_rate": 1.999520539777584e-06, "loss": 1.4953, "step": 5845 }, { "epoch": 0.04, "grad_norm": 6.371681193311429, "learning_rate": 1.999520375437233e-06, "loss": 1.5014, "step": 5846 }, { "epoch": 0.04, "grad_norm": 4.9833768756877195, "learning_rate": 1.999520211068729e-06, "loss": 1.5451, "step": 5847 }, { "epoch": 0.04, "grad_norm": 4.318636151604379, "learning_rate": 1.9995200466720717e-06, "loss": 1.3373, "step": 5848 }, { "epoch": 0.04, "grad_norm": 4.426201674725641, "learning_rate": 1.9995198822472616e-06, "loss": 1.3042, "step": 5849 }, { "epoch": 0.04, "grad_norm": 4.577379982396285, "learning_rate": 1.999519717794298e-06, "loss": 1.3741, "step": 5850 }, { "epoch": 0.04, "grad_norm": 4.609495777613937, "learning_rate": 1.9995195533131813e-06, "loss": 1.4263, "step": 5851 }, { "epoch": 0.04, "grad_norm": 4.62286969300606, "learning_rate": 1.999519388803912e-06, "loss": 1.4148, "step": 5852 }, { "epoch": 0.04, "grad_norm": 4.715038423736979, "learning_rate": 1.9995192242664888e-06, "loss": 1.4911, "step": 5853 }, { "epoch": 0.04, "grad_norm": 4.650075293226063, "learning_rate": 1.999519059700913e-06, "loss": 1.3889, "step": 5854 }, { "epoch": 0.04, "grad_norm": 4.466957598747972, "learning_rate": 1.9995188951071837e-06, "loss": 1.437, "step": 5855 }, { "epoch": 0.04, "grad_norm": 5.534445574765524, "learning_rate": 1.9995187304853017e-06, "loss": 1.3211, "step": 5856 }, { "epoch": 0.04, "grad_norm": 4.411997907381323, "learning_rate": 1.9995185658352664e-06, "loss": 1.2427, "step": 5857 }, { "epoch": 0.04, "grad_norm": 6.089195905819812, "learning_rate": 1.9995184011570774e-06, "loss": 1.4231, "step": 5858 }, { "epoch": 0.04, "grad_norm": 4.586518511091607, "learning_rate": 1.999518236450736e-06, "loss": 1.3962, "step": 5859 }, { "epoch": 0.04, "grad_norm": 5.19547828490689, "learning_rate": 1.9995180717162414e-06, "loss": 1.4804, "step": 5860 }, { "epoch": 0.04, "grad_norm": 7.089039224428242, "learning_rate": 1.9995179069535935e-06, "loss": 1.2133, "step": 5861 }, { "epoch": 0.04, "grad_norm": 4.431896876844631, "learning_rate": 1.999517742162793e-06, "loss": 1.2577, "step": 5862 }, { "epoch": 0.04, "grad_norm": 4.860563570256634, "learning_rate": 1.999517577343839e-06, "loss": 1.2221, "step": 5863 }, { "epoch": 0.04, "grad_norm": 5.5910743102262295, "learning_rate": 1.9995174124967316e-06, "loss": 1.4034, "step": 5864 }, { "epoch": 0.04, "grad_norm": 4.651760894842699, "learning_rate": 1.9995172476214715e-06, "loss": 1.339, "step": 5865 }, { "epoch": 0.04, "grad_norm": 5.820065028599262, "learning_rate": 1.999517082718058e-06, "loss": 1.2216, "step": 5866 }, { "epoch": 0.04, "grad_norm": 4.965431047620773, "learning_rate": 1.999516917786492e-06, "loss": 1.3808, "step": 5867 }, { "epoch": 0.04, "grad_norm": 4.401331148061845, "learning_rate": 1.9995167528267726e-06, "loss": 1.3745, "step": 5868 }, { "epoch": 0.04, "grad_norm": 5.784823081342542, "learning_rate": 1.9995165878389004e-06, "loss": 1.3611, "step": 5869 }, { "epoch": 0.04, "grad_norm": 4.772625758069375, "learning_rate": 1.999516422822875e-06, "loss": 1.3385, "step": 5870 }, { "epoch": 0.04, "grad_norm": 4.8126612827397945, "learning_rate": 1.9995162577786965e-06, "loss": 1.2631, "step": 5871 }, { "epoch": 0.04, "grad_norm": 5.394203373101565, "learning_rate": 1.9995160927063645e-06, "loss": 1.4353, "step": 5872 }, { "epoch": 0.04, "grad_norm": 4.444339340534983, "learning_rate": 1.99951592760588e-06, "loss": 1.2962, "step": 5873 }, { "epoch": 0.04, "grad_norm": 4.811190546947834, "learning_rate": 1.9995157624772424e-06, "loss": 1.3972, "step": 5874 }, { "epoch": 0.04, "grad_norm": 4.533974741425463, "learning_rate": 1.9995155973204514e-06, "loss": 1.4265, "step": 5875 }, { "epoch": 0.04, "grad_norm": 5.034535386182703, "learning_rate": 1.999515432135508e-06, "loss": 1.5753, "step": 5876 }, { "epoch": 0.04, "grad_norm": 5.214069930800117, "learning_rate": 1.9995152669224114e-06, "loss": 1.3045, "step": 5877 }, { "epoch": 0.04, "grad_norm": 5.0111753648003345, "learning_rate": 1.999515101681161e-06, "loss": 1.4838, "step": 5878 }, { "epoch": 0.04, "grad_norm": 4.611312998068975, "learning_rate": 1.999514936411759e-06, "loss": 1.5117, "step": 5879 }, { "epoch": 0.04, "grad_norm": 5.013967915569056, "learning_rate": 1.999514771114203e-06, "loss": 1.5269, "step": 5880 }, { "epoch": 0.04, "grad_norm": 4.380293744497348, "learning_rate": 1.999514605788494e-06, "loss": 1.326, "step": 5881 }, { "epoch": 0.04, "grad_norm": 4.582601180681681, "learning_rate": 1.999514440434632e-06, "loss": 1.4205, "step": 5882 }, { "epoch": 0.04, "grad_norm": 4.379707679441618, "learning_rate": 1.9995142750526176e-06, "loss": 1.3065, "step": 5883 }, { "epoch": 0.04, "grad_norm": 4.611899635368025, "learning_rate": 1.99951410964245e-06, "loss": 1.3389, "step": 5884 }, { "epoch": 0.04, "grad_norm": 4.735866256369223, "learning_rate": 1.999513944204129e-06, "loss": 1.4389, "step": 5885 }, { "epoch": 0.04, "grad_norm": 5.321208555352928, "learning_rate": 1.999513778737655e-06, "loss": 1.3257, "step": 5886 }, { "epoch": 0.04, "grad_norm": 4.982866142181729, "learning_rate": 1.999513613243028e-06, "loss": 1.3956, "step": 5887 }, { "epoch": 0.04, "grad_norm": 4.648863212958331, "learning_rate": 1.9995134477202485e-06, "loss": 1.5094, "step": 5888 }, { "epoch": 0.04, "grad_norm": 4.537438316844015, "learning_rate": 1.9995132821693157e-06, "loss": 1.3587, "step": 5889 }, { "epoch": 0.04, "grad_norm": 4.511904868807547, "learning_rate": 1.99951311659023e-06, "loss": 1.5049, "step": 5890 }, { "epoch": 0.04, "grad_norm": 4.5823750459485915, "learning_rate": 1.9995129509829917e-06, "loss": 1.3794, "step": 5891 }, { "epoch": 0.04, "grad_norm": 8.641035603251654, "learning_rate": 1.9995127853476e-06, "loss": 1.3697, "step": 5892 }, { "epoch": 0.04, "grad_norm": 4.713810144156122, "learning_rate": 1.9995126196840556e-06, "loss": 1.3298, "step": 5893 }, { "epoch": 0.04, "grad_norm": 4.854192293136251, "learning_rate": 1.9995124539923582e-06, "loss": 1.492, "step": 5894 }, { "epoch": 0.04, "grad_norm": 4.705063666616618, "learning_rate": 1.9995122882725076e-06, "loss": 1.439, "step": 5895 }, { "epoch": 0.04, "grad_norm": 4.924740100608494, "learning_rate": 1.999512122524504e-06, "loss": 1.5958, "step": 5896 }, { "epoch": 0.04, "grad_norm": 4.825477386939696, "learning_rate": 1.999511956748348e-06, "loss": 1.4765, "step": 5897 }, { "epoch": 0.04, "grad_norm": 4.394081981089352, "learning_rate": 1.999511790944039e-06, "loss": 1.4399, "step": 5898 }, { "epoch": 0.04, "grad_norm": 5.15208486956062, "learning_rate": 1.9995116251115765e-06, "loss": 1.5189, "step": 5899 }, { "epoch": 0.04, "grad_norm": 4.817634534448384, "learning_rate": 1.9995114592509617e-06, "loss": 1.4776, "step": 5900 }, { "epoch": 0.04, "grad_norm": 4.33941450283988, "learning_rate": 1.9995112933621937e-06, "loss": 1.3866, "step": 5901 }, { "epoch": 0.04, "grad_norm": 4.503021864563051, "learning_rate": 1.999511127445273e-06, "loss": 1.4537, "step": 5902 }, { "epoch": 0.04, "grad_norm": 5.25109779781238, "learning_rate": 1.999510961500199e-06, "loss": 1.3892, "step": 5903 }, { "epoch": 0.04, "grad_norm": 4.521348683172032, "learning_rate": 1.9995107955269726e-06, "loss": 1.4034, "step": 5904 }, { "epoch": 0.04, "grad_norm": 4.6417065441255465, "learning_rate": 1.999510629525593e-06, "loss": 1.5248, "step": 5905 }, { "epoch": 0.04, "grad_norm": 8.157992047876746, "learning_rate": 1.9995104634960606e-06, "loss": 1.5755, "step": 5906 }, { "epoch": 0.04, "grad_norm": 4.731558694632791, "learning_rate": 1.999510297438375e-06, "loss": 1.3873, "step": 5907 }, { "epoch": 0.04, "grad_norm": 4.609476935303952, "learning_rate": 1.9995101313525365e-06, "loss": 1.3913, "step": 5908 }, { "epoch": 0.04, "grad_norm": 4.2520422543557395, "learning_rate": 1.9995099652385458e-06, "loss": 1.2609, "step": 5909 }, { "epoch": 0.04, "grad_norm": 4.362488636063784, "learning_rate": 1.9995097990964014e-06, "loss": 1.0999, "step": 5910 }, { "epoch": 0.04, "grad_norm": 4.998638535411594, "learning_rate": 1.999509632926105e-06, "loss": 1.4985, "step": 5911 }, { "epoch": 0.04, "grad_norm": 4.8296074663814546, "learning_rate": 1.9995094667276554e-06, "loss": 1.3968, "step": 5912 }, { "epoch": 0.04, "grad_norm": 5.544034197772775, "learning_rate": 1.9995093005010525e-06, "loss": 1.3568, "step": 5913 }, { "epoch": 0.04, "eval_loss": 1.5844674110412598, "eval_runtime": 4.6302, "eval_samples_per_second": 1.944, "eval_steps_per_second": 1.08, "step": 5913 }, { "epoch": 0.04, "grad_norm": 4.309657811569285, "learning_rate": 1.9995091342462972e-06, "loss": 1.3083, "step": 5914 }, { "epoch": 0.04, "grad_norm": 4.074796676870538, "learning_rate": 1.999508967963389e-06, "loss": 1.1975, "step": 5915 }, { "epoch": 0.04, "grad_norm": 5.473508339546319, "learning_rate": 1.999508801652328e-06, "loss": 1.3911, "step": 5916 }, { "epoch": 0.04, "grad_norm": 4.535355157473748, "learning_rate": 1.999508635313114e-06, "loss": 1.3463, "step": 5917 }, { "epoch": 0.04, "grad_norm": 6.007262203832133, "learning_rate": 1.999508468945747e-06, "loss": 1.3137, "step": 5918 }, { "epoch": 0.04, "grad_norm": 4.641325841138452, "learning_rate": 1.9995083025502274e-06, "loss": 1.3244, "step": 5919 }, { "epoch": 0.04, "grad_norm": 4.997494201134398, "learning_rate": 1.999508136126555e-06, "loss": 1.4651, "step": 5920 }, { "epoch": 0.04, "grad_norm": 4.696803924076454, "learning_rate": 1.99950796967473e-06, "loss": 1.3223, "step": 5921 }, { "epoch": 0.04, "grad_norm": 4.918928382790908, "learning_rate": 1.999507803194752e-06, "loss": 1.3454, "step": 5922 }, { "epoch": 0.04, "grad_norm": 4.179833935045301, "learning_rate": 1.999507636686621e-06, "loss": 1.2209, "step": 5923 }, { "epoch": 0.04, "grad_norm": 5.234228975042117, "learning_rate": 1.999507470150337e-06, "loss": 1.4995, "step": 5924 }, { "epoch": 0.04, "grad_norm": 4.655045641032911, "learning_rate": 1.9995073035859007e-06, "loss": 1.5456, "step": 5925 }, { "epoch": 0.04, "grad_norm": 4.773510902370617, "learning_rate": 1.9995071369933114e-06, "loss": 1.3974, "step": 5926 }, { "epoch": 0.04, "grad_norm": 5.00556653068369, "learning_rate": 1.9995069703725697e-06, "loss": 1.4259, "step": 5927 }, { "epoch": 0.04, "grad_norm": 4.5401022122412185, "learning_rate": 1.9995068037236748e-06, "loss": 1.483, "step": 5928 }, { "epoch": 0.04, "grad_norm": 4.709691191062438, "learning_rate": 1.999506637046627e-06, "loss": 1.4221, "step": 5929 }, { "epoch": 0.04, "grad_norm": 4.702714988914459, "learning_rate": 1.9995064703414267e-06, "loss": 1.4221, "step": 5930 }, { "epoch": 0.04, "grad_norm": 5.040955436678988, "learning_rate": 1.9995063036080733e-06, "loss": 1.2107, "step": 5931 }, { "epoch": 0.04, "grad_norm": 4.457439207799398, "learning_rate": 1.9995061368465674e-06, "loss": 1.3345, "step": 5932 }, { "epoch": 0.04, "grad_norm": 5.04342675186248, "learning_rate": 1.9995059700569087e-06, "loss": 1.5034, "step": 5933 }, { "epoch": 0.04, "grad_norm": 4.621796802281291, "learning_rate": 1.9995058032390976e-06, "loss": 1.5084, "step": 5934 }, { "epoch": 0.04, "grad_norm": 4.688268110316498, "learning_rate": 1.999505636393133e-06, "loss": 1.4742, "step": 5935 }, { "epoch": 0.04, "grad_norm": 4.573398355545929, "learning_rate": 1.999505469519016e-06, "loss": 1.4283, "step": 5936 }, { "epoch": 0.04, "grad_norm": 4.6985611282695, "learning_rate": 1.9995053026167463e-06, "loss": 1.3883, "step": 5937 }, { "epoch": 0.04, "grad_norm": 4.40560345297333, "learning_rate": 1.999505135686324e-06, "loss": 1.3317, "step": 5938 }, { "epoch": 0.04, "grad_norm": 4.922639671154126, "learning_rate": 1.9995049687277486e-06, "loss": 1.359, "step": 5939 }, { "epoch": 0.04, "grad_norm": 4.4801582116425696, "learning_rate": 1.999504801741021e-06, "loss": 1.4745, "step": 5940 }, { "epoch": 0.04, "grad_norm": 5.613542401971949, "learning_rate": 1.9995046347261403e-06, "loss": 1.3758, "step": 5941 }, { "epoch": 0.04, "grad_norm": 4.409524732476373, "learning_rate": 1.999504467683107e-06, "loss": 1.3319, "step": 5942 }, { "epoch": 0.04, "grad_norm": 4.9799732152965435, "learning_rate": 1.9995043006119208e-06, "loss": 1.2, "step": 5943 }, { "epoch": 0.04, "grad_norm": 4.631813453222979, "learning_rate": 1.999504133512582e-06, "loss": 1.4017, "step": 5944 }, { "epoch": 0.04, "grad_norm": 4.600129657133171, "learning_rate": 1.9995039663850903e-06, "loss": 1.24, "step": 5945 }, { "epoch": 0.04, "grad_norm": 4.486371082791539, "learning_rate": 1.9995037992294464e-06, "loss": 1.3778, "step": 5946 }, { "epoch": 0.04, "grad_norm": 5.171132986804107, "learning_rate": 1.9995036320456493e-06, "loss": 1.3241, "step": 5947 }, { "epoch": 0.04, "grad_norm": 4.665452496819232, "learning_rate": 1.9995034648336997e-06, "loss": 1.4238, "step": 5948 }, { "epoch": 0.04, "grad_norm": 4.239531997207046, "learning_rate": 1.9995032975935974e-06, "loss": 1.2457, "step": 5949 }, { "epoch": 0.04, "grad_norm": 5.429131485564488, "learning_rate": 1.9995031303253426e-06, "loss": 1.4648, "step": 5950 }, { "epoch": 0.04, "grad_norm": 6.80221506541541, "learning_rate": 1.999502963028935e-06, "loss": 1.303, "step": 5951 }, { "epoch": 0.04, "grad_norm": 4.355581661656206, "learning_rate": 1.9995027957043746e-06, "loss": 1.328, "step": 5952 }, { "epoch": 0.04, "grad_norm": 4.350286262168313, "learning_rate": 1.9995026283516613e-06, "loss": 1.3807, "step": 5953 }, { "epoch": 0.04, "grad_norm": 4.668615204584762, "learning_rate": 1.9995024609707956e-06, "loss": 1.5269, "step": 5954 }, { "epoch": 0.04, "grad_norm": 5.218858712941064, "learning_rate": 1.9995022935617775e-06, "loss": 1.4627, "step": 5955 }, { "epoch": 0.04, "grad_norm": 4.540247870765922, "learning_rate": 1.9995021261246066e-06, "loss": 1.402, "step": 5956 }, { "epoch": 0.04, "grad_norm": 6.091521512800939, "learning_rate": 1.9995019586592828e-06, "loss": 1.5715, "step": 5957 }, { "epoch": 0.04, "grad_norm": 5.050205181287205, "learning_rate": 1.9995017911658066e-06, "loss": 1.5067, "step": 5958 }, { "epoch": 0.04, "grad_norm": 4.681034138705226, "learning_rate": 1.999501623644178e-06, "loss": 1.5021, "step": 5959 }, { "epoch": 0.04, "grad_norm": 5.355200635607366, "learning_rate": 1.999501456094396e-06, "loss": 1.3429, "step": 5960 }, { "epoch": 0.04, "grad_norm": 5.782906237060128, "learning_rate": 1.9995012885164623e-06, "loss": 1.279, "step": 5961 }, { "epoch": 0.04, "grad_norm": 4.678982853380652, "learning_rate": 1.9995011209103753e-06, "loss": 1.3037, "step": 5962 }, { "epoch": 0.04, "grad_norm": 5.048427681457911, "learning_rate": 1.999500953276136e-06, "loss": 1.3877, "step": 5963 }, { "epoch": 0.04, "grad_norm": 4.857791243222419, "learning_rate": 1.999500785613744e-06, "loss": 1.3915, "step": 5964 }, { "epoch": 0.04, "grad_norm": 4.438412772207488, "learning_rate": 1.999500617923199e-06, "loss": 1.435, "step": 5965 }, { "epoch": 0.04, "grad_norm": 4.682024502629021, "learning_rate": 1.999500450204502e-06, "loss": 1.3254, "step": 5966 }, { "epoch": 0.04, "grad_norm": 5.212423092528759, "learning_rate": 1.999500282457652e-06, "loss": 1.3801, "step": 5967 }, { "epoch": 0.04, "grad_norm": 4.715213304433088, "learning_rate": 1.9995001146826495e-06, "loss": 1.3298, "step": 5968 }, { "epoch": 0.04, "grad_norm": 8.254454539617027, "learning_rate": 1.9994999468794943e-06, "loss": 1.5046, "step": 5969 }, { "epoch": 0.04, "grad_norm": 4.269343628993595, "learning_rate": 1.999499779048187e-06, "loss": 1.4125, "step": 5970 }, { "epoch": 0.04, "grad_norm": 5.626549882017871, "learning_rate": 1.9994996111887266e-06, "loss": 1.2426, "step": 5971 }, { "epoch": 0.04, "grad_norm": 11.198922453617236, "learning_rate": 1.9994994433011137e-06, "loss": 1.3183, "step": 5972 }, { "epoch": 0.04, "grad_norm": 7.419855634170617, "learning_rate": 1.9994992753853484e-06, "loss": 1.3999, "step": 5973 }, { "epoch": 0.04, "grad_norm": 4.846911637581678, "learning_rate": 1.9994991074414303e-06, "loss": 1.2708, "step": 5974 }, { "epoch": 0.04, "grad_norm": 5.014120565666695, "learning_rate": 1.99949893946936e-06, "loss": 1.6365, "step": 5975 }, { "epoch": 0.04, "grad_norm": 5.178644232377816, "learning_rate": 1.9994987714691368e-06, "loss": 1.3502, "step": 5976 }, { "epoch": 0.04, "grad_norm": 4.542372805906949, "learning_rate": 1.9994986034407614e-06, "loss": 1.3883, "step": 5977 }, { "epoch": 0.04, "grad_norm": 5.808897216899337, "learning_rate": 1.9994984353842332e-06, "loss": 1.5053, "step": 5978 }, { "epoch": 0.04, "grad_norm": 5.1164821645637595, "learning_rate": 1.999498267299552e-06, "loss": 1.3607, "step": 5979 }, { "epoch": 0.04, "grad_norm": 4.474593401794795, "learning_rate": 1.999498099186719e-06, "loss": 1.4336, "step": 5980 }, { "epoch": 0.04, "grad_norm": 4.873644324468687, "learning_rate": 1.9994979310457333e-06, "loss": 1.4006, "step": 5981 }, { "epoch": 0.04, "grad_norm": 4.311642601298582, "learning_rate": 1.999497762876595e-06, "loss": 1.2782, "step": 5982 }, { "epoch": 0.04, "grad_norm": 4.744912981689627, "learning_rate": 1.999497594679304e-06, "loss": 1.2587, "step": 5983 }, { "epoch": 0.04, "grad_norm": 4.691586123719747, "learning_rate": 1.999497426453861e-06, "loss": 1.4636, "step": 5984 }, { "epoch": 0.04, "grad_norm": 5.163744599314968, "learning_rate": 1.9994972582002645e-06, "loss": 1.5752, "step": 5985 }, { "epoch": 0.04, "grad_norm": 5.233356847720551, "learning_rate": 1.9994970899185166e-06, "loss": 1.491, "step": 5986 }, { "epoch": 0.04, "eval_loss": 1.57973051071167, "eval_runtime": 4.6236, "eval_samples_per_second": 1.947, "eval_steps_per_second": 1.081, "step": 5986 }, { "epoch": 0.04, "grad_norm": 4.444489653708122, "learning_rate": 1.9994969216086154e-06, "loss": 1.4479, "step": 5987 }, { "epoch": 0.04, "grad_norm": 4.456317831620817, "learning_rate": 1.999496753270562e-06, "loss": 1.2308, "step": 5988 }, { "epoch": 0.04, "grad_norm": 4.80305562733577, "learning_rate": 1.999496584904356e-06, "loss": 1.3326, "step": 5989 }, { "epoch": 0.04, "grad_norm": 5.246894737565416, "learning_rate": 1.9994964165099975e-06, "loss": 1.1137, "step": 5990 }, { "epoch": 0.04, "grad_norm": 4.877779248925687, "learning_rate": 1.9994962480874867e-06, "loss": 1.4882, "step": 5991 }, { "epoch": 0.04, "grad_norm": 6.069745894341453, "learning_rate": 1.999496079636823e-06, "loss": 1.2745, "step": 5992 }, { "epoch": 0.04, "grad_norm": 4.355660835187988, "learning_rate": 1.9994959111580074e-06, "loss": 1.3832, "step": 5993 }, { "epoch": 0.04, "grad_norm": 4.545739127455691, "learning_rate": 1.999495742651039e-06, "loss": 1.3298, "step": 5994 }, { "epoch": 0.04, "grad_norm": 4.811509406084908, "learning_rate": 1.999495574115918e-06, "loss": 1.4934, "step": 5995 }, { "epoch": 0.04, "grad_norm": 4.49568688412138, "learning_rate": 1.9994954055526448e-06, "loss": 1.2812, "step": 5996 }, { "epoch": 0.04, "grad_norm": 4.745405638081004, "learning_rate": 1.999495236961219e-06, "loss": 1.3719, "step": 5997 }, { "epoch": 0.04, "grad_norm": 4.933718580040205, "learning_rate": 1.9994950683416406e-06, "loss": 1.2649, "step": 5998 }, { "epoch": 0.04, "grad_norm": 8.145237802789559, "learning_rate": 1.9994948996939105e-06, "loss": 1.4663, "step": 5999 }, { "epoch": 0.04, "grad_norm": 4.247300965136955, "learning_rate": 1.999494731018027e-06, "loss": 1.2422, "step": 6000 }, { "epoch": 0.04, "grad_norm": 8.231488124753303, "learning_rate": 1.9994945623139918e-06, "loss": 1.2335, "step": 6001 }, { "epoch": 0.04, "grad_norm": 5.007640437152326, "learning_rate": 1.999494393581803e-06, "loss": 1.541, "step": 6002 }, { "epoch": 0.04, "grad_norm": 4.896291270748846, "learning_rate": 1.999494224821463e-06, "loss": 1.4031, "step": 6003 }, { "epoch": 0.04, "grad_norm": 4.518289699013999, "learning_rate": 1.99949405603297e-06, "loss": 1.3314, "step": 6004 }, { "epoch": 0.04, "grad_norm": 4.6119212601372315, "learning_rate": 1.999493887216325e-06, "loss": 1.3907, "step": 6005 }, { "epoch": 0.04, "grad_norm": 4.4530885445336965, "learning_rate": 1.9994937183715272e-06, "loss": 1.3936, "step": 6006 }, { "epoch": 0.04, "grad_norm": 4.9213357448105866, "learning_rate": 1.999493549498577e-06, "loss": 1.4903, "step": 6007 }, { "epoch": 0.04, "grad_norm": 4.6904663247591065, "learning_rate": 1.9994933805974744e-06, "loss": 1.461, "step": 6008 }, { "epoch": 0.04, "grad_norm": 4.678872170122384, "learning_rate": 1.9994932116682198e-06, "loss": 1.1427, "step": 6009 }, { "epoch": 0.04, "grad_norm": 4.3834153600403125, "learning_rate": 1.9994930427108123e-06, "loss": 1.3824, "step": 6010 }, { "epoch": 0.04, "grad_norm": 4.43701244049742, "learning_rate": 1.9994928737252524e-06, "loss": 1.3555, "step": 6011 }, { "epoch": 0.04, "grad_norm": 4.626164913717927, "learning_rate": 1.9994927047115406e-06, "loss": 1.3826, "step": 6012 }, { "epoch": 0.04, "grad_norm": 5.561295374282704, "learning_rate": 1.9994925356696763e-06, "loss": 1.4574, "step": 6013 }, { "epoch": 0.04, "grad_norm": 5.397353456854751, "learning_rate": 1.9994923665996592e-06, "loss": 1.4528, "step": 6014 }, { "epoch": 0.04, "grad_norm": 4.491319815817919, "learning_rate": 1.9994921975014897e-06, "loss": 1.5232, "step": 6015 }, { "epoch": 0.04, "grad_norm": 4.501428095250587, "learning_rate": 1.9994920283751682e-06, "loss": 1.2587, "step": 6016 }, { "epoch": 0.04, "grad_norm": 4.512731343644986, "learning_rate": 1.9994918592206943e-06, "loss": 1.3002, "step": 6017 }, { "epoch": 0.04, "grad_norm": 5.998960027927198, "learning_rate": 1.999491690038068e-06, "loss": 1.3417, "step": 6018 }, { "epoch": 0.04, "grad_norm": 4.810804381703738, "learning_rate": 1.9994915208272893e-06, "loss": 1.3283, "step": 6019 }, { "epoch": 0.04, "grad_norm": 5.1215818388209575, "learning_rate": 1.999491351588358e-06, "loss": 1.4758, "step": 6020 }, { "epoch": 0.04, "grad_norm": 5.2896966542088375, "learning_rate": 1.9994911823212746e-06, "loss": 1.2747, "step": 6021 }, { "epoch": 0.04, "grad_norm": 5.096806153855373, "learning_rate": 1.999491013026039e-06, "loss": 1.3602, "step": 6022 }, { "epoch": 0.04, "grad_norm": 4.645543689628549, "learning_rate": 1.999490843702651e-06, "loss": 1.2922, "step": 6023 }, { "epoch": 0.04, "grad_norm": 4.071431383230532, "learning_rate": 1.9994906743511107e-06, "loss": 1.1705, "step": 6024 }, { "epoch": 0.04, "grad_norm": 4.590727714784874, "learning_rate": 1.9994905049714176e-06, "loss": 1.1355, "step": 6025 }, { "epoch": 0.04, "grad_norm": 5.8136883080889215, "learning_rate": 1.999490335563573e-06, "loss": 1.3417, "step": 6026 }, { "epoch": 0.04, "grad_norm": 5.25521206259113, "learning_rate": 1.9994901661275752e-06, "loss": 1.4476, "step": 6027 }, { "epoch": 0.04, "grad_norm": 4.74701411592608, "learning_rate": 1.9994899966634256e-06, "loss": 1.3106, "step": 6028 }, { "epoch": 0.04, "grad_norm": 4.614328864811646, "learning_rate": 1.9994898271711237e-06, "loss": 1.4338, "step": 6029 }, { "epoch": 0.04, "grad_norm": 5.84079579780227, "learning_rate": 1.9994896576506693e-06, "loss": 1.4991, "step": 6030 }, { "epoch": 0.04, "grad_norm": 4.6138758879923145, "learning_rate": 1.999489488102063e-06, "loss": 1.3733, "step": 6031 }, { "epoch": 0.04, "grad_norm": 4.82965359871014, "learning_rate": 1.999489318525304e-06, "loss": 1.4025, "step": 6032 }, { "epoch": 0.04, "grad_norm": 4.392228963696012, "learning_rate": 1.9994891489203925e-06, "loss": 1.3522, "step": 6033 }, { "epoch": 0.04, "grad_norm": 4.47804718442672, "learning_rate": 1.9994889792873293e-06, "loss": 1.3479, "step": 6034 }, { "epoch": 0.04, "grad_norm": 4.958836670721066, "learning_rate": 1.9994888096261137e-06, "loss": 1.4792, "step": 6035 }, { "epoch": 0.04, "grad_norm": 4.478318902420378, "learning_rate": 1.9994886399367452e-06, "loss": 1.4007, "step": 6036 }, { "epoch": 0.04, "grad_norm": 4.8154935747508585, "learning_rate": 1.9994884702192252e-06, "loss": 1.4116, "step": 6037 }, { "epoch": 0.04, "grad_norm": 6.649886072462123, "learning_rate": 1.9994883004735524e-06, "loss": 1.5609, "step": 6038 }, { "epoch": 0.04, "grad_norm": 7.5489539998345085, "learning_rate": 1.9994881306997276e-06, "loss": 1.3799, "step": 6039 }, { "epoch": 0.04, "grad_norm": 4.503192357157364, "learning_rate": 1.9994879608977508e-06, "loss": 1.3552, "step": 6040 }, { "epoch": 0.04, "grad_norm": 4.6949075241063545, "learning_rate": 1.999487791067621e-06, "loss": 1.1173, "step": 6041 }, { "epoch": 0.04, "grad_norm": 4.283222990845595, "learning_rate": 1.9994876212093395e-06, "loss": 1.3951, "step": 6042 }, { "epoch": 0.04, "grad_norm": 6.4448198293069, "learning_rate": 1.999487451322906e-06, "loss": 1.1869, "step": 6043 }, { "epoch": 0.04, "grad_norm": 6.754015541765455, "learning_rate": 1.99948728140832e-06, "loss": 1.3206, "step": 6044 }, { "epoch": 0.04, "grad_norm": 5.451581378684238, "learning_rate": 1.9994871114655814e-06, "loss": 1.5363, "step": 6045 }, { "epoch": 0.04, "grad_norm": 5.036088049324541, "learning_rate": 1.999486941494691e-06, "loss": 1.3068, "step": 6046 }, { "epoch": 0.04, "grad_norm": 4.508300048630319, "learning_rate": 1.999486771495648e-06, "loss": 1.3017, "step": 6047 }, { "epoch": 0.04, "grad_norm": 5.219211182837951, "learning_rate": 1.9994866014684533e-06, "loss": 1.5394, "step": 6048 }, { "epoch": 0.04, "grad_norm": 4.6574978755119405, "learning_rate": 1.999486431413106e-06, "loss": 1.4524, "step": 6049 }, { "epoch": 0.04, "grad_norm": 4.646724372175653, "learning_rate": 1.999486261329607e-06, "loss": 1.3673, "step": 6050 }, { "epoch": 0.04, "grad_norm": 4.666735563091164, "learning_rate": 1.9994860912179553e-06, "loss": 1.3786, "step": 6051 }, { "epoch": 0.04, "grad_norm": 5.004039396047037, "learning_rate": 1.9994859210781512e-06, "loss": 1.4475, "step": 6052 }, { "epoch": 0.04, "grad_norm": 4.4559305433858, "learning_rate": 1.9994857509101956e-06, "loss": 1.403, "step": 6053 }, { "epoch": 0.04, "grad_norm": 4.200620839592182, "learning_rate": 1.999485580714087e-06, "loss": 1.4339, "step": 6054 }, { "epoch": 0.04, "grad_norm": 4.368923775818351, "learning_rate": 1.9994854104898268e-06, "loss": 1.3961, "step": 6055 }, { "epoch": 0.04, "grad_norm": 4.520758908192519, "learning_rate": 1.9994852402374143e-06, "loss": 1.3481, "step": 6056 }, { "epoch": 0.04, "grad_norm": 4.749790030035608, "learning_rate": 1.9994850699568495e-06, "loss": 1.3857, "step": 6057 }, { "epoch": 0.04, "grad_norm": 5.364813787560973, "learning_rate": 1.9994848996481327e-06, "loss": 1.4239, "step": 6058 }, { "epoch": 0.04, "grad_norm": 4.776545174481406, "learning_rate": 1.9994847293112635e-06, "loss": 1.3665, "step": 6059 }, { "epoch": 0.04, "eval_loss": 1.5814626216888428, "eval_runtime": 4.6295, "eval_samples_per_second": 1.944, "eval_steps_per_second": 1.08, "step": 6059 }, { "epoch": 0.04, "grad_norm": 4.4610124368355715, "learning_rate": 1.9994845589462423e-06, "loss": 1.2708, "step": 6060 }, { "epoch": 0.04, "grad_norm": 4.3555597781804085, "learning_rate": 1.999484388553069e-06, "loss": 1.2651, "step": 6061 }, { "epoch": 0.04, "grad_norm": 4.695709724087035, "learning_rate": 1.9994842181317435e-06, "loss": 1.5059, "step": 6062 }, { "epoch": 0.04, "grad_norm": 4.588933549078004, "learning_rate": 1.999484047682266e-06, "loss": 1.2791, "step": 6063 }, { "epoch": 0.04, "grad_norm": 4.4369556708064275, "learning_rate": 1.999483877204636e-06, "loss": 1.3224, "step": 6064 }, { "epoch": 0.04, "grad_norm": 4.981094341732428, "learning_rate": 1.9994837066988543e-06, "loss": 1.3515, "step": 6065 }, { "epoch": 0.04, "grad_norm": 4.827720040784978, "learning_rate": 1.99948353616492e-06, "loss": 1.7331, "step": 6066 }, { "epoch": 0.04, "grad_norm": 4.8644874693326825, "learning_rate": 1.999483365602834e-06, "loss": 1.3682, "step": 6067 }, { "epoch": 0.04, "grad_norm": 4.444503516764638, "learning_rate": 1.9994831950125956e-06, "loss": 1.3409, "step": 6068 }, { "epoch": 0.04, "grad_norm": 4.738312383862863, "learning_rate": 1.9994830243942052e-06, "loss": 1.3596, "step": 6069 }, { "epoch": 0.04, "grad_norm": 3.7407667912707794, "learning_rate": 1.999482853747663e-06, "loss": 1.0397, "step": 6070 }, { "epoch": 0.04, "grad_norm": 4.540780517516395, "learning_rate": 1.999482683072968e-06, "loss": 1.541, "step": 6071 }, { "epoch": 0.04, "grad_norm": 4.249051272582699, "learning_rate": 1.9994825123701214e-06, "loss": 1.2847, "step": 6072 }, { "epoch": 0.04, "grad_norm": 4.718535707840548, "learning_rate": 1.9994823416391222e-06, "loss": 1.4749, "step": 6073 }, { "epoch": 0.04, "grad_norm": 4.231595756329563, "learning_rate": 1.9994821708799715e-06, "loss": 1.2346, "step": 6074 }, { "epoch": 0.04, "grad_norm": 4.654180368226225, "learning_rate": 1.9994820000926684e-06, "loss": 1.3243, "step": 6075 }, { "epoch": 0.04, "grad_norm": 5.253237657249213, "learning_rate": 1.9994818292772133e-06, "loss": 1.0968, "step": 6076 }, { "epoch": 0.04, "grad_norm": 4.4294106093805015, "learning_rate": 1.999481658433606e-06, "loss": 1.3959, "step": 6077 }, { "epoch": 0.04, "grad_norm": 4.605849279958842, "learning_rate": 1.999481487561847e-06, "loss": 1.5796, "step": 6078 }, { "epoch": 0.04, "grad_norm": 4.604275515755657, "learning_rate": 1.9994813166619356e-06, "loss": 1.3411, "step": 6079 }, { "epoch": 0.04, "grad_norm": 4.5391827210489515, "learning_rate": 1.999481145733872e-06, "loss": 1.3292, "step": 6080 }, { "epoch": 0.04, "grad_norm": 4.258402011935969, "learning_rate": 1.9994809747776566e-06, "loss": 1.208, "step": 6081 }, { "epoch": 0.04, "grad_norm": 4.90922451177866, "learning_rate": 1.999480803793289e-06, "loss": 1.4242, "step": 6082 }, { "epoch": 0.04, "grad_norm": 5.413230793455484, "learning_rate": 1.9994806327807697e-06, "loss": 1.3969, "step": 6083 }, { "epoch": 0.04, "grad_norm": 4.790217331119937, "learning_rate": 1.999480461740098e-06, "loss": 1.6191, "step": 6084 }, { "epoch": 0.04, "grad_norm": 4.711049785581008, "learning_rate": 1.9994802906712744e-06, "loss": 1.3155, "step": 6085 }, { "epoch": 0.04, "grad_norm": 4.859462719164144, "learning_rate": 1.9994801195742986e-06, "loss": 1.4134, "step": 6086 }, { "epoch": 0.04, "grad_norm": 4.600983445317028, "learning_rate": 1.9994799484491708e-06, "loss": 1.2127, "step": 6087 }, { "epoch": 0.04, "grad_norm": 4.535710486992747, "learning_rate": 1.999479777295891e-06, "loss": 1.3211, "step": 6088 }, { "epoch": 0.04, "grad_norm": 5.276527942535898, "learning_rate": 1.9994796061144596e-06, "loss": 1.3883, "step": 6089 }, { "epoch": 0.04, "grad_norm": 4.588655871306414, "learning_rate": 1.9994794349048758e-06, "loss": 1.4562, "step": 6090 }, { "epoch": 0.04, "grad_norm": 4.358827615578689, "learning_rate": 1.99947926366714e-06, "loss": 1.2157, "step": 6091 }, { "epoch": 0.04, "grad_norm": 5.379107702441567, "learning_rate": 1.999479092401252e-06, "loss": 1.4984, "step": 6092 }, { "epoch": 0.04, "grad_norm": 4.703256990466054, "learning_rate": 1.999478921107212e-06, "loss": 1.4846, "step": 6093 }, { "epoch": 0.04, "grad_norm": 4.596203798715561, "learning_rate": 1.9994787497850204e-06, "loss": 1.2495, "step": 6094 }, { "epoch": 0.04, "grad_norm": 7.102533389263381, "learning_rate": 1.9994785784346766e-06, "loss": 1.5484, "step": 6095 }, { "epoch": 0.04, "grad_norm": 5.101495341051491, "learning_rate": 1.999478407056181e-06, "loss": 1.5624, "step": 6096 }, { "epoch": 0.04, "grad_norm": 4.540548930296532, "learning_rate": 1.999478235649533e-06, "loss": 1.446, "step": 6097 }, { "epoch": 0.04, "grad_norm": 5.20654688757371, "learning_rate": 1.999478064214733e-06, "loss": 1.361, "step": 6098 }, { "epoch": 0.04, "grad_norm": 4.987142852674508, "learning_rate": 1.9994778927517815e-06, "loss": 1.2327, "step": 6099 }, { "epoch": 0.04, "grad_norm": 4.455227469872403, "learning_rate": 1.999477721260678e-06, "loss": 1.2404, "step": 6100 }, { "epoch": 0.04, "grad_norm": 4.151831432678794, "learning_rate": 1.999477549741422e-06, "loss": 1.3637, "step": 6101 }, { "epoch": 0.04, "grad_norm": 4.4398108263471086, "learning_rate": 1.999477378194015e-06, "loss": 1.3926, "step": 6102 }, { "epoch": 0.04, "grad_norm": 4.638985304932603, "learning_rate": 1.9994772066184554e-06, "loss": 1.4549, "step": 6103 }, { "epoch": 0.04, "grad_norm": 5.764089849166333, "learning_rate": 1.9994770350147438e-06, "loss": 1.238, "step": 6104 }, { "epoch": 0.04, "grad_norm": 4.221141658122603, "learning_rate": 1.99947686338288e-06, "loss": 1.2808, "step": 6105 }, { "epoch": 0.04, "grad_norm": 4.987870381857472, "learning_rate": 1.999476691722865e-06, "loss": 1.4136, "step": 6106 }, { "epoch": 0.04, "grad_norm": 5.414943213916018, "learning_rate": 1.9994765200346975e-06, "loss": 1.408, "step": 6107 }, { "epoch": 0.04, "grad_norm": 4.590091578818411, "learning_rate": 1.9994763483183783e-06, "loss": 1.2204, "step": 6108 }, { "epoch": 0.04, "grad_norm": 4.918726076051846, "learning_rate": 1.9994761765739072e-06, "loss": 1.4237, "step": 6109 }, { "epoch": 0.04, "grad_norm": 4.429053489962411, "learning_rate": 1.999476004801284e-06, "loss": 1.368, "step": 6110 }, { "epoch": 0.04, "grad_norm": 4.637152365679712, "learning_rate": 1.999475833000509e-06, "loss": 1.3377, "step": 6111 }, { "epoch": 0.04, "grad_norm": 4.435475088674321, "learning_rate": 1.999475661171582e-06, "loss": 1.3944, "step": 6112 }, { "epoch": 0.04, "grad_norm": 4.556226538719296, "learning_rate": 1.9994754893145033e-06, "loss": 1.1661, "step": 6113 }, { "epoch": 0.04, "grad_norm": 4.8157589715874325, "learning_rate": 1.9994753174292727e-06, "loss": 1.249, "step": 6114 }, { "epoch": 0.04, "grad_norm": 5.433075189629411, "learning_rate": 1.99947514551589e-06, "loss": 1.174, "step": 6115 }, { "epoch": 0.04, "grad_norm": 4.553371378036626, "learning_rate": 1.9994749735743555e-06, "loss": 1.2733, "step": 6116 }, { "epoch": 0.04, "grad_norm": 4.678272941970609, "learning_rate": 1.9994748016046693e-06, "loss": 1.4607, "step": 6117 }, { "epoch": 0.04, "grad_norm": 4.738752956770592, "learning_rate": 1.9994746296068308e-06, "loss": 1.445, "step": 6118 }, { "epoch": 0.04, "grad_norm": 4.561523281822727, "learning_rate": 1.999474457580841e-06, "loss": 1.3853, "step": 6119 }, { "epoch": 0.04, "grad_norm": 4.703929268553739, "learning_rate": 1.999474285526699e-06, "loss": 1.4613, "step": 6120 }, { "epoch": 0.04, "grad_norm": 5.323945995405551, "learning_rate": 1.999474113444405e-06, "loss": 1.3089, "step": 6121 }, { "epoch": 0.04, "grad_norm": 4.737580808799884, "learning_rate": 1.999473941333959e-06, "loss": 1.3946, "step": 6122 }, { "epoch": 0.04, "grad_norm": 4.165283005591392, "learning_rate": 1.9994737691953615e-06, "loss": 1.4231, "step": 6123 }, { "epoch": 0.04, "grad_norm": 5.177058287172347, "learning_rate": 1.9994735970286123e-06, "loss": 1.4302, "step": 6124 }, { "epoch": 0.04, "grad_norm": 4.398244724544157, "learning_rate": 1.9994734248337106e-06, "loss": 1.2193, "step": 6125 }, { "epoch": 0.04, "grad_norm": 5.05904925787711, "learning_rate": 1.999473252610658e-06, "loss": 1.4165, "step": 6126 }, { "epoch": 0.04, "grad_norm": 4.329559284786233, "learning_rate": 1.9994730803594527e-06, "loss": 1.2999, "step": 6127 }, { "epoch": 0.04, "grad_norm": 4.301085581371965, "learning_rate": 1.999472908080096e-06, "loss": 1.3309, "step": 6128 }, { "epoch": 0.04, "grad_norm": 4.77761987141311, "learning_rate": 1.999472735772587e-06, "loss": 1.3824, "step": 6129 }, { "epoch": 0.04, "grad_norm": 4.516107530030336, "learning_rate": 1.999472563436927e-06, "loss": 1.2827, "step": 6130 }, { "epoch": 0.04, "grad_norm": 4.360677328930842, "learning_rate": 1.999472391073114e-06, "loss": 1.3214, "step": 6131 }, { "epoch": 0.04, "grad_norm": 9.74669626437974, "learning_rate": 1.9994722186811503e-06, "loss": 1.3294, "step": 6132 }, { "epoch": 0.04, "eval_loss": 1.582488775253296, "eval_runtime": 4.6323, "eval_samples_per_second": 1.943, "eval_steps_per_second": 1.079, "step": 6132 }, { "epoch": 0.04, "grad_norm": 4.964186150657791, "learning_rate": 1.9994720462610345e-06, "loss": 1.3245, "step": 6133 }, { "epoch": 0.04, "grad_norm": 5.018327538888818, "learning_rate": 1.9994718738127667e-06, "loss": 1.5288, "step": 6134 }, { "epoch": 0.04, "grad_norm": 4.400754960118608, "learning_rate": 1.9994717013363473e-06, "loss": 1.4085, "step": 6135 }, { "epoch": 0.04, "grad_norm": 4.718339302096536, "learning_rate": 1.999471528831776e-06, "loss": 1.3087, "step": 6136 }, { "epoch": 0.04, "grad_norm": 6.062922471058402, "learning_rate": 1.999471356299053e-06, "loss": 1.3564, "step": 6137 }, { "epoch": 0.04, "grad_norm": 4.937724979699985, "learning_rate": 1.999471183738178e-06, "loss": 1.3485, "step": 6138 }, { "epoch": 0.04, "grad_norm": 6.397936412448947, "learning_rate": 1.999471011149151e-06, "loss": 1.3899, "step": 6139 }, { "epoch": 0.04, "grad_norm": 4.264980015565267, "learning_rate": 1.999470838531973e-06, "loss": 1.2311, "step": 6140 }, { "epoch": 0.04, "grad_norm": 4.672186924733777, "learning_rate": 1.9994706658866426e-06, "loss": 1.4506, "step": 6141 }, { "epoch": 0.04, "grad_norm": 4.808623320860854, "learning_rate": 1.9994704932131606e-06, "loss": 1.2822, "step": 6142 }, { "epoch": 0.04, "grad_norm": 4.558147094577359, "learning_rate": 1.999470320511527e-06, "loss": 1.5793, "step": 6143 }, { "epoch": 0.04, "grad_norm": 4.387805393512093, "learning_rate": 1.9994701477817414e-06, "loss": 1.419, "step": 6144 }, { "epoch": 0.04, "grad_norm": 4.297311851383743, "learning_rate": 1.9994699750238043e-06, "loss": 1.3896, "step": 6145 }, { "epoch": 0.04, "grad_norm": 4.295559804098184, "learning_rate": 1.9994698022377156e-06, "loss": 0.8714, "step": 6146 }, { "epoch": 0.04, "grad_norm": 4.2967838935084455, "learning_rate": 1.999469629423475e-06, "loss": 1.3869, "step": 6147 }, { "epoch": 0.04, "grad_norm": 4.358455124495937, "learning_rate": 1.9994694565810822e-06, "loss": 1.3049, "step": 6148 }, { "epoch": 0.04, "grad_norm": 7.239151354778028, "learning_rate": 1.999469283710538e-06, "loss": 1.2637, "step": 6149 }, { "epoch": 0.04, "grad_norm": 4.6768981235943246, "learning_rate": 1.999469110811842e-06, "loss": 1.2972, "step": 6150 }, { "epoch": 0.04, "grad_norm": 5.169875277724359, "learning_rate": 1.999468937884995e-06, "loss": 1.2823, "step": 6151 }, { "epoch": 0.04, "grad_norm": 5.007450316135535, "learning_rate": 1.9994687649299954e-06, "loss": 1.3594, "step": 6152 }, { "epoch": 0.04, "grad_norm": 5.3284465856143, "learning_rate": 1.999468591946844e-06, "loss": 1.3988, "step": 6153 }, { "epoch": 0.04, "grad_norm": 4.631363634094369, "learning_rate": 1.9994684189355416e-06, "loss": 1.4882, "step": 6154 }, { "epoch": 0.04, "grad_norm": 4.955448139733144, "learning_rate": 1.999468245896087e-06, "loss": 1.4284, "step": 6155 }, { "epoch": 0.04, "grad_norm": 4.275900206069789, "learning_rate": 1.999468072828481e-06, "loss": 1.3695, "step": 6156 }, { "epoch": 0.04, "grad_norm": 5.484413834433741, "learning_rate": 1.999467899732723e-06, "loss": 1.4065, "step": 6157 }, { "epoch": 0.04, "grad_norm": 4.300235278427091, "learning_rate": 1.9994677266088135e-06, "loss": 1.4083, "step": 6158 }, { "epoch": 0.04, "grad_norm": 4.602670019650477, "learning_rate": 1.9994675534567524e-06, "loss": 1.4673, "step": 6159 }, { "epoch": 0.04, "grad_norm": 4.547191082336501, "learning_rate": 1.9994673802765397e-06, "loss": 1.4991, "step": 6160 }, { "epoch": 0.04, "grad_norm": 4.396399700331105, "learning_rate": 1.999467207068175e-06, "loss": 1.4652, "step": 6161 }, { "epoch": 0.04, "grad_norm": 4.090362508254198, "learning_rate": 1.9994670338316587e-06, "loss": 1.1855, "step": 6162 }, { "epoch": 0.04, "grad_norm": 4.7330609271931365, "learning_rate": 1.999466860566991e-06, "loss": 1.3576, "step": 6163 }, { "epoch": 0.04, "grad_norm": 4.563211520677828, "learning_rate": 1.999466687274171e-06, "loss": 1.3395, "step": 6164 }, { "epoch": 0.04, "grad_norm": 4.282961859072747, "learning_rate": 1.9994665139532e-06, "loss": 1.3582, "step": 6165 }, { "epoch": 0.04, "grad_norm": 4.652086898581341, "learning_rate": 1.999466340604077e-06, "loss": 1.3789, "step": 6166 }, { "epoch": 0.04, "grad_norm": 5.148255142346508, "learning_rate": 1.999466167226803e-06, "loss": 1.3128, "step": 6167 }, { "epoch": 0.04, "grad_norm": 4.950838977368306, "learning_rate": 1.9994659938213766e-06, "loss": 1.3493, "step": 6168 }, { "epoch": 0.04, "grad_norm": 4.539896650709607, "learning_rate": 1.999465820387799e-06, "loss": 1.3399, "step": 6169 }, { "epoch": 0.04, "grad_norm": 5.020568259141698, "learning_rate": 1.9994656469260694e-06, "loss": 1.4112, "step": 6170 }, { "epoch": 0.04, "grad_norm": 4.7142857191065985, "learning_rate": 1.9994654734361886e-06, "loss": 1.2468, "step": 6171 }, { "epoch": 0.04, "grad_norm": 5.16014302424319, "learning_rate": 1.999465299918156e-06, "loss": 1.4557, "step": 6172 }, { "epoch": 0.04, "grad_norm": 5.2521699890907, "learning_rate": 1.9994651263719716e-06, "loss": 1.7294, "step": 6173 }, { "epoch": 0.04, "grad_norm": 4.48021446105213, "learning_rate": 1.999464952797636e-06, "loss": 1.2958, "step": 6174 }, { "epoch": 0.04, "grad_norm": 4.706816248859854, "learning_rate": 1.9994647791951482e-06, "loss": 1.3717, "step": 6175 }, { "epoch": 0.04, "grad_norm": 4.240228006782147, "learning_rate": 1.9994646055645097e-06, "loss": 1.2771, "step": 6176 }, { "epoch": 0.04, "grad_norm": 4.677862593769692, "learning_rate": 1.9994644319057187e-06, "loss": 1.4664, "step": 6177 }, { "epoch": 0.04, "grad_norm": 4.836147231280031, "learning_rate": 1.9994642582187766e-06, "loss": 1.4567, "step": 6178 }, { "epoch": 0.04, "grad_norm": 5.0100709358719255, "learning_rate": 1.999464084503683e-06, "loss": 1.6372, "step": 6179 }, { "epoch": 0.04, "grad_norm": 4.66840316102352, "learning_rate": 1.999463910760437e-06, "loss": 1.3656, "step": 6180 }, { "epoch": 0.04, "grad_norm": 10.24341457309304, "learning_rate": 1.9994637369890404e-06, "loss": 1.4524, "step": 6181 }, { "epoch": 0.04, "grad_norm": 4.8244700633445765, "learning_rate": 1.999463563189492e-06, "loss": 1.3036, "step": 6182 }, { "epoch": 0.04, "grad_norm": 5.040044287910526, "learning_rate": 1.9994633893617916e-06, "loss": 1.4475, "step": 6183 }, { "epoch": 0.04, "grad_norm": 4.434311570026643, "learning_rate": 1.99946321550594e-06, "loss": 1.4527, "step": 6184 }, { "epoch": 0.04, "grad_norm": 6.915237503634814, "learning_rate": 1.999463041621937e-06, "loss": 1.5297, "step": 6185 }, { "epoch": 0.04, "grad_norm": 4.7972949551211075, "learning_rate": 1.9994628677097824e-06, "loss": 1.4167, "step": 6186 }, { "epoch": 0.04, "grad_norm": 4.417488185539121, "learning_rate": 1.999462693769476e-06, "loss": 1.2969, "step": 6187 }, { "epoch": 0.04, "grad_norm": 4.511259313928958, "learning_rate": 1.999462519801018e-06, "loss": 1.3152, "step": 6188 }, { "epoch": 0.04, "grad_norm": 4.581454435228433, "learning_rate": 1.9994623458044087e-06, "loss": 1.2407, "step": 6189 }, { "epoch": 0.04, "grad_norm": 4.645052729303026, "learning_rate": 1.999462171779648e-06, "loss": 1.3239, "step": 6190 }, { "epoch": 0.04, "grad_norm": 4.77228333435505, "learning_rate": 1.9994619977267354e-06, "loss": 1.4478, "step": 6191 }, { "epoch": 0.04, "grad_norm": 4.832032020936424, "learning_rate": 1.9994618236456714e-06, "loss": 1.2545, "step": 6192 }, { "epoch": 0.04, "grad_norm": 4.489886177222047, "learning_rate": 1.999461649536456e-06, "loss": 1.423, "step": 6193 }, { "epoch": 0.04, "grad_norm": 4.039063470302119, "learning_rate": 1.999461475399089e-06, "loss": 1.3277, "step": 6194 }, { "epoch": 0.04, "grad_norm": 4.997397507196713, "learning_rate": 1.9994613012335704e-06, "loss": 1.3689, "step": 6195 }, { "epoch": 0.04, "grad_norm": 4.543867728623865, "learning_rate": 1.9994611270399005e-06, "loss": 1.4871, "step": 6196 }, { "epoch": 0.04, "grad_norm": 4.6319875859501956, "learning_rate": 1.999460952818079e-06, "loss": 1.4085, "step": 6197 }, { "epoch": 0.04, "grad_norm": 4.618400072200484, "learning_rate": 1.9994607785681057e-06, "loss": 1.4328, "step": 6198 }, { "epoch": 0.04, "grad_norm": 4.415133663286954, "learning_rate": 1.9994606042899816e-06, "loss": 1.2672, "step": 6199 }, { "epoch": 0.04, "grad_norm": 4.402528926656821, "learning_rate": 1.9994604299837055e-06, "loss": 1.3727, "step": 6200 }, { "epoch": 0.04, "grad_norm": 4.3107896927872895, "learning_rate": 1.9994602556492783e-06, "loss": 1.3585, "step": 6201 }, { "epoch": 0.04, "grad_norm": 4.505844512239053, "learning_rate": 1.9994600812866995e-06, "loss": 1.3846, "step": 6202 }, { "epoch": 0.04, "grad_norm": 6.16017137211389, "learning_rate": 1.999459906895969e-06, "loss": 1.2515, "step": 6203 }, { "epoch": 0.04, "grad_norm": 4.2443583247966705, "learning_rate": 1.999459732477087e-06, "loss": 1.4439, "step": 6204 }, { "epoch": 0.04, "grad_norm": 5.29325492520201, "learning_rate": 1.999459558030054e-06, "loss": 1.4962, "step": 6205 }, { "epoch": 0.04, "eval_loss": 1.579491138458252, "eval_runtime": 4.6366, "eval_samples_per_second": 1.941, "eval_steps_per_second": 1.078, "step": 6205 }, { "epoch": 0.04, "grad_norm": 4.365961234878097, "learning_rate": 1.999459383554869e-06, "loss": 1.3323, "step": 6206 }, { "epoch": 0.04, "grad_norm": 4.438554607222995, "learning_rate": 1.9994592090515333e-06, "loss": 1.3669, "step": 6207 }, { "epoch": 0.04, "grad_norm": 4.799954840272582, "learning_rate": 1.9994590345200455e-06, "loss": 1.2788, "step": 6208 }, { "epoch": 0.04, "grad_norm": 5.911931732571483, "learning_rate": 1.9994588599604066e-06, "loss": 1.5406, "step": 6209 }, { "epoch": 0.04, "grad_norm": 4.570525123202556, "learning_rate": 1.999458685372616e-06, "loss": 1.4127, "step": 6210 }, { "epoch": 0.04, "grad_norm": 5.3863869493826755, "learning_rate": 1.9994585107566737e-06, "loss": 1.4656, "step": 6211 }, { "epoch": 0.04, "grad_norm": 4.432176310123156, "learning_rate": 1.9994583361125806e-06, "loss": 1.377, "step": 6212 }, { "epoch": 0.04, "grad_norm": 9.757045455039089, "learning_rate": 1.9994581614403363e-06, "loss": 1.3982, "step": 6213 }, { "epoch": 0.04, "grad_norm": 5.1490574463311996, "learning_rate": 1.99945798673994e-06, "loss": 1.3326, "step": 6214 }, { "epoch": 0.04, "grad_norm": 5.22065542765558, "learning_rate": 1.999457812011392e-06, "loss": 1.2758, "step": 6215 }, { "epoch": 0.04, "grad_norm": 4.269147559952856, "learning_rate": 1.9994576372546935e-06, "loss": 1.4082, "step": 6216 }, { "epoch": 0.04, "grad_norm": 4.26652065699978, "learning_rate": 1.999457462469843e-06, "loss": 1.3887, "step": 6217 }, { "epoch": 0.04, "grad_norm": 5.159510965803933, "learning_rate": 1.9994572876568413e-06, "loss": 1.3974, "step": 6218 }, { "epoch": 0.04, "grad_norm": 4.502074384420588, "learning_rate": 1.999457112815688e-06, "loss": 1.4528, "step": 6219 }, { "epoch": 0.04, "grad_norm": 10.971859173023756, "learning_rate": 1.9994569379463836e-06, "loss": 1.3884, "step": 6220 }, { "epoch": 0.04, "grad_norm": 4.622875778957151, "learning_rate": 1.999456763048928e-06, "loss": 1.4717, "step": 6221 }, { "epoch": 0.04, "grad_norm": 4.2659946954724, "learning_rate": 1.9994565881233205e-06, "loss": 1.2976, "step": 6222 }, { "epoch": 0.04, "grad_norm": 4.539960535806077, "learning_rate": 1.999456413169562e-06, "loss": 1.3798, "step": 6223 }, { "epoch": 0.04, "grad_norm": 5.16379763833579, "learning_rate": 1.999456238187652e-06, "loss": 1.4206, "step": 6224 }, { "epoch": 0.04, "grad_norm": 5.38333834598189, "learning_rate": 1.9994560631775907e-06, "loss": 1.6892, "step": 6225 }, { "epoch": 0.04, "grad_norm": 4.3252235100853165, "learning_rate": 1.9994558881393777e-06, "loss": 1.4032, "step": 6226 }, { "epoch": 0.04, "grad_norm": 4.638301138943027, "learning_rate": 1.9994557130730137e-06, "loss": 1.0862, "step": 6227 }, { "epoch": 0.04, "grad_norm": 4.199857407346218, "learning_rate": 1.9994555379784984e-06, "loss": 1.2343, "step": 6228 }, { "epoch": 0.04, "grad_norm": 4.460954646283919, "learning_rate": 1.9994553628558317e-06, "loss": 1.4285, "step": 6229 }, { "epoch": 0.04, "grad_norm": 5.22144764699454, "learning_rate": 1.9994551877050133e-06, "loss": 1.386, "step": 6230 }, { "epoch": 0.04, "grad_norm": 4.885479940540609, "learning_rate": 1.9994550125260443e-06, "loss": 1.4835, "step": 6231 }, { "epoch": 0.04, "grad_norm": 4.274801175086886, "learning_rate": 1.9994548373189236e-06, "loss": 1.3729, "step": 6232 }, { "epoch": 0.04, "grad_norm": 4.320719186443537, "learning_rate": 1.9994546620836514e-06, "loss": 1.4038, "step": 6233 }, { "epoch": 0.04, "grad_norm": 5.13127197033803, "learning_rate": 1.999454486820228e-06, "loss": 1.4569, "step": 6234 }, { "epoch": 0.04, "grad_norm": 4.708373616219464, "learning_rate": 1.9994543115286532e-06, "loss": 1.3605, "step": 6235 }, { "epoch": 0.04, "grad_norm": 4.381461293701964, "learning_rate": 1.999454136208927e-06, "loss": 1.3229, "step": 6236 }, { "epoch": 0.04, "grad_norm": 5.1886530394011965, "learning_rate": 1.99945396086105e-06, "loss": 1.4379, "step": 6237 }, { "epoch": 0.04, "grad_norm": 4.290838637917517, "learning_rate": 1.9994537854850217e-06, "loss": 1.3634, "step": 6238 }, { "epoch": 0.04, "grad_norm": 4.64763420157834, "learning_rate": 1.999453610080842e-06, "loss": 1.4798, "step": 6239 }, { "epoch": 0.04, "grad_norm": 4.850536566464667, "learning_rate": 1.9994534346485104e-06, "loss": 1.4149, "step": 6240 }, { "epoch": 0.04, "grad_norm": 4.7042830848839134, "learning_rate": 1.9994532591880282e-06, "loss": 1.4296, "step": 6241 }, { "epoch": 0.04, "grad_norm": 5.098811986755147, "learning_rate": 1.9994530836993945e-06, "loss": 1.5315, "step": 6242 }, { "epoch": 0.04, "grad_norm": 7.276713920050073, "learning_rate": 1.9994529081826097e-06, "loss": 1.4529, "step": 6243 }, { "epoch": 0.04, "grad_norm": 4.459359710809481, "learning_rate": 1.9994527326376732e-06, "loss": 1.3151, "step": 6244 }, { "epoch": 0.04, "grad_norm": 5.198277194946066, "learning_rate": 1.9994525570645857e-06, "loss": 1.2233, "step": 6245 }, { "epoch": 0.04, "grad_norm": 4.480675344921372, "learning_rate": 1.999452381463347e-06, "loss": 1.2578, "step": 6246 }, { "epoch": 0.04, "grad_norm": 4.398837651025428, "learning_rate": 1.999452205833957e-06, "loss": 1.3635, "step": 6247 }, { "epoch": 0.04, "grad_norm": 4.874983502384644, "learning_rate": 1.9994520301764157e-06, "loss": 1.3113, "step": 6248 }, { "epoch": 0.04, "grad_norm": 4.4619963771909115, "learning_rate": 1.9994518544907236e-06, "loss": 1.3168, "step": 6249 }, { "epoch": 0.04, "grad_norm": 4.8218187466622355, "learning_rate": 1.9994516787768795e-06, "loss": 1.3981, "step": 6250 }, { "epoch": 0.04, "grad_norm": 4.804911941840873, "learning_rate": 1.9994515030348847e-06, "loss": 1.3478, "step": 6251 }, { "epoch": 0.04, "grad_norm": 4.7732039290960095, "learning_rate": 1.9994513272647383e-06, "loss": 1.4151, "step": 6252 }, { "epoch": 0.04, "grad_norm": 4.782008114664108, "learning_rate": 1.999451151466441e-06, "loss": 1.3251, "step": 6253 }, { "epoch": 0.04, "grad_norm": 4.334209691403756, "learning_rate": 1.9994509756399925e-06, "loss": 1.3157, "step": 6254 }, { "epoch": 0.04, "grad_norm": 4.202503772855173, "learning_rate": 1.9994507997853927e-06, "loss": 1.3002, "step": 6255 }, { "epoch": 0.04, "grad_norm": 4.123562115337841, "learning_rate": 1.9994506239026417e-06, "loss": 1.1009, "step": 6256 }, { "epoch": 0.04, "grad_norm": 4.9891242612665145, "learning_rate": 1.9994504479917396e-06, "loss": 1.5414, "step": 6257 }, { "epoch": 0.04, "grad_norm": 5.048372778728673, "learning_rate": 1.9994502720526864e-06, "loss": 1.4656, "step": 6258 }, { "epoch": 0.04, "grad_norm": 4.31647391475051, "learning_rate": 1.9994500960854816e-06, "loss": 1.4439, "step": 6259 }, { "epoch": 0.04, "grad_norm": 11.075614754896375, "learning_rate": 1.9994499200901257e-06, "loss": 1.4309, "step": 6260 }, { "epoch": 0.04, "grad_norm": 5.092187210808303, "learning_rate": 1.999449744066619e-06, "loss": 1.4125, "step": 6261 }, { "epoch": 0.04, "grad_norm": 4.15856219538189, "learning_rate": 1.999449568014961e-06, "loss": 1.2085, "step": 6262 }, { "epoch": 0.04, "grad_norm": 6.063080042489394, "learning_rate": 1.9994493919351515e-06, "loss": 1.494, "step": 6263 }, { "epoch": 0.04, "grad_norm": 4.3053476529014905, "learning_rate": 1.999449215827191e-06, "loss": 1.3967, "step": 6264 }, { "epoch": 0.04, "grad_norm": 4.612544656213768, "learning_rate": 1.9994490396910793e-06, "loss": 1.3992, "step": 6265 }, { "epoch": 0.04, "grad_norm": 4.505894391854513, "learning_rate": 1.9994488635268166e-06, "loss": 1.2655, "step": 6266 }, { "epoch": 0.04, "grad_norm": 4.999160621922022, "learning_rate": 1.9994486873344027e-06, "loss": 1.4246, "step": 6267 }, { "epoch": 0.04, "grad_norm": 4.732689820779631, "learning_rate": 1.9994485111138376e-06, "loss": 1.4205, "step": 6268 }, { "epoch": 0.04, "grad_norm": 6.12552769248662, "learning_rate": 1.9994483348651214e-06, "loss": 1.5443, "step": 6269 }, { "epoch": 0.04, "grad_norm": 4.636498202092567, "learning_rate": 1.999448158588254e-06, "loss": 1.5138, "step": 6270 }, { "epoch": 0.04, "grad_norm": 4.869604032966136, "learning_rate": 1.9994479822832356e-06, "loss": 1.4206, "step": 6271 }, { "epoch": 0.04, "grad_norm": 4.42351752795691, "learning_rate": 1.999447805950066e-06, "loss": 1.402, "step": 6272 }, { "epoch": 0.04, "grad_norm": 4.68384467295785, "learning_rate": 1.9994476295887452e-06, "loss": 1.5068, "step": 6273 }, { "epoch": 0.04, "grad_norm": 4.522735507252104, "learning_rate": 1.9994474531992733e-06, "loss": 1.2736, "step": 6274 }, { "epoch": 0.04, "grad_norm": 4.041076017284593, "learning_rate": 1.9994472767816503e-06, "loss": 1.3421, "step": 6275 }, { "epoch": 0.04, "grad_norm": 4.352022325552487, "learning_rate": 1.9994471003358766e-06, "loss": 1.4047, "step": 6276 }, { "epoch": 0.04, "grad_norm": 5.048220907763825, "learning_rate": 1.9994469238619512e-06, "loss": 1.316, "step": 6277 }, { "epoch": 0.04, "grad_norm": 6.951951703927068, "learning_rate": 1.9994467473598752e-06, "loss": 1.4171, "step": 6278 }, { "epoch": 0.04, "eval_loss": 1.5816144943237305, "eval_runtime": 4.6245, "eval_samples_per_second": 1.946, "eval_steps_per_second": 1.081, "step": 6278 }, { "epoch": 0.04, "grad_norm": 4.545079796800474, "learning_rate": 1.9994465708296476e-06, "loss": 1.1521, "step": 6279 }, { "epoch": 0.04, "grad_norm": 4.532157179173343, "learning_rate": 1.9994463942712693e-06, "loss": 1.4189, "step": 6280 }, { "epoch": 0.04, "grad_norm": 4.518566058348504, "learning_rate": 1.99944621768474e-06, "loss": 1.3532, "step": 6281 }, { "epoch": 0.04, "grad_norm": 4.69897295484806, "learning_rate": 1.9994460410700593e-06, "loss": 1.4321, "step": 6282 }, { "epoch": 0.04, "grad_norm": 4.271392916778712, "learning_rate": 1.9994458644272275e-06, "loss": 1.3013, "step": 6283 }, { "epoch": 0.04, "grad_norm": 4.2870166818173585, "learning_rate": 1.999445687756245e-06, "loss": 1.2614, "step": 6284 }, { "epoch": 0.04, "grad_norm": 4.968990225655271, "learning_rate": 1.999445511057111e-06, "loss": 1.3301, "step": 6285 }, { "epoch": 0.04, "grad_norm": 5.316547171577142, "learning_rate": 1.999445334329826e-06, "loss": 1.5789, "step": 6286 }, { "epoch": 0.04, "grad_norm": 4.504139249836599, "learning_rate": 1.99944515757439e-06, "loss": 1.4502, "step": 6287 }, { "epoch": 0.04, "grad_norm": 4.354619549425394, "learning_rate": 1.9994449807908035e-06, "loss": 1.3688, "step": 6288 }, { "epoch": 0.04, "grad_norm": 4.858810023761249, "learning_rate": 1.9994448039790653e-06, "loss": 1.2367, "step": 6289 }, { "epoch": 0.04, "grad_norm": 5.072472600989854, "learning_rate": 1.9994446271391765e-06, "loss": 1.2932, "step": 6290 }, { "epoch": 0.04, "grad_norm": 6.996626783719462, "learning_rate": 1.9994444502711365e-06, "loss": 1.4289, "step": 6291 }, { "epoch": 0.04, "grad_norm": 5.768561890119225, "learning_rate": 1.9994442733749453e-06, "loss": 1.5282, "step": 6292 }, { "epoch": 0.04, "grad_norm": 4.592982885132441, "learning_rate": 1.999444096450603e-06, "loss": 1.4303, "step": 6293 }, { "epoch": 0.04, "grad_norm": 4.858330766036358, "learning_rate": 1.99944391949811e-06, "loss": 1.2728, "step": 6294 }, { "epoch": 0.04, "grad_norm": 4.962024401130429, "learning_rate": 1.999443742517466e-06, "loss": 1.3689, "step": 6295 }, { "epoch": 0.04, "grad_norm": 10.612791316508146, "learning_rate": 1.9994435655086706e-06, "loss": 1.4866, "step": 6296 }, { "epoch": 0.04, "grad_norm": 4.524703312983261, "learning_rate": 1.9994433884717246e-06, "loss": 1.2387, "step": 6297 }, { "epoch": 0.04, "grad_norm": 4.759956417902498, "learning_rate": 1.9994432114066275e-06, "loss": 1.4107, "step": 6298 }, { "epoch": 0.04, "grad_norm": 4.6006718394635895, "learning_rate": 1.9994430343133796e-06, "loss": 1.4975, "step": 6299 }, { "epoch": 0.04, "grad_norm": 4.399390391233103, "learning_rate": 1.9994428571919802e-06, "loss": 1.3366, "step": 6300 }, { "epoch": 0.04, "grad_norm": 4.350427703990483, "learning_rate": 1.9994426800424305e-06, "loss": 1.4663, "step": 6301 }, { "epoch": 0.04, "grad_norm": 4.311435118977647, "learning_rate": 1.9994425028647293e-06, "loss": 1.3505, "step": 6302 }, { "epoch": 0.04, "grad_norm": 4.088365246877882, "learning_rate": 1.9994423256588773e-06, "loss": 1.179, "step": 6303 }, { "epoch": 0.04, "grad_norm": 4.36004274390509, "learning_rate": 1.999442148424874e-06, "loss": 1.2554, "step": 6304 }, { "epoch": 0.04, "grad_norm": 4.64975464215782, "learning_rate": 1.9994419711627203e-06, "loss": 1.4389, "step": 6305 }, { "epoch": 0.04, "grad_norm": 4.7555821073965205, "learning_rate": 1.9994417938724153e-06, "loss": 1.3113, "step": 6306 }, { "epoch": 0.04, "grad_norm": 4.3218837365849705, "learning_rate": 1.9994416165539596e-06, "loss": 1.1372, "step": 6307 }, { "epoch": 0.04, "grad_norm": 4.624064755129902, "learning_rate": 1.9994414392073528e-06, "loss": 1.4236, "step": 6308 }, { "epoch": 0.04, "grad_norm": 4.674889228192624, "learning_rate": 1.999441261832595e-06, "loss": 1.3463, "step": 6309 }, { "epoch": 0.04, "grad_norm": 5.653073593634385, "learning_rate": 1.999441084429686e-06, "loss": 1.4913, "step": 6310 }, { "epoch": 0.04, "grad_norm": 5.153710008001566, "learning_rate": 1.9994409069986267e-06, "loss": 1.3682, "step": 6311 }, { "epoch": 0.04, "grad_norm": 4.533571582433127, "learning_rate": 1.999440729539416e-06, "loss": 1.3795, "step": 6312 }, { "epoch": 0.04, "grad_norm": 4.734295526473661, "learning_rate": 1.999440552052055e-06, "loss": 1.4166, "step": 6313 }, { "epoch": 0.04, "grad_norm": 4.733539046006419, "learning_rate": 1.9994403745365424e-06, "loss": 1.5861, "step": 6314 }, { "epoch": 0.04, "grad_norm": 4.728263074304334, "learning_rate": 1.9994401969928793e-06, "loss": 1.5769, "step": 6315 }, { "epoch": 0.04, "grad_norm": 5.272703253215903, "learning_rate": 1.999440019421065e-06, "loss": 1.5393, "step": 6316 }, { "epoch": 0.04, "grad_norm": 8.607529820210914, "learning_rate": 1.9994398418210996e-06, "loss": 1.3662, "step": 6317 }, { "epoch": 0.04, "grad_norm": 5.130572745049581, "learning_rate": 1.999439664192984e-06, "loss": 1.2736, "step": 6318 }, { "epoch": 0.04, "grad_norm": 6.926083900062192, "learning_rate": 1.9994394865367167e-06, "loss": 1.5641, "step": 6319 }, { "epoch": 0.04, "grad_norm": 4.756207440717079, "learning_rate": 1.999439308852299e-06, "loss": 1.36, "step": 6320 }, { "epoch": 0.04, "grad_norm": 4.500478062459338, "learning_rate": 1.9994391311397304e-06, "loss": 1.4026, "step": 6321 }, { "epoch": 0.04, "grad_norm": 4.977508838830679, "learning_rate": 1.999438953399011e-06, "loss": 1.4227, "step": 6322 }, { "epoch": 0.04, "grad_norm": 4.7239213779645315, "learning_rate": 1.9994387756301404e-06, "loss": 1.3146, "step": 6323 }, { "epoch": 0.04, "grad_norm": 6.338504632302492, "learning_rate": 1.9994385978331196e-06, "loss": 1.3008, "step": 6324 }, { "epoch": 0.04, "grad_norm": 4.2558563589541425, "learning_rate": 1.999438420007947e-06, "loss": 1.3107, "step": 6325 }, { "epoch": 0.04, "grad_norm": 5.20916111880159, "learning_rate": 1.9994382421546245e-06, "loss": 1.4738, "step": 6326 }, { "epoch": 0.04, "grad_norm": 5.444669708410852, "learning_rate": 1.9994380642731502e-06, "loss": 1.2841, "step": 6327 }, { "epoch": 0.04, "grad_norm": 4.6941929077487226, "learning_rate": 1.9994378863635257e-06, "loss": 1.3465, "step": 6328 }, { "epoch": 0.04, "grad_norm": 5.276286514949238, "learning_rate": 1.9994377084257504e-06, "loss": 1.3585, "step": 6329 }, { "epoch": 0.04, "grad_norm": 4.6074864102743796, "learning_rate": 1.999437530459824e-06, "loss": 1.2857, "step": 6330 }, { "epoch": 0.04, "grad_norm": 5.000588478317822, "learning_rate": 1.999437352465747e-06, "loss": 1.3529, "step": 6331 }, { "epoch": 0.04, "grad_norm": 4.106900934498859, "learning_rate": 1.9994371744435186e-06, "loss": 1.267, "step": 6332 }, { "epoch": 0.04, "grad_norm": 5.259397656663777, "learning_rate": 1.99943699639314e-06, "loss": 1.5369, "step": 6333 }, { "epoch": 0.04, "grad_norm": 4.61118613993078, "learning_rate": 1.9994368183146103e-06, "loss": 1.4563, "step": 6334 }, { "epoch": 0.04, "grad_norm": 4.528294736254403, "learning_rate": 1.99943664020793e-06, "loss": 1.327, "step": 6335 }, { "epoch": 0.04, "grad_norm": 4.176025018458495, "learning_rate": 1.9994364620730987e-06, "loss": 1.2841, "step": 6336 }, { "epoch": 0.04, "grad_norm": 5.206068572012066, "learning_rate": 1.9994362839101164e-06, "loss": 1.2684, "step": 6337 }, { "epoch": 0.04, "grad_norm": 4.5141779201602565, "learning_rate": 1.9994361057189835e-06, "loss": 1.4944, "step": 6338 }, { "epoch": 0.04, "grad_norm": 4.582058411839732, "learning_rate": 1.9994359274996997e-06, "loss": 1.4789, "step": 6339 }, { "epoch": 0.04, "grad_norm": 5.979282453781429, "learning_rate": 1.9994357492522657e-06, "loss": 1.268, "step": 6340 }, { "epoch": 0.04, "grad_norm": 4.88222482949607, "learning_rate": 1.99943557097668e-06, "loss": 1.4104, "step": 6341 }, { "epoch": 0.04, "grad_norm": 12.808308161297045, "learning_rate": 1.9994353926729443e-06, "loss": 1.3816, "step": 6342 }, { "epoch": 0.04, "grad_norm": 5.070605057742127, "learning_rate": 1.9994352143410577e-06, "loss": 1.4017, "step": 6343 }, { "epoch": 0.04, "grad_norm": 4.0917949640384395, "learning_rate": 1.99943503598102e-06, "loss": 1.4013, "step": 6344 }, { "epoch": 0.04, "grad_norm": 4.409104461398939, "learning_rate": 1.9994348575928316e-06, "loss": 1.315, "step": 6345 }, { "epoch": 0.04, "grad_norm": 4.939187610417441, "learning_rate": 1.999434679176493e-06, "loss": 1.4322, "step": 6346 }, { "epoch": 0.04, "grad_norm": 5.402134105272472, "learning_rate": 1.999434500732003e-06, "loss": 1.4955, "step": 6347 }, { "epoch": 0.04, "grad_norm": 3.8902591917310105, "learning_rate": 1.9994343222593624e-06, "loss": 1.2315, "step": 6348 }, { "epoch": 0.04, "grad_norm": 4.266057797365926, "learning_rate": 1.999434143758571e-06, "loss": 1.3174, "step": 6349 }, { "epoch": 0.04, "grad_norm": 4.461643920968673, "learning_rate": 1.999433965229629e-06, "loss": 1.3411, "step": 6350 }, { "epoch": 0.04, "grad_norm": 4.2658767299471485, "learning_rate": 1.9994337866725364e-06, "loss": 1.199, "step": 6351 }, { "epoch": 0.04, "eval_loss": 1.5781373977661133, "eval_runtime": 4.6354, "eval_samples_per_second": 1.942, "eval_steps_per_second": 1.079, "step": 6351 }, { "epoch": 0.04, "grad_norm": 4.408669931585522, "learning_rate": 1.999433608087293e-06, "loss": 1.1444, "step": 6352 }, { "epoch": 0.04, "grad_norm": 5.086641760711587, "learning_rate": 1.9994334294738987e-06, "loss": 1.5424, "step": 6353 }, { "epoch": 0.04, "grad_norm": 6.1703983485337295, "learning_rate": 1.999433250832354e-06, "loss": 1.4033, "step": 6354 }, { "epoch": 0.04, "grad_norm": 4.532046025207658, "learning_rate": 1.9994330721626583e-06, "loss": 1.4032, "step": 6355 }, { "epoch": 0.04, "grad_norm": 6.650891641794448, "learning_rate": 1.999432893464812e-06, "loss": 1.4061, "step": 6356 }, { "epoch": 0.04, "grad_norm": 7.351888233552044, "learning_rate": 1.999432714738815e-06, "loss": 1.4295, "step": 6357 }, { "epoch": 0.04, "grad_norm": 5.214317271172098, "learning_rate": 1.999432535984667e-06, "loss": 1.293, "step": 6358 }, { "epoch": 0.04, "grad_norm": 4.95374842874618, "learning_rate": 1.9994323572023687e-06, "loss": 1.5432, "step": 6359 }, { "epoch": 0.04, "grad_norm": 4.775499286199467, "learning_rate": 1.9994321783919195e-06, "loss": 1.437, "step": 6360 }, { "epoch": 0.04, "grad_norm": 4.617717458939363, "learning_rate": 1.9994319995533196e-06, "loss": 1.4424, "step": 6361 }, { "epoch": 0.04, "grad_norm": 5.144217405727594, "learning_rate": 1.9994318206865694e-06, "loss": 1.2367, "step": 6362 }, { "epoch": 0.04, "grad_norm": 4.8940168055056965, "learning_rate": 1.999431641791668e-06, "loss": 1.3389, "step": 6363 }, { "epoch": 0.04, "grad_norm": 4.092902014867667, "learning_rate": 1.9994314628686165e-06, "loss": 1.3561, "step": 6364 }, { "epoch": 0.04, "grad_norm": 5.882905609095015, "learning_rate": 1.9994312839174137e-06, "loss": 1.4295, "step": 6365 }, { "epoch": 0.04, "grad_norm": 4.366134689547483, "learning_rate": 1.9994311049380607e-06, "loss": 1.2848, "step": 6366 }, { "epoch": 0.04, "grad_norm": 4.985913870444529, "learning_rate": 1.9994309259305573e-06, "loss": 1.32, "step": 6367 }, { "epoch": 0.04, "grad_norm": 4.549555296366963, "learning_rate": 1.999430746894903e-06, "loss": 1.3335, "step": 6368 }, { "epoch": 0.04, "grad_norm": 4.652777329356844, "learning_rate": 1.999430567831098e-06, "loss": 1.5243, "step": 6369 }, { "epoch": 0.04, "grad_norm": 5.3407807539112175, "learning_rate": 1.999430388739142e-06, "loss": 1.3471, "step": 6370 }, { "epoch": 0.04, "grad_norm": 5.0112933247838205, "learning_rate": 1.9994302096190355e-06, "loss": 1.3587, "step": 6371 }, { "epoch": 0.04, "grad_norm": 4.438187876242872, "learning_rate": 1.999430030470779e-06, "loss": 1.2903, "step": 6372 }, { "epoch": 0.04, "grad_norm": 4.638495585852088, "learning_rate": 1.9994298512943713e-06, "loss": 1.3461, "step": 6373 }, { "epoch": 0.04, "grad_norm": 4.865120368050272, "learning_rate": 1.9994296720898134e-06, "loss": 1.2982, "step": 6374 }, { "epoch": 0.04, "grad_norm": 4.8793372672799, "learning_rate": 1.9994294928571043e-06, "loss": 1.3857, "step": 6375 }, { "epoch": 0.04, "grad_norm": 4.955497500367994, "learning_rate": 1.999429313596245e-06, "loss": 1.3042, "step": 6376 }, { "epoch": 0.04, "grad_norm": 6.011424136306247, "learning_rate": 1.9994291343072352e-06, "loss": 1.2827, "step": 6377 }, { "epoch": 0.04, "grad_norm": 10.28883374989166, "learning_rate": 1.9994289549900744e-06, "loss": 1.3462, "step": 6378 }, { "epoch": 0.04, "grad_norm": 5.538496793890718, "learning_rate": 1.9994287756447633e-06, "loss": 1.3973, "step": 6379 }, { "epoch": 0.04, "grad_norm": 4.195698955703969, "learning_rate": 1.9994285962713015e-06, "loss": 1.2351, "step": 6380 }, { "epoch": 0.04, "grad_norm": 4.904330193504931, "learning_rate": 1.9994284168696894e-06, "loss": 1.3985, "step": 6381 }, { "epoch": 0.04, "grad_norm": 4.6995517560985585, "learning_rate": 1.999428237439926e-06, "loss": 1.3197, "step": 6382 }, { "epoch": 0.04, "grad_norm": 4.946955664301173, "learning_rate": 1.999428057982013e-06, "loss": 1.3516, "step": 6383 }, { "epoch": 0.04, "grad_norm": 5.1295631486974935, "learning_rate": 1.9994278784959488e-06, "loss": 1.2706, "step": 6384 }, { "epoch": 0.04, "grad_norm": 5.337561708214265, "learning_rate": 1.9994276989817342e-06, "loss": 1.6488, "step": 6385 }, { "epoch": 0.04, "grad_norm": 5.474093333123212, "learning_rate": 1.999427519439369e-06, "loss": 1.4426, "step": 6386 }, { "epoch": 0.04, "grad_norm": 4.828382018951851, "learning_rate": 1.9994273398688534e-06, "loss": 1.3534, "step": 6387 }, { "epoch": 0.04, "grad_norm": 6.625670548840686, "learning_rate": 1.999427160270187e-06, "loss": 1.496, "step": 6388 }, { "epoch": 0.04, "grad_norm": 5.016705834884284, "learning_rate": 1.9994269806433706e-06, "loss": 1.2965, "step": 6389 }, { "epoch": 0.04, "grad_norm": 4.211284625383145, "learning_rate": 1.9994268009884033e-06, "loss": 1.2358, "step": 6390 }, { "epoch": 0.04, "grad_norm": 4.483301995120687, "learning_rate": 1.9994266213052853e-06, "loss": 1.4071, "step": 6391 }, { "epoch": 0.04, "grad_norm": 4.566025075672998, "learning_rate": 1.999426441594017e-06, "loss": 1.1719, "step": 6392 }, { "epoch": 0.04, "grad_norm": 4.384949958781413, "learning_rate": 1.999426261854598e-06, "loss": 1.3664, "step": 6393 }, { "epoch": 0.04, "grad_norm": 4.828241650521746, "learning_rate": 1.9994260820870286e-06, "loss": 1.3534, "step": 6394 }, { "epoch": 0.04, "grad_norm": 4.953915752038013, "learning_rate": 1.9994259022913086e-06, "loss": 1.502, "step": 6395 }, { "epoch": 0.04, "grad_norm": 5.173951460100117, "learning_rate": 1.9994257224674383e-06, "loss": 1.3728, "step": 6396 }, { "epoch": 0.04, "grad_norm": 6.041587682087497, "learning_rate": 1.9994255426154173e-06, "loss": 1.5582, "step": 6397 }, { "epoch": 0.04, "grad_norm": 5.001609119616925, "learning_rate": 1.9994253627352464e-06, "loss": 1.4269, "step": 6398 }, { "epoch": 0.04, "grad_norm": 5.193209674784823, "learning_rate": 1.9994251828269243e-06, "loss": 1.5906, "step": 6399 }, { "epoch": 0.04, "grad_norm": 4.624168421729793, "learning_rate": 1.999425002890452e-06, "loss": 1.3097, "step": 6400 }, { "epoch": 0.04, "grad_norm": 4.951419782000335, "learning_rate": 1.999424822925829e-06, "loss": 1.4123, "step": 6401 }, { "epoch": 0.04, "grad_norm": 4.401781065049857, "learning_rate": 1.9994246429330556e-06, "loss": 1.397, "step": 6402 }, { "epoch": 0.04, "grad_norm": 4.539755135839826, "learning_rate": 1.999424462912132e-06, "loss": 1.3766, "step": 6403 }, { "epoch": 0.04, "grad_norm": 4.615144721788273, "learning_rate": 1.9994242828630576e-06, "loss": 1.3266, "step": 6404 }, { "epoch": 0.04, "grad_norm": 4.5571458653270795, "learning_rate": 1.999424102785833e-06, "loss": 1.4187, "step": 6405 }, { "epoch": 0.04, "grad_norm": 3.9757551489723655, "learning_rate": 1.999423922680458e-06, "loss": 1.2183, "step": 6406 }, { "epoch": 0.04, "grad_norm": 4.172502066129642, "learning_rate": 1.999423742546932e-06, "loss": 1.2483, "step": 6407 }, { "epoch": 0.04, "grad_norm": 5.885945432469437, "learning_rate": 1.9994235623852564e-06, "loss": 1.3647, "step": 6408 }, { "epoch": 0.04, "grad_norm": 3.982087242107799, "learning_rate": 1.9994233821954298e-06, "loss": 1.2359, "step": 6409 }, { "epoch": 0.04, "grad_norm": 4.742122434359335, "learning_rate": 1.999423201977453e-06, "loss": 1.3547, "step": 6410 }, { "epoch": 0.04, "grad_norm": 4.244647607738897, "learning_rate": 1.999423021731325e-06, "loss": 1.2044, "step": 6411 }, { "epoch": 0.04, "grad_norm": 8.56850074865655, "learning_rate": 1.9994228414570476e-06, "loss": 1.1119, "step": 6412 }, { "epoch": 0.04, "grad_norm": 4.304553214122205, "learning_rate": 1.9994226611546193e-06, "loss": 1.36, "step": 6413 }, { "epoch": 0.04, "grad_norm": 5.175298560373722, "learning_rate": 1.999422480824041e-06, "loss": 1.4503, "step": 6414 }, { "epoch": 0.04, "grad_norm": 4.196476422325661, "learning_rate": 1.9994223004653115e-06, "loss": 1.184, "step": 6415 }, { "epoch": 0.04, "grad_norm": 4.796702423365602, "learning_rate": 1.9994221200784323e-06, "loss": 1.485, "step": 6416 }, { "epoch": 0.04, "grad_norm": 5.937989100609199, "learning_rate": 1.9994219396634025e-06, "loss": 1.391, "step": 6417 }, { "epoch": 0.04, "grad_norm": 6.102337030987837, "learning_rate": 1.9994217592202223e-06, "loss": 1.3259, "step": 6418 }, { "epoch": 0.04, "grad_norm": 4.618945350087312, "learning_rate": 1.999421578748892e-06, "loss": 1.4256, "step": 6419 }, { "epoch": 0.04, "grad_norm": 6.480567897392034, "learning_rate": 1.999421398249411e-06, "loss": 1.4871, "step": 6420 }, { "epoch": 0.04, "grad_norm": 5.20652740178265, "learning_rate": 1.9994212177217797e-06, "loss": 1.4581, "step": 6421 }, { "epoch": 0.04, "grad_norm": 4.875499810018957, "learning_rate": 1.999421037165998e-06, "loss": 1.4507, "step": 6422 }, { "epoch": 0.04, "grad_norm": 4.692646497309668, "learning_rate": 1.999420856582066e-06, "loss": 1.3856, "step": 6423 }, { "epoch": 0.04, "grad_norm": 4.955834152100426, "learning_rate": 1.999420675969983e-06, "loss": 1.5078, "step": 6424 }, { "epoch": 0.04, "eval_loss": 1.5750279426574707, "eval_runtime": 4.6263, "eval_samples_per_second": 1.945, "eval_steps_per_second": 1.081, "step": 6424 }, { "epoch": 0.04, "grad_norm": 4.288250969454204, "learning_rate": 1.9994204953297506e-06, "loss": 1.3956, "step": 6425 }, { "epoch": 0.04, "grad_norm": 4.484110580588845, "learning_rate": 1.999420314661367e-06, "loss": 1.3207, "step": 6426 }, { "epoch": 0.04, "grad_norm": 4.5818655003589335, "learning_rate": 1.999420133964834e-06, "loss": 1.3543, "step": 6427 }, { "epoch": 0.04, "grad_norm": 4.335650481892688, "learning_rate": 1.99941995324015e-06, "loss": 1.346, "step": 6428 }, { "epoch": 0.04, "grad_norm": 4.584925861840009, "learning_rate": 1.999419772487316e-06, "loss": 1.2897, "step": 6429 }, { "epoch": 0.04, "grad_norm": 5.034299329671145, "learning_rate": 1.9994195917063313e-06, "loss": 1.2994, "step": 6430 }, { "epoch": 0.04, "grad_norm": 4.39703051263187, "learning_rate": 1.9994194108971965e-06, "loss": 1.3563, "step": 6431 }, { "epoch": 0.04, "grad_norm": 5.035026362187114, "learning_rate": 1.9994192300599114e-06, "loss": 1.4089, "step": 6432 }, { "epoch": 0.04, "grad_norm": 4.298010335268097, "learning_rate": 1.999419049194476e-06, "loss": 1.282, "step": 6433 }, { "epoch": 0.04, "grad_norm": 5.887159046755277, "learning_rate": 1.99941886830089e-06, "loss": 1.4499, "step": 6434 }, { "epoch": 0.04, "grad_norm": 4.388487867895592, "learning_rate": 1.999418687379154e-06, "loss": 1.3262, "step": 6435 }, { "epoch": 0.04, "grad_norm": 4.3208523880153225, "learning_rate": 1.9994185064292677e-06, "loss": 1.2983, "step": 6436 }, { "epoch": 0.04, "grad_norm": 4.1454312784673375, "learning_rate": 1.999418325451231e-06, "loss": 1.373, "step": 6437 }, { "epoch": 0.04, "grad_norm": 4.777390601722309, "learning_rate": 1.9994181444450443e-06, "loss": 1.3201, "step": 6438 }, { "epoch": 0.04, "grad_norm": 4.828976905582328, "learning_rate": 1.9994179634107067e-06, "loss": 1.421, "step": 6439 }, { "epoch": 0.04, "grad_norm": 5.337441758999037, "learning_rate": 1.9994177823482193e-06, "loss": 1.3926, "step": 6440 }, { "epoch": 0.04, "grad_norm": 4.694175024714312, "learning_rate": 1.9994176012575815e-06, "loss": 1.2838, "step": 6441 }, { "epoch": 0.04, "grad_norm": 4.685904902812024, "learning_rate": 1.9994174201387935e-06, "loss": 1.3887, "step": 6442 }, { "epoch": 0.04, "grad_norm": 4.494170633022454, "learning_rate": 1.999417238991855e-06, "loss": 1.4851, "step": 6443 }, { "epoch": 0.04, "grad_norm": 4.266652047787922, "learning_rate": 1.9994170578167666e-06, "loss": 1.4226, "step": 6444 }, { "epoch": 0.04, "grad_norm": 5.827319310398469, "learning_rate": 1.9994168766135277e-06, "loss": 1.4613, "step": 6445 }, { "epoch": 0.04, "grad_norm": 4.661751062954241, "learning_rate": 1.9994166953821385e-06, "loss": 1.405, "step": 6446 }, { "epoch": 0.04, "grad_norm": 4.65070193266159, "learning_rate": 1.999416514122599e-06, "loss": 1.4581, "step": 6447 }, { "epoch": 0.04, "grad_norm": 4.435701869441354, "learning_rate": 1.9994163328349096e-06, "loss": 1.311, "step": 6448 }, { "epoch": 0.04, "grad_norm": 5.190693820934879, "learning_rate": 1.9994161515190695e-06, "loss": 1.424, "step": 6449 }, { "epoch": 0.04, "grad_norm": 6.1600631203013645, "learning_rate": 1.9994159701750796e-06, "loss": 1.2895, "step": 6450 }, { "epoch": 0.04, "grad_norm": 4.524658510557533, "learning_rate": 1.9994157888029393e-06, "loss": 1.4345, "step": 6451 }, { "epoch": 0.04, "grad_norm": 4.7250968475325505, "learning_rate": 1.999415607402649e-06, "loss": 1.3938, "step": 6452 }, { "epoch": 0.04, "grad_norm": 4.649665256938675, "learning_rate": 1.9994154259742084e-06, "loss": 1.3559, "step": 6453 }, { "epoch": 0.04, "grad_norm": 6.143142432701419, "learning_rate": 1.9994152445176177e-06, "loss": 1.4682, "step": 6454 }, { "epoch": 0.04, "grad_norm": 4.512144981521412, "learning_rate": 1.9994150630328763e-06, "loss": 1.2608, "step": 6455 }, { "epoch": 0.04, "grad_norm": 4.354098216943942, "learning_rate": 1.999414881519985e-06, "loss": 1.3093, "step": 6456 }, { "epoch": 0.04, "grad_norm": 4.747209802822031, "learning_rate": 1.9994146999789434e-06, "loss": 1.3977, "step": 6457 }, { "epoch": 0.04, "grad_norm": 4.420806814005281, "learning_rate": 1.9994145184097516e-06, "loss": 1.258, "step": 6458 }, { "epoch": 0.04, "grad_norm": 4.738170951265542, "learning_rate": 1.99941433681241e-06, "loss": 1.3438, "step": 6459 }, { "epoch": 0.04, "grad_norm": 4.921549650510274, "learning_rate": 1.999414155186918e-06, "loss": 1.3681, "step": 6460 }, { "epoch": 0.04, "grad_norm": 4.535870368528579, "learning_rate": 1.999413973533276e-06, "loss": 1.4082, "step": 6461 }, { "epoch": 0.04, "grad_norm": 4.583275250465685, "learning_rate": 1.9994137918514833e-06, "loss": 1.4157, "step": 6462 }, { "epoch": 0.04, "grad_norm": 4.398020842994636, "learning_rate": 1.999413610141541e-06, "loss": 1.3372, "step": 6463 }, { "epoch": 0.04, "grad_norm": 5.91739482389443, "learning_rate": 1.999413428403448e-06, "loss": 1.2848, "step": 6464 }, { "epoch": 0.04, "grad_norm": 4.776662182581634, "learning_rate": 1.999413246637205e-06, "loss": 1.2498, "step": 6465 }, { "epoch": 0.04, "grad_norm": 4.405948852996063, "learning_rate": 1.999413064842812e-06, "loss": 1.3386, "step": 6466 }, { "epoch": 0.04, "grad_norm": 4.743294100246146, "learning_rate": 1.999412883020269e-06, "loss": 1.3876, "step": 6467 }, { "epoch": 0.04, "grad_norm": 4.542031140946786, "learning_rate": 1.9994127011695757e-06, "loss": 1.3232, "step": 6468 }, { "epoch": 0.04, "grad_norm": 4.575891824314149, "learning_rate": 1.9994125192907323e-06, "loss": 1.4565, "step": 6469 }, { "epoch": 0.04, "grad_norm": 4.632671137142004, "learning_rate": 1.9994123373837386e-06, "loss": 1.4176, "step": 6470 }, { "epoch": 0.04, "grad_norm": 4.386976180987225, "learning_rate": 1.999412155448595e-06, "loss": 1.3675, "step": 6471 }, { "epoch": 0.04, "grad_norm": 5.800846908721324, "learning_rate": 1.999411973485301e-06, "loss": 1.4312, "step": 6472 }, { "epoch": 0.04, "grad_norm": 4.752349455555847, "learning_rate": 1.9994117914938574e-06, "loss": 1.4831, "step": 6473 }, { "epoch": 0.04, "grad_norm": 4.305644024583278, "learning_rate": 1.9994116094742634e-06, "loss": 1.3226, "step": 6474 }, { "epoch": 0.04, "grad_norm": 4.897398022757232, "learning_rate": 1.999411427426519e-06, "loss": 1.4881, "step": 6475 }, { "epoch": 0.04, "grad_norm": 4.26457062764882, "learning_rate": 1.999411245350625e-06, "loss": 1.3829, "step": 6476 }, { "epoch": 0.04, "grad_norm": 6.037317027443757, "learning_rate": 1.999411063246581e-06, "loss": 1.3811, "step": 6477 }, { "epoch": 0.04, "grad_norm": 4.471534521195251, "learning_rate": 1.9994108811143866e-06, "loss": 1.3476, "step": 6478 }, { "epoch": 0.04, "grad_norm": 4.522309312434982, "learning_rate": 1.999410698954042e-06, "loss": 1.473, "step": 6479 }, { "epoch": 0.04, "grad_norm": 4.548430906100433, "learning_rate": 1.9994105167655474e-06, "loss": 1.2829, "step": 6480 }, { "epoch": 0.04, "grad_norm": 4.297489125036424, "learning_rate": 1.999410334548903e-06, "loss": 1.2863, "step": 6481 }, { "epoch": 0.04, "grad_norm": 5.028109175957432, "learning_rate": 1.999410152304108e-06, "loss": 1.6114, "step": 6482 }, { "epoch": 0.04, "grad_norm": 4.557966174267893, "learning_rate": 1.9994099700311634e-06, "loss": 1.3573, "step": 6483 }, { "epoch": 0.04, "grad_norm": 4.569218359060069, "learning_rate": 1.9994097877300686e-06, "loss": 1.3363, "step": 6484 }, { "epoch": 0.04, "grad_norm": 5.319516788612354, "learning_rate": 1.999409605400824e-06, "loss": 1.3203, "step": 6485 }, { "epoch": 0.04, "grad_norm": 4.867921101991437, "learning_rate": 1.999409423043429e-06, "loss": 1.504, "step": 6486 }, { "epoch": 0.04, "grad_norm": 4.79231936127538, "learning_rate": 1.999409240657884e-06, "loss": 1.4666, "step": 6487 }, { "epoch": 0.04, "grad_norm": 4.423483097556062, "learning_rate": 1.999409058244189e-06, "loss": 1.4072, "step": 6488 }, { "epoch": 0.04, "grad_norm": 4.297857669444882, "learning_rate": 1.9994088758023443e-06, "loss": 1.4098, "step": 6489 }, { "epoch": 0.04, "grad_norm": 4.474789739793408, "learning_rate": 1.999408693332349e-06, "loss": 1.2434, "step": 6490 }, { "epoch": 0.04, "grad_norm": 6.53648139671094, "learning_rate": 1.9994085108342042e-06, "loss": 1.5028, "step": 6491 }, { "epoch": 0.04, "grad_norm": 4.44951623205067, "learning_rate": 1.999408328307909e-06, "loss": 1.3289, "step": 6492 }, { "epoch": 0.04, "grad_norm": 4.206608100421257, "learning_rate": 1.9994081457534643e-06, "loss": 1.1549, "step": 6493 }, { "epoch": 0.04, "grad_norm": 4.374473176636909, "learning_rate": 1.999407963170869e-06, "loss": 1.4438, "step": 6494 }, { "epoch": 0.04, "grad_norm": 4.34039531015808, "learning_rate": 1.999407780560124e-06, "loss": 1.3621, "step": 6495 }, { "epoch": 0.04, "grad_norm": 5.193406188455792, "learning_rate": 1.999407597921229e-06, "loss": 1.384, "step": 6496 }, { "epoch": 0.04, "grad_norm": 6.228922307984619, "learning_rate": 1.999407415254184e-06, "loss": 1.1589, "step": 6497 }, { "epoch": 0.04, "eval_loss": 1.570655345916748, "eval_runtime": 4.6411, "eval_samples_per_second": 1.939, "eval_steps_per_second": 1.077, "step": 6497 }, { "epoch": 0.04, "grad_norm": 4.532876491588331, "learning_rate": 1.999407232558989e-06, "loss": 1.3152, "step": 6498 }, { "epoch": 0.04, "grad_norm": 6.251153251779173, "learning_rate": 1.999407049835644e-06, "loss": 1.5836, "step": 6499 }, { "epoch": 0.04, "grad_norm": 4.573177793566228, "learning_rate": 1.999406867084149e-06, "loss": 1.3234, "step": 6500 }, { "epoch": 0.04, "grad_norm": 4.272448880554212, "learning_rate": 1.999406684304504e-06, "loss": 1.2406, "step": 6501 }, { "epoch": 0.04, "grad_norm": 4.551163380821845, "learning_rate": 1.999406501496709e-06, "loss": 1.4443, "step": 6502 }, { "epoch": 0.04, "grad_norm": 4.555850622505328, "learning_rate": 1.999406318660764e-06, "loss": 1.3868, "step": 6503 }, { "epoch": 0.04, "grad_norm": 4.256569246254423, "learning_rate": 1.9994061357966694e-06, "loss": 1.3658, "step": 6504 }, { "epoch": 0.04, "grad_norm": 5.407171282715384, "learning_rate": 1.999405952904425e-06, "loss": 1.319, "step": 6505 }, { "epoch": 0.04, "grad_norm": 4.490194570599599, "learning_rate": 1.9994057699840303e-06, "loss": 1.3928, "step": 6506 }, { "epoch": 0.04, "grad_norm": 4.608226892500228, "learning_rate": 1.9994055870354855e-06, "loss": 1.261, "step": 6507 }, { "epoch": 0.04, "grad_norm": 4.57673510345801, "learning_rate": 1.999405404058791e-06, "loss": 1.4627, "step": 6508 }, { "epoch": 0.04, "grad_norm": 4.80984639244294, "learning_rate": 1.9994052210539463e-06, "loss": 1.4727, "step": 6509 }, { "epoch": 0.04, "grad_norm": 4.865749647125733, "learning_rate": 1.999405038020952e-06, "loss": 1.5437, "step": 6510 }, { "epoch": 0.04, "grad_norm": 9.019637110105185, "learning_rate": 1.9994048549598077e-06, "loss": 1.3993, "step": 6511 }, { "epoch": 0.04, "grad_norm": 5.07668757652983, "learning_rate": 1.9994046718705135e-06, "loss": 1.4372, "step": 6512 }, { "epoch": 0.04, "grad_norm": 4.42999363978681, "learning_rate": 1.9994044887530695e-06, "loss": 1.289, "step": 6513 }, { "epoch": 0.04, "grad_norm": 4.801070557958482, "learning_rate": 1.9994043056074752e-06, "loss": 1.4178, "step": 6514 }, { "epoch": 0.04, "grad_norm": 4.587955237660779, "learning_rate": 1.9994041224337315e-06, "loss": 1.3486, "step": 6515 }, { "epoch": 0.04, "grad_norm": 4.823731219504641, "learning_rate": 1.9994039392318374e-06, "loss": 1.3678, "step": 6516 }, { "epoch": 0.04, "grad_norm": 6.4281577023818715, "learning_rate": 1.999403756001794e-06, "loss": 1.4017, "step": 6517 }, { "epoch": 0.04, "grad_norm": 4.400734554627938, "learning_rate": 1.9994035727436006e-06, "loss": 1.323, "step": 6518 }, { "epoch": 0.04, "grad_norm": 12.523643249497127, "learning_rate": 1.999403389457257e-06, "loss": 1.3022, "step": 6519 }, { "epoch": 0.04, "grad_norm": 4.788188811588854, "learning_rate": 1.999403206142764e-06, "loss": 1.2572, "step": 6520 }, { "epoch": 0.04, "grad_norm": 4.357382365505549, "learning_rate": 1.9994030228001205e-06, "loss": 1.3099, "step": 6521 }, { "epoch": 0.04, "grad_norm": 7.152666436731815, "learning_rate": 1.9994028394293272e-06, "loss": 1.0506, "step": 6522 }, { "epoch": 0.04, "grad_norm": 4.962461229126925, "learning_rate": 1.9994026560303845e-06, "loss": 1.0932, "step": 6523 }, { "epoch": 0.04, "grad_norm": 5.798405799418517, "learning_rate": 1.999402472603292e-06, "loss": 1.4882, "step": 6524 }, { "epoch": 0.04, "grad_norm": 4.906933043469203, "learning_rate": 1.999402289148049e-06, "loss": 1.2753, "step": 6525 }, { "epoch": 0.04, "grad_norm": 4.363287952400987, "learning_rate": 1.9994021056646568e-06, "loss": 1.3163, "step": 6526 }, { "epoch": 0.04, "grad_norm": 5.022895147560338, "learning_rate": 1.9994019221531146e-06, "loss": 1.3266, "step": 6527 }, { "epoch": 0.04, "grad_norm": 5.324205689590798, "learning_rate": 1.999401738613422e-06, "loss": 1.631, "step": 6528 }, { "epoch": 0.04, "grad_norm": 4.651851187157339, "learning_rate": 1.9994015550455802e-06, "loss": 1.2994, "step": 6529 }, { "epoch": 0.04, "grad_norm": 5.162137161720078, "learning_rate": 1.999401371449589e-06, "loss": 1.5299, "step": 6530 }, { "epoch": 0.04, "grad_norm": 4.879724440010851, "learning_rate": 1.999401187825447e-06, "loss": 1.2439, "step": 6531 }, { "epoch": 0.04, "grad_norm": 4.430181105153709, "learning_rate": 1.9994010041731557e-06, "loss": 1.3841, "step": 6532 }, { "epoch": 0.04, "grad_norm": 8.477790635834381, "learning_rate": 1.9994008204927147e-06, "loss": 1.6246, "step": 6533 }, { "epoch": 0.04, "grad_norm": 4.96721876142773, "learning_rate": 1.9994006367841234e-06, "loss": 1.4903, "step": 6534 }, { "epoch": 0.04, "grad_norm": 4.410918143105512, "learning_rate": 1.999400453047383e-06, "loss": 1.2758, "step": 6535 }, { "epoch": 0.04, "grad_norm": 4.228439608942006, "learning_rate": 1.9994002692824926e-06, "loss": 1.3165, "step": 6536 }, { "epoch": 0.04, "grad_norm": 4.415227201679303, "learning_rate": 1.999400085489452e-06, "loss": 1.3865, "step": 6537 }, { "epoch": 0.04, "grad_norm": 4.485381987050631, "learning_rate": 1.9993999016682622e-06, "loss": 1.3803, "step": 6538 }, { "epoch": 0.04, "grad_norm": 4.853984830702609, "learning_rate": 1.9993997178189225e-06, "loss": 1.3319, "step": 6539 }, { "epoch": 0.04, "grad_norm": 4.5773244743106245, "learning_rate": 1.999399533941433e-06, "loss": 1.427, "step": 6540 }, { "epoch": 0.04, "grad_norm": 4.903527734439789, "learning_rate": 1.999399350035793e-06, "loss": 1.3848, "step": 6541 }, { "epoch": 0.04, "grad_norm": 4.140108156452527, "learning_rate": 1.999399166102004e-06, "loss": 1.1811, "step": 6542 }, { "epoch": 0.04, "grad_norm": 4.759166124576844, "learning_rate": 1.999398982140065e-06, "loss": 1.3934, "step": 6543 }, { "epoch": 0.04, "grad_norm": 4.716196273308381, "learning_rate": 1.9993987981499765e-06, "loss": 1.2054, "step": 6544 }, { "epoch": 0.04, "grad_norm": 4.394571966837835, "learning_rate": 1.999398614131738e-06, "loss": 1.2814, "step": 6545 }, { "epoch": 0.04, "grad_norm": 5.049649944787621, "learning_rate": 1.9993984300853503e-06, "loss": 1.43, "step": 6546 }, { "epoch": 0.04, "grad_norm": 4.562622090616017, "learning_rate": 1.9993982460108124e-06, "loss": 1.4127, "step": 6547 }, { "epoch": 0.04, "grad_norm": 5.031795003982652, "learning_rate": 1.9993980619081247e-06, "loss": 1.3455, "step": 6548 }, { "epoch": 0.04, "grad_norm": 5.434753136215983, "learning_rate": 1.9993978777772875e-06, "loss": 1.337, "step": 6549 }, { "epoch": 0.04, "grad_norm": 4.523722265700851, "learning_rate": 1.9993976936183005e-06, "loss": 1.4148, "step": 6550 }, { "epoch": 0.04, "grad_norm": 4.514159749600672, "learning_rate": 1.9993975094311635e-06, "loss": 1.4092, "step": 6551 }, { "epoch": 0.04, "grad_norm": 4.732670287488305, "learning_rate": 1.999397325215877e-06, "loss": 1.34, "step": 6552 }, { "epoch": 0.04, "grad_norm": 5.441540400973514, "learning_rate": 1.9993971409724413e-06, "loss": 1.4223, "step": 6553 }, { "epoch": 0.04, "grad_norm": 5.248639779596383, "learning_rate": 1.9993969567008552e-06, "loss": 1.2518, "step": 6554 }, { "epoch": 0.04, "grad_norm": 5.125148479060199, "learning_rate": 1.9993967724011197e-06, "loss": 1.4507, "step": 6555 }, { "epoch": 0.04, "grad_norm": 5.395502593763626, "learning_rate": 1.9993965880732347e-06, "loss": 1.391, "step": 6556 }, { "epoch": 0.04, "grad_norm": 3.9932207981462797, "learning_rate": 1.9993964037172e-06, "loss": 1.2327, "step": 6557 }, { "epoch": 0.04, "grad_norm": 4.5932767824342005, "learning_rate": 1.9993962193330155e-06, "loss": 1.324, "step": 6558 }, { "epoch": 0.04, "grad_norm": 4.578320005992336, "learning_rate": 1.9993960349206813e-06, "loss": 1.3925, "step": 6559 }, { "epoch": 0.04, "grad_norm": 5.1885443205158275, "learning_rate": 1.9993958504801972e-06, "loss": 1.4615, "step": 6560 }, { "epoch": 0.04, "grad_norm": 4.246351687289503, "learning_rate": 1.9993956660115637e-06, "loss": 1.4181, "step": 6561 }, { "epoch": 0.04, "grad_norm": 4.479573479111158, "learning_rate": 1.9993954815147808e-06, "loss": 1.33, "step": 6562 }, { "epoch": 0.04, "grad_norm": 4.814496160286448, "learning_rate": 1.999395296989848e-06, "loss": 1.4294, "step": 6563 }, { "epoch": 0.04, "grad_norm": 4.765423874764579, "learning_rate": 1.9993951124367653e-06, "loss": 1.3456, "step": 6564 }, { "epoch": 0.04, "grad_norm": 4.736359900795483, "learning_rate": 1.999394927855533e-06, "loss": 1.476, "step": 6565 }, { "epoch": 0.04, "grad_norm": 4.613200458351985, "learning_rate": 1.9993947432461515e-06, "loss": 1.4512, "step": 6566 }, { "epoch": 0.04, "grad_norm": 4.376342375547429, "learning_rate": 1.99939455860862e-06, "loss": 1.2586, "step": 6567 }, { "epoch": 0.04, "grad_norm": 4.436110447878964, "learning_rate": 1.999394373942939e-06, "loss": 1.2242, "step": 6568 }, { "epoch": 0.04, "grad_norm": 5.738810594671168, "learning_rate": 1.999394189249109e-06, "loss": 1.2752, "step": 6569 }, { "epoch": 0.04, "grad_norm": 5.4712656248525775, "learning_rate": 1.9993940045271286e-06, "loss": 1.3038, "step": 6570 }, { "epoch": 0.04, "eval_loss": 1.5701048374176025, "eval_runtime": 4.6165, "eval_samples_per_second": 1.95, "eval_steps_per_second": 1.083, "step": 6570 }, { "epoch": 0.04, "grad_norm": 5.515270133580792, "learning_rate": 1.9993938197769986e-06, "loss": 1.338, "step": 6571 }, { "epoch": 0.04, "grad_norm": 4.724743049461662, "learning_rate": 1.9993936349987195e-06, "loss": 1.4913, "step": 6572 }, { "epoch": 0.04, "grad_norm": 4.439699959004503, "learning_rate": 1.9993934501922905e-06, "loss": 1.3479, "step": 6573 }, { "epoch": 0.04, "grad_norm": 7.234361415222628, "learning_rate": 1.9993932653577116e-06, "loss": 1.379, "step": 6574 }, { "epoch": 0.04, "grad_norm": 4.248967713492603, "learning_rate": 1.9993930804949834e-06, "loss": 1.2339, "step": 6575 }, { "epoch": 0.04, "grad_norm": 5.482397975291796, "learning_rate": 1.9993928956041056e-06, "loss": 1.4815, "step": 6576 }, { "epoch": 0.04, "grad_norm": 6.7470672380706995, "learning_rate": 1.9993927106850784e-06, "loss": 1.4692, "step": 6577 }, { "epoch": 0.04, "grad_norm": 5.0600999883103555, "learning_rate": 1.999392525737902e-06, "loss": 1.4211, "step": 6578 }, { "epoch": 0.04, "grad_norm": 7.163366568396338, "learning_rate": 1.9993923407625753e-06, "loss": 1.1995, "step": 6579 }, { "epoch": 0.04, "grad_norm": 4.282770334294331, "learning_rate": 1.999392155759099e-06, "loss": 1.4631, "step": 6580 }, { "epoch": 0.04, "grad_norm": 5.164968756172121, "learning_rate": 1.9993919707274736e-06, "loss": 1.4309, "step": 6581 }, { "epoch": 0.04, "grad_norm": 4.5693688487022746, "learning_rate": 1.9993917856676983e-06, "loss": 1.3074, "step": 6582 }, { "epoch": 0.04, "grad_norm": 5.034974858938123, "learning_rate": 1.999391600579774e-06, "loss": 1.2604, "step": 6583 }, { "epoch": 0.04, "grad_norm": 6.5281520067356915, "learning_rate": 1.9993914154637e-06, "loss": 1.4211, "step": 6584 }, { "epoch": 0.04, "grad_norm": 5.885686458456587, "learning_rate": 1.999391230319476e-06, "loss": 1.4122, "step": 6585 }, { "epoch": 0.04, "grad_norm": 4.55225051248099, "learning_rate": 1.999391045147103e-06, "loss": 1.4243, "step": 6586 }, { "epoch": 0.04, "grad_norm": 5.624366178470606, "learning_rate": 1.99939085994658e-06, "loss": 1.3535, "step": 6587 }, { "epoch": 0.04, "grad_norm": 4.344626526477646, "learning_rate": 1.9993906747179076e-06, "loss": 1.2951, "step": 6588 }, { "epoch": 0.04, "grad_norm": 5.200481156033781, "learning_rate": 1.999390489461086e-06, "loss": 1.5554, "step": 6589 }, { "epoch": 0.04, "grad_norm": 4.289595173171484, "learning_rate": 1.9993903041761145e-06, "loss": 1.4097, "step": 6590 }, { "epoch": 0.04, "grad_norm": 4.462809969642775, "learning_rate": 1.999390118862994e-06, "loss": 1.4093, "step": 6591 }, { "epoch": 0.04, "grad_norm": 4.657497051711786, "learning_rate": 1.9993899335217237e-06, "loss": 1.1915, "step": 6592 }, { "epoch": 0.04, "grad_norm": 4.903731641543155, "learning_rate": 1.9993897481523037e-06, "loss": 1.4433, "step": 6593 }, { "epoch": 0.04, "grad_norm": 4.339597401883772, "learning_rate": 1.9993895627547346e-06, "loss": 1.381, "step": 6594 }, { "epoch": 0.04, "grad_norm": 4.401690198095049, "learning_rate": 1.9993893773290157e-06, "loss": 1.3954, "step": 6595 }, { "epoch": 0.04, "grad_norm": 5.9779163111881655, "learning_rate": 1.9993891918751474e-06, "loss": 1.4376, "step": 6596 }, { "epoch": 0.04, "grad_norm": 4.606009143795483, "learning_rate": 1.9993890063931296e-06, "loss": 1.3671, "step": 6597 }, { "epoch": 0.04, "grad_norm": 4.592199191426679, "learning_rate": 1.9993888208829624e-06, "loss": 1.4755, "step": 6598 }, { "epoch": 0.04, "grad_norm": 4.623905799355523, "learning_rate": 1.999388635344646e-06, "loss": 1.3771, "step": 6599 }, { "epoch": 0.04, "grad_norm": 4.237405162954645, "learning_rate": 1.99938844977818e-06, "loss": 1.4002, "step": 6600 }, { "epoch": 0.04, "grad_norm": 4.7337222852257606, "learning_rate": 1.9993882641835644e-06, "loss": 1.4995, "step": 6601 }, { "epoch": 0.04, "grad_norm": 4.294837124432389, "learning_rate": 1.9993880785607994e-06, "loss": 1.3624, "step": 6602 }, { "epoch": 0.04, "grad_norm": 4.536015377337632, "learning_rate": 1.999387892909885e-06, "loss": 1.4603, "step": 6603 }, { "epoch": 0.04, "grad_norm": 5.105810813530506, "learning_rate": 1.9993877072308214e-06, "loss": 1.4265, "step": 6604 }, { "epoch": 0.04, "grad_norm": 5.904691394451306, "learning_rate": 1.999387521523608e-06, "loss": 1.1829, "step": 6605 }, { "epoch": 0.04, "grad_norm": 6.547913090209481, "learning_rate": 1.9993873357882452e-06, "loss": 1.2407, "step": 6606 }, { "epoch": 0.04, "grad_norm": 4.6327025046735875, "learning_rate": 1.9993871500247334e-06, "loss": 1.3744, "step": 6607 }, { "epoch": 0.04, "grad_norm": 8.63193813879483, "learning_rate": 1.9993869642330717e-06, "loss": 1.2336, "step": 6608 }, { "epoch": 0.04, "grad_norm": 6.417283379179613, "learning_rate": 1.9993867784132606e-06, "loss": 1.4006, "step": 6609 }, { "epoch": 0.04, "grad_norm": 4.421337235567864, "learning_rate": 1.999386592565301e-06, "loss": 1.3115, "step": 6610 }, { "epoch": 0.04, "grad_norm": 4.303939938338019, "learning_rate": 1.999386406689191e-06, "loss": 1.2726, "step": 6611 }, { "epoch": 0.04, "grad_norm": 5.957578566355889, "learning_rate": 1.9993862207849317e-06, "loss": 1.3194, "step": 6612 }, { "epoch": 0.04, "grad_norm": 4.446802999890054, "learning_rate": 1.9993860348525232e-06, "loss": 1.3245, "step": 6613 }, { "epoch": 0.04, "grad_norm": 6.843243800293919, "learning_rate": 1.9993858488919653e-06, "loss": 1.575, "step": 6614 }, { "epoch": 0.04, "grad_norm": 4.3071120749543566, "learning_rate": 1.9993856629032583e-06, "loss": 1.4499, "step": 6615 }, { "epoch": 0.04, "grad_norm": 5.406044686953453, "learning_rate": 1.9993854768864015e-06, "loss": 1.4099, "step": 6616 }, { "epoch": 0.04, "grad_norm": 6.102414402476134, "learning_rate": 1.9993852908413956e-06, "loss": 1.3886, "step": 6617 }, { "epoch": 0.04, "grad_norm": 4.30886833731806, "learning_rate": 1.9993851047682403e-06, "loss": 1.425, "step": 6618 }, { "epoch": 0.04, "grad_norm": 4.3863051110383235, "learning_rate": 1.9993849186669355e-06, "loss": 1.378, "step": 6619 }, { "epoch": 0.04, "grad_norm": 4.5183574730319735, "learning_rate": 1.9993847325374813e-06, "loss": 1.3533, "step": 6620 }, { "epoch": 0.04, "grad_norm": 4.446758818223998, "learning_rate": 1.999384546379878e-06, "loss": 1.4627, "step": 6621 }, { "epoch": 0.04, "grad_norm": 4.633424180841852, "learning_rate": 1.999384360194125e-06, "loss": 1.4084, "step": 6622 }, { "epoch": 0.04, "grad_norm": 5.446183872488713, "learning_rate": 1.999384173980223e-06, "loss": 1.5042, "step": 6623 }, { "epoch": 0.04, "grad_norm": 5.329952941596388, "learning_rate": 1.9993839877381718e-06, "loss": 1.1784, "step": 6624 }, { "epoch": 0.04, "grad_norm": 5.097754481693318, "learning_rate": 1.999383801467971e-06, "loss": 1.3401, "step": 6625 }, { "epoch": 0.04, "grad_norm": 5.4361849357503855, "learning_rate": 1.999383615169621e-06, "loss": 1.3359, "step": 6626 }, { "epoch": 0.04, "grad_norm": 4.758896218818498, "learning_rate": 1.9993834288431217e-06, "loss": 1.3341, "step": 6627 }, { "epoch": 0.04, "grad_norm": 4.159982208893944, "learning_rate": 1.999383242488473e-06, "loss": 1.4126, "step": 6628 }, { "epoch": 0.04, "grad_norm": 5.018171376976816, "learning_rate": 1.9993830561056752e-06, "loss": 1.3447, "step": 6629 }, { "epoch": 0.04, "grad_norm": 4.203856495101394, "learning_rate": 1.999382869694728e-06, "loss": 1.3415, "step": 6630 }, { "epoch": 0.04, "grad_norm": 5.059586696361144, "learning_rate": 1.9993826832556314e-06, "loss": 1.3475, "step": 6631 }, { "epoch": 0.04, "grad_norm": 4.940758351266604, "learning_rate": 1.9993824967883856e-06, "loss": 1.4344, "step": 6632 }, { "epoch": 0.04, "grad_norm": 4.323637808797647, "learning_rate": 1.9993823102929907e-06, "loss": 1.3695, "step": 6633 }, { "epoch": 0.04, "grad_norm": 4.610112866978993, "learning_rate": 1.999382123769446e-06, "loss": 1.4236, "step": 6634 }, { "epoch": 0.04, "grad_norm": 4.439344760603468, "learning_rate": 1.999381937217752e-06, "loss": 1.388, "step": 6635 }, { "epoch": 0.04, "grad_norm": 5.112407815506566, "learning_rate": 1.9993817506379098e-06, "loss": 1.4352, "step": 6636 }, { "epoch": 0.04, "grad_norm": 5.014061511779196, "learning_rate": 1.9993815640299175e-06, "loss": 1.3402, "step": 6637 }, { "epoch": 0.04, "grad_norm": 5.655891353665772, "learning_rate": 1.999381377393776e-06, "loss": 1.5104, "step": 6638 }, { "epoch": 0.04, "grad_norm": 4.686217415400032, "learning_rate": 1.999381190729485e-06, "loss": 1.4499, "step": 6639 }, { "epoch": 0.04, "grad_norm": 4.380136224757185, "learning_rate": 1.9993810040370454e-06, "loss": 1.3426, "step": 6640 }, { "epoch": 0.04, "grad_norm": 5.144441042471167, "learning_rate": 1.9993808173164562e-06, "loss": 1.3522, "step": 6641 }, { "epoch": 0.04, "grad_norm": 4.294544214737699, "learning_rate": 1.999380630567718e-06, "loss": 1.3489, "step": 6642 }, { "epoch": 0.04, "grad_norm": 5.1528096158677, "learning_rate": 1.99938044379083e-06, "loss": 1.4514, "step": 6643 }, { "epoch": 0.04, "eval_loss": 1.5706208944320679, "eval_runtime": 4.6345, "eval_samples_per_second": 1.942, "eval_steps_per_second": 1.079, "step": 6643 }, { "epoch": 0.04, "grad_norm": 5.349137964396962, "learning_rate": 1.9993802569857933e-06, "loss": 1.4154, "step": 6644 }, { "epoch": 0.04, "grad_norm": 5.615327909351482, "learning_rate": 1.999380070152607e-06, "loss": 1.4239, "step": 6645 }, { "epoch": 0.04, "grad_norm": 4.339665538705517, "learning_rate": 1.999379883291272e-06, "loss": 1.3286, "step": 6646 }, { "epoch": 0.04, "grad_norm": 5.1266146887320545, "learning_rate": 1.9993796964017874e-06, "loss": 1.512, "step": 6647 }, { "epoch": 0.04, "grad_norm": 5.9655218756289505, "learning_rate": 1.999379509484154e-06, "loss": 1.404, "step": 6648 }, { "epoch": 0.04, "grad_norm": 4.598461068462763, "learning_rate": 1.9993793225383708e-06, "loss": 1.3541, "step": 6649 }, { "epoch": 0.04, "grad_norm": 4.6982987373210765, "learning_rate": 1.999379135564439e-06, "loss": 1.3819, "step": 6650 }, { "epoch": 0.04, "grad_norm": 4.872141891132418, "learning_rate": 1.9993789485623576e-06, "loss": 1.4558, "step": 6651 }, { "epoch": 0.04, "grad_norm": 4.2667624652551455, "learning_rate": 1.999378761532127e-06, "loss": 1.3268, "step": 6652 }, { "epoch": 0.04, "grad_norm": 4.584306183042022, "learning_rate": 1.9993785744737475e-06, "loss": 1.4715, "step": 6653 }, { "epoch": 0.04, "grad_norm": 5.024190470935378, "learning_rate": 1.999378387387219e-06, "loss": 1.4238, "step": 6654 }, { "epoch": 0.04, "grad_norm": 4.96848799761176, "learning_rate": 1.999378200272541e-06, "loss": 1.3725, "step": 6655 }, { "epoch": 0.04, "grad_norm": 5.585896703406984, "learning_rate": 1.999378013129714e-06, "loss": 1.4047, "step": 6656 }, { "epoch": 0.04, "grad_norm": 5.194019896838438, "learning_rate": 1.9993778259587373e-06, "loss": 1.2574, "step": 6657 }, { "epoch": 0.04, "grad_norm": 5.988153860482977, "learning_rate": 1.999377638759612e-06, "loss": 1.3494, "step": 6658 }, { "epoch": 0.04, "grad_norm": 4.718767668829993, "learning_rate": 1.9993774515323373e-06, "loss": 1.4137, "step": 6659 }, { "epoch": 0.05, "grad_norm": 4.180904940604818, "learning_rate": 1.9993772642769137e-06, "loss": 1.3088, "step": 6660 }, { "epoch": 0.05, "grad_norm": 5.1447592951437455, "learning_rate": 1.999377076993341e-06, "loss": 1.5106, "step": 6661 }, { "epoch": 0.05, "grad_norm": 4.440978925070136, "learning_rate": 1.9993768896816186e-06, "loss": 1.282, "step": 6662 }, { "epoch": 0.05, "grad_norm": 4.369351633093875, "learning_rate": 1.999376702341748e-06, "loss": 1.3855, "step": 6663 }, { "epoch": 0.05, "grad_norm": 5.340815610074341, "learning_rate": 1.9993765149737275e-06, "loss": 1.2272, "step": 6664 }, { "epoch": 0.05, "grad_norm": 5.01500954033791, "learning_rate": 1.999376327577558e-06, "loss": 1.2476, "step": 6665 }, { "epoch": 0.05, "grad_norm": 4.69695158699135, "learning_rate": 1.99937614015324e-06, "loss": 1.3692, "step": 6666 }, { "epoch": 0.05, "grad_norm": 4.902709575982336, "learning_rate": 1.9993759527007723e-06, "loss": 1.2948, "step": 6667 }, { "epoch": 0.05, "grad_norm": 5.610828408373096, "learning_rate": 1.9993757652201557e-06, "loss": 1.3811, "step": 6668 }, { "epoch": 0.05, "grad_norm": 4.296801295992063, "learning_rate": 1.99937557771139e-06, "loss": 1.3086, "step": 6669 }, { "epoch": 0.05, "grad_norm": 5.704619668113932, "learning_rate": 1.9993753901744754e-06, "loss": 1.4968, "step": 6670 }, { "epoch": 0.05, "grad_norm": 5.613100619807498, "learning_rate": 1.9993752026094113e-06, "loss": 1.4664, "step": 6671 }, { "epoch": 0.05, "grad_norm": 4.750055080991661, "learning_rate": 1.9993750150161986e-06, "loss": 1.3969, "step": 6672 }, { "epoch": 0.05, "grad_norm": 4.565943163284562, "learning_rate": 1.9993748273948365e-06, "loss": 1.359, "step": 6673 }, { "epoch": 0.05, "grad_norm": 8.213951552187696, "learning_rate": 1.9993746397453253e-06, "loss": 1.5483, "step": 6674 }, { "epoch": 0.05, "grad_norm": 5.486568326006605, "learning_rate": 1.999374452067665e-06, "loss": 1.2674, "step": 6675 }, { "epoch": 0.05, "grad_norm": 5.7765927364836, "learning_rate": 1.999374264361856e-06, "loss": 1.3221, "step": 6676 }, { "epoch": 0.05, "grad_norm": 5.441424142406907, "learning_rate": 1.9993740766278977e-06, "loss": 1.4687, "step": 6677 }, { "epoch": 0.05, "grad_norm": 4.9420459093277795, "learning_rate": 1.9993738888657905e-06, "loss": 1.4172, "step": 6678 }, { "epoch": 0.05, "grad_norm": 4.766032816582536, "learning_rate": 1.9993737010755342e-06, "loss": 1.2462, "step": 6679 }, { "epoch": 0.05, "grad_norm": 4.9024377350518575, "learning_rate": 1.9993735132571285e-06, "loss": 1.3493, "step": 6680 }, { "epoch": 0.05, "grad_norm": 4.667337431539989, "learning_rate": 1.9993733254105742e-06, "loss": 1.4568, "step": 6681 }, { "epoch": 0.05, "grad_norm": 4.7434862621922385, "learning_rate": 1.999373137535871e-06, "loss": 1.3759, "step": 6682 }, { "epoch": 0.05, "grad_norm": 4.764039088430895, "learning_rate": 1.9993729496330185e-06, "loss": 1.3141, "step": 6683 }, { "epoch": 0.05, "grad_norm": 4.54719762856992, "learning_rate": 1.999372761702017e-06, "loss": 1.3775, "step": 6684 }, { "epoch": 0.05, "grad_norm": 4.366549695449851, "learning_rate": 1.9993725737428667e-06, "loss": 1.3818, "step": 6685 }, { "epoch": 0.05, "grad_norm": 4.323636806267695, "learning_rate": 1.999372385755567e-06, "loss": 1.3455, "step": 6686 }, { "epoch": 0.05, "grad_norm": 4.604609328393161, "learning_rate": 1.9993721977401185e-06, "loss": 1.3627, "step": 6687 }, { "epoch": 0.05, "grad_norm": 4.624616069161999, "learning_rate": 1.9993720096965214e-06, "loss": 1.3888, "step": 6688 }, { "epoch": 0.05, "grad_norm": 4.4177336580935656, "learning_rate": 1.999371821624775e-06, "loss": 1.3219, "step": 6689 }, { "epoch": 0.05, "grad_norm": 4.180809347626812, "learning_rate": 1.9993716335248794e-06, "loss": 1.2907, "step": 6690 }, { "epoch": 0.05, "grad_norm": 5.8763716015072305, "learning_rate": 1.999371445396835e-06, "loss": 1.3874, "step": 6691 }, { "epoch": 0.05, "grad_norm": 4.758643091045307, "learning_rate": 1.9993712572406417e-06, "loss": 1.3668, "step": 6692 }, { "epoch": 0.05, "grad_norm": 4.659951540969715, "learning_rate": 1.999371069056299e-06, "loss": 1.5008, "step": 6693 }, { "epoch": 0.05, "grad_norm": 4.68773839162296, "learning_rate": 1.999370880843808e-06, "loss": 1.4358, "step": 6694 }, { "epoch": 0.05, "grad_norm": 4.625948694426612, "learning_rate": 1.9993706926031678e-06, "loss": 1.4137, "step": 6695 }, { "epoch": 0.05, "grad_norm": 6.602342083897072, "learning_rate": 1.9993705043343786e-06, "loss": 1.5638, "step": 6696 }, { "epoch": 0.05, "grad_norm": 5.398827432388213, "learning_rate": 1.9993703160374403e-06, "loss": 1.3196, "step": 6697 }, { "epoch": 0.05, "grad_norm": 4.843149147547734, "learning_rate": 1.9993701277123535e-06, "loss": 1.4103, "step": 6698 }, { "epoch": 0.05, "grad_norm": 4.680339250312286, "learning_rate": 1.9993699393591176e-06, "loss": 1.3801, "step": 6699 }, { "epoch": 0.05, "grad_norm": 5.871172449926197, "learning_rate": 1.9993697509777327e-06, "loss": 1.3487, "step": 6700 }, { "epoch": 0.05, "grad_norm": 4.549992711707639, "learning_rate": 1.999369562568199e-06, "loss": 1.4678, "step": 6701 }, { "epoch": 0.05, "grad_norm": 4.929821248891381, "learning_rate": 1.999369374130516e-06, "loss": 1.4616, "step": 6702 }, { "epoch": 0.05, "grad_norm": 4.407652129934311, "learning_rate": 1.9993691856646843e-06, "loss": 1.1714, "step": 6703 }, { "epoch": 0.05, "grad_norm": 4.748651366632725, "learning_rate": 1.9993689971707038e-06, "loss": 1.3786, "step": 6704 }, { "epoch": 0.05, "grad_norm": 5.2956866978206145, "learning_rate": 1.999368808648574e-06, "loss": 1.4582, "step": 6705 }, { "epoch": 0.05, "grad_norm": 5.846956465214044, "learning_rate": 1.9993686200982956e-06, "loss": 1.3826, "step": 6706 }, { "epoch": 0.05, "grad_norm": 5.013826210799998, "learning_rate": 1.999368431519869e-06, "loss": 1.4582, "step": 6707 }, { "epoch": 0.05, "grad_norm": 4.264919631503435, "learning_rate": 1.9993682429132926e-06, "loss": 1.3902, "step": 6708 }, { "epoch": 0.05, "grad_norm": 4.731291876743891, "learning_rate": 1.9993680542785673e-06, "loss": 1.4003, "step": 6709 }, { "epoch": 0.05, "grad_norm": 4.193357333907853, "learning_rate": 1.9993678656156935e-06, "loss": 1.1846, "step": 6710 }, { "epoch": 0.05, "grad_norm": 5.206736261105648, "learning_rate": 1.9993676769246706e-06, "loss": 1.2813, "step": 6711 }, { "epoch": 0.05, "grad_norm": 4.634312077015591, "learning_rate": 1.999367488205499e-06, "loss": 1.4139, "step": 6712 }, { "epoch": 0.05, "grad_norm": 4.720794482599493, "learning_rate": 1.9993672994581787e-06, "loss": 1.3239, "step": 6713 }, { "epoch": 0.05, "grad_norm": 4.602992414867196, "learning_rate": 1.9993671106827096e-06, "loss": 1.3007, "step": 6714 }, { "epoch": 0.05, "grad_norm": 4.401980198774702, "learning_rate": 1.999366921879091e-06, "loss": 1.405, "step": 6715 }, { "epoch": 0.05, "grad_norm": 4.723348339945989, "learning_rate": 1.9993667330473244e-06, "loss": 1.4127, "step": 6716 }, { "epoch": 0.05, "eval_loss": 1.5677449703216553, "eval_runtime": 4.6629, "eval_samples_per_second": 1.93, "eval_steps_per_second": 1.072, "step": 6716 }, { "epoch": 0.05, "grad_norm": 4.476062246479651, "learning_rate": 1.999366544187408e-06, "loss": 1.2622, "step": 6717 }, { "epoch": 0.05, "grad_norm": 5.407199006295253, "learning_rate": 1.9993663552993435e-06, "loss": 1.4361, "step": 6718 }, { "epoch": 0.05, "grad_norm": 4.831434431187319, "learning_rate": 1.99936616638313e-06, "loss": 1.4612, "step": 6719 }, { "epoch": 0.05, "grad_norm": 4.444607285280013, "learning_rate": 1.9993659774387677e-06, "loss": 1.3391, "step": 6720 }, { "epoch": 0.05, "grad_norm": 4.3703132430219656, "learning_rate": 1.9993657884662568e-06, "loss": 1.3474, "step": 6721 }, { "epoch": 0.05, "grad_norm": 4.28865103403165, "learning_rate": 1.9993655994655968e-06, "loss": 1.3705, "step": 6722 }, { "epoch": 0.05, "grad_norm": 4.405777484600818, "learning_rate": 1.9993654104367877e-06, "loss": 1.4649, "step": 6723 }, { "epoch": 0.05, "grad_norm": 4.4407276197747745, "learning_rate": 1.9993652213798306e-06, "loss": 1.3115, "step": 6724 }, { "epoch": 0.05, "grad_norm": 4.635664331973335, "learning_rate": 1.999365032294724e-06, "loss": 1.2741, "step": 6725 }, { "epoch": 0.05, "grad_norm": 4.5285375313661635, "learning_rate": 1.999364843181469e-06, "loss": 1.388, "step": 6726 }, { "epoch": 0.05, "grad_norm": 4.686925850722363, "learning_rate": 1.999364654040065e-06, "loss": 1.3428, "step": 6727 }, { "epoch": 0.05, "grad_norm": 4.416183312932971, "learning_rate": 1.999364464870512e-06, "loss": 1.2816, "step": 6728 }, { "epoch": 0.05, "grad_norm": 4.797701094957937, "learning_rate": 1.9993642756728105e-06, "loss": 1.4073, "step": 6729 }, { "epoch": 0.05, "grad_norm": 4.286794607526967, "learning_rate": 1.9993640864469605e-06, "loss": 1.3414, "step": 6730 }, { "epoch": 0.05, "grad_norm": 4.362785076865784, "learning_rate": 1.9993638971929614e-06, "loss": 1.4041, "step": 6731 }, { "epoch": 0.05, "grad_norm": 4.827484811429517, "learning_rate": 1.9993637079108137e-06, "loss": 1.2185, "step": 6732 }, { "epoch": 0.05, "grad_norm": 4.793371618050493, "learning_rate": 1.999363518600517e-06, "loss": 1.3566, "step": 6733 }, { "epoch": 0.05, "grad_norm": 6.501510216120231, "learning_rate": 1.9993633292620717e-06, "loss": 1.2956, "step": 6734 }, { "epoch": 0.05, "grad_norm": 4.502067774323721, "learning_rate": 1.999363139895478e-06, "loss": 1.4332, "step": 6735 }, { "epoch": 0.05, "grad_norm": 4.510472547089669, "learning_rate": 1.999362950500735e-06, "loss": 1.326, "step": 6736 }, { "epoch": 0.05, "grad_norm": 4.788552485407938, "learning_rate": 1.9993627610778434e-06, "loss": 1.4404, "step": 6737 }, { "epoch": 0.05, "grad_norm": 4.763290583939385, "learning_rate": 1.9993625716268033e-06, "loss": 1.3506, "step": 6738 }, { "epoch": 0.05, "grad_norm": 4.962941378087676, "learning_rate": 1.9993623821476145e-06, "loss": 1.4501, "step": 6739 }, { "epoch": 0.05, "grad_norm": 4.734603507151676, "learning_rate": 1.999362192640277e-06, "loss": 1.4826, "step": 6740 }, { "epoch": 0.05, "grad_norm": 5.4821355438064066, "learning_rate": 1.9993620031047905e-06, "loss": 1.2351, "step": 6741 }, { "epoch": 0.05, "grad_norm": 4.664220692507016, "learning_rate": 1.999361813541155e-06, "loss": 1.3643, "step": 6742 }, { "epoch": 0.05, "grad_norm": 5.030646691611031, "learning_rate": 1.9993616239493716e-06, "loss": 1.3815, "step": 6743 }, { "epoch": 0.05, "grad_norm": 4.657599464406103, "learning_rate": 1.999361434329439e-06, "loss": 1.4302, "step": 6744 }, { "epoch": 0.05, "grad_norm": 5.195231242405325, "learning_rate": 1.9993612446813583e-06, "loss": 1.4276, "step": 6745 }, { "epoch": 0.05, "grad_norm": 5.195767950009777, "learning_rate": 1.9993610550051285e-06, "loss": 1.2503, "step": 6746 }, { "epoch": 0.05, "grad_norm": 4.8074046626759515, "learning_rate": 1.9993608653007497e-06, "loss": 1.5235, "step": 6747 }, { "epoch": 0.05, "grad_norm": 8.744323740588001, "learning_rate": 1.9993606755682224e-06, "loss": 1.3762, "step": 6748 }, { "epoch": 0.05, "grad_norm": 6.245123084310273, "learning_rate": 1.999360485807547e-06, "loss": 1.4982, "step": 6749 }, { "epoch": 0.05, "grad_norm": 4.724901880584157, "learning_rate": 1.9993602960187222e-06, "loss": 1.4688, "step": 6750 }, { "epoch": 0.05, "grad_norm": 4.062984920686892, "learning_rate": 1.999360106201749e-06, "loss": 1.1148, "step": 6751 }, { "epoch": 0.05, "grad_norm": 4.922804318823875, "learning_rate": 1.9993599163566273e-06, "loss": 1.3527, "step": 6752 }, { "epoch": 0.05, "grad_norm": 4.405788728461931, "learning_rate": 1.999359726483357e-06, "loss": 1.4234, "step": 6753 }, { "epoch": 0.05, "grad_norm": 5.43178947554927, "learning_rate": 1.999359536581938e-06, "loss": 1.2283, "step": 6754 }, { "epoch": 0.05, "grad_norm": 4.609775491526868, "learning_rate": 1.99935934665237e-06, "loss": 1.306, "step": 6755 }, { "epoch": 0.05, "grad_norm": 4.649230195283178, "learning_rate": 1.9993591566946538e-06, "loss": 1.2241, "step": 6756 }, { "epoch": 0.05, "grad_norm": 4.370387099523893, "learning_rate": 1.9993589667087886e-06, "loss": 1.2887, "step": 6757 }, { "epoch": 0.05, "grad_norm": 5.0529457358156815, "learning_rate": 1.9993587766947752e-06, "loss": 1.3095, "step": 6758 }, { "epoch": 0.05, "grad_norm": 5.178158984358035, "learning_rate": 1.999358586652613e-06, "loss": 1.3035, "step": 6759 }, { "epoch": 0.05, "grad_norm": 4.324852952301155, "learning_rate": 1.999358396582302e-06, "loss": 1.3687, "step": 6760 }, { "epoch": 0.05, "grad_norm": 4.8875340750125815, "learning_rate": 1.9993582064838427e-06, "loss": 1.5395, "step": 6761 }, { "epoch": 0.05, "grad_norm": 5.8338921895906335, "learning_rate": 1.9993580163572345e-06, "loss": 1.3958, "step": 6762 }, { "epoch": 0.05, "grad_norm": 4.671095462187975, "learning_rate": 1.999357826202478e-06, "loss": 1.479, "step": 6763 }, { "epoch": 0.05, "grad_norm": 4.847359293413845, "learning_rate": 1.999357636019573e-06, "loss": 1.3489, "step": 6764 }, { "epoch": 0.05, "grad_norm": 4.826941852458828, "learning_rate": 1.999357445808519e-06, "loss": 1.3473, "step": 6765 }, { "epoch": 0.05, "grad_norm": 4.48197442305317, "learning_rate": 1.999357255569317e-06, "loss": 1.4244, "step": 6766 }, { "epoch": 0.05, "grad_norm": 4.7112532714020325, "learning_rate": 1.999357065301966e-06, "loss": 1.2484, "step": 6767 }, { "epoch": 0.05, "grad_norm": 6.659895178956035, "learning_rate": 1.9993568750064665e-06, "loss": 1.65, "step": 6768 }, { "epoch": 0.05, "grad_norm": 4.870670663353911, "learning_rate": 1.9993566846828185e-06, "loss": 1.5089, "step": 6769 }, { "epoch": 0.05, "grad_norm": 4.861815066844898, "learning_rate": 1.999356494331022e-06, "loss": 1.3603, "step": 6770 }, { "epoch": 0.05, "grad_norm": 4.991904188010678, "learning_rate": 1.999356303951077e-06, "loss": 1.4834, "step": 6771 }, { "epoch": 0.05, "grad_norm": 6.583693064838302, "learning_rate": 1.999356113542983e-06, "loss": 1.4503, "step": 6772 }, { "epoch": 0.05, "grad_norm": 4.788019993080197, "learning_rate": 1.9993559231067407e-06, "loss": 1.441, "step": 6773 }, { "epoch": 0.05, "grad_norm": 5.132358273316544, "learning_rate": 1.99935573264235e-06, "loss": 1.4604, "step": 6774 }, { "epoch": 0.05, "grad_norm": 5.564394271127444, "learning_rate": 1.9993555421498106e-06, "loss": 1.2205, "step": 6775 }, { "epoch": 0.05, "grad_norm": 7.219206563441031, "learning_rate": 1.999355351629123e-06, "loss": 1.3279, "step": 6776 }, { "epoch": 0.05, "grad_norm": 5.19897224787566, "learning_rate": 1.9993551610802866e-06, "loss": 1.4387, "step": 6777 }, { "epoch": 0.05, "grad_norm": 7.750746311809073, "learning_rate": 1.999354970503302e-06, "loss": 1.2267, "step": 6778 }, { "epoch": 0.05, "grad_norm": 4.316868787748949, "learning_rate": 1.9993547798981686e-06, "loss": 1.3246, "step": 6779 }, { "epoch": 0.05, "grad_norm": 4.369907535226818, "learning_rate": 1.999354589264887e-06, "loss": 1.2768, "step": 6780 }, { "epoch": 0.05, "grad_norm": 4.968674371602953, "learning_rate": 1.9993543986034566e-06, "loss": 1.469, "step": 6781 }, { "epoch": 0.05, "grad_norm": 5.847112123468328, "learning_rate": 1.9993542079138777e-06, "loss": 1.4619, "step": 6782 }, { "epoch": 0.05, "grad_norm": 4.606886165255847, "learning_rate": 1.99935401719615e-06, "loss": 1.3749, "step": 6783 }, { "epoch": 0.05, "grad_norm": 5.482123500917216, "learning_rate": 1.9993538264502745e-06, "loss": 1.3001, "step": 6784 }, { "epoch": 0.05, "grad_norm": 4.760330384158345, "learning_rate": 1.9993536356762503e-06, "loss": 1.2746, "step": 6785 }, { "epoch": 0.05, "grad_norm": 4.751904908868463, "learning_rate": 1.999353444874078e-06, "loss": 1.4656, "step": 6786 }, { "epoch": 0.05, "grad_norm": 4.666852535159014, "learning_rate": 1.9993532540437564e-06, "loss": 1.4469, "step": 6787 }, { "epoch": 0.05, "grad_norm": 4.645274427363757, "learning_rate": 1.9993530631852867e-06, "loss": 1.4806, "step": 6788 }, { "epoch": 0.05, "grad_norm": 4.189250836000196, "learning_rate": 1.999352872298669e-06, "loss": 1.4331, "step": 6789 }, { "epoch": 0.05, "eval_loss": 1.5680055618286133, "eval_runtime": 4.6606, "eval_samples_per_second": 1.931, "eval_steps_per_second": 1.073, "step": 6789 }, { "epoch": 0.05, "grad_norm": 4.422525645669675, "learning_rate": 1.999352681383902e-06, "loss": 1.2584, "step": 6790 }, { "epoch": 0.05, "grad_norm": 5.136187181924857, "learning_rate": 1.9993524904409875e-06, "loss": 1.3341, "step": 6791 }, { "epoch": 0.05, "grad_norm": 4.120382141581498, "learning_rate": 1.999352299469924e-06, "loss": 1.3204, "step": 6792 }, { "epoch": 0.05, "grad_norm": 4.476742417121338, "learning_rate": 1.999352108470712e-06, "loss": 1.3657, "step": 6793 }, { "epoch": 0.05, "grad_norm": 4.422022078537015, "learning_rate": 1.9993519174433517e-06, "loss": 1.3419, "step": 6794 }, { "epoch": 0.05, "grad_norm": 4.424003067502367, "learning_rate": 1.999351726387843e-06, "loss": 1.439, "step": 6795 }, { "epoch": 0.05, "grad_norm": 4.699584061842587, "learning_rate": 1.999351535304186e-06, "loss": 1.3611, "step": 6796 }, { "epoch": 0.05, "grad_norm": 4.056675746590429, "learning_rate": 1.9993513441923802e-06, "loss": 1.1614, "step": 6797 }, { "epoch": 0.05, "grad_norm": 4.5954958477898025, "learning_rate": 1.9993511530524267e-06, "loss": 1.4285, "step": 6798 }, { "epoch": 0.05, "grad_norm": 5.6939243142476546, "learning_rate": 1.999350961884324e-06, "loss": 1.3497, "step": 6799 }, { "epoch": 0.05, "grad_norm": 4.300882995440221, "learning_rate": 1.9993507706880735e-06, "loss": 1.2112, "step": 6800 }, { "epoch": 0.05, "grad_norm": 4.759095128554072, "learning_rate": 1.9993505794636746e-06, "loss": 1.3324, "step": 6801 }, { "epoch": 0.05, "grad_norm": 4.541870415791323, "learning_rate": 1.999350388211127e-06, "loss": 1.3426, "step": 6802 }, { "epoch": 0.05, "grad_norm": 4.74074298737864, "learning_rate": 1.999350196930431e-06, "loss": 1.4334, "step": 6803 }, { "epoch": 0.05, "grad_norm": 5.290814235308155, "learning_rate": 1.9993500056215867e-06, "loss": 1.3596, "step": 6804 }, { "epoch": 0.05, "grad_norm": 4.65067797609521, "learning_rate": 1.9993498142845943e-06, "loss": 1.5196, "step": 6805 }, { "epoch": 0.05, "grad_norm": 5.709008044350683, "learning_rate": 1.9993496229194533e-06, "loss": 1.629, "step": 6806 }, { "epoch": 0.05, "grad_norm": 4.51087458061536, "learning_rate": 1.999349431526164e-06, "loss": 1.3164, "step": 6807 }, { "epoch": 0.05, "grad_norm": 4.6646851692360976, "learning_rate": 1.9993492401047263e-06, "loss": 1.478, "step": 6808 }, { "epoch": 0.05, "grad_norm": 4.866150174345354, "learning_rate": 1.9993490486551403e-06, "loss": 1.4925, "step": 6809 }, { "epoch": 0.05, "grad_norm": 4.393808926247055, "learning_rate": 1.999348857177406e-06, "loss": 1.4241, "step": 6810 }, { "epoch": 0.05, "grad_norm": 4.902806652534005, "learning_rate": 1.9993486656715234e-06, "loss": 1.3867, "step": 6811 }, { "epoch": 0.05, "grad_norm": 5.879795534899488, "learning_rate": 1.9993484741374924e-06, "loss": 1.4297, "step": 6812 }, { "epoch": 0.05, "grad_norm": 5.885920687924517, "learning_rate": 1.9993482825753133e-06, "loss": 1.3667, "step": 6813 }, { "epoch": 0.05, "grad_norm": 4.973942111531444, "learning_rate": 1.9993480909849856e-06, "loss": 1.2795, "step": 6814 }, { "epoch": 0.05, "grad_norm": 5.613497463734784, "learning_rate": 1.9993478993665093e-06, "loss": 1.5971, "step": 6815 }, { "epoch": 0.05, "grad_norm": 4.133732404917408, "learning_rate": 1.9993477077198853e-06, "loss": 1.2271, "step": 6816 }, { "epoch": 0.05, "grad_norm": 5.665315728538176, "learning_rate": 1.9993475160451126e-06, "loss": 1.3965, "step": 6817 }, { "epoch": 0.05, "grad_norm": 4.248638617101129, "learning_rate": 1.9993473243421918e-06, "loss": 1.3748, "step": 6818 }, { "epoch": 0.05, "grad_norm": 4.505218041052679, "learning_rate": 1.9993471326111228e-06, "loss": 1.458, "step": 6819 }, { "epoch": 0.05, "grad_norm": 4.906229574701761, "learning_rate": 1.999346940851905e-06, "loss": 1.2691, "step": 6820 }, { "epoch": 0.05, "grad_norm": 4.528227047114751, "learning_rate": 1.99934674906454e-06, "loss": 1.2568, "step": 6821 }, { "epoch": 0.05, "grad_norm": 4.995258189577926, "learning_rate": 1.9993465572490254e-06, "loss": 1.3504, "step": 6822 }, { "epoch": 0.05, "grad_norm": 5.0930601669871844, "learning_rate": 1.9993463654053633e-06, "loss": 1.6452, "step": 6823 }, { "epoch": 0.05, "grad_norm": 5.071696645561831, "learning_rate": 1.999346173533553e-06, "loss": 1.5685, "step": 6824 }, { "epoch": 0.05, "grad_norm": 4.572076745536653, "learning_rate": 1.999345981633594e-06, "loss": 1.4568, "step": 6825 }, { "epoch": 0.05, "grad_norm": 5.0786621892967325, "learning_rate": 1.999345789705487e-06, "loss": 1.419, "step": 6826 }, { "epoch": 0.05, "grad_norm": 4.669340479173038, "learning_rate": 1.9993455977492318e-06, "loss": 1.4064, "step": 6827 }, { "epoch": 0.05, "grad_norm": 5.678686993829397, "learning_rate": 1.9993454057648283e-06, "loss": 1.354, "step": 6828 }, { "epoch": 0.05, "grad_norm": 6.435562980583757, "learning_rate": 1.9993452137522767e-06, "loss": 1.2744, "step": 6829 }, { "epoch": 0.05, "grad_norm": 4.568405034845767, "learning_rate": 1.9993450217115765e-06, "loss": 1.3724, "step": 6830 }, { "epoch": 0.05, "grad_norm": 3.9964672561071484, "learning_rate": 1.9993448296427285e-06, "loss": 1.3108, "step": 6831 }, { "epoch": 0.05, "grad_norm": 4.641773458028604, "learning_rate": 1.999344637545732e-06, "loss": 1.4106, "step": 6832 }, { "epoch": 0.05, "grad_norm": 5.087940261592281, "learning_rate": 1.9993444454205873e-06, "loss": 1.4682, "step": 6833 }, { "epoch": 0.05, "grad_norm": 4.546386764938594, "learning_rate": 1.9993442532672944e-06, "loss": 1.3314, "step": 6834 }, { "epoch": 0.05, "grad_norm": 4.603797612749039, "learning_rate": 1.9993440610858533e-06, "loss": 1.2827, "step": 6835 }, { "epoch": 0.05, "grad_norm": 4.479258916036704, "learning_rate": 1.9993438688762644e-06, "loss": 1.4022, "step": 6836 }, { "epoch": 0.05, "grad_norm": 4.919903156358536, "learning_rate": 1.9993436766385266e-06, "loss": 1.3488, "step": 6837 }, { "epoch": 0.05, "grad_norm": 4.887590078133437, "learning_rate": 1.999343484372641e-06, "loss": 1.3933, "step": 6838 }, { "epoch": 0.05, "grad_norm": 5.064321876851956, "learning_rate": 1.9993432920786076e-06, "loss": 1.3675, "step": 6839 }, { "epoch": 0.05, "grad_norm": 4.450035338692157, "learning_rate": 1.9993430997564252e-06, "loss": 1.3144, "step": 6840 }, { "epoch": 0.05, "grad_norm": 6.031476254527788, "learning_rate": 1.999342907406095e-06, "loss": 1.5278, "step": 6841 }, { "epoch": 0.05, "grad_norm": 5.097726411671194, "learning_rate": 1.9993427150276168e-06, "loss": 1.2777, "step": 6842 }, { "epoch": 0.05, "grad_norm": 4.905114226724136, "learning_rate": 1.9993425226209903e-06, "loss": 1.2456, "step": 6843 }, { "epoch": 0.05, "grad_norm": 6.425934118133389, "learning_rate": 1.9993423301862156e-06, "loss": 1.5056, "step": 6844 }, { "epoch": 0.05, "grad_norm": 4.9857232481345575, "learning_rate": 1.9993421377232932e-06, "loss": 1.2308, "step": 6845 }, { "epoch": 0.05, "grad_norm": 4.8197316210835135, "learning_rate": 1.999341945232222e-06, "loss": 1.5689, "step": 6846 }, { "epoch": 0.05, "grad_norm": 4.405409394671111, "learning_rate": 1.999341752713003e-06, "loss": 1.3169, "step": 6847 }, { "epoch": 0.05, "grad_norm": 6.019859507589784, "learning_rate": 1.9993415601656357e-06, "loss": 1.5227, "step": 6848 }, { "epoch": 0.05, "grad_norm": 5.4594911090968266, "learning_rate": 1.9993413675901205e-06, "loss": 1.3446, "step": 6849 }, { "epoch": 0.05, "grad_norm": 4.580882245612422, "learning_rate": 1.9993411749864572e-06, "loss": 1.5252, "step": 6850 }, { "epoch": 0.05, "grad_norm": 4.136211089855359, "learning_rate": 1.9993409823546458e-06, "loss": 1.2152, "step": 6851 }, { "epoch": 0.05, "grad_norm": 5.618576081915454, "learning_rate": 1.9993407896946857e-06, "loss": 1.5029, "step": 6852 }, { "epoch": 0.05, "grad_norm": 4.311446578436019, "learning_rate": 1.999340597006578e-06, "loss": 1.2554, "step": 6853 }, { "epoch": 0.05, "grad_norm": 5.752186161665894, "learning_rate": 1.9993404042903223e-06, "loss": 1.4682, "step": 6854 }, { "epoch": 0.05, "grad_norm": 5.466796146071113, "learning_rate": 1.999340211545918e-06, "loss": 1.178, "step": 6855 }, { "epoch": 0.05, "grad_norm": 4.48702870115137, "learning_rate": 1.999340018773366e-06, "loss": 1.3704, "step": 6856 }, { "epoch": 0.05, "grad_norm": 4.713826697032225, "learning_rate": 1.9993398259726657e-06, "loss": 1.4762, "step": 6857 }, { "epoch": 0.05, "grad_norm": 4.690339510790461, "learning_rate": 1.999339633143818e-06, "loss": 1.4056, "step": 6858 }, { "epoch": 0.05, "grad_norm": 4.847237358682079, "learning_rate": 1.9993394402868214e-06, "loss": 1.3334, "step": 6859 }, { "epoch": 0.05, "grad_norm": 4.775039655417167, "learning_rate": 1.9993392474016772e-06, "loss": 1.4907, "step": 6860 }, { "epoch": 0.05, "grad_norm": 4.855679216107115, "learning_rate": 1.9993390544883844e-06, "loss": 1.2684, "step": 6861 }, { "epoch": 0.05, "grad_norm": 4.223365241119003, "learning_rate": 1.9993388615469443e-06, "loss": 1.304, "step": 6862 }, { "epoch": 0.05, "eval_loss": 1.575598955154419, "eval_runtime": 4.6402, "eval_samples_per_second": 1.94, "eval_steps_per_second": 1.078, "step": 6862 }, { "epoch": 0.05, "grad_norm": 5.070239909566293, "learning_rate": 1.9993386685773556e-06, "loss": 1.186, "step": 6863 }, { "epoch": 0.05, "grad_norm": 4.0594749047991705, "learning_rate": 1.999338475579619e-06, "loss": 1.2335, "step": 6864 }, { "epoch": 0.05, "grad_norm": 4.791100252051131, "learning_rate": 1.9993382825537345e-06, "loss": 1.2713, "step": 6865 }, { "epoch": 0.05, "grad_norm": 6.503726091763445, "learning_rate": 1.9993380894997016e-06, "loss": 1.3833, "step": 6866 }, { "epoch": 0.05, "grad_norm": 5.280141948692929, "learning_rate": 1.999337896417521e-06, "loss": 1.4877, "step": 6867 }, { "epoch": 0.05, "grad_norm": 7.844652304567667, "learning_rate": 1.9993377033071923e-06, "loss": 1.3832, "step": 6868 }, { "epoch": 0.05, "grad_norm": 4.547420021866008, "learning_rate": 1.9993375101687154e-06, "loss": 1.4303, "step": 6869 }, { "epoch": 0.05, "grad_norm": 4.635850000080861, "learning_rate": 1.9993373170020907e-06, "loss": 1.2988, "step": 6870 }, { "epoch": 0.05, "grad_norm": 4.281029348555836, "learning_rate": 1.999337123807318e-06, "loss": 1.3039, "step": 6871 }, { "epoch": 0.05, "grad_norm": 4.502956579167798, "learning_rate": 1.9993369305843972e-06, "loss": 1.3757, "step": 6872 }, { "epoch": 0.05, "grad_norm": 4.77796851557104, "learning_rate": 1.9993367373333285e-06, "loss": 1.3773, "step": 6873 }, { "epoch": 0.05, "grad_norm": 4.680370836553787, "learning_rate": 1.9993365440541115e-06, "loss": 1.4397, "step": 6874 }, { "epoch": 0.05, "grad_norm": 4.942965078238444, "learning_rate": 1.9993363507467472e-06, "loss": 1.4949, "step": 6875 }, { "epoch": 0.05, "grad_norm": 4.643273004423091, "learning_rate": 1.9993361574112343e-06, "loss": 1.184, "step": 6876 }, { "epoch": 0.05, "grad_norm": 4.3609216572447576, "learning_rate": 1.9993359640475737e-06, "loss": 1.3088, "step": 6877 }, { "epoch": 0.05, "grad_norm": 4.787339816164571, "learning_rate": 1.999335770655765e-06, "loss": 1.199, "step": 6878 }, { "epoch": 0.05, "grad_norm": 4.894417482716107, "learning_rate": 1.9993355772358083e-06, "loss": 1.5335, "step": 6879 }, { "epoch": 0.05, "grad_norm": 4.350974372047054, "learning_rate": 1.9993353837877036e-06, "loss": 1.1965, "step": 6880 }, { "epoch": 0.05, "grad_norm": 4.407920338766403, "learning_rate": 1.999335190311451e-06, "loss": 1.3512, "step": 6881 }, { "epoch": 0.05, "grad_norm": 4.379281732056031, "learning_rate": 1.999334996807051e-06, "loss": 1.4875, "step": 6882 }, { "epoch": 0.05, "grad_norm": 5.551702761695887, "learning_rate": 1.999334803274502e-06, "loss": 1.3129, "step": 6883 }, { "epoch": 0.05, "grad_norm": 4.20458905020584, "learning_rate": 1.999334609713806e-06, "loss": 1.3311, "step": 6884 }, { "epoch": 0.05, "grad_norm": 4.470326304268248, "learning_rate": 1.9993344161249615e-06, "loss": 1.5145, "step": 6885 }, { "epoch": 0.05, "grad_norm": 4.495500117715615, "learning_rate": 1.9993342225079694e-06, "loss": 1.3477, "step": 6886 }, { "epoch": 0.05, "grad_norm": 4.811239812311492, "learning_rate": 1.999334028862829e-06, "loss": 1.2781, "step": 6887 }, { "epoch": 0.05, "grad_norm": 4.972958663622191, "learning_rate": 1.999333835189541e-06, "loss": 1.5122, "step": 6888 }, { "epoch": 0.05, "grad_norm": 4.568720415992226, "learning_rate": 1.999333641488105e-06, "loss": 1.299, "step": 6889 }, { "epoch": 0.05, "grad_norm": 4.746186391728743, "learning_rate": 1.9993334477585213e-06, "loss": 1.2392, "step": 6890 }, { "epoch": 0.05, "grad_norm": 5.111310988137005, "learning_rate": 1.9993332540007892e-06, "loss": 1.3255, "step": 6891 }, { "epoch": 0.05, "grad_norm": 4.345039811533563, "learning_rate": 1.9993330602149098e-06, "loss": 1.3531, "step": 6892 }, { "epoch": 0.05, "grad_norm": 5.800113486998973, "learning_rate": 1.999332866400882e-06, "loss": 1.3932, "step": 6893 }, { "epoch": 0.05, "grad_norm": 4.5390260387263455, "learning_rate": 1.9993326725587067e-06, "loss": 1.4972, "step": 6894 }, { "epoch": 0.05, "grad_norm": 4.642919651205738, "learning_rate": 1.999332478688383e-06, "loss": 1.5056, "step": 6895 }, { "epoch": 0.05, "grad_norm": 4.211086047842109, "learning_rate": 1.9993322847899123e-06, "loss": 1.1944, "step": 6896 }, { "epoch": 0.05, "grad_norm": 5.0373630346256215, "learning_rate": 1.9993320908632932e-06, "loss": 1.426, "step": 6897 }, { "epoch": 0.05, "grad_norm": 4.584760026750547, "learning_rate": 1.9993318969085264e-06, "loss": 1.4059, "step": 6898 }, { "epoch": 0.05, "grad_norm": 6.506308801211316, "learning_rate": 1.9993317029256114e-06, "loss": 1.3975, "step": 6899 }, { "epoch": 0.05, "grad_norm": 4.314436521861751, "learning_rate": 1.9993315089145487e-06, "loss": 1.2356, "step": 6900 }, { "epoch": 0.05, "grad_norm": 4.826954371528216, "learning_rate": 1.999331314875338e-06, "loss": 1.4411, "step": 6901 }, { "epoch": 0.05, "grad_norm": 4.6030328422897275, "learning_rate": 1.99933112080798e-06, "loss": 1.3896, "step": 6902 }, { "epoch": 0.05, "grad_norm": 4.771397256157349, "learning_rate": 1.999330926712474e-06, "loss": 1.2812, "step": 6903 }, { "epoch": 0.05, "grad_norm": 4.284144775411557, "learning_rate": 1.9993307325888198e-06, "loss": 1.3495, "step": 6904 }, { "epoch": 0.05, "grad_norm": 4.228034221171642, "learning_rate": 1.999330538437018e-06, "loss": 1.2876, "step": 6905 }, { "epoch": 0.05, "grad_norm": 5.392237483360412, "learning_rate": 1.999330344257068e-06, "loss": 1.1658, "step": 6906 }, { "epoch": 0.05, "grad_norm": 4.64437426987661, "learning_rate": 1.999330150048971e-06, "loss": 1.4072, "step": 6907 }, { "epoch": 0.05, "grad_norm": 4.416164096008154, "learning_rate": 1.9993299558127256e-06, "loss": 1.3356, "step": 6908 }, { "epoch": 0.05, "grad_norm": 4.7859961924199625, "learning_rate": 1.9993297615483326e-06, "loss": 1.3603, "step": 6909 }, { "epoch": 0.05, "grad_norm": 4.393422363730001, "learning_rate": 1.999329567255792e-06, "loss": 1.2703, "step": 6910 }, { "epoch": 0.05, "grad_norm": 4.670508487457412, "learning_rate": 1.999329372935103e-06, "loss": 1.295, "step": 6911 }, { "epoch": 0.05, "grad_norm": 4.579141587895197, "learning_rate": 1.999329178586267e-06, "loss": 1.3891, "step": 6912 }, { "epoch": 0.05, "grad_norm": 5.26539877130667, "learning_rate": 1.9993289842092825e-06, "loss": 1.3487, "step": 6913 }, { "epoch": 0.05, "grad_norm": 4.977386122403641, "learning_rate": 1.999328789804151e-06, "loss": 1.2892, "step": 6914 }, { "epoch": 0.05, "grad_norm": 4.41454309865534, "learning_rate": 1.999328595370871e-06, "loss": 1.3939, "step": 6915 }, { "epoch": 0.05, "grad_norm": 4.346010598292651, "learning_rate": 1.9993284009094434e-06, "loss": 1.2757, "step": 6916 }, { "epoch": 0.05, "grad_norm": 4.768778563565909, "learning_rate": 1.9993282064198684e-06, "loss": 1.5186, "step": 6917 }, { "epoch": 0.05, "grad_norm": 4.755652958487592, "learning_rate": 1.9993280119021453e-06, "loss": 1.4002, "step": 6918 }, { "epoch": 0.05, "grad_norm": 4.331344225156601, "learning_rate": 1.9993278173562744e-06, "loss": 1.3672, "step": 6919 }, { "epoch": 0.05, "grad_norm": 4.653920498560621, "learning_rate": 1.9993276227822558e-06, "loss": 1.3141, "step": 6920 }, { "epoch": 0.05, "grad_norm": 5.362738733631665, "learning_rate": 1.99932742818009e-06, "loss": 1.5203, "step": 6921 }, { "epoch": 0.05, "grad_norm": 4.581591377708871, "learning_rate": 1.999327233549776e-06, "loss": 1.4938, "step": 6922 }, { "epoch": 0.05, "grad_norm": 4.521119030646661, "learning_rate": 1.9993270388913142e-06, "loss": 1.3383, "step": 6923 }, { "epoch": 0.05, "grad_norm": 4.4477436133485515, "learning_rate": 1.9993268442047046e-06, "loss": 1.4124, "step": 6924 }, { "epoch": 0.05, "grad_norm": 4.219439004089865, "learning_rate": 1.9993266494899476e-06, "loss": 1.2864, "step": 6925 }, { "epoch": 0.05, "grad_norm": 5.413412450535536, "learning_rate": 1.999326454747043e-06, "loss": 1.345, "step": 6926 }, { "epoch": 0.05, "grad_norm": 5.395495498549822, "learning_rate": 1.9993262599759905e-06, "loss": 1.6163, "step": 6927 }, { "epoch": 0.05, "grad_norm": 4.461204382952007, "learning_rate": 1.9993260651767902e-06, "loss": 1.2796, "step": 6928 }, { "epoch": 0.05, "grad_norm": 4.70158661483042, "learning_rate": 1.9993258703494423e-06, "loss": 1.4023, "step": 6929 }, { "epoch": 0.05, "grad_norm": 4.657124784576091, "learning_rate": 1.9993256754939466e-06, "loss": 1.5306, "step": 6930 }, { "epoch": 0.05, "grad_norm": 4.319084152081989, "learning_rate": 1.999325480610303e-06, "loss": 1.2713, "step": 6931 }, { "epoch": 0.05, "grad_norm": 4.8836721096870495, "learning_rate": 1.9993252856985123e-06, "loss": 1.4842, "step": 6932 }, { "epoch": 0.05, "grad_norm": 5.003329628999651, "learning_rate": 1.9993250907585737e-06, "loss": 1.5485, "step": 6933 }, { "epoch": 0.05, "grad_norm": 4.466184810791883, "learning_rate": 1.9993248957904874e-06, "loss": 1.3609, "step": 6934 }, { "epoch": 0.05, "grad_norm": 4.390481679221492, "learning_rate": 1.9993247007942534e-06, "loss": 1.4082, "step": 6935 }, { "epoch": 0.05, "eval_loss": 1.5723286867141724, "eval_runtime": 4.6205, "eval_samples_per_second": 1.948, "eval_steps_per_second": 1.082, "step": 6935 }, { "epoch": 0.05, "grad_norm": 4.977454918176336, "learning_rate": 1.999324505769872e-06, "loss": 1.6005, "step": 6936 }, { "epoch": 0.05, "grad_norm": 4.104359444931619, "learning_rate": 1.9993243107173424e-06, "loss": 1.1847, "step": 6937 }, { "epoch": 0.05, "grad_norm": 5.183134978438038, "learning_rate": 1.999324115636666e-06, "loss": 1.2699, "step": 6938 }, { "epoch": 0.05, "grad_norm": 4.335590800223618, "learning_rate": 1.9993239205278413e-06, "loss": 1.2705, "step": 6939 }, { "epoch": 0.05, "grad_norm": 5.184095863148774, "learning_rate": 1.999323725390869e-06, "loss": 1.3175, "step": 6940 }, { "epoch": 0.05, "grad_norm": 5.147678536143057, "learning_rate": 1.999323530225749e-06, "loss": 1.4798, "step": 6941 }, { "epoch": 0.05, "grad_norm": 4.6373518780410405, "learning_rate": 1.999323335032482e-06, "loss": 1.3143, "step": 6942 }, { "epoch": 0.05, "grad_norm": 4.2709168255312795, "learning_rate": 1.999323139811067e-06, "loss": 1.2856, "step": 6943 }, { "epoch": 0.05, "grad_norm": 4.311726534025149, "learning_rate": 1.9993229445615044e-06, "loss": 1.2459, "step": 6944 }, { "epoch": 0.05, "grad_norm": 4.179776281619437, "learning_rate": 1.999322749283794e-06, "loss": 1.1715, "step": 6945 }, { "epoch": 0.05, "grad_norm": 5.495318228734055, "learning_rate": 1.999322553977936e-06, "loss": 1.2555, "step": 6946 }, { "epoch": 0.05, "grad_norm": 5.971979955034916, "learning_rate": 1.9993223586439306e-06, "loss": 1.4059, "step": 6947 }, { "epoch": 0.05, "grad_norm": 5.005923638743876, "learning_rate": 1.999322163281778e-06, "loss": 1.4792, "step": 6948 }, { "epoch": 0.05, "grad_norm": 4.7602400190770435, "learning_rate": 1.9993219678914774e-06, "loss": 1.4684, "step": 6949 }, { "epoch": 0.05, "grad_norm": 4.492886879913653, "learning_rate": 1.999321772473029e-06, "loss": 1.3479, "step": 6950 }, { "epoch": 0.05, "grad_norm": 4.82438216668573, "learning_rate": 1.9993215770264332e-06, "loss": 1.4402, "step": 6951 }, { "epoch": 0.05, "grad_norm": 4.153286125585261, "learning_rate": 1.9993213815516904e-06, "loss": 1.3149, "step": 6952 }, { "epoch": 0.05, "grad_norm": 4.610237570964904, "learning_rate": 1.9993211860487993e-06, "loss": 1.3137, "step": 6953 }, { "epoch": 0.05, "grad_norm": 4.469313429707944, "learning_rate": 1.999320990517761e-06, "loss": 1.3516, "step": 6954 }, { "epoch": 0.05, "grad_norm": 4.300017442584707, "learning_rate": 1.999320794958575e-06, "loss": 1.4162, "step": 6955 }, { "epoch": 0.05, "grad_norm": 4.655434474147025, "learning_rate": 1.9993205993712413e-06, "loss": 1.4391, "step": 6956 }, { "epoch": 0.05, "grad_norm": 8.375470359607021, "learning_rate": 1.99932040375576e-06, "loss": 1.3658, "step": 6957 }, { "epoch": 0.05, "grad_norm": 4.715506315814977, "learning_rate": 1.999320208112132e-06, "loss": 1.5253, "step": 6958 }, { "epoch": 0.05, "grad_norm": 4.298702932777346, "learning_rate": 1.9993200124403557e-06, "loss": 1.2558, "step": 6959 }, { "epoch": 0.05, "grad_norm": 4.318303073674942, "learning_rate": 1.999319816740432e-06, "loss": 1.3611, "step": 6960 }, { "epoch": 0.05, "grad_norm": 5.423364926780558, "learning_rate": 1.9993196210123607e-06, "loss": 1.4353, "step": 6961 }, { "epoch": 0.05, "grad_norm": 4.258926210240222, "learning_rate": 1.9993194252561425e-06, "loss": 1.4383, "step": 6962 }, { "epoch": 0.05, "grad_norm": 4.093338517619337, "learning_rate": 1.9993192294717764e-06, "loss": 1.0727, "step": 6963 }, { "epoch": 0.05, "grad_norm": 4.0655292860980845, "learning_rate": 1.9993190336592627e-06, "loss": 1.2041, "step": 6964 }, { "epoch": 0.05, "grad_norm": 5.599705641006607, "learning_rate": 1.9993188378186015e-06, "loss": 1.372, "step": 6965 }, { "epoch": 0.05, "grad_norm": 4.72024774357659, "learning_rate": 1.999318641949793e-06, "loss": 1.31, "step": 6966 }, { "epoch": 0.05, "grad_norm": 4.226626516280397, "learning_rate": 1.999318446052837e-06, "loss": 1.4052, "step": 6967 }, { "epoch": 0.05, "grad_norm": 4.2693569618817575, "learning_rate": 1.9993182501277334e-06, "loss": 1.3971, "step": 6968 }, { "epoch": 0.05, "grad_norm": 5.933767142266201, "learning_rate": 1.999318054174482e-06, "loss": 1.1795, "step": 6969 }, { "epoch": 0.05, "grad_norm": 4.878579386086101, "learning_rate": 1.9993178581930835e-06, "loss": 1.1806, "step": 6970 }, { "epoch": 0.05, "grad_norm": 4.458773886496264, "learning_rate": 1.9993176621835376e-06, "loss": 1.3968, "step": 6971 }, { "epoch": 0.05, "grad_norm": 4.76623802011029, "learning_rate": 1.999317466145844e-06, "loss": 1.3787, "step": 6972 }, { "epoch": 0.05, "grad_norm": 4.535789557346703, "learning_rate": 1.9993172700800034e-06, "loss": 1.305, "step": 6973 }, { "epoch": 0.05, "grad_norm": 4.897699384646505, "learning_rate": 1.999317073986015e-06, "loss": 1.4912, "step": 6974 }, { "epoch": 0.05, "grad_norm": 4.308973146345478, "learning_rate": 1.999316877863879e-06, "loss": 1.3261, "step": 6975 }, { "epoch": 0.05, "grad_norm": 4.658121187619888, "learning_rate": 1.999316681713596e-06, "loss": 1.25, "step": 6976 }, { "epoch": 0.05, "grad_norm": 4.1919895784019765, "learning_rate": 1.999316485535165e-06, "loss": 1.3216, "step": 6977 }, { "epoch": 0.05, "grad_norm": 5.284225591364939, "learning_rate": 1.999316289328587e-06, "loss": 1.4093, "step": 6978 }, { "epoch": 0.05, "grad_norm": 4.348208872393012, "learning_rate": 1.9993160930938617e-06, "loss": 1.4178, "step": 6979 }, { "epoch": 0.05, "grad_norm": 4.572514282630385, "learning_rate": 1.999315896830989e-06, "loss": 1.4744, "step": 6980 }, { "epoch": 0.05, "grad_norm": 4.418349728422921, "learning_rate": 1.999315700539969e-06, "loss": 1.2933, "step": 6981 }, { "epoch": 0.05, "grad_norm": 4.558021339233919, "learning_rate": 1.999315504220801e-06, "loss": 1.4606, "step": 6982 }, { "epoch": 0.05, "grad_norm": 4.24562868661408, "learning_rate": 1.999315307873486e-06, "loss": 1.2966, "step": 6983 }, { "epoch": 0.05, "grad_norm": 5.271760029283781, "learning_rate": 1.9993151114980235e-06, "loss": 1.3675, "step": 6984 }, { "epoch": 0.05, "grad_norm": 4.542048971598718, "learning_rate": 1.9993149150944133e-06, "loss": 1.4162, "step": 6985 }, { "epoch": 0.05, "grad_norm": 5.480258515122354, "learning_rate": 1.9993147186626566e-06, "loss": 1.6395, "step": 6986 }, { "epoch": 0.05, "grad_norm": 4.650269922396915, "learning_rate": 1.9993145222027518e-06, "loss": 1.3323, "step": 6987 }, { "epoch": 0.05, "grad_norm": 4.438414509566363, "learning_rate": 1.9993143257147e-06, "loss": 1.4809, "step": 6988 }, { "epoch": 0.05, "grad_norm": 5.294656213773487, "learning_rate": 1.999314129198501e-06, "loss": 1.2105, "step": 6989 }, { "epoch": 0.05, "grad_norm": 4.2334458657036755, "learning_rate": 1.999313932654154e-06, "loss": 1.2839, "step": 6990 }, { "epoch": 0.05, "grad_norm": 4.624831745587173, "learning_rate": 1.99931373608166e-06, "loss": 1.3657, "step": 6991 }, { "epoch": 0.05, "grad_norm": 4.488528504649194, "learning_rate": 1.9993135394810187e-06, "loss": 1.2834, "step": 6992 }, { "epoch": 0.05, "grad_norm": 4.2783500728366075, "learning_rate": 1.99931334285223e-06, "loss": 1.3903, "step": 6993 }, { "epoch": 0.05, "grad_norm": 4.452222299914063, "learning_rate": 1.9993131461952937e-06, "loss": 1.3414, "step": 6994 }, { "epoch": 0.05, "grad_norm": 4.287603379438306, "learning_rate": 1.9993129495102106e-06, "loss": 1.371, "step": 6995 }, { "epoch": 0.05, "grad_norm": 5.286737903177311, "learning_rate": 1.99931275279698e-06, "loss": 1.3009, "step": 6996 }, { "epoch": 0.05, "grad_norm": 4.4597821183225825, "learning_rate": 1.9993125560556017e-06, "loss": 1.3868, "step": 6997 }, { "epoch": 0.05, "grad_norm": 4.861821131174677, "learning_rate": 1.9993123592860766e-06, "loss": 1.2148, "step": 6998 }, { "epoch": 0.05, "grad_norm": 4.7383936760760905, "learning_rate": 1.9993121624884038e-06, "loss": 1.4344, "step": 6999 }, { "epoch": 0.05, "grad_norm": 4.395521332019862, "learning_rate": 1.999311965662584e-06, "loss": 1.4098, "step": 7000 }, { "epoch": 0.05, "grad_norm": 5.67210447379715, "learning_rate": 1.9993117688086166e-06, "loss": 1.3977, "step": 7001 }, { "epoch": 0.05, "grad_norm": 4.486717587274083, "learning_rate": 1.9993115719265022e-06, "loss": 1.4141, "step": 7002 }, { "epoch": 0.05, "grad_norm": 6.388080977880258, "learning_rate": 1.99931137501624e-06, "loss": 1.3399, "step": 7003 }, { "epoch": 0.05, "grad_norm": 4.354846205868966, "learning_rate": 1.9993111780778315e-06, "loss": 1.2439, "step": 7004 }, { "epoch": 0.05, "grad_norm": 4.420176344788217, "learning_rate": 1.999310981111275e-06, "loss": 1.3686, "step": 7005 }, { "epoch": 0.05, "grad_norm": 4.298808459950213, "learning_rate": 1.9993107841165714e-06, "loss": 1.2517, "step": 7006 }, { "epoch": 0.05, "grad_norm": 4.5239589270697556, "learning_rate": 1.9993105870937204e-06, "loss": 1.2818, "step": 7007 }, { "epoch": 0.05, "grad_norm": 5.820376038421986, "learning_rate": 1.9993103900427225e-06, "loss": 1.4665, "step": 7008 }, { "epoch": 0.05, "eval_loss": 1.5669991970062256, "eval_runtime": 4.6273, "eval_samples_per_second": 1.945, "eval_steps_per_second": 1.081, "step": 7008 }, { "epoch": 0.05, "grad_norm": 6.568714723975113, "learning_rate": 1.999310192963577e-06, "loss": 1.415, "step": 7009 }, { "epoch": 0.05, "grad_norm": 4.258802195347123, "learning_rate": 1.9993099958562843e-06, "loss": 1.4024, "step": 7010 }, { "epoch": 0.05, "grad_norm": 4.346309364819591, "learning_rate": 1.9993097987208448e-06, "loss": 1.4376, "step": 7011 }, { "epoch": 0.05, "grad_norm": 4.467535077062481, "learning_rate": 1.9993096015572575e-06, "loss": 1.5115, "step": 7012 }, { "epoch": 0.05, "grad_norm": 16.89344498572743, "learning_rate": 1.9993094043655234e-06, "loss": 1.2977, "step": 7013 }, { "epoch": 0.05, "grad_norm": 5.5596796603582135, "learning_rate": 1.9993092071456415e-06, "loss": 1.4269, "step": 7014 }, { "epoch": 0.05, "grad_norm": 4.227228057518389, "learning_rate": 1.9993090098976127e-06, "loss": 1.2534, "step": 7015 }, { "epoch": 0.05, "grad_norm": 4.3349168448345825, "learning_rate": 1.999308812621437e-06, "loss": 1.3639, "step": 7016 }, { "epoch": 0.05, "grad_norm": 6.11591365366573, "learning_rate": 1.9993086153171136e-06, "loss": 1.3397, "step": 7017 }, { "epoch": 0.05, "grad_norm": 4.831188936060776, "learning_rate": 1.999308417984643e-06, "loss": 1.3551, "step": 7018 }, { "epoch": 0.05, "grad_norm": 9.997520569140493, "learning_rate": 1.9993082206240255e-06, "loss": 1.4289, "step": 7019 }, { "epoch": 0.05, "grad_norm": 6.293562371846127, "learning_rate": 1.9993080232352613e-06, "loss": 1.4367, "step": 7020 }, { "epoch": 0.05, "grad_norm": 5.3404875449306495, "learning_rate": 1.999307825818349e-06, "loss": 1.3398, "step": 7021 }, { "epoch": 0.05, "grad_norm": 4.420242231038565, "learning_rate": 1.99930762837329e-06, "loss": 1.4361, "step": 7022 }, { "epoch": 0.05, "grad_norm": 4.751971261982712, "learning_rate": 1.999307430900084e-06, "loss": 1.3339, "step": 7023 }, { "epoch": 0.05, "grad_norm": 4.68022717845886, "learning_rate": 1.9993072333987305e-06, "loss": 1.4873, "step": 7024 }, { "epoch": 0.05, "grad_norm": 4.023650995709581, "learning_rate": 1.9993070358692297e-06, "loss": 1.234, "step": 7025 }, { "epoch": 0.05, "grad_norm": 4.6206935764357, "learning_rate": 1.999306838311582e-06, "loss": 1.3175, "step": 7026 }, { "epoch": 0.05, "grad_norm": 4.23570349032178, "learning_rate": 1.999306640725787e-06, "loss": 1.2568, "step": 7027 }, { "epoch": 0.05, "grad_norm": 4.360481046170506, "learning_rate": 1.999306443111845e-06, "loss": 1.2818, "step": 7028 }, { "epoch": 0.05, "grad_norm": 4.375563225425501, "learning_rate": 1.9993062454697557e-06, "loss": 1.2929, "step": 7029 }, { "epoch": 0.05, "grad_norm": 4.628504922104216, "learning_rate": 1.9993060477995196e-06, "loss": 1.3326, "step": 7030 }, { "epoch": 0.05, "grad_norm": 4.575993452326688, "learning_rate": 1.999305850101136e-06, "loss": 1.2761, "step": 7031 }, { "epoch": 0.05, "grad_norm": 6.72559626643048, "learning_rate": 1.9993056523746056e-06, "loss": 1.3921, "step": 7032 }, { "epoch": 0.05, "grad_norm": 4.416715262293596, "learning_rate": 1.999305454619928e-06, "loss": 1.5138, "step": 7033 }, { "epoch": 0.05, "grad_norm": 4.690092663341355, "learning_rate": 1.999305256837103e-06, "loss": 1.4576, "step": 7034 }, { "epoch": 0.05, "grad_norm": 4.607408817445301, "learning_rate": 1.9993050590261313e-06, "loss": 1.415, "step": 7035 }, { "epoch": 0.05, "grad_norm": 4.150485453499374, "learning_rate": 1.999304861187012e-06, "loss": 1.1881, "step": 7036 }, { "epoch": 0.05, "grad_norm": 4.878865150190498, "learning_rate": 1.9993046633197458e-06, "loss": 1.4202, "step": 7037 }, { "epoch": 0.05, "grad_norm": 4.637497369315281, "learning_rate": 1.9993044654243327e-06, "loss": 1.3978, "step": 7038 }, { "epoch": 0.05, "grad_norm": 4.789392369680805, "learning_rate": 1.9993042675007726e-06, "loss": 1.2131, "step": 7039 }, { "epoch": 0.05, "grad_norm": 4.361090767631489, "learning_rate": 1.999304069549065e-06, "loss": 1.4597, "step": 7040 }, { "epoch": 0.05, "grad_norm": 6.548024639426102, "learning_rate": 1.9993038715692106e-06, "loss": 1.5183, "step": 7041 }, { "epoch": 0.05, "grad_norm": 4.679356646521861, "learning_rate": 1.999303673561209e-06, "loss": 1.277, "step": 7042 }, { "epoch": 0.05, "grad_norm": 4.767084313387827, "learning_rate": 1.9993034755250606e-06, "loss": 1.3261, "step": 7043 }, { "epoch": 0.05, "grad_norm": 4.468264798853903, "learning_rate": 1.999303277460765e-06, "loss": 1.4225, "step": 7044 }, { "epoch": 0.05, "grad_norm": 4.846290499605908, "learning_rate": 1.9993030793683224e-06, "loss": 1.4198, "step": 7045 }, { "epoch": 0.05, "grad_norm": 4.567648853605749, "learning_rate": 1.9993028812477324e-06, "loss": 1.2052, "step": 7046 }, { "epoch": 0.05, "grad_norm": 4.762112369071904, "learning_rate": 1.999302683098996e-06, "loss": 1.4471, "step": 7047 }, { "epoch": 0.05, "grad_norm": 4.2900700848101145, "learning_rate": 1.9993024849221124e-06, "loss": 1.43, "step": 7048 }, { "epoch": 0.05, "grad_norm": 4.349936827376891, "learning_rate": 1.9993022867170813e-06, "loss": 1.3023, "step": 7049 }, { "epoch": 0.05, "grad_norm": 5.174500826223994, "learning_rate": 1.999302088483903e-06, "loss": 1.4459, "step": 7050 }, { "epoch": 0.05, "grad_norm": 4.513014922663578, "learning_rate": 1.9993018902225786e-06, "loss": 1.4669, "step": 7051 }, { "epoch": 0.05, "grad_norm": 4.853982671088095, "learning_rate": 1.9993016919331067e-06, "loss": 1.3639, "step": 7052 }, { "epoch": 0.05, "grad_norm": 4.5309510693657105, "learning_rate": 1.999301493615488e-06, "loss": 1.4476, "step": 7053 }, { "epoch": 0.05, "grad_norm": 4.174735379024303, "learning_rate": 1.999301295269722e-06, "loss": 1.2145, "step": 7054 }, { "epoch": 0.05, "grad_norm": 5.033066321644365, "learning_rate": 1.999301096895809e-06, "loss": 1.3672, "step": 7055 }, { "epoch": 0.05, "grad_norm": 5.161208891913241, "learning_rate": 1.999300898493749e-06, "loss": 1.3853, "step": 7056 }, { "epoch": 0.05, "grad_norm": 6.22934878300113, "learning_rate": 1.9993007000635428e-06, "loss": 1.4928, "step": 7057 }, { "epoch": 0.05, "grad_norm": 5.094330939015403, "learning_rate": 1.9993005016051886e-06, "loss": 1.3466, "step": 7058 }, { "epoch": 0.05, "grad_norm": 4.2479827223026545, "learning_rate": 1.999300303118688e-06, "loss": 1.529, "step": 7059 }, { "epoch": 0.05, "grad_norm": 4.700246160327025, "learning_rate": 1.9993001046040404e-06, "loss": 1.289, "step": 7060 }, { "epoch": 0.05, "grad_norm": 4.279110977270551, "learning_rate": 1.9992999060612455e-06, "loss": 1.317, "step": 7061 }, { "epoch": 0.05, "grad_norm": 5.413824182779514, "learning_rate": 1.999299707490304e-06, "loss": 1.4649, "step": 7062 }, { "epoch": 0.05, "grad_norm": 4.7679883888006405, "learning_rate": 1.9992995088912154e-06, "loss": 1.5463, "step": 7063 }, { "epoch": 0.05, "grad_norm": 4.8878133107789985, "learning_rate": 1.99929931026398e-06, "loss": 1.4551, "step": 7064 }, { "epoch": 0.05, "grad_norm": 4.3657136117913655, "learning_rate": 1.9992991116085977e-06, "loss": 1.2022, "step": 7065 }, { "epoch": 0.05, "grad_norm": 9.676292143026265, "learning_rate": 1.999298912925068e-06, "loss": 1.3797, "step": 7066 }, { "epoch": 0.05, "grad_norm": 4.733118211622471, "learning_rate": 1.9992987142133916e-06, "loss": 1.3954, "step": 7067 }, { "epoch": 0.05, "grad_norm": 4.625827313351914, "learning_rate": 1.9992985154735684e-06, "loss": 1.4772, "step": 7068 }, { "epoch": 0.05, "grad_norm": 5.147236131897613, "learning_rate": 1.9992983167055982e-06, "loss": 1.2632, "step": 7069 }, { "epoch": 0.05, "grad_norm": 5.261124074541268, "learning_rate": 1.999298117909481e-06, "loss": 1.126, "step": 7070 }, { "epoch": 0.05, "grad_norm": 4.327070414146177, "learning_rate": 1.999297919085217e-06, "loss": 1.2833, "step": 7071 }, { "epoch": 0.05, "grad_norm": 4.77893372168249, "learning_rate": 1.9992977202328064e-06, "loss": 1.4818, "step": 7072 }, { "epoch": 0.05, "grad_norm": 4.664613067488928, "learning_rate": 1.9992975213522487e-06, "loss": 1.3801, "step": 7073 }, { "epoch": 0.05, "grad_norm": 4.349833872305326, "learning_rate": 1.999297322443544e-06, "loss": 1.3626, "step": 7074 }, { "epoch": 0.05, "grad_norm": 4.637094477852255, "learning_rate": 1.999297123506692e-06, "loss": 1.7056, "step": 7075 }, { "epoch": 0.05, "grad_norm": 4.45878222952638, "learning_rate": 1.999296924541694e-06, "loss": 1.2444, "step": 7076 }, { "epoch": 0.05, "grad_norm": 5.1777412078222715, "learning_rate": 1.9992967255485486e-06, "loss": 1.2872, "step": 7077 }, { "epoch": 0.05, "grad_norm": 7.0107257886669085, "learning_rate": 1.9992965265272564e-06, "loss": 1.4654, "step": 7078 }, { "epoch": 0.05, "grad_norm": 4.229458021701935, "learning_rate": 1.999296327477817e-06, "loss": 1.433, "step": 7079 }, { "epoch": 0.05, "grad_norm": 4.174997408987769, "learning_rate": 1.9992961284002315e-06, "loss": 1.157, "step": 7080 }, { "epoch": 0.05, "grad_norm": 4.335672145792359, "learning_rate": 1.999295929294499e-06, "loss": 1.323, "step": 7081 }, { "epoch": 0.05, "eval_loss": 1.570174217224121, "eval_runtime": 4.6448, "eval_samples_per_second": 1.938, "eval_steps_per_second": 1.076, "step": 7081 }, { "epoch": 0.05, "grad_norm": 4.811371363661427, "learning_rate": 1.999295730160619e-06, "loss": 1.385, "step": 7082 }, { "epoch": 0.05, "grad_norm": 4.3682546839224505, "learning_rate": 1.999295530998593e-06, "loss": 1.2735, "step": 7083 }, { "epoch": 0.05, "grad_norm": 4.3888204143712155, "learning_rate": 1.99929533180842e-06, "loss": 1.3274, "step": 7084 }, { "epoch": 0.05, "grad_norm": 4.604437987931662, "learning_rate": 1.9992951325900997e-06, "loss": 1.3971, "step": 7085 }, { "epoch": 0.05, "grad_norm": 4.474810562400445, "learning_rate": 1.999294933343633e-06, "loss": 1.2397, "step": 7086 }, { "epoch": 0.05, "grad_norm": 4.789163032414594, "learning_rate": 1.999294734069019e-06, "loss": 1.3812, "step": 7087 }, { "epoch": 0.05, "grad_norm": 4.550103353270677, "learning_rate": 1.9992945347662585e-06, "loss": 1.4251, "step": 7088 }, { "epoch": 0.05, "grad_norm": 5.859663664867163, "learning_rate": 1.999294335435351e-06, "loss": 1.1507, "step": 7089 }, { "epoch": 0.05, "grad_norm": 8.870067509070557, "learning_rate": 1.9992941360762973e-06, "loss": 1.3919, "step": 7090 }, { "epoch": 0.05, "grad_norm": 4.444587337241732, "learning_rate": 1.999293936689096e-06, "loss": 1.3743, "step": 7091 }, { "epoch": 0.05, "grad_norm": 7.590176109793462, "learning_rate": 1.9992937372737485e-06, "loss": 1.4597, "step": 7092 }, { "epoch": 0.05, "grad_norm": 4.684358104116401, "learning_rate": 1.9992935378302543e-06, "loss": 1.4329, "step": 7093 }, { "epoch": 0.05, "grad_norm": 4.301154213222004, "learning_rate": 1.999293338358613e-06, "loss": 1.3823, "step": 7094 }, { "epoch": 0.05, "grad_norm": 4.563520356749165, "learning_rate": 1.999293138858825e-06, "loss": 1.2924, "step": 7095 }, { "epoch": 0.05, "grad_norm": 4.196199016619723, "learning_rate": 1.99929293933089e-06, "loss": 1.3279, "step": 7096 }, { "epoch": 0.05, "grad_norm": 4.5708057141337655, "learning_rate": 1.9992927397748087e-06, "loss": 1.4336, "step": 7097 }, { "epoch": 0.05, "grad_norm": 4.4028628703375485, "learning_rate": 1.9992925401905804e-06, "loss": 1.4404, "step": 7098 }, { "epoch": 0.05, "grad_norm": 4.454289041945561, "learning_rate": 1.9992923405782053e-06, "loss": 1.298, "step": 7099 }, { "epoch": 0.05, "grad_norm": 4.6092020835148295, "learning_rate": 1.9992921409376836e-06, "loss": 1.3991, "step": 7100 }, { "epoch": 0.05, "grad_norm": 4.724165738743983, "learning_rate": 1.999291941269015e-06, "loss": 1.3128, "step": 7101 }, { "epoch": 0.05, "grad_norm": 7.628888992375959, "learning_rate": 1.9992917415722e-06, "loss": 1.3288, "step": 7102 }, { "epoch": 0.05, "grad_norm": 4.323847876798911, "learning_rate": 1.9992915418472377e-06, "loss": 1.4043, "step": 7103 }, { "epoch": 0.05, "grad_norm": 7.035752767274381, "learning_rate": 1.9992913420941293e-06, "loss": 1.4073, "step": 7104 }, { "epoch": 0.05, "grad_norm": 4.325587001110071, "learning_rate": 1.999291142312874e-06, "loss": 1.3061, "step": 7105 }, { "epoch": 0.05, "grad_norm": 5.199628991889981, "learning_rate": 1.9992909425034718e-06, "loss": 1.3827, "step": 7106 }, { "epoch": 0.05, "grad_norm": 5.406121908977349, "learning_rate": 1.999290742665923e-06, "loss": 1.3586, "step": 7107 }, { "epoch": 0.05, "grad_norm": 4.558299433919767, "learning_rate": 1.9992905428002275e-06, "loss": 1.3723, "step": 7108 }, { "epoch": 0.05, "grad_norm": 4.223500824729328, "learning_rate": 1.9992903429063854e-06, "loss": 1.2571, "step": 7109 }, { "epoch": 0.05, "grad_norm": 4.479936789167395, "learning_rate": 1.9992901429843964e-06, "loss": 1.378, "step": 7110 }, { "epoch": 0.05, "grad_norm": 4.435244730015443, "learning_rate": 1.999289943034261e-06, "loss": 1.3304, "step": 7111 }, { "epoch": 0.05, "grad_norm": 5.9527847800012275, "learning_rate": 1.9992897430559785e-06, "loss": 1.3581, "step": 7112 }, { "epoch": 0.05, "grad_norm": 4.909367945330029, "learning_rate": 1.9992895430495497e-06, "loss": 1.406, "step": 7113 }, { "epoch": 0.05, "grad_norm": 5.190231735778181, "learning_rate": 1.9992893430149743e-06, "loss": 1.3874, "step": 7114 }, { "epoch": 0.05, "grad_norm": 5.1098326817510005, "learning_rate": 1.9992891429522517e-06, "loss": 1.507, "step": 7115 }, { "epoch": 0.05, "grad_norm": 4.670053833225784, "learning_rate": 1.999288942861383e-06, "loss": 1.3949, "step": 7116 }, { "epoch": 0.05, "grad_norm": 4.729259274285833, "learning_rate": 1.9992887427423673e-06, "loss": 1.4257, "step": 7117 }, { "epoch": 0.05, "grad_norm": 4.512513122039602, "learning_rate": 1.9992885425952056e-06, "loss": 1.4456, "step": 7118 }, { "epoch": 0.05, "grad_norm": 8.108953569615805, "learning_rate": 1.9992883424198966e-06, "loss": 1.3828, "step": 7119 }, { "epoch": 0.05, "grad_norm": 4.57398615749145, "learning_rate": 1.999288142216441e-06, "loss": 1.4798, "step": 7120 }, { "epoch": 0.05, "grad_norm": 4.651803144089258, "learning_rate": 1.999287941984839e-06, "loss": 1.2976, "step": 7121 }, { "epoch": 0.05, "grad_norm": 5.785537099371874, "learning_rate": 1.9992877417250902e-06, "loss": 1.3611, "step": 7122 }, { "epoch": 0.05, "grad_norm": 4.601544236922998, "learning_rate": 1.9992875414371953e-06, "loss": 1.5246, "step": 7123 }, { "epoch": 0.05, "grad_norm": 4.906981999133515, "learning_rate": 1.9992873411211534e-06, "loss": 1.4048, "step": 7124 }, { "epoch": 0.05, "grad_norm": 4.5571072221652145, "learning_rate": 1.999287140776965e-06, "loss": 1.2723, "step": 7125 }, { "epoch": 0.05, "grad_norm": 5.540119498159003, "learning_rate": 1.99928694040463e-06, "loss": 1.4562, "step": 7126 }, { "epoch": 0.05, "grad_norm": 4.261087828782238, "learning_rate": 1.999286740004148e-06, "loss": 1.4593, "step": 7127 }, { "epoch": 0.05, "grad_norm": 4.480342065811858, "learning_rate": 1.99928653957552e-06, "loss": 1.3901, "step": 7128 }, { "epoch": 0.05, "grad_norm": 6.378217500629964, "learning_rate": 1.9992863391187453e-06, "loss": 1.2788, "step": 7129 }, { "epoch": 0.05, "grad_norm": 4.785807661732239, "learning_rate": 1.9992861386338237e-06, "loss": 1.4057, "step": 7130 }, { "epoch": 0.05, "grad_norm": 4.290249821340279, "learning_rate": 1.9992859381207557e-06, "loss": 1.1968, "step": 7131 }, { "epoch": 0.05, "grad_norm": 4.974272186061106, "learning_rate": 1.9992857375795415e-06, "loss": 1.0952, "step": 7132 }, { "epoch": 0.05, "grad_norm": 4.513567056570524, "learning_rate": 1.99928553701018e-06, "loss": 1.3944, "step": 7133 }, { "epoch": 0.05, "grad_norm": 4.630798342020451, "learning_rate": 1.9992853364126726e-06, "loss": 1.3717, "step": 7134 }, { "epoch": 0.05, "grad_norm": 6.398772534327991, "learning_rate": 1.9992851357870186e-06, "loss": 1.1976, "step": 7135 }, { "epoch": 0.05, "grad_norm": 7.581948389564741, "learning_rate": 1.9992849351332177e-06, "loss": 1.3963, "step": 7136 }, { "epoch": 0.05, "grad_norm": 4.34453243695143, "learning_rate": 1.9992847344512703e-06, "loss": 1.342, "step": 7137 }, { "epoch": 0.05, "grad_norm": 4.644334161230356, "learning_rate": 1.999284533741177e-06, "loss": 1.4106, "step": 7138 }, { "epoch": 0.05, "grad_norm": 4.1059165125277675, "learning_rate": 1.9992843330029365e-06, "loss": 1.3086, "step": 7139 }, { "epoch": 0.05, "grad_norm": 4.862855522369235, "learning_rate": 1.9992841322365493e-06, "loss": 1.42, "step": 7140 }, { "epoch": 0.05, "grad_norm": 4.124975380634363, "learning_rate": 1.9992839314420164e-06, "loss": 1.2321, "step": 7141 }, { "epoch": 0.05, "grad_norm": 5.188867126585959, "learning_rate": 1.9992837306193366e-06, "loss": 1.5118, "step": 7142 }, { "epoch": 0.05, "grad_norm": 4.2375175719389855, "learning_rate": 1.99928352976851e-06, "loss": 1.4133, "step": 7143 }, { "epoch": 0.05, "grad_norm": 3.9684837541082776, "learning_rate": 1.9992833288895376e-06, "loss": 1.2047, "step": 7144 }, { "epoch": 0.05, "grad_norm": 4.6670484287059635, "learning_rate": 1.999283127982418e-06, "loss": 1.4256, "step": 7145 }, { "epoch": 0.05, "grad_norm": 4.333563368852219, "learning_rate": 1.9992829270471523e-06, "loss": 1.3734, "step": 7146 }, { "epoch": 0.05, "grad_norm": 4.896524270162512, "learning_rate": 1.99928272608374e-06, "loss": 1.4001, "step": 7147 }, { "epoch": 0.05, "grad_norm": 4.963258045428311, "learning_rate": 1.999282525092181e-06, "loss": 1.4361, "step": 7148 }, { "epoch": 0.05, "grad_norm": 5.365295255952227, "learning_rate": 1.9992823240724763e-06, "loss": 1.2996, "step": 7149 }, { "epoch": 0.05, "grad_norm": 5.116854182507893, "learning_rate": 1.9992821230246242e-06, "loss": 1.2889, "step": 7150 }, { "epoch": 0.05, "grad_norm": 4.561146233461829, "learning_rate": 1.9992819219486265e-06, "loss": 1.3426, "step": 7151 }, { "epoch": 0.05, "grad_norm": 4.194502504465386, "learning_rate": 1.9992817208444815e-06, "loss": 1.2945, "step": 7152 }, { "epoch": 0.05, "grad_norm": 4.295953453598461, "learning_rate": 1.999281519712191e-06, "loss": 1.2147, "step": 7153 }, { "epoch": 0.05, "grad_norm": 4.528545964936234, "learning_rate": 1.9992813185517533e-06, "loss": 1.388, "step": 7154 }, { "epoch": 0.05, "eval_loss": 1.567068338394165, "eval_runtime": 4.6225, "eval_samples_per_second": 1.947, "eval_steps_per_second": 1.082, "step": 7154 }, { "epoch": 0.05, "grad_norm": 10.921444427983847, "learning_rate": 1.9992811173631697e-06, "loss": 1.4803, "step": 7155 }, { "epoch": 0.05, "grad_norm": 4.907709249301519, "learning_rate": 1.999280916146439e-06, "loss": 1.5444, "step": 7156 }, { "epoch": 0.05, "grad_norm": 5.168774541457149, "learning_rate": 1.9992807149015626e-06, "loss": 1.4698, "step": 7157 }, { "epoch": 0.05, "grad_norm": 4.017412028829558, "learning_rate": 1.9992805136285395e-06, "loss": 1.1133, "step": 7158 }, { "epoch": 0.05, "grad_norm": 4.711310901105997, "learning_rate": 1.99928031232737e-06, "loss": 1.4661, "step": 7159 }, { "epoch": 0.05, "grad_norm": 4.544690314172373, "learning_rate": 1.999280110998054e-06, "loss": 1.3673, "step": 7160 }, { "epoch": 0.05, "grad_norm": 4.880445142339393, "learning_rate": 1.999279909640592e-06, "loss": 1.3126, "step": 7161 }, { "epoch": 0.05, "grad_norm": 4.917321757945998, "learning_rate": 1.999279708254983e-06, "loss": 1.3804, "step": 7162 }, { "epoch": 0.05, "grad_norm": 4.653624386235508, "learning_rate": 1.999279506841228e-06, "loss": 1.4004, "step": 7163 }, { "epoch": 0.05, "grad_norm": 4.641605473638878, "learning_rate": 1.9992793053993263e-06, "loss": 1.4101, "step": 7164 }, { "epoch": 0.05, "grad_norm": 4.772727351135286, "learning_rate": 1.9992791039292783e-06, "loss": 1.3172, "step": 7165 }, { "epoch": 0.05, "grad_norm": 5.327924009459131, "learning_rate": 1.9992789024310843e-06, "loss": 1.4011, "step": 7166 }, { "epoch": 0.05, "grad_norm": 4.723919366011537, "learning_rate": 1.9992787009047437e-06, "loss": 1.3796, "step": 7167 }, { "epoch": 0.05, "grad_norm": 5.117283611959668, "learning_rate": 1.9992784993502567e-06, "loss": 1.523, "step": 7168 }, { "epoch": 0.05, "grad_norm": 4.760794845198276, "learning_rate": 1.999278297767623e-06, "loss": 1.434, "step": 7169 }, { "epoch": 0.05, "grad_norm": 4.478686254102247, "learning_rate": 1.9992780961568436e-06, "loss": 1.2839, "step": 7170 }, { "epoch": 0.05, "grad_norm": 4.70572025232697, "learning_rate": 1.9992778945179176e-06, "loss": 1.1947, "step": 7171 }, { "epoch": 0.05, "grad_norm": 10.726440764118971, "learning_rate": 1.999277692850845e-06, "loss": 1.1479, "step": 7172 }, { "epoch": 0.05, "grad_norm": 4.678234691069941, "learning_rate": 1.999277491155627e-06, "loss": 1.3532, "step": 7173 }, { "epoch": 0.05, "grad_norm": 4.675570907673433, "learning_rate": 1.999277289432262e-06, "loss": 1.4047, "step": 7174 }, { "epoch": 0.05, "grad_norm": 4.742232259035592, "learning_rate": 1.9992770876807507e-06, "loss": 1.3146, "step": 7175 }, { "epoch": 0.05, "grad_norm": 4.685176244166455, "learning_rate": 1.999276885901093e-06, "loss": 1.4183, "step": 7176 }, { "epoch": 0.05, "grad_norm": 4.482371416270683, "learning_rate": 1.999276684093289e-06, "loss": 1.3548, "step": 7177 }, { "epoch": 0.05, "grad_norm": 4.338258334182361, "learning_rate": 1.999276482257339e-06, "loss": 1.2454, "step": 7178 }, { "epoch": 0.05, "grad_norm": 4.371003444331901, "learning_rate": 1.9992762803932422e-06, "loss": 1.3772, "step": 7179 }, { "epoch": 0.05, "grad_norm": 4.6069330702425635, "learning_rate": 1.9992760785009996e-06, "loss": 1.4256, "step": 7180 }, { "epoch": 0.05, "grad_norm": 4.40564081578818, "learning_rate": 1.9992758765806104e-06, "loss": 1.3014, "step": 7181 }, { "epoch": 0.05, "grad_norm": 4.935391473798801, "learning_rate": 1.9992756746320747e-06, "loss": 1.3351, "step": 7182 }, { "epoch": 0.05, "grad_norm": 4.8133620388908, "learning_rate": 1.9992754726553935e-06, "loss": 1.3787, "step": 7183 }, { "epoch": 0.05, "grad_norm": 4.568201462520875, "learning_rate": 1.9992752706505657e-06, "loss": 1.3528, "step": 7184 }, { "epoch": 0.05, "grad_norm": 4.452527950518018, "learning_rate": 1.9992750686175915e-06, "loss": 1.3868, "step": 7185 }, { "epoch": 0.05, "grad_norm": 4.773019213284009, "learning_rate": 1.9992748665564707e-06, "loss": 1.4771, "step": 7186 }, { "epoch": 0.05, "grad_norm": 5.610027861715398, "learning_rate": 1.9992746644672044e-06, "loss": 1.4357, "step": 7187 }, { "epoch": 0.05, "grad_norm": 4.321191416060121, "learning_rate": 1.9992744623497915e-06, "loss": 1.5074, "step": 7188 }, { "epoch": 0.05, "grad_norm": 4.721109355842101, "learning_rate": 1.9992742602042322e-06, "loss": 1.3765, "step": 7189 }, { "epoch": 0.05, "grad_norm": 7.183826862980063, "learning_rate": 1.999274058030527e-06, "loss": 1.4573, "step": 7190 }, { "epoch": 0.05, "grad_norm": 4.950158905269603, "learning_rate": 1.9992738558286754e-06, "loss": 1.3038, "step": 7191 }, { "epoch": 0.05, "grad_norm": 4.911177213559306, "learning_rate": 1.9992736535986775e-06, "loss": 1.3219, "step": 7192 }, { "epoch": 0.05, "grad_norm": 4.460467972167957, "learning_rate": 1.9992734513405335e-06, "loss": 1.3124, "step": 7193 }, { "epoch": 0.05, "grad_norm": 6.246070381301334, "learning_rate": 1.999273249054243e-06, "loss": 1.6182, "step": 7194 }, { "epoch": 0.05, "grad_norm": 4.624861204697921, "learning_rate": 1.9992730467398065e-06, "loss": 1.4558, "step": 7195 }, { "epoch": 0.05, "grad_norm": 4.589034187021738, "learning_rate": 1.999272844397224e-06, "loss": 1.4034, "step": 7196 }, { "epoch": 0.05, "grad_norm": 4.650788554300284, "learning_rate": 1.9992726420264953e-06, "loss": 1.512, "step": 7197 }, { "epoch": 0.05, "grad_norm": 5.52570463035501, "learning_rate": 1.9992724396276202e-06, "loss": 1.4149, "step": 7198 }, { "epoch": 0.05, "grad_norm": 4.551489139267119, "learning_rate": 1.999272237200599e-06, "loss": 1.2638, "step": 7199 }, { "epoch": 0.05, "grad_norm": 5.3093413342954205, "learning_rate": 1.999272034745432e-06, "loss": 1.4248, "step": 7200 }, { "epoch": 0.05, "grad_norm": 4.547272174735781, "learning_rate": 1.999271832262118e-06, "loss": 1.3718, "step": 7201 }, { "epoch": 0.05, "grad_norm": 5.0991239605157554, "learning_rate": 1.9992716297506583e-06, "loss": 1.5065, "step": 7202 }, { "epoch": 0.05, "grad_norm": 5.655145919457609, "learning_rate": 1.9992714272110525e-06, "loss": 1.3268, "step": 7203 }, { "epoch": 0.05, "grad_norm": 5.535267468654156, "learning_rate": 1.9992712246433006e-06, "loss": 1.3459, "step": 7204 }, { "epoch": 0.05, "grad_norm": 4.182924658459184, "learning_rate": 1.9992710220474023e-06, "loss": 1.3959, "step": 7205 }, { "epoch": 0.05, "grad_norm": 4.612869329891801, "learning_rate": 1.9992708194233583e-06, "loss": 1.4567, "step": 7206 }, { "epoch": 0.05, "grad_norm": 4.448613869464716, "learning_rate": 1.9992706167711674e-06, "loss": 1.3456, "step": 7207 }, { "epoch": 0.05, "grad_norm": 5.056001488518422, "learning_rate": 1.999270414090831e-06, "loss": 1.5162, "step": 7208 }, { "epoch": 0.05, "grad_norm": 4.636850415598218, "learning_rate": 1.9992702113823482e-06, "loss": 1.3942, "step": 7209 }, { "epoch": 0.05, "grad_norm": 4.408189411810255, "learning_rate": 1.9992700086457196e-06, "loss": 1.3791, "step": 7210 }, { "epoch": 0.05, "grad_norm": 6.156967507051979, "learning_rate": 1.9992698058809445e-06, "loss": 1.3764, "step": 7211 }, { "epoch": 0.05, "grad_norm": 4.144828158330086, "learning_rate": 1.9992696030880233e-06, "loss": 1.2605, "step": 7212 }, { "epoch": 0.05, "grad_norm": 4.334539376114015, "learning_rate": 1.9992694002669564e-06, "loss": 1.3453, "step": 7213 }, { "epoch": 0.05, "grad_norm": 5.5490568419096, "learning_rate": 1.999269197417743e-06, "loss": 1.2466, "step": 7214 }, { "epoch": 0.05, "grad_norm": 4.269630829092351, "learning_rate": 1.999268994540384e-06, "loss": 1.3376, "step": 7215 }, { "epoch": 0.05, "grad_norm": 5.982669192717452, "learning_rate": 1.9992687916348783e-06, "loss": 1.2952, "step": 7216 }, { "epoch": 0.05, "grad_norm": 4.4680837655833185, "learning_rate": 1.999268588701227e-06, "loss": 1.3757, "step": 7217 }, { "epoch": 0.05, "grad_norm": 4.7246110673802795, "learning_rate": 1.9992683857394293e-06, "loss": 1.3782, "step": 7218 }, { "epoch": 0.05, "grad_norm": 5.864975756214362, "learning_rate": 1.9992681827494857e-06, "loss": 1.3936, "step": 7219 }, { "epoch": 0.05, "grad_norm": 4.6834196331782305, "learning_rate": 1.999267979731396e-06, "loss": 1.406, "step": 7220 }, { "epoch": 0.05, "grad_norm": 4.594454101042429, "learning_rate": 1.9992677766851604e-06, "loss": 1.3496, "step": 7221 }, { "epoch": 0.05, "grad_norm": 4.322212074880055, "learning_rate": 1.9992675736107786e-06, "loss": 1.336, "step": 7222 }, { "epoch": 0.05, "grad_norm": 6.113943446504393, "learning_rate": 1.9992673705082508e-06, "loss": 1.5227, "step": 7223 }, { "epoch": 0.05, "grad_norm": 5.534835650861005, "learning_rate": 1.999267167377577e-06, "loss": 1.2289, "step": 7224 }, { "epoch": 0.05, "grad_norm": 4.61861667871324, "learning_rate": 1.999266964218757e-06, "loss": 1.3614, "step": 7225 }, { "epoch": 0.05, "grad_norm": 4.762436771085244, "learning_rate": 1.999266761031791e-06, "loss": 1.4824, "step": 7226 }, { "epoch": 0.05, "grad_norm": 7.7458925109076855, "learning_rate": 1.999266557816679e-06, "loss": 1.4602, "step": 7227 }, { "epoch": 0.05, "eval_loss": 1.5644173622131348, "eval_runtime": 4.6341, "eval_samples_per_second": 1.942, "eval_steps_per_second": 1.079, "step": 7227 }, { "epoch": 0.05, "grad_norm": 4.269117442811875, "learning_rate": 1.999266354573421e-06, "loss": 1.2042, "step": 7228 }, { "epoch": 0.05, "grad_norm": 4.103004876119535, "learning_rate": 1.999266151302017e-06, "loss": 1.3064, "step": 7229 }, { "epoch": 0.05, "grad_norm": 4.915827720000491, "learning_rate": 1.999265948002467e-06, "loss": 1.4454, "step": 7230 }, { "epoch": 0.05, "grad_norm": 8.232922113923749, "learning_rate": 1.9992657446747713e-06, "loss": 1.1688, "step": 7231 }, { "epoch": 0.05, "grad_norm": 5.611433405805369, "learning_rate": 1.9992655413189294e-06, "loss": 1.4779, "step": 7232 }, { "epoch": 0.05, "grad_norm": 4.8273026279435385, "learning_rate": 1.999265337934941e-06, "loss": 1.3144, "step": 7233 }, { "epoch": 0.05, "grad_norm": 4.340686313858295, "learning_rate": 1.9992651345228074e-06, "loss": 1.3688, "step": 7234 }, { "epoch": 0.05, "grad_norm": 4.678005211656241, "learning_rate": 1.9992649310825273e-06, "loss": 1.4732, "step": 7235 }, { "epoch": 0.05, "grad_norm": 4.6153428658701054, "learning_rate": 1.9992647276141016e-06, "loss": 1.2993, "step": 7236 }, { "epoch": 0.05, "grad_norm": 6.507593398959669, "learning_rate": 1.9992645241175298e-06, "loss": 1.3634, "step": 7237 }, { "epoch": 0.05, "grad_norm": 5.548297258722604, "learning_rate": 1.999264320592812e-06, "loss": 1.5778, "step": 7238 }, { "epoch": 0.05, "grad_norm": 4.43266976705774, "learning_rate": 1.999264117039948e-06, "loss": 1.3544, "step": 7239 }, { "epoch": 0.05, "grad_norm": 5.8133746880980866, "learning_rate": 1.9992639134589385e-06, "loss": 1.5575, "step": 7240 }, { "epoch": 0.05, "grad_norm": 4.519531563282679, "learning_rate": 1.999263709849783e-06, "loss": 1.3455, "step": 7241 }, { "epoch": 0.05, "grad_norm": 4.459301006089623, "learning_rate": 1.9992635062124812e-06, "loss": 1.496, "step": 7242 }, { "epoch": 0.05, "grad_norm": 5.018446660090502, "learning_rate": 1.9992633025470335e-06, "loss": 1.4359, "step": 7243 }, { "epoch": 0.05, "grad_norm": 4.777508876904298, "learning_rate": 1.99926309885344e-06, "loss": 1.3539, "step": 7244 }, { "epoch": 0.05, "grad_norm": 4.416612309875375, "learning_rate": 1.999262895131701e-06, "loss": 1.2904, "step": 7245 }, { "epoch": 0.05, "grad_norm": 5.707584003932533, "learning_rate": 1.9992626913818157e-06, "loss": 1.3613, "step": 7246 }, { "epoch": 0.05, "grad_norm": 4.713965020599682, "learning_rate": 1.9992624876037846e-06, "loss": 1.4183, "step": 7247 }, { "epoch": 0.05, "grad_norm": 4.621425580963093, "learning_rate": 1.9992622837976075e-06, "loss": 1.3502, "step": 7248 }, { "epoch": 0.05, "grad_norm": 4.708933878134982, "learning_rate": 1.9992620799632842e-06, "loss": 1.3163, "step": 7249 }, { "epoch": 0.05, "grad_norm": 21.462949372467563, "learning_rate": 1.9992618761008154e-06, "loss": 1.1493, "step": 7250 }, { "epoch": 0.05, "grad_norm": 4.960880910395757, "learning_rate": 1.999261672210201e-06, "loss": 1.352, "step": 7251 }, { "epoch": 0.05, "grad_norm": 4.189651513145621, "learning_rate": 1.9992614682914404e-06, "loss": 1.2776, "step": 7252 }, { "epoch": 0.05, "grad_norm": 4.348706161336212, "learning_rate": 1.9992612643445338e-06, "loss": 1.3746, "step": 7253 }, { "epoch": 0.05, "grad_norm": 5.156911805153333, "learning_rate": 1.9992610603694815e-06, "loss": 1.4067, "step": 7254 }, { "epoch": 0.05, "grad_norm": 5.746286640354555, "learning_rate": 1.9992608563662833e-06, "loss": 1.5055, "step": 7255 }, { "epoch": 0.05, "grad_norm": 4.779899027234835, "learning_rate": 1.9992606523349393e-06, "loss": 1.3745, "step": 7256 }, { "epoch": 0.05, "grad_norm": 5.07367978515684, "learning_rate": 1.9992604482754494e-06, "loss": 1.2191, "step": 7257 }, { "epoch": 0.05, "grad_norm": 4.476592586337321, "learning_rate": 1.9992602441878133e-06, "loss": 1.3919, "step": 7258 }, { "epoch": 0.05, "grad_norm": 4.097038115614779, "learning_rate": 1.999260040072032e-06, "loss": 1.2798, "step": 7259 }, { "epoch": 0.05, "grad_norm": 4.592234046684155, "learning_rate": 1.9992598359281043e-06, "loss": 1.3016, "step": 7260 }, { "epoch": 0.05, "grad_norm": 4.981756079202014, "learning_rate": 1.9992596317560314e-06, "loss": 1.4327, "step": 7261 }, { "epoch": 0.05, "grad_norm": 4.4521181860167856, "learning_rate": 1.999259427555812e-06, "loss": 1.2757, "step": 7262 }, { "epoch": 0.05, "grad_norm": 4.504295020403219, "learning_rate": 1.9992592233274473e-06, "loss": 1.4797, "step": 7263 }, { "epoch": 0.05, "grad_norm": 5.891024626584452, "learning_rate": 1.9992590190709366e-06, "loss": 1.3852, "step": 7264 }, { "epoch": 0.05, "grad_norm": 4.396010567068984, "learning_rate": 1.99925881478628e-06, "loss": 1.4091, "step": 7265 }, { "epoch": 0.05, "grad_norm": 4.429988230228727, "learning_rate": 1.9992586104734775e-06, "loss": 1.3656, "step": 7266 }, { "epoch": 0.05, "grad_norm": 8.061804235902759, "learning_rate": 1.9992584061325295e-06, "loss": 1.5519, "step": 7267 }, { "epoch": 0.05, "grad_norm": 5.5539398798006445, "learning_rate": 1.999258201763436e-06, "loss": 1.4247, "step": 7268 }, { "epoch": 0.05, "grad_norm": 4.346574327871408, "learning_rate": 1.999257997366196e-06, "loss": 1.307, "step": 7269 }, { "epoch": 0.05, "grad_norm": 4.35911714101614, "learning_rate": 1.9992577929408108e-06, "loss": 1.3277, "step": 7270 }, { "epoch": 0.05, "grad_norm": 4.50178127722436, "learning_rate": 1.9992575884872794e-06, "loss": 1.3753, "step": 7271 }, { "epoch": 0.05, "grad_norm": 4.740171535197059, "learning_rate": 1.9992573840056023e-06, "loss": 1.4548, "step": 7272 }, { "epoch": 0.05, "grad_norm": 5.136386992512055, "learning_rate": 1.99925717949578e-06, "loss": 1.5318, "step": 7273 }, { "epoch": 0.05, "grad_norm": 4.3995958749780035, "learning_rate": 1.9992569749578114e-06, "loss": 1.3616, "step": 7274 }, { "epoch": 0.05, "grad_norm": 4.6170842047356775, "learning_rate": 1.999256770391697e-06, "loss": 1.3766, "step": 7275 }, { "epoch": 0.05, "grad_norm": 16.933378171516903, "learning_rate": 1.999256565797437e-06, "loss": 1.2301, "step": 7276 }, { "epoch": 0.05, "grad_norm": 7.741447733684727, "learning_rate": 1.9992563611750314e-06, "loss": 1.568, "step": 7277 }, { "epoch": 0.05, "grad_norm": 4.862826197676831, "learning_rate": 1.99925615652448e-06, "loss": 1.5091, "step": 7278 }, { "epoch": 0.05, "grad_norm": 4.20416030497113, "learning_rate": 1.999255951845783e-06, "loss": 1.2619, "step": 7279 }, { "epoch": 0.05, "grad_norm": 4.678330900809795, "learning_rate": 1.9992557471389402e-06, "loss": 1.4058, "step": 7280 }, { "epoch": 0.05, "grad_norm": 6.913039857633783, "learning_rate": 1.9992555424039517e-06, "loss": 1.3196, "step": 7281 }, { "epoch": 0.05, "grad_norm": 4.408340180151074, "learning_rate": 1.9992553376408174e-06, "loss": 1.3457, "step": 7282 }, { "epoch": 0.05, "grad_norm": 4.449935084526542, "learning_rate": 1.999255132849537e-06, "loss": 1.4257, "step": 7283 }, { "epoch": 0.05, "grad_norm": 5.4207675508970015, "learning_rate": 1.9992549280301116e-06, "loss": 1.6019, "step": 7284 }, { "epoch": 0.05, "grad_norm": 5.477242216719545, "learning_rate": 1.99925472318254e-06, "loss": 1.3708, "step": 7285 }, { "epoch": 0.05, "grad_norm": 4.383366961212569, "learning_rate": 1.9992545183068233e-06, "loss": 1.421, "step": 7286 }, { "epoch": 0.05, "grad_norm": 4.61606610771247, "learning_rate": 1.9992543134029605e-06, "loss": 1.3712, "step": 7287 }, { "epoch": 0.05, "grad_norm": 4.320782634456836, "learning_rate": 1.999254108470952e-06, "loss": 1.3268, "step": 7288 }, { "epoch": 0.05, "grad_norm": 5.0796257315842315, "learning_rate": 1.999253903510798e-06, "loss": 1.4247, "step": 7289 }, { "epoch": 0.05, "grad_norm": 4.272497034161681, "learning_rate": 1.9992536985224982e-06, "loss": 1.4255, "step": 7290 }, { "epoch": 0.05, "grad_norm": 6.594298621441523, "learning_rate": 1.9992534935060533e-06, "loss": 1.5112, "step": 7291 }, { "epoch": 0.05, "grad_norm": 4.3453128528093075, "learning_rate": 1.9992532884614623e-06, "loss": 1.489, "step": 7292 }, { "epoch": 0.05, "grad_norm": 4.453335044428289, "learning_rate": 1.9992530833887257e-06, "loss": 1.3698, "step": 7293 }, { "epoch": 0.05, "grad_norm": 4.714124330176096, "learning_rate": 1.999252878287843e-06, "loss": 1.4104, "step": 7294 }, { "epoch": 0.05, "grad_norm": 5.16185976444741, "learning_rate": 1.999252673158815e-06, "loss": 1.2957, "step": 7295 }, { "epoch": 0.05, "grad_norm": 4.437924276392332, "learning_rate": 1.999252468001642e-06, "loss": 1.4887, "step": 7296 }, { "epoch": 0.05, "grad_norm": 4.8156888462998815, "learning_rate": 1.999252262816323e-06, "loss": 1.3959, "step": 7297 }, { "epoch": 0.05, "grad_norm": 4.478605005888798, "learning_rate": 1.9992520576028576e-06, "loss": 1.3295, "step": 7298 }, { "epoch": 0.05, "grad_norm": 6.355293943112965, "learning_rate": 1.999251852361247e-06, "loss": 1.619, "step": 7299 }, { "epoch": 0.05, "grad_norm": 4.2606813455826416, "learning_rate": 1.9992516470914915e-06, "loss": 1.3573, "step": 7300 }, { "epoch": 0.05, "eval_loss": 1.5637669563293457, "eval_runtime": 4.6365, "eval_samples_per_second": 1.941, "eval_steps_per_second": 1.078, "step": 7300 }, { "epoch": 0.05, "grad_norm": 5.5954563351727, "learning_rate": 1.99925144179359e-06, "loss": 1.3519, "step": 7301 }, { "epoch": 0.05, "grad_norm": 4.477663235338056, "learning_rate": 1.999251236467543e-06, "loss": 1.2947, "step": 7302 }, { "epoch": 0.05, "grad_norm": 5.965186232483232, "learning_rate": 1.99925103111335e-06, "loss": 1.3614, "step": 7303 }, { "epoch": 0.05, "grad_norm": 5.238490966270003, "learning_rate": 1.9992508257310118e-06, "loss": 1.4763, "step": 7304 }, { "epoch": 0.05, "grad_norm": 4.605327573333336, "learning_rate": 1.9992506203205275e-06, "loss": 1.4628, "step": 7305 }, { "epoch": 0.05, "grad_norm": 10.469113645099888, "learning_rate": 1.9992504148818985e-06, "loss": 1.3254, "step": 7306 }, { "epoch": 0.05, "grad_norm": 4.813629336065855, "learning_rate": 1.9992502094151234e-06, "loss": 1.3518, "step": 7307 }, { "epoch": 0.05, "grad_norm": 6.664320004408355, "learning_rate": 1.9992500039202027e-06, "loss": 1.3544, "step": 7308 }, { "epoch": 0.05, "grad_norm": 7.922547438824867, "learning_rate": 1.9992497983971368e-06, "loss": 1.3439, "step": 7309 }, { "epoch": 0.05, "grad_norm": 4.476921199182293, "learning_rate": 1.9992495928459248e-06, "loss": 1.4385, "step": 7310 }, { "epoch": 0.05, "grad_norm": 4.899521233112283, "learning_rate": 1.999249387266568e-06, "loss": 1.3226, "step": 7311 }, { "epoch": 0.05, "grad_norm": 4.27282078614927, "learning_rate": 1.999249181659065e-06, "loss": 1.4157, "step": 7312 }, { "epoch": 0.05, "grad_norm": 6.090719977400682, "learning_rate": 1.9992489760234167e-06, "loss": 1.4147, "step": 7313 }, { "epoch": 0.05, "grad_norm": 4.326843115142952, "learning_rate": 1.9992487703596235e-06, "loss": 1.2975, "step": 7314 }, { "epoch": 0.05, "grad_norm": 4.802526226042229, "learning_rate": 1.9992485646676837e-06, "loss": 1.4724, "step": 7315 }, { "epoch": 0.05, "grad_norm": 4.969465924580249, "learning_rate": 1.999248358947599e-06, "loss": 1.399, "step": 7316 }, { "epoch": 0.05, "grad_norm": 4.570327141656747, "learning_rate": 1.9992481531993686e-06, "loss": 1.3189, "step": 7317 }, { "epoch": 0.05, "grad_norm": 4.512235689988891, "learning_rate": 1.999247947422993e-06, "loss": 1.3882, "step": 7318 }, { "epoch": 0.05, "grad_norm": 4.285706096876549, "learning_rate": 1.999247741618472e-06, "loss": 1.37, "step": 7319 }, { "epoch": 0.05, "grad_norm": 6.197431297378672, "learning_rate": 1.9992475357858048e-06, "loss": 1.4586, "step": 7320 }, { "epoch": 0.05, "grad_norm": 4.746678859831825, "learning_rate": 1.9992473299249925e-06, "loss": 1.2389, "step": 7321 }, { "epoch": 0.05, "grad_norm": 5.088746174682082, "learning_rate": 1.999247124036035e-06, "loss": 1.3926, "step": 7322 }, { "epoch": 0.05, "grad_norm": 4.647638022711612, "learning_rate": 1.999246918118932e-06, "loss": 1.567, "step": 7323 }, { "epoch": 0.05, "grad_norm": 4.67110456820303, "learning_rate": 1.9992467121736827e-06, "loss": 1.3872, "step": 7324 }, { "epoch": 0.05, "grad_norm": 3.9348255202574722, "learning_rate": 1.999246506200289e-06, "loss": 1.0848, "step": 7325 }, { "epoch": 0.05, "grad_norm": 4.9900836596421785, "learning_rate": 1.999246300198749e-06, "loss": 1.2838, "step": 7326 }, { "epoch": 0.05, "grad_norm": 4.829868869159872, "learning_rate": 1.9992460941690644e-06, "loss": 1.2675, "step": 7327 }, { "epoch": 0.05, "grad_norm": 4.498854417376345, "learning_rate": 1.999245888111234e-06, "loss": 1.4141, "step": 7328 }, { "epoch": 0.05, "grad_norm": 4.61646225131352, "learning_rate": 1.9992456820252583e-06, "loss": 1.368, "step": 7329 }, { "epoch": 0.05, "grad_norm": 5.812091482134837, "learning_rate": 1.999245475911137e-06, "loss": 1.3893, "step": 7330 }, { "epoch": 0.05, "grad_norm": 4.061551626493299, "learning_rate": 1.99924526976887e-06, "loss": 1.3587, "step": 7331 }, { "epoch": 0.05, "grad_norm": 4.3011884538295435, "learning_rate": 1.9992450635984575e-06, "loss": 1.2488, "step": 7332 }, { "epoch": 0.05, "grad_norm": 4.520647237170638, "learning_rate": 1.9992448573999e-06, "loss": 1.4716, "step": 7333 }, { "epoch": 0.05, "grad_norm": 4.292284180073359, "learning_rate": 1.999244651173197e-06, "loss": 1.1681, "step": 7334 }, { "epoch": 0.05, "grad_norm": 5.396098356608658, "learning_rate": 1.999244444918349e-06, "loss": 1.5132, "step": 7335 }, { "epoch": 0.05, "grad_norm": 4.345992756782151, "learning_rate": 1.999244238635355e-06, "loss": 1.3966, "step": 7336 }, { "epoch": 0.05, "grad_norm": 4.157017882741236, "learning_rate": 1.999244032324216e-06, "loss": 1.2564, "step": 7337 }, { "epoch": 0.05, "grad_norm": 4.505707871836894, "learning_rate": 1.9992438259849315e-06, "loss": 1.4252, "step": 7338 }, { "epoch": 0.05, "grad_norm": 5.274265626314412, "learning_rate": 1.9992436196175016e-06, "loss": 1.3626, "step": 7339 }, { "epoch": 0.05, "grad_norm": 5.1729965703205805, "learning_rate": 1.9992434132219265e-06, "loss": 1.4692, "step": 7340 }, { "epoch": 0.05, "grad_norm": 5.0689135323759125, "learning_rate": 1.999243206798206e-06, "loss": 1.337, "step": 7341 }, { "epoch": 0.05, "grad_norm": 4.252617714755524, "learning_rate": 1.99924300034634e-06, "loss": 1.2991, "step": 7342 }, { "epoch": 0.05, "grad_norm": 5.10964437252738, "learning_rate": 1.9992427938663283e-06, "loss": 1.433, "step": 7343 }, { "epoch": 0.05, "grad_norm": 5.016053891574288, "learning_rate": 1.9992425873581715e-06, "loss": 1.2565, "step": 7344 }, { "epoch": 0.05, "grad_norm": 4.930861852717732, "learning_rate": 1.9992423808218696e-06, "loss": 1.518, "step": 7345 }, { "epoch": 0.05, "grad_norm": 4.105863610245573, "learning_rate": 1.9992421742574224e-06, "loss": 1.2983, "step": 7346 }, { "epoch": 0.05, "grad_norm": 5.049648806211745, "learning_rate": 1.99924196766483e-06, "loss": 1.4002, "step": 7347 }, { "epoch": 0.05, "grad_norm": 3.994453973011883, "learning_rate": 1.999241761044092e-06, "loss": 1.2405, "step": 7348 }, { "epoch": 0.05, "grad_norm": 7.337536156192312, "learning_rate": 1.9992415543952082e-06, "loss": 1.468, "step": 7349 }, { "epoch": 0.05, "grad_norm": 5.2216189505629576, "learning_rate": 1.9992413477181798e-06, "loss": 1.3073, "step": 7350 }, { "epoch": 0.05, "grad_norm": 4.192465249527119, "learning_rate": 1.999241141013006e-06, "loss": 1.3372, "step": 7351 }, { "epoch": 0.05, "grad_norm": 4.700859435023539, "learning_rate": 1.9992409342796868e-06, "loss": 1.3439, "step": 7352 }, { "epoch": 0.05, "grad_norm": 10.193769874004852, "learning_rate": 1.9992407275182223e-06, "loss": 1.4881, "step": 7353 }, { "epoch": 0.05, "grad_norm": 4.428904938587758, "learning_rate": 1.999240520728612e-06, "loss": 1.456, "step": 7354 }, { "epoch": 0.05, "grad_norm": 5.161417636454554, "learning_rate": 1.999240313910857e-06, "loss": 1.5098, "step": 7355 }, { "epoch": 0.05, "grad_norm": 4.697588287729796, "learning_rate": 1.9992401070649566e-06, "loss": 1.4525, "step": 7356 }, { "epoch": 0.05, "grad_norm": 4.626578782312183, "learning_rate": 1.9992399001909112e-06, "loss": 1.3924, "step": 7357 }, { "epoch": 0.05, "grad_norm": 4.128909024382533, "learning_rate": 1.9992396932887202e-06, "loss": 1.4037, "step": 7358 }, { "epoch": 0.05, "grad_norm": 4.256685371087901, "learning_rate": 1.999239486358384e-06, "loss": 1.4139, "step": 7359 }, { "epoch": 0.05, "grad_norm": 4.592908345817238, "learning_rate": 1.9992392793999026e-06, "loss": 1.2786, "step": 7360 }, { "epoch": 0.05, "grad_norm": 4.463309766415438, "learning_rate": 1.999239072413276e-06, "loss": 1.4485, "step": 7361 }, { "epoch": 0.05, "grad_norm": 5.022503835779689, "learning_rate": 1.999238865398504e-06, "loss": 1.4483, "step": 7362 }, { "epoch": 0.05, "grad_norm": 4.71268462333702, "learning_rate": 1.999238658355587e-06, "loss": 1.5676, "step": 7363 }, { "epoch": 0.05, "grad_norm": 4.133889654003795, "learning_rate": 1.999238451284525e-06, "loss": 1.367, "step": 7364 }, { "epoch": 0.05, "grad_norm": 4.659510554017874, "learning_rate": 1.999238244185317e-06, "loss": 1.4392, "step": 7365 }, { "epoch": 0.05, "grad_norm": 4.17722698716844, "learning_rate": 1.9992380370579642e-06, "loss": 1.2386, "step": 7366 }, { "epoch": 0.05, "grad_norm": 6.164443893128184, "learning_rate": 1.9992378299024663e-06, "loss": 1.4385, "step": 7367 }, { "epoch": 0.05, "grad_norm": 5.441030713653209, "learning_rate": 1.999237622718823e-06, "loss": 1.3581, "step": 7368 }, { "epoch": 0.05, "grad_norm": 4.629594959612535, "learning_rate": 1.9992374155070345e-06, "loss": 1.2106, "step": 7369 }, { "epoch": 0.05, "grad_norm": 5.008766684258405, "learning_rate": 1.9992372082671005e-06, "loss": 1.398, "step": 7370 }, { "epoch": 0.05, "grad_norm": 5.670500409496447, "learning_rate": 1.9992370009990222e-06, "loss": 1.5493, "step": 7371 }, { "epoch": 0.05, "grad_norm": 4.780738562991446, "learning_rate": 1.9992367937027983e-06, "loss": 1.4821, "step": 7372 }, { "epoch": 0.05, "grad_norm": 4.550289122621756, "learning_rate": 1.9992365863784287e-06, "loss": 1.4029, "step": 7373 }, { "epoch": 0.05, "eval_loss": 1.573071002960205, "eval_runtime": 4.6489, "eval_samples_per_second": 1.936, "eval_steps_per_second": 1.076, "step": 7373 }, { "epoch": 0.05, "grad_norm": 4.673707280575764, "learning_rate": 1.9992363790259143e-06, "loss": 1.3031, "step": 7374 }, { "epoch": 0.05, "grad_norm": 4.604876229271979, "learning_rate": 1.999236171645255e-06, "loss": 1.3352, "step": 7375 }, { "epoch": 0.05, "grad_norm": 4.410069371179086, "learning_rate": 1.9992359642364504e-06, "loss": 1.3583, "step": 7376 }, { "epoch": 0.05, "grad_norm": 4.579686671004471, "learning_rate": 1.9992357567995004e-06, "loss": 1.4515, "step": 7377 }, { "epoch": 0.05, "grad_norm": 4.436806375168678, "learning_rate": 1.9992355493344056e-06, "loss": 1.3337, "step": 7378 }, { "epoch": 0.05, "grad_norm": 4.221774682133403, "learning_rate": 1.9992353418411656e-06, "loss": 1.3291, "step": 7379 }, { "epoch": 0.05, "grad_norm": 8.913468587557904, "learning_rate": 1.9992351343197804e-06, "loss": 1.4496, "step": 7380 }, { "epoch": 0.05, "grad_norm": 4.565015804452917, "learning_rate": 1.9992349267702495e-06, "loss": 1.2683, "step": 7381 }, { "epoch": 0.05, "grad_norm": 4.7064155789657365, "learning_rate": 1.9992347191925743e-06, "loss": 1.3673, "step": 7382 }, { "epoch": 0.05, "grad_norm": 4.9123784995845305, "learning_rate": 1.999234511586754e-06, "loss": 1.3041, "step": 7383 }, { "epoch": 0.05, "grad_norm": 7.945039676337444, "learning_rate": 1.999234303952788e-06, "loss": 1.3561, "step": 7384 }, { "epoch": 0.05, "grad_norm": 4.799075047982834, "learning_rate": 1.999234096290677e-06, "loss": 1.3097, "step": 7385 }, { "epoch": 0.05, "grad_norm": 4.171980247891026, "learning_rate": 1.9992338886004214e-06, "loss": 1.2901, "step": 7386 }, { "epoch": 0.05, "grad_norm": 5.975476243409887, "learning_rate": 1.99923368088202e-06, "loss": 1.4298, "step": 7387 }, { "epoch": 0.05, "grad_norm": 5.76984043239986, "learning_rate": 1.999233473135474e-06, "loss": 1.2632, "step": 7388 }, { "epoch": 0.05, "grad_norm": 4.6555391663168555, "learning_rate": 1.999233265360783e-06, "loss": 1.419, "step": 7389 }, { "epoch": 0.05, "grad_norm": 4.295835800574735, "learning_rate": 1.9992330575579468e-06, "loss": 1.3903, "step": 7390 }, { "epoch": 0.05, "grad_norm": 4.437129893281457, "learning_rate": 1.999232849726965e-06, "loss": 1.3633, "step": 7391 }, { "epoch": 0.05, "grad_norm": 4.357941254373054, "learning_rate": 1.9992326418678386e-06, "loss": 1.2785, "step": 7392 }, { "epoch": 0.05, "grad_norm": 4.569134358917213, "learning_rate": 1.999232433980567e-06, "loss": 1.262, "step": 7393 }, { "epoch": 0.05, "grad_norm": 10.322195650052354, "learning_rate": 1.9992322260651505e-06, "loss": 1.4675, "step": 7394 }, { "epoch": 0.05, "grad_norm": 5.543120885931156, "learning_rate": 1.999232018121589e-06, "loss": 1.5693, "step": 7395 }, { "epoch": 0.05, "grad_norm": 4.410134304941793, "learning_rate": 1.9992318101498827e-06, "loss": 1.4316, "step": 7396 }, { "epoch": 0.05, "grad_norm": 5.245618861523013, "learning_rate": 1.9992316021500306e-06, "loss": 1.3848, "step": 7397 }, { "epoch": 0.05, "grad_norm": 4.625316287070328, "learning_rate": 1.999231394122034e-06, "loss": 1.353, "step": 7398 }, { "epoch": 0.05, "grad_norm": 4.549935306430918, "learning_rate": 1.9992311860658925e-06, "loss": 1.2532, "step": 7399 }, { "epoch": 0.05, "grad_norm": 4.38771144571651, "learning_rate": 1.9992309779816056e-06, "loss": 1.3705, "step": 7400 }, { "epoch": 0.05, "grad_norm": 4.469167605812163, "learning_rate": 1.999230769869174e-06, "loss": 1.293, "step": 7401 }, { "epoch": 0.05, "grad_norm": 5.117354318441001, "learning_rate": 1.999230561728597e-06, "loss": 1.2552, "step": 7402 }, { "epoch": 0.05, "grad_norm": 4.6693833595307925, "learning_rate": 1.9992303535598754e-06, "loss": 1.4505, "step": 7403 }, { "epoch": 0.05, "grad_norm": 4.550200045568771, "learning_rate": 1.9992301453630085e-06, "loss": 1.3253, "step": 7404 }, { "epoch": 0.05, "grad_norm": 4.2902897192049965, "learning_rate": 1.999229937137997e-06, "loss": 1.2953, "step": 7405 }, { "epoch": 0.05, "grad_norm": 4.513265596497015, "learning_rate": 1.99922972888484e-06, "loss": 1.4143, "step": 7406 }, { "epoch": 0.05, "grad_norm": 4.516402862129844, "learning_rate": 1.9992295206035387e-06, "loss": 1.2216, "step": 7407 }, { "epoch": 0.05, "grad_norm": 4.197009686960515, "learning_rate": 1.999229312294092e-06, "loss": 1.2568, "step": 7408 }, { "epoch": 0.05, "grad_norm": 4.824515246057933, "learning_rate": 1.9992291039565e-06, "loss": 1.2897, "step": 7409 }, { "epoch": 0.05, "grad_norm": 4.487043346309826, "learning_rate": 1.9992288955907633e-06, "loss": 1.4268, "step": 7410 }, { "epoch": 0.05, "grad_norm": 4.640055501409591, "learning_rate": 1.999228687196882e-06, "loss": 1.399, "step": 7411 }, { "epoch": 0.05, "grad_norm": 7.546929320535393, "learning_rate": 1.9992284787748556e-06, "loss": 1.3385, "step": 7412 }, { "epoch": 0.05, "grad_norm": 4.922382207278435, "learning_rate": 1.999228270324684e-06, "loss": 1.4232, "step": 7413 }, { "epoch": 0.05, "grad_norm": 4.218078494521849, "learning_rate": 1.9992280618463675e-06, "loss": 1.2936, "step": 7414 }, { "epoch": 0.05, "grad_norm": 4.193398066694619, "learning_rate": 1.9992278533399063e-06, "loss": 1.2767, "step": 7415 }, { "epoch": 0.05, "grad_norm": 7.790536718844028, "learning_rate": 1.9992276448053002e-06, "loss": 1.6827, "step": 7416 }, { "epoch": 0.05, "grad_norm": 4.91142299302245, "learning_rate": 1.999227436242549e-06, "loss": 1.5335, "step": 7417 }, { "epoch": 0.05, "grad_norm": 4.821570677265493, "learning_rate": 1.999227227651653e-06, "loss": 1.4597, "step": 7418 }, { "epoch": 0.05, "grad_norm": 4.959099966111125, "learning_rate": 1.9992270190326117e-06, "loss": 1.4658, "step": 7419 }, { "epoch": 0.05, "grad_norm": 4.250196452440744, "learning_rate": 1.999226810385426e-06, "loss": 1.3278, "step": 7420 }, { "epoch": 0.05, "grad_norm": 4.483348025882049, "learning_rate": 1.9992266017100953e-06, "loss": 1.3393, "step": 7421 }, { "epoch": 0.05, "grad_norm": 5.126705520969932, "learning_rate": 1.9992263930066193e-06, "loss": 1.3196, "step": 7422 }, { "epoch": 0.05, "grad_norm": 4.538923214896234, "learning_rate": 1.999226184274999e-06, "loss": 1.3337, "step": 7423 }, { "epoch": 0.05, "grad_norm": 4.305397568389926, "learning_rate": 1.9992259755152334e-06, "loss": 1.3421, "step": 7424 }, { "epoch": 0.05, "grad_norm": 4.566249237341296, "learning_rate": 1.999225766727323e-06, "loss": 1.3722, "step": 7425 }, { "epoch": 0.05, "grad_norm": 4.2848674379910445, "learning_rate": 1.999225557911268e-06, "loss": 1.3016, "step": 7426 }, { "epoch": 0.05, "grad_norm": 4.355769508222125, "learning_rate": 1.9992253490670683e-06, "loss": 1.4002, "step": 7427 }, { "epoch": 0.05, "grad_norm": 4.945082908797928, "learning_rate": 1.999225140194723e-06, "loss": 1.2991, "step": 7428 }, { "epoch": 0.05, "grad_norm": 4.716297526264571, "learning_rate": 1.999224931294233e-06, "loss": 1.3753, "step": 7429 }, { "epoch": 0.05, "grad_norm": 4.447555160988962, "learning_rate": 1.999224722365599e-06, "loss": 1.1973, "step": 7430 }, { "epoch": 0.05, "grad_norm": 4.452649223935581, "learning_rate": 1.9992245134088193e-06, "loss": 1.4074, "step": 7431 }, { "epoch": 0.05, "grad_norm": 4.93924971706922, "learning_rate": 1.999224304423895e-06, "loss": 1.5055, "step": 7432 }, { "epoch": 0.05, "grad_norm": 4.416898495997896, "learning_rate": 1.9992240954108263e-06, "loss": 1.3244, "step": 7433 }, { "epoch": 0.05, "grad_norm": 4.017610458364144, "learning_rate": 1.999223886369612e-06, "loss": 1.3539, "step": 7434 }, { "epoch": 0.05, "grad_norm": 5.433098184649393, "learning_rate": 1.9992236773002534e-06, "loss": 1.3889, "step": 7435 }, { "epoch": 0.05, "grad_norm": 5.021597854474734, "learning_rate": 1.99922346820275e-06, "loss": 1.3402, "step": 7436 }, { "epoch": 0.05, "grad_norm": 7.126245043136787, "learning_rate": 1.9992232590771016e-06, "loss": 1.1735, "step": 7437 }, { "epoch": 0.05, "grad_norm": 4.52229462955812, "learning_rate": 1.9992230499233086e-06, "loss": 1.3968, "step": 7438 }, { "epoch": 0.05, "grad_norm": 4.357142938384053, "learning_rate": 1.9992228407413708e-06, "loss": 1.3284, "step": 7439 }, { "epoch": 0.05, "grad_norm": 4.21521871438118, "learning_rate": 1.9992226315312878e-06, "loss": 1.2314, "step": 7440 }, { "epoch": 0.05, "grad_norm": 4.550663522094946, "learning_rate": 1.9992224222930604e-06, "loss": 1.3645, "step": 7441 }, { "epoch": 0.05, "grad_norm": 4.267465042307704, "learning_rate": 1.999222213026688e-06, "loss": 1.3019, "step": 7442 }, { "epoch": 0.05, "grad_norm": 4.8167693874291855, "learning_rate": 1.999222003732171e-06, "loss": 1.3427, "step": 7443 }, { "epoch": 0.05, "grad_norm": 4.39873638799913, "learning_rate": 1.9992217944095094e-06, "loss": 1.4339, "step": 7444 }, { "epoch": 0.05, "grad_norm": 4.392270223772549, "learning_rate": 1.999221585058703e-06, "loss": 1.2795, "step": 7445 }, { "epoch": 0.05, "grad_norm": 4.450816606402411, "learning_rate": 1.9992213756797516e-06, "loss": 1.41, "step": 7446 }, { "epoch": 0.05, "eval_loss": 1.5701239109039307, "eval_runtime": 4.6065, "eval_samples_per_second": 1.954, "eval_steps_per_second": 1.085, "step": 7446 }, { "epoch": 0.05, "grad_norm": 4.573150171432968, "learning_rate": 1.999221166272656e-06, "loss": 1.4725, "step": 7447 }, { "epoch": 0.05, "grad_norm": 4.512280198559463, "learning_rate": 1.999220956837415e-06, "loss": 1.2915, "step": 7448 }, { "epoch": 0.05, "grad_norm": 4.838369388662114, "learning_rate": 1.9992207473740292e-06, "loss": 1.3659, "step": 7449 }, { "epoch": 0.05, "grad_norm": 4.812578854501264, "learning_rate": 1.999220537882499e-06, "loss": 1.4475, "step": 7450 }, { "epoch": 0.05, "grad_norm": 5.534462447032788, "learning_rate": 1.9992203283628243e-06, "loss": 1.4574, "step": 7451 }, { "epoch": 0.05, "grad_norm": 6.21722209726451, "learning_rate": 1.9992201188150047e-06, "loss": 1.4408, "step": 7452 }, { "epoch": 0.05, "grad_norm": 4.438672299727171, "learning_rate": 1.9992199092390403e-06, "loss": 1.375, "step": 7453 }, { "epoch": 0.05, "grad_norm": 4.492166208634664, "learning_rate": 1.9992196996349315e-06, "loss": 1.3941, "step": 7454 }, { "epoch": 0.05, "grad_norm": 4.437522815355215, "learning_rate": 1.9992194900026775e-06, "loss": 1.1889, "step": 7455 }, { "epoch": 0.05, "grad_norm": 7.238265347653003, "learning_rate": 1.999219280342279e-06, "loss": 1.4654, "step": 7456 }, { "epoch": 0.05, "grad_norm": 6.860131181343806, "learning_rate": 1.999219070653736e-06, "loss": 1.2926, "step": 7457 }, { "epoch": 0.05, "grad_norm": 4.987829261570923, "learning_rate": 1.999218860937048e-06, "loss": 1.2139, "step": 7458 }, { "epoch": 0.05, "grad_norm": 4.648797926415735, "learning_rate": 1.999218651192216e-06, "loss": 1.4387, "step": 7459 }, { "epoch": 0.05, "grad_norm": 5.039978074039654, "learning_rate": 1.9992184414192387e-06, "loss": 1.2072, "step": 7460 }, { "epoch": 0.05, "grad_norm": 7.400230310703256, "learning_rate": 1.999218231618117e-06, "loss": 1.4834, "step": 7461 }, { "epoch": 0.05, "grad_norm": 4.584055743374395, "learning_rate": 1.9992180217888506e-06, "loss": 1.2319, "step": 7462 }, { "epoch": 0.05, "grad_norm": 4.708777872494851, "learning_rate": 1.9992178119314395e-06, "loss": 1.2039, "step": 7463 }, { "epoch": 0.05, "grad_norm": 8.04554306262474, "learning_rate": 1.9992176020458837e-06, "loss": 1.348, "step": 7464 }, { "epoch": 0.05, "grad_norm": 4.937957035525078, "learning_rate": 1.999217392132183e-06, "loss": 1.3155, "step": 7465 }, { "epoch": 0.05, "grad_norm": 4.1419766796957065, "learning_rate": 1.999217182190338e-06, "loss": 1.2914, "step": 7466 }, { "epoch": 0.05, "grad_norm": 4.651469149559408, "learning_rate": 1.999216972220349e-06, "loss": 1.1188, "step": 7467 }, { "epoch": 0.05, "grad_norm": 5.751174725438613, "learning_rate": 1.9992167622222143e-06, "loss": 1.378, "step": 7468 }, { "epoch": 0.05, "grad_norm": 7.699455869536229, "learning_rate": 1.999216552195936e-06, "loss": 1.262, "step": 7469 }, { "epoch": 0.05, "grad_norm": 4.382239086363008, "learning_rate": 1.999216342141512e-06, "loss": 1.2522, "step": 7470 }, { "epoch": 0.05, "grad_norm": 4.651479161221517, "learning_rate": 1.999216132058944e-06, "loss": 1.2955, "step": 7471 }, { "epoch": 0.05, "grad_norm": 4.568777518685292, "learning_rate": 1.9992159219482313e-06, "loss": 1.3835, "step": 7472 }, { "epoch": 0.05, "grad_norm": 4.391413872161779, "learning_rate": 1.999215711809374e-06, "loss": 1.3809, "step": 7473 }, { "epoch": 0.05, "grad_norm": 4.605255442900876, "learning_rate": 1.9992155016423725e-06, "loss": 1.4206, "step": 7474 }, { "epoch": 0.05, "grad_norm": 4.409253281436837, "learning_rate": 1.999215291447226e-06, "loss": 1.3142, "step": 7475 }, { "epoch": 0.05, "grad_norm": 4.289788760226969, "learning_rate": 1.999215081223935e-06, "loss": 1.1975, "step": 7476 }, { "epoch": 0.05, "grad_norm": 4.888872212556441, "learning_rate": 1.9992148709724995e-06, "loss": 1.3347, "step": 7477 }, { "epoch": 0.05, "grad_norm": 6.105017631796159, "learning_rate": 1.9992146606929196e-06, "loss": 1.4587, "step": 7478 }, { "epoch": 0.05, "grad_norm": 4.8402316651560815, "learning_rate": 1.9992144503851945e-06, "loss": 1.4594, "step": 7479 }, { "epoch": 0.05, "grad_norm": 4.378066415011355, "learning_rate": 1.9992142400493255e-06, "loss": 1.5412, "step": 7480 }, { "epoch": 0.05, "grad_norm": 4.511165007051402, "learning_rate": 1.9992140296853117e-06, "loss": 1.2978, "step": 7481 }, { "epoch": 0.05, "grad_norm": 4.3326371257799945, "learning_rate": 1.9992138192931536e-06, "loss": 1.2211, "step": 7482 }, { "epoch": 0.05, "grad_norm": 4.748881972156852, "learning_rate": 1.999213608872851e-06, "loss": 1.4369, "step": 7483 }, { "epoch": 0.05, "grad_norm": 4.710942496672667, "learning_rate": 1.9992133984244033e-06, "loss": 1.4203, "step": 7484 }, { "epoch": 0.05, "grad_norm": 4.927205327872397, "learning_rate": 1.9992131879478116e-06, "loss": 1.2991, "step": 7485 }, { "epoch": 0.05, "grad_norm": 4.459703037858343, "learning_rate": 1.999212977443075e-06, "loss": 1.2423, "step": 7486 }, { "epoch": 0.05, "grad_norm": 4.782765748957992, "learning_rate": 1.9992127669101943e-06, "loss": 1.3758, "step": 7487 }, { "epoch": 0.05, "grad_norm": 4.780062741943454, "learning_rate": 1.9992125563491687e-06, "loss": 1.3777, "step": 7488 }, { "epoch": 0.05, "grad_norm": 4.603265983567355, "learning_rate": 1.999212345759999e-06, "loss": 1.3982, "step": 7489 }, { "epoch": 0.05, "grad_norm": 4.431271252782981, "learning_rate": 1.9992121351426847e-06, "loss": 1.413, "step": 7490 }, { "epoch": 0.05, "grad_norm": 4.393194529153575, "learning_rate": 1.999211924497226e-06, "loss": 1.3839, "step": 7491 }, { "epoch": 0.05, "grad_norm": 4.728714432823337, "learning_rate": 1.9992117138236225e-06, "loss": 1.2302, "step": 7492 }, { "epoch": 0.05, "grad_norm": 3.9799082608966705, "learning_rate": 1.999211503121875e-06, "loss": 1.1864, "step": 7493 }, { "epoch": 0.05, "grad_norm": 4.304124551036202, "learning_rate": 1.9992112923919824e-06, "loss": 1.3466, "step": 7494 }, { "epoch": 0.05, "grad_norm": 5.004044791622379, "learning_rate": 1.999211081633946e-06, "loss": 1.4469, "step": 7495 }, { "epoch": 0.05, "grad_norm": 4.223878866794653, "learning_rate": 1.999210870847765e-06, "loss": 1.4294, "step": 7496 }, { "epoch": 0.05, "grad_norm": 4.5802998728245345, "learning_rate": 1.999210660033439e-06, "loss": 1.3718, "step": 7497 }, { "epoch": 0.05, "grad_norm": 4.578170505258845, "learning_rate": 1.999210449190969e-06, "loss": 1.2767, "step": 7498 }, { "epoch": 0.05, "grad_norm": 7.320370774008876, "learning_rate": 1.9992102383203546e-06, "loss": 1.3018, "step": 7499 }, { "epoch": 0.05, "grad_norm": 4.4422580545508845, "learning_rate": 1.9992100274215957e-06, "loss": 1.3817, "step": 7500 }, { "epoch": 0.05, "grad_norm": 4.837031481374435, "learning_rate": 1.9992098164946925e-06, "loss": 1.1922, "step": 7501 }, { "epoch": 0.05, "grad_norm": 4.828756136838027, "learning_rate": 1.9992096055396445e-06, "loss": 1.3433, "step": 7502 }, { "epoch": 0.05, "grad_norm": 5.2984083354478, "learning_rate": 1.9992093945564526e-06, "loss": 1.292, "step": 7503 }, { "epoch": 0.05, "grad_norm": 4.450931773337462, "learning_rate": 1.999209183545116e-06, "loss": 1.4557, "step": 7504 }, { "epoch": 0.05, "grad_norm": 6.055775483792625, "learning_rate": 1.999208972505635e-06, "loss": 1.4037, "step": 7505 }, { "epoch": 0.05, "grad_norm": 4.700180848251725, "learning_rate": 1.99920876143801e-06, "loss": 1.3338, "step": 7506 }, { "epoch": 0.05, "grad_norm": 8.481627631444834, "learning_rate": 1.99920855034224e-06, "loss": 1.3783, "step": 7507 }, { "epoch": 0.05, "grad_norm": 11.803303536369528, "learning_rate": 1.999208339218326e-06, "loss": 1.2976, "step": 7508 }, { "epoch": 0.05, "grad_norm": 4.527972647526172, "learning_rate": 1.9992081280662673e-06, "loss": 1.3766, "step": 7509 }, { "epoch": 0.05, "grad_norm": 6.345808994842762, "learning_rate": 1.9992079168860644e-06, "loss": 1.4757, "step": 7510 }, { "epoch": 0.05, "grad_norm": 4.311606264028323, "learning_rate": 1.999207705677717e-06, "loss": 1.3921, "step": 7511 }, { "epoch": 0.05, "grad_norm": 4.853724080984389, "learning_rate": 1.999207494441226e-06, "loss": 1.3419, "step": 7512 }, { "epoch": 0.05, "grad_norm": 9.478915088434864, "learning_rate": 1.99920728317659e-06, "loss": 1.4275, "step": 7513 }, { "epoch": 0.05, "grad_norm": 4.137978331220008, "learning_rate": 1.9992070718838096e-06, "loss": 1.2578, "step": 7514 }, { "epoch": 0.05, "grad_norm": 7.013237095293034, "learning_rate": 1.999206860562885e-06, "loss": 1.3706, "step": 7515 }, { "epoch": 0.05, "grad_norm": 4.8452772436182165, "learning_rate": 1.999206649213816e-06, "loss": 1.3772, "step": 7516 }, { "epoch": 0.05, "grad_norm": 4.839135989567441, "learning_rate": 1.9992064378366024e-06, "loss": 1.3297, "step": 7517 }, { "epoch": 0.05, "grad_norm": 4.022057276414956, "learning_rate": 1.999206226431245e-06, "loss": 1.3498, "step": 7518 }, { "epoch": 0.05, "grad_norm": 4.888149873956158, "learning_rate": 1.999206014997743e-06, "loss": 1.4392, "step": 7519 }, { "epoch": 0.05, "eval_loss": 1.5739436149597168, "eval_runtime": 4.5993, "eval_samples_per_second": 1.957, "eval_steps_per_second": 1.087, "step": 7519 }, { "epoch": 0.05, "grad_norm": 4.850017736564452, "learning_rate": 1.9992058035360967e-06, "loss": 1.3255, "step": 7520 }, { "epoch": 0.05, "grad_norm": 6.07201821075577, "learning_rate": 1.9992055920463062e-06, "loss": 1.5122, "step": 7521 }, { "epoch": 0.05, "grad_norm": 4.633619936002238, "learning_rate": 1.9992053805283714e-06, "loss": 1.3204, "step": 7522 }, { "epoch": 0.05, "grad_norm": 4.353881180573241, "learning_rate": 1.9992051689822922e-06, "loss": 1.33, "step": 7523 }, { "epoch": 0.05, "grad_norm": 4.085300814804301, "learning_rate": 1.999204957408069e-06, "loss": 1.1291, "step": 7524 }, { "epoch": 0.05, "grad_norm": 4.4293373888957275, "learning_rate": 1.999204745805701e-06, "loss": 1.3773, "step": 7525 }, { "epoch": 0.05, "grad_norm": 5.006012115084632, "learning_rate": 1.9992045341751893e-06, "loss": 1.1745, "step": 7526 }, { "epoch": 0.05, "grad_norm": 4.299224895683835, "learning_rate": 1.999204322516533e-06, "loss": 1.2442, "step": 7527 }, { "epoch": 0.05, "grad_norm": 4.3938784367069275, "learning_rate": 1.9992041108297324e-06, "loss": 1.3268, "step": 7528 }, { "epoch": 0.05, "grad_norm": 5.653293606702152, "learning_rate": 1.999203899114788e-06, "loss": 1.3435, "step": 7529 }, { "epoch": 0.05, "grad_norm": 6.382324380225979, "learning_rate": 1.999203687371699e-06, "loss": 1.4888, "step": 7530 }, { "epoch": 0.05, "grad_norm": 6.195126751275663, "learning_rate": 1.9992034756004657e-06, "loss": 1.4749, "step": 7531 }, { "epoch": 0.05, "grad_norm": 4.591911768215456, "learning_rate": 1.999203263801088e-06, "loss": 1.4264, "step": 7532 }, { "epoch": 0.05, "grad_norm": 4.717871400252642, "learning_rate": 1.999203051973566e-06, "loss": 1.3906, "step": 7533 }, { "epoch": 0.05, "grad_norm": 4.883379310029599, "learning_rate": 1.9992028401179006e-06, "loss": 1.4342, "step": 7534 }, { "epoch": 0.05, "grad_norm": 4.734168604515194, "learning_rate": 1.9992026282340903e-06, "loss": 1.354, "step": 7535 }, { "epoch": 0.05, "grad_norm": 5.230464566154019, "learning_rate": 1.999202416322136e-06, "loss": 1.4531, "step": 7536 }, { "epoch": 0.05, "grad_norm": 4.319655446876068, "learning_rate": 1.9992022043820375e-06, "loss": 1.3998, "step": 7537 }, { "epoch": 0.05, "grad_norm": 4.634591300977178, "learning_rate": 1.9992019924137945e-06, "loss": 1.3172, "step": 7538 }, { "epoch": 0.05, "grad_norm": 4.478692259002949, "learning_rate": 1.9992017804174076e-06, "loss": 1.393, "step": 7539 }, { "epoch": 0.05, "grad_norm": 5.68354275265913, "learning_rate": 1.9992015683928764e-06, "loss": 1.384, "step": 7540 }, { "epoch": 0.05, "grad_norm": 4.572497334846359, "learning_rate": 1.999201356340201e-06, "loss": 1.5404, "step": 7541 }, { "epoch": 0.05, "grad_norm": 4.425917264064885, "learning_rate": 1.9992011442593816e-06, "loss": 1.292, "step": 7542 }, { "epoch": 0.05, "grad_norm": 4.5134585041037605, "learning_rate": 1.9992009321504176e-06, "loss": 1.41, "step": 7543 }, { "epoch": 0.05, "grad_norm": 5.087655137891195, "learning_rate": 1.9992007200133098e-06, "loss": 1.3439, "step": 7544 }, { "epoch": 0.05, "grad_norm": 5.0170171079884405, "learning_rate": 1.999200507848058e-06, "loss": 1.2401, "step": 7545 }, { "epoch": 0.05, "grad_norm": 4.839421160481582, "learning_rate": 1.9992002956546618e-06, "loss": 1.3812, "step": 7546 }, { "epoch": 0.05, "grad_norm": 4.954040077738562, "learning_rate": 1.9992000834331212e-06, "loss": 1.4392, "step": 7547 }, { "epoch": 0.05, "grad_norm": 4.483495922487406, "learning_rate": 1.9991998711834368e-06, "loss": 1.2413, "step": 7548 }, { "epoch": 0.05, "grad_norm": 4.347338593270482, "learning_rate": 1.9991996589056083e-06, "loss": 1.4145, "step": 7549 }, { "epoch": 0.05, "grad_norm": 5.258883976146873, "learning_rate": 1.9991994465996355e-06, "loss": 1.5194, "step": 7550 }, { "epoch": 0.05, "grad_norm": 4.902386368942976, "learning_rate": 1.9991992342655184e-06, "loss": 1.416, "step": 7551 }, { "epoch": 0.05, "grad_norm": 4.581090920843044, "learning_rate": 1.9991990219032573e-06, "loss": 1.486, "step": 7552 }, { "epoch": 0.05, "grad_norm": 4.976906516981136, "learning_rate": 1.9991988095128523e-06, "loss": 1.3842, "step": 7553 }, { "epoch": 0.05, "grad_norm": 4.746425971427753, "learning_rate": 1.999198597094303e-06, "loss": 1.3763, "step": 7554 }, { "epoch": 0.05, "grad_norm": 5.663033206396401, "learning_rate": 1.99919838464761e-06, "loss": 1.2705, "step": 7555 }, { "epoch": 0.05, "grad_norm": 4.639966126082136, "learning_rate": 1.9991981721727723e-06, "loss": 1.4347, "step": 7556 }, { "epoch": 0.05, "grad_norm": 6.355082730528446, "learning_rate": 1.999197959669791e-06, "loss": 1.2996, "step": 7557 }, { "epoch": 0.05, "grad_norm": 4.347052866110602, "learning_rate": 1.9991977471386655e-06, "loss": 1.2805, "step": 7558 }, { "epoch": 0.05, "grad_norm": 4.831489266966424, "learning_rate": 1.9991975345793955e-06, "loss": 1.4006, "step": 7559 }, { "epoch": 0.05, "grad_norm": 4.311176778473698, "learning_rate": 1.999197321991982e-06, "loss": 1.4189, "step": 7560 }, { "epoch": 0.05, "grad_norm": 4.202810375272943, "learning_rate": 1.9991971093764243e-06, "loss": 1.2585, "step": 7561 }, { "epoch": 0.05, "grad_norm": 4.334873082367487, "learning_rate": 1.999196896732722e-06, "loss": 1.324, "step": 7562 }, { "epoch": 0.05, "grad_norm": 4.95153424400383, "learning_rate": 1.9991966840608764e-06, "loss": 1.3845, "step": 7563 }, { "epoch": 0.05, "grad_norm": 4.588927698547799, "learning_rate": 1.9991964713608863e-06, "loss": 1.2984, "step": 7564 }, { "epoch": 0.05, "grad_norm": 4.541985476062299, "learning_rate": 1.9991962586327523e-06, "loss": 1.2117, "step": 7565 }, { "epoch": 0.05, "grad_norm": 4.255727253615963, "learning_rate": 1.9991960458764743e-06, "loss": 1.366, "step": 7566 }, { "epoch": 0.05, "grad_norm": 4.551957472324326, "learning_rate": 1.9991958330920525e-06, "loss": 1.3601, "step": 7567 }, { "epoch": 0.05, "grad_norm": 6.211898133017765, "learning_rate": 1.999195620279486e-06, "loss": 1.4803, "step": 7568 }, { "epoch": 0.05, "grad_norm": 4.4276203795964895, "learning_rate": 1.9991954074387764e-06, "loss": 1.3188, "step": 7569 }, { "epoch": 0.05, "grad_norm": 4.334674168803675, "learning_rate": 1.9991951945699223e-06, "loss": 1.4219, "step": 7570 }, { "epoch": 0.05, "grad_norm": 4.641557559505302, "learning_rate": 1.999194981672924e-06, "loss": 1.425, "step": 7571 }, { "epoch": 0.05, "grad_norm": 6.014444082861425, "learning_rate": 1.999194768747782e-06, "loss": 1.368, "step": 7572 }, { "epoch": 0.05, "grad_norm": 4.495092107911484, "learning_rate": 1.999194555794496e-06, "loss": 1.4016, "step": 7573 }, { "epoch": 0.05, "grad_norm": 4.8400570785126815, "learning_rate": 1.9991943428130655e-06, "loss": 1.5305, "step": 7574 }, { "epoch": 0.05, "grad_norm": 4.900225894912572, "learning_rate": 1.9991941298034917e-06, "loss": 1.3316, "step": 7575 }, { "epoch": 0.05, "grad_norm": 4.820719590901148, "learning_rate": 1.9991939167657735e-06, "loss": 1.4355, "step": 7576 }, { "epoch": 0.05, "grad_norm": 5.56367202282452, "learning_rate": 1.9991937036999114e-06, "loss": 1.3357, "step": 7577 }, { "epoch": 0.05, "grad_norm": 4.244082494492976, "learning_rate": 1.9991934906059053e-06, "loss": 1.2959, "step": 7578 }, { "epoch": 0.05, "grad_norm": 6.415295209391374, "learning_rate": 1.9991932774837553e-06, "loss": 1.5239, "step": 7579 }, { "epoch": 0.05, "grad_norm": 6.999281965965089, "learning_rate": 1.9991930643334617e-06, "loss": 1.2191, "step": 7580 }, { "epoch": 0.05, "grad_norm": 4.459450874340679, "learning_rate": 1.9991928511550234e-06, "loss": 1.2562, "step": 7581 }, { "epoch": 0.05, "grad_norm": 4.873941361153492, "learning_rate": 1.999192637948442e-06, "loss": 1.543, "step": 7582 }, { "epoch": 0.05, "grad_norm": 4.910439921531112, "learning_rate": 1.999192424713716e-06, "loss": 1.4493, "step": 7583 }, { "epoch": 0.05, "grad_norm": 4.357771223503947, "learning_rate": 1.9991922114508465e-06, "loss": 1.4809, "step": 7584 }, { "epoch": 0.05, "grad_norm": 5.000510152931486, "learning_rate": 1.9991919981598333e-06, "loss": 1.2669, "step": 7585 }, { "epoch": 0.05, "grad_norm": 4.3877562085668576, "learning_rate": 1.9991917848406757e-06, "loss": 1.267, "step": 7586 }, { "epoch": 0.05, "grad_norm": 4.551551766412684, "learning_rate": 1.999191571493374e-06, "loss": 1.2544, "step": 7587 }, { "epoch": 0.05, "grad_norm": 5.3217255983306915, "learning_rate": 1.9991913581179286e-06, "loss": 1.2866, "step": 7588 }, { "epoch": 0.05, "grad_norm": 4.644350150809683, "learning_rate": 1.9991911447143396e-06, "loss": 1.364, "step": 7589 }, { "epoch": 0.05, "grad_norm": 5.061253927602553, "learning_rate": 1.9991909312826067e-06, "loss": 1.3396, "step": 7590 }, { "epoch": 0.05, "grad_norm": 4.264002760737331, "learning_rate": 1.99919071782273e-06, "loss": 1.3014, "step": 7591 }, { "epoch": 0.05, "grad_norm": 4.441916095520014, "learning_rate": 1.9991905043347086e-06, "loss": 1.3047, "step": 7592 }, { "epoch": 0.05, "eval_loss": 1.5700528621673584, "eval_runtime": 4.6061, "eval_samples_per_second": 1.954, "eval_steps_per_second": 1.086, "step": 7592 }, { "epoch": 0.05, "grad_norm": 4.50763366652099, "learning_rate": 1.9991902908185443e-06, "loss": 1.3902, "step": 7593 }, { "epoch": 0.05, "grad_norm": 4.567516624931846, "learning_rate": 1.9991900772742356e-06, "loss": 1.4207, "step": 7594 }, { "epoch": 0.05, "grad_norm": 5.469208455159706, "learning_rate": 1.999189863701783e-06, "loss": 1.3061, "step": 7595 }, { "epoch": 0.05, "grad_norm": 4.507321346854405, "learning_rate": 1.9991896501011868e-06, "loss": 1.2978, "step": 7596 }, { "epoch": 0.05, "grad_norm": 4.705919185623571, "learning_rate": 1.9991894364724462e-06, "loss": 1.4087, "step": 7597 }, { "epoch": 0.05, "grad_norm": 4.830327965545178, "learning_rate": 1.9991892228155626e-06, "loss": 1.2841, "step": 7598 }, { "epoch": 0.05, "grad_norm": 4.576257248046412, "learning_rate": 1.9991890091305347e-06, "loss": 1.3881, "step": 7599 }, { "epoch": 0.05, "grad_norm": 9.788896315184894, "learning_rate": 1.999188795417363e-06, "loss": 1.3785, "step": 7600 }, { "epoch": 0.05, "grad_norm": 4.19557552236981, "learning_rate": 1.9991885816760477e-06, "loss": 1.1425, "step": 7601 }, { "epoch": 0.05, "grad_norm": 4.311391603416675, "learning_rate": 1.9991883679065884e-06, "loss": 1.344, "step": 7602 }, { "epoch": 0.05, "grad_norm": 5.662070728410959, "learning_rate": 1.999188154108985e-06, "loss": 1.2592, "step": 7603 }, { "epoch": 0.05, "grad_norm": 5.030834737000275, "learning_rate": 1.9991879402832382e-06, "loss": 1.3079, "step": 7604 }, { "epoch": 0.05, "grad_norm": 4.244292369442249, "learning_rate": 1.9991877264293475e-06, "loss": 1.302, "step": 7605 }, { "epoch": 0.05, "grad_norm": 6.728935847490792, "learning_rate": 1.999187512547313e-06, "loss": 1.2369, "step": 7606 }, { "epoch": 0.05, "grad_norm": 4.567344152034847, "learning_rate": 1.999187298637135e-06, "loss": 1.3803, "step": 7607 }, { "epoch": 0.05, "grad_norm": 4.6302742862011215, "learning_rate": 1.9991870846988128e-06, "loss": 1.4437, "step": 7608 }, { "epoch": 0.05, "grad_norm": 4.398165571751, "learning_rate": 1.9991868707323467e-06, "loss": 1.4116, "step": 7609 }, { "epoch": 0.05, "grad_norm": 5.387672783123877, "learning_rate": 1.999186656737737e-06, "loss": 1.4482, "step": 7610 }, { "epoch": 0.05, "grad_norm": 4.53482773106228, "learning_rate": 1.9991864427149835e-06, "loss": 1.3436, "step": 7611 }, { "epoch": 0.05, "grad_norm": 5.447229112190835, "learning_rate": 1.9991862286640864e-06, "loss": 1.369, "step": 7612 }, { "epoch": 0.05, "grad_norm": 4.705943022214699, "learning_rate": 1.999186014585046e-06, "loss": 1.4268, "step": 7613 }, { "epoch": 0.05, "grad_norm": 4.135714515083432, "learning_rate": 1.999185800477861e-06, "loss": 1.3231, "step": 7614 }, { "epoch": 0.05, "grad_norm": 4.797316313344802, "learning_rate": 1.9991855863425324e-06, "loss": 1.3912, "step": 7615 }, { "epoch": 0.05, "grad_norm": 4.905245193613705, "learning_rate": 1.9991853721790604e-06, "loss": 1.4603, "step": 7616 }, { "epoch": 0.05, "grad_norm": 4.463673751539228, "learning_rate": 1.9991851579874444e-06, "loss": 1.4263, "step": 7617 }, { "epoch": 0.05, "grad_norm": 4.550066622520543, "learning_rate": 1.999184943767685e-06, "loss": 1.3874, "step": 7618 }, { "epoch": 0.05, "grad_norm": 5.116289304822083, "learning_rate": 1.9991847295197816e-06, "loss": 1.2743, "step": 7619 }, { "epoch": 0.05, "grad_norm": 4.847895909779918, "learning_rate": 1.9991845152437347e-06, "loss": 1.3733, "step": 7620 }, { "epoch": 0.05, "grad_norm": 4.544948051134896, "learning_rate": 1.999184300939544e-06, "loss": 1.3121, "step": 7621 }, { "epoch": 0.05, "grad_norm": 4.9161014187415075, "learning_rate": 1.9991840866072095e-06, "loss": 1.5178, "step": 7622 }, { "epoch": 0.05, "grad_norm": 4.109001303482605, "learning_rate": 1.9991838722467316e-06, "loss": 1.353, "step": 7623 }, { "epoch": 0.05, "grad_norm": 4.689455766716899, "learning_rate": 1.9991836578581098e-06, "loss": 1.4274, "step": 7624 }, { "epoch": 0.05, "grad_norm": 4.347314122371463, "learning_rate": 1.999183443441344e-06, "loss": 1.3521, "step": 7625 }, { "epoch": 0.05, "grad_norm": 4.280665610394947, "learning_rate": 1.999183228996435e-06, "loss": 1.3754, "step": 7626 }, { "epoch": 0.05, "grad_norm": 4.693452337137895, "learning_rate": 1.9991830145233824e-06, "loss": 1.062, "step": 7627 }, { "epoch": 0.05, "grad_norm": 4.630067514312146, "learning_rate": 1.9991828000221856e-06, "loss": 1.3391, "step": 7628 }, { "epoch": 0.05, "grad_norm": 5.652135975578942, "learning_rate": 1.9991825854928454e-06, "loss": 1.4033, "step": 7629 }, { "epoch": 0.05, "grad_norm": 5.310546548692459, "learning_rate": 1.9991823709353616e-06, "loss": 1.4428, "step": 7630 }, { "epoch": 0.05, "grad_norm": 6.575253347742195, "learning_rate": 1.9991821563497343e-06, "loss": 1.4305, "step": 7631 }, { "epoch": 0.05, "grad_norm": 4.510907523838614, "learning_rate": 1.9991819417359635e-06, "loss": 1.3979, "step": 7632 }, { "epoch": 0.05, "grad_norm": 5.147068439311762, "learning_rate": 1.9991817270940483e-06, "loss": 1.5523, "step": 7633 }, { "epoch": 0.05, "grad_norm": 4.630190596622757, "learning_rate": 1.9991815124239905e-06, "loss": 1.4398, "step": 7634 }, { "epoch": 0.05, "grad_norm": 5.3873966974277865, "learning_rate": 1.9991812977257883e-06, "loss": 1.3819, "step": 7635 }, { "epoch": 0.05, "grad_norm": 5.331555845816171, "learning_rate": 1.999181082999443e-06, "loss": 1.451, "step": 7636 }, { "epoch": 0.05, "grad_norm": 4.195597310495803, "learning_rate": 1.9991808682449538e-06, "loss": 1.3387, "step": 7637 }, { "epoch": 0.05, "grad_norm": 5.59757788182534, "learning_rate": 1.999180653462321e-06, "loss": 1.3998, "step": 7638 }, { "epoch": 0.05, "grad_norm": 4.680953946703209, "learning_rate": 1.9991804386515443e-06, "loss": 1.343, "step": 7639 }, { "epoch": 0.05, "grad_norm": 4.707573270991824, "learning_rate": 1.9991802238126246e-06, "loss": 1.5309, "step": 7640 }, { "epoch": 0.05, "grad_norm": 4.574889890879614, "learning_rate": 1.999180008945561e-06, "loss": 1.4282, "step": 7641 }, { "epoch": 0.05, "grad_norm": 5.346226363576077, "learning_rate": 1.999179794050354e-06, "loss": 1.4928, "step": 7642 }, { "epoch": 0.05, "grad_norm": 4.996357306726255, "learning_rate": 1.9991795791270033e-06, "loss": 1.5805, "step": 7643 }, { "epoch": 0.05, "grad_norm": 4.3052586604708765, "learning_rate": 1.999179364175509e-06, "loss": 1.465, "step": 7644 }, { "epoch": 0.05, "grad_norm": 4.8801635316586784, "learning_rate": 1.999179149195871e-06, "loss": 1.4977, "step": 7645 }, { "epoch": 0.05, "grad_norm": 13.141730114747917, "learning_rate": 1.99917893418809e-06, "loss": 1.3392, "step": 7646 }, { "epoch": 0.05, "grad_norm": 4.290557900101686, "learning_rate": 1.999178719152165e-06, "loss": 1.2953, "step": 7647 }, { "epoch": 0.05, "grad_norm": 4.748820077072978, "learning_rate": 1.9991785040880963e-06, "loss": 1.3732, "step": 7648 }, { "epoch": 0.05, "grad_norm": 4.796785189689416, "learning_rate": 1.9991782889958845e-06, "loss": 1.2464, "step": 7649 }, { "epoch": 0.05, "grad_norm": 10.518281175945447, "learning_rate": 1.9991780738755287e-06, "loss": 1.4756, "step": 7650 }, { "epoch": 0.05, "grad_norm": 5.262400482220038, "learning_rate": 1.9991778587270298e-06, "loss": 1.5016, "step": 7651 }, { "epoch": 0.05, "grad_norm": 5.767338958392831, "learning_rate": 1.9991776435503874e-06, "loss": 1.3498, "step": 7652 }, { "epoch": 0.05, "grad_norm": 5.707342790754888, "learning_rate": 1.999177428345601e-06, "loss": 1.4146, "step": 7653 }, { "epoch": 0.05, "grad_norm": 5.837598030259442, "learning_rate": 1.999177213112672e-06, "loss": 1.533, "step": 7654 }, { "epoch": 0.05, "grad_norm": 4.817336305552736, "learning_rate": 1.9991769978515986e-06, "loss": 1.5477, "step": 7655 }, { "epoch": 0.05, "grad_norm": 5.541072638621059, "learning_rate": 1.999176782562382e-06, "loss": 1.4791, "step": 7656 }, { "epoch": 0.05, "grad_norm": 4.362139635494086, "learning_rate": 1.999176567245022e-06, "loss": 1.2908, "step": 7657 }, { "epoch": 0.05, "grad_norm": 5.050374341280424, "learning_rate": 1.9991763518995186e-06, "loss": 1.3763, "step": 7658 }, { "epoch": 0.05, "grad_norm": 4.137338527396757, "learning_rate": 1.9991761365258716e-06, "loss": 1.1767, "step": 7659 }, { "epoch": 0.05, "grad_norm": 4.451534089522701, "learning_rate": 1.999175921124081e-06, "loss": 1.3037, "step": 7660 }, { "epoch": 0.05, "grad_norm": 5.249832801599255, "learning_rate": 1.999175705694147e-06, "loss": 1.5443, "step": 7661 }, { "epoch": 0.05, "grad_norm": 4.446358249752105, "learning_rate": 1.9991754902360694e-06, "loss": 1.426, "step": 7662 }, { "epoch": 0.05, "grad_norm": 4.936234931959161, "learning_rate": 1.9991752747498488e-06, "loss": 1.4527, "step": 7663 }, { "epoch": 0.05, "grad_norm": 5.935401993666435, "learning_rate": 1.9991750592354846e-06, "loss": 1.3183, "step": 7664 }, { "epoch": 0.05, "grad_norm": 6.295307921416386, "learning_rate": 1.999174843692977e-06, "loss": 1.4899, "step": 7665 }, { "epoch": 0.05, "eval_loss": 1.5683300495147705, "eval_runtime": 4.6102, "eval_samples_per_second": 1.952, "eval_steps_per_second": 1.085, "step": 7665 }, { "epoch": 0.05, "grad_norm": 4.858198007581066, "learning_rate": 1.9991746281223257e-06, "loss": 1.3773, "step": 7666 }, { "epoch": 0.05, "grad_norm": 4.418985838139268, "learning_rate": 1.9991744125235314e-06, "loss": 1.3106, "step": 7667 }, { "epoch": 0.05, "grad_norm": 4.318416281379778, "learning_rate": 1.999174196896593e-06, "loss": 1.4869, "step": 7668 }, { "epoch": 0.05, "grad_norm": 4.694071144513556, "learning_rate": 1.9991739812415118e-06, "loss": 1.5267, "step": 7669 }, { "epoch": 0.05, "grad_norm": 4.373950245690968, "learning_rate": 1.999173765558287e-06, "loss": 1.4851, "step": 7670 }, { "epoch": 0.05, "grad_norm": 4.324197333779506, "learning_rate": 1.9991735498469185e-06, "loss": 1.3096, "step": 7671 }, { "epoch": 0.05, "grad_norm": 4.394592318974435, "learning_rate": 1.999173334107407e-06, "loss": 1.3644, "step": 7672 }, { "epoch": 0.05, "grad_norm": 4.266670772743948, "learning_rate": 1.999173118339752e-06, "loss": 1.362, "step": 7673 }, { "epoch": 0.05, "grad_norm": 4.295955296296668, "learning_rate": 1.9991729025439536e-06, "loss": 1.3735, "step": 7674 }, { "epoch": 0.05, "grad_norm": 4.246048670721659, "learning_rate": 1.999172686720012e-06, "loss": 1.449, "step": 7675 }, { "epoch": 0.05, "grad_norm": 4.638944960089106, "learning_rate": 1.999172470867927e-06, "loss": 1.2955, "step": 7676 }, { "epoch": 0.05, "grad_norm": 5.580028912051613, "learning_rate": 1.9991722549876982e-06, "loss": 1.2753, "step": 7677 }, { "epoch": 0.05, "grad_norm": 4.138273856542915, "learning_rate": 1.9991720390793265e-06, "loss": 1.3861, "step": 7678 }, { "epoch": 0.05, "grad_norm": 4.470127672789156, "learning_rate": 1.9991718231428113e-06, "loss": 1.4956, "step": 7679 }, { "epoch": 0.05, "grad_norm": 8.008273460136488, "learning_rate": 1.9991716071781525e-06, "loss": 1.3993, "step": 7680 }, { "epoch": 0.05, "grad_norm": 4.878399604988985, "learning_rate": 1.9991713911853507e-06, "loss": 1.4606, "step": 7681 }, { "epoch": 0.05, "grad_norm": 4.407855914055984, "learning_rate": 1.9991711751644057e-06, "loss": 1.3583, "step": 7682 }, { "epoch": 0.05, "grad_norm": 4.7127898730579885, "learning_rate": 1.999170959115317e-06, "loss": 1.4571, "step": 7683 }, { "epoch": 0.05, "grad_norm": 5.769010678931985, "learning_rate": 1.9991707430380853e-06, "loss": 1.387, "step": 7684 }, { "epoch": 0.05, "grad_norm": 4.755844513192177, "learning_rate": 1.9991705269327098e-06, "loss": 1.5078, "step": 7685 }, { "epoch": 0.05, "grad_norm": 4.318945303055633, "learning_rate": 1.999170310799191e-06, "loss": 1.3274, "step": 7686 }, { "epoch": 0.05, "grad_norm": 4.161086412670242, "learning_rate": 1.9991700946375295e-06, "loss": 1.3265, "step": 7687 }, { "epoch": 0.05, "grad_norm": 4.6455919428608885, "learning_rate": 1.9991698784477243e-06, "loss": 1.3427, "step": 7688 }, { "epoch": 0.05, "grad_norm": 4.898295652476637, "learning_rate": 1.999169662229776e-06, "loss": 1.3903, "step": 7689 }, { "epoch": 0.05, "grad_norm": 5.028746890185826, "learning_rate": 1.9991694459836842e-06, "loss": 1.4847, "step": 7690 }, { "epoch": 0.05, "grad_norm": 4.555505230137718, "learning_rate": 1.9991692297094493e-06, "loss": 1.3457, "step": 7691 }, { "epoch": 0.05, "grad_norm": 5.253081476633083, "learning_rate": 1.9991690134070713e-06, "loss": 1.3342, "step": 7692 }, { "epoch": 0.05, "grad_norm": 4.225732921026402, "learning_rate": 1.99916879707655e-06, "loss": 1.292, "step": 7693 }, { "epoch": 0.05, "grad_norm": 4.265843493123243, "learning_rate": 1.9991685807178848e-06, "loss": 1.3243, "step": 7694 }, { "epoch": 0.05, "grad_norm": 4.624620250875206, "learning_rate": 1.9991683643310767e-06, "loss": 1.1394, "step": 7695 }, { "epoch": 0.05, "grad_norm": 4.509476668068783, "learning_rate": 1.9991681479161255e-06, "loss": 1.3454, "step": 7696 }, { "epoch": 0.05, "grad_norm": 4.790735666943298, "learning_rate": 1.999167931473031e-06, "loss": 1.5233, "step": 7697 }, { "epoch": 0.05, "grad_norm": 4.199591650612746, "learning_rate": 1.9991677150017933e-06, "loss": 1.321, "step": 7698 }, { "epoch": 0.05, "grad_norm": 5.17592961582678, "learning_rate": 1.999167498502412e-06, "loss": 1.3803, "step": 7699 }, { "epoch": 0.05, "grad_norm": 4.6320550743298705, "learning_rate": 1.9991672819748876e-06, "loss": 1.373, "step": 7700 }, { "epoch": 0.05, "grad_norm": 5.043437303463971, "learning_rate": 1.9991670654192204e-06, "loss": 1.3333, "step": 7701 }, { "epoch": 0.05, "grad_norm": 4.209274659703086, "learning_rate": 1.99916684883541e-06, "loss": 1.319, "step": 7702 }, { "epoch": 0.05, "grad_norm": 5.210886527687919, "learning_rate": 1.999166632223456e-06, "loss": 1.4947, "step": 7703 }, { "epoch": 0.05, "grad_norm": 4.3022353892124965, "learning_rate": 1.999166415583359e-06, "loss": 1.3958, "step": 7704 }, { "epoch": 0.05, "grad_norm": 5.048515012407663, "learning_rate": 1.9991661989151186e-06, "loss": 1.2334, "step": 7705 }, { "epoch": 0.05, "grad_norm": 4.616532323992355, "learning_rate": 1.999165982218735e-06, "loss": 1.3724, "step": 7706 }, { "epoch": 0.05, "grad_norm": 5.2642419668717135, "learning_rate": 1.9991657654942086e-06, "loss": 1.3066, "step": 7707 }, { "epoch": 0.05, "grad_norm": 4.384429344311389, "learning_rate": 1.999165548741539e-06, "loss": 1.3015, "step": 7708 }, { "epoch": 0.05, "grad_norm": 5.102380930634415, "learning_rate": 1.999165331960726e-06, "loss": 1.2993, "step": 7709 }, { "epoch": 0.05, "grad_norm": 4.539353904636387, "learning_rate": 1.9991651151517693e-06, "loss": 1.385, "step": 7710 }, { "epoch": 0.05, "grad_norm": 5.352630533139624, "learning_rate": 1.9991648983146704e-06, "loss": 1.3978, "step": 7711 }, { "epoch": 0.05, "grad_norm": 4.280251079924495, "learning_rate": 1.999164681449428e-06, "loss": 1.4009, "step": 7712 }, { "epoch": 0.05, "grad_norm": 5.394229640539235, "learning_rate": 1.9991644645560425e-06, "loss": 1.3591, "step": 7713 }, { "epoch": 0.05, "grad_norm": 4.440874286887443, "learning_rate": 1.9991642476345135e-06, "loss": 1.4757, "step": 7714 }, { "epoch": 0.05, "grad_norm": 5.283008931402802, "learning_rate": 1.999164030684842e-06, "loss": 1.5276, "step": 7715 }, { "epoch": 0.05, "grad_norm": 6.806909170081613, "learning_rate": 1.9991638137070266e-06, "loss": 1.3684, "step": 7716 }, { "epoch": 0.05, "grad_norm": 4.289471770878402, "learning_rate": 1.9991635967010688e-06, "loss": 1.3916, "step": 7717 }, { "epoch": 0.05, "grad_norm": 4.4840091204695725, "learning_rate": 1.9991633796669674e-06, "loss": 1.4037, "step": 7718 }, { "epoch": 0.05, "grad_norm": 4.632919184504854, "learning_rate": 1.999163162604723e-06, "loss": 1.445, "step": 7719 }, { "epoch": 0.05, "grad_norm": 4.917869072196511, "learning_rate": 1.9991629455143354e-06, "loss": 1.4749, "step": 7720 }, { "epoch": 0.05, "grad_norm": 4.872851811456384, "learning_rate": 1.999162728395805e-06, "loss": 1.2289, "step": 7721 }, { "epoch": 0.05, "grad_norm": 5.599025969389777, "learning_rate": 1.9991625112491314e-06, "loss": 1.2262, "step": 7722 }, { "epoch": 0.05, "grad_norm": 4.527501135619346, "learning_rate": 1.9991622940743145e-06, "loss": 1.3792, "step": 7723 }, { "epoch": 0.05, "grad_norm": 5.005597639086992, "learning_rate": 1.999162076871355e-06, "loss": 1.4674, "step": 7724 }, { "epoch": 0.05, "grad_norm": 4.141539620779096, "learning_rate": 1.999161859640252e-06, "loss": 1.2431, "step": 7725 }, { "epoch": 0.05, "grad_norm": 4.903599320285552, "learning_rate": 1.999161642381006e-06, "loss": 1.3699, "step": 7726 }, { "epoch": 0.05, "grad_norm": 4.530716110763893, "learning_rate": 1.999161425093617e-06, "loss": 1.409, "step": 7727 }, { "epoch": 0.05, "grad_norm": 4.419189511613821, "learning_rate": 1.999161207778085e-06, "loss": 1.3161, "step": 7728 }, { "epoch": 0.05, "grad_norm": 4.517876307126391, "learning_rate": 1.99916099043441e-06, "loss": 1.4033, "step": 7729 }, { "epoch": 0.05, "grad_norm": 5.046973948033741, "learning_rate": 1.999160773062592e-06, "loss": 1.4555, "step": 7730 }, { "epoch": 0.05, "grad_norm": 5.20427528859649, "learning_rate": 1.999160555662631e-06, "loss": 1.4801, "step": 7731 }, { "epoch": 0.05, "grad_norm": 4.529459234389931, "learning_rate": 1.9991603382345262e-06, "loss": 1.2787, "step": 7732 }, { "epoch": 0.05, "grad_norm": 4.396261511256881, "learning_rate": 1.9991601207782793e-06, "loss": 1.3565, "step": 7733 }, { "epoch": 0.05, "grad_norm": 4.436307213266451, "learning_rate": 1.999159903293889e-06, "loss": 1.4514, "step": 7734 }, { "epoch": 0.05, "grad_norm": 4.504728249316001, "learning_rate": 1.9991596857813557e-06, "loss": 1.4202, "step": 7735 }, { "epoch": 0.05, "grad_norm": 4.670095542486996, "learning_rate": 1.9991594682406795e-06, "loss": 1.3657, "step": 7736 }, { "epoch": 0.05, "grad_norm": 4.698157926590413, "learning_rate": 1.9991592506718602e-06, "loss": 1.3066, "step": 7737 }, { "epoch": 0.05, "grad_norm": 5.174875762511404, "learning_rate": 1.999159033074898e-06, "loss": 1.3919, "step": 7738 }, { "epoch": 0.05, "eval_loss": 1.5714409351348877, "eval_runtime": 4.603, "eval_samples_per_second": 1.955, "eval_steps_per_second": 1.086, "step": 7738 }, { "epoch": 0.05, "grad_norm": 4.175707810276016, "learning_rate": 1.9991588154497928e-06, "loss": 1.2561, "step": 7739 }, { "epoch": 0.05, "grad_norm": 4.93781445252905, "learning_rate": 1.9991585977965446e-06, "loss": 1.3439, "step": 7740 }, { "epoch": 0.05, "grad_norm": 4.805994091804873, "learning_rate": 1.9991583801151534e-06, "loss": 1.3494, "step": 7741 }, { "epoch": 0.05, "grad_norm": 4.410801166793402, "learning_rate": 1.999158162405619e-06, "loss": 1.4503, "step": 7742 }, { "epoch": 0.05, "grad_norm": 4.420418755361164, "learning_rate": 1.999157944667942e-06, "loss": 1.388, "step": 7743 }, { "epoch": 0.05, "grad_norm": 4.6956053703000284, "learning_rate": 1.999157726902122e-06, "loss": 1.3943, "step": 7744 }, { "epoch": 0.05, "grad_norm": 4.8785853587190315, "learning_rate": 1.999157509108159e-06, "loss": 1.5745, "step": 7745 }, { "epoch": 0.05, "grad_norm": 4.533105281280708, "learning_rate": 1.9991572912860532e-06, "loss": 1.166, "step": 7746 }, { "epoch": 0.05, "grad_norm": 4.54046659217364, "learning_rate": 1.9991570734358043e-06, "loss": 1.3861, "step": 7747 }, { "epoch": 0.05, "grad_norm": 7.421652151878216, "learning_rate": 1.999156855557412e-06, "loss": 1.4499, "step": 7748 }, { "epoch": 0.05, "grad_norm": 4.698522721572353, "learning_rate": 1.9991566376508775e-06, "loss": 1.4939, "step": 7749 }, { "epoch": 0.05, "grad_norm": 4.422380492771535, "learning_rate": 1.9991564197161997e-06, "loss": 1.3063, "step": 7750 }, { "epoch": 0.05, "grad_norm": 4.581948781805496, "learning_rate": 1.999156201753379e-06, "loss": 1.3506, "step": 7751 }, { "epoch": 0.05, "grad_norm": 4.966434544463009, "learning_rate": 1.9991559837624156e-06, "loss": 1.4167, "step": 7752 }, { "epoch": 0.05, "grad_norm": 4.494576354753205, "learning_rate": 1.9991557657433093e-06, "loss": 1.3358, "step": 7753 }, { "epoch": 0.05, "grad_norm": 4.443704979027001, "learning_rate": 1.99915554769606e-06, "loss": 1.2271, "step": 7754 }, { "epoch": 0.05, "grad_norm": 5.05415589509564, "learning_rate": 1.999155329620668e-06, "loss": 1.2592, "step": 7755 }, { "epoch": 0.05, "grad_norm": 4.658098683784834, "learning_rate": 1.999155111517133e-06, "loss": 1.1916, "step": 7756 }, { "epoch": 0.05, "grad_norm": 4.146560184197253, "learning_rate": 1.9991548933854546e-06, "loss": 1.2936, "step": 7757 }, { "epoch": 0.05, "grad_norm": 4.840651733689375, "learning_rate": 1.9991546752256338e-06, "loss": 1.5336, "step": 7758 }, { "epoch": 0.05, "grad_norm": 4.3703138172329385, "learning_rate": 1.9991544570376702e-06, "loss": 1.2503, "step": 7759 }, { "epoch": 0.05, "grad_norm": 4.485609813261699, "learning_rate": 1.9991542388215636e-06, "loss": 1.3837, "step": 7760 }, { "epoch": 0.05, "grad_norm": 4.41336165208518, "learning_rate": 1.9991540205773143e-06, "loss": 1.3231, "step": 7761 }, { "epoch": 0.05, "grad_norm": 4.694239628147161, "learning_rate": 1.9991538023049223e-06, "loss": 1.3579, "step": 7762 }, { "epoch": 0.05, "grad_norm": 8.023323210257608, "learning_rate": 1.9991535840043872e-06, "loss": 1.1974, "step": 7763 }, { "epoch": 0.05, "grad_norm": 7.428292899323734, "learning_rate": 1.9991533656757095e-06, "loss": 1.348, "step": 7764 }, { "epoch": 0.05, "grad_norm": 4.619712492638562, "learning_rate": 1.9991531473188887e-06, "loss": 1.2491, "step": 7765 }, { "epoch": 0.05, "grad_norm": 4.451820552376083, "learning_rate": 1.9991529289339247e-06, "loss": 1.33, "step": 7766 }, { "epoch": 0.05, "grad_norm": 14.239291286538496, "learning_rate": 1.9991527105208186e-06, "loss": 1.4719, "step": 7767 }, { "epoch": 0.05, "grad_norm": 4.982827409515907, "learning_rate": 1.9991524920795693e-06, "loss": 1.2224, "step": 7768 }, { "epoch": 0.05, "grad_norm": 4.655043526777409, "learning_rate": 1.9991522736101774e-06, "loss": 1.4407, "step": 7769 }, { "epoch": 0.05, "grad_norm": 4.651583819715488, "learning_rate": 1.9991520551126428e-06, "loss": 1.4398, "step": 7770 }, { "epoch": 0.05, "grad_norm": 4.235185466684276, "learning_rate": 1.999151836586965e-06, "loss": 1.3399, "step": 7771 }, { "epoch": 0.05, "grad_norm": 7.3405089471179, "learning_rate": 1.9991516180331447e-06, "loss": 1.3231, "step": 7772 }, { "epoch": 0.05, "grad_norm": 4.512421989259268, "learning_rate": 1.9991513994511817e-06, "loss": 1.3765, "step": 7773 }, { "epoch": 0.05, "grad_norm": 5.326834488426423, "learning_rate": 1.9991511808410755e-06, "loss": 1.2044, "step": 7774 }, { "epoch": 0.05, "grad_norm": 4.253178385231695, "learning_rate": 1.999150962202827e-06, "loss": 1.1378, "step": 7775 }, { "epoch": 0.05, "grad_norm": 4.852808156252643, "learning_rate": 1.9991507435364357e-06, "loss": 1.4702, "step": 7776 }, { "epoch": 0.05, "grad_norm": 4.61770150323408, "learning_rate": 1.9991505248419016e-06, "loss": 1.3665, "step": 7777 }, { "epoch": 0.05, "grad_norm": 4.332576443131674, "learning_rate": 1.9991503061192243e-06, "loss": 1.3262, "step": 7778 }, { "epoch": 0.05, "grad_norm": 4.419350171628437, "learning_rate": 1.999150087368405e-06, "loss": 1.4025, "step": 7779 }, { "epoch": 0.05, "grad_norm": 4.29821925327133, "learning_rate": 1.9991498685894427e-06, "loss": 1.344, "step": 7780 }, { "epoch": 0.05, "grad_norm": 4.96186103873307, "learning_rate": 1.999149649782337e-06, "loss": 1.4244, "step": 7781 }, { "epoch": 0.05, "grad_norm": 5.290680575600782, "learning_rate": 1.9991494309470896e-06, "loss": 1.4055, "step": 7782 }, { "epoch": 0.05, "grad_norm": 4.171848548411604, "learning_rate": 1.999149212083699e-06, "loss": 1.4467, "step": 7783 }, { "epoch": 0.05, "grad_norm": 4.483008484867138, "learning_rate": 1.9991489931921657e-06, "loss": 1.3366, "step": 7784 }, { "epoch": 0.05, "grad_norm": 4.716816966629754, "learning_rate": 1.9991487742724894e-06, "loss": 1.4159, "step": 7785 }, { "epoch": 0.05, "grad_norm": 4.250233570874446, "learning_rate": 1.999148555324671e-06, "loss": 1.3413, "step": 7786 }, { "epoch": 0.05, "grad_norm": 4.319258666893267, "learning_rate": 1.9991483363487095e-06, "loss": 1.4302, "step": 7787 }, { "epoch": 0.05, "grad_norm": 4.379931424126575, "learning_rate": 1.9991481173446056e-06, "loss": 1.2281, "step": 7788 }, { "epoch": 0.05, "grad_norm": 4.4928101612259645, "learning_rate": 1.999147898312359e-06, "loss": 1.4116, "step": 7789 }, { "epoch": 0.05, "grad_norm": 5.531646572971643, "learning_rate": 1.9991476792519693e-06, "loss": 1.2289, "step": 7790 }, { "epoch": 0.05, "grad_norm": 4.433755615515491, "learning_rate": 1.9991474601634374e-06, "loss": 1.2472, "step": 7791 }, { "epoch": 0.05, "grad_norm": 4.555845638794292, "learning_rate": 1.9991472410467628e-06, "loss": 1.3909, "step": 7792 }, { "epoch": 0.05, "grad_norm": 6.5910918415917115, "learning_rate": 1.999147021901945e-06, "loss": 1.4906, "step": 7793 }, { "epoch": 0.05, "grad_norm": 7.123243079842171, "learning_rate": 1.999146802728985e-06, "loss": 1.2805, "step": 7794 }, { "epoch": 0.05, "grad_norm": 5.895052706646069, "learning_rate": 1.9991465835278825e-06, "loss": 1.5177, "step": 7795 }, { "epoch": 0.05, "grad_norm": 7.281958515549516, "learning_rate": 1.9991463642986372e-06, "loss": 1.3783, "step": 7796 }, { "epoch": 0.05, "grad_norm": 4.497646384942129, "learning_rate": 1.9991461450412493e-06, "loss": 1.2287, "step": 7797 }, { "epoch": 0.05, "grad_norm": 4.3093800668051925, "learning_rate": 1.9991459257557187e-06, "loss": 1.3017, "step": 7798 }, { "epoch": 0.05, "grad_norm": 4.149845365434638, "learning_rate": 1.9991457064420458e-06, "loss": 1.2618, "step": 7799 }, { "epoch": 0.05, "grad_norm": 4.943966845201791, "learning_rate": 1.99914548710023e-06, "loss": 1.4804, "step": 7800 }, { "epoch": 0.05, "grad_norm": 4.593668643901737, "learning_rate": 1.999145267730271e-06, "loss": 1.4017, "step": 7801 }, { "epoch": 0.05, "grad_norm": 4.864193671590243, "learning_rate": 1.9991450483321703e-06, "loss": 1.5987, "step": 7802 }, { "epoch": 0.05, "grad_norm": 4.858887370929525, "learning_rate": 1.9991448289059268e-06, "loss": 1.2643, "step": 7803 }, { "epoch": 0.05, "grad_norm": 6.364807491989936, "learning_rate": 1.9991446094515405e-06, "loss": 1.3496, "step": 7804 }, { "epoch": 0.05, "grad_norm": 4.169090337062379, "learning_rate": 1.999144389969012e-06, "loss": 1.3751, "step": 7805 }, { "epoch": 0.05, "grad_norm": 4.270304158177507, "learning_rate": 1.9991441704583405e-06, "loss": 1.2546, "step": 7806 }, { "epoch": 0.05, "grad_norm": 4.65265370191332, "learning_rate": 1.9991439509195267e-06, "loss": 1.2049, "step": 7807 }, { "epoch": 0.05, "grad_norm": 5.638393088093555, "learning_rate": 1.9991437313525702e-06, "loss": 1.3842, "step": 7808 }, { "epoch": 0.05, "grad_norm": 4.3623669323791745, "learning_rate": 1.999143511757471e-06, "loss": 1.3777, "step": 7809 }, { "epoch": 0.05, "grad_norm": 5.332668611820201, "learning_rate": 1.9991432921342297e-06, "loss": 1.4373, "step": 7810 }, { "epoch": 0.05, "grad_norm": 4.45144845665693, "learning_rate": 1.9991430724828456e-06, "loss": 1.4446, "step": 7811 }, { "epoch": 0.05, "eval_loss": 1.5688176155090332, "eval_runtime": 4.5803, "eval_samples_per_second": 1.965, "eval_steps_per_second": 1.092, "step": 7811 }, { "epoch": 0.05, "grad_norm": 4.32891217625353, "learning_rate": 1.999142852803319e-06, "loss": 1.341, "step": 7812 }, { "epoch": 0.05, "grad_norm": 4.357331814719709, "learning_rate": 1.9991426330956495e-06, "loss": 1.2788, "step": 7813 }, { "epoch": 0.05, "grad_norm": 4.47179926701401, "learning_rate": 1.999142413359838e-06, "loss": 1.3917, "step": 7814 }, { "epoch": 0.05, "grad_norm": 4.635258202399129, "learning_rate": 1.9991421935958836e-06, "loss": 1.3874, "step": 7815 }, { "epoch": 0.05, "grad_norm": 5.188118711526927, "learning_rate": 1.999141973803787e-06, "loss": 1.5923, "step": 7816 }, { "epoch": 0.05, "grad_norm": 4.170732677350809, "learning_rate": 1.9991417539835478e-06, "loss": 1.2578, "step": 7817 }, { "epoch": 0.05, "grad_norm": 4.452852094348596, "learning_rate": 1.999141534135166e-06, "loss": 1.2733, "step": 7818 }, { "epoch": 0.05, "grad_norm": 4.465874097236842, "learning_rate": 1.9991413142586417e-06, "loss": 1.3696, "step": 7819 }, { "epoch": 0.05, "grad_norm": 4.375335811664349, "learning_rate": 1.999141094353975e-06, "loss": 1.2605, "step": 7820 }, { "epoch": 0.05, "grad_norm": 4.88899512779934, "learning_rate": 1.999140874421166e-06, "loss": 1.4881, "step": 7821 }, { "epoch": 0.05, "grad_norm": 6.507944758478165, "learning_rate": 1.999140654460214e-06, "loss": 1.3324, "step": 7822 }, { "epoch": 0.05, "grad_norm": 5.161837898418599, "learning_rate": 1.99914043447112e-06, "loss": 1.2969, "step": 7823 }, { "epoch": 0.05, "grad_norm": 5.5483531089027265, "learning_rate": 1.9991402144538835e-06, "loss": 1.338, "step": 7824 }, { "epoch": 0.05, "grad_norm": 4.317890046692939, "learning_rate": 1.999139994408504e-06, "loss": 1.3808, "step": 7825 }, { "epoch": 0.05, "grad_norm": 4.85742461700556, "learning_rate": 1.9991397743349826e-06, "loss": 1.2791, "step": 7826 }, { "epoch": 0.05, "grad_norm": 4.251258612356232, "learning_rate": 1.999139554233319e-06, "loss": 1.3822, "step": 7827 }, { "epoch": 0.05, "grad_norm": 4.433372185310212, "learning_rate": 1.9991393341035123e-06, "loss": 1.3458, "step": 7828 }, { "epoch": 0.05, "grad_norm": 5.56654917233872, "learning_rate": 1.9991391139455636e-06, "loss": 1.3783, "step": 7829 }, { "epoch": 0.05, "grad_norm": 5.79917965548318, "learning_rate": 1.9991388937594726e-06, "loss": 1.4272, "step": 7830 }, { "epoch": 0.05, "grad_norm": 4.4833060398499835, "learning_rate": 1.9991386735452385e-06, "loss": 1.423, "step": 7831 }, { "epoch": 0.05, "grad_norm": 4.423563714754524, "learning_rate": 1.9991384533028627e-06, "loss": 1.4966, "step": 7832 }, { "epoch": 0.05, "grad_norm": 4.338117109069359, "learning_rate": 1.999138233032344e-06, "loss": 1.3705, "step": 7833 }, { "epoch": 0.05, "grad_norm": 4.75332256939269, "learning_rate": 1.9991380127336833e-06, "loss": 1.3029, "step": 7834 }, { "epoch": 0.05, "grad_norm": 3.9882628565471494, "learning_rate": 1.99913779240688e-06, "loss": 1.2201, "step": 7835 }, { "epoch": 0.05, "grad_norm": 4.411243434288691, "learning_rate": 1.999137572051934e-06, "loss": 1.419, "step": 7836 }, { "epoch": 0.05, "grad_norm": 4.215995318173886, "learning_rate": 1.999137351668846e-06, "loss": 1.3908, "step": 7837 }, { "epoch": 0.05, "grad_norm": 4.679897615810644, "learning_rate": 1.9991371312576155e-06, "loss": 1.4329, "step": 7838 }, { "epoch": 0.05, "grad_norm": 6.274822122131257, "learning_rate": 1.999136910818243e-06, "loss": 1.2731, "step": 7839 }, { "epoch": 0.05, "grad_norm": 4.444364596853928, "learning_rate": 1.9991366903507275e-06, "loss": 1.2914, "step": 7840 }, { "epoch": 0.05, "grad_norm": 4.375169619321004, "learning_rate": 1.99913646985507e-06, "loss": 1.4331, "step": 7841 }, { "epoch": 0.05, "grad_norm": 4.667044912199263, "learning_rate": 1.99913624933127e-06, "loss": 1.3761, "step": 7842 }, { "epoch": 0.05, "grad_norm": 5.038962012585465, "learning_rate": 1.999136028779328e-06, "loss": 1.3423, "step": 7843 }, { "epoch": 0.05, "grad_norm": 5.001327055835487, "learning_rate": 1.9991358081992437e-06, "loss": 1.3895, "step": 7844 }, { "epoch": 0.05, "grad_norm": 5.02449408586461, "learning_rate": 1.9991355875910165e-06, "loss": 1.2673, "step": 7845 }, { "epoch": 0.05, "grad_norm": 4.581799067890731, "learning_rate": 1.9991353669546474e-06, "loss": 1.3802, "step": 7846 }, { "epoch": 0.05, "grad_norm": 4.560127091757449, "learning_rate": 1.9991351462901357e-06, "loss": 1.3478, "step": 7847 }, { "epoch": 0.05, "grad_norm": 4.761668275999001, "learning_rate": 1.999134925597482e-06, "loss": 1.3887, "step": 7848 }, { "epoch": 0.05, "grad_norm": 4.451504166455566, "learning_rate": 1.9991347048766856e-06, "loss": 1.3024, "step": 7849 }, { "epoch": 0.05, "grad_norm": 5.440109975467861, "learning_rate": 1.999134484127747e-06, "loss": 1.4257, "step": 7850 }, { "epoch": 0.05, "grad_norm": 5.141436439016143, "learning_rate": 1.999134263350667e-06, "loss": 1.3099, "step": 7851 }, { "epoch": 0.05, "grad_norm": 4.860670820133751, "learning_rate": 1.9991340425454436e-06, "loss": 1.3657, "step": 7852 }, { "epoch": 0.05, "grad_norm": 4.448613419574616, "learning_rate": 1.9991338217120784e-06, "loss": 1.3957, "step": 7853 }, { "epoch": 0.05, "grad_norm": 4.671247219573718, "learning_rate": 1.9991336008505705e-06, "loss": 1.3791, "step": 7854 }, { "epoch": 0.05, "grad_norm": 4.959614105606823, "learning_rate": 1.999133379960921e-06, "loss": 1.2945, "step": 7855 }, { "epoch": 0.05, "grad_norm": 4.468128058062564, "learning_rate": 1.9991331590431285e-06, "loss": 1.3073, "step": 7856 }, { "epoch": 0.05, "grad_norm": 5.291204533016882, "learning_rate": 1.9991329380971944e-06, "loss": 1.3787, "step": 7857 }, { "epoch": 0.05, "grad_norm": 4.400267620252943, "learning_rate": 1.9991327171231176e-06, "loss": 1.4282, "step": 7858 }, { "epoch": 0.05, "grad_norm": 5.033168757191306, "learning_rate": 1.9991324961208985e-06, "loss": 1.362, "step": 7859 }, { "epoch": 0.05, "grad_norm": 4.515822451343939, "learning_rate": 1.9991322750905376e-06, "loss": 1.2719, "step": 7860 }, { "epoch": 0.05, "grad_norm": 4.9268503948497395, "learning_rate": 1.999132054032034e-06, "loss": 1.3558, "step": 7861 }, { "epoch": 0.05, "grad_norm": 8.60209526657174, "learning_rate": 1.9991318329453886e-06, "loss": 1.3861, "step": 7862 }, { "epoch": 0.05, "grad_norm": 4.697740151769331, "learning_rate": 1.9991316118306006e-06, "loss": 1.3295, "step": 7863 }, { "epoch": 0.05, "grad_norm": 4.741101177749595, "learning_rate": 1.9991313906876707e-06, "loss": 1.496, "step": 7864 }, { "epoch": 0.05, "grad_norm": 4.500865273629475, "learning_rate": 1.9991311695165986e-06, "loss": 1.5159, "step": 7865 }, { "epoch": 0.05, "grad_norm": 4.083464890835658, "learning_rate": 1.999130948317384e-06, "loss": 1.4502, "step": 7866 }, { "epoch": 0.05, "grad_norm": 4.420747557058192, "learning_rate": 1.999130727090027e-06, "loss": 1.2925, "step": 7867 }, { "epoch": 0.05, "grad_norm": 5.135998349106828, "learning_rate": 1.9991305058345283e-06, "loss": 1.4852, "step": 7868 }, { "epoch": 0.05, "grad_norm": 6.6955333591917965, "learning_rate": 1.9991302845508876e-06, "loss": 1.4416, "step": 7869 }, { "epoch": 0.05, "grad_norm": 4.72760677588766, "learning_rate": 1.9991300632391042e-06, "loss": 1.3802, "step": 7870 }, { "epoch": 0.05, "grad_norm": 4.420290122651671, "learning_rate": 1.999129841899179e-06, "loss": 1.2709, "step": 7871 }, { "epoch": 0.05, "grad_norm": 4.333308736928675, "learning_rate": 1.9991296205311116e-06, "loss": 1.4473, "step": 7872 }, { "epoch": 0.05, "grad_norm": 4.383603976083784, "learning_rate": 1.999129399134902e-06, "loss": 1.3902, "step": 7873 }, { "epoch": 0.05, "grad_norm": 4.371937519098825, "learning_rate": 1.99912917771055e-06, "loss": 1.299, "step": 7874 }, { "epoch": 0.05, "grad_norm": 4.700315092913012, "learning_rate": 1.999128956258056e-06, "loss": 1.3857, "step": 7875 }, { "epoch": 0.05, "grad_norm": 4.189280015488588, "learning_rate": 1.99912873477742e-06, "loss": 1.2397, "step": 7876 }, { "epoch": 0.05, "grad_norm": 4.220725041703095, "learning_rate": 1.9991285132686416e-06, "loss": 1.3956, "step": 7877 }, { "epoch": 0.05, "grad_norm": 4.859350391913755, "learning_rate": 1.999128291731721e-06, "loss": 1.4487, "step": 7878 }, { "epoch": 0.05, "grad_norm": 4.401864852624024, "learning_rate": 1.999128070166659e-06, "loss": 1.4258, "step": 7879 }, { "epoch": 0.05, "grad_norm": 4.44397745741506, "learning_rate": 1.9991278485734543e-06, "loss": 1.3566, "step": 7880 }, { "epoch": 0.05, "grad_norm": 4.91309823898804, "learning_rate": 1.9991276269521075e-06, "loss": 1.3444, "step": 7881 }, { "epoch": 0.05, "grad_norm": 4.667527704893879, "learning_rate": 1.9991274053026184e-06, "loss": 1.3744, "step": 7882 }, { "epoch": 0.05, "grad_norm": 4.309837037247037, "learning_rate": 1.9991271836249876e-06, "loss": 1.2796, "step": 7883 }, { "epoch": 0.05, "grad_norm": 4.381518346415893, "learning_rate": 1.999126961919215e-06, "loss": 1.5014, "step": 7884 }, { "epoch": 0.05, "eval_loss": 1.5658948421478271, "eval_runtime": 4.6115, "eval_samples_per_second": 1.952, "eval_steps_per_second": 1.084, "step": 7884 }, { "epoch": 0.05, "grad_norm": 4.283714767175808, "learning_rate": 1.9991267401852995e-06, "loss": 1.2823, "step": 7885 }, { "epoch": 0.05, "grad_norm": 4.549574675248714, "learning_rate": 1.9991265184232424e-06, "loss": 1.24, "step": 7886 }, { "epoch": 0.05, "grad_norm": 4.479672952540066, "learning_rate": 1.9991262966330434e-06, "loss": 1.3989, "step": 7887 }, { "epoch": 0.05, "grad_norm": 10.377616324433669, "learning_rate": 1.9991260748147017e-06, "loss": 1.5313, "step": 7888 }, { "epoch": 0.05, "grad_norm": 6.573317168040383, "learning_rate": 1.9991258529682186e-06, "loss": 1.4143, "step": 7889 }, { "epoch": 0.05, "grad_norm": 4.8446186121205495, "learning_rate": 1.9991256310935933e-06, "loss": 1.5735, "step": 7890 }, { "epoch": 0.05, "grad_norm": 4.757972434994108, "learning_rate": 1.9991254091908257e-06, "loss": 1.4053, "step": 7891 }, { "epoch": 0.05, "grad_norm": 4.367465286261333, "learning_rate": 1.9991251872599163e-06, "loss": 1.4348, "step": 7892 }, { "epoch": 0.05, "grad_norm": 4.550739875935156, "learning_rate": 1.9991249653008647e-06, "loss": 1.393, "step": 7893 }, { "epoch": 0.05, "grad_norm": 4.936416661660528, "learning_rate": 1.9991247433136713e-06, "loss": 1.3581, "step": 7894 }, { "epoch": 0.05, "grad_norm": 4.614387854754647, "learning_rate": 1.9991245212983356e-06, "loss": 1.3308, "step": 7895 }, { "epoch": 0.05, "grad_norm": 4.41145194509055, "learning_rate": 1.999124299254858e-06, "loss": 1.3587, "step": 7896 }, { "epoch": 0.05, "grad_norm": 5.231837605617733, "learning_rate": 1.9991240771832387e-06, "loss": 1.29, "step": 7897 }, { "epoch": 0.05, "grad_norm": 4.447012506572734, "learning_rate": 1.9991238550834767e-06, "loss": 1.3812, "step": 7898 }, { "epoch": 0.05, "grad_norm": 4.61486671530214, "learning_rate": 1.9991236329555732e-06, "loss": 1.3637, "step": 7899 }, { "epoch": 0.05, "grad_norm": 4.420749615143057, "learning_rate": 1.9991234107995276e-06, "loss": 1.3119, "step": 7900 }, { "epoch": 0.05, "grad_norm": 4.307335991095319, "learning_rate": 1.99912318861534e-06, "loss": 1.1966, "step": 7901 }, { "epoch": 0.05, "grad_norm": 6.507288920172387, "learning_rate": 1.9991229664030104e-06, "loss": 1.3704, "step": 7902 }, { "epoch": 0.05, "grad_norm": 4.7613792064088045, "learning_rate": 1.999122744162539e-06, "loss": 1.4332, "step": 7903 }, { "epoch": 0.05, "grad_norm": 4.863190180472665, "learning_rate": 1.9991225218939254e-06, "loss": 1.2518, "step": 7904 }, { "epoch": 0.05, "grad_norm": 4.493712066053234, "learning_rate": 1.99912229959717e-06, "loss": 1.2204, "step": 7905 }, { "epoch": 0.05, "grad_norm": 4.375741940371645, "learning_rate": 1.9991220772722724e-06, "loss": 1.4047, "step": 7906 }, { "epoch": 0.05, "grad_norm": 4.295892023382768, "learning_rate": 1.999121854919233e-06, "loss": 1.3851, "step": 7907 }, { "epoch": 0.05, "grad_norm": 4.597007234903246, "learning_rate": 1.999121632538052e-06, "loss": 1.3999, "step": 7908 }, { "epoch": 0.05, "grad_norm": 5.881100502163143, "learning_rate": 1.9991214101287287e-06, "loss": 1.3936, "step": 7909 }, { "epoch": 0.05, "grad_norm": 4.350752514009014, "learning_rate": 1.9991211876912635e-06, "loss": 1.2896, "step": 7910 }, { "epoch": 0.05, "grad_norm": 5.220851368581993, "learning_rate": 1.9991209652256565e-06, "loss": 1.4606, "step": 7911 }, { "epoch": 0.05, "grad_norm": 4.796976254952187, "learning_rate": 1.9991207427319073e-06, "loss": 1.6258, "step": 7912 }, { "epoch": 0.05, "grad_norm": 4.768419080865092, "learning_rate": 1.9991205202100167e-06, "loss": 1.3625, "step": 7913 }, { "epoch": 0.05, "grad_norm": 4.233874005716375, "learning_rate": 1.999120297659984e-06, "loss": 1.3188, "step": 7914 }, { "epoch": 0.05, "grad_norm": 4.177645778158895, "learning_rate": 1.999120075081809e-06, "loss": 1.2694, "step": 7915 }, { "epoch": 0.05, "grad_norm": 4.5435422081969445, "learning_rate": 1.9991198524754926e-06, "loss": 1.3417, "step": 7916 }, { "epoch": 0.05, "grad_norm": 5.641311379320699, "learning_rate": 1.9991196298410343e-06, "loss": 1.4612, "step": 7917 }, { "epoch": 0.05, "grad_norm": 4.410378331233864, "learning_rate": 1.9991194071784337e-06, "loss": 1.2973, "step": 7918 }, { "epoch": 0.05, "grad_norm": 4.810443179316793, "learning_rate": 1.9991191844876917e-06, "loss": 1.3293, "step": 7919 }, { "epoch": 0.05, "grad_norm": 10.2524967534719, "learning_rate": 1.999118961768808e-06, "loss": 0.9615, "step": 7920 }, { "epoch": 0.05, "grad_norm": 5.186281083510213, "learning_rate": 1.999118739021782e-06, "loss": 1.3054, "step": 7921 }, { "epoch": 0.05, "grad_norm": 4.40307733933914, "learning_rate": 1.999118516246614e-06, "loss": 1.3466, "step": 7922 }, { "epoch": 0.05, "grad_norm": 4.573887530740586, "learning_rate": 1.9991182934433042e-06, "loss": 1.4167, "step": 7923 }, { "epoch": 0.05, "grad_norm": 4.336000923774887, "learning_rate": 1.9991180706118527e-06, "loss": 1.2728, "step": 7924 }, { "epoch": 0.05, "grad_norm": 4.945110717150292, "learning_rate": 1.9991178477522594e-06, "loss": 1.4016, "step": 7925 }, { "epoch": 0.05, "grad_norm": 5.166560502236018, "learning_rate": 1.9991176248645246e-06, "loss": 1.4965, "step": 7926 }, { "epoch": 0.05, "grad_norm": 5.361818264778021, "learning_rate": 1.9991174019486476e-06, "loss": 1.4443, "step": 7927 }, { "epoch": 0.05, "grad_norm": 4.7580799872674255, "learning_rate": 1.999117179004629e-06, "loss": 1.3874, "step": 7928 }, { "epoch": 0.05, "grad_norm": 4.317283836336687, "learning_rate": 1.999116956032468e-06, "loss": 1.3588, "step": 7929 }, { "epoch": 0.05, "grad_norm": 5.746489083705707, "learning_rate": 1.9991167330321657e-06, "loss": 1.5891, "step": 7930 }, { "epoch": 0.05, "grad_norm": 4.449830229945385, "learning_rate": 1.999116510003722e-06, "loss": 1.3317, "step": 7931 }, { "epoch": 0.05, "grad_norm": 5.924329193284593, "learning_rate": 1.9991162869471358e-06, "loss": 1.3648, "step": 7932 }, { "epoch": 0.05, "grad_norm": 4.338714744766864, "learning_rate": 1.999116063862408e-06, "loss": 1.4679, "step": 7933 }, { "epoch": 0.05, "grad_norm": 4.252926058271161, "learning_rate": 1.9991158407495385e-06, "loss": 1.2192, "step": 7934 }, { "epoch": 0.05, "grad_norm": 4.681644481339857, "learning_rate": 1.9991156176085274e-06, "loss": 1.348, "step": 7935 }, { "epoch": 0.05, "grad_norm": 4.811285298925873, "learning_rate": 1.999115394439374e-06, "loss": 1.44, "step": 7936 }, { "epoch": 0.05, "grad_norm": 5.019647848894639, "learning_rate": 1.999115171242079e-06, "loss": 1.4317, "step": 7937 }, { "epoch": 0.05, "grad_norm": 4.307822714294956, "learning_rate": 1.999114948016643e-06, "loss": 1.5175, "step": 7938 }, { "epoch": 0.05, "grad_norm": 4.659200799819313, "learning_rate": 1.9991147247630646e-06, "loss": 1.5738, "step": 7939 }, { "epoch": 0.05, "grad_norm": 4.323585584812524, "learning_rate": 1.9991145014813447e-06, "loss": 1.2841, "step": 7940 }, { "epoch": 0.05, "grad_norm": 5.877089498727992, "learning_rate": 1.9991142781714826e-06, "loss": 1.2776, "step": 7941 }, { "epoch": 0.05, "grad_norm": 4.278507062876658, "learning_rate": 1.9991140548334796e-06, "loss": 1.2802, "step": 7942 }, { "epoch": 0.05, "grad_norm": 4.628730622051179, "learning_rate": 1.9991138314673343e-06, "loss": 1.1402, "step": 7943 }, { "epoch": 0.05, "grad_norm": 7.448035146551721, "learning_rate": 1.999113608073047e-06, "loss": 1.3304, "step": 7944 }, { "epoch": 0.05, "grad_norm": 9.517121551044305, "learning_rate": 1.9991133846506186e-06, "loss": 1.2432, "step": 7945 }, { "epoch": 0.05, "grad_norm": 4.769881942858478, "learning_rate": 1.9991131612000483e-06, "loss": 1.5942, "step": 7946 }, { "epoch": 0.05, "grad_norm": 4.697333913534161, "learning_rate": 1.999112937721336e-06, "loss": 1.2304, "step": 7947 }, { "epoch": 0.05, "grad_norm": 5.007259309617801, "learning_rate": 1.9991127142144825e-06, "loss": 1.3207, "step": 7948 }, { "epoch": 0.05, "grad_norm": 6.044983166732241, "learning_rate": 1.999112490679487e-06, "loss": 1.3671, "step": 7949 }, { "epoch": 0.05, "grad_norm": 4.8740643030418465, "learning_rate": 1.99911226711635e-06, "loss": 1.4998, "step": 7950 }, { "epoch": 0.05, "grad_norm": 4.497670011244053, "learning_rate": 1.9991120435250713e-06, "loss": 1.3662, "step": 7951 }, { "epoch": 0.05, "grad_norm": 4.482631661738367, "learning_rate": 1.999111819905651e-06, "loss": 1.4118, "step": 7952 }, { "epoch": 0.05, "grad_norm": 4.51675397584892, "learning_rate": 1.999111596258089e-06, "loss": 1.3852, "step": 7953 }, { "epoch": 0.05, "grad_norm": 4.19239560401668, "learning_rate": 1.9991113725823854e-06, "loss": 1.1872, "step": 7954 }, { "epoch": 0.05, "grad_norm": 4.299916242407997, "learning_rate": 1.99911114887854e-06, "loss": 1.3695, "step": 7955 }, { "epoch": 0.05, "grad_norm": 4.81532161685942, "learning_rate": 1.999110925146553e-06, "loss": 1.3898, "step": 7956 }, { "epoch": 0.05, "grad_norm": 4.507899544159196, "learning_rate": 1.9991107013864243e-06, "loss": 1.365, "step": 7957 }, { "epoch": 0.05, "eval_loss": 1.5656800270080566, "eval_runtime": 4.5989, "eval_samples_per_second": 1.957, "eval_steps_per_second": 1.087, "step": 7957 }, { "epoch": 0.05, "grad_norm": 5.468155035828772, "learning_rate": 1.999110477598154e-06, "loss": 1.4633, "step": 7958 }, { "epoch": 0.05, "grad_norm": 4.2164986604969865, "learning_rate": 1.9991102537817422e-06, "loss": 1.3522, "step": 7959 }, { "epoch": 0.05, "grad_norm": 4.822052602668897, "learning_rate": 1.9991100299371885e-06, "loss": 1.5062, "step": 7960 }, { "epoch": 0.05, "grad_norm": 4.428624494794631, "learning_rate": 1.9991098060644933e-06, "loss": 1.4166, "step": 7961 }, { "epoch": 0.05, "grad_norm": 4.444161894170243, "learning_rate": 1.999109582163657e-06, "loss": 1.3916, "step": 7962 }, { "epoch": 0.05, "grad_norm": 4.27619179863361, "learning_rate": 1.9991093582346784e-06, "loss": 1.2963, "step": 7963 }, { "epoch": 0.05, "grad_norm": 5.723855075709038, "learning_rate": 1.9991091342775587e-06, "loss": 1.4435, "step": 7964 }, { "epoch": 0.05, "grad_norm": 4.656683993322921, "learning_rate": 1.999108910292297e-06, "loss": 1.4645, "step": 7965 }, { "epoch": 0.05, "grad_norm": 5.534865508099635, "learning_rate": 1.999108686278894e-06, "loss": 1.3887, "step": 7966 }, { "epoch": 0.05, "grad_norm": 4.602022263169227, "learning_rate": 1.9991084622373497e-06, "loss": 1.4518, "step": 7967 }, { "epoch": 0.05, "grad_norm": 5.004070186843401, "learning_rate": 1.9991082381676635e-06, "loss": 1.277, "step": 7968 }, { "epoch": 0.05, "grad_norm": 4.458295029714002, "learning_rate": 1.999108014069836e-06, "loss": 1.3586, "step": 7969 }, { "epoch": 0.05, "grad_norm": 4.511494213746791, "learning_rate": 1.999107789943867e-06, "loss": 1.3855, "step": 7970 }, { "epoch": 0.05, "grad_norm": 3.955287824354757, "learning_rate": 1.999107565789756e-06, "loss": 1.2679, "step": 7971 }, { "epoch": 0.05, "grad_norm": 4.182679028897944, "learning_rate": 1.9991073416075034e-06, "loss": 1.2102, "step": 7972 }, { "epoch": 0.05, "grad_norm": 4.612797814274257, "learning_rate": 1.9991071173971093e-06, "loss": 1.0973, "step": 7973 }, { "epoch": 0.05, "grad_norm": 4.556509974545994, "learning_rate": 1.999106893158574e-06, "loss": 1.4159, "step": 7974 }, { "epoch": 0.05, "grad_norm": 4.744357186226214, "learning_rate": 1.999106668891897e-06, "loss": 1.4103, "step": 7975 }, { "epoch": 0.05, "grad_norm": 5.116167986576523, "learning_rate": 1.9991064445970787e-06, "loss": 1.5958, "step": 7976 }, { "epoch": 0.05, "grad_norm": 4.751589784125765, "learning_rate": 1.999106220274119e-06, "loss": 1.4511, "step": 7977 }, { "epoch": 0.05, "grad_norm": 7.532163283927442, "learning_rate": 1.9991059959230176e-06, "loss": 1.4321, "step": 7978 }, { "epoch": 0.05, "grad_norm": 4.218111466240033, "learning_rate": 1.9991057715437743e-06, "loss": 1.2408, "step": 7979 }, { "epoch": 0.05, "grad_norm": 4.907482698351131, "learning_rate": 1.99910554713639e-06, "loss": 1.3935, "step": 7980 }, { "epoch": 0.05, "grad_norm": 4.294816998411919, "learning_rate": 1.9991053227008644e-06, "loss": 1.2959, "step": 7981 }, { "epoch": 0.05, "grad_norm": 4.583837137387118, "learning_rate": 1.999105098237197e-06, "loss": 1.2133, "step": 7982 }, { "epoch": 0.05, "grad_norm": 4.710318921218467, "learning_rate": 1.999104873745388e-06, "loss": 1.2935, "step": 7983 }, { "epoch": 0.05, "grad_norm": 5.0482056465076734, "learning_rate": 1.9991046492254377e-06, "loss": 1.3708, "step": 7984 }, { "epoch": 0.05, "grad_norm": 4.492323569193223, "learning_rate": 1.999104424677346e-06, "loss": 1.4503, "step": 7985 }, { "epoch": 0.05, "grad_norm": 6.54824753415298, "learning_rate": 1.999104200101113e-06, "loss": 1.3438, "step": 7986 }, { "epoch": 0.05, "grad_norm": 5.502040850443269, "learning_rate": 1.9991039754967384e-06, "loss": 1.4879, "step": 7987 }, { "epoch": 0.05, "grad_norm": 4.6823869156823354, "learning_rate": 1.999103750864222e-06, "loss": 1.3546, "step": 7988 }, { "epoch": 0.05, "grad_norm": 4.368056132760108, "learning_rate": 1.9991035262035644e-06, "loss": 1.3031, "step": 7989 }, { "epoch": 0.05, "grad_norm": 4.6645893279900275, "learning_rate": 1.9991033015147657e-06, "loss": 1.5497, "step": 7990 }, { "epoch": 0.05, "grad_norm": 4.542573730753874, "learning_rate": 1.999103076797825e-06, "loss": 1.3482, "step": 7991 }, { "epoch": 0.05, "grad_norm": 5.665709613631178, "learning_rate": 1.9991028520527437e-06, "loss": 1.2485, "step": 7992 }, { "epoch": 0.05, "grad_norm": 4.990763824548995, "learning_rate": 1.9991026272795204e-06, "loss": 1.4217, "step": 7993 }, { "epoch": 0.05, "grad_norm": 4.967378873268701, "learning_rate": 1.9991024024781557e-06, "loss": 1.3438, "step": 7994 }, { "epoch": 0.05, "grad_norm": 4.3785765666543845, "learning_rate": 1.99910217764865e-06, "loss": 1.264, "step": 7995 }, { "epoch": 0.05, "grad_norm": 4.184943661373519, "learning_rate": 1.9991019527910025e-06, "loss": 1.2377, "step": 7996 }, { "epoch": 0.05, "grad_norm": 6.05297203094123, "learning_rate": 1.999101727905214e-06, "loss": 1.4009, "step": 7997 }, { "epoch": 0.05, "grad_norm": 4.199034736763347, "learning_rate": 1.9991015029912837e-06, "loss": 1.2605, "step": 7998 }, { "epoch": 0.05, "grad_norm": 4.911575981094752, "learning_rate": 1.9991012780492124e-06, "loss": 1.3555, "step": 7999 }, { "epoch": 0.05, "grad_norm": 5.486350257805041, "learning_rate": 1.9991010530789998e-06, "loss": 1.3395, "step": 8000 }, { "epoch": 0.05, "grad_norm": 6.805556361768766, "learning_rate": 1.9991008280806453e-06, "loss": 1.21, "step": 8001 }, { "epoch": 0.05, "grad_norm": 4.78064675158563, "learning_rate": 1.9991006030541498e-06, "loss": 1.383, "step": 8002 }, { "epoch": 0.05, "grad_norm": 4.3981583216140905, "learning_rate": 1.999100377999513e-06, "loss": 1.4246, "step": 8003 }, { "epoch": 0.05, "grad_norm": 5.032638384032344, "learning_rate": 1.999100152916735e-06, "loss": 1.3025, "step": 8004 }, { "epoch": 0.05, "grad_norm": 4.482588881582473, "learning_rate": 1.9990999278058154e-06, "loss": 1.3108, "step": 8005 }, { "epoch": 0.05, "grad_norm": 4.962625365251077, "learning_rate": 1.9990997026667547e-06, "loss": 1.3236, "step": 8006 }, { "epoch": 0.05, "grad_norm": 5.180179909877052, "learning_rate": 1.9990994774995522e-06, "loss": 1.3536, "step": 8007 }, { "epoch": 0.05, "grad_norm": 4.802706010129838, "learning_rate": 1.999099252304209e-06, "loss": 1.4625, "step": 8008 }, { "epoch": 0.05, "grad_norm": 5.110185539558003, "learning_rate": 1.9990990270807244e-06, "loss": 1.3365, "step": 8009 }, { "epoch": 0.05, "grad_norm": 4.422397453228428, "learning_rate": 1.999098801829098e-06, "loss": 1.3289, "step": 8010 }, { "epoch": 0.05, "grad_norm": 4.332316772085707, "learning_rate": 1.999098576549331e-06, "loss": 1.3325, "step": 8011 }, { "epoch": 0.05, "grad_norm": 4.458263958095114, "learning_rate": 1.9990983512414223e-06, "loss": 1.2976, "step": 8012 }, { "epoch": 0.05, "grad_norm": 4.7067448065935835, "learning_rate": 1.9990981259053723e-06, "loss": 1.4172, "step": 8013 }, { "epoch": 0.05, "grad_norm": 4.782644696484535, "learning_rate": 1.999097900541181e-06, "loss": 1.5318, "step": 8014 }, { "epoch": 0.05, "grad_norm": 5.839935554073177, "learning_rate": 1.9990976751488484e-06, "loss": 1.285, "step": 8015 }, { "epoch": 0.05, "grad_norm": 4.659889508150192, "learning_rate": 1.9990974497283747e-06, "loss": 1.4118, "step": 8016 }, { "epoch": 0.05, "grad_norm": 4.416726312842264, "learning_rate": 1.99909722427976e-06, "loss": 1.4156, "step": 8017 }, { "epoch": 0.05, "grad_norm": 4.644568718442651, "learning_rate": 1.9990969988030037e-06, "loss": 1.3075, "step": 8018 }, { "epoch": 0.05, "grad_norm": 4.8281507216630555, "learning_rate": 1.999096773298106e-06, "loss": 1.3449, "step": 8019 }, { "epoch": 0.05, "grad_norm": 4.261762514200759, "learning_rate": 1.9990965477650676e-06, "loss": 1.3258, "step": 8020 }, { "epoch": 0.05, "grad_norm": 4.683571660693128, "learning_rate": 1.9990963222038877e-06, "loss": 1.1711, "step": 8021 }, { "epoch": 0.05, "grad_norm": 4.613190518882954, "learning_rate": 1.9990960966145667e-06, "loss": 1.45, "step": 8022 }, { "epoch": 0.05, "grad_norm": 4.498647667229803, "learning_rate": 1.999095870997104e-06, "loss": 1.3781, "step": 8023 }, { "epoch": 0.05, "grad_norm": 4.843641654557754, "learning_rate": 1.9990956453515007e-06, "loss": 1.3764, "step": 8024 }, { "epoch": 0.05, "grad_norm": 7.69355776374498, "learning_rate": 1.999095419677756e-06, "loss": 1.3676, "step": 8025 }, { "epoch": 0.05, "grad_norm": 4.866485525806812, "learning_rate": 1.99909519397587e-06, "loss": 1.3919, "step": 8026 }, { "epoch": 0.05, "grad_norm": 4.839147512196882, "learning_rate": 1.999094968245843e-06, "loss": 1.417, "step": 8027 }, { "epoch": 0.05, "grad_norm": 4.822528422916652, "learning_rate": 1.999094742487675e-06, "loss": 1.4444, "step": 8028 }, { "epoch": 0.05, "grad_norm": 4.54834627752157, "learning_rate": 1.999094516701365e-06, "loss": 1.4076, "step": 8029 }, { "epoch": 0.05, "grad_norm": 4.852623418740598, "learning_rate": 1.9990942908869145e-06, "loss": 1.3949, "step": 8030 }, { "epoch": 0.05, "eval_loss": 1.563845157623291, "eval_runtime": 4.6163, "eval_samples_per_second": 1.95, "eval_steps_per_second": 1.083, "step": 8030 }, { "epoch": 0.05, "grad_norm": 4.770142324284321, "learning_rate": 1.9990940650443227e-06, "loss": 1.3439, "step": 8031 }, { "epoch": 0.05, "grad_norm": 4.425790507467108, "learning_rate": 1.99909383917359e-06, "loss": 1.4299, "step": 8032 }, { "epoch": 0.05, "grad_norm": 4.662183949613705, "learning_rate": 1.9990936132747157e-06, "loss": 1.3105, "step": 8033 }, { "epoch": 0.05, "grad_norm": 4.524969193411602, "learning_rate": 1.9990933873477e-06, "loss": 1.2497, "step": 8034 }, { "epoch": 0.05, "grad_norm": 4.710194129926366, "learning_rate": 1.999093161392544e-06, "loss": 1.5204, "step": 8035 }, { "epoch": 0.05, "grad_norm": 4.416709929502793, "learning_rate": 1.9990929354092465e-06, "loss": 1.3901, "step": 8036 }, { "epoch": 0.05, "grad_norm": 4.659000921428525, "learning_rate": 1.9990927093978075e-06, "loss": 1.3202, "step": 8037 }, { "epoch": 0.05, "grad_norm": 4.837349181329828, "learning_rate": 1.999092483358228e-06, "loss": 1.3652, "step": 8038 }, { "epoch": 0.05, "grad_norm": 4.490570955074692, "learning_rate": 1.999092257290507e-06, "loss": 1.5145, "step": 8039 }, { "epoch": 0.05, "grad_norm": 4.672958586547291, "learning_rate": 1.9990920311946453e-06, "loss": 1.4583, "step": 8040 }, { "epoch": 0.05, "grad_norm": 5.354063146841929, "learning_rate": 1.999091805070642e-06, "loss": 1.422, "step": 8041 }, { "epoch": 0.05, "grad_norm": 4.199683664508161, "learning_rate": 1.9990915789184977e-06, "loss": 1.1644, "step": 8042 }, { "epoch": 0.05, "grad_norm": 5.436254873694826, "learning_rate": 1.9990913527382125e-06, "loss": 1.3262, "step": 8043 }, { "epoch": 0.05, "grad_norm": 4.9643955407837685, "learning_rate": 1.999091126529786e-06, "loss": 1.3082, "step": 8044 }, { "epoch": 0.05, "grad_norm": 4.61008019829296, "learning_rate": 1.9990909002932187e-06, "loss": 1.3491, "step": 8045 }, { "epoch": 0.05, "grad_norm": 4.3840310204567725, "learning_rate": 1.99909067402851e-06, "loss": 1.3867, "step": 8046 }, { "epoch": 0.05, "grad_norm": 4.60962567518782, "learning_rate": 1.9990904477356606e-06, "loss": 1.3611, "step": 8047 }, { "epoch": 0.05, "grad_norm": 4.535853222553209, "learning_rate": 1.99909022141467e-06, "loss": 1.3939, "step": 8048 }, { "epoch": 0.05, "grad_norm": 4.937640684983281, "learning_rate": 1.9990899950655386e-06, "loss": 1.4775, "step": 8049 }, { "epoch": 0.05, "grad_norm": 4.1642360975629025, "learning_rate": 1.9990897686882657e-06, "loss": 1.2731, "step": 8050 }, { "epoch": 0.05, "grad_norm": 4.2612358008255615, "learning_rate": 1.999089542282852e-06, "loss": 1.1743, "step": 8051 }, { "epoch": 0.05, "grad_norm": 4.258771985240577, "learning_rate": 1.999089315849297e-06, "loss": 1.3246, "step": 8052 }, { "epoch": 0.05, "grad_norm": 4.376711858300126, "learning_rate": 1.999089089387601e-06, "loss": 1.4183, "step": 8053 }, { "epoch": 0.05, "grad_norm": 5.72324447125782, "learning_rate": 1.999088862897765e-06, "loss": 1.281, "step": 8054 }, { "epoch": 0.05, "grad_norm": 7.146245924107082, "learning_rate": 1.9990886363797866e-06, "loss": 1.2382, "step": 8055 }, { "epoch": 0.05, "grad_norm": 5.281382016603405, "learning_rate": 1.999088409833668e-06, "loss": 1.3788, "step": 8056 }, { "epoch": 0.05, "grad_norm": 4.717696954685534, "learning_rate": 1.999088183259408e-06, "loss": 1.196, "step": 8057 }, { "epoch": 0.05, "grad_norm": 6.413650640572415, "learning_rate": 1.9990879566570074e-06, "loss": 1.4799, "step": 8058 }, { "epoch": 0.05, "grad_norm": 4.5808792742950875, "learning_rate": 1.9990877300264658e-06, "loss": 1.45, "step": 8059 }, { "epoch": 0.05, "grad_norm": 4.075199271728905, "learning_rate": 1.9990875033677827e-06, "loss": 1.2714, "step": 8060 }, { "epoch": 0.05, "grad_norm": 4.8411634561770445, "learning_rate": 1.999087276680959e-06, "loss": 1.4637, "step": 8061 }, { "epoch": 0.05, "grad_norm": 4.730347196004556, "learning_rate": 1.999087049965994e-06, "loss": 1.462, "step": 8062 }, { "epoch": 0.05, "grad_norm": 5.45352777292585, "learning_rate": 1.9990868232228885e-06, "loss": 1.4368, "step": 8063 }, { "epoch": 0.05, "grad_norm": 4.95024326032512, "learning_rate": 1.999086596451642e-06, "loss": 1.2837, "step": 8064 }, { "epoch": 0.05, "grad_norm": 4.546712513743431, "learning_rate": 1.999086369652255e-06, "loss": 1.326, "step": 8065 }, { "epoch": 0.05, "grad_norm": 4.762441048351948, "learning_rate": 1.999086142824726e-06, "loss": 1.3847, "step": 8066 }, { "epoch": 0.05, "grad_norm": 4.086265084186629, "learning_rate": 1.9990859159690564e-06, "loss": 1.1633, "step": 8067 }, { "epoch": 0.05, "grad_norm": 6.362827333991816, "learning_rate": 1.999085689085246e-06, "loss": 1.2907, "step": 8068 }, { "epoch": 0.05, "grad_norm": 4.84438097168186, "learning_rate": 1.999085462173295e-06, "loss": 1.3589, "step": 8069 }, { "epoch": 0.05, "grad_norm": 5.430526178904168, "learning_rate": 1.9990852352332025e-06, "loss": 1.5309, "step": 8070 }, { "epoch": 0.05, "grad_norm": 4.199522070568892, "learning_rate": 1.9990850082649696e-06, "loss": 1.3193, "step": 8071 }, { "epoch": 0.05, "grad_norm": 4.32869786099321, "learning_rate": 1.9990847812685956e-06, "loss": 1.2633, "step": 8072 }, { "epoch": 0.05, "grad_norm": 5.751583966232769, "learning_rate": 1.9990845542440807e-06, "loss": 1.3492, "step": 8073 }, { "epoch": 0.05, "grad_norm": 5.942820926087466, "learning_rate": 1.999084327191425e-06, "loss": 1.5252, "step": 8074 }, { "epoch": 0.05, "grad_norm": 4.591915314043756, "learning_rate": 1.999084100110628e-06, "loss": 1.3366, "step": 8075 }, { "epoch": 0.05, "grad_norm": 4.778276917265565, "learning_rate": 1.9990838730016906e-06, "loss": 1.2202, "step": 8076 }, { "epoch": 0.05, "grad_norm": 5.517214795496894, "learning_rate": 1.999083645864612e-06, "loss": 1.4901, "step": 8077 }, { "epoch": 0.05, "grad_norm": 4.597442320894966, "learning_rate": 1.999083418699393e-06, "loss": 1.3603, "step": 8078 }, { "epoch": 0.05, "grad_norm": 5.3166984015696706, "learning_rate": 1.999083191506033e-06, "loss": 1.3266, "step": 8079 }, { "epoch": 0.05, "grad_norm": 4.346967686875101, "learning_rate": 1.9990829642845316e-06, "loss": 1.3003, "step": 8080 }, { "epoch": 0.05, "grad_norm": 7.410128572251315, "learning_rate": 1.99908273703489e-06, "loss": 1.2521, "step": 8081 }, { "epoch": 0.05, "grad_norm": 7.1329079467163226, "learning_rate": 1.9990825097571073e-06, "loss": 1.4368, "step": 8082 }, { "epoch": 0.05, "grad_norm": 4.634889547626244, "learning_rate": 1.999082282451184e-06, "loss": 1.2245, "step": 8083 }, { "epoch": 0.05, "grad_norm": 4.839212670748647, "learning_rate": 1.999082055117119e-06, "loss": 1.3027, "step": 8084 }, { "epoch": 0.05, "grad_norm": 4.150063099936581, "learning_rate": 1.999081827754914e-06, "loss": 1.3169, "step": 8085 }, { "epoch": 0.05, "grad_norm": 4.554343401129474, "learning_rate": 1.9990816003645683e-06, "loss": 1.3174, "step": 8086 }, { "epoch": 0.05, "grad_norm": 4.9142787613994585, "learning_rate": 1.9990813729460814e-06, "loss": 1.5393, "step": 8087 }, { "epoch": 0.05, "grad_norm": 5.055839235927033, "learning_rate": 1.999081145499454e-06, "loss": 1.3263, "step": 8088 }, { "epoch": 0.05, "grad_norm": 4.193998611837276, "learning_rate": 1.9990809180246856e-06, "loss": 1.391, "step": 8089 }, { "epoch": 0.05, "grad_norm": 4.874541138813153, "learning_rate": 1.999080690521776e-06, "loss": 1.5614, "step": 8090 }, { "epoch": 0.05, "grad_norm": 4.20299120814966, "learning_rate": 1.9990804629907267e-06, "loss": 1.2315, "step": 8091 }, { "epoch": 0.05, "grad_norm": 4.345438405753769, "learning_rate": 1.999080235431536e-06, "loss": 1.2839, "step": 8092 }, { "epoch": 0.05, "grad_norm": 4.4432134053918, "learning_rate": 1.9990800078442043e-06, "loss": 1.3814, "step": 8093 }, { "epoch": 0.05, "grad_norm": 4.733003354675106, "learning_rate": 1.9990797802287323e-06, "loss": 1.4173, "step": 8094 }, { "epoch": 0.05, "grad_norm": 4.810909281814115, "learning_rate": 1.9990795525851193e-06, "loss": 1.5015, "step": 8095 }, { "epoch": 0.05, "grad_norm": 4.936191269032214, "learning_rate": 1.9990793249133654e-06, "loss": 1.2872, "step": 8096 }, { "epoch": 0.05, "grad_norm": 4.955941674963165, "learning_rate": 1.999079097213471e-06, "loss": 1.1613, "step": 8097 }, { "epoch": 0.05, "grad_norm": 4.7887043356105945, "learning_rate": 1.999078869485436e-06, "loss": 1.4362, "step": 8098 }, { "epoch": 0.05, "grad_norm": 4.838216520686419, "learning_rate": 1.9990786417292598e-06, "loss": 1.2953, "step": 8099 }, { "epoch": 0.05, "grad_norm": 4.341617675005489, "learning_rate": 1.999078413944943e-06, "loss": 1.4635, "step": 8100 }, { "epoch": 0.05, "grad_norm": 7.002144125987926, "learning_rate": 1.9990781861324856e-06, "loss": 1.3311, "step": 8101 }, { "epoch": 0.05, "grad_norm": 6.251194460531906, "learning_rate": 1.999077958291888e-06, "loss": 1.5165, "step": 8102 }, { "epoch": 0.05, "grad_norm": 4.8632412765528255, "learning_rate": 1.999077730423149e-06, "loss": 1.3268, "step": 8103 }, { "epoch": 0.05, "eval_loss": 1.5651912689208984, "eval_runtime": 4.5788, "eval_samples_per_second": 1.966, "eval_steps_per_second": 1.092, "step": 8103 }, { "epoch": 0.05, "grad_norm": 5.008887883460614, "learning_rate": 1.9990775025262696e-06, "loss": 1.1079, "step": 8104 }, { "epoch": 0.05, "grad_norm": 4.350921997029759, "learning_rate": 1.9990772746012494e-06, "loss": 1.2946, "step": 8105 }, { "epoch": 0.05, "grad_norm": 6.089920056338072, "learning_rate": 1.999077046648088e-06, "loss": 1.513, "step": 8106 }, { "epoch": 0.05, "grad_norm": 4.9552021225881475, "learning_rate": 1.999076818666787e-06, "loss": 1.0746, "step": 8107 }, { "epoch": 0.05, "grad_norm": 4.505434682306873, "learning_rate": 1.9990765906573446e-06, "loss": 1.4548, "step": 8108 }, { "epoch": 0.05, "grad_norm": 4.160285517656663, "learning_rate": 1.999076362619762e-06, "loss": 1.2986, "step": 8109 }, { "epoch": 0.05, "grad_norm": 4.645818342890148, "learning_rate": 1.999076134554038e-06, "loss": 1.4311, "step": 8110 }, { "epoch": 0.05, "grad_norm": 4.421401785116704, "learning_rate": 1.999075906460174e-06, "loss": 1.3205, "step": 8111 }, { "epoch": 0.05, "grad_norm": 3.9282532997582704, "learning_rate": 1.999075678338169e-06, "loss": 1.1389, "step": 8112 }, { "epoch": 0.05, "grad_norm": 4.554066141171839, "learning_rate": 1.9990754501880237e-06, "loss": 1.3809, "step": 8113 }, { "epoch": 0.05, "grad_norm": 4.322508516203889, "learning_rate": 1.9990752220097377e-06, "loss": 1.3827, "step": 8114 }, { "epoch": 0.05, "grad_norm": 5.111361288098748, "learning_rate": 1.999074993803311e-06, "loss": 1.5507, "step": 8115 }, { "epoch": 0.05, "grad_norm": 4.6004343376874495, "learning_rate": 1.9990747655687435e-06, "loss": 1.3917, "step": 8116 }, { "epoch": 0.05, "grad_norm": 4.425102652583931, "learning_rate": 1.999074537306036e-06, "loss": 1.3274, "step": 8117 }, { "epoch": 0.05, "grad_norm": 4.092531236559646, "learning_rate": 1.9990743090151872e-06, "loss": 1.2099, "step": 8118 }, { "epoch": 0.05, "grad_norm": 4.9732572140275995, "learning_rate": 1.999074080696198e-06, "loss": 1.4398, "step": 8119 }, { "epoch": 0.05, "grad_norm": 5.302127272681002, "learning_rate": 1.9990738523490683e-06, "loss": 1.3473, "step": 8120 }, { "epoch": 0.05, "grad_norm": 4.454496335329512, "learning_rate": 1.999073623973798e-06, "loss": 1.2049, "step": 8121 }, { "epoch": 0.05, "grad_norm": 5.092550662199784, "learning_rate": 1.9990733955703867e-06, "loss": 1.3565, "step": 8122 }, { "epoch": 0.05, "grad_norm": 5.8762127998745886, "learning_rate": 1.9990731671388353e-06, "loss": 1.2918, "step": 8123 }, { "epoch": 0.05, "grad_norm": 4.92831374647037, "learning_rate": 1.9990729386791433e-06, "loss": 1.3664, "step": 8124 }, { "epoch": 0.05, "grad_norm": 4.269137439168466, "learning_rate": 1.999072710191311e-06, "loss": 1.2979, "step": 8125 }, { "epoch": 0.05, "grad_norm": 4.342377066117419, "learning_rate": 1.9990724816753377e-06, "loss": 1.273, "step": 8126 }, { "epoch": 0.05, "grad_norm": 4.192875267629217, "learning_rate": 1.9990722531312237e-06, "loss": 1.2949, "step": 8127 }, { "epoch": 0.05, "grad_norm": 4.634802307585735, "learning_rate": 1.9990720245589695e-06, "loss": 1.3945, "step": 8128 }, { "epoch": 0.05, "grad_norm": 4.469188523963776, "learning_rate": 1.999071795958575e-06, "loss": 1.33, "step": 8129 }, { "epoch": 0.05, "grad_norm": 4.239769048194955, "learning_rate": 1.9990715673300395e-06, "loss": 1.3014, "step": 8130 }, { "epoch": 0.05, "grad_norm": 4.920258791962778, "learning_rate": 1.9990713386733637e-06, "loss": 1.4194, "step": 8131 }, { "epoch": 0.05, "grad_norm": 5.289521187272178, "learning_rate": 1.9990711099885473e-06, "loss": 1.4002, "step": 8132 }, { "epoch": 0.05, "grad_norm": 5.62927614473075, "learning_rate": 1.9990708812755904e-06, "loss": 1.4542, "step": 8133 }, { "epoch": 0.05, "grad_norm": 4.871332993893018, "learning_rate": 1.999070652534493e-06, "loss": 1.3714, "step": 8134 }, { "epoch": 0.05, "grad_norm": 4.422696119803318, "learning_rate": 1.999070423765255e-06, "loss": 1.2183, "step": 8135 }, { "epoch": 0.05, "grad_norm": 5.9151491999559225, "learning_rate": 1.9990701949678767e-06, "loss": 1.5087, "step": 8136 }, { "epoch": 0.05, "grad_norm": 4.5374246993029335, "learning_rate": 1.9990699661423576e-06, "loss": 1.1925, "step": 8137 }, { "epoch": 0.05, "grad_norm": 5.337201794890096, "learning_rate": 1.9990697372886983e-06, "loss": 1.342, "step": 8138 }, { "epoch": 0.05, "grad_norm": 6.630343140609443, "learning_rate": 1.9990695084068985e-06, "loss": 1.1705, "step": 8139 }, { "epoch": 0.06, "grad_norm": 4.69765815208526, "learning_rate": 1.9990692794969586e-06, "loss": 1.1733, "step": 8140 }, { "epoch": 0.06, "grad_norm": 4.345402075168154, "learning_rate": 1.9990690505588772e-06, "loss": 1.3698, "step": 8141 }, { "epoch": 0.06, "grad_norm": 4.631343672968385, "learning_rate": 1.999068821592656e-06, "loss": 1.257, "step": 8142 }, { "epoch": 0.06, "grad_norm": 4.442509378135107, "learning_rate": 1.9990685925982946e-06, "loss": 1.4532, "step": 8143 }, { "epoch": 0.06, "grad_norm": 4.687434316828365, "learning_rate": 1.999068363575793e-06, "loss": 1.316, "step": 8144 }, { "epoch": 0.06, "grad_norm": 4.462368073655987, "learning_rate": 1.99906813452515e-06, "loss": 1.2797, "step": 8145 }, { "epoch": 0.06, "grad_norm": 4.5130780002857245, "learning_rate": 1.9990679054463673e-06, "loss": 1.3643, "step": 8146 }, { "epoch": 0.06, "grad_norm": 4.828893305048497, "learning_rate": 1.999067676339444e-06, "loss": 1.2414, "step": 8147 }, { "epoch": 0.06, "grad_norm": 6.034146510706116, "learning_rate": 1.9990674472043804e-06, "loss": 1.3613, "step": 8148 }, { "epoch": 0.06, "grad_norm": 4.297186875229896, "learning_rate": 1.999067218041176e-06, "loss": 1.2414, "step": 8149 }, { "epoch": 0.06, "grad_norm": 4.7763642568178195, "learning_rate": 1.9990669888498313e-06, "loss": 1.3164, "step": 8150 }, { "epoch": 0.06, "grad_norm": 5.062451573791261, "learning_rate": 1.9990667596303465e-06, "loss": 1.5195, "step": 8151 }, { "epoch": 0.06, "grad_norm": 4.612083552367059, "learning_rate": 1.9990665303827212e-06, "loss": 1.3243, "step": 8152 }, { "epoch": 0.06, "grad_norm": 4.483491492990833, "learning_rate": 1.9990663011069554e-06, "loss": 1.3377, "step": 8153 }, { "epoch": 0.06, "grad_norm": 4.964050117142661, "learning_rate": 1.9990660718030494e-06, "loss": 1.3485, "step": 8154 }, { "epoch": 0.06, "grad_norm": 5.014820366858035, "learning_rate": 1.999065842471003e-06, "loss": 1.2653, "step": 8155 }, { "epoch": 0.06, "grad_norm": 4.754692728145271, "learning_rate": 1.9990656131108158e-06, "loss": 1.4429, "step": 8156 }, { "epoch": 0.06, "grad_norm": 8.519424652866649, "learning_rate": 1.9990653837224886e-06, "loss": 1.402, "step": 8157 }, { "epoch": 0.06, "grad_norm": 4.540035419705552, "learning_rate": 1.9990651543060212e-06, "loss": 1.2572, "step": 8158 }, { "epoch": 0.06, "grad_norm": 6.619455235040363, "learning_rate": 1.9990649248614133e-06, "loss": 1.4367, "step": 8159 }, { "epoch": 0.06, "grad_norm": 4.71675450608144, "learning_rate": 1.999064695388665e-06, "loss": 1.4176, "step": 8160 }, { "epoch": 0.06, "grad_norm": 7.993286865850736, "learning_rate": 1.9990644658877763e-06, "loss": 1.2892, "step": 8161 }, { "epoch": 0.06, "grad_norm": 5.6147445009177295, "learning_rate": 1.9990642363587476e-06, "loss": 1.5793, "step": 8162 }, { "epoch": 0.06, "grad_norm": 5.238736997453921, "learning_rate": 1.9990640068015783e-06, "loss": 1.4, "step": 8163 }, { "epoch": 0.06, "grad_norm": 6.103905343401169, "learning_rate": 1.999063777216269e-06, "loss": 1.5593, "step": 8164 }, { "epoch": 0.06, "grad_norm": 4.584271636284578, "learning_rate": 1.999063547602819e-06, "loss": 1.3605, "step": 8165 }, { "epoch": 0.06, "grad_norm": 4.658753577897113, "learning_rate": 1.999063317961229e-06, "loss": 1.4471, "step": 8166 }, { "epoch": 0.06, "grad_norm": 4.447134445371129, "learning_rate": 1.9990630882914983e-06, "loss": 1.261, "step": 8167 }, { "epoch": 0.06, "grad_norm": 4.620107268350806, "learning_rate": 1.999062858593628e-06, "loss": 1.3733, "step": 8168 }, { "epoch": 0.06, "grad_norm": 4.403263421456244, "learning_rate": 1.9990626288676167e-06, "loss": 1.2944, "step": 8169 }, { "epoch": 0.06, "grad_norm": 4.654332866115498, "learning_rate": 1.9990623991134657e-06, "loss": 1.311, "step": 8170 }, { "epoch": 0.06, "grad_norm": 4.717059859979367, "learning_rate": 1.999062169331174e-06, "loss": 1.3374, "step": 8171 }, { "epoch": 0.06, "grad_norm": 4.582485380958364, "learning_rate": 1.9990619395207425e-06, "loss": 1.2183, "step": 8172 }, { "epoch": 0.06, "grad_norm": 4.330372017051996, "learning_rate": 1.9990617096821707e-06, "loss": 1.324, "step": 8173 }, { "epoch": 0.06, "grad_norm": 5.303122848386714, "learning_rate": 1.9990614798154583e-06, "loss": 1.3615, "step": 8174 }, { "epoch": 0.06, "grad_norm": 4.593865282301448, "learning_rate": 1.999061249920606e-06, "loss": 1.4621, "step": 8175 }, { "epoch": 0.06, "grad_norm": 4.706563442228462, "learning_rate": 1.999061019997613e-06, "loss": 1.3489, "step": 8176 }, { "epoch": 0.06, "eval_loss": 1.5680981874465942, "eval_runtime": 4.6026, "eval_samples_per_second": 1.955, "eval_steps_per_second": 1.086, "step": 8176 }, { "epoch": 0.06, "grad_norm": 5.004139685310967, "learning_rate": 1.99906079004648e-06, "loss": 1.4981, "step": 8177 }, { "epoch": 0.06, "grad_norm": 4.3104376332742405, "learning_rate": 1.9990605600672067e-06, "loss": 1.3347, "step": 8178 }, { "epoch": 0.06, "grad_norm": 4.80355136067286, "learning_rate": 1.9990603300597937e-06, "loss": 1.3438, "step": 8179 }, { "epoch": 0.06, "grad_norm": 5.289810607765796, "learning_rate": 1.99906010002424e-06, "loss": 1.4534, "step": 8180 }, { "epoch": 0.06, "grad_norm": 6.165244859201323, "learning_rate": 1.999059869960546e-06, "loss": 1.4254, "step": 8181 }, { "epoch": 0.06, "grad_norm": 5.194961481009174, "learning_rate": 1.999059639868712e-06, "loss": 1.3516, "step": 8182 }, { "epoch": 0.06, "grad_norm": 4.484051437901652, "learning_rate": 1.9990594097487382e-06, "loss": 1.5406, "step": 8183 }, { "epoch": 0.06, "grad_norm": 7.2052409589842314, "learning_rate": 1.9990591796006238e-06, "loss": 1.5056, "step": 8184 }, { "epoch": 0.06, "grad_norm": 4.527318203756598, "learning_rate": 1.999058949424369e-06, "loss": 1.5218, "step": 8185 }, { "epoch": 0.06, "grad_norm": 5.462347342839494, "learning_rate": 1.9990587192199744e-06, "loss": 1.4686, "step": 8186 }, { "epoch": 0.06, "grad_norm": 4.342169819394465, "learning_rate": 1.9990584889874395e-06, "loss": 1.3446, "step": 8187 }, { "epoch": 0.06, "grad_norm": 4.3229422682438114, "learning_rate": 1.9990582587267645e-06, "loss": 1.2816, "step": 8188 }, { "epoch": 0.06, "grad_norm": 5.256379348200149, "learning_rate": 1.9990580284379493e-06, "loss": 1.2799, "step": 8189 }, { "epoch": 0.06, "grad_norm": 4.416028606240512, "learning_rate": 1.999057798120994e-06, "loss": 1.271, "step": 8190 }, { "epoch": 0.06, "grad_norm": 4.350518838122808, "learning_rate": 1.9990575677758987e-06, "loss": 1.3903, "step": 8191 }, { "epoch": 0.06, "grad_norm": 4.423070548389722, "learning_rate": 1.999057337402663e-06, "loss": 1.3614, "step": 8192 }, { "epoch": 0.06, "grad_norm": 4.665553755458738, "learning_rate": 1.9990571070012875e-06, "loss": 1.3726, "step": 8193 }, { "epoch": 0.06, "grad_norm": 4.109205964016318, "learning_rate": 1.9990568765717717e-06, "loss": 1.2828, "step": 8194 }, { "epoch": 0.06, "grad_norm": 5.287821304594056, "learning_rate": 1.999056646114116e-06, "loss": 1.5666, "step": 8195 }, { "epoch": 0.06, "grad_norm": 4.529713016551012, "learning_rate": 1.9990564156283197e-06, "loss": 1.3478, "step": 8196 }, { "epoch": 0.06, "grad_norm": 5.784976617486848, "learning_rate": 1.9990561851143836e-06, "loss": 1.4471, "step": 8197 }, { "epoch": 0.06, "grad_norm": 4.454400798451111, "learning_rate": 1.9990559545723077e-06, "loss": 1.2595, "step": 8198 }, { "epoch": 0.06, "grad_norm": 5.707120385263027, "learning_rate": 1.9990557240020913e-06, "loss": 1.2314, "step": 8199 }, { "epoch": 0.06, "grad_norm": 4.633407491887176, "learning_rate": 1.9990554934037347e-06, "loss": 1.3608, "step": 8200 }, { "epoch": 0.06, "grad_norm": 5.260836740350598, "learning_rate": 1.9990552627772385e-06, "loss": 1.3762, "step": 8201 }, { "epoch": 0.06, "grad_norm": 4.784082329367463, "learning_rate": 1.999055032122602e-06, "loss": 1.5616, "step": 8202 }, { "epoch": 0.06, "grad_norm": 4.253611588958306, "learning_rate": 1.9990548014398256e-06, "loss": 1.3073, "step": 8203 }, { "epoch": 0.06, "grad_norm": 5.1595440466349896, "learning_rate": 1.999054570728909e-06, "loss": 1.1739, "step": 8204 }, { "epoch": 0.06, "grad_norm": 5.295805369092527, "learning_rate": 1.999054339989852e-06, "loss": 1.4336, "step": 8205 }, { "epoch": 0.06, "grad_norm": 4.507015401056458, "learning_rate": 1.9990541092226557e-06, "loss": 1.2707, "step": 8206 }, { "epoch": 0.06, "grad_norm": 4.439768181495246, "learning_rate": 1.9990538784273186e-06, "loss": 1.4005, "step": 8207 }, { "epoch": 0.06, "grad_norm": 5.466976867515769, "learning_rate": 1.9990536476038423e-06, "loss": 1.349, "step": 8208 }, { "epoch": 0.06, "grad_norm": 6.636619163511564, "learning_rate": 1.9990534167522255e-06, "loss": 1.3271, "step": 8209 }, { "epoch": 0.06, "grad_norm": 4.3467939491734615, "learning_rate": 1.9990531858724685e-06, "loss": 1.3297, "step": 8210 }, { "epoch": 0.06, "grad_norm": 4.581184592576977, "learning_rate": 1.999052954964572e-06, "loss": 1.2271, "step": 8211 }, { "epoch": 0.06, "grad_norm": 5.144152459181064, "learning_rate": 1.999052724028535e-06, "loss": 1.3265, "step": 8212 }, { "epoch": 0.06, "grad_norm": 4.153932906417823, "learning_rate": 1.9990524930643584e-06, "loss": 1.2782, "step": 8213 }, { "epoch": 0.06, "grad_norm": 5.105434301335586, "learning_rate": 1.9990522620720417e-06, "loss": 1.4297, "step": 8214 }, { "epoch": 0.06, "grad_norm": 6.270416839900056, "learning_rate": 1.999052031051585e-06, "loss": 1.2907, "step": 8215 }, { "epoch": 0.06, "grad_norm": 4.905599374416815, "learning_rate": 1.9990518000029885e-06, "loss": 1.3438, "step": 8216 }, { "epoch": 0.06, "grad_norm": 4.670361292521902, "learning_rate": 1.9990515689262514e-06, "loss": 1.4586, "step": 8217 }, { "epoch": 0.06, "grad_norm": 4.352520738279014, "learning_rate": 1.999051337821375e-06, "loss": 1.3927, "step": 8218 }, { "epoch": 0.06, "grad_norm": 4.777592430544689, "learning_rate": 1.9990511066883586e-06, "loss": 1.4646, "step": 8219 }, { "epoch": 0.06, "grad_norm": 4.982101908623657, "learning_rate": 1.9990508755272016e-06, "loss": 1.569, "step": 8220 }, { "epoch": 0.06, "grad_norm": 4.088346705840035, "learning_rate": 1.9990506443379053e-06, "loss": 1.1569, "step": 8221 }, { "epoch": 0.06, "grad_norm": 5.438399638784706, "learning_rate": 1.999050413120469e-06, "loss": 1.3463, "step": 8222 }, { "epoch": 0.06, "grad_norm": 4.770971061912871, "learning_rate": 1.9990501818748928e-06, "loss": 1.4733, "step": 8223 }, { "epoch": 0.06, "grad_norm": 4.643701043309167, "learning_rate": 1.9990499506011765e-06, "loss": 1.3408, "step": 8224 }, { "epoch": 0.06, "grad_norm": 5.605134780579443, "learning_rate": 1.99904971929932e-06, "loss": 1.3933, "step": 8225 }, { "epoch": 0.06, "grad_norm": 5.212881130818967, "learning_rate": 1.999049487969324e-06, "loss": 1.3868, "step": 8226 }, { "epoch": 0.06, "grad_norm": 4.379932623306622, "learning_rate": 1.9990492566111883e-06, "loss": 1.286, "step": 8227 }, { "epoch": 0.06, "grad_norm": 4.302565049798104, "learning_rate": 1.9990490252249123e-06, "loss": 1.4654, "step": 8228 }, { "epoch": 0.06, "grad_norm": 5.230311565955441, "learning_rate": 1.9990487938104967e-06, "loss": 1.5078, "step": 8229 }, { "epoch": 0.06, "grad_norm": 4.607159448600373, "learning_rate": 1.999048562367941e-06, "loss": 1.4102, "step": 8230 }, { "epoch": 0.06, "grad_norm": 4.576174547189744, "learning_rate": 1.9990483308972455e-06, "loss": 1.3934, "step": 8231 }, { "epoch": 0.06, "grad_norm": 4.518751709418994, "learning_rate": 1.99904809939841e-06, "loss": 1.4263, "step": 8232 }, { "epoch": 0.06, "grad_norm": 4.350862240947364, "learning_rate": 1.999047867871435e-06, "loss": 1.31, "step": 8233 }, { "epoch": 0.06, "grad_norm": 5.695325429858104, "learning_rate": 1.9990476363163196e-06, "loss": 1.3402, "step": 8234 }, { "epoch": 0.06, "grad_norm": 4.299995288856469, "learning_rate": 1.999047404733065e-06, "loss": 1.3675, "step": 8235 }, { "epoch": 0.06, "grad_norm": 5.25777156337636, "learning_rate": 1.99904717312167e-06, "loss": 1.4539, "step": 8236 }, { "epoch": 0.06, "grad_norm": 4.582696943694878, "learning_rate": 1.9990469414821356e-06, "loss": 1.2846, "step": 8237 }, { "epoch": 0.06, "grad_norm": 4.343873557143485, "learning_rate": 1.999046709814461e-06, "loss": 1.2103, "step": 8238 }, { "epoch": 0.06, "grad_norm": 4.810541817048231, "learning_rate": 1.9990464781186465e-06, "loss": 1.389, "step": 8239 }, { "epoch": 0.06, "grad_norm": 4.948976099055948, "learning_rate": 1.9990462463946925e-06, "loss": 1.4962, "step": 8240 }, { "epoch": 0.06, "grad_norm": 5.296171927722328, "learning_rate": 1.999046014642599e-06, "loss": 1.4292, "step": 8241 }, { "epoch": 0.06, "grad_norm": 5.172457818378872, "learning_rate": 1.999045782862365e-06, "loss": 1.4609, "step": 8242 }, { "epoch": 0.06, "grad_norm": 7.848331689769885, "learning_rate": 1.9990455510539916e-06, "loss": 1.2819, "step": 8243 }, { "epoch": 0.06, "grad_norm": 4.603434152628308, "learning_rate": 1.9990453192174783e-06, "loss": 1.3733, "step": 8244 }, { "epoch": 0.06, "grad_norm": 4.06967132791478, "learning_rate": 1.9990450873528252e-06, "loss": 1.242, "step": 8245 }, { "epoch": 0.06, "grad_norm": 4.409995740321932, "learning_rate": 1.9990448554600325e-06, "loss": 1.3252, "step": 8246 }, { "epoch": 0.06, "grad_norm": 4.738005509137268, "learning_rate": 1.9990446235390997e-06, "loss": 1.561, "step": 8247 }, { "epoch": 0.06, "grad_norm": 4.490052438206556, "learning_rate": 1.9990443915900275e-06, "loss": 1.4185, "step": 8248 }, { "epoch": 0.06, "grad_norm": 5.363836131497134, "learning_rate": 1.9990441596128152e-06, "loss": 1.3816, "step": 8249 }, { "epoch": 0.06, "eval_loss": 1.5642991065979004, "eval_runtime": 4.5979, "eval_samples_per_second": 1.957, "eval_steps_per_second": 1.087, "step": 8249 }, { "epoch": 0.06, "grad_norm": 4.724668924954548, "learning_rate": 1.9990439276074637e-06, "loss": 1.4598, "step": 8250 }, { "epoch": 0.06, "grad_norm": 4.716430565015037, "learning_rate": 1.9990436955739716e-06, "loss": 1.3109, "step": 8251 }, { "epoch": 0.06, "grad_norm": 5.888975945354197, "learning_rate": 1.9990434635123402e-06, "loss": 1.5034, "step": 8252 }, { "epoch": 0.06, "grad_norm": 4.326722261213911, "learning_rate": 1.9990432314225696e-06, "loss": 1.4211, "step": 8253 }, { "epoch": 0.06, "grad_norm": 5.637418593708245, "learning_rate": 1.9990429993046583e-06, "loss": 1.4174, "step": 8254 }, { "epoch": 0.06, "grad_norm": 4.485291731666469, "learning_rate": 1.999042767158608e-06, "loss": 1.4369, "step": 8255 }, { "epoch": 0.06, "grad_norm": 4.431705944415164, "learning_rate": 1.9990425349844176e-06, "loss": 1.4001, "step": 8256 }, { "epoch": 0.06, "grad_norm": 4.598067318289846, "learning_rate": 1.9990423027820877e-06, "loss": 1.4473, "step": 8257 }, { "epoch": 0.06, "grad_norm": 5.06084813647273, "learning_rate": 1.999042070551618e-06, "loss": 1.4613, "step": 8258 }, { "epoch": 0.06, "grad_norm": 4.823026033041981, "learning_rate": 1.999041838293009e-06, "loss": 1.3917, "step": 8259 }, { "epoch": 0.06, "grad_norm": 4.5145775182747165, "learning_rate": 1.9990416060062598e-06, "loss": 1.3649, "step": 8260 }, { "epoch": 0.06, "grad_norm": 4.527681517531891, "learning_rate": 1.999041373691371e-06, "loss": 1.4601, "step": 8261 }, { "epoch": 0.06, "grad_norm": 4.356568913087144, "learning_rate": 1.9990411413483426e-06, "loss": 1.2418, "step": 8262 }, { "epoch": 0.06, "grad_norm": 4.442988430813115, "learning_rate": 1.9990409089771745e-06, "loss": 1.4027, "step": 8263 }, { "epoch": 0.06, "grad_norm": 4.568982692907251, "learning_rate": 1.999040676577867e-06, "loss": 1.3534, "step": 8264 }, { "epoch": 0.06, "grad_norm": 4.795929785949326, "learning_rate": 1.9990404441504195e-06, "loss": 1.1863, "step": 8265 }, { "epoch": 0.06, "grad_norm": 4.953438182013541, "learning_rate": 1.9990402116948323e-06, "loss": 1.4043, "step": 8266 }, { "epoch": 0.06, "grad_norm": 4.984216225883633, "learning_rate": 1.9990399792111057e-06, "loss": 1.2769, "step": 8267 }, { "epoch": 0.06, "grad_norm": 4.172049235607687, "learning_rate": 1.9990397466992395e-06, "loss": 1.3389, "step": 8268 }, { "epoch": 0.06, "grad_norm": 4.5793249436205405, "learning_rate": 1.9990395141592335e-06, "loss": 1.2399, "step": 8269 }, { "epoch": 0.06, "grad_norm": 5.1413365370869455, "learning_rate": 1.999039281591088e-06, "loss": 1.5652, "step": 8270 }, { "epoch": 0.06, "grad_norm": 5.6265954357796195, "learning_rate": 1.999039048994803e-06, "loss": 1.3478, "step": 8271 }, { "epoch": 0.06, "grad_norm": 4.64472449885365, "learning_rate": 1.999038816370378e-06, "loss": 1.2266, "step": 8272 }, { "epoch": 0.06, "grad_norm": 4.576713415359496, "learning_rate": 1.9990385837178136e-06, "loss": 1.4215, "step": 8273 }, { "epoch": 0.06, "grad_norm": 4.311771019591452, "learning_rate": 1.9990383510371095e-06, "loss": 1.3022, "step": 8274 }, { "epoch": 0.06, "grad_norm": 5.2912808195945065, "learning_rate": 1.999038118328266e-06, "loss": 1.3223, "step": 8275 }, { "epoch": 0.06, "grad_norm": 4.259185697054471, "learning_rate": 1.999037885591283e-06, "loss": 1.3435, "step": 8276 }, { "epoch": 0.06, "grad_norm": 5.941675015218654, "learning_rate": 1.9990376528261604e-06, "loss": 1.2397, "step": 8277 }, { "epoch": 0.06, "grad_norm": 4.350089893975519, "learning_rate": 1.999037420032898e-06, "loss": 1.1177, "step": 8278 }, { "epoch": 0.06, "grad_norm": 4.867009482812644, "learning_rate": 1.9990371872114962e-06, "loss": 1.5145, "step": 8279 }, { "epoch": 0.06, "grad_norm": 5.137856817486438, "learning_rate": 1.999036954361955e-06, "loss": 1.3328, "step": 8280 }, { "epoch": 0.06, "grad_norm": 4.9084280239868106, "learning_rate": 1.999036721484274e-06, "loss": 1.4189, "step": 8281 }, { "epoch": 0.06, "grad_norm": 4.933105580358515, "learning_rate": 1.9990364885784533e-06, "loss": 1.3909, "step": 8282 }, { "epoch": 0.06, "grad_norm": 4.121523815240315, "learning_rate": 1.9990362556444936e-06, "loss": 1.2405, "step": 8283 }, { "epoch": 0.06, "grad_norm": 4.678365530703249, "learning_rate": 1.9990360226823937e-06, "loss": 1.3501, "step": 8284 }, { "epoch": 0.06, "grad_norm": 4.1656022022504375, "learning_rate": 1.9990357896921546e-06, "loss": 1.3065, "step": 8285 }, { "epoch": 0.06, "grad_norm": 4.812306620373129, "learning_rate": 1.9990355566737762e-06, "loss": 1.2899, "step": 8286 }, { "epoch": 0.06, "grad_norm": 5.088029165932999, "learning_rate": 1.9990353236272586e-06, "loss": 1.1263, "step": 8287 }, { "epoch": 0.06, "grad_norm": 4.277715583582814, "learning_rate": 1.9990350905526008e-06, "loss": 1.3021, "step": 8288 }, { "epoch": 0.06, "grad_norm": 5.54563260372641, "learning_rate": 1.9990348574498037e-06, "loss": 1.3948, "step": 8289 }, { "epoch": 0.06, "grad_norm": 4.078209195401177, "learning_rate": 1.9990346243188673e-06, "loss": 1.0781, "step": 8290 }, { "epoch": 0.06, "grad_norm": 5.005196801851178, "learning_rate": 1.9990343911597913e-06, "loss": 1.408, "step": 8291 }, { "epoch": 0.06, "grad_norm": 4.053339754093099, "learning_rate": 1.9990341579725755e-06, "loss": 1.1854, "step": 8292 }, { "epoch": 0.06, "grad_norm": 5.20351410089857, "learning_rate": 1.9990339247572204e-06, "loss": 1.4826, "step": 8293 }, { "epoch": 0.06, "grad_norm": 4.5590791381126845, "learning_rate": 1.999033691513726e-06, "loss": 1.2966, "step": 8294 }, { "epoch": 0.06, "grad_norm": 4.433799054499765, "learning_rate": 1.999033458242092e-06, "loss": 1.39, "step": 8295 }, { "epoch": 0.06, "grad_norm": 5.257488976646479, "learning_rate": 1.9990332249423187e-06, "loss": 1.404, "step": 8296 }, { "epoch": 0.06, "grad_norm": 5.681593272768738, "learning_rate": 1.999032991614406e-06, "loss": 1.3293, "step": 8297 }, { "epoch": 0.06, "grad_norm": 4.884480931323584, "learning_rate": 1.999032758258354e-06, "loss": 1.477, "step": 8298 }, { "epoch": 0.06, "grad_norm": 4.416750305037304, "learning_rate": 1.999032524874162e-06, "loss": 1.4386, "step": 8299 }, { "epoch": 0.06, "grad_norm": 5.661037174326694, "learning_rate": 1.9990322914618313e-06, "loss": 1.2421, "step": 8300 }, { "epoch": 0.06, "grad_norm": 4.124176017773704, "learning_rate": 1.9990320580213608e-06, "loss": 1.3226, "step": 8301 }, { "epoch": 0.06, "grad_norm": 5.111246829435499, "learning_rate": 1.9990318245527505e-06, "loss": 1.382, "step": 8302 }, { "epoch": 0.06, "grad_norm": 4.072916429433893, "learning_rate": 1.9990315910560013e-06, "loss": 1.2825, "step": 8303 }, { "epoch": 0.06, "grad_norm": 5.759840034049702, "learning_rate": 1.9990313575311125e-06, "loss": 1.3343, "step": 8304 }, { "epoch": 0.06, "grad_norm": 4.4830622737046335, "learning_rate": 1.999031123978085e-06, "loss": 1.4941, "step": 8305 }, { "epoch": 0.06, "grad_norm": 6.960919643282049, "learning_rate": 1.999030890396917e-06, "loss": 1.3745, "step": 8306 }, { "epoch": 0.06, "grad_norm": 4.4271138937610885, "learning_rate": 1.9990306567876103e-06, "loss": 1.3604, "step": 8307 }, { "epoch": 0.06, "grad_norm": 4.40386035540864, "learning_rate": 1.999030423150164e-06, "loss": 1.3374, "step": 8308 }, { "epoch": 0.06, "grad_norm": 4.26099550199566, "learning_rate": 1.999030189484578e-06, "loss": 1.2988, "step": 8309 }, { "epoch": 0.06, "grad_norm": 4.172925614709736, "learning_rate": 1.9990299557908532e-06, "loss": 1.3681, "step": 8310 }, { "epoch": 0.06, "grad_norm": 5.224800880866437, "learning_rate": 1.999029722068989e-06, "loss": 1.4005, "step": 8311 }, { "epoch": 0.06, "grad_norm": 4.542417807291675, "learning_rate": 1.9990294883189855e-06, "loss": 1.4132, "step": 8312 }, { "epoch": 0.06, "grad_norm": 4.877345293687558, "learning_rate": 1.9990292545408423e-06, "loss": 1.402, "step": 8313 }, { "epoch": 0.06, "grad_norm": 5.431716513454659, "learning_rate": 1.9990290207345597e-06, "loss": 1.415, "step": 8314 }, { "epoch": 0.06, "grad_norm": 4.233003724805698, "learning_rate": 1.9990287869001384e-06, "loss": 1.3477, "step": 8315 }, { "epoch": 0.06, "grad_norm": 4.6576296310136325, "learning_rate": 1.9990285530375773e-06, "loss": 1.4549, "step": 8316 }, { "epoch": 0.06, "grad_norm": 4.391415226549598, "learning_rate": 1.999028319146877e-06, "loss": 1.3342, "step": 8317 }, { "epoch": 0.06, "grad_norm": 4.602088029418293, "learning_rate": 1.9990280852280373e-06, "loss": 1.2976, "step": 8318 }, { "epoch": 0.06, "grad_norm": 4.360325546639366, "learning_rate": 1.999027851281059e-06, "loss": 1.3285, "step": 8319 }, { "epoch": 0.06, "grad_norm": 4.6403728530383095, "learning_rate": 1.9990276173059402e-06, "loss": 1.3746, "step": 8320 }, { "epoch": 0.06, "grad_norm": 5.270882820990297, "learning_rate": 1.999027383302683e-06, "loss": 1.6054, "step": 8321 }, { "epoch": 0.06, "grad_norm": 4.302185532932528, "learning_rate": 1.9990271492712864e-06, "loss": 1.2901, "step": 8322 }, { "epoch": 0.06, "eval_loss": 1.5656442642211914, "eval_runtime": 4.5823, "eval_samples_per_second": 1.964, "eval_steps_per_second": 1.091, "step": 8322 }, { "epoch": 0.06, "grad_norm": 5.020764746159491, "learning_rate": 1.9990269152117504e-06, "loss": 1.4232, "step": 8323 }, { "epoch": 0.06, "grad_norm": 4.606529395708601, "learning_rate": 1.999026681124075e-06, "loss": 1.2384, "step": 8324 }, { "epoch": 0.06, "grad_norm": 4.274363960052644, "learning_rate": 1.9990264470082604e-06, "loss": 1.188, "step": 8325 }, { "epoch": 0.06, "grad_norm": 4.22659651250079, "learning_rate": 1.9990262128643065e-06, "loss": 1.3778, "step": 8326 }, { "epoch": 0.06, "grad_norm": 4.918397701896192, "learning_rate": 1.9990259786922134e-06, "loss": 1.4058, "step": 8327 }, { "epoch": 0.06, "grad_norm": 4.325221835622321, "learning_rate": 1.9990257444919813e-06, "loss": 1.4229, "step": 8328 }, { "epoch": 0.06, "grad_norm": 5.57680022443474, "learning_rate": 1.99902551026361e-06, "loss": 1.4133, "step": 8329 }, { "epoch": 0.06, "grad_norm": 4.814148517311039, "learning_rate": 1.999025276007099e-06, "loss": 1.3071, "step": 8330 }, { "epoch": 0.06, "grad_norm": 4.3283132853618325, "learning_rate": 1.999025041722449e-06, "loss": 1.2699, "step": 8331 }, { "epoch": 0.06, "grad_norm": 4.911050608047389, "learning_rate": 1.99902480740966e-06, "loss": 1.1585, "step": 8332 }, { "epoch": 0.06, "grad_norm": 4.601249664935928, "learning_rate": 1.9990245730687316e-06, "loss": 1.5244, "step": 8333 }, { "epoch": 0.06, "grad_norm": 5.498290410267132, "learning_rate": 1.9990243386996643e-06, "loss": 1.472, "step": 8334 }, { "epoch": 0.06, "grad_norm": 4.830360884280506, "learning_rate": 1.9990241043024573e-06, "loss": 1.3676, "step": 8335 }, { "epoch": 0.06, "grad_norm": 4.757159460063796, "learning_rate": 1.9990238698771114e-06, "loss": 1.3391, "step": 8336 }, { "epoch": 0.06, "grad_norm": 4.360488398818196, "learning_rate": 1.9990236354236263e-06, "loss": 1.4066, "step": 8337 }, { "epoch": 0.06, "grad_norm": 4.411231104148737, "learning_rate": 1.999023400942002e-06, "loss": 1.3872, "step": 8338 }, { "epoch": 0.06, "grad_norm": 4.180522741794834, "learning_rate": 1.999023166432238e-06, "loss": 1.3609, "step": 8339 }, { "epoch": 0.06, "grad_norm": 5.193441642868844, "learning_rate": 1.999022931894336e-06, "loss": 1.3488, "step": 8340 }, { "epoch": 0.06, "grad_norm": 4.471788603914176, "learning_rate": 1.999022697328294e-06, "loss": 1.4602, "step": 8341 }, { "epoch": 0.06, "grad_norm": 4.341105435630704, "learning_rate": 1.999022462734113e-06, "loss": 1.3936, "step": 8342 }, { "epoch": 0.06, "grad_norm": 4.237003449736239, "learning_rate": 1.999022228111793e-06, "loss": 1.2631, "step": 8343 }, { "epoch": 0.06, "grad_norm": 4.575726133702913, "learning_rate": 1.999021993461334e-06, "loss": 1.3168, "step": 8344 }, { "epoch": 0.06, "grad_norm": 4.876989383729073, "learning_rate": 1.9990217587827357e-06, "loss": 1.4856, "step": 8345 }, { "epoch": 0.06, "grad_norm": 4.957069769203432, "learning_rate": 1.999021524075998e-06, "loss": 1.2782, "step": 8346 }, { "epoch": 0.06, "grad_norm": 4.555175055202144, "learning_rate": 1.9990212893411216e-06, "loss": 1.3844, "step": 8347 }, { "epoch": 0.06, "grad_norm": 4.982702575149782, "learning_rate": 1.9990210545781057e-06, "loss": 1.4147, "step": 8348 }, { "epoch": 0.06, "grad_norm": 4.511710257653521, "learning_rate": 1.9990208197869513e-06, "loss": 1.34, "step": 8349 }, { "epoch": 0.06, "grad_norm": 4.579584800817105, "learning_rate": 1.999020584967657e-06, "loss": 1.3131, "step": 8350 }, { "epoch": 0.06, "grad_norm": 4.3639826779258195, "learning_rate": 1.9990203501202246e-06, "loss": 1.3033, "step": 8351 }, { "epoch": 0.06, "grad_norm": 4.560570971067625, "learning_rate": 1.9990201152446524e-06, "loss": 1.3826, "step": 8352 }, { "epoch": 0.06, "grad_norm": 4.771442617526964, "learning_rate": 1.9990198803409413e-06, "loss": 1.4731, "step": 8353 }, { "epoch": 0.06, "grad_norm": 4.421985550458615, "learning_rate": 1.999019645409091e-06, "loss": 1.2672, "step": 8354 }, { "epoch": 0.06, "grad_norm": 4.262551774119744, "learning_rate": 1.9990194104491017e-06, "loss": 1.2955, "step": 8355 }, { "epoch": 0.06, "grad_norm": 4.557969622544508, "learning_rate": 1.9990191754609736e-06, "loss": 1.377, "step": 8356 }, { "epoch": 0.06, "grad_norm": 4.490612997548548, "learning_rate": 1.999018940444706e-06, "loss": 1.3569, "step": 8357 }, { "epoch": 0.06, "grad_norm": 4.254726746385519, "learning_rate": 1.9990187054003e-06, "loss": 1.3191, "step": 8358 }, { "epoch": 0.06, "grad_norm": 4.369880226717757, "learning_rate": 1.9990184703277544e-06, "loss": 1.3652, "step": 8359 }, { "epoch": 0.06, "grad_norm": 4.368191459851238, "learning_rate": 1.99901823522707e-06, "loss": 1.3865, "step": 8360 }, { "epoch": 0.06, "grad_norm": 4.492268895198598, "learning_rate": 1.9990180000982464e-06, "loss": 1.3948, "step": 8361 }, { "epoch": 0.06, "grad_norm": 4.792306538138668, "learning_rate": 1.9990177649412843e-06, "loss": 1.6932, "step": 8362 }, { "epoch": 0.06, "grad_norm": 7.839851060213818, "learning_rate": 1.9990175297561825e-06, "loss": 1.4006, "step": 8363 }, { "epoch": 0.06, "grad_norm": 4.2887203007389365, "learning_rate": 1.9990172945429423e-06, "loss": 1.181, "step": 8364 }, { "epoch": 0.06, "grad_norm": 4.177683976256777, "learning_rate": 1.9990170593015628e-06, "loss": 1.3094, "step": 8365 }, { "epoch": 0.06, "grad_norm": 5.407259103865772, "learning_rate": 1.9990168240320444e-06, "loss": 1.3241, "step": 8366 }, { "epoch": 0.06, "grad_norm": 4.351582161927502, "learning_rate": 1.9990165887343868e-06, "loss": 1.3269, "step": 8367 }, { "epoch": 0.06, "grad_norm": 4.306908350756632, "learning_rate": 1.9990163534085902e-06, "loss": 1.3031, "step": 8368 }, { "epoch": 0.06, "grad_norm": 6.022174789368055, "learning_rate": 1.999016118054655e-06, "loss": 1.2962, "step": 8369 }, { "epoch": 0.06, "grad_norm": 5.1253312574813625, "learning_rate": 1.9990158826725806e-06, "loss": 1.3777, "step": 8370 }, { "epoch": 0.06, "grad_norm": 4.955078193325565, "learning_rate": 1.9990156472623676e-06, "loss": 1.4517, "step": 8371 }, { "epoch": 0.06, "grad_norm": 4.6813878429700795, "learning_rate": 1.999015411824015e-06, "loss": 1.3286, "step": 8372 }, { "epoch": 0.06, "grad_norm": 4.987846314192864, "learning_rate": 1.999015176357524e-06, "loss": 1.4235, "step": 8373 }, { "epoch": 0.06, "grad_norm": 4.473948143583966, "learning_rate": 1.999014940862894e-06, "loss": 1.3693, "step": 8374 }, { "epoch": 0.06, "grad_norm": 4.2441833787143874, "learning_rate": 1.999014705340125e-06, "loss": 1.3634, "step": 8375 }, { "epoch": 0.06, "grad_norm": 4.552970496979477, "learning_rate": 1.9990144697892167e-06, "loss": 1.4939, "step": 8376 }, { "epoch": 0.06, "grad_norm": 4.797322067007558, "learning_rate": 1.99901423421017e-06, "loss": 1.4767, "step": 8377 }, { "epoch": 0.06, "grad_norm": 4.587597346903606, "learning_rate": 1.999013998602984e-06, "loss": 1.1092, "step": 8378 }, { "epoch": 0.06, "grad_norm": 5.534941541343506, "learning_rate": 1.9990137629676597e-06, "loss": 1.4859, "step": 8379 }, { "epoch": 0.06, "grad_norm": 4.9961469123467115, "learning_rate": 1.999013527304196e-06, "loss": 1.4147, "step": 8380 }, { "epoch": 0.06, "grad_norm": 4.044184154266132, "learning_rate": 1.9990132916125936e-06, "loss": 1.2626, "step": 8381 }, { "epoch": 0.06, "grad_norm": 4.648917140278735, "learning_rate": 1.999013055892852e-06, "loss": 1.3029, "step": 8382 }, { "epoch": 0.06, "grad_norm": 4.889605616911022, "learning_rate": 1.999012820144972e-06, "loss": 1.3647, "step": 8383 }, { "epoch": 0.06, "grad_norm": 5.359942093616338, "learning_rate": 1.999012584368953e-06, "loss": 1.4222, "step": 8384 }, { "epoch": 0.06, "grad_norm": 4.668293176598957, "learning_rate": 1.9990123485647945e-06, "loss": 1.4047, "step": 8385 }, { "epoch": 0.06, "grad_norm": 4.536147877502993, "learning_rate": 1.9990121127324977e-06, "loss": 1.3475, "step": 8386 }, { "epoch": 0.06, "grad_norm": 5.918582636107545, "learning_rate": 1.9990118768720625e-06, "loss": 1.3717, "step": 8387 }, { "epoch": 0.06, "grad_norm": 4.666692418540283, "learning_rate": 1.999011640983488e-06, "loss": 1.3138, "step": 8388 }, { "epoch": 0.06, "grad_norm": 4.435728448744984, "learning_rate": 1.9990114050667746e-06, "loss": 1.2664, "step": 8389 }, { "epoch": 0.06, "grad_norm": 4.91773565268233, "learning_rate": 1.9990111691219224e-06, "loss": 1.4333, "step": 8390 }, { "epoch": 0.06, "grad_norm": 4.372311084034018, "learning_rate": 1.9990109331489313e-06, "loss": 1.4715, "step": 8391 }, { "epoch": 0.06, "grad_norm": 4.855966195750034, "learning_rate": 1.9990106971478013e-06, "loss": 1.2862, "step": 8392 }, { "epoch": 0.06, "grad_norm": 4.197613641758737, "learning_rate": 1.999010461118533e-06, "loss": 1.3328, "step": 8393 }, { "epoch": 0.06, "grad_norm": 5.473355860344675, "learning_rate": 1.9990102250611253e-06, "loss": 1.359, "step": 8394 }, { "epoch": 0.06, "grad_norm": 4.92330000920753, "learning_rate": 1.999009988975579e-06, "loss": 1.6145, "step": 8395 }, { "epoch": 0.06, "eval_loss": 1.567657232284546, "eval_runtime": 4.5964, "eval_samples_per_second": 1.958, "eval_steps_per_second": 1.088, "step": 8395 }, { "epoch": 0.06, "grad_norm": 4.5701049314995625, "learning_rate": 1.999009752861894e-06, "loss": 1.2682, "step": 8396 }, { "epoch": 0.06, "grad_norm": 4.215243480806859, "learning_rate": 1.9990095167200704e-06, "loss": 1.3341, "step": 8397 }, { "epoch": 0.06, "grad_norm": 4.713273711413565, "learning_rate": 1.999009280550108e-06, "loss": 1.3942, "step": 8398 }, { "epoch": 0.06, "grad_norm": 5.712640789155021, "learning_rate": 1.9990090443520066e-06, "loss": 1.2203, "step": 8399 }, { "epoch": 0.06, "grad_norm": 4.596277722691164, "learning_rate": 1.9990088081257666e-06, "loss": 1.2669, "step": 8400 }, { "epoch": 0.06, "grad_norm": 4.9488258276807935, "learning_rate": 1.999008571871388e-06, "loss": 1.3826, "step": 8401 }, { "epoch": 0.06, "grad_norm": 4.613795635684511, "learning_rate": 1.99900833558887e-06, "loss": 1.218, "step": 8402 }, { "epoch": 0.06, "grad_norm": 5.030838257239036, "learning_rate": 1.999008099278214e-06, "loss": 1.3821, "step": 8403 }, { "epoch": 0.06, "grad_norm": 5.571990186711726, "learning_rate": 1.9990078629394186e-06, "loss": 1.6666, "step": 8404 }, { "epoch": 0.06, "grad_norm": 5.171066589706494, "learning_rate": 1.9990076265724848e-06, "loss": 1.3002, "step": 8405 }, { "epoch": 0.06, "grad_norm": 4.543127661532873, "learning_rate": 1.999007390177412e-06, "loss": 1.1585, "step": 8406 }, { "epoch": 0.06, "grad_norm": 5.327490998974423, "learning_rate": 1.999007153754201e-06, "loss": 1.3534, "step": 8407 }, { "epoch": 0.06, "grad_norm": 4.435630789880913, "learning_rate": 1.9990069173028514e-06, "loss": 1.3469, "step": 8408 }, { "epoch": 0.06, "grad_norm": 6.643697238326478, "learning_rate": 1.9990066808233625e-06, "loss": 1.3808, "step": 8409 }, { "epoch": 0.06, "grad_norm": 5.180431362370178, "learning_rate": 1.9990064443157353e-06, "loss": 1.4784, "step": 8410 }, { "epoch": 0.06, "grad_norm": 4.199888607186224, "learning_rate": 1.999006207779969e-06, "loss": 1.2871, "step": 8411 }, { "epoch": 0.06, "grad_norm": 4.234292284844202, "learning_rate": 1.9990059712160646e-06, "loss": 1.2079, "step": 8412 }, { "epoch": 0.06, "grad_norm": 4.62723876031503, "learning_rate": 1.999005734624021e-06, "loss": 1.3377, "step": 8413 }, { "epoch": 0.06, "grad_norm": 5.010857816758854, "learning_rate": 1.999005498003839e-06, "loss": 1.2954, "step": 8414 }, { "epoch": 0.06, "grad_norm": 5.993205867312916, "learning_rate": 1.9990052613555185e-06, "loss": 1.2182, "step": 8415 }, { "epoch": 0.06, "grad_norm": 7.339850469839931, "learning_rate": 1.999005024679059e-06, "loss": 1.4024, "step": 8416 }, { "epoch": 0.06, "grad_norm": 4.207679844091889, "learning_rate": 1.999004787974461e-06, "loss": 1.3022, "step": 8417 }, { "epoch": 0.06, "grad_norm": 4.438127125335588, "learning_rate": 1.9990045512417245e-06, "loss": 1.2749, "step": 8418 }, { "epoch": 0.06, "grad_norm": 4.84464536686767, "learning_rate": 1.999004314480849e-06, "loss": 1.3476, "step": 8419 }, { "epoch": 0.06, "grad_norm": 5.003419099622179, "learning_rate": 1.999004077691835e-06, "loss": 1.2251, "step": 8420 }, { "epoch": 0.06, "grad_norm": 4.68382568667382, "learning_rate": 1.9990038408746824e-06, "loss": 1.3134, "step": 8421 }, { "epoch": 0.06, "grad_norm": 5.846668941153372, "learning_rate": 1.9990036040293914e-06, "loss": 1.4848, "step": 8422 }, { "epoch": 0.06, "grad_norm": 72.22770611690466, "learning_rate": 1.9990033671559615e-06, "loss": 1.6378, "step": 8423 }, { "epoch": 0.06, "grad_norm": 4.913751231042928, "learning_rate": 1.999003130254393e-06, "loss": 1.3046, "step": 8424 }, { "epoch": 0.06, "grad_norm": 5.563361558333302, "learning_rate": 1.999002893324686e-06, "loss": 1.2962, "step": 8425 }, { "epoch": 0.06, "grad_norm": 4.621955668736098, "learning_rate": 1.999002656366841e-06, "loss": 1.3498, "step": 8426 }, { "epoch": 0.06, "grad_norm": 4.335433039296151, "learning_rate": 1.9990024193808568e-06, "loss": 1.3384, "step": 8427 }, { "epoch": 0.06, "grad_norm": 4.27659742870904, "learning_rate": 1.999002182366734e-06, "loss": 1.2137, "step": 8428 }, { "epoch": 0.06, "grad_norm": 4.317254755188132, "learning_rate": 1.9990019453244725e-06, "loss": 1.3333, "step": 8429 }, { "epoch": 0.06, "grad_norm": 4.588194714823662, "learning_rate": 1.9990017082540727e-06, "loss": 1.2952, "step": 8430 }, { "epoch": 0.06, "grad_norm": 4.536797481229204, "learning_rate": 1.9990014711555345e-06, "loss": 1.3067, "step": 8431 }, { "epoch": 0.06, "grad_norm": 4.458787242357463, "learning_rate": 1.9990012340288574e-06, "loss": 1.408, "step": 8432 }, { "epoch": 0.06, "grad_norm": 5.765351765905371, "learning_rate": 1.999000996874042e-06, "loss": 1.1525, "step": 8433 }, { "epoch": 0.06, "grad_norm": 8.862566168460543, "learning_rate": 1.999000759691088e-06, "loss": 1.473, "step": 8434 }, { "epoch": 0.06, "grad_norm": 4.150802771803171, "learning_rate": 1.999000522479995e-06, "loss": 1.2269, "step": 8435 }, { "epoch": 0.06, "grad_norm": 4.356398610103301, "learning_rate": 1.999000285240764e-06, "loss": 1.3826, "step": 8436 }, { "epoch": 0.06, "grad_norm": 5.2467688215660395, "learning_rate": 1.9990000479733947e-06, "loss": 1.5328, "step": 8437 }, { "epoch": 0.06, "grad_norm": 4.468282929336187, "learning_rate": 1.9989998106778866e-06, "loss": 1.2144, "step": 8438 }, { "epoch": 0.06, "grad_norm": 4.361593505439121, "learning_rate": 1.99899957335424e-06, "loss": 1.3896, "step": 8439 }, { "epoch": 0.06, "grad_norm": 5.53120283296376, "learning_rate": 1.9989993360024547e-06, "loss": 1.5312, "step": 8440 }, { "epoch": 0.06, "grad_norm": 4.868631773734439, "learning_rate": 1.9989990986225313e-06, "loss": 1.2824, "step": 8441 }, { "epoch": 0.06, "grad_norm": 4.514616463882857, "learning_rate": 1.998998861214469e-06, "loss": 1.3558, "step": 8442 }, { "epoch": 0.06, "grad_norm": 4.427873595952543, "learning_rate": 1.9989986237782687e-06, "loss": 1.1876, "step": 8443 }, { "epoch": 0.06, "grad_norm": 5.060044500434067, "learning_rate": 1.9989983863139296e-06, "loss": 1.3369, "step": 8444 }, { "epoch": 0.06, "grad_norm": 6.61085970566859, "learning_rate": 1.998998148821452e-06, "loss": 1.4488, "step": 8445 }, { "epoch": 0.06, "grad_norm": 4.429709658186737, "learning_rate": 1.998997911300836e-06, "loss": 1.4065, "step": 8446 }, { "epoch": 0.06, "grad_norm": 4.455764795481005, "learning_rate": 1.9989976737520816e-06, "loss": 1.3559, "step": 8447 }, { "epoch": 0.06, "grad_norm": 4.567685574616305, "learning_rate": 1.9989974361751887e-06, "loss": 1.4541, "step": 8448 }, { "epoch": 0.06, "grad_norm": 5.869228089840253, "learning_rate": 1.998997198570158e-06, "loss": 1.3385, "step": 8449 }, { "epoch": 0.06, "grad_norm": 4.325802290156743, "learning_rate": 1.998996960936988e-06, "loss": 1.2911, "step": 8450 }, { "epoch": 0.06, "grad_norm": 4.317738952887137, "learning_rate": 1.99899672327568e-06, "loss": 1.207, "step": 8451 }, { "epoch": 0.06, "grad_norm": 5.996256516985147, "learning_rate": 1.9989964855862333e-06, "loss": 1.4679, "step": 8452 }, { "epoch": 0.06, "grad_norm": 8.550414431812067, "learning_rate": 1.9989962478686483e-06, "loss": 1.3527, "step": 8453 }, { "epoch": 0.06, "grad_norm": 4.997985580091321, "learning_rate": 1.998996010122925e-06, "loss": 1.3554, "step": 8454 }, { "epoch": 0.06, "grad_norm": 4.416346521208863, "learning_rate": 1.9989957723490633e-06, "loss": 1.22, "step": 8455 }, { "epoch": 0.06, "grad_norm": 5.12896451783654, "learning_rate": 1.9989955345470634e-06, "loss": 1.1522, "step": 8456 }, { "epoch": 0.06, "grad_norm": 4.00349993452822, "learning_rate": 1.9989952967169246e-06, "loss": 1.2234, "step": 8457 }, { "epoch": 0.06, "grad_norm": 5.317189165747824, "learning_rate": 1.998995058858648e-06, "loss": 1.5081, "step": 8458 }, { "epoch": 0.06, "grad_norm": 4.330171577519852, "learning_rate": 1.9989948209722326e-06, "loss": 1.1255, "step": 8459 }, { "epoch": 0.06, "grad_norm": 4.655935757224283, "learning_rate": 1.998994583057679e-06, "loss": 1.2203, "step": 8460 }, { "epoch": 0.06, "grad_norm": 4.726772061267374, "learning_rate": 1.9989943451149872e-06, "loss": 1.3104, "step": 8461 }, { "epoch": 0.06, "grad_norm": 6.558257677936896, "learning_rate": 1.9989941071441567e-06, "loss": 1.3681, "step": 8462 }, { "epoch": 0.06, "grad_norm": 4.606013238089367, "learning_rate": 1.998993869145188e-06, "loss": 1.334, "step": 8463 }, { "epoch": 0.06, "grad_norm": 4.5024219623394774, "learning_rate": 1.998993631118081e-06, "loss": 1.514, "step": 8464 }, { "epoch": 0.06, "grad_norm": 4.537845678210268, "learning_rate": 1.998993393062836e-06, "loss": 1.3453, "step": 8465 }, { "epoch": 0.06, "grad_norm": 4.947351067739524, "learning_rate": 1.9989931549794523e-06, "loss": 1.3304, "step": 8466 }, { "epoch": 0.06, "grad_norm": 4.6459693351653994, "learning_rate": 1.99899291686793e-06, "loss": 1.381, "step": 8467 }, { "epoch": 0.06, "grad_norm": 5.210441146279967, "learning_rate": 1.99899267872827e-06, "loss": 1.4571, "step": 8468 }, { "epoch": 0.06, "eval_loss": 1.5647797584533691, "eval_runtime": 4.6033, "eval_samples_per_second": 1.955, "eval_steps_per_second": 1.086, "step": 8468 }, { "epoch": 0.06, "grad_norm": 4.6295052178558675, "learning_rate": 1.9989924405604715e-06, "loss": 1.3832, "step": 8469 }, { "epoch": 0.06, "grad_norm": 4.476411600889492, "learning_rate": 1.9989922023645347e-06, "loss": 1.2713, "step": 8470 }, { "epoch": 0.06, "grad_norm": 4.444776633143054, "learning_rate": 1.9989919641404596e-06, "loss": 1.4442, "step": 8471 }, { "epoch": 0.06, "grad_norm": 4.3045006734689135, "learning_rate": 1.998991725888246e-06, "loss": 1.3287, "step": 8472 }, { "epoch": 0.06, "grad_norm": 4.949909163837501, "learning_rate": 1.9989914876078948e-06, "loss": 1.4534, "step": 8473 }, { "epoch": 0.06, "grad_norm": 4.587667424988372, "learning_rate": 1.9989912492994047e-06, "loss": 1.2381, "step": 8474 }, { "epoch": 0.06, "grad_norm": 4.733889509541536, "learning_rate": 1.9989910109627762e-06, "loss": 1.3181, "step": 8475 }, { "epoch": 0.06, "grad_norm": 4.495264978351784, "learning_rate": 1.99899077259801e-06, "loss": 1.3402, "step": 8476 }, { "epoch": 0.06, "grad_norm": 4.967971490452831, "learning_rate": 1.998990534205105e-06, "loss": 1.2031, "step": 8477 }, { "epoch": 0.06, "grad_norm": 4.494943007961482, "learning_rate": 1.9989902957840623e-06, "loss": 1.4279, "step": 8478 }, { "epoch": 0.06, "grad_norm": 4.199579469091465, "learning_rate": 1.998990057334881e-06, "loss": 1.4212, "step": 8479 }, { "epoch": 0.06, "grad_norm": 4.757657046397719, "learning_rate": 1.9989898188575615e-06, "loss": 1.2369, "step": 8480 }, { "epoch": 0.06, "grad_norm": 5.7576924981494, "learning_rate": 1.998989580352104e-06, "loss": 1.4905, "step": 8481 }, { "epoch": 0.06, "grad_norm": 5.049433518561161, "learning_rate": 1.998989341818508e-06, "loss": 1.4028, "step": 8482 }, { "epoch": 0.06, "grad_norm": 4.6262923165133145, "learning_rate": 1.998989103256774e-06, "loss": 1.3761, "step": 8483 }, { "epoch": 0.06, "grad_norm": 4.655467986349239, "learning_rate": 1.9989888646669017e-06, "loss": 1.553, "step": 8484 }, { "epoch": 0.06, "grad_norm": 4.986639700257691, "learning_rate": 1.9989886260488915e-06, "loss": 1.4301, "step": 8485 }, { "epoch": 0.06, "grad_norm": 5.2966895098218885, "learning_rate": 1.9989883874027427e-06, "loss": 1.53, "step": 8486 }, { "epoch": 0.06, "grad_norm": 4.5444484427768295, "learning_rate": 1.998988148728456e-06, "loss": 1.3442, "step": 8487 }, { "epoch": 0.06, "grad_norm": 5.008691070223799, "learning_rate": 1.998987910026031e-06, "loss": 1.3694, "step": 8488 }, { "epoch": 0.06, "grad_norm": 4.448071289407071, "learning_rate": 1.9989876712954676e-06, "loss": 1.3987, "step": 8489 }, { "epoch": 0.06, "grad_norm": 4.73707677272594, "learning_rate": 1.9989874325367665e-06, "loss": 1.2986, "step": 8490 }, { "epoch": 0.06, "grad_norm": 4.439899938494986, "learning_rate": 1.998987193749927e-06, "loss": 1.2453, "step": 8491 }, { "epoch": 0.06, "grad_norm": 4.431748355424933, "learning_rate": 1.998986954934949e-06, "loss": 1.289, "step": 8492 }, { "epoch": 0.06, "grad_norm": 4.300642657485987, "learning_rate": 1.9989867160918335e-06, "loss": 1.249, "step": 8493 }, { "epoch": 0.06, "grad_norm": 5.6811034368925135, "learning_rate": 1.99898647722058e-06, "loss": 1.1791, "step": 8494 }, { "epoch": 0.06, "grad_norm": 4.26550025735318, "learning_rate": 1.9989862383211874e-06, "loss": 1.2816, "step": 8495 }, { "epoch": 0.06, "grad_norm": 4.53708574090178, "learning_rate": 1.9989859993936577e-06, "loss": 1.1602, "step": 8496 }, { "epoch": 0.06, "grad_norm": 4.742744124422893, "learning_rate": 1.998985760437989e-06, "loss": 1.395, "step": 8497 }, { "epoch": 0.06, "grad_norm": 4.4690881576926325, "learning_rate": 1.998985521454183e-06, "loss": 1.353, "step": 8498 }, { "epoch": 0.06, "grad_norm": 4.2509720499426775, "learning_rate": 1.9989852824422385e-06, "loss": 1.4282, "step": 8499 }, { "epoch": 0.06, "grad_norm": 4.257946868797118, "learning_rate": 1.998985043402156e-06, "loss": 1.4332, "step": 8500 }, { "epoch": 0.06, "grad_norm": 4.435618489918155, "learning_rate": 1.9989848043339353e-06, "loss": 1.3996, "step": 8501 }, { "epoch": 0.06, "grad_norm": 4.331520898794051, "learning_rate": 1.9989845652375767e-06, "loss": 1.217, "step": 8502 }, { "epoch": 0.06, "grad_norm": 5.3208047617569925, "learning_rate": 1.9989843261130797e-06, "loss": 1.4482, "step": 8503 }, { "epoch": 0.06, "grad_norm": 4.213078074750517, "learning_rate": 1.998984086960445e-06, "loss": 1.3417, "step": 8504 }, { "epoch": 0.06, "grad_norm": 4.277358132055113, "learning_rate": 1.998983847779672e-06, "loss": 1.2931, "step": 8505 }, { "epoch": 0.06, "grad_norm": 4.771436607458458, "learning_rate": 1.9989836085707614e-06, "loss": 1.3588, "step": 8506 }, { "epoch": 0.06, "grad_norm": 4.872825456159983, "learning_rate": 1.998983369333712e-06, "loss": 1.3651, "step": 8507 }, { "epoch": 0.06, "grad_norm": 6.177634031680093, "learning_rate": 1.9989831300685252e-06, "loss": 1.3515, "step": 8508 }, { "epoch": 0.06, "grad_norm": 4.793797796284552, "learning_rate": 1.9989828907752e-06, "loss": 1.3324, "step": 8509 }, { "epoch": 0.06, "grad_norm": 5.195979347895909, "learning_rate": 1.998982651453737e-06, "loss": 1.4251, "step": 8510 }, { "epoch": 0.06, "grad_norm": 4.199487295698571, "learning_rate": 1.998982412104136e-06, "loss": 1.4004, "step": 8511 }, { "epoch": 0.06, "grad_norm": 5.073184223910384, "learning_rate": 1.9989821727263964e-06, "loss": 1.369, "step": 8512 }, { "epoch": 0.06, "grad_norm": 4.8153718244069665, "learning_rate": 1.9989819333205196e-06, "loss": 1.4269, "step": 8513 }, { "epoch": 0.06, "grad_norm": 5.342580593413908, "learning_rate": 1.9989816938865045e-06, "loss": 1.3005, "step": 8514 }, { "epoch": 0.06, "grad_norm": 4.495641755331039, "learning_rate": 1.998981454424351e-06, "loss": 1.4519, "step": 8515 }, { "epoch": 0.06, "grad_norm": 4.858941609644203, "learning_rate": 1.99898121493406e-06, "loss": 1.3741, "step": 8516 }, { "epoch": 0.06, "grad_norm": 4.470121672802636, "learning_rate": 1.998980975415631e-06, "loss": 1.281, "step": 8517 }, { "epoch": 0.06, "grad_norm": 4.609799640590294, "learning_rate": 1.9989807358690638e-06, "loss": 1.5039, "step": 8518 }, { "epoch": 0.06, "grad_norm": 4.182923009388997, "learning_rate": 1.998980496294359e-06, "loss": 1.1797, "step": 8519 }, { "epoch": 0.06, "grad_norm": 4.372301603110184, "learning_rate": 1.998980256691516e-06, "loss": 1.4187, "step": 8520 }, { "epoch": 0.06, "grad_norm": 4.772789349020545, "learning_rate": 1.998980017060535e-06, "loss": 1.4531, "step": 8521 }, { "epoch": 0.06, "grad_norm": 4.849981144748239, "learning_rate": 1.998979777401416e-06, "loss": 1.4215, "step": 8522 }, { "epoch": 0.06, "grad_norm": 4.43961261312786, "learning_rate": 1.9989795377141594e-06, "loss": 1.4048, "step": 8523 }, { "epoch": 0.06, "grad_norm": 4.959397781374239, "learning_rate": 1.9989792979987645e-06, "loss": 1.4702, "step": 8524 }, { "epoch": 0.06, "grad_norm": 6.017486661827444, "learning_rate": 1.998979058255232e-06, "loss": 1.4678, "step": 8525 }, { "epoch": 0.06, "grad_norm": 4.336367582601711, "learning_rate": 1.9989788184835612e-06, "loss": 1.4022, "step": 8526 }, { "epoch": 0.06, "grad_norm": 5.056854763137082, "learning_rate": 1.998978578683753e-06, "loss": 1.3941, "step": 8527 }, { "epoch": 0.06, "grad_norm": 5.071298356844417, "learning_rate": 1.9989783388558063e-06, "loss": 1.2104, "step": 8528 }, { "epoch": 0.06, "grad_norm": 5.56109902802452, "learning_rate": 1.9989780989997223e-06, "loss": 1.3429, "step": 8529 }, { "epoch": 0.06, "grad_norm": 4.272698035148218, "learning_rate": 1.9989778591155e-06, "loss": 1.4256, "step": 8530 }, { "epoch": 0.06, "grad_norm": 4.4819311694642066, "learning_rate": 1.9989776192031397e-06, "loss": 1.3249, "step": 8531 }, { "epoch": 0.06, "grad_norm": 4.77110257315929, "learning_rate": 1.9989773792626417e-06, "loss": 1.3022, "step": 8532 }, { "epoch": 0.06, "grad_norm": 5.6644398684298025, "learning_rate": 1.998977139294006e-06, "loss": 1.2276, "step": 8533 }, { "epoch": 0.06, "grad_norm": 5.143495642461218, "learning_rate": 1.9989768992972323e-06, "loss": 1.6171, "step": 8534 }, { "epoch": 0.06, "grad_norm": 4.747245216439308, "learning_rate": 1.998976659272321e-06, "loss": 1.2689, "step": 8535 }, { "epoch": 0.06, "grad_norm": 5.049445485742834, "learning_rate": 1.9989764192192714e-06, "loss": 1.4048, "step": 8536 }, { "epoch": 0.06, "grad_norm": 5.059810980609118, "learning_rate": 1.998976179138084e-06, "loss": 1.3626, "step": 8537 }, { "epoch": 0.06, "grad_norm": 4.626880381269344, "learning_rate": 1.9989759390287592e-06, "loss": 1.3461, "step": 8538 }, { "epoch": 0.06, "grad_norm": 4.278100266976135, "learning_rate": 1.9989756988912963e-06, "loss": 1.404, "step": 8539 }, { "epoch": 0.06, "grad_norm": 4.259013662464501, "learning_rate": 1.9989754587256954e-06, "loss": 1.2941, "step": 8540 }, { "epoch": 0.06, "grad_norm": 4.8646773905077945, "learning_rate": 1.998975218531957e-06, "loss": 1.3921, "step": 8541 }, { "epoch": 0.06, "eval_loss": 1.5622339248657227, "eval_runtime": 4.5857, "eval_samples_per_second": 1.963, "eval_steps_per_second": 1.09, "step": 8541 }, { "epoch": 0.06, "grad_norm": 4.560308396326247, "learning_rate": 1.9989749783100805e-06, "loss": 1.2731, "step": 8542 }, { "epoch": 0.06, "grad_norm": 4.444471595101668, "learning_rate": 1.9989747380600664e-06, "loss": 1.4093, "step": 8543 }, { "epoch": 0.06, "grad_norm": 4.67091987159255, "learning_rate": 1.9989744977819147e-06, "loss": 1.4359, "step": 8544 }, { "epoch": 0.06, "grad_norm": 4.205426977104215, "learning_rate": 1.9989742574756246e-06, "loss": 1.2703, "step": 8545 }, { "epoch": 0.06, "grad_norm": 4.532299284853898, "learning_rate": 1.9989740171411973e-06, "loss": 1.233, "step": 8546 }, { "epoch": 0.06, "grad_norm": 4.053406104112797, "learning_rate": 1.998973776778632e-06, "loss": 1.1172, "step": 8547 }, { "epoch": 0.06, "grad_norm": 4.904905801012794, "learning_rate": 1.9989735363879288e-06, "loss": 1.4605, "step": 8548 }, { "epoch": 0.06, "grad_norm": 4.886231215167016, "learning_rate": 1.9989732959690883e-06, "loss": 1.5226, "step": 8549 }, { "epoch": 0.06, "grad_norm": 4.361488596414847, "learning_rate": 1.9989730555221094e-06, "loss": 1.2426, "step": 8550 }, { "epoch": 0.06, "grad_norm": 4.443425779548887, "learning_rate": 1.998972815046993e-06, "loss": 1.4173, "step": 8551 }, { "epoch": 0.06, "grad_norm": 5.1952922918194, "learning_rate": 1.9989725745437393e-06, "loss": 1.2579, "step": 8552 }, { "epoch": 0.06, "grad_norm": 4.3948464438093415, "learning_rate": 1.9989723340123473e-06, "loss": 1.4079, "step": 8553 }, { "epoch": 0.06, "grad_norm": 4.261899456293534, "learning_rate": 1.9989720934528176e-06, "loss": 1.3644, "step": 8554 }, { "epoch": 0.06, "grad_norm": 4.506906797424179, "learning_rate": 1.9989718528651504e-06, "loss": 1.3937, "step": 8555 }, { "epoch": 0.06, "grad_norm": 5.6651610636706735, "learning_rate": 1.9989716122493455e-06, "loss": 1.4507, "step": 8556 }, { "epoch": 0.06, "grad_norm": 4.243794223588025, "learning_rate": 1.9989713716054027e-06, "loss": 1.353, "step": 8557 }, { "epoch": 0.06, "grad_norm": 4.589656190547693, "learning_rate": 1.9989711309333227e-06, "loss": 1.312, "step": 8558 }, { "epoch": 0.06, "grad_norm": 7.183830005026017, "learning_rate": 1.9989708902331042e-06, "loss": 1.2717, "step": 8559 }, { "epoch": 0.06, "grad_norm": 4.570281138766998, "learning_rate": 1.9989706495047486e-06, "loss": 1.5105, "step": 8560 }, { "epoch": 0.06, "grad_norm": 4.984154098685025, "learning_rate": 1.9989704087482554e-06, "loss": 1.406, "step": 8561 }, { "epoch": 0.06, "grad_norm": 4.6692178026189355, "learning_rate": 1.998970167963624e-06, "loss": 1.5083, "step": 8562 }, { "epoch": 0.06, "grad_norm": 4.354950180037289, "learning_rate": 1.9989699271508554e-06, "loss": 1.2876, "step": 8563 }, { "epoch": 0.06, "grad_norm": 5.692769757742883, "learning_rate": 1.998969686309949e-06, "loss": 1.3777, "step": 8564 }, { "epoch": 0.06, "grad_norm": 10.344158078038417, "learning_rate": 1.9989694454409046e-06, "loss": 1.523, "step": 8565 }, { "epoch": 0.06, "grad_norm": 5.157777099201219, "learning_rate": 1.998969204543723e-06, "loss": 1.4596, "step": 8566 }, { "epoch": 0.06, "grad_norm": 5.54091234629367, "learning_rate": 1.9989689636184035e-06, "loss": 1.321, "step": 8567 }, { "epoch": 0.06, "grad_norm": 4.575061344334274, "learning_rate": 1.9989687226649468e-06, "loss": 1.4686, "step": 8568 }, { "epoch": 0.06, "grad_norm": 4.892550476958275, "learning_rate": 1.9989684816833525e-06, "loss": 1.3026, "step": 8569 }, { "epoch": 0.06, "grad_norm": 4.828909312102762, "learning_rate": 1.9989682406736197e-06, "loss": 1.4125, "step": 8570 }, { "epoch": 0.06, "grad_norm": 7.45745693202822, "learning_rate": 1.9989679996357502e-06, "loss": 1.3367, "step": 8571 }, { "epoch": 0.06, "grad_norm": 4.4022759781226775, "learning_rate": 1.9989677585697423e-06, "loss": 1.2662, "step": 8572 }, { "epoch": 0.06, "grad_norm": 4.340356873136581, "learning_rate": 1.998967517475597e-06, "loss": 1.2306, "step": 8573 }, { "epoch": 0.06, "grad_norm": 5.419841733654, "learning_rate": 1.998967276353315e-06, "loss": 1.3005, "step": 8574 }, { "epoch": 0.06, "grad_norm": 4.832158476472871, "learning_rate": 1.9989670352028947e-06, "loss": 1.3494, "step": 8575 }, { "epoch": 0.06, "grad_norm": 4.430984219423789, "learning_rate": 1.998966794024337e-06, "loss": 1.2929, "step": 8576 }, { "epoch": 0.06, "grad_norm": 4.248625008130523, "learning_rate": 1.9989665528176414e-06, "loss": 1.3362, "step": 8577 }, { "epoch": 0.06, "grad_norm": 5.614643944877151, "learning_rate": 1.9989663115828083e-06, "loss": 1.3905, "step": 8578 }, { "epoch": 0.06, "grad_norm": 6.63337513444417, "learning_rate": 1.9989660703198377e-06, "loss": 1.2821, "step": 8579 }, { "epoch": 0.06, "grad_norm": 4.526822811437598, "learning_rate": 1.9989658290287295e-06, "loss": 1.4322, "step": 8580 }, { "epoch": 0.06, "grad_norm": 4.8388842303922655, "learning_rate": 1.998965587709484e-06, "loss": 1.4267, "step": 8581 }, { "epoch": 0.06, "grad_norm": 4.323232523098396, "learning_rate": 1.998965346362101e-06, "loss": 1.3074, "step": 8582 }, { "epoch": 0.06, "grad_norm": 4.417307975487444, "learning_rate": 1.9989651049865802e-06, "loss": 1.3595, "step": 8583 }, { "epoch": 0.06, "grad_norm": 4.3269613111556255, "learning_rate": 1.998964863582922e-06, "loss": 1.2635, "step": 8584 }, { "epoch": 0.06, "grad_norm": 4.335371860108603, "learning_rate": 1.9989646221511264e-06, "loss": 1.3606, "step": 8585 }, { "epoch": 0.06, "grad_norm": 4.359495829838069, "learning_rate": 1.998964380691193e-06, "loss": 1.3434, "step": 8586 }, { "epoch": 0.06, "grad_norm": 4.44770043890702, "learning_rate": 1.998964139203122e-06, "loss": 1.4353, "step": 8587 }, { "epoch": 0.06, "grad_norm": 4.915849793456516, "learning_rate": 1.998963897686914e-06, "loss": 1.4704, "step": 8588 }, { "epoch": 0.06, "grad_norm": 4.30994049417916, "learning_rate": 1.9989636561425685e-06, "loss": 1.2936, "step": 8589 }, { "epoch": 0.06, "grad_norm": 4.575542425437791, "learning_rate": 1.9989634145700853e-06, "loss": 1.2776, "step": 8590 }, { "epoch": 0.06, "grad_norm": 4.611122011833436, "learning_rate": 1.9989631729694644e-06, "loss": 1.4858, "step": 8591 }, { "epoch": 0.06, "grad_norm": 4.243257737791913, "learning_rate": 1.9989629313407064e-06, "loss": 1.3865, "step": 8592 }, { "epoch": 0.06, "grad_norm": 4.216379044773727, "learning_rate": 1.998962689683811e-06, "loss": 1.3655, "step": 8593 }, { "epoch": 0.06, "grad_norm": 4.777388875462107, "learning_rate": 1.9989624479987777e-06, "loss": 1.3467, "step": 8594 }, { "epoch": 0.06, "grad_norm": 4.439491769251264, "learning_rate": 1.998962206285607e-06, "loss": 1.2773, "step": 8595 }, { "epoch": 0.06, "grad_norm": 4.566005422118384, "learning_rate": 1.998961964544299e-06, "loss": 1.3942, "step": 8596 }, { "epoch": 0.06, "grad_norm": 4.833735778742291, "learning_rate": 1.9989617227748536e-06, "loss": 1.4937, "step": 8597 }, { "epoch": 0.06, "grad_norm": 4.460196615545827, "learning_rate": 1.998961480977271e-06, "loss": 1.3117, "step": 8598 }, { "epoch": 0.06, "grad_norm": 4.8489372465448275, "learning_rate": 1.9989612391515507e-06, "loss": 1.5303, "step": 8599 }, { "epoch": 0.06, "grad_norm": 4.515736253131246, "learning_rate": 1.9989609972976932e-06, "loss": 1.2889, "step": 8600 }, { "epoch": 0.06, "grad_norm": 6.851979426866184, "learning_rate": 1.998960755415698e-06, "loss": 1.5018, "step": 8601 }, { "epoch": 0.06, "grad_norm": 4.320321043670359, "learning_rate": 1.9989605135055652e-06, "loss": 1.3428, "step": 8602 }, { "epoch": 0.06, "grad_norm": 4.067919097885855, "learning_rate": 1.9989602715672955e-06, "loss": 1.3363, "step": 8603 }, { "epoch": 0.06, "grad_norm": 4.5896400589841635, "learning_rate": 1.998960029600888e-06, "loss": 1.2651, "step": 8604 }, { "epoch": 0.06, "grad_norm": 4.521195068481925, "learning_rate": 1.9989597876063436e-06, "loss": 1.394, "step": 8605 }, { "epoch": 0.06, "grad_norm": 4.333137968003755, "learning_rate": 1.9989595455836615e-06, "loss": 1.38, "step": 8606 }, { "epoch": 0.06, "grad_norm": 4.2335118225035835, "learning_rate": 1.998959303532842e-06, "loss": 1.2555, "step": 8607 }, { "epoch": 0.06, "grad_norm": 4.179939185913571, "learning_rate": 1.9989590614538854e-06, "loss": 1.3689, "step": 8608 }, { "epoch": 0.06, "grad_norm": 8.37935699734852, "learning_rate": 1.998958819346791e-06, "loss": 1.4963, "step": 8609 }, { "epoch": 0.06, "grad_norm": 5.168749470757185, "learning_rate": 1.99895857721156e-06, "loss": 1.4045, "step": 8610 }, { "epoch": 0.06, "grad_norm": 4.515727329681631, "learning_rate": 1.998958335048191e-06, "loss": 1.274, "step": 8611 }, { "epoch": 0.06, "grad_norm": 4.586023389036626, "learning_rate": 1.9989580928566847e-06, "loss": 1.4079, "step": 8612 }, { "epoch": 0.06, "grad_norm": 4.306414951992419, "learning_rate": 1.9989578506370417e-06, "loss": 1.4126, "step": 8613 }, { "epoch": 0.06, "grad_norm": 4.822983272936278, "learning_rate": 1.9989576083892606e-06, "loss": 1.4169, "step": 8614 }, { "epoch": 0.06, "eval_loss": 1.561651587486267, "eval_runtime": 4.6098, "eval_samples_per_second": 1.952, "eval_steps_per_second": 1.085, "step": 8614 }, { "epoch": 0.06, "grad_norm": 5.404938213549534, "learning_rate": 1.9989573661133427e-06, "loss": 1.48, "step": 8615 }, { "epoch": 0.06, "grad_norm": 4.621790196337391, "learning_rate": 1.9989571238092873e-06, "loss": 1.283, "step": 8616 }, { "epoch": 0.06, "grad_norm": 4.443049372904137, "learning_rate": 1.9989568814770947e-06, "loss": 1.3574, "step": 8617 }, { "epoch": 0.06, "grad_norm": 4.801069293612943, "learning_rate": 1.9989566391167645e-06, "loss": 1.3953, "step": 8618 }, { "epoch": 0.06, "grad_norm": 4.90808002632212, "learning_rate": 1.9989563967282976e-06, "loss": 1.5248, "step": 8619 }, { "epoch": 0.06, "grad_norm": 4.513745233629364, "learning_rate": 1.9989561543116927e-06, "loss": 1.4299, "step": 8620 }, { "epoch": 0.06, "grad_norm": 4.625267465917626, "learning_rate": 1.9989559118669506e-06, "loss": 1.3427, "step": 8621 }, { "epoch": 0.06, "grad_norm": 4.596777102150783, "learning_rate": 1.9989556693940718e-06, "loss": 1.4271, "step": 8622 }, { "epoch": 0.06, "grad_norm": 4.487247482653133, "learning_rate": 1.9989554268930553e-06, "loss": 1.2861, "step": 8623 }, { "epoch": 0.06, "grad_norm": 5.184012572654904, "learning_rate": 1.9989551843639018e-06, "loss": 1.5925, "step": 8624 }, { "epoch": 0.06, "grad_norm": 4.9471120778181055, "learning_rate": 1.998954941806611e-06, "loss": 1.5653, "step": 8625 }, { "epoch": 0.06, "grad_norm": 4.994694318313504, "learning_rate": 1.998954699221183e-06, "loss": 1.4042, "step": 8626 }, { "epoch": 0.06, "grad_norm": 5.666546802417481, "learning_rate": 1.9989544566076176e-06, "loss": 1.5053, "step": 8627 }, { "epoch": 0.06, "grad_norm": 4.576464304521912, "learning_rate": 1.998954213965915e-06, "loss": 1.3794, "step": 8628 }, { "epoch": 0.06, "grad_norm": 6.8434106582639345, "learning_rate": 1.998953971296075e-06, "loss": 1.524, "step": 8629 }, { "epoch": 0.06, "grad_norm": 4.458196304928374, "learning_rate": 1.9989537285980986e-06, "loss": 1.363, "step": 8630 }, { "epoch": 0.06, "grad_norm": 5.076340426993319, "learning_rate": 1.998953485871984e-06, "loss": 1.4671, "step": 8631 }, { "epoch": 0.06, "grad_norm": 4.999092485124452, "learning_rate": 1.9989532431177327e-06, "loss": 1.4206, "step": 8632 }, { "epoch": 0.06, "grad_norm": 4.135494016866375, "learning_rate": 1.998953000335344e-06, "loss": 1.2825, "step": 8633 }, { "epoch": 0.06, "grad_norm": 4.289975665728462, "learning_rate": 1.9989527575248185e-06, "loss": 1.3606, "step": 8634 }, { "epoch": 0.06, "grad_norm": 4.120967424800536, "learning_rate": 1.9989525146861553e-06, "loss": 1.0945, "step": 8635 }, { "epoch": 0.06, "grad_norm": 6.412239856672185, "learning_rate": 1.9989522718193553e-06, "loss": 1.2861, "step": 8636 }, { "epoch": 0.06, "grad_norm": 4.124055552597445, "learning_rate": 1.998952028924418e-06, "loss": 1.2708, "step": 8637 }, { "epoch": 0.06, "grad_norm": 4.2699944061669655, "learning_rate": 1.998951786001344e-06, "loss": 1.462, "step": 8638 }, { "epoch": 0.06, "grad_norm": 4.808028542220539, "learning_rate": 1.9989515430501324e-06, "loss": 1.516, "step": 8639 }, { "epoch": 0.06, "grad_norm": 4.530550955331592, "learning_rate": 1.9989513000707838e-06, "loss": 1.3129, "step": 8640 }, { "epoch": 0.06, "grad_norm": 5.170794436694128, "learning_rate": 1.998951057063298e-06, "loss": 1.1989, "step": 8641 }, { "epoch": 0.06, "grad_norm": 4.477557905506454, "learning_rate": 1.9989508140276746e-06, "loss": 1.3618, "step": 8642 }, { "epoch": 0.06, "grad_norm": 4.297419670431285, "learning_rate": 1.998950570963915e-06, "loss": 1.3248, "step": 8643 }, { "epoch": 0.06, "grad_norm": 4.834191699367694, "learning_rate": 1.9989503278720176e-06, "loss": 1.3471, "step": 8644 }, { "epoch": 0.06, "grad_norm": 4.4287647826822845, "learning_rate": 1.998950084751983e-06, "loss": 1.4201, "step": 8645 }, { "epoch": 0.06, "grad_norm": 4.186716153956602, "learning_rate": 1.998949841603812e-06, "loss": 1.3603, "step": 8646 }, { "epoch": 0.06, "grad_norm": 4.490008277132269, "learning_rate": 1.998949598427503e-06, "loss": 1.3985, "step": 8647 }, { "epoch": 0.06, "grad_norm": 5.426720452827193, "learning_rate": 1.9989493552230577e-06, "loss": 1.2834, "step": 8648 }, { "epoch": 0.06, "grad_norm": 4.494441318763501, "learning_rate": 1.998949111990475e-06, "loss": 1.4092, "step": 8649 }, { "epoch": 0.06, "grad_norm": 4.592067127642173, "learning_rate": 1.998948868729755e-06, "loss": 1.3878, "step": 8650 }, { "epoch": 0.06, "grad_norm": 4.566106280331906, "learning_rate": 1.9989486254408986e-06, "loss": 1.3731, "step": 8651 }, { "epoch": 0.06, "grad_norm": 4.179776470294157, "learning_rate": 1.9989483821239044e-06, "loss": 1.3391, "step": 8652 }, { "epoch": 0.06, "grad_norm": 5.197549322309192, "learning_rate": 1.9989481387787735e-06, "loss": 1.5253, "step": 8653 }, { "epoch": 0.06, "grad_norm": 5.67446584414418, "learning_rate": 1.998947895405506e-06, "loss": 1.4139, "step": 8654 }, { "epoch": 0.06, "grad_norm": 4.054368467752239, "learning_rate": 1.9989476520041006e-06, "loss": 1.1647, "step": 8655 }, { "epoch": 0.06, "grad_norm": 6.16009416378856, "learning_rate": 1.9989474085745586e-06, "loss": 1.6016, "step": 8656 }, { "epoch": 0.06, "grad_norm": 4.4399785100788876, "learning_rate": 1.9989471651168795e-06, "loss": 1.2265, "step": 8657 }, { "epoch": 0.06, "grad_norm": 4.580783083064213, "learning_rate": 1.9989469216310636e-06, "loss": 1.2891, "step": 8658 }, { "epoch": 0.06, "grad_norm": 4.496061509764125, "learning_rate": 1.9989466781171105e-06, "loss": 1.3259, "step": 8659 }, { "epoch": 0.06, "grad_norm": 4.411478429750405, "learning_rate": 1.9989464345750203e-06, "loss": 1.3539, "step": 8660 }, { "epoch": 0.06, "grad_norm": 4.684656237641494, "learning_rate": 1.9989461910047933e-06, "loss": 1.2303, "step": 8661 }, { "epoch": 0.06, "grad_norm": 4.460270820621148, "learning_rate": 1.9989459474064292e-06, "loss": 1.4033, "step": 8662 }, { "epoch": 0.06, "grad_norm": 4.3521566859203125, "learning_rate": 1.998945703779928e-06, "loss": 1.3895, "step": 8663 }, { "epoch": 0.06, "grad_norm": 4.153102238423549, "learning_rate": 1.99894546012529e-06, "loss": 1.2866, "step": 8664 }, { "epoch": 0.06, "grad_norm": 4.535488259940307, "learning_rate": 1.9989452164425147e-06, "loss": 1.3091, "step": 8665 }, { "epoch": 0.06, "grad_norm": 4.2177008668247735, "learning_rate": 1.998944972731603e-06, "loss": 1.3291, "step": 8666 }, { "epoch": 0.06, "grad_norm": 4.394143711573607, "learning_rate": 1.998944728992554e-06, "loss": 1.1878, "step": 8667 }, { "epoch": 0.06, "grad_norm": 4.317990618328454, "learning_rate": 1.998944485225368e-06, "loss": 1.2873, "step": 8668 }, { "epoch": 0.06, "grad_norm": 4.860610716848633, "learning_rate": 1.9989442414300453e-06, "loss": 1.3647, "step": 8669 }, { "epoch": 0.06, "grad_norm": 5.418178515526341, "learning_rate": 1.9989439976065856e-06, "loss": 1.343, "step": 8670 }, { "epoch": 0.06, "grad_norm": 4.309911841757617, "learning_rate": 1.9989437537549887e-06, "loss": 1.3987, "step": 8671 }, { "epoch": 0.06, "grad_norm": 6.134460312253366, "learning_rate": 1.998943509875255e-06, "loss": 1.1095, "step": 8672 }, { "epoch": 0.06, "grad_norm": 5.69042170920404, "learning_rate": 1.9989432659673846e-06, "loss": 1.4126, "step": 8673 }, { "epoch": 0.06, "grad_norm": 4.205104834688021, "learning_rate": 1.998943022031377e-06, "loss": 1.2288, "step": 8674 }, { "epoch": 0.06, "grad_norm": 4.219213579974913, "learning_rate": 1.998942778067233e-06, "loss": 1.3297, "step": 8675 }, { "epoch": 0.06, "grad_norm": 4.28436594377733, "learning_rate": 1.9989425340749514e-06, "loss": 1.2752, "step": 8676 }, { "epoch": 0.06, "grad_norm": 4.792125683770973, "learning_rate": 1.9989422900545336e-06, "loss": 1.5153, "step": 8677 }, { "epoch": 0.06, "grad_norm": 4.984280147366857, "learning_rate": 1.9989420460059787e-06, "loss": 1.4225, "step": 8678 }, { "epoch": 0.06, "grad_norm": 4.364243265873441, "learning_rate": 1.998941801929286e-06, "loss": 1.4426, "step": 8679 }, { "epoch": 0.06, "grad_norm": 5.221518211304695, "learning_rate": 1.9989415578244577e-06, "loss": 1.4625, "step": 8680 }, { "epoch": 0.06, "grad_norm": 4.699562163985216, "learning_rate": 1.998941313691492e-06, "loss": 1.0918, "step": 8681 }, { "epoch": 0.06, "grad_norm": 5.133053415829835, "learning_rate": 1.99894106953039e-06, "loss": 1.405, "step": 8682 }, { "epoch": 0.06, "grad_norm": 5.126993969426706, "learning_rate": 1.9989408253411504e-06, "loss": 1.2673, "step": 8683 }, { "epoch": 0.06, "grad_norm": 4.243153131967924, "learning_rate": 1.998940581123774e-06, "loss": 1.4695, "step": 8684 }, { "epoch": 0.06, "grad_norm": 4.6581281023995285, "learning_rate": 1.9989403368782608e-06, "loss": 1.3283, "step": 8685 }, { "epoch": 0.06, "grad_norm": 4.616248285747535, "learning_rate": 1.998940092604611e-06, "loss": 1.4178, "step": 8686 }, { "epoch": 0.06, "grad_norm": 4.860542184260749, "learning_rate": 1.9989398483028246e-06, "loss": 1.4559, "step": 8687 }, { "epoch": 0.06, "eval_loss": 1.5583451986312866, "eval_runtime": 4.5891, "eval_samples_per_second": 1.961, "eval_steps_per_second": 1.09, "step": 8687 }, { "epoch": 0.06, "grad_norm": 4.230646743272511, "learning_rate": 1.998939603972901e-06, "loss": 1.2765, "step": 8688 }, { "epoch": 0.06, "grad_norm": 4.3307652735719895, "learning_rate": 1.998939359614841e-06, "loss": 1.3156, "step": 8689 }, { "epoch": 0.06, "grad_norm": 5.051706932839966, "learning_rate": 1.998939115228644e-06, "loss": 1.3716, "step": 8690 }, { "epoch": 0.06, "grad_norm": 4.579711634780233, "learning_rate": 1.9989388708143097e-06, "loss": 1.3465, "step": 8691 }, { "epoch": 0.06, "grad_norm": 4.713771561682751, "learning_rate": 1.998938626371839e-06, "loss": 1.4448, "step": 8692 }, { "epoch": 0.06, "grad_norm": 4.358999498209377, "learning_rate": 1.998938381901232e-06, "loss": 1.2604, "step": 8693 }, { "epoch": 0.06, "grad_norm": 4.78337687012583, "learning_rate": 1.998938137402488e-06, "loss": 1.3188, "step": 8694 }, { "epoch": 0.06, "grad_norm": 4.819149794860213, "learning_rate": 1.9989378928756066e-06, "loss": 1.3173, "step": 8695 }, { "epoch": 0.06, "grad_norm": 4.237940696688464, "learning_rate": 1.998937648320589e-06, "loss": 1.2149, "step": 8696 }, { "epoch": 0.06, "grad_norm": 4.381531228903312, "learning_rate": 1.9989374037374344e-06, "loss": 1.392, "step": 8697 }, { "epoch": 0.06, "grad_norm": 4.47317585160481, "learning_rate": 1.9989371591261434e-06, "loss": 1.3614, "step": 8698 }, { "epoch": 0.06, "grad_norm": 4.786103712286903, "learning_rate": 1.998936914486715e-06, "loss": 1.3327, "step": 8699 }, { "epoch": 0.06, "grad_norm": 4.397710283266012, "learning_rate": 1.9989366698191507e-06, "loss": 1.3765, "step": 8700 }, { "epoch": 0.06, "grad_norm": 4.659605117600823, "learning_rate": 1.9989364251234495e-06, "loss": 1.4559, "step": 8701 }, { "epoch": 0.06, "grad_norm": 5.549783460574593, "learning_rate": 1.998936180399611e-06, "loss": 1.282, "step": 8702 }, { "epoch": 0.06, "grad_norm": 4.832371672535695, "learning_rate": 1.9989359356476364e-06, "loss": 1.4638, "step": 8703 }, { "epoch": 0.06, "grad_norm": 5.039478393047218, "learning_rate": 1.998935690867525e-06, "loss": 1.2765, "step": 8704 }, { "epoch": 0.06, "grad_norm": 4.237934104476604, "learning_rate": 1.9989354460592763e-06, "loss": 1.3035, "step": 8705 }, { "epoch": 0.06, "grad_norm": 5.597502989358775, "learning_rate": 1.9989352012228914e-06, "loss": 1.579, "step": 8706 }, { "epoch": 0.06, "grad_norm": 4.34738877208113, "learning_rate": 1.99893495635837e-06, "loss": 1.3389, "step": 8707 }, { "epoch": 0.06, "grad_norm": 4.9117493324578065, "learning_rate": 1.9989347114657117e-06, "loss": 1.422, "step": 8708 }, { "epoch": 0.06, "grad_norm": 4.756986013116514, "learning_rate": 1.998934466544917e-06, "loss": 1.3718, "step": 8709 }, { "epoch": 0.06, "grad_norm": 5.218655925139599, "learning_rate": 1.998934221595985e-06, "loss": 1.304, "step": 8710 }, { "epoch": 0.06, "grad_norm": 4.391285921635442, "learning_rate": 1.998933976618917e-06, "loss": 1.2891, "step": 8711 }, { "epoch": 0.06, "grad_norm": 4.167040109285794, "learning_rate": 1.998933731613712e-06, "loss": 1.3312, "step": 8712 }, { "epoch": 0.06, "grad_norm": 4.522685898963279, "learning_rate": 1.9989334865803706e-06, "loss": 1.3341, "step": 8713 }, { "epoch": 0.06, "grad_norm": 11.437772127217965, "learning_rate": 1.9989332415188926e-06, "loss": 1.5211, "step": 8714 }, { "epoch": 0.06, "grad_norm": 5.074391210248724, "learning_rate": 1.9989329964292774e-06, "loss": 1.0282, "step": 8715 }, { "epoch": 0.06, "grad_norm": 4.618355620435837, "learning_rate": 1.9989327513115264e-06, "loss": 1.4167, "step": 8716 }, { "epoch": 0.06, "grad_norm": 4.466497069781705, "learning_rate": 1.998932506165638e-06, "loss": 1.3133, "step": 8717 }, { "epoch": 0.06, "grad_norm": 4.611878151625593, "learning_rate": 1.9989322609916136e-06, "loss": 1.1923, "step": 8718 }, { "epoch": 0.06, "grad_norm": 4.680157195069533, "learning_rate": 1.998932015789452e-06, "loss": 1.5067, "step": 8719 }, { "epoch": 0.06, "grad_norm": 4.441432214504197, "learning_rate": 1.9989317705591543e-06, "loss": 1.31, "step": 8720 }, { "epoch": 0.06, "grad_norm": 4.174305541652056, "learning_rate": 1.99893152530072e-06, "loss": 1.3555, "step": 8721 }, { "epoch": 0.06, "grad_norm": 4.188362270519318, "learning_rate": 1.998931280014149e-06, "loss": 1.3384, "step": 8722 }, { "epoch": 0.06, "grad_norm": 7.600255299744319, "learning_rate": 1.9989310346994414e-06, "loss": 1.215, "step": 8723 }, { "epoch": 0.06, "grad_norm": 4.5732688620236415, "learning_rate": 1.998930789356597e-06, "loss": 1.4367, "step": 8724 }, { "epoch": 0.06, "grad_norm": 4.738497424726799, "learning_rate": 1.9989305439856164e-06, "loss": 1.3956, "step": 8725 }, { "epoch": 0.06, "grad_norm": 4.271996059356595, "learning_rate": 1.998930298586499e-06, "loss": 1.3214, "step": 8726 }, { "epoch": 0.06, "grad_norm": 4.300865942253508, "learning_rate": 1.9989300531592453e-06, "loss": 1.3691, "step": 8727 }, { "epoch": 0.06, "grad_norm": 4.272467364521415, "learning_rate": 1.998929807703855e-06, "loss": 1.3222, "step": 8728 }, { "epoch": 0.06, "grad_norm": 4.258944322355307, "learning_rate": 1.998929562220328e-06, "loss": 1.354, "step": 8729 }, { "epoch": 0.06, "grad_norm": 4.402519104960977, "learning_rate": 1.998929316708665e-06, "loss": 1.305, "step": 8730 }, { "epoch": 0.06, "grad_norm": 4.973747776517347, "learning_rate": 1.998929071168865e-06, "loss": 1.5857, "step": 8731 }, { "epoch": 0.06, "grad_norm": 4.436818796756212, "learning_rate": 1.9989288256009283e-06, "loss": 1.4667, "step": 8732 }, { "epoch": 0.06, "grad_norm": 4.166641529143792, "learning_rate": 1.9989285800048556e-06, "loss": 1.2593, "step": 8733 }, { "epoch": 0.06, "grad_norm": 5.143412085210361, "learning_rate": 1.998928334380646e-06, "loss": 1.368, "step": 8734 }, { "epoch": 0.06, "grad_norm": 5.623616716017324, "learning_rate": 1.9989280887283004e-06, "loss": 1.2999, "step": 8735 }, { "epoch": 0.06, "grad_norm": 4.929314548576958, "learning_rate": 1.998927843047818e-06, "loss": 1.3587, "step": 8736 }, { "epoch": 0.06, "grad_norm": 4.596938892659514, "learning_rate": 1.998927597339199e-06, "loss": 1.4456, "step": 8737 }, { "epoch": 0.06, "grad_norm": 4.692918653297475, "learning_rate": 1.9989273516024435e-06, "loss": 1.2818, "step": 8738 }, { "epoch": 0.06, "grad_norm": 4.344206382567416, "learning_rate": 1.998927105837552e-06, "loss": 1.3174, "step": 8739 }, { "epoch": 0.06, "grad_norm": 4.794268839973716, "learning_rate": 1.9989268600445235e-06, "loss": 1.3462, "step": 8740 }, { "epoch": 0.06, "grad_norm": 5.428331293785562, "learning_rate": 1.998926614223359e-06, "loss": 1.342, "step": 8741 }, { "epoch": 0.06, "grad_norm": 4.360292445286974, "learning_rate": 1.9989263683740578e-06, "loss": 1.4407, "step": 8742 }, { "epoch": 0.06, "grad_norm": 4.980192833133715, "learning_rate": 1.9989261224966202e-06, "loss": 1.3852, "step": 8743 }, { "epoch": 0.06, "grad_norm": 5.0662858232348285, "learning_rate": 1.9989258765910463e-06, "loss": 1.2963, "step": 8744 }, { "epoch": 0.06, "grad_norm": 4.422522279617291, "learning_rate": 1.9989256306573357e-06, "loss": 1.2535, "step": 8745 }, { "epoch": 0.06, "grad_norm": 4.5365366347764, "learning_rate": 1.9989253846954892e-06, "loss": 1.3592, "step": 8746 }, { "epoch": 0.06, "grad_norm": 4.304538924155177, "learning_rate": 1.998925138705506e-06, "loss": 1.4012, "step": 8747 }, { "epoch": 0.06, "grad_norm": 4.884147183983975, "learning_rate": 1.9989248926873864e-06, "loss": 1.268, "step": 8748 }, { "epoch": 0.06, "grad_norm": 4.609824951436014, "learning_rate": 1.9989246466411305e-06, "loss": 1.2965, "step": 8749 }, { "epoch": 0.06, "grad_norm": 5.480861767927759, "learning_rate": 1.998924400566738e-06, "loss": 1.1923, "step": 8750 }, { "epoch": 0.06, "grad_norm": 5.195946348947488, "learning_rate": 1.998924154464209e-06, "loss": 1.3513, "step": 8751 }, { "epoch": 0.06, "grad_norm": 4.545555038200293, "learning_rate": 1.998923908333544e-06, "loss": 1.2329, "step": 8752 }, { "epoch": 0.06, "grad_norm": 4.556411712333217, "learning_rate": 1.9989236621747426e-06, "loss": 1.4029, "step": 8753 }, { "epoch": 0.06, "grad_norm": 4.405448473310726, "learning_rate": 1.9989234159878047e-06, "loss": 1.279, "step": 8754 }, { "epoch": 0.06, "grad_norm": 5.125783994932461, "learning_rate": 1.9989231697727305e-06, "loss": 1.3239, "step": 8755 }, { "epoch": 0.06, "grad_norm": 4.436118542277149, "learning_rate": 1.9989229235295195e-06, "loss": 1.3637, "step": 8756 }, { "epoch": 0.06, "grad_norm": 4.488235923339999, "learning_rate": 1.9989226772581727e-06, "loss": 1.4432, "step": 8757 }, { "epoch": 0.06, "grad_norm": 4.501830793620562, "learning_rate": 1.99892243095869e-06, "loss": 1.318, "step": 8758 }, { "epoch": 0.06, "grad_norm": 4.5392638161088685, "learning_rate": 1.99892218463107e-06, "loss": 1.4454, "step": 8759 }, { "epoch": 0.06, "grad_norm": 4.9460297613034765, "learning_rate": 1.9989219382753143e-06, "loss": 1.5006, "step": 8760 }, { "epoch": 0.06, "eval_loss": 1.555009126663208, "eval_runtime": 4.5781, "eval_samples_per_second": 1.966, "eval_steps_per_second": 1.092, "step": 8760 }, { "epoch": 0.06, "grad_norm": 4.839212746469922, "learning_rate": 1.998921691891422e-06, "loss": 1.5159, "step": 8761 }, { "epoch": 0.06, "grad_norm": 4.682904810042825, "learning_rate": 1.9989214454793938e-06, "loss": 1.4336, "step": 8762 }, { "epoch": 0.06, "grad_norm": 4.4672441639336915, "learning_rate": 1.998921199039229e-06, "loss": 1.3792, "step": 8763 }, { "epoch": 0.06, "grad_norm": 4.389071968080816, "learning_rate": 1.998920952570928e-06, "loss": 1.3526, "step": 8764 }, { "epoch": 0.06, "grad_norm": 4.692541912597106, "learning_rate": 1.9989207060744906e-06, "loss": 1.3597, "step": 8765 }, { "epoch": 0.06, "grad_norm": 4.371014152314616, "learning_rate": 1.998920459549917e-06, "loss": 1.4378, "step": 8766 }, { "epoch": 0.06, "grad_norm": 7.6450488555112, "learning_rate": 1.9989202129972073e-06, "loss": 1.5821, "step": 8767 }, { "epoch": 0.06, "grad_norm": 4.389847254791679, "learning_rate": 1.998919966416361e-06, "loss": 1.3623, "step": 8768 }, { "epoch": 0.06, "grad_norm": 4.310722944850334, "learning_rate": 1.9989197198073788e-06, "loss": 1.4095, "step": 8769 }, { "epoch": 0.06, "grad_norm": 5.896039638219889, "learning_rate": 1.9989194731702602e-06, "loss": 1.3831, "step": 8770 }, { "epoch": 0.06, "grad_norm": 5.094709366572391, "learning_rate": 1.9989192265050054e-06, "loss": 1.5436, "step": 8771 }, { "epoch": 0.06, "grad_norm": 5.081231840658383, "learning_rate": 1.998918979811614e-06, "loss": 1.2478, "step": 8772 }, { "epoch": 0.06, "grad_norm": 4.765541311742645, "learning_rate": 1.9989187330900863e-06, "loss": 1.2019, "step": 8773 }, { "epoch": 0.06, "grad_norm": 4.507442922285264, "learning_rate": 1.9989184863404234e-06, "loss": 1.497, "step": 8774 }, { "epoch": 0.06, "grad_norm": 4.650385783759415, "learning_rate": 1.9989182395626232e-06, "loss": 1.341, "step": 8775 }, { "epoch": 0.06, "grad_norm": 5.193658928719714, "learning_rate": 1.9989179927566876e-06, "loss": 1.3883, "step": 8776 }, { "epoch": 0.06, "grad_norm": 4.708588008961572, "learning_rate": 1.9989177459226153e-06, "loss": 1.2086, "step": 8777 }, { "epoch": 0.06, "grad_norm": 4.361636087189225, "learning_rate": 1.9989174990604067e-06, "loss": 1.5196, "step": 8778 }, { "epoch": 0.06, "grad_norm": 4.93887406872926, "learning_rate": 1.9989172521700626e-06, "loss": 1.3296, "step": 8779 }, { "epoch": 0.06, "grad_norm": 4.308513529177304, "learning_rate": 1.9989170052515817e-06, "loss": 1.1928, "step": 8780 }, { "epoch": 0.06, "grad_norm": 6.029849476676653, "learning_rate": 1.998916758304965e-06, "loss": 1.4123, "step": 8781 }, { "epoch": 0.06, "grad_norm": 4.9678471569973, "learning_rate": 1.998916511330212e-06, "loss": 1.3926, "step": 8782 }, { "epoch": 0.06, "grad_norm": 4.7508932027583635, "learning_rate": 1.9989162643273225e-06, "loss": 1.2513, "step": 8783 }, { "epoch": 0.06, "grad_norm": 5.298068532157554, "learning_rate": 1.9989160172962972e-06, "loss": 1.4054, "step": 8784 }, { "epoch": 0.06, "grad_norm": 4.358156610070261, "learning_rate": 1.9989157702371356e-06, "loss": 1.1538, "step": 8785 }, { "epoch": 0.06, "grad_norm": 5.452888444017439, "learning_rate": 1.998915523149838e-06, "loss": 1.4679, "step": 8786 }, { "epoch": 0.06, "grad_norm": 4.489425381541063, "learning_rate": 1.9989152760344043e-06, "loss": 1.2096, "step": 8787 }, { "epoch": 0.06, "grad_norm": 4.264896254010867, "learning_rate": 1.9989150288908346e-06, "loss": 1.4319, "step": 8788 }, { "epoch": 0.06, "grad_norm": 4.757954697110054, "learning_rate": 1.9989147817191286e-06, "loss": 1.3849, "step": 8789 }, { "epoch": 0.06, "grad_norm": 4.209735455715297, "learning_rate": 1.9989145345192863e-06, "loss": 1.1999, "step": 8790 }, { "epoch": 0.06, "grad_norm": 4.136721460927477, "learning_rate": 1.998914287291308e-06, "loss": 1.3417, "step": 8791 }, { "epoch": 0.06, "grad_norm": 4.111017128544717, "learning_rate": 1.998914040035194e-06, "loss": 1.3239, "step": 8792 }, { "epoch": 0.06, "grad_norm": 4.12195419282272, "learning_rate": 1.9989137927509435e-06, "loss": 1.2935, "step": 8793 }, { "epoch": 0.06, "grad_norm": 4.679635603064866, "learning_rate": 1.998913545438557e-06, "loss": 1.2381, "step": 8794 }, { "epoch": 0.06, "grad_norm": 4.403710047305166, "learning_rate": 1.9989132980980345e-06, "loss": 1.4084, "step": 8795 }, { "epoch": 0.06, "grad_norm": 8.025200756047498, "learning_rate": 1.998913050729376e-06, "loss": 1.3225, "step": 8796 }, { "epoch": 0.06, "grad_norm": 4.758372436662555, "learning_rate": 1.9989128033325816e-06, "loss": 1.4158, "step": 8797 }, { "epoch": 0.06, "grad_norm": 5.640940112631746, "learning_rate": 1.998912555907651e-06, "loss": 1.347, "step": 8798 }, { "epoch": 0.06, "grad_norm": 5.391049757984428, "learning_rate": 1.9989123084545837e-06, "loss": 1.24, "step": 8799 }, { "epoch": 0.06, "grad_norm": 5.036625817707134, "learning_rate": 1.998912060973381e-06, "loss": 1.3422, "step": 8800 }, { "epoch": 0.06, "grad_norm": 4.817900719965218, "learning_rate": 1.9989118134640424e-06, "loss": 1.487, "step": 8801 }, { "epoch": 0.06, "grad_norm": 4.319819293700203, "learning_rate": 1.9989115659265676e-06, "loss": 1.2542, "step": 8802 }, { "epoch": 0.06, "grad_norm": 4.991466332708066, "learning_rate": 1.9989113183609565e-06, "loss": 1.3914, "step": 8803 }, { "epoch": 0.06, "grad_norm": 4.553171235603451, "learning_rate": 1.99891107076721e-06, "loss": 1.2282, "step": 8804 }, { "epoch": 0.06, "grad_norm": 4.748633228554743, "learning_rate": 1.9989108231453267e-06, "loss": 1.4943, "step": 8805 }, { "epoch": 0.06, "grad_norm": 4.318508498223785, "learning_rate": 1.998910575495308e-06, "loss": 1.2343, "step": 8806 }, { "epoch": 0.06, "grad_norm": 4.24344702131763, "learning_rate": 1.9989103278171533e-06, "loss": 1.4504, "step": 8807 }, { "epoch": 0.06, "grad_norm": 4.3328169325124675, "learning_rate": 1.9989100801108624e-06, "loss": 1.32, "step": 8808 }, { "epoch": 0.06, "grad_norm": 4.666296720918735, "learning_rate": 1.9989098323764355e-06, "loss": 1.4708, "step": 8809 }, { "epoch": 0.06, "grad_norm": 4.663540340753861, "learning_rate": 1.9989095846138732e-06, "loss": 1.2516, "step": 8810 }, { "epoch": 0.06, "grad_norm": 4.879755583128598, "learning_rate": 1.998909336823174e-06, "loss": 1.213, "step": 8811 }, { "epoch": 0.06, "grad_norm": 5.82249615089797, "learning_rate": 1.9989090890043396e-06, "loss": 1.5187, "step": 8812 }, { "epoch": 0.06, "grad_norm": 4.7303714460669175, "learning_rate": 1.998908841157369e-06, "loss": 1.4238, "step": 8813 }, { "epoch": 0.06, "grad_norm": 4.990304035232559, "learning_rate": 1.9989085932822625e-06, "loss": 1.3205, "step": 8814 }, { "epoch": 0.06, "grad_norm": 5.467170374744285, "learning_rate": 1.99890834537902e-06, "loss": 1.559, "step": 8815 }, { "epoch": 0.06, "grad_norm": 4.307507709697949, "learning_rate": 1.9989080974476417e-06, "loss": 1.3504, "step": 8816 }, { "epoch": 0.06, "grad_norm": 4.733747372979935, "learning_rate": 1.998907849488127e-06, "loss": 1.4028, "step": 8817 }, { "epoch": 0.06, "grad_norm": 16.527564507850006, "learning_rate": 1.998907601500477e-06, "loss": 1.4501, "step": 8818 }, { "epoch": 0.06, "grad_norm": 4.40096857873657, "learning_rate": 1.998907353484691e-06, "loss": 1.2448, "step": 8819 }, { "epoch": 0.06, "grad_norm": 4.459333126206982, "learning_rate": 1.9989071054407683e-06, "loss": 1.3535, "step": 8820 }, { "epoch": 0.06, "grad_norm": 4.506137806984103, "learning_rate": 1.9989068573687107e-06, "loss": 1.3735, "step": 8821 }, { "epoch": 0.06, "grad_norm": 4.474364100939082, "learning_rate": 1.998906609268517e-06, "loss": 1.4221, "step": 8822 }, { "epoch": 0.06, "grad_norm": 4.735553303601279, "learning_rate": 1.998906361140187e-06, "loss": 1.3305, "step": 8823 }, { "epoch": 0.06, "grad_norm": 5.127362221019991, "learning_rate": 1.998906112983722e-06, "loss": 1.3965, "step": 8824 }, { "epoch": 0.06, "grad_norm": 4.9303345751613, "learning_rate": 1.9989058647991202e-06, "loss": 1.3227, "step": 8825 }, { "epoch": 0.06, "grad_norm": 4.783778776675495, "learning_rate": 1.9989056165863828e-06, "loss": 1.4039, "step": 8826 }, { "epoch": 0.06, "grad_norm": 4.70280304045451, "learning_rate": 1.99890536834551e-06, "loss": 1.1958, "step": 8827 }, { "epoch": 0.06, "grad_norm": 4.7621594928258695, "learning_rate": 1.9989051200765006e-06, "loss": 1.5909, "step": 8828 }, { "epoch": 0.06, "grad_norm": 8.10383376704116, "learning_rate": 1.998904871779356e-06, "loss": 1.4304, "step": 8829 }, { "epoch": 0.06, "grad_norm": 4.2341673557291815, "learning_rate": 1.9989046234540753e-06, "loss": 1.0553, "step": 8830 }, { "epoch": 0.06, "grad_norm": 4.526921310757564, "learning_rate": 1.9989043751006588e-06, "loss": 1.2812, "step": 8831 }, { "epoch": 0.06, "grad_norm": 4.335854028565122, "learning_rate": 1.9989041267191064e-06, "loss": 1.4074, "step": 8832 }, { "epoch": 0.06, "grad_norm": 4.726335316129505, "learning_rate": 1.9989038783094185e-06, "loss": 1.45, "step": 8833 }, { "epoch": 0.06, "eval_loss": 1.558302402496338, "eval_runtime": 4.6017, "eval_samples_per_second": 1.956, "eval_steps_per_second": 1.087, "step": 8833 }, { "epoch": 0.06, "grad_norm": 4.363316698415397, "learning_rate": 1.9989036298715948e-06, "loss": 1.2738, "step": 8834 }, { "epoch": 0.06, "grad_norm": 4.699976256764717, "learning_rate": 1.9989033814056347e-06, "loss": 1.4648, "step": 8835 }, { "epoch": 0.06, "grad_norm": 4.886362899403995, "learning_rate": 1.998903132911539e-06, "loss": 1.2614, "step": 8836 }, { "epoch": 0.06, "grad_norm": 4.679037576520284, "learning_rate": 1.998902884389308e-06, "loss": 1.367, "step": 8837 }, { "epoch": 0.06, "grad_norm": 4.450518318121184, "learning_rate": 1.998902635838941e-06, "loss": 1.3282, "step": 8838 }, { "epoch": 0.06, "grad_norm": 4.103511848760463, "learning_rate": 1.9989023872604384e-06, "loss": 1.1773, "step": 8839 }, { "epoch": 0.06, "grad_norm": 4.356978446538244, "learning_rate": 1.9989021386537997e-06, "loss": 1.3099, "step": 8840 }, { "epoch": 0.06, "grad_norm": 5.043122752496365, "learning_rate": 1.9989018900190255e-06, "loss": 1.5796, "step": 8841 }, { "epoch": 0.06, "grad_norm": 4.811375667955704, "learning_rate": 1.9989016413561155e-06, "loss": 1.4509, "step": 8842 }, { "epoch": 0.06, "grad_norm": 4.525444669181647, "learning_rate": 1.99890139266507e-06, "loss": 1.4023, "step": 8843 }, { "epoch": 0.06, "grad_norm": 4.279821968144671, "learning_rate": 1.998901143945888e-06, "loss": 1.238, "step": 8844 }, { "epoch": 0.06, "grad_norm": 4.4510274186849825, "learning_rate": 1.998900895198571e-06, "loss": 1.4324, "step": 8845 }, { "epoch": 0.06, "grad_norm": 4.191724069124076, "learning_rate": 1.998900646423118e-06, "loss": 1.2724, "step": 8846 }, { "epoch": 0.06, "grad_norm": 5.175557266684348, "learning_rate": 1.9989003976195293e-06, "loss": 1.3548, "step": 8847 }, { "epoch": 0.06, "grad_norm": 4.587299792626568, "learning_rate": 1.998900148787805e-06, "loss": 1.4213, "step": 8848 }, { "epoch": 0.06, "grad_norm": 4.1151803926653905, "learning_rate": 1.998899899927945e-06, "loss": 1.3732, "step": 8849 }, { "epoch": 0.06, "grad_norm": 4.367591021656302, "learning_rate": 1.998899651039949e-06, "loss": 1.3188, "step": 8850 }, { "epoch": 0.06, "grad_norm": 4.751278274380196, "learning_rate": 1.9988994021238178e-06, "loss": 1.2984, "step": 8851 }, { "epoch": 0.06, "grad_norm": 4.783135464172729, "learning_rate": 1.998899153179551e-06, "loss": 1.4884, "step": 8852 }, { "epoch": 0.06, "grad_norm": 7.8611914946675405, "learning_rate": 1.998898904207148e-06, "loss": 1.3191, "step": 8853 }, { "epoch": 0.06, "grad_norm": 5.906791464476312, "learning_rate": 1.9988986552066095e-06, "loss": 1.2687, "step": 8854 }, { "epoch": 0.06, "grad_norm": 4.385300189761708, "learning_rate": 1.9988984061779358e-06, "loss": 1.245, "step": 8855 }, { "epoch": 0.06, "grad_norm": 4.971463879300862, "learning_rate": 1.9988981571211257e-06, "loss": 1.1891, "step": 8856 }, { "epoch": 0.06, "grad_norm": 4.6166773470662426, "learning_rate": 1.9988979080361807e-06, "loss": 1.2598, "step": 8857 }, { "epoch": 0.06, "grad_norm": 4.724677461858467, "learning_rate": 1.9988976589230993e-06, "loss": 1.3926, "step": 8858 }, { "epoch": 0.06, "grad_norm": 4.451596700122062, "learning_rate": 1.998897409781883e-06, "loss": 1.3847, "step": 8859 }, { "epoch": 0.06, "grad_norm": 5.353190518247788, "learning_rate": 1.9988971606125305e-06, "loss": 1.2629, "step": 8860 }, { "epoch": 0.06, "grad_norm": 4.768436682447021, "learning_rate": 1.9988969114150427e-06, "loss": 1.2988, "step": 8861 }, { "epoch": 0.06, "grad_norm": 4.173315215551296, "learning_rate": 1.9988966621894194e-06, "loss": 1.286, "step": 8862 }, { "epoch": 0.06, "grad_norm": 5.041523559896114, "learning_rate": 1.9988964129356603e-06, "loss": 1.2269, "step": 8863 }, { "epoch": 0.06, "grad_norm": 4.325590935683932, "learning_rate": 1.9988961636537656e-06, "loss": 1.2875, "step": 8864 }, { "epoch": 0.06, "grad_norm": 4.115782126940359, "learning_rate": 1.9988959143437355e-06, "loss": 1.3078, "step": 8865 }, { "epoch": 0.06, "grad_norm": 4.480126941503472, "learning_rate": 1.9988956650055695e-06, "loss": 1.4237, "step": 8866 }, { "epoch": 0.06, "grad_norm": 4.54970557504812, "learning_rate": 1.9988954156392685e-06, "loss": 1.4411, "step": 8867 }, { "epoch": 0.06, "grad_norm": 4.278451381604128, "learning_rate": 1.9988951662448316e-06, "loss": 1.4004, "step": 8868 }, { "epoch": 0.06, "grad_norm": 4.735404366057957, "learning_rate": 1.998894916822259e-06, "loss": 1.4554, "step": 8869 }, { "epoch": 0.06, "grad_norm": 4.378085473898133, "learning_rate": 1.998894667371551e-06, "loss": 1.3601, "step": 8870 }, { "epoch": 0.06, "grad_norm": 4.543131960179484, "learning_rate": 1.9988944178927075e-06, "loss": 1.2967, "step": 8871 }, { "epoch": 0.06, "grad_norm": 4.037717908122634, "learning_rate": 1.9988941683857283e-06, "loss": 1.0675, "step": 8872 }, { "epoch": 0.06, "grad_norm": 3.9960044939604873, "learning_rate": 1.9988939188506136e-06, "loss": 1.4066, "step": 8873 }, { "epoch": 0.06, "grad_norm": 4.271650937109512, "learning_rate": 1.998893669287364e-06, "loss": 1.3711, "step": 8874 }, { "epoch": 0.06, "grad_norm": 4.4055312781165, "learning_rate": 1.998893419695978e-06, "loss": 1.3049, "step": 8875 }, { "epoch": 0.06, "grad_norm": 4.205210231980988, "learning_rate": 1.998893170076457e-06, "loss": 1.1131, "step": 8876 }, { "epoch": 0.06, "grad_norm": 4.52296723687895, "learning_rate": 1.9988929204288005e-06, "loss": 1.3961, "step": 8877 }, { "epoch": 0.06, "grad_norm": 5.468240764241983, "learning_rate": 1.998892670753008e-06, "loss": 1.426, "step": 8878 }, { "epoch": 0.06, "grad_norm": 4.428518120557946, "learning_rate": 1.9988924210490804e-06, "loss": 1.3621, "step": 8879 }, { "epoch": 0.06, "grad_norm": 4.4233618382909645, "learning_rate": 1.9988921713170174e-06, "loss": 1.3834, "step": 8880 }, { "epoch": 0.06, "grad_norm": 4.788668460219144, "learning_rate": 1.9988919215568186e-06, "loss": 1.3543, "step": 8881 }, { "epoch": 0.06, "grad_norm": 5.038344659421539, "learning_rate": 1.9988916717684846e-06, "loss": 1.2472, "step": 8882 }, { "epoch": 0.06, "grad_norm": 4.448190152642244, "learning_rate": 1.9988914219520156e-06, "loss": 1.3846, "step": 8883 }, { "epoch": 0.06, "grad_norm": 5.015883758953264, "learning_rate": 1.9988911721074103e-06, "loss": 1.3266, "step": 8884 }, { "epoch": 0.06, "grad_norm": 5.604425102482683, "learning_rate": 1.99889092223467e-06, "loss": 1.5366, "step": 8885 }, { "epoch": 0.06, "grad_norm": 4.853075950448697, "learning_rate": 1.998890672333794e-06, "loss": 1.3602, "step": 8886 }, { "epoch": 0.06, "grad_norm": 4.382559289220733, "learning_rate": 1.998890422404783e-06, "loss": 1.2622, "step": 8887 }, { "epoch": 0.06, "grad_norm": 4.533039206904667, "learning_rate": 1.9988901724476365e-06, "loss": 1.2871, "step": 8888 }, { "epoch": 0.06, "grad_norm": 4.448063347993088, "learning_rate": 1.998889922462354e-06, "loss": 1.3113, "step": 8889 }, { "epoch": 0.06, "grad_norm": 4.448566612499572, "learning_rate": 1.9988896724489366e-06, "loss": 1.3776, "step": 8890 }, { "epoch": 0.06, "grad_norm": 5.452867809262882, "learning_rate": 1.998889422407384e-06, "loss": 1.5249, "step": 8891 }, { "epoch": 0.06, "grad_norm": 4.599794126337971, "learning_rate": 1.9988891723376957e-06, "loss": 1.3296, "step": 8892 }, { "epoch": 0.06, "grad_norm": 5.05562711237251, "learning_rate": 1.998888922239872e-06, "loss": 1.1367, "step": 8893 }, { "epoch": 0.06, "grad_norm": 6.0460495218161485, "learning_rate": 1.998888672113913e-06, "loss": 1.2502, "step": 8894 }, { "epoch": 0.06, "grad_norm": 5.765418887979006, "learning_rate": 1.9988884219598186e-06, "loss": 1.3728, "step": 8895 }, { "epoch": 0.06, "grad_norm": 4.679369204625647, "learning_rate": 1.9988881717775885e-06, "loss": 1.2669, "step": 8896 }, { "epoch": 0.06, "grad_norm": 5.065217443313155, "learning_rate": 1.998887921567224e-06, "loss": 1.5629, "step": 8897 }, { "epoch": 0.06, "grad_norm": 4.478375261594609, "learning_rate": 1.9988876713287232e-06, "loss": 1.3335, "step": 8898 }, { "epoch": 0.06, "grad_norm": 5.407675825978634, "learning_rate": 1.998887421062087e-06, "loss": 1.4492, "step": 8899 }, { "epoch": 0.06, "grad_norm": 5.200237987636907, "learning_rate": 1.998887170767316e-06, "loss": 1.2939, "step": 8900 }, { "epoch": 0.06, "grad_norm": 4.532814601759366, "learning_rate": 1.99888692044441e-06, "loss": 1.3333, "step": 8901 }, { "epoch": 0.06, "grad_norm": 4.465312311772244, "learning_rate": 1.9988866700933678e-06, "loss": 1.4924, "step": 8902 }, { "epoch": 0.06, "grad_norm": 4.466904480519897, "learning_rate": 1.9988864197141907e-06, "loss": 1.3114, "step": 8903 }, { "epoch": 0.06, "grad_norm": 4.682485277654333, "learning_rate": 1.9988861693068785e-06, "loss": 1.3864, "step": 8904 }, { "epoch": 0.06, "grad_norm": 9.31636658719417, "learning_rate": 1.9988859188714305e-06, "loss": 1.281, "step": 8905 }, { "epoch": 0.06, "grad_norm": 4.914615163990817, "learning_rate": 1.9988856684078474e-06, "loss": 1.5427, "step": 8906 }, { "epoch": 0.06, "eval_loss": 1.5543663501739502, "eval_runtime": 4.5787, "eval_samples_per_second": 1.966, "eval_steps_per_second": 1.092, "step": 8906 }, { "epoch": 0.06, "grad_norm": 4.489190529489661, "learning_rate": 1.998885417916129e-06, "loss": 1.3335, "step": 8907 }, { "epoch": 0.06, "grad_norm": 4.4726409644589085, "learning_rate": 1.9988851673962756e-06, "loss": 1.4104, "step": 8908 }, { "epoch": 0.06, "grad_norm": 4.5195462156905455, "learning_rate": 1.9988849168482865e-06, "loss": 1.309, "step": 8909 }, { "epoch": 0.06, "grad_norm": 4.652669844355484, "learning_rate": 1.9988846662721624e-06, "loss": 1.421, "step": 8910 }, { "epoch": 0.06, "grad_norm": 4.927666429060232, "learning_rate": 1.998884415667903e-06, "loss": 1.248, "step": 8911 }, { "epoch": 0.06, "grad_norm": 5.027081136036229, "learning_rate": 1.9988841650355086e-06, "loss": 1.2158, "step": 8912 }, { "epoch": 0.06, "grad_norm": 4.2101052194981, "learning_rate": 1.9988839143749784e-06, "loss": 1.3401, "step": 8913 }, { "epoch": 0.06, "grad_norm": 4.366447884123436, "learning_rate": 1.9988836636863137e-06, "loss": 1.4023, "step": 8914 }, { "epoch": 0.06, "grad_norm": 4.603617986530109, "learning_rate": 1.998883412969513e-06, "loss": 1.3939, "step": 8915 }, { "epoch": 0.06, "grad_norm": 4.781767465595025, "learning_rate": 1.9988831622245774e-06, "loss": 1.4682, "step": 8916 }, { "epoch": 0.06, "grad_norm": 5.207559511450623, "learning_rate": 1.9988829114515067e-06, "loss": 1.2992, "step": 8917 }, { "epoch": 0.06, "grad_norm": 10.133034512086452, "learning_rate": 1.9988826606503005e-06, "loss": 1.3581, "step": 8918 }, { "epoch": 0.06, "grad_norm": 4.281864915731136, "learning_rate": 1.9988824098209597e-06, "loss": 1.3436, "step": 8919 }, { "epoch": 0.06, "grad_norm": 4.361095459871603, "learning_rate": 1.9988821589634834e-06, "loss": 1.3927, "step": 8920 }, { "epoch": 0.06, "grad_norm": 4.3010799083719835, "learning_rate": 1.998881908077871e-06, "loss": 1.3578, "step": 8921 }, { "epoch": 0.06, "grad_norm": 4.1188084511681655, "learning_rate": 1.998881657164125e-06, "loss": 1.2002, "step": 8922 }, { "epoch": 0.06, "grad_norm": 4.5025913689646835, "learning_rate": 1.998881406222243e-06, "loss": 1.329, "step": 8923 }, { "epoch": 0.06, "grad_norm": 4.529673633173111, "learning_rate": 1.9988811552522256e-06, "loss": 1.3608, "step": 8924 }, { "epoch": 0.06, "grad_norm": 4.569770946380013, "learning_rate": 1.9988809042540737e-06, "loss": 1.2989, "step": 8925 }, { "epoch": 0.06, "grad_norm": 5.557240589904327, "learning_rate": 1.998880653227786e-06, "loss": 1.1284, "step": 8926 }, { "epoch": 0.06, "grad_norm": 4.28813762250133, "learning_rate": 1.9988804021733634e-06, "loss": 1.396, "step": 8927 }, { "epoch": 0.06, "grad_norm": 5.400184757435649, "learning_rate": 1.998880151090806e-06, "loss": 1.2865, "step": 8928 }, { "epoch": 0.06, "grad_norm": 6.339021754186371, "learning_rate": 1.998879899980113e-06, "loss": 1.3962, "step": 8929 }, { "epoch": 0.06, "grad_norm": 4.314780556483214, "learning_rate": 1.998879648841285e-06, "loss": 1.4211, "step": 8930 }, { "epoch": 0.06, "grad_norm": 4.594912157470313, "learning_rate": 1.998879397674322e-06, "loss": 1.4905, "step": 8931 }, { "epoch": 0.06, "grad_norm": 5.106468959389977, "learning_rate": 1.9988791464792237e-06, "loss": 1.5001, "step": 8932 }, { "epoch": 0.06, "grad_norm": 4.4737293133162135, "learning_rate": 1.9988788952559905e-06, "loss": 1.399, "step": 8933 }, { "epoch": 0.06, "grad_norm": 4.652228634028416, "learning_rate": 1.998878644004622e-06, "loss": 1.3587, "step": 8934 }, { "epoch": 0.06, "grad_norm": 4.45502457916985, "learning_rate": 1.9988783927251187e-06, "loss": 1.4417, "step": 8935 }, { "epoch": 0.06, "grad_norm": 4.445395169182832, "learning_rate": 1.99887814141748e-06, "loss": 1.346, "step": 8936 }, { "epoch": 0.06, "grad_norm": 5.132407819507094, "learning_rate": 1.998877890081706e-06, "loss": 1.4948, "step": 8937 }, { "epoch": 0.06, "grad_norm": 4.588242467876485, "learning_rate": 1.9988776387177973e-06, "loss": 1.345, "step": 8938 }, { "epoch": 0.06, "grad_norm": 6.067830352222538, "learning_rate": 1.998877387325754e-06, "loss": 1.5489, "step": 8939 }, { "epoch": 0.06, "grad_norm": 4.714850688819917, "learning_rate": 1.998877135905575e-06, "loss": 1.3105, "step": 8940 }, { "epoch": 0.06, "grad_norm": 4.1786756721600335, "learning_rate": 1.998876884457261e-06, "loss": 1.3166, "step": 8941 }, { "epoch": 0.06, "grad_norm": 4.459295861189716, "learning_rate": 1.998876632980812e-06, "loss": 1.3105, "step": 8942 }, { "epoch": 0.06, "grad_norm": 4.2369627871224065, "learning_rate": 1.9988763814762284e-06, "loss": 1.0185, "step": 8943 }, { "epoch": 0.06, "grad_norm": 4.479324234333639, "learning_rate": 1.9988761299435093e-06, "loss": 1.3638, "step": 8944 }, { "epoch": 0.06, "grad_norm": 5.447393268254241, "learning_rate": 1.998875878382655e-06, "loss": 1.2788, "step": 8945 }, { "epoch": 0.06, "grad_norm": 4.5828523366326595, "learning_rate": 1.9988756267936664e-06, "loss": 1.3339, "step": 8946 }, { "epoch": 0.06, "grad_norm": 4.325089835344213, "learning_rate": 1.998875375176542e-06, "loss": 1.2877, "step": 8947 }, { "epoch": 0.06, "grad_norm": 4.564531541461375, "learning_rate": 1.9988751235312833e-06, "loss": 1.2966, "step": 8948 }, { "epoch": 0.06, "grad_norm": 4.296073373679613, "learning_rate": 1.9988748718578894e-06, "loss": 1.4188, "step": 8949 }, { "epoch": 0.06, "grad_norm": 4.32396496786447, "learning_rate": 1.9988746201563605e-06, "loss": 1.4169, "step": 8950 }, { "epoch": 0.06, "grad_norm": 4.138408818532786, "learning_rate": 1.9988743684266965e-06, "loss": 1.2979, "step": 8951 }, { "epoch": 0.06, "grad_norm": 4.232780116774831, "learning_rate": 1.9988741166688974e-06, "loss": 1.2763, "step": 8952 }, { "epoch": 0.06, "grad_norm": 4.601664517313853, "learning_rate": 1.9988738648829638e-06, "loss": 1.3497, "step": 8953 }, { "epoch": 0.06, "grad_norm": 6.916078946149807, "learning_rate": 1.9988736130688946e-06, "loss": 1.2813, "step": 8954 }, { "epoch": 0.06, "grad_norm": 4.5223440926608065, "learning_rate": 1.9988733612266913e-06, "loss": 1.3428, "step": 8955 }, { "epoch": 0.06, "grad_norm": 4.11933876690971, "learning_rate": 1.9988731093563525e-06, "loss": 1.1801, "step": 8956 }, { "epoch": 0.06, "grad_norm": 4.77892602384412, "learning_rate": 1.9988728574578786e-06, "loss": 1.3533, "step": 8957 }, { "epoch": 0.06, "grad_norm": 4.491787013798658, "learning_rate": 1.99887260553127e-06, "loss": 1.3315, "step": 8958 }, { "epoch": 0.06, "grad_norm": 4.463416607886815, "learning_rate": 1.9988723535765266e-06, "loss": 1.3055, "step": 8959 }, { "epoch": 0.06, "grad_norm": 4.466109919275049, "learning_rate": 1.998872101593648e-06, "loss": 1.336, "step": 8960 }, { "epoch": 0.06, "grad_norm": 5.08836560764422, "learning_rate": 1.9988718495826353e-06, "loss": 1.4302, "step": 8961 }, { "epoch": 0.06, "grad_norm": 4.711433007796799, "learning_rate": 1.998871597543487e-06, "loss": 1.3982, "step": 8962 }, { "epoch": 0.06, "grad_norm": 4.368650154835614, "learning_rate": 1.998871345476204e-06, "loss": 1.2549, "step": 8963 }, { "epoch": 0.06, "grad_norm": 4.796732836900713, "learning_rate": 1.998871093380786e-06, "loss": 1.3864, "step": 8964 }, { "epoch": 0.06, "grad_norm": 4.238552048880954, "learning_rate": 1.998870841257233e-06, "loss": 1.4041, "step": 8965 }, { "epoch": 0.06, "grad_norm": 4.842170994219262, "learning_rate": 1.998870589105545e-06, "loss": 1.317, "step": 8966 }, { "epoch": 0.06, "grad_norm": 4.452796361111205, "learning_rate": 1.9988703369257223e-06, "loss": 1.4161, "step": 8967 }, { "epoch": 0.06, "grad_norm": 4.788521128772244, "learning_rate": 1.998870084717765e-06, "loss": 1.3019, "step": 8968 }, { "epoch": 0.06, "grad_norm": 4.328572650147352, "learning_rate": 1.9988698324816728e-06, "loss": 1.2451, "step": 8969 }, { "epoch": 0.06, "grad_norm": 10.840972499942911, "learning_rate": 1.9988695802174454e-06, "loss": 1.4126, "step": 8970 }, { "epoch": 0.06, "grad_norm": 4.288973331266906, "learning_rate": 1.9988693279250835e-06, "loss": 1.2919, "step": 8971 }, { "epoch": 0.06, "grad_norm": 4.059719407412008, "learning_rate": 1.998869075604587e-06, "loss": 1.2728, "step": 8972 }, { "epoch": 0.06, "grad_norm": 6.451005985306049, "learning_rate": 1.9988688232559553e-06, "loss": 1.6012, "step": 8973 }, { "epoch": 0.06, "grad_norm": 4.3410327220214855, "learning_rate": 1.998868570879189e-06, "loss": 1.2263, "step": 8974 }, { "epoch": 0.06, "grad_norm": 3.999191136292253, "learning_rate": 1.9988683184742877e-06, "loss": 1.2986, "step": 8975 }, { "epoch": 0.06, "grad_norm": 4.572950767432566, "learning_rate": 1.998868066041252e-06, "loss": 1.4045, "step": 8976 }, { "epoch": 0.06, "grad_norm": 4.767952435722493, "learning_rate": 1.998867813580081e-06, "loss": 1.5217, "step": 8977 }, { "epoch": 0.06, "grad_norm": 5.11152585487058, "learning_rate": 1.9988675610907752e-06, "loss": 1.4652, "step": 8978 }, { "epoch": 0.06, "grad_norm": 4.518933749702347, "learning_rate": 1.998867308573335e-06, "loss": 1.5219, "step": 8979 }, { "epoch": 0.06, "eval_loss": 1.5553007125854492, "eval_runtime": 4.6042, "eval_samples_per_second": 1.955, "eval_steps_per_second": 1.086, "step": 8979 }, { "epoch": 0.06, "grad_norm": 5.60210528703573, "learning_rate": 1.9988670560277598e-06, "loss": 1.506, "step": 8980 }, { "epoch": 0.06, "grad_norm": 4.660456001265746, "learning_rate": 1.99886680345405e-06, "loss": 1.3371, "step": 8981 }, { "epoch": 0.06, "grad_norm": 4.851341565771996, "learning_rate": 1.998866550852205e-06, "loss": 1.3738, "step": 8982 }, { "epoch": 0.06, "grad_norm": 4.79938957161957, "learning_rate": 1.998866298222226e-06, "loss": 1.116, "step": 8983 }, { "epoch": 0.06, "grad_norm": 4.38864501046462, "learning_rate": 1.9988660455641117e-06, "loss": 1.2905, "step": 8984 }, { "epoch": 0.06, "grad_norm": 4.255559335307141, "learning_rate": 1.998865792877863e-06, "loss": 1.4645, "step": 8985 }, { "epoch": 0.06, "grad_norm": 4.394914108937541, "learning_rate": 1.998865540163479e-06, "loss": 1.3422, "step": 8986 }, { "epoch": 0.06, "grad_norm": 4.549130965535259, "learning_rate": 1.9988652874209606e-06, "loss": 1.3085, "step": 8987 }, { "epoch": 0.06, "grad_norm": 4.313659621286637, "learning_rate": 1.998865034650308e-06, "loss": 1.26, "step": 8988 }, { "epoch": 0.06, "grad_norm": 4.489065298698566, "learning_rate": 1.99886478185152e-06, "loss": 1.3966, "step": 8989 }, { "epoch": 0.06, "grad_norm": 4.267065028983889, "learning_rate": 1.9988645290245974e-06, "loss": 1.3753, "step": 8990 }, { "epoch": 0.06, "grad_norm": 4.947449398550815, "learning_rate": 1.99886427616954e-06, "loss": 1.4579, "step": 8991 }, { "epoch": 0.06, "grad_norm": 5.558811076207193, "learning_rate": 1.9988640232863485e-06, "loss": 1.3795, "step": 8992 }, { "epoch": 0.06, "grad_norm": 5.0790335795573895, "learning_rate": 1.998863770375022e-06, "loss": 1.4484, "step": 8993 }, { "epoch": 0.06, "grad_norm": 5.0138216757250476, "learning_rate": 1.998863517435561e-06, "loss": 1.3437, "step": 8994 }, { "epoch": 0.06, "grad_norm": 4.5056106488928895, "learning_rate": 1.9988632644679647e-06, "loss": 1.4137, "step": 8995 }, { "epoch": 0.06, "grad_norm": 4.602550611027919, "learning_rate": 1.9988630114722346e-06, "loss": 1.4645, "step": 8996 }, { "epoch": 0.06, "grad_norm": 4.661575315677727, "learning_rate": 1.998862758448369e-06, "loss": 1.3124, "step": 8997 }, { "epoch": 0.06, "grad_norm": 4.351667010009662, "learning_rate": 1.9988625053963694e-06, "loss": 1.4001, "step": 8998 }, { "epoch": 0.06, "grad_norm": 4.483493069528355, "learning_rate": 1.9988622523162354e-06, "loss": 1.477, "step": 8999 }, { "epoch": 0.06, "grad_norm": 4.167612226402738, "learning_rate": 1.998861999207966e-06, "loss": 1.2781, "step": 9000 }, { "epoch": 0.06, "grad_norm": 4.750547480446069, "learning_rate": 1.9988617460715624e-06, "loss": 1.3683, "step": 9001 }, { "epoch": 0.06, "grad_norm": 5.292162839259224, "learning_rate": 1.998861492907024e-06, "loss": 1.5366, "step": 9002 }, { "epoch": 0.06, "grad_norm": 4.609356748192954, "learning_rate": 1.9988612397143513e-06, "loss": 1.2481, "step": 9003 }, { "epoch": 0.06, "grad_norm": 4.554353075504676, "learning_rate": 1.998860986493544e-06, "loss": 1.2943, "step": 9004 }, { "epoch": 0.06, "grad_norm": 4.4289470785945095, "learning_rate": 1.9988607332446017e-06, "loss": 1.3443, "step": 9005 }, { "epoch": 0.06, "grad_norm": 4.600551169801115, "learning_rate": 1.998860479967525e-06, "loss": 1.4291, "step": 9006 }, { "epoch": 0.06, "grad_norm": 4.668299697183073, "learning_rate": 1.9988602266623137e-06, "loss": 1.5173, "step": 9007 }, { "epoch": 0.06, "grad_norm": 4.231037382439115, "learning_rate": 1.998859973328968e-06, "loss": 1.3386, "step": 9008 }, { "epoch": 0.06, "grad_norm": 6.297794802363379, "learning_rate": 1.9988597199674875e-06, "loss": 1.4288, "step": 9009 }, { "epoch": 0.06, "grad_norm": 4.868180986069198, "learning_rate": 1.9988594665778727e-06, "loss": 1.2063, "step": 9010 }, { "epoch": 0.06, "grad_norm": 5.362600955403026, "learning_rate": 1.9988592131601233e-06, "loss": 1.3834, "step": 9011 }, { "epoch": 0.06, "grad_norm": 4.407803982727705, "learning_rate": 1.9988589597142393e-06, "loss": 1.417, "step": 9012 }, { "epoch": 0.06, "grad_norm": 4.7563480578361075, "learning_rate": 1.9988587062402206e-06, "loss": 1.594, "step": 9013 }, { "epoch": 0.06, "grad_norm": 4.646984395306063, "learning_rate": 1.9988584527380678e-06, "loss": 1.3546, "step": 9014 }, { "epoch": 0.06, "grad_norm": 4.6616927620878394, "learning_rate": 1.99885819920778e-06, "loss": 1.4772, "step": 9015 }, { "epoch": 0.06, "grad_norm": 4.578605251102135, "learning_rate": 1.9988579456493577e-06, "loss": 1.5212, "step": 9016 }, { "epoch": 0.06, "grad_norm": 4.539410903890842, "learning_rate": 1.9988576920628014e-06, "loss": 1.3062, "step": 9017 }, { "epoch": 0.06, "grad_norm": 4.06144243687616, "learning_rate": 1.99885743844811e-06, "loss": 1.2502, "step": 9018 }, { "epoch": 0.06, "grad_norm": 5.440801598505531, "learning_rate": 1.9988571848052845e-06, "loss": 1.3371, "step": 9019 }, { "epoch": 0.06, "grad_norm": 4.586879669813342, "learning_rate": 1.9988569311343244e-06, "loss": 1.3869, "step": 9020 }, { "epoch": 0.06, "grad_norm": 5.482282733225696, "learning_rate": 1.99885667743523e-06, "loss": 1.404, "step": 9021 }, { "epoch": 0.06, "grad_norm": 5.031821009158009, "learning_rate": 1.9988564237080006e-06, "loss": 1.6249, "step": 9022 }, { "epoch": 0.06, "grad_norm": 4.962595085343715, "learning_rate": 1.9988561699526374e-06, "loss": 1.3203, "step": 9023 }, { "epoch": 0.06, "grad_norm": 5.694113615090797, "learning_rate": 1.9988559161691396e-06, "loss": 1.3867, "step": 9024 }, { "epoch": 0.06, "grad_norm": 4.742365519040978, "learning_rate": 1.9988556623575067e-06, "loss": 1.4539, "step": 9025 }, { "epoch": 0.06, "grad_norm": 4.830335538176732, "learning_rate": 1.99885540851774e-06, "loss": 1.3798, "step": 9026 }, { "epoch": 0.06, "grad_norm": 5.276387047185486, "learning_rate": 1.998855154649839e-06, "loss": 1.4987, "step": 9027 }, { "epoch": 0.06, "grad_norm": 4.853044083121314, "learning_rate": 1.9988549007538033e-06, "loss": 1.3321, "step": 9028 }, { "epoch": 0.06, "grad_norm": 4.284981764679088, "learning_rate": 1.998854646829633e-06, "loss": 0.9964, "step": 9029 }, { "epoch": 0.06, "grad_norm": 4.346686675214998, "learning_rate": 1.9988543928773285e-06, "loss": 1.4307, "step": 9030 }, { "epoch": 0.06, "grad_norm": 4.495526724877368, "learning_rate": 1.9988541388968896e-06, "loss": 1.4653, "step": 9031 }, { "epoch": 0.06, "grad_norm": 4.239632068789658, "learning_rate": 1.998853884888316e-06, "loss": 1.3582, "step": 9032 }, { "epoch": 0.06, "grad_norm": 4.98748674096309, "learning_rate": 1.9988536308516084e-06, "loss": 1.2501, "step": 9033 }, { "epoch": 0.06, "grad_norm": 5.556394646926006, "learning_rate": 1.998853376786766e-06, "loss": 1.1305, "step": 9034 }, { "epoch": 0.06, "grad_norm": 4.346098874865625, "learning_rate": 1.99885312269379e-06, "loss": 1.2558, "step": 9035 }, { "epoch": 0.06, "grad_norm": 4.494707023554734, "learning_rate": 1.9988528685726787e-06, "loss": 1.3644, "step": 9036 }, { "epoch": 0.06, "grad_norm": 6.874976902701597, "learning_rate": 1.9988526144234338e-06, "loss": 1.3911, "step": 9037 }, { "epoch": 0.06, "grad_norm": 5.248752040120656, "learning_rate": 1.998852360246054e-06, "loss": 1.4355, "step": 9038 }, { "epoch": 0.06, "grad_norm": 5.165077934887538, "learning_rate": 1.99885210604054e-06, "loss": 1.4702, "step": 9039 }, { "epoch": 0.06, "grad_norm": 4.980124759726575, "learning_rate": 1.9988518518068916e-06, "loss": 1.3591, "step": 9040 }, { "epoch": 0.06, "grad_norm": 4.67548279943575, "learning_rate": 1.9988515975451094e-06, "loss": 1.3922, "step": 9041 }, { "epoch": 0.06, "grad_norm": 4.63766518016973, "learning_rate": 1.9988513432551922e-06, "loss": 1.4185, "step": 9042 }, { "epoch": 0.06, "grad_norm": 4.782559994653726, "learning_rate": 1.998851088937141e-06, "loss": 1.3569, "step": 9043 }, { "epoch": 0.06, "grad_norm": 4.357415285535007, "learning_rate": 1.998850834590955e-06, "loss": 1.3202, "step": 9044 }, { "epoch": 0.06, "grad_norm": 4.189087610099267, "learning_rate": 1.9988505802166354e-06, "loss": 1.301, "step": 9045 }, { "epoch": 0.06, "grad_norm": 4.570560161610986, "learning_rate": 1.9988503258141814e-06, "loss": 1.3757, "step": 9046 }, { "epoch": 0.06, "grad_norm": 4.307296931492565, "learning_rate": 1.9988500713835927e-06, "loss": 1.4011, "step": 9047 }, { "epoch": 0.06, "grad_norm": 3.9675940034991855, "learning_rate": 1.9988498169248703e-06, "loss": 1.1557, "step": 9048 }, { "epoch": 0.06, "grad_norm": 4.291378655674192, "learning_rate": 1.9988495624380133e-06, "loss": 1.324, "step": 9049 }, { "epoch": 0.06, "grad_norm": 8.532552952167226, "learning_rate": 1.9988493079230216e-06, "loss": 1.4429, "step": 9050 }, { "epoch": 0.06, "grad_norm": 5.493875214686658, "learning_rate": 1.998849053379896e-06, "loss": 1.5635, "step": 9051 }, { "epoch": 0.06, "grad_norm": 4.792197174638157, "learning_rate": 1.9988487988086365e-06, "loss": 1.3605, "step": 9052 }, { "epoch": 0.06, "eval_loss": 1.5568945407867432, "eval_runtime": 4.5932, "eval_samples_per_second": 1.959, "eval_steps_per_second": 1.089, "step": 9052 }, { "epoch": 0.06, "grad_norm": 4.6190225029601875, "learning_rate": 1.9988485442092423e-06, "loss": 1.2421, "step": 9053 }, { "epoch": 0.06, "grad_norm": 4.535802137716677, "learning_rate": 1.998848289581714e-06, "loss": 1.3614, "step": 9054 }, { "epoch": 0.06, "grad_norm": 8.31164491897128, "learning_rate": 1.9988480349260516e-06, "loss": 1.2074, "step": 9055 }, { "epoch": 0.06, "grad_norm": 4.1293617867803825, "learning_rate": 1.9988477802422547e-06, "loss": 1.2049, "step": 9056 }, { "epoch": 0.06, "grad_norm": 5.348364919042677, "learning_rate": 1.9988475255303237e-06, "loss": 1.4955, "step": 9057 }, { "epoch": 0.06, "grad_norm": 4.803246772346318, "learning_rate": 1.9988472707902584e-06, "loss": 1.3898, "step": 9058 }, { "epoch": 0.06, "grad_norm": 5.098680525051035, "learning_rate": 1.9988470160220594e-06, "loss": 1.5915, "step": 9059 }, { "epoch": 0.06, "grad_norm": 5.970280760389105, "learning_rate": 1.9988467612257253e-06, "loss": 1.313, "step": 9060 }, { "epoch": 0.06, "grad_norm": 4.667421976546424, "learning_rate": 1.9988465064012575e-06, "loss": 1.3201, "step": 9061 }, { "epoch": 0.06, "grad_norm": 7.401429763170494, "learning_rate": 1.998846251548656e-06, "loss": 1.3229, "step": 9062 }, { "epoch": 0.06, "grad_norm": 4.785533056094324, "learning_rate": 1.9988459966679196e-06, "loss": 1.3631, "step": 9063 }, { "epoch": 0.06, "grad_norm": 4.487519182889941, "learning_rate": 1.998845741759049e-06, "loss": 1.437, "step": 9064 }, { "epoch": 0.06, "grad_norm": 5.13642628749504, "learning_rate": 1.9988454868220445e-06, "loss": 1.37, "step": 9065 }, { "epoch": 0.06, "grad_norm": 4.9419697819098625, "learning_rate": 1.998845231856906e-06, "loss": 1.3161, "step": 9066 }, { "epoch": 0.06, "grad_norm": 4.420915488648716, "learning_rate": 1.9988449768636334e-06, "loss": 1.373, "step": 9067 }, { "epoch": 0.06, "grad_norm": 4.221702918453699, "learning_rate": 1.9988447218422266e-06, "loss": 1.2426, "step": 9068 }, { "epoch": 0.06, "grad_norm": 4.740525474146342, "learning_rate": 1.998844466792685e-06, "loss": 1.2266, "step": 9069 }, { "epoch": 0.06, "grad_norm": 5.090429617574333, "learning_rate": 1.99884421171501e-06, "loss": 1.3431, "step": 9070 }, { "epoch": 0.06, "grad_norm": 4.715382170229637, "learning_rate": 1.998843956609201e-06, "loss": 1.3327, "step": 9071 }, { "epoch": 0.06, "grad_norm": 4.4022969934206895, "learning_rate": 1.9988437014752573e-06, "loss": 1.3137, "step": 9072 }, { "epoch": 0.06, "grad_norm": 5.181835744906127, "learning_rate": 1.99884344631318e-06, "loss": 1.549, "step": 9073 }, { "epoch": 0.06, "grad_norm": 4.819269662989522, "learning_rate": 1.9988431911229683e-06, "loss": 1.282, "step": 9074 }, { "epoch": 0.06, "grad_norm": 4.363681009387404, "learning_rate": 1.9988429359046225e-06, "loss": 1.4101, "step": 9075 }, { "epoch": 0.06, "grad_norm": 4.7055582389899175, "learning_rate": 1.9988426806581425e-06, "loss": 1.4084, "step": 9076 }, { "epoch": 0.06, "grad_norm": 4.124273417076292, "learning_rate": 1.9988424253835288e-06, "loss": 1.2397, "step": 9077 }, { "epoch": 0.06, "grad_norm": 5.231691072861776, "learning_rate": 1.998842170080781e-06, "loss": 1.4207, "step": 9078 }, { "epoch": 0.06, "grad_norm": 5.1471474759237426, "learning_rate": 1.998841914749899e-06, "loss": 1.514, "step": 9079 }, { "epoch": 0.06, "grad_norm": 4.661508206419235, "learning_rate": 1.9988416593908827e-06, "loss": 1.4136, "step": 9080 }, { "epoch": 0.06, "grad_norm": 4.768479744757336, "learning_rate": 1.9988414040037325e-06, "loss": 1.3385, "step": 9081 }, { "epoch": 0.06, "grad_norm": 4.741697760571189, "learning_rate": 1.9988411485884486e-06, "loss": 1.4071, "step": 9082 }, { "epoch": 0.06, "grad_norm": 4.074112572353006, "learning_rate": 1.99884089314503e-06, "loss": 1.2509, "step": 9083 }, { "epoch": 0.06, "grad_norm": 6.175344434197922, "learning_rate": 1.998840637673478e-06, "loss": 1.3827, "step": 9084 }, { "epoch": 0.06, "grad_norm": 6.016929703788691, "learning_rate": 1.998840382173792e-06, "loss": 1.3087, "step": 9085 }, { "epoch": 0.06, "grad_norm": 6.244994069291114, "learning_rate": 1.9988401266459718e-06, "loss": 1.4782, "step": 9086 }, { "epoch": 0.06, "grad_norm": 4.3925647532790215, "learning_rate": 1.9988398710900173e-06, "loss": 1.2169, "step": 9087 }, { "epoch": 0.06, "grad_norm": 4.522437735505425, "learning_rate": 1.998839615505929e-06, "loss": 1.4083, "step": 9088 }, { "epoch": 0.06, "grad_norm": 4.819405069864338, "learning_rate": 1.998839359893707e-06, "loss": 1.2948, "step": 9089 }, { "epoch": 0.06, "grad_norm": 4.763161479989202, "learning_rate": 1.9988391042533507e-06, "loss": 1.2271, "step": 9090 }, { "epoch": 0.06, "grad_norm": 4.565434660115241, "learning_rate": 1.9988388485848607e-06, "loss": 1.3826, "step": 9091 }, { "epoch": 0.06, "grad_norm": 5.219237169197936, "learning_rate": 1.9988385928882365e-06, "loss": 1.2761, "step": 9092 }, { "epoch": 0.06, "grad_norm": 6.063411684504128, "learning_rate": 1.998838337163478e-06, "loss": 1.3223, "step": 9093 }, { "epoch": 0.06, "grad_norm": 4.479134736791678, "learning_rate": 1.9988380814105863e-06, "loss": 1.3446, "step": 9094 }, { "epoch": 0.06, "grad_norm": 5.422095452244349, "learning_rate": 1.9988378256295603e-06, "loss": 1.3384, "step": 9095 }, { "epoch": 0.06, "grad_norm": 4.84737701951572, "learning_rate": 1.9988375698204e-06, "loss": 1.2363, "step": 9096 }, { "epoch": 0.06, "grad_norm": 4.382312669869536, "learning_rate": 1.9988373139831066e-06, "loss": 1.4566, "step": 9097 }, { "epoch": 0.06, "grad_norm": 4.013224196049158, "learning_rate": 1.9988370581176785e-06, "loss": 1.3884, "step": 9098 }, { "epoch": 0.06, "grad_norm": 4.818846037522861, "learning_rate": 1.998836802224117e-06, "loss": 1.2727, "step": 9099 }, { "epoch": 0.06, "grad_norm": 4.407544132466331, "learning_rate": 1.9988365463024213e-06, "loss": 1.3083, "step": 9100 }, { "epoch": 0.06, "grad_norm": 4.944094663921774, "learning_rate": 1.9988362903525914e-06, "loss": 1.4612, "step": 9101 }, { "epoch": 0.06, "grad_norm": 4.6406257423464705, "learning_rate": 1.998836034374628e-06, "loss": 1.2318, "step": 9102 }, { "epoch": 0.06, "grad_norm": 4.484612273315136, "learning_rate": 1.998835778368531e-06, "loss": 1.2144, "step": 9103 }, { "epoch": 0.06, "grad_norm": 6.531856078046369, "learning_rate": 1.9988355223342995e-06, "loss": 1.2987, "step": 9104 }, { "epoch": 0.06, "grad_norm": 4.655267378407986, "learning_rate": 1.9988352662719345e-06, "loss": 1.5162, "step": 9105 }, { "epoch": 0.06, "grad_norm": 5.095374997853681, "learning_rate": 1.9988350101814357e-06, "loss": 1.3566, "step": 9106 }, { "epoch": 0.06, "grad_norm": 5.857680330258707, "learning_rate": 1.9988347540628027e-06, "loss": 1.5086, "step": 9107 }, { "epoch": 0.06, "grad_norm": 4.653106006016631, "learning_rate": 1.9988344979160364e-06, "loss": 1.3639, "step": 9108 }, { "epoch": 0.06, "grad_norm": 4.441432248123494, "learning_rate": 1.998834241741136e-06, "loss": 1.3261, "step": 9109 }, { "epoch": 0.06, "grad_norm": 5.087121192776212, "learning_rate": 1.998833985538101e-06, "loss": 1.4353, "step": 9110 }, { "epoch": 0.06, "grad_norm": 4.11190563382986, "learning_rate": 1.9988337293069335e-06, "loss": 1.2746, "step": 9111 }, { "epoch": 0.06, "grad_norm": 6.716606119826262, "learning_rate": 1.9988334730476312e-06, "loss": 1.3709, "step": 9112 }, { "epoch": 0.06, "grad_norm": 4.630104371625799, "learning_rate": 1.998833216760195e-06, "loss": 1.3448, "step": 9113 }, { "epoch": 0.06, "grad_norm": 4.941266886240506, "learning_rate": 1.9988329604446258e-06, "loss": 1.2529, "step": 9114 }, { "epoch": 0.06, "grad_norm": 4.663865704742076, "learning_rate": 1.998832704100922e-06, "loss": 1.3354, "step": 9115 }, { "epoch": 0.06, "grad_norm": 4.614423454119124, "learning_rate": 1.998832447729085e-06, "loss": 1.5157, "step": 9116 }, { "epoch": 0.06, "grad_norm": 4.493157455094239, "learning_rate": 1.998832191329114e-06, "loss": 1.3067, "step": 9117 }, { "epoch": 0.06, "grad_norm": 4.574117095039273, "learning_rate": 1.998831934901009e-06, "loss": 1.3158, "step": 9118 }, { "epoch": 0.06, "grad_norm": 4.779692251344268, "learning_rate": 1.9988316784447704e-06, "loss": 1.2858, "step": 9119 }, { "epoch": 0.06, "grad_norm": 4.242701008375293, "learning_rate": 1.9988314219603983e-06, "loss": 1.3797, "step": 9120 }, { "epoch": 0.06, "grad_norm": 10.034828223330901, "learning_rate": 1.998831165447892e-06, "loss": 1.2989, "step": 9121 }, { "epoch": 0.06, "grad_norm": 4.673336156658645, "learning_rate": 1.9988309089072525e-06, "loss": 1.37, "step": 9122 }, { "epoch": 0.06, "grad_norm": 4.247683852114555, "learning_rate": 1.9988306523384786e-06, "loss": 1.3382, "step": 9123 }, { "epoch": 0.06, "grad_norm": 4.7928360311565905, "learning_rate": 1.9988303957415715e-06, "loss": 1.4073, "step": 9124 }, { "epoch": 0.06, "grad_norm": 5.7822777633284845, "learning_rate": 1.9988301391165305e-06, "loss": 1.4012, "step": 9125 }, { "epoch": 0.06, "eval_loss": 1.5588743686676025, "eval_runtime": 4.5701, "eval_samples_per_second": 1.969, "eval_steps_per_second": 1.094, "step": 9125 }, { "epoch": 0.06, "grad_norm": 4.226827420184808, "learning_rate": 1.9988298824633554e-06, "loss": 1.2245, "step": 9126 }, { "epoch": 0.06, "grad_norm": 5.011844815348574, "learning_rate": 1.9988296257820473e-06, "loss": 1.34, "step": 9127 }, { "epoch": 0.06, "grad_norm": 4.941820242282499, "learning_rate": 1.998829369072605e-06, "loss": 1.507, "step": 9128 }, { "epoch": 0.06, "grad_norm": 4.600639367810583, "learning_rate": 1.998829112335029e-06, "loss": 1.3989, "step": 9129 }, { "epoch": 0.06, "grad_norm": 4.406622869695, "learning_rate": 1.998828855569319e-06, "loss": 1.4553, "step": 9130 }, { "epoch": 0.06, "grad_norm": 4.252996143827648, "learning_rate": 1.9988285987754765e-06, "loss": 1.3364, "step": 9131 }, { "epoch": 0.06, "grad_norm": 4.821985360292999, "learning_rate": 1.998828341953499e-06, "loss": 1.3659, "step": 9132 }, { "epoch": 0.06, "grad_norm": 6.094220456286474, "learning_rate": 1.9988280851033884e-06, "loss": 1.4298, "step": 9133 }, { "epoch": 0.06, "grad_norm": 4.970492624145565, "learning_rate": 1.9988278282251443e-06, "loss": 1.3314, "step": 9134 }, { "epoch": 0.06, "grad_norm": 4.597286782721286, "learning_rate": 1.9988275713187664e-06, "loss": 1.1891, "step": 9135 }, { "epoch": 0.06, "grad_norm": 4.719326172064779, "learning_rate": 1.998827314384255e-06, "loss": 1.3526, "step": 9136 }, { "epoch": 0.06, "grad_norm": 4.964344684563564, "learning_rate": 1.9988270574216094e-06, "loss": 1.3893, "step": 9137 }, { "epoch": 0.06, "grad_norm": 5.048002248308627, "learning_rate": 1.9988268004308307e-06, "loss": 1.2759, "step": 9138 }, { "epoch": 0.06, "grad_norm": 4.254350270877056, "learning_rate": 1.998826543411918e-06, "loss": 1.3175, "step": 9139 }, { "epoch": 0.06, "grad_norm": 4.5864422760779275, "learning_rate": 1.998826286364872e-06, "loss": 1.44, "step": 9140 }, { "epoch": 0.06, "grad_norm": 5.19006686791242, "learning_rate": 1.998826029289692e-06, "loss": 1.3203, "step": 9141 }, { "epoch": 0.06, "grad_norm": 4.661070933704734, "learning_rate": 1.9988257721863787e-06, "loss": 1.2694, "step": 9142 }, { "epoch": 0.06, "grad_norm": 4.813334077646508, "learning_rate": 1.998825515054932e-06, "loss": 1.4046, "step": 9143 }, { "epoch": 0.06, "grad_norm": 4.449255562761521, "learning_rate": 1.9988252578953514e-06, "loss": 1.3647, "step": 9144 }, { "epoch": 0.06, "grad_norm": 4.075167876070271, "learning_rate": 1.998825000707637e-06, "loss": 1.1863, "step": 9145 }, { "epoch": 0.06, "grad_norm": 4.74127015718803, "learning_rate": 1.9988247434917893e-06, "loss": 1.1165, "step": 9146 }, { "epoch": 0.06, "grad_norm": 6.840163155762476, "learning_rate": 1.998824486247808e-06, "loss": 1.4104, "step": 9147 }, { "epoch": 0.06, "grad_norm": 5.102885647246682, "learning_rate": 1.9988242289756935e-06, "loss": 1.4741, "step": 9148 }, { "epoch": 0.06, "grad_norm": 4.380287801595398, "learning_rate": 1.998823971675445e-06, "loss": 1.3417, "step": 9149 }, { "epoch": 0.06, "grad_norm": 4.387236463016363, "learning_rate": 1.998823714347063e-06, "loss": 1.3466, "step": 9150 }, { "epoch": 0.06, "grad_norm": 7.42239010672559, "learning_rate": 1.9988234569905472e-06, "loss": 1.2259, "step": 9151 }, { "epoch": 0.06, "grad_norm": 4.523996935259947, "learning_rate": 1.9988231996058986e-06, "loss": 1.3438, "step": 9152 }, { "epoch": 0.06, "grad_norm": 4.912178189540081, "learning_rate": 1.998822942193116e-06, "loss": 1.384, "step": 9153 }, { "epoch": 0.06, "grad_norm": 4.646735695375992, "learning_rate": 1.9988226847522e-06, "loss": 1.3713, "step": 9154 }, { "epoch": 0.06, "grad_norm": 4.220331488041842, "learning_rate": 1.9988224272831504e-06, "loss": 1.3364, "step": 9155 }, { "epoch": 0.06, "grad_norm": 4.917934915293237, "learning_rate": 1.998822169785967e-06, "loss": 1.3533, "step": 9156 }, { "epoch": 0.06, "grad_norm": 4.136267104770313, "learning_rate": 1.9988219122606504e-06, "loss": 1.2179, "step": 9157 }, { "epoch": 0.06, "grad_norm": 4.418373349181749, "learning_rate": 1.9988216547072003e-06, "loss": 1.332, "step": 9158 }, { "epoch": 0.06, "grad_norm": 4.359669714393945, "learning_rate": 1.9988213971256165e-06, "loss": 1.4237, "step": 9159 }, { "epoch": 0.06, "grad_norm": 4.908105067327183, "learning_rate": 1.9988211395158998e-06, "loss": 1.4135, "step": 9160 }, { "epoch": 0.06, "grad_norm": 4.318013319108017, "learning_rate": 1.9988208818780492e-06, "loss": 1.2633, "step": 9161 }, { "epoch": 0.06, "grad_norm": 4.402614559006522, "learning_rate": 1.9988206242120654e-06, "loss": 1.4503, "step": 9162 }, { "epoch": 0.06, "grad_norm": 5.2079967940421845, "learning_rate": 1.9988203665179477e-06, "loss": 1.4091, "step": 9163 }, { "epoch": 0.06, "grad_norm": 4.734608864419093, "learning_rate": 1.998820108795697e-06, "loss": 1.2797, "step": 9164 }, { "epoch": 0.06, "grad_norm": 4.54596989147798, "learning_rate": 1.998819851045313e-06, "loss": 1.2652, "step": 9165 }, { "epoch": 0.06, "grad_norm": 4.965599831733548, "learning_rate": 1.9988195932667947e-06, "loss": 1.3534, "step": 9166 }, { "epoch": 0.06, "grad_norm": 4.306252303362465, "learning_rate": 1.9988193354601436e-06, "loss": 1.3887, "step": 9167 }, { "epoch": 0.06, "grad_norm": 4.586662322138531, "learning_rate": 1.9988190776253592e-06, "loss": 1.2433, "step": 9168 }, { "epoch": 0.06, "grad_norm": 4.487606906380667, "learning_rate": 1.998818819762441e-06, "loss": 1.3448, "step": 9169 }, { "epoch": 0.06, "grad_norm": 4.796842913016864, "learning_rate": 1.99881856187139e-06, "loss": 1.2425, "step": 9170 }, { "epoch": 0.06, "grad_norm": 5.331051173433661, "learning_rate": 1.998818303952205e-06, "loss": 1.5691, "step": 9171 }, { "epoch": 0.06, "grad_norm": 4.5032447554233155, "learning_rate": 1.998818046004887e-06, "loss": 1.2678, "step": 9172 }, { "epoch": 0.06, "grad_norm": 4.328273220446713, "learning_rate": 1.9988177880294352e-06, "loss": 1.3201, "step": 9173 }, { "epoch": 0.06, "grad_norm": 8.132203810399535, "learning_rate": 1.9988175300258503e-06, "loss": 1.1892, "step": 9174 }, { "epoch": 0.06, "grad_norm": 4.426246765016404, "learning_rate": 1.998817271994132e-06, "loss": 1.3821, "step": 9175 }, { "epoch": 0.06, "grad_norm": 4.647626125267229, "learning_rate": 1.9988170139342803e-06, "loss": 1.4799, "step": 9176 }, { "epoch": 0.06, "grad_norm": 4.276910513814594, "learning_rate": 1.9988167558462953e-06, "loss": 1.1761, "step": 9177 }, { "epoch": 0.06, "grad_norm": 5.0994784697641915, "learning_rate": 1.9988164977301765e-06, "loss": 1.2105, "step": 9178 }, { "epoch": 0.06, "grad_norm": 5.392130693321368, "learning_rate": 1.998816239585925e-06, "loss": 1.5226, "step": 9179 }, { "epoch": 0.06, "grad_norm": 4.29486562875608, "learning_rate": 1.99881598141354e-06, "loss": 1.5095, "step": 9180 }, { "epoch": 0.06, "grad_norm": 5.661862846028829, "learning_rate": 1.998815723213022e-06, "loss": 1.1643, "step": 9181 }, { "epoch": 0.06, "grad_norm": 4.600914266221847, "learning_rate": 1.99881546498437e-06, "loss": 1.3136, "step": 9182 }, { "epoch": 0.06, "grad_norm": 4.9206532309919115, "learning_rate": 1.998815206727585e-06, "loss": 1.2937, "step": 9183 }, { "epoch": 0.06, "grad_norm": 4.500316891522118, "learning_rate": 1.9988149484426665e-06, "loss": 1.3292, "step": 9184 }, { "epoch": 0.06, "grad_norm": 4.654472328955603, "learning_rate": 1.998814690129615e-06, "loss": 1.2866, "step": 9185 }, { "epoch": 0.06, "grad_norm": 4.260705343729918, "learning_rate": 1.9988144317884303e-06, "loss": 1.3803, "step": 9186 }, { "epoch": 0.06, "grad_norm": 4.728087632517888, "learning_rate": 1.9988141734191118e-06, "loss": 1.4177, "step": 9187 }, { "epoch": 0.06, "grad_norm": 4.412481904348956, "learning_rate": 1.9988139150216603e-06, "loss": 1.3198, "step": 9188 }, { "epoch": 0.06, "grad_norm": 4.440521569612287, "learning_rate": 1.998813656596076e-06, "loss": 1.4435, "step": 9189 }, { "epoch": 0.06, "grad_norm": 5.413933265413556, "learning_rate": 1.998813398142358e-06, "loss": 1.5202, "step": 9190 }, { "epoch": 0.06, "grad_norm": 4.9382782805913745, "learning_rate": 1.9988131396605067e-06, "loss": 1.3668, "step": 9191 }, { "epoch": 0.06, "grad_norm": 4.504718320545054, "learning_rate": 1.9988128811505223e-06, "loss": 1.4625, "step": 9192 }, { "epoch": 0.06, "grad_norm": 4.56992680428897, "learning_rate": 1.9988126226124045e-06, "loss": 1.483, "step": 9193 }, { "epoch": 0.06, "grad_norm": 4.983931505334358, "learning_rate": 1.9988123640461538e-06, "loss": 1.303, "step": 9194 }, { "epoch": 0.06, "grad_norm": 4.061058860247226, "learning_rate": 1.9988121054517697e-06, "loss": 1.2034, "step": 9195 }, { "epoch": 0.06, "grad_norm": 4.210479928823161, "learning_rate": 1.9988118468292523e-06, "loss": 1.3605, "step": 9196 }, { "epoch": 0.06, "grad_norm": 4.580943998647816, "learning_rate": 1.9988115881786015e-06, "loss": 1.3324, "step": 9197 }, { "epoch": 0.06, "grad_norm": 4.654744937300995, "learning_rate": 1.9988113294998178e-06, "loss": 1.3696, "step": 9198 }, { "epoch": 0.06, "eval_loss": 1.5551848411560059, "eval_runtime": 4.5741, "eval_samples_per_second": 1.968, "eval_steps_per_second": 1.093, "step": 9198 }, { "epoch": 0.06, "grad_norm": 4.885414016752089, "learning_rate": 1.9988110707929007e-06, "loss": 1.3691, "step": 9199 }, { "epoch": 0.06, "grad_norm": 4.473691951387402, "learning_rate": 1.9988108120578507e-06, "loss": 1.3502, "step": 9200 }, { "epoch": 0.06, "grad_norm": 4.551769682896951, "learning_rate": 1.9988105532946674e-06, "loss": 1.2203, "step": 9201 }, { "epoch": 0.06, "grad_norm": 4.263827357327292, "learning_rate": 1.9988102945033507e-06, "loss": 1.2445, "step": 9202 }, { "epoch": 0.06, "grad_norm": 5.0858326278244395, "learning_rate": 1.998810035683901e-06, "loss": 1.3465, "step": 9203 }, { "epoch": 0.06, "grad_norm": 4.669388133745964, "learning_rate": 1.9988097768363185e-06, "loss": 1.383, "step": 9204 }, { "epoch": 0.06, "grad_norm": 4.650010881826172, "learning_rate": 1.998809517960602e-06, "loss": 1.3574, "step": 9205 }, { "epoch": 0.06, "grad_norm": 4.089131123337762, "learning_rate": 1.9988092590567534e-06, "loss": 1.3197, "step": 9206 }, { "epoch": 0.06, "grad_norm": 4.5883544020075115, "learning_rate": 1.998809000124771e-06, "loss": 1.3156, "step": 9207 }, { "epoch": 0.06, "grad_norm": 3.9427521496043374, "learning_rate": 1.9988087411646557e-06, "loss": 1.1175, "step": 9208 }, { "epoch": 0.06, "grad_norm": 4.971540794459495, "learning_rate": 1.998808482176407e-06, "loss": 1.1763, "step": 9209 }, { "epoch": 0.06, "grad_norm": 5.250689400116675, "learning_rate": 1.9988082231600254e-06, "loss": 1.3241, "step": 9210 }, { "epoch": 0.06, "grad_norm": 5.723022770698001, "learning_rate": 1.9988079641155106e-06, "loss": 1.453, "step": 9211 }, { "epoch": 0.06, "grad_norm": 4.625770044710968, "learning_rate": 1.998807705042863e-06, "loss": 1.4092, "step": 9212 }, { "epoch": 0.06, "grad_norm": 4.292208043151595, "learning_rate": 1.998807445942082e-06, "loss": 1.3142, "step": 9213 }, { "epoch": 0.06, "grad_norm": 4.688632080400614, "learning_rate": 1.998807186813168e-06, "loss": 1.2623, "step": 9214 }, { "epoch": 0.06, "grad_norm": 4.962966344888703, "learning_rate": 1.998806927656121e-06, "loss": 1.3753, "step": 9215 }, { "epoch": 0.06, "grad_norm": 6.3746591523351, "learning_rate": 1.998806668470941e-06, "loss": 1.2383, "step": 9216 }, { "epoch": 0.06, "grad_norm": 5.239264773042854, "learning_rate": 1.9988064092576277e-06, "loss": 1.3787, "step": 9217 }, { "epoch": 0.06, "grad_norm": 4.67627214590265, "learning_rate": 1.9988061500161816e-06, "loss": 1.3105, "step": 9218 }, { "epoch": 0.06, "grad_norm": 40.299816237909404, "learning_rate": 1.998805890746602e-06, "loss": 1.2077, "step": 9219 }, { "epoch": 0.06, "grad_norm": 4.3000625923041, "learning_rate": 1.99880563144889e-06, "loss": 1.3591, "step": 9220 }, { "epoch": 0.06, "grad_norm": 11.819164773059967, "learning_rate": 1.9988053721230445e-06, "loss": 1.4325, "step": 9221 }, { "epoch": 0.06, "grad_norm": 6.602682277265542, "learning_rate": 1.9988051127690663e-06, "loss": 1.4859, "step": 9222 }, { "epoch": 0.06, "grad_norm": 4.397121461783765, "learning_rate": 1.9988048533869547e-06, "loss": 1.3326, "step": 9223 }, { "epoch": 0.06, "grad_norm": 6.223462950176888, "learning_rate": 1.99880459397671e-06, "loss": 1.4692, "step": 9224 }, { "epoch": 0.06, "grad_norm": 4.685747149224842, "learning_rate": 1.9988043345383327e-06, "loss": 1.5234, "step": 9225 }, { "epoch": 0.06, "grad_norm": 4.567393106628273, "learning_rate": 1.9988040750718223e-06, "loss": 1.2359, "step": 9226 }, { "epoch": 0.06, "grad_norm": 4.805400839191366, "learning_rate": 1.998803815577179e-06, "loss": 1.3772, "step": 9227 }, { "epoch": 0.06, "grad_norm": 4.3623309908031525, "learning_rate": 1.998803556054403e-06, "loss": 1.3121, "step": 9228 }, { "epoch": 0.06, "grad_norm": 4.667877487910693, "learning_rate": 1.9988032965034932e-06, "loss": 1.1697, "step": 9229 }, { "epoch": 0.06, "grad_norm": 5.152682760918998, "learning_rate": 1.998803036924451e-06, "loss": 1.3827, "step": 9230 }, { "epoch": 0.06, "grad_norm": 4.308201467006308, "learning_rate": 1.9988027773172757e-06, "loss": 1.3683, "step": 9231 }, { "epoch": 0.06, "grad_norm": 4.6805524903343265, "learning_rate": 1.9988025176819674e-06, "loss": 1.4236, "step": 9232 }, { "epoch": 0.06, "grad_norm": 4.643916265722326, "learning_rate": 1.9988022580185265e-06, "loss": 1.254, "step": 9233 }, { "epoch": 0.06, "grad_norm": 4.909438057129668, "learning_rate": 1.9988019983269523e-06, "loss": 1.3937, "step": 9234 }, { "epoch": 0.06, "grad_norm": 4.300536284251366, "learning_rate": 1.998801738607245e-06, "loss": 1.2756, "step": 9235 }, { "epoch": 0.06, "grad_norm": 4.2915941448670365, "learning_rate": 1.998801478859405e-06, "loss": 1.3561, "step": 9236 }, { "epoch": 0.06, "grad_norm": 4.919475393069577, "learning_rate": 1.9988012190834324e-06, "loss": 1.5328, "step": 9237 }, { "epoch": 0.06, "grad_norm": 6.211193517788841, "learning_rate": 1.9988009592793265e-06, "loss": 1.1965, "step": 9238 }, { "epoch": 0.06, "grad_norm": 4.694982072556466, "learning_rate": 1.998800699447088e-06, "loss": 1.4447, "step": 9239 }, { "epoch": 0.06, "grad_norm": 4.50952391959699, "learning_rate": 1.9988004395867163e-06, "loss": 1.1954, "step": 9240 }, { "epoch": 0.06, "grad_norm": 4.192278098395553, "learning_rate": 1.9988001796982115e-06, "loss": 1.3435, "step": 9241 }, { "epoch": 0.06, "grad_norm": 7.456890300219677, "learning_rate": 1.9987999197815743e-06, "loss": 1.3874, "step": 9242 }, { "epoch": 0.06, "grad_norm": 4.43883051682153, "learning_rate": 1.998799659836804e-06, "loss": 1.3825, "step": 9243 }, { "epoch": 0.06, "grad_norm": 3.9919724656906244, "learning_rate": 1.998799399863901e-06, "loss": 1.2054, "step": 9244 }, { "epoch": 0.06, "grad_norm": 5.415586743663469, "learning_rate": 1.998799139862865e-06, "loss": 1.3388, "step": 9245 }, { "epoch": 0.06, "grad_norm": 4.737787258758223, "learning_rate": 1.9987988798336965e-06, "loss": 1.2581, "step": 9246 }, { "epoch": 0.06, "grad_norm": 4.460589872603003, "learning_rate": 1.9987986197763946e-06, "loss": 1.3994, "step": 9247 }, { "epoch": 0.06, "grad_norm": 4.730783176442161, "learning_rate": 1.9987983596909603e-06, "loss": 1.2868, "step": 9248 }, { "epoch": 0.06, "grad_norm": 4.52809047493597, "learning_rate": 1.998798099577393e-06, "loss": 1.349, "step": 9249 }, { "epoch": 0.06, "grad_norm": 4.251415073060551, "learning_rate": 1.9987978394356927e-06, "loss": 1.2302, "step": 9250 }, { "epoch": 0.06, "grad_norm": 4.5145178325005375, "learning_rate": 1.99879757926586e-06, "loss": 1.3751, "step": 9251 }, { "epoch": 0.06, "grad_norm": 4.618141550647926, "learning_rate": 1.998797319067894e-06, "loss": 1.2455, "step": 9252 }, { "epoch": 0.06, "grad_norm": 9.249325877496508, "learning_rate": 1.9987970588417957e-06, "loss": 1.3605, "step": 9253 }, { "epoch": 0.06, "grad_norm": 5.02226422494594, "learning_rate": 1.998796798587564e-06, "loss": 1.4115, "step": 9254 }, { "epoch": 0.06, "grad_norm": 4.4558576164359085, "learning_rate": 1.9987965383052e-06, "loss": 1.4009, "step": 9255 }, { "epoch": 0.06, "grad_norm": 4.6185799596053245, "learning_rate": 1.998796277994703e-06, "loss": 1.3943, "step": 9256 }, { "epoch": 0.06, "grad_norm": 4.047559855515776, "learning_rate": 1.9987960176560733e-06, "loss": 1.2431, "step": 9257 }, { "epoch": 0.06, "grad_norm": 4.591163392316811, "learning_rate": 1.998795757289311e-06, "loss": 1.4109, "step": 9258 }, { "epoch": 0.06, "grad_norm": 4.225018282293772, "learning_rate": 1.998795496894416e-06, "loss": 1.2567, "step": 9259 }, { "epoch": 0.06, "grad_norm": 4.317149360129581, "learning_rate": 1.998795236471388e-06, "loss": 1.3953, "step": 9260 }, { "epoch": 0.06, "grad_norm": 6.7634580294834015, "learning_rate": 1.9987949760202274e-06, "loss": 1.5209, "step": 9261 }, { "epoch": 0.06, "grad_norm": 5.482034080478455, "learning_rate": 1.998794715540934e-06, "loss": 1.3214, "step": 9262 }, { "epoch": 0.06, "grad_norm": 4.085856425355162, "learning_rate": 1.998794455033508e-06, "loss": 1.2051, "step": 9263 }, { "epoch": 0.06, "grad_norm": 4.412689106105339, "learning_rate": 1.998794194497949e-06, "loss": 1.3565, "step": 9264 }, { "epoch": 0.06, "grad_norm": 5.177331083369055, "learning_rate": 1.9987939339342576e-06, "loss": 1.3637, "step": 9265 }, { "epoch": 0.06, "grad_norm": 4.836459468080875, "learning_rate": 1.9987936733424335e-06, "loss": 1.2617, "step": 9266 }, { "epoch": 0.06, "grad_norm": 4.503984940561418, "learning_rate": 1.998793412722476e-06, "loss": 1.2991, "step": 9267 }, { "epoch": 0.06, "grad_norm": 4.57715649412643, "learning_rate": 1.9987931520743864e-06, "loss": 1.3679, "step": 9268 }, { "epoch": 0.06, "grad_norm": 4.365446337498259, "learning_rate": 1.9987928913981643e-06, "loss": 1.3526, "step": 9269 }, { "epoch": 0.06, "grad_norm": 4.588375265317352, "learning_rate": 1.9987926306938093e-06, "loss": 1.4596, "step": 9270 }, { "epoch": 0.06, "grad_norm": 6.615517229308341, "learning_rate": 1.998792369961322e-06, "loss": 1.4156, "step": 9271 }, { "epoch": 0.06, "eval_loss": 1.554495096206665, "eval_runtime": 4.6064, "eval_samples_per_second": 1.954, "eval_steps_per_second": 1.085, "step": 9271 }, { "epoch": 0.06, "grad_norm": 4.472883395236333, "learning_rate": 1.9987921092007014e-06, "loss": 1.3725, "step": 9272 }, { "epoch": 0.06, "grad_norm": 5.4377656112995085, "learning_rate": 1.9987918484119484e-06, "loss": 1.3698, "step": 9273 }, { "epoch": 0.06, "grad_norm": 5.435964818923897, "learning_rate": 1.998791587595063e-06, "loss": 1.5279, "step": 9274 }, { "epoch": 0.06, "grad_norm": 4.4465383279021164, "learning_rate": 1.9987913267500446e-06, "loss": 1.525, "step": 9275 }, { "epoch": 0.06, "grad_norm": 4.775829844342686, "learning_rate": 1.9987910658768937e-06, "loss": 1.4982, "step": 9276 }, { "epoch": 0.06, "grad_norm": 4.488655788858193, "learning_rate": 1.9987908049756103e-06, "loss": 1.4157, "step": 9277 }, { "epoch": 0.06, "grad_norm": 4.184203121501775, "learning_rate": 1.998790544046194e-06, "loss": 1.2199, "step": 9278 }, { "epoch": 0.06, "grad_norm": 4.661767676614311, "learning_rate": 1.9987902830886452e-06, "loss": 1.4224, "step": 9279 }, { "epoch": 0.06, "grad_norm": 5.395610188181745, "learning_rate": 1.998790022102964e-06, "loss": 1.2944, "step": 9280 }, { "epoch": 0.06, "grad_norm": 14.70008307969552, "learning_rate": 1.99878976108915e-06, "loss": 1.3301, "step": 9281 }, { "epoch": 0.06, "grad_norm": 4.717504790595443, "learning_rate": 1.9987895000472033e-06, "loss": 1.4021, "step": 9282 }, { "epoch": 0.06, "grad_norm": 5.0920116594945215, "learning_rate": 1.9987892389771245e-06, "loss": 1.464, "step": 9283 }, { "epoch": 0.06, "grad_norm": 4.420083599275562, "learning_rate": 1.9987889778789127e-06, "loss": 1.3963, "step": 9284 }, { "epoch": 0.06, "grad_norm": 4.65791395587469, "learning_rate": 1.9987887167525684e-06, "loss": 1.3724, "step": 9285 }, { "epoch": 0.06, "grad_norm": 4.956681883190954, "learning_rate": 1.9987884555980916e-06, "loss": 1.3088, "step": 9286 }, { "epoch": 0.06, "grad_norm": 4.632885623210763, "learning_rate": 1.9987881944154823e-06, "loss": 1.3204, "step": 9287 }, { "epoch": 0.06, "grad_norm": 4.745986269835675, "learning_rate": 1.9987879332047406e-06, "loss": 1.4684, "step": 9288 }, { "epoch": 0.06, "grad_norm": 5.832174684013402, "learning_rate": 1.9987876719658663e-06, "loss": 1.2743, "step": 9289 }, { "epoch": 0.06, "grad_norm": 4.40543614112395, "learning_rate": 1.9987874106988595e-06, "loss": 1.4756, "step": 9290 }, { "epoch": 0.06, "grad_norm": 4.6797750413247545, "learning_rate": 1.99878714940372e-06, "loss": 1.3292, "step": 9291 }, { "epoch": 0.06, "grad_norm": 5.183811398343307, "learning_rate": 1.9987868880804483e-06, "loss": 1.4219, "step": 9292 }, { "epoch": 0.06, "grad_norm": 4.651033397706635, "learning_rate": 1.9987866267290436e-06, "loss": 1.3912, "step": 9293 }, { "epoch": 0.06, "grad_norm": 4.762988933329257, "learning_rate": 1.9987863653495064e-06, "loss": 1.2451, "step": 9294 }, { "epoch": 0.06, "grad_norm": 4.736059905112012, "learning_rate": 1.998786103941837e-06, "loss": 1.497, "step": 9295 }, { "epoch": 0.06, "grad_norm": 5.765904543506734, "learning_rate": 1.998785842506035e-06, "loss": 1.6876, "step": 9296 }, { "epoch": 0.06, "grad_norm": 4.592128130821391, "learning_rate": 1.998785581042101e-06, "loss": 1.3138, "step": 9297 }, { "epoch": 0.06, "grad_norm": 4.70491513327742, "learning_rate": 1.998785319550034e-06, "loss": 1.399, "step": 9298 }, { "epoch": 0.06, "grad_norm": 4.267025604843614, "learning_rate": 1.9987850580298347e-06, "loss": 1.2901, "step": 9299 }, { "epoch": 0.06, "grad_norm": 7.339875503217822, "learning_rate": 1.998784796481503e-06, "loss": 1.1524, "step": 9300 }, { "epoch": 0.06, "grad_norm": 4.949563078877494, "learning_rate": 1.9987845349050384e-06, "loss": 1.2128, "step": 9301 }, { "epoch": 0.06, "grad_norm": 5.396387899021429, "learning_rate": 1.998784273300442e-06, "loss": 1.3408, "step": 9302 }, { "epoch": 0.06, "grad_norm": 6.347357519236312, "learning_rate": 1.998784011667713e-06, "loss": 1.3769, "step": 9303 }, { "epoch": 0.06, "grad_norm": 4.649649010355445, "learning_rate": 1.9987837500068516e-06, "loss": 1.4515, "step": 9304 }, { "epoch": 0.06, "grad_norm": 4.58513396644983, "learning_rate": 1.9987834883178576e-06, "loss": 1.3744, "step": 9305 }, { "epoch": 0.06, "grad_norm": 5.166731555818931, "learning_rate": 1.9987832266007308e-06, "loss": 1.4787, "step": 9306 }, { "epoch": 0.06, "grad_norm": 4.283407057774179, "learning_rate": 1.9987829648554722e-06, "loss": 1.268, "step": 9307 }, { "epoch": 0.06, "grad_norm": 4.48034430599568, "learning_rate": 1.998782703082081e-06, "loss": 1.4802, "step": 9308 }, { "epoch": 0.06, "grad_norm": 4.392416470892556, "learning_rate": 1.9987824412805576e-06, "loss": 1.2899, "step": 9309 }, { "epoch": 0.06, "grad_norm": 4.38167239713809, "learning_rate": 1.998782179450902e-06, "loss": 1.3048, "step": 9310 }, { "epoch": 0.06, "grad_norm": 4.795089895853581, "learning_rate": 1.9987819175931134e-06, "loss": 1.4448, "step": 9311 }, { "epoch": 0.06, "grad_norm": 4.7341605878517505, "learning_rate": 1.998781655707193e-06, "loss": 1.2488, "step": 9312 }, { "epoch": 0.06, "grad_norm": 5.858909377611416, "learning_rate": 1.99878139379314e-06, "loss": 1.3509, "step": 9313 }, { "epoch": 0.06, "grad_norm": 4.901591635443287, "learning_rate": 1.998781131850955e-06, "loss": 1.3949, "step": 9314 }, { "epoch": 0.06, "grad_norm": 5.40303957806761, "learning_rate": 1.9987808698806366e-06, "loss": 1.5304, "step": 9315 }, { "epoch": 0.06, "grad_norm": 4.404420742355614, "learning_rate": 1.998780607882187e-06, "loss": 1.3839, "step": 9316 }, { "epoch": 0.06, "grad_norm": 6.471627541955831, "learning_rate": 1.9987803458556045e-06, "loss": 1.5146, "step": 9317 }, { "epoch": 0.06, "grad_norm": 4.704646072489282, "learning_rate": 1.9987800838008897e-06, "loss": 1.3695, "step": 9318 }, { "epoch": 0.06, "grad_norm": 4.557454953695969, "learning_rate": 1.9987798217180427e-06, "loss": 1.2422, "step": 9319 }, { "epoch": 0.06, "grad_norm": 4.476143796087004, "learning_rate": 1.9987795596070637e-06, "loss": 1.3398, "step": 9320 }, { "epoch": 0.06, "grad_norm": 4.861996942887446, "learning_rate": 1.998779297467952e-06, "loss": 1.3641, "step": 9321 }, { "epoch": 0.06, "grad_norm": 7.56123616641868, "learning_rate": 1.998779035300708e-06, "loss": 1.2767, "step": 9322 }, { "epoch": 0.06, "grad_norm": 4.808672507983671, "learning_rate": 1.9987787731053317e-06, "loss": 1.285, "step": 9323 }, { "epoch": 0.06, "grad_norm": 4.185642482271401, "learning_rate": 1.9987785108818236e-06, "loss": 1.2857, "step": 9324 }, { "epoch": 0.06, "grad_norm": 6.203206204004206, "learning_rate": 1.998778248630183e-06, "loss": 1.3522, "step": 9325 }, { "epoch": 0.06, "grad_norm": 4.276307882865296, "learning_rate": 1.9987779863504097e-06, "loss": 1.3013, "step": 9326 }, { "epoch": 0.06, "grad_norm": 5.399071763911135, "learning_rate": 1.9987777240425044e-06, "loss": 1.4096, "step": 9327 }, { "epoch": 0.06, "grad_norm": 4.299485897763391, "learning_rate": 1.998777461706467e-06, "loss": 1.2972, "step": 9328 }, { "epoch": 0.06, "grad_norm": 4.976409578703816, "learning_rate": 1.9987771993422972e-06, "loss": 1.4363, "step": 9329 }, { "epoch": 0.06, "grad_norm": 4.568935534913313, "learning_rate": 1.9987769369499953e-06, "loss": 1.2764, "step": 9330 }, { "epoch": 0.06, "grad_norm": 6.503288948265216, "learning_rate": 1.9987766745295613e-06, "loss": 1.2364, "step": 9331 }, { "epoch": 0.06, "grad_norm": 4.256260136957411, "learning_rate": 1.998776412080995e-06, "loss": 1.3106, "step": 9332 }, { "epoch": 0.06, "grad_norm": 4.199000698707674, "learning_rate": 1.998776149604296e-06, "loss": 1.3945, "step": 9333 }, { "epoch": 0.06, "grad_norm": 4.048167747518878, "learning_rate": 1.9987758870994655e-06, "loss": 1.3643, "step": 9334 }, { "epoch": 0.06, "grad_norm": 4.560653833986124, "learning_rate": 1.9987756245665027e-06, "loss": 1.2131, "step": 9335 }, { "epoch": 0.06, "grad_norm": 4.734914641753475, "learning_rate": 1.9987753620054075e-06, "loss": 1.2232, "step": 9336 }, { "epoch": 0.06, "grad_norm": 5.346564245683493, "learning_rate": 1.9987750994161797e-06, "loss": 1.332, "step": 9337 }, { "epoch": 0.06, "grad_norm": 4.766509990122271, "learning_rate": 1.9987748367988203e-06, "loss": 1.4455, "step": 9338 }, { "epoch": 0.06, "grad_norm": 4.537614482329693, "learning_rate": 1.9987745741533288e-06, "loss": 1.4035, "step": 9339 }, { "epoch": 0.06, "grad_norm": 4.535923577308804, "learning_rate": 1.9987743114797047e-06, "loss": 1.3578, "step": 9340 }, { "epoch": 0.06, "grad_norm": 5.66711209337483, "learning_rate": 1.9987740487779486e-06, "loss": 1.3531, "step": 9341 }, { "epoch": 0.06, "grad_norm": 5.157752259967078, "learning_rate": 1.998773786048061e-06, "loss": 1.2807, "step": 9342 }, { "epoch": 0.06, "grad_norm": 4.580449970801107, "learning_rate": 1.9987735232900406e-06, "loss": 1.4835, "step": 9343 }, { "epoch": 0.06, "grad_norm": 5.041308807278067, "learning_rate": 1.998773260503888e-06, "loss": 1.507, "step": 9344 }, { "epoch": 0.06, "eval_loss": 1.5564230680465698, "eval_runtime": 4.5977, "eval_samples_per_second": 1.957, "eval_steps_per_second": 1.087, "step": 9344 }, { "epoch": 0.06, "grad_norm": 4.651619444032944, "learning_rate": 1.9987729976896034e-06, "loss": 1.3943, "step": 9345 }, { "epoch": 0.06, "grad_norm": 4.919107611574725, "learning_rate": 1.998772734847187e-06, "loss": 1.388, "step": 9346 }, { "epoch": 0.06, "grad_norm": 3.9307381434164723, "learning_rate": 1.9987724719766383e-06, "loss": 1.1384, "step": 9347 }, { "epoch": 0.06, "grad_norm": 4.489358723978206, "learning_rate": 1.9987722090779572e-06, "loss": 1.2868, "step": 9348 }, { "epoch": 0.06, "grad_norm": 4.3874365251733805, "learning_rate": 1.9987719461511445e-06, "loss": 1.173, "step": 9349 }, { "epoch": 0.06, "grad_norm": 4.206845871964003, "learning_rate": 1.998771683196199e-06, "loss": 1.015, "step": 9350 }, { "epoch": 0.06, "grad_norm": 4.765176521714314, "learning_rate": 1.9987714202131223e-06, "loss": 1.4216, "step": 9351 }, { "epoch": 0.06, "grad_norm": 6.397277861698614, "learning_rate": 1.9987711572019133e-06, "loss": 1.3599, "step": 9352 }, { "epoch": 0.06, "grad_norm": 4.554091947437095, "learning_rate": 1.9987708941625718e-06, "loss": 1.2454, "step": 9353 }, { "epoch": 0.06, "grad_norm": 4.6806296300669645, "learning_rate": 1.9987706310950986e-06, "loss": 1.3785, "step": 9354 }, { "epoch": 0.06, "grad_norm": 16.765534803659744, "learning_rate": 1.9987703679994934e-06, "loss": 1.3085, "step": 9355 }, { "epoch": 0.06, "grad_norm": 4.245376816942007, "learning_rate": 1.998770104875756e-06, "loss": 1.2533, "step": 9356 }, { "epoch": 0.06, "grad_norm": 5.284808208838013, "learning_rate": 1.9987698417238866e-06, "loss": 1.3031, "step": 9357 }, { "epoch": 0.06, "grad_norm": 5.106723034892174, "learning_rate": 1.998769578543885e-06, "loss": 1.3921, "step": 9358 }, { "epoch": 0.06, "grad_norm": 4.625868763169571, "learning_rate": 1.9987693153357516e-06, "loss": 1.3934, "step": 9359 }, { "epoch": 0.06, "grad_norm": 4.4624830709939385, "learning_rate": 1.9987690520994863e-06, "loss": 1.3563, "step": 9360 }, { "epoch": 0.06, "grad_norm": 4.39807846365891, "learning_rate": 1.9987687888350886e-06, "loss": 1.3282, "step": 9361 }, { "epoch": 0.06, "grad_norm": 4.529497524903928, "learning_rate": 1.9987685255425592e-06, "loss": 1.4251, "step": 9362 }, { "epoch": 0.06, "grad_norm": 4.357869001320897, "learning_rate": 1.998768262221898e-06, "loss": 1.3545, "step": 9363 }, { "epoch": 0.06, "grad_norm": 4.5683256561702725, "learning_rate": 1.9987679988731046e-06, "loss": 1.3858, "step": 9364 }, { "epoch": 0.06, "grad_norm": 5.2895249000173985, "learning_rate": 1.998767735496179e-06, "loss": 1.4175, "step": 9365 }, { "epoch": 0.06, "grad_norm": 4.654215484010466, "learning_rate": 1.9987674720911217e-06, "loss": 1.4406, "step": 9366 }, { "epoch": 0.06, "grad_norm": 5.291279222863882, "learning_rate": 1.9987672086579327e-06, "loss": 1.3936, "step": 9367 }, { "epoch": 0.06, "grad_norm": 4.839411000145247, "learning_rate": 1.9987669451966113e-06, "loss": 1.4873, "step": 9368 }, { "epoch": 0.06, "grad_norm": 7.153951594412018, "learning_rate": 1.998766681707158e-06, "loss": 1.2487, "step": 9369 }, { "epoch": 0.06, "grad_norm": 4.764295742363618, "learning_rate": 1.998766418189573e-06, "loss": 1.4439, "step": 9370 }, { "epoch": 0.06, "grad_norm": 4.750167490834732, "learning_rate": 1.9987661546438557e-06, "loss": 1.4094, "step": 9371 }, { "epoch": 0.06, "grad_norm": 4.505501703856124, "learning_rate": 1.9987658910700067e-06, "loss": 1.1499, "step": 9372 }, { "epoch": 0.06, "grad_norm": 4.179037053362475, "learning_rate": 1.998765627468026e-06, "loss": 1.1673, "step": 9373 }, { "epoch": 0.06, "grad_norm": 4.817171723995393, "learning_rate": 1.998765363837913e-06, "loss": 1.3536, "step": 9374 }, { "epoch": 0.06, "grad_norm": 4.300571417546668, "learning_rate": 1.9987651001796682e-06, "loss": 1.3439, "step": 9375 }, { "epoch": 0.06, "grad_norm": 4.4718558066022425, "learning_rate": 1.998764836493292e-06, "loss": 1.4057, "step": 9376 }, { "epoch": 0.06, "grad_norm": 4.501448963846782, "learning_rate": 1.9987645727787833e-06, "loss": 1.3458, "step": 9377 }, { "epoch": 0.06, "grad_norm": 5.110078065693422, "learning_rate": 1.998764309036143e-06, "loss": 1.402, "step": 9378 }, { "epoch": 0.06, "grad_norm": 4.188935065097337, "learning_rate": 1.998764045265371e-06, "loss": 1.1881, "step": 9379 }, { "epoch": 0.06, "grad_norm": 4.334402988353024, "learning_rate": 1.9987637814664666e-06, "loss": 1.4239, "step": 9380 }, { "epoch": 0.06, "grad_norm": 4.890979438566899, "learning_rate": 1.9987635176394306e-06, "loss": 1.5492, "step": 9381 }, { "epoch": 0.06, "grad_norm": 5.377491515086609, "learning_rate": 1.998763253784263e-06, "loss": 1.1524, "step": 9382 }, { "epoch": 0.06, "grad_norm": 4.476054373815102, "learning_rate": 1.9987629899009637e-06, "loss": 1.4062, "step": 9383 }, { "epoch": 0.06, "grad_norm": 4.260747832926185, "learning_rate": 1.998762725989532e-06, "loss": 1.2834, "step": 9384 }, { "epoch": 0.06, "grad_norm": 4.816625609447784, "learning_rate": 1.9987624620499684e-06, "loss": 1.3921, "step": 9385 }, { "epoch": 0.06, "grad_norm": 6.091096415979983, "learning_rate": 1.9987621980822737e-06, "loss": 1.3007, "step": 9386 }, { "epoch": 0.06, "grad_norm": 4.425480951782516, "learning_rate": 1.998761934086447e-06, "loss": 1.4246, "step": 9387 }, { "epoch": 0.06, "grad_norm": 7.411021803943703, "learning_rate": 1.998761670062488e-06, "loss": 1.2711, "step": 9388 }, { "epoch": 0.06, "grad_norm": 4.564211915096558, "learning_rate": 1.9987614060103975e-06, "loss": 1.3117, "step": 9389 }, { "epoch": 0.06, "grad_norm": 6.271459895443692, "learning_rate": 1.998761141930175e-06, "loss": 1.3914, "step": 9390 }, { "epoch": 0.06, "grad_norm": 4.42397509417245, "learning_rate": 1.998760877821821e-06, "loss": 1.3259, "step": 9391 }, { "epoch": 0.06, "grad_norm": 4.472042016889664, "learning_rate": 1.998760613685335e-06, "loss": 1.2991, "step": 9392 }, { "epoch": 0.06, "grad_norm": 4.631671171162598, "learning_rate": 1.9987603495207176e-06, "loss": 1.4603, "step": 9393 }, { "epoch": 0.06, "grad_norm": 5.126982568006298, "learning_rate": 1.9987600853279684e-06, "loss": 1.3015, "step": 9394 }, { "epoch": 0.06, "grad_norm": 4.35775755410097, "learning_rate": 1.9987598211070875e-06, "loss": 1.2212, "step": 9395 }, { "epoch": 0.06, "grad_norm": 4.376302506254838, "learning_rate": 1.9987595568580745e-06, "loss": 1.2954, "step": 9396 }, { "epoch": 0.06, "grad_norm": 4.22920181372995, "learning_rate": 1.99875929258093e-06, "loss": 1.2735, "step": 9397 }, { "epoch": 0.06, "grad_norm": 4.353132731433254, "learning_rate": 1.9987590282756536e-06, "loss": 1.3804, "step": 9398 }, { "epoch": 0.06, "grad_norm": 6.090285023597953, "learning_rate": 1.9987587639422452e-06, "loss": 1.2131, "step": 9399 }, { "epoch": 0.06, "grad_norm": 4.647817003881624, "learning_rate": 1.9987584995807056e-06, "loss": 1.3534, "step": 9400 }, { "epoch": 0.06, "grad_norm": 4.023506578973363, "learning_rate": 1.9987582351910343e-06, "loss": 1.2525, "step": 9401 }, { "epoch": 0.06, "grad_norm": 4.723006433047982, "learning_rate": 1.998757970773231e-06, "loss": 1.4166, "step": 9402 }, { "epoch": 0.06, "grad_norm": 6.0269702562074094, "learning_rate": 1.9987577063272964e-06, "loss": 1.432, "step": 9403 }, { "epoch": 0.06, "grad_norm": 5.266689423088489, "learning_rate": 1.9987574418532298e-06, "loss": 1.325, "step": 9404 }, { "epoch": 0.06, "grad_norm": 8.181221368989679, "learning_rate": 1.9987571773510314e-06, "loss": 1.3802, "step": 9405 }, { "epoch": 0.06, "grad_norm": 4.388891468179163, "learning_rate": 1.998756912820702e-06, "loss": 1.3755, "step": 9406 }, { "epoch": 0.06, "grad_norm": 5.487684122863875, "learning_rate": 1.99875664826224e-06, "loss": 1.3536, "step": 9407 }, { "epoch": 0.06, "grad_norm": 4.548071851503697, "learning_rate": 1.998756383675647e-06, "loss": 1.3414, "step": 9408 }, { "epoch": 0.06, "grad_norm": 4.224147895397433, "learning_rate": 1.998756119060922e-06, "loss": 1.2701, "step": 9409 }, { "epoch": 0.06, "grad_norm": 5.370676813455007, "learning_rate": 1.998755854418066e-06, "loss": 1.4123, "step": 9410 }, { "epoch": 0.06, "grad_norm": 3.9779755980831815, "learning_rate": 1.9987555897470775e-06, "loss": 1.2393, "step": 9411 }, { "epoch": 0.06, "grad_norm": 4.2740192236856105, "learning_rate": 1.998755325047958e-06, "loss": 1.4512, "step": 9412 }, { "epoch": 0.06, "grad_norm": 5.815177989160682, "learning_rate": 1.9987550603207064e-06, "loss": 1.6383, "step": 9413 }, { "epoch": 0.06, "grad_norm": 4.6404335981797775, "learning_rate": 1.9987547955653236e-06, "loss": 1.3385, "step": 9414 }, { "epoch": 0.06, "grad_norm": 4.3188661188951984, "learning_rate": 1.998754530781809e-06, "loss": 1.4059, "step": 9415 }, { "epoch": 0.06, "grad_norm": 4.450171886637124, "learning_rate": 1.998754265970163e-06, "loss": 1.2894, "step": 9416 }, { "epoch": 0.06, "grad_norm": 4.397497627251262, "learning_rate": 1.998754001130385e-06, "loss": 1.289, "step": 9417 }, { "epoch": 0.06, "eval_loss": 1.5530638694763184, "eval_runtime": 4.5926, "eval_samples_per_second": 1.96, "eval_steps_per_second": 1.089, "step": 9417 } ], "logging_steps": 1, "max_steps": 591956, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 73, "total_flos": 985811763855360.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }