{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 908, "global_step": 9075, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00011019283746556474, "grad_norm": 59.863380432128906, "learning_rate": 1.098901098901099e-07, "loss": 1.0565, "step": 1 }, { "epoch": 0.00022038567493112948, "grad_norm": 172.912353515625, "learning_rate": 2.197802197802198e-07, "loss": 1.6306, "step": 2 }, { "epoch": 0.00033057851239669424, "grad_norm": 193.34146118164062, "learning_rate": 3.296703296703297e-07, "loss": 1.5845, "step": 3 }, { "epoch": 0.00044077134986225897, "grad_norm": 71.46421813964844, "learning_rate": 4.395604395604396e-07, "loss": 1.1894, "step": 4 }, { "epoch": 0.0005509641873278236, "grad_norm": 214.38052368164062, "learning_rate": 5.494505494505495e-07, "loss": 1.5854, "step": 5 }, { "epoch": 0.0006611570247933885, "grad_norm": 92.93484497070312, "learning_rate": 6.593406593406594e-07, "loss": 1.1533, "step": 6 }, { "epoch": 0.0007713498622589532, "grad_norm": 88.05059814453125, "learning_rate": 7.692307692307694e-07, "loss": 1.2465, "step": 7 }, { "epoch": 0.0008815426997245179, "grad_norm": 90.69667053222656, "learning_rate": 8.791208791208792e-07, "loss": 1.0339, "step": 8 }, { "epoch": 0.0009917355371900827, "grad_norm": 174.13861083984375, "learning_rate": 9.890109890109891e-07, "loss": 1.653, "step": 9 }, { "epoch": 0.0011019283746556473, "grad_norm": 62.3985710144043, "learning_rate": 1.098901098901099e-06, "loss": 0.986, "step": 10 }, { "epoch": 0.0012121212121212121, "grad_norm": 39.197818756103516, "learning_rate": 1.2087912087912089e-06, "loss": 0.9859, "step": 11 }, { "epoch": 0.001322314049586777, "grad_norm": 97.90790557861328, "learning_rate": 1.3186813186813187e-06, "loss": 1.057, "step": 12 }, { "epoch": 0.0014325068870523416, "grad_norm": 34.72150421142578, "learning_rate": 1.4285714285714286e-06, "loss": 0.9492, "step": 13 }, { "epoch": 0.0015426997245179064, "grad_norm": 48.048912048339844, "learning_rate": 1.5384615384615387e-06, "loss": 1.0598, "step": 14 }, { "epoch": 0.001652892561983471, "grad_norm": 26.897621154785156, "learning_rate": 1.6483516483516484e-06, "loss": 0.9134, "step": 15 }, { "epoch": 0.0017630853994490359, "grad_norm": 40.0106201171875, "learning_rate": 1.7582417582417585e-06, "loss": 0.7494, "step": 16 }, { "epoch": 0.0018732782369146005, "grad_norm": 24.09613037109375, "learning_rate": 1.8681318681318684e-06, "loss": 0.8329, "step": 17 }, { "epoch": 0.0019834710743801653, "grad_norm": 27.178569793701172, "learning_rate": 1.9780219780219782e-06, "loss": 0.8935, "step": 18 }, { "epoch": 0.00209366391184573, "grad_norm": 21.927278518676758, "learning_rate": 2.0879120879120883e-06, "loss": 0.869, "step": 19 }, { "epoch": 0.0022038567493112946, "grad_norm": 32.57326126098633, "learning_rate": 2.197802197802198e-06, "loss": 0.8756, "step": 20 }, { "epoch": 0.0023140495867768596, "grad_norm": 26.621749877929688, "learning_rate": 2.307692307692308e-06, "loss": 0.8861, "step": 21 }, { "epoch": 0.0024242424242424242, "grad_norm": 21.098466873168945, "learning_rate": 2.4175824175824177e-06, "loss": 0.8368, "step": 22 }, { "epoch": 0.002534435261707989, "grad_norm": 18.737295150756836, "learning_rate": 2.5274725274725274e-06, "loss": 0.7743, "step": 23 }, { "epoch": 0.002644628099173554, "grad_norm": 19.97951316833496, "learning_rate": 2.6373626373626375e-06, "loss": 0.8272, "step": 24 }, { "epoch": 0.0027548209366391185, "grad_norm": 26.72895050048828, "learning_rate": 2.7472527472527476e-06, "loss": 0.7512, "step": 25 }, { "epoch": 0.002865013774104683, "grad_norm": 38.96323776245117, "learning_rate": 2.8571428571428573e-06, "loss": 0.6617, "step": 26 }, { "epoch": 0.0029752066115702478, "grad_norm": 29.704008102416992, "learning_rate": 2.9670329670329673e-06, "loss": 0.7189, "step": 27 }, { "epoch": 0.003085399449035813, "grad_norm": 31.563518524169922, "learning_rate": 3.0769230769230774e-06, "loss": 0.782, "step": 28 }, { "epoch": 0.0031955922865013774, "grad_norm": 26.969806671142578, "learning_rate": 3.1868131868131867e-06, "loss": 0.6745, "step": 29 }, { "epoch": 0.003305785123966942, "grad_norm": 26.056306838989258, "learning_rate": 3.2967032967032968e-06, "loss": 0.5712, "step": 30 }, { "epoch": 0.0034159779614325067, "grad_norm": 23.492544174194336, "learning_rate": 3.406593406593407e-06, "loss": 0.6993, "step": 31 }, { "epoch": 0.0035261707988980717, "grad_norm": 26.434640884399414, "learning_rate": 3.516483516483517e-06, "loss": 0.5461, "step": 32 }, { "epoch": 0.0036363636363636364, "grad_norm": 21.654865264892578, "learning_rate": 3.6263736263736266e-06, "loss": 0.6576, "step": 33 }, { "epoch": 0.003746556473829201, "grad_norm": 25.91067123413086, "learning_rate": 3.7362637362637367e-06, "loss": 0.6175, "step": 34 }, { "epoch": 0.003856749311294766, "grad_norm": 18.07984733581543, "learning_rate": 3.846153846153847e-06, "loss": 0.5718, "step": 35 }, { "epoch": 0.003966942148760331, "grad_norm": 18.15633201599121, "learning_rate": 3.9560439560439565e-06, "loss": 0.6699, "step": 36 }, { "epoch": 0.004077134986225896, "grad_norm": 28.740493774414062, "learning_rate": 4.065934065934066e-06, "loss": 0.6541, "step": 37 }, { "epoch": 0.00418732782369146, "grad_norm": 30.440296173095703, "learning_rate": 4.175824175824177e-06, "loss": 0.678, "step": 38 }, { "epoch": 0.004297520661157025, "grad_norm": 16.116561889648438, "learning_rate": 4.2857142857142855e-06, "loss": 0.5679, "step": 39 }, { "epoch": 0.004407713498622589, "grad_norm": 25.8539981842041, "learning_rate": 4.395604395604396e-06, "loss": 0.7524, "step": 40 }, { "epoch": 0.004517906336088154, "grad_norm": 16.08936882019043, "learning_rate": 4.505494505494506e-06, "loss": 0.5679, "step": 41 }, { "epoch": 0.004628099173553719, "grad_norm": 11.73170280456543, "learning_rate": 4.615384615384616e-06, "loss": 0.5828, "step": 42 }, { "epoch": 0.004738292011019283, "grad_norm": 22.269550323486328, "learning_rate": 4.725274725274726e-06, "loss": 0.6695, "step": 43 }, { "epoch": 0.0048484848484848485, "grad_norm": 22.19915008544922, "learning_rate": 4.8351648351648355e-06, "loss": 0.6622, "step": 44 }, { "epoch": 0.0049586776859504135, "grad_norm": 13.642354011535645, "learning_rate": 4.945054945054946e-06, "loss": 0.5272, "step": 45 }, { "epoch": 0.005068870523415978, "grad_norm": 17.455495834350586, "learning_rate": 5.054945054945055e-06, "loss": 0.604, "step": 46 }, { "epoch": 0.005179063360881543, "grad_norm": 19.724523544311523, "learning_rate": 5.164835164835166e-06, "loss": 0.5195, "step": 47 }, { "epoch": 0.005289256198347108, "grad_norm": 35.24604034423828, "learning_rate": 5.274725274725275e-06, "loss": 0.5939, "step": 48 }, { "epoch": 0.005399449035812672, "grad_norm": 23.06145668029785, "learning_rate": 5.384615384615385e-06, "loss": 0.6269, "step": 49 }, { "epoch": 0.005509641873278237, "grad_norm": 19.79766273498535, "learning_rate": 5.494505494505495e-06, "loss": 0.5724, "step": 50 }, { "epoch": 0.005619834710743801, "grad_norm": 11.354966163635254, "learning_rate": 5.604395604395605e-06, "loss": 0.5952, "step": 51 }, { "epoch": 0.005730027548209366, "grad_norm": 11.693944931030273, "learning_rate": 5.7142857142857145e-06, "loss": 0.6348, "step": 52 }, { "epoch": 0.005840220385674931, "grad_norm": 43.29163360595703, "learning_rate": 5.824175824175825e-06, "loss": 0.6324, "step": 53 }, { "epoch": 0.0059504132231404955, "grad_norm": 27.70946502685547, "learning_rate": 5.934065934065935e-06, "loss": 0.7219, "step": 54 }, { "epoch": 0.006060606060606061, "grad_norm": 21.75235939025879, "learning_rate": 6.043956043956044e-06, "loss": 0.6411, "step": 55 }, { "epoch": 0.006170798898071626, "grad_norm": 17.24085235595703, "learning_rate": 6.153846153846155e-06, "loss": 0.582, "step": 56 }, { "epoch": 0.00628099173553719, "grad_norm": 14.344873428344727, "learning_rate": 6.2637362637362645e-06, "loss": 0.5875, "step": 57 }, { "epoch": 0.006391184573002755, "grad_norm": 9.679116249084473, "learning_rate": 6.373626373626373e-06, "loss": 0.6291, "step": 58 }, { "epoch": 0.00650137741046832, "grad_norm": 27.68067169189453, "learning_rate": 6.483516483516485e-06, "loss": 0.6413, "step": 59 }, { "epoch": 0.006611570247933884, "grad_norm": 27.99264144897461, "learning_rate": 6.5934065934065935e-06, "loss": 0.678, "step": 60 }, { "epoch": 0.006721763085399449, "grad_norm": 11.32085132598877, "learning_rate": 6.703296703296703e-06, "loss": 0.5968, "step": 61 }, { "epoch": 0.006831955922865013, "grad_norm": 15.186779975891113, "learning_rate": 6.813186813186814e-06, "loss": 0.4404, "step": 62 }, { "epoch": 0.006942148760330578, "grad_norm": 9.521608352661133, "learning_rate": 6.923076923076923e-06, "loss": 0.5429, "step": 63 }, { "epoch": 0.0070523415977961435, "grad_norm": 12.57529067993164, "learning_rate": 7.032967032967034e-06, "loss": 0.5389, "step": 64 }, { "epoch": 0.007162534435261708, "grad_norm": 22.200571060180664, "learning_rate": 7.1428571428571436e-06, "loss": 0.6364, "step": 65 }, { "epoch": 0.007272727272727273, "grad_norm": 15.877410888671875, "learning_rate": 7.252747252747253e-06, "loss": 0.5471, "step": 66 }, { "epoch": 0.007382920110192838, "grad_norm": 24.132150650024414, "learning_rate": 7.362637362637364e-06, "loss": 0.5143, "step": 67 }, { "epoch": 0.007493112947658402, "grad_norm": 21.11246109008789, "learning_rate": 7.472527472527473e-06, "loss": 0.562, "step": 68 }, { "epoch": 0.007603305785123967, "grad_norm": 22.65631675720215, "learning_rate": 7.582417582417583e-06, "loss": 0.5674, "step": 69 }, { "epoch": 0.007713498622589532, "grad_norm": 22.620630264282227, "learning_rate": 7.692307692307694e-06, "loss": 0.6273, "step": 70 }, { "epoch": 0.007823691460055097, "grad_norm": 19.626869201660156, "learning_rate": 7.802197802197802e-06, "loss": 0.5974, "step": 71 }, { "epoch": 0.007933884297520661, "grad_norm": 12.321710586547852, "learning_rate": 7.912087912087913e-06, "loss": 0.6292, "step": 72 }, { "epoch": 0.008044077134986225, "grad_norm": 18.073720932006836, "learning_rate": 8.021978021978023e-06, "loss": 0.5283, "step": 73 }, { "epoch": 0.008154269972451791, "grad_norm": 19.79140853881836, "learning_rate": 8.131868131868132e-06, "loss": 0.5381, "step": 74 }, { "epoch": 0.008264462809917356, "grad_norm": 26.322410583496094, "learning_rate": 8.241758241758243e-06, "loss": 0.4972, "step": 75 }, { "epoch": 0.00837465564738292, "grad_norm": 13.333000183105469, "learning_rate": 8.351648351648353e-06, "loss": 0.5632, "step": 76 }, { "epoch": 0.008484848484848486, "grad_norm": 13.923554420471191, "learning_rate": 8.461538461538462e-06, "loss": 0.52, "step": 77 }, { "epoch": 0.00859504132231405, "grad_norm": 12.857089042663574, "learning_rate": 8.571428571428571e-06, "loss": 0.5205, "step": 78 }, { "epoch": 0.008705234159779614, "grad_norm": 16.085172653198242, "learning_rate": 8.681318681318681e-06, "loss": 0.5551, "step": 79 }, { "epoch": 0.008815426997245178, "grad_norm": 15.384976387023926, "learning_rate": 8.791208791208792e-06, "loss": 0.5361, "step": 80 }, { "epoch": 0.008925619834710744, "grad_norm": 12.480789184570312, "learning_rate": 8.9010989010989e-06, "loss": 0.4877, "step": 81 }, { "epoch": 0.009035812672176308, "grad_norm": 28.784650802612305, "learning_rate": 9.010989010989011e-06, "loss": 0.6361, "step": 82 }, { "epoch": 0.009146005509641873, "grad_norm": 20.274999618530273, "learning_rate": 9.120879120879122e-06, "loss": 0.5687, "step": 83 }, { "epoch": 0.009256198347107438, "grad_norm": 9.781147956848145, "learning_rate": 9.230769230769232e-06, "loss": 0.5868, "step": 84 }, { "epoch": 0.009366391184573003, "grad_norm": 34.077537536621094, "learning_rate": 9.340659340659341e-06, "loss": 0.4965, "step": 85 }, { "epoch": 0.009476584022038567, "grad_norm": 16.664121627807617, "learning_rate": 9.450549450549452e-06, "loss": 0.5218, "step": 86 }, { "epoch": 0.009586776859504133, "grad_norm": 17.106098175048828, "learning_rate": 9.560439560439562e-06, "loss": 0.5994, "step": 87 }, { "epoch": 0.009696969696969697, "grad_norm": 11.295900344848633, "learning_rate": 9.670329670329671e-06, "loss": 0.5267, "step": 88 }, { "epoch": 0.009807162534435261, "grad_norm": 13.43213176727295, "learning_rate": 9.780219780219781e-06, "loss": 0.6087, "step": 89 }, { "epoch": 0.009917355371900827, "grad_norm": 20.79828643798828, "learning_rate": 9.890109890109892e-06, "loss": 0.5151, "step": 90 }, { "epoch": 0.010027548209366391, "grad_norm": 8.796585083007812, "learning_rate": 1e-05, "loss": 0.4638, "step": 91 }, { "epoch": 0.010137741046831955, "grad_norm": 14.324283599853516, "learning_rate": 9.999999694296605e-06, "loss": 0.642, "step": 92 }, { "epoch": 0.010247933884297521, "grad_norm": 24.24766731262207, "learning_rate": 9.999998777186455e-06, "loss": 0.6757, "step": 93 }, { "epoch": 0.010358126721763086, "grad_norm": 5.7737932205200195, "learning_rate": 9.999997248669662e-06, "loss": 0.593, "step": 94 }, { "epoch": 0.01046831955922865, "grad_norm": 9.869256019592285, "learning_rate": 9.999995108746413e-06, "loss": 0.5359, "step": 95 }, { "epoch": 0.010578512396694216, "grad_norm": 15.801307678222656, "learning_rate": 9.999992357416972e-06, "loss": 0.5176, "step": 96 }, { "epoch": 0.01068870523415978, "grad_norm": 19.859272003173828, "learning_rate": 9.999988994681672e-06, "loss": 0.5891, "step": 97 }, { "epoch": 0.010798898071625344, "grad_norm": 11.07324504852295, "learning_rate": 9.999985020540928e-06, "loss": 0.6453, "step": 98 }, { "epoch": 0.01090909090909091, "grad_norm": 9.245484352111816, "learning_rate": 9.999980434995223e-06, "loss": 0.5534, "step": 99 }, { "epoch": 0.011019283746556474, "grad_norm": 10.798912048339844, "learning_rate": 9.999975238045117e-06, "loss": 0.5368, "step": 100 }, { "epoch": 0.011129476584022038, "grad_norm": 17.250713348388672, "learning_rate": 9.99996942969125e-06, "loss": 0.609, "step": 101 }, { "epoch": 0.011239669421487603, "grad_norm": 14.687119483947754, "learning_rate": 9.999963009934327e-06, "loss": 0.5903, "step": 102 }, { "epoch": 0.011349862258953168, "grad_norm": 13.010708808898926, "learning_rate": 9.999955978775135e-06, "loss": 0.4922, "step": 103 }, { "epoch": 0.011460055096418733, "grad_norm": 12.1796875, "learning_rate": 9.999948336214536e-06, "loss": 0.5215, "step": 104 }, { "epoch": 0.011570247933884297, "grad_norm": 22.230316162109375, "learning_rate": 9.999940082253462e-06, "loss": 0.4579, "step": 105 }, { "epoch": 0.011680440771349863, "grad_norm": 11.79276180267334, "learning_rate": 9.999931216892924e-06, "loss": 0.5166, "step": 106 }, { "epoch": 0.011790633608815427, "grad_norm": 13.385542869567871, "learning_rate": 9.999921740134003e-06, "loss": 0.6159, "step": 107 }, { "epoch": 0.011900826446280991, "grad_norm": 15.622406005859375, "learning_rate": 9.99991165197786e-06, "loss": 0.5958, "step": 108 }, { "epoch": 0.012011019283746557, "grad_norm": 11.1148099899292, "learning_rate": 9.999900952425729e-06, "loss": 0.5128, "step": 109 }, { "epoch": 0.012121212121212121, "grad_norm": 11.416248321533203, "learning_rate": 9.999889641478919e-06, "loss": 0.5847, "step": 110 }, { "epoch": 0.012231404958677685, "grad_norm": 14.961265563964844, "learning_rate": 9.999877719138812e-06, "loss": 0.53, "step": 111 }, { "epoch": 0.012341597796143251, "grad_norm": 11.567359924316406, "learning_rate": 9.999865185406865e-06, "loss": 0.4921, "step": 112 }, { "epoch": 0.012451790633608815, "grad_norm": 13.228514671325684, "learning_rate": 9.999852040284612e-06, "loss": 0.5288, "step": 113 }, { "epoch": 0.01256198347107438, "grad_norm": 19.065542221069336, "learning_rate": 9.999838283773658e-06, "loss": 0.49, "step": 114 }, { "epoch": 0.012672176308539946, "grad_norm": 13.72010612487793, "learning_rate": 9.999823915875689e-06, "loss": 0.4847, "step": 115 }, { "epoch": 0.01278236914600551, "grad_norm": 9.950152397155762, "learning_rate": 9.999808936592459e-06, "loss": 0.6273, "step": 116 }, { "epoch": 0.012892561983471074, "grad_norm": 10.342408180236816, "learning_rate": 9.9997933459258e-06, "loss": 0.5014, "step": 117 }, { "epoch": 0.01300275482093664, "grad_norm": 14.401764869689941, "learning_rate": 9.999777143877622e-06, "loss": 0.5781, "step": 118 }, { "epoch": 0.013112947658402204, "grad_norm": 15.089703559875488, "learning_rate": 9.999760330449902e-06, "loss": 0.611, "step": 119 }, { "epoch": 0.013223140495867768, "grad_norm": 16.48484992980957, "learning_rate": 9.999742905644697e-06, "loss": 0.6119, "step": 120 }, { "epoch": 0.013333333333333334, "grad_norm": 14.737231254577637, "learning_rate": 9.999724869464136e-06, "loss": 0.6894, "step": 121 }, { "epoch": 0.013443526170798898, "grad_norm": 12.05566120147705, "learning_rate": 9.999706221910428e-06, "loss": 0.5635, "step": 122 }, { "epoch": 0.013553719008264463, "grad_norm": 10.84809398651123, "learning_rate": 9.999686962985852e-06, "loss": 0.5556, "step": 123 }, { "epoch": 0.013663911845730027, "grad_norm": 9.9436616897583, "learning_rate": 9.999667092692763e-06, "loss": 0.5408, "step": 124 }, { "epoch": 0.013774104683195593, "grad_norm": 18.544424057006836, "learning_rate": 9.99964661103359e-06, "loss": 0.5044, "step": 125 }, { "epoch": 0.013884297520661157, "grad_norm": 22.588855743408203, "learning_rate": 9.999625518010837e-06, "loss": 0.6151, "step": 126 }, { "epoch": 0.013994490358126721, "grad_norm": 18.303138732910156, "learning_rate": 9.999603813627087e-06, "loss": 0.5957, "step": 127 }, { "epoch": 0.014104683195592287, "grad_norm": 9.987324714660645, "learning_rate": 9.999581497884992e-06, "loss": 0.5337, "step": 128 }, { "epoch": 0.014214876033057851, "grad_norm": 8.876851081848145, "learning_rate": 9.999558570787277e-06, "loss": 0.5816, "step": 129 }, { "epoch": 0.014325068870523415, "grad_norm": 16.323097229003906, "learning_rate": 9.999535032336749e-06, "loss": 0.4789, "step": 130 }, { "epoch": 0.014435261707988981, "grad_norm": 12.875208854675293, "learning_rate": 9.999510882536288e-06, "loss": 0.5265, "step": 131 }, { "epoch": 0.014545454545454545, "grad_norm": 14.685235977172852, "learning_rate": 9.999486121388844e-06, "loss": 0.4931, "step": 132 }, { "epoch": 0.01465564738292011, "grad_norm": 9.357027053833008, "learning_rate": 9.999460748897447e-06, "loss": 0.5076, "step": 133 }, { "epoch": 0.014765840220385676, "grad_norm": 21.911592483520508, "learning_rate": 9.999434765065197e-06, "loss": 0.5248, "step": 134 }, { "epoch": 0.01487603305785124, "grad_norm": 11.090331077575684, "learning_rate": 9.999408169895273e-06, "loss": 0.5696, "step": 135 }, { "epoch": 0.014986225895316804, "grad_norm": 13.339033126831055, "learning_rate": 9.999380963390929e-06, "loss": 0.5207, "step": 136 }, { "epoch": 0.01509641873278237, "grad_norm": 24.932645797729492, "learning_rate": 9.999353145555486e-06, "loss": 0.6721, "step": 137 }, { "epoch": 0.015206611570247934, "grad_norm": 11.57662296295166, "learning_rate": 9.999324716392352e-06, "loss": 0.4752, "step": 138 }, { "epoch": 0.015316804407713498, "grad_norm": 16.003211975097656, "learning_rate": 9.999295675905001e-06, "loss": 0.518, "step": 139 }, { "epoch": 0.015426997245179064, "grad_norm": 13.036341667175293, "learning_rate": 9.999266024096982e-06, "loss": 0.5324, "step": 140 }, { "epoch": 0.015537190082644628, "grad_norm": 11.445280075073242, "learning_rate": 9.999235760971925e-06, "loss": 0.5302, "step": 141 }, { "epoch": 0.015647382920110194, "grad_norm": 12.373127937316895, "learning_rate": 9.999204886533527e-06, "loss": 0.4233, "step": 142 }, { "epoch": 0.01575757575757576, "grad_norm": 13.878166198730469, "learning_rate": 9.999173400785564e-06, "loss": 0.5741, "step": 143 }, { "epoch": 0.015867768595041323, "grad_norm": 44.70547866821289, "learning_rate": 9.999141303731889e-06, "loss": 0.5027, "step": 144 }, { "epoch": 0.015977961432506887, "grad_norm": 19.48044776916504, "learning_rate": 9.999108595376424e-06, "loss": 0.5468, "step": 145 }, { "epoch": 0.01608815426997245, "grad_norm": 21.51380157470703, "learning_rate": 9.999075275723169e-06, "loss": 0.6482, "step": 146 }, { "epoch": 0.016198347107438015, "grad_norm": 13.136129379272461, "learning_rate": 9.999041344776198e-06, "loss": 0.6616, "step": 147 }, { "epoch": 0.016308539944903583, "grad_norm": 8.277416229248047, "learning_rate": 9.999006802539662e-06, "loss": 0.6248, "step": 148 }, { "epoch": 0.016418732782369147, "grad_norm": 12.341249465942383, "learning_rate": 9.998971649017784e-06, "loss": 0.5269, "step": 149 }, { "epoch": 0.01652892561983471, "grad_norm": 12.141154289245605, "learning_rate": 9.99893588421486e-06, "loss": 0.5146, "step": 150 }, { "epoch": 0.016639118457300275, "grad_norm": 14.272335052490234, "learning_rate": 9.998899508135267e-06, "loss": 0.5359, "step": 151 }, { "epoch": 0.01674931129476584, "grad_norm": 11.248027801513672, "learning_rate": 9.998862520783452e-06, "loss": 0.5398, "step": 152 }, { "epoch": 0.016859504132231404, "grad_norm": 7.830145835876465, "learning_rate": 9.998824922163938e-06, "loss": 0.4999, "step": 153 }, { "epoch": 0.01696969696969697, "grad_norm": 14.68532943725586, "learning_rate": 9.998786712281322e-06, "loss": 0.497, "step": 154 }, { "epoch": 0.017079889807162536, "grad_norm": 23.5339298248291, "learning_rate": 9.998747891140277e-06, "loss": 0.6275, "step": 155 }, { "epoch": 0.0171900826446281, "grad_norm": 22.34769058227539, "learning_rate": 9.99870845874555e-06, "loss": 0.6367, "step": 156 }, { "epoch": 0.017300275482093664, "grad_norm": 20.661041259765625, "learning_rate": 9.99866841510196e-06, "loss": 0.5138, "step": 157 }, { "epoch": 0.017410468319559228, "grad_norm": 11.57186508178711, "learning_rate": 9.99862776021441e-06, "loss": 0.5041, "step": 158 }, { "epoch": 0.017520661157024792, "grad_norm": 11.123800277709961, "learning_rate": 9.998586494087865e-06, "loss": 0.4916, "step": 159 }, { "epoch": 0.017630853994490357, "grad_norm": 18.798917770385742, "learning_rate": 9.998544616727374e-06, "loss": 0.4933, "step": 160 }, { "epoch": 0.017741046831955924, "grad_norm": 18.14601707458496, "learning_rate": 9.998502128138056e-06, "loss": 0.4565, "step": 161 }, { "epoch": 0.01785123966942149, "grad_norm": 13.255478858947754, "learning_rate": 9.99845902832511e-06, "loss": 0.516, "step": 162 }, { "epoch": 0.017961432506887053, "grad_norm": 15.851000785827637, "learning_rate": 9.998415317293805e-06, "loss": 0.5405, "step": 163 }, { "epoch": 0.018071625344352617, "grad_norm": 10.505412101745605, "learning_rate": 9.998370995049485e-06, "loss": 0.5449, "step": 164 }, { "epoch": 0.01818181818181818, "grad_norm": 10.862316131591797, "learning_rate": 9.998326061597567e-06, "loss": 0.4108, "step": 165 }, { "epoch": 0.018292011019283745, "grad_norm": 10.101140022277832, "learning_rate": 9.998280516943553e-06, "loss": 0.4293, "step": 166 }, { "epoch": 0.018402203856749313, "grad_norm": 14.068907737731934, "learning_rate": 9.998234361093005e-06, "loss": 0.4723, "step": 167 }, { "epoch": 0.018512396694214877, "grad_norm": 15.875856399536133, "learning_rate": 9.99818759405157e-06, "loss": 0.5235, "step": 168 }, { "epoch": 0.01862258953168044, "grad_norm": 18.25687026977539, "learning_rate": 9.998140215824967e-06, "loss": 0.3877, "step": 169 }, { "epoch": 0.018732782369146005, "grad_norm": 21.297866821289062, "learning_rate": 9.99809222641899e-06, "loss": 0.5717, "step": 170 }, { "epoch": 0.01884297520661157, "grad_norm": 11.994351387023926, "learning_rate": 9.998043625839506e-06, "loss": 0.51, "step": 171 }, { "epoch": 0.018953168044077134, "grad_norm": 7.859072685241699, "learning_rate": 9.997994414092458e-06, "loss": 0.4668, "step": 172 }, { "epoch": 0.0190633608815427, "grad_norm": 18.620986938476562, "learning_rate": 9.997944591183864e-06, "loss": 0.5655, "step": 173 }, { "epoch": 0.019173553719008266, "grad_norm": 8.813560485839844, "learning_rate": 9.997894157119816e-06, "loss": 0.6078, "step": 174 }, { "epoch": 0.01928374655647383, "grad_norm": 7.580974578857422, "learning_rate": 9.997843111906482e-06, "loss": 0.3836, "step": 175 }, { "epoch": 0.019393939393939394, "grad_norm": 14.975537300109863, "learning_rate": 9.997791455550102e-06, "loss": 0.615, "step": 176 }, { "epoch": 0.019504132231404958, "grad_norm": 18.99844741821289, "learning_rate": 9.997739188056995e-06, "loss": 0.5388, "step": 177 }, { "epoch": 0.019614325068870522, "grad_norm": 11.795282363891602, "learning_rate": 9.997686309433552e-06, "loss": 0.454, "step": 178 }, { "epoch": 0.019724517906336086, "grad_norm": 10.889883041381836, "learning_rate": 9.997632819686238e-06, "loss": 0.5309, "step": 179 }, { "epoch": 0.019834710743801654, "grad_norm": 10.928999900817871, "learning_rate": 9.997578718821594e-06, "loss": 0.5196, "step": 180 }, { "epoch": 0.01994490358126722, "grad_norm": 13.715165138244629, "learning_rate": 9.997524006846235e-06, "loss": 0.5389, "step": 181 }, { "epoch": 0.020055096418732783, "grad_norm": 13.190074920654297, "learning_rate": 9.997468683766853e-06, "loss": 0.5283, "step": 182 }, { "epoch": 0.020165289256198347, "grad_norm": 10.726973533630371, "learning_rate": 9.997412749590212e-06, "loss": 0.593, "step": 183 }, { "epoch": 0.02027548209366391, "grad_norm": 10.734817504882812, "learning_rate": 9.997356204323153e-06, "loss": 0.5979, "step": 184 }, { "epoch": 0.020385674931129475, "grad_norm": 9.629424095153809, "learning_rate": 9.997299047972586e-06, "loss": 0.5061, "step": 185 }, { "epoch": 0.020495867768595043, "grad_norm": 19.34947395324707, "learning_rate": 9.997241280545505e-06, "loss": 0.5885, "step": 186 }, { "epoch": 0.020606060606060607, "grad_norm": 15.34253215789795, "learning_rate": 9.997182902048973e-06, "loss": 0.5997, "step": 187 }, { "epoch": 0.02071625344352617, "grad_norm": 11.760794639587402, "learning_rate": 9.997123912490126e-06, "loss": 0.4872, "step": 188 }, { "epoch": 0.020826446280991735, "grad_norm": 8.705877304077148, "learning_rate": 9.997064311876179e-06, "loss": 0.5161, "step": 189 }, { "epoch": 0.0209366391184573, "grad_norm": 14.717255592346191, "learning_rate": 9.99700410021442e-06, "loss": 0.6922, "step": 190 }, { "epoch": 0.021046831955922864, "grad_norm": 11.279083251953125, "learning_rate": 9.996943277512214e-06, "loss": 0.5453, "step": 191 }, { "epoch": 0.02115702479338843, "grad_norm": 15.692731857299805, "learning_rate": 9.996881843776994e-06, "loss": 0.5641, "step": 192 }, { "epoch": 0.021267217630853995, "grad_norm": 13.773262023925781, "learning_rate": 9.996819799016275e-06, "loss": 0.5335, "step": 193 }, { "epoch": 0.02137741046831956, "grad_norm": 11.219247817993164, "learning_rate": 9.996757143237645e-06, "loss": 0.5444, "step": 194 }, { "epoch": 0.021487603305785124, "grad_norm": 15.12265396118164, "learning_rate": 9.996693876448761e-06, "loss": 0.5041, "step": 195 }, { "epoch": 0.021597796143250688, "grad_norm": 14.708301544189453, "learning_rate": 9.996629998657365e-06, "loss": 0.5643, "step": 196 }, { "epoch": 0.021707988980716252, "grad_norm": 11.828309059143066, "learning_rate": 9.996565509871265e-06, "loss": 0.6107, "step": 197 }, { "epoch": 0.02181818181818182, "grad_norm": 22.66375732421875, "learning_rate": 9.996500410098347e-06, "loss": 0.6399, "step": 198 }, { "epoch": 0.021928374655647384, "grad_norm": 13.554588317871094, "learning_rate": 9.996434699346574e-06, "loss": 0.5173, "step": 199 }, { "epoch": 0.02203856749311295, "grad_norm": 12.405920028686523, "learning_rate": 9.996368377623975e-06, "loss": 0.5303, "step": 200 }, { "epoch": 0.022148760330578512, "grad_norm": 9.011127471923828, "learning_rate": 9.996301444938668e-06, "loss": 0.4698, "step": 201 }, { "epoch": 0.022258953168044077, "grad_norm": 11.63690185546875, "learning_rate": 9.99623390129883e-06, "loss": 0.5527, "step": 202 }, { "epoch": 0.02236914600550964, "grad_norm": 8.766032218933105, "learning_rate": 9.996165746712725e-06, "loss": 0.5798, "step": 203 }, { "epoch": 0.022479338842975205, "grad_norm": 8.39252758026123, "learning_rate": 9.996096981188687e-06, "loss": 0.5477, "step": 204 }, { "epoch": 0.022589531680440773, "grad_norm": 6.701710224151611, "learning_rate": 9.996027604735122e-06, "loss": 0.4578, "step": 205 }, { "epoch": 0.022699724517906337, "grad_norm": 8.533823013305664, "learning_rate": 9.995957617360515e-06, "loss": 0.5107, "step": 206 }, { "epoch": 0.0228099173553719, "grad_norm": 11.137310028076172, "learning_rate": 9.995887019073427e-06, "loss": 0.5688, "step": 207 }, { "epoch": 0.022920110192837465, "grad_norm": 8.286704063415527, "learning_rate": 9.995815809882485e-06, "loss": 0.5068, "step": 208 }, { "epoch": 0.02303030303030303, "grad_norm": 11.508268356323242, "learning_rate": 9.9957439897964e-06, "loss": 0.5123, "step": 209 }, { "epoch": 0.023140495867768594, "grad_norm": 9.160122871398926, "learning_rate": 9.995671558823955e-06, "loss": 0.445, "step": 210 }, { "epoch": 0.02325068870523416, "grad_norm": 17.174598693847656, "learning_rate": 9.995598516974005e-06, "loss": 0.5242, "step": 211 }, { "epoch": 0.023360881542699725, "grad_norm": 11.791619300842285, "learning_rate": 9.995524864255484e-06, "loss": 0.4747, "step": 212 }, { "epoch": 0.02347107438016529, "grad_norm": 12.3840970993042, "learning_rate": 9.995450600677395e-06, "loss": 0.5269, "step": 213 }, { "epoch": 0.023581267217630854, "grad_norm": 9.73430061340332, "learning_rate": 9.995375726248821e-06, "loss": 0.5148, "step": 214 }, { "epoch": 0.023691460055096418, "grad_norm": 8.56908130645752, "learning_rate": 9.995300240978918e-06, "loss": 0.4607, "step": 215 }, { "epoch": 0.023801652892561982, "grad_norm": 13.244146347045898, "learning_rate": 9.995224144876916e-06, "loss": 0.5644, "step": 216 }, { "epoch": 0.02391184573002755, "grad_norm": 8.972795486450195, "learning_rate": 9.995147437952121e-06, "loss": 0.5192, "step": 217 }, { "epoch": 0.024022038567493114, "grad_norm": 8.984622955322266, "learning_rate": 9.995070120213913e-06, "loss": 0.5259, "step": 218 }, { "epoch": 0.024132231404958678, "grad_norm": 12.068046569824219, "learning_rate": 9.994992191671743e-06, "loss": 0.5816, "step": 219 }, { "epoch": 0.024242424242424242, "grad_norm": 11.006875991821289, "learning_rate": 9.994913652335144e-06, "loss": 0.4294, "step": 220 }, { "epoch": 0.024352617079889807, "grad_norm": 13.247452735900879, "learning_rate": 9.994834502213718e-06, "loss": 0.5899, "step": 221 }, { "epoch": 0.02446280991735537, "grad_norm": 17.99578094482422, "learning_rate": 9.994754741317146e-06, "loss": 0.6063, "step": 222 }, { "epoch": 0.024573002754820935, "grad_norm": 15.888850212097168, "learning_rate": 9.994674369655178e-06, "loss": 0.5367, "step": 223 }, { "epoch": 0.024683195592286503, "grad_norm": 10.986605644226074, "learning_rate": 9.994593387237643e-06, "loss": 0.4896, "step": 224 }, { "epoch": 0.024793388429752067, "grad_norm": 9.367854118347168, "learning_rate": 9.994511794074446e-06, "loss": 0.4505, "step": 225 }, { "epoch": 0.02490358126721763, "grad_norm": 15.278581619262695, "learning_rate": 9.99442959017556e-06, "loss": 0.5066, "step": 226 }, { "epoch": 0.025013774104683195, "grad_norm": 9.596366882324219, "learning_rate": 9.99434677555104e-06, "loss": 0.5934, "step": 227 }, { "epoch": 0.02512396694214876, "grad_norm": 7.759309768676758, "learning_rate": 9.994263350211014e-06, "loss": 0.5601, "step": 228 }, { "epoch": 0.025234159779614324, "grad_norm": 11.995270729064941, "learning_rate": 9.994179314165681e-06, "loss": 0.5547, "step": 229 }, { "epoch": 0.02534435261707989, "grad_norm": 14.3855619430542, "learning_rate": 9.994094667425316e-06, "loss": 0.5354, "step": 230 }, { "epoch": 0.025454545454545455, "grad_norm": 11.691323280334473, "learning_rate": 9.994009410000273e-06, "loss": 0.4525, "step": 231 }, { "epoch": 0.02556473829201102, "grad_norm": 7.918416976928711, "learning_rate": 9.993923541900974e-06, "loss": 0.5041, "step": 232 }, { "epoch": 0.025674931129476584, "grad_norm": 15.92847728729248, "learning_rate": 9.993837063137923e-06, "loss": 0.5382, "step": 233 }, { "epoch": 0.025785123966942148, "grad_norm": 18.090808868408203, "learning_rate": 9.99374997372169e-06, "loss": 0.5493, "step": 234 }, { "epoch": 0.025895316804407712, "grad_norm": 24.221498489379883, "learning_rate": 9.993662273662928e-06, "loss": 0.6946, "step": 235 }, { "epoch": 0.02600550964187328, "grad_norm": 14.936357498168945, "learning_rate": 9.99357396297236e-06, "loss": 0.5798, "step": 236 }, { "epoch": 0.026115702479338844, "grad_norm": 11.742161750793457, "learning_rate": 9.993485041660784e-06, "loss": 0.5504, "step": 237 }, { "epoch": 0.026225895316804408, "grad_norm": 15.549546241760254, "learning_rate": 9.993395509739076e-06, "loss": 0.5448, "step": 238 }, { "epoch": 0.026336088154269972, "grad_norm": 12.071475982666016, "learning_rate": 9.99330536721818e-06, "loss": 0.5485, "step": 239 }, { "epoch": 0.026446280991735537, "grad_norm": 14.900094985961914, "learning_rate": 9.993214614109122e-06, "loss": 0.4189, "step": 240 }, { "epoch": 0.0265564738292011, "grad_norm": 13.360709190368652, "learning_rate": 9.993123250422998e-06, "loss": 0.4946, "step": 241 }, { "epoch": 0.02666666666666667, "grad_norm": 14.573027610778809, "learning_rate": 9.993031276170981e-06, "loss": 0.5292, "step": 242 }, { "epoch": 0.026776859504132233, "grad_norm": 11.061827659606934, "learning_rate": 9.992938691364317e-06, "loss": 0.5122, "step": 243 }, { "epoch": 0.026887052341597797, "grad_norm": 11.963610649108887, "learning_rate": 9.992845496014327e-06, "loss": 0.4796, "step": 244 }, { "epoch": 0.02699724517906336, "grad_norm": 9.625136375427246, "learning_rate": 9.99275169013241e-06, "loss": 0.4729, "step": 245 }, { "epoch": 0.027107438016528925, "grad_norm": 11.104870796203613, "learning_rate": 9.992657273730031e-06, "loss": 0.5244, "step": 246 }, { "epoch": 0.02721763085399449, "grad_norm": 24.728092193603516, "learning_rate": 9.99256224681874e-06, "loss": 0.5564, "step": 247 }, { "epoch": 0.027327823691460053, "grad_norm": 19.725374221801758, "learning_rate": 9.992466609410156e-06, "loss": 0.5839, "step": 248 }, { "epoch": 0.02743801652892562, "grad_norm": 12.868062973022461, "learning_rate": 9.992370361515973e-06, "loss": 0.4347, "step": 249 }, { "epoch": 0.027548209366391185, "grad_norm": 22.5147705078125, "learning_rate": 9.99227350314796e-06, "loss": 0.532, "step": 250 }, { "epoch": 0.02765840220385675, "grad_norm": 7.776773929595947, "learning_rate": 9.992176034317963e-06, "loss": 0.4972, "step": 251 }, { "epoch": 0.027768595041322314, "grad_norm": 9.029813766479492, "learning_rate": 9.9920779550379e-06, "loss": 0.5039, "step": 252 }, { "epoch": 0.027878787878787878, "grad_norm": 13.0871000289917, "learning_rate": 9.991979265319762e-06, "loss": 0.6035, "step": 253 }, { "epoch": 0.027988980716253442, "grad_norm": 12.87193489074707, "learning_rate": 9.99187996517562e-06, "loss": 0.5451, "step": 254 }, { "epoch": 0.02809917355371901, "grad_norm": 16.561864852905273, "learning_rate": 9.991780054617613e-06, "loss": 0.4178, "step": 255 }, { "epoch": 0.028209366391184574, "grad_norm": 8.838342666625977, "learning_rate": 9.991679533657962e-06, "loss": 0.5038, "step": 256 }, { "epoch": 0.028319559228650138, "grad_norm": 10.147689819335938, "learning_rate": 9.991578402308957e-06, "loss": 0.5399, "step": 257 }, { "epoch": 0.028429752066115702, "grad_norm": 9.908218383789062, "learning_rate": 9.991476660582964e-06, "loss": 0.512, "step": 258 }, { "epoch": 0.028539944903581266, "grad_norm": 9.494897842407227, "learning_rate": 9.991374308492424e-06, "loss": 0.4804, "step": 259 }, { "epoch": 0.02865013774104683, "grad_norm": 12.115522384643555, "learning_rate": 9.991271346049855e-06, "loss": 0.592, "step": 260 }, { "epoch": 0.0287603305785124, "grad_norm": 9.767868995666504, "learning_rate": 9.991167773267845e-06, "loss": 0.4458, "step": 261 }, { "epoch": 0.028870523415977963, "grad_norm": 15.636098861694336, "learning_rate": 9.99106359015906e-06, "loss": 0.6042, "step": 262 }, { "epoch": 0.028980716253443527, "grad_norm": 14.885655403137207, "learning_rate": 9.990958796736239e-06, "loss": 0.4564, "step": 263 }, { "epoch": 0.02909090909090909, "grad_norm": 15.355427742004395, "learning_rate": 9.990853393012196e-06, "loss": 0.4221, "step": 264 }, { "epoch": 0.029201101928374655, "grad_norm": 15.80118179321289, "learning_rate": 9.990747378999823e-06, "loss": 0.5817, "step": 265 }, { "epoch": 0.02931129476584022, "grad_norm": 10.542325019836426, "learning_rate": 9.99064075471208e-06, "loss": 0.572, "step": 266 }, { "epoch": 0.029421487603305783, "grad_norm": 11.173530578613281, "learning_rate": 9.990533520162007e-06, "loss": 0.5448, "step": 267 }, { "epoch": 0.02953168044077135, "grad_norm": 12.251981735229492, "learning_rate": 9.990425675362715e-06, "loss": 0.5747, "step": 268 }, { "epoch": 0.029641873278236915, "grad_norm": 11.749959945678711, "learning_rate": 9.990317220327393e-06, "loss": 0.6011, "step": 269 }, { "epoch": 0.02975206611570248, "grad_norm": 8.856796264648438, "learning_rate": 9.990208155069303e-06, "loss": 0.5938, "step": 270 }, { "epoch": 0.029862258953168044, "grad_norm": 14.874869346618652, "learning_rate": 9.99009847960178e-06, "loss": 0.5323, "step": 271 }, { "epoch": 0.029972451790633608, "grad_norm": 7.231379985809326, "learning_rate": 9.989988193938239e-06, "loss": 0.4536, "step": 272 }, { "epoch": 0.030082644628099172, "grad_norm": 12.203042984008789, "learning_rate": 9.989877298092161e-06, "loss": 0.5048, "step": 273 }, { "epoch": 0.03019283746556474, "grad_norm": 15.385665893554688, "learning_rate": 9.98976579207711e-06, "loss": 0.5121, "step": 274 }, { "epoch": 0.030303030303030304, "grad_norm": 14.260980606079102, "learning_rate": 9.989653675906722e-06, "loss": 0.5526, "step": 275 }, { "epoch": 0.030413223140495868, "grad_norm": 18.089174270629883, "learning_rate": 9.989540949594701e-06, "loss": 0.6646, "step": 276 }, { "epoch": 0.030523415977961432, "grad_norm": 9.740517616271973, "learning_rate": 9.989427613154838e-06, "loss": 0.513, "step": 277 }, { "epoch": 0.030633608815426996, "grad_norm": 13.526970863342285, "learning_rate": 9.989313666600987e-06, "loss": 0.4703, "step": 278 }, { "epoch": 0.03074380165289256, "grad_norm": 13.345050811767578, "learning_rate": 9.989199109947084e-06, "loss": 0.4925, "step": 279 }, { "epoch": 0.03085399449035813, "grad_norm": 18.05704116821289, "learning_rate": 9.989083943207137e-06, "loss": 0.5549, "step": 280 }, { "epoch": 0.030964187327823692, "grad_norm": 10.237497329711914, "learning_rate": 9.98896816639523e-06, "loss": 0.5152, "step": 281 }, { "epoch": 0.031074380165289257, "grad_norm": 7.69012975692749, "learning_rate": 9.988851779525515e-06, "loss": 0.5107, "step": 282 }, { "epoch": 0.03118457300275482, "grad_norm": 10.806635856628418, "learning_rate": 9.98873478261223e-06, "loss": 0.5326, "step": 283 }, { "epoch": 0.03129476584022039, "grad_norm": 9.593619346618652, "learning_rate": 9.98861717566968e-06, "loss": 0.3565, "step": 284 }, { "epoch": 0.03140495867768595, "grad_norm": 9.333944320678711, "learning_rate": 9.988498958712245e-06, "loss": 0.55, "step": 285 }, { "epoch": 0.03151515151515152, "grad_norm": 14.35074520111084, "learning_rate": 9.98838013175438e-06, "loss": 0.5326, "step": 286 }, { "epoch": 0.03162534435261708, "grad_norm": 15.526893615722656, "learning_rate": 9.988260694810616e-06, "loss": 0.4703, "step": 287 }, { "epoch": 0.031735537190082645, "grad_norm": 12.787149429321289, "learning_rate": 9.988140647895562e-06, "loss": 0.4842, "step": 288 }, { "epoch": 0.03184573002754821, "grad_norm": 17.571022033691406, "learning_rate": 9.98801999102389e-06, "loss": 0.5092, "step": 289 }, { "epoch": 0.031955922865013774, "grad_norm": 10.923630714416504, "learning_rate": 9.987898724210359e-06, "loss": 0.506, "step": 290 }, { "epoch": 0.03206611570247934, "grad_norm": 15.550124168395996, "learning_rate": 9.987776847469797e-06, "loss": 0.4913, "step": 291 }, { "epoch": 0.0321763085399449, "grad_norm": 10.694622039794922, "learning_rate": 9.987654360817106e-06, "loss": 0.4885, "step": 292 }, { "epoch": 0.032286501377410466, "grad_norm": 9.674568176269531, "learning_rate": 9.987531264267265e-06, "loss": 0.4923, "step": 293 }, { "epoch": 0.03239669421487603, "grad_norm": 9.947579383850098, "learning_rate": 9.987407557835327e-06, "loss": 0.4283, "step": 294 }, { "epoch": 0.032506887052341595, "grad_norm": 10.365708351135254, "learning_rate": 9.987283241536419e-06, "loss": 0.3912, "step": 295 }, { "epoch": 0.032617079889807166, "grad_norm": 11.711014747619629, "learning_rate": 9.987158315385738e-06, "loss": 0.4957, "step": 296 }, { "epoch": 0.03272727272727273, "grad_norm": 13.972429275512695, "learning_rate": 9.987032779398566e-06, "loss": 0.5765, "step": 297 }, { "epoch": 0.032837465564738294, "grad_norm": 11.309210777282715, "learning_rate": 9.986906633590252e-06, "loss": 0.4206, "step": 298 }, { "epoch": 0.03294765840220386, "grad_norm": 14.466939926147461, "learning_rate": 9.986779877976221e-06, "loss": 0.4687, "step": 299 }, { "epoch": 0.03305785123966942, "grad_norm": 11.72865104675293, "learning_rate": 9.986652512571972e-06, "loss": 0.5604, "step": 300 }, { "epoch": 0.03316804407713499, "grad_norm": 12.25109577178955, "learning_rate": 9.98652453739308e-06, "loss": 0.4229, "step": 301 }, { "epoch": 0.03327823691460055, "grad_norm": 10.369767189025879, "learning_rate": 9.986395952455194e-06, "loss": 0.4978, "step": 302 }, { "epoch": 0.033388429752066115, "grad_norm": 14.900583267211914, "learning_rate": 9.986266757774038e-06, "loss": 0.4373, "step": 303 }, { "epoch": 0.03349862258953168, "grad_norm": 10.342029571533203, "learning_rate": 9.986136953365409e-06, "loss": 0.4624, "step": 304 }, { "epoch": 0.03360881542699724, "grad_norm": 10.64696216583252, "learning_rate": 9.986006539245181e-06, "loss": 0.4459, "step": 305 }, { "epoch": 0.03371900826446281, "grad_norm": 11.714919090270996, "learning_rate": 9.9858755154293e-06, "loss": 0.5102, "step": 306 }, { "epoch": 0.03382920110192837, "grad_norm": 9.903362274169922, "learning_rate": 9.985743881933789e-06, "loss": 0.5264, "step": 307 }, { "epoch": 0.03393939393939394, "grad_norm": 7.611393451690674, "learning_rate": 9.985611638774744e-06, "loss": 0.5104, "step": 308 }, { "epoch": 0.03404958677685951, "grad_norm": 9.278596878051758, "learning_rate": 9.985478785968334e-06, "loss": 0.4472, "step": 309 }, { "epoch": 0.03415977961432507, "grad_norm": 14.639801025390625, "learning_rate": 9.985345323530806e-06, "loss": 0.4893, "step": 310 }, { "epoch": 0.034269972451790635, "grad_norm": 12.649656295776367, "learning_rate": 9.985211251478482e-06, "loss": 0.4276, "step": 311 }, { "epoch": 0.0343801652892562, "grad_norm": 16.50640106201172, "learning_rate": 9.985076569827752e-06, "loss": 0.4735, "step": 312 }, { "epoch": 0.034490358126721764, "grad_norm": 12.83117961883545, "learning_rate": 9.984941278595088e-06, "loss": 0.5901, "step": 313 }, { "epoch": 0.03460055096418733, "grad_norm": 14.302864074707031, "learning_rate": 9.984805377797033e-06, "loss": 0.4452, "step": 314 }, { "epoch": 0.03471074380165289, "grad_norm": 14.504166603088379, "learning_rate": 9.984668867450207e-06, "loss": 0.5536, "step": 315 }, { "epoch": 0.034820936639118456, "grad_norm": 17.524925231933594, "learning_rate": 9.9845317475713e-06, "loss": 0.582, "step": 316 }, { "epoch": 0.03493112947658402, "grad_norm": 15.99123764038086, "learning_rate": 9.984394018177079e-06, "loss": 0.5481, "step": 317 }, { "epoch": 0.035041322314049585, "grad_norm": 12.897750854492188, "learning_rate": 9.984255679284388e-06, "loss": 0.4505, "step": 318 }, { "epoch": 0.03515151515151515, "grad_norm": 11.626313209533691, "learning_rate": 9.984116730910141e-06, "loss": 0.5878, "step": 319 }, { "epoch": 0.03526170798898071, "grad_norm": 11.26834774017334, "learning_rate": 9.98397717307133e-06, "loss": 0.5804, "step": 320 }, { "epoch": 0.035371900826446284, "grad_norm": 11.204084396362305, "learning_rate": 9.983837005785022e-06, "loss": 0.5339, "step": 321 }, { "epoch": 0.03548209366391185, "grad_norm": 11.40246295928955, "learning_rate": 9.983696229068354e-06, "loss": 0.5181, "step": 322 }, { "epoch": 0.03559228650137741, "grad_norm": 12.926445960998535, "learning_rate": 9.98355484293854e-06, "loss": 0.5742, "step": 323 }, { "epoch": 0.03570247933884298, "grad_norm": 10.973544120788574, "learning_rate": 9.983412847412872e-06, "loss": 0.5005, "step": 324 }, { "epoch": 0.03581267217630854, "grad_norm": 7.6059956550598145, "learning_rate": 9.983270242508712e-06, "loss": 0.4821, "step": 325 }, { "epoch": 0.035922865013774105, "grad_norm": 15.33000659942627, "learning_rate": 9.983127028243497e-06, "loss": 0.503, "step": 326 }, { "epoch": 0.03603305785123967, "grad_norm": 21.503162384033203, "learning_rate": 9.98298320463474e-06, "loss": 0.5549, "step": 327 }, { "epoch": 0.036143250688705233, "grad_norm": 17.429031372070312, "learning_rate": 9.982838771700027e-06, "loss": 0.5355, "step": 328 }, { "epoch": 0.0362534435261708, "grad_norm": 9.791876792907715, "learning_rate": 9.982693729457023e-06, "loss": 0.4157, "step": 329 }, { "epoch": 0.03636363636363636, "grad_norm": 13.283187866210938, "learning_rate": 9.98254807792346e-06, "loss": 0.5341, "step": 330 }, { "epoch": 0.036473829201101926, "grad_norm": 28.495004653930664, "learning_rate": 9.982401817117149e-06, "loss": 0.4931, "step": 331 }, { "epoch": 0.03658402203856749, "grad_norm": 10.677569389343262, "learning_rate": 9.982254947055976e-06, "loss": 0.4693, "step": 332 }, { "epoch": 0.036694214876033054, "grad_norm": 14.609735488891602, "learning_rate": 9.982107467757902e-06, "loss": 0.5492, "step": 333 }, { "epoch": 0.036804407713498626, "grad_norm": 7.3842339515686035, "learning_rate": 9.981959379240957e-06, "loss": 0.4593, "step": 334 }, { "epoch": 0.03691460055096419, "grad_norm": 11.509512901306152, "learning_rate": 9.981810681523254e-06, "loss": 0.482, "step": 335 }, { "epoch": 0.037024793388429754, "grad_norm": 12.058653831481934, "learning_rate": 9.981661374622974e-06, "loss": 0.474, "step": 336 }, { "epoch": 0.03713498622589532, "grad_norm": 9.072152137756348, "learning_rate": 9.981511458558373e-06, "loss": 0.5597, "step": 337 }, { "epoch": 0.03724517906336088, "grad_norm": 12.551641464233398, "learning_rate": 9.981360933347783e-06, "loss": 0.4377, "step": 338 }, { "epoch": 0.037355371900826446, "grad_norm": 17.34666633605957, "learning_rate": 9.981209799009613e-06, "loss": 0.496, "step": 339 }, { "epoch": 0.03746556473829201, "grad_norm": 21.156679153442383, "learning_rate": 9.981058055562343e-06, "loss": 0.6343, "step": 340 }, { "epoch": 0.037575757575757575, "grad_norm": 10.933553695678711, "learning_rate": 9.980905703024525e-06, "loss": 0.5036, "step": 341 }, { "epoch": 0.03768595041322314, "grad_norm": 7.121741771697998, "learning_rate": 9.980752741414796e-06, "loss": 0.5217, "step": 342 }, { "epoch": 0.0377961432506887, "grad_norm": 7.140209674835205, "learning_rate": 9.980599170751852e-06, "loss": 0.4371, "step": 343 }, { "epoch": 0.03790633608815427, "grad_norm": 8.137228012084961, "learning_rate": 9.980444991054478e-06, "loss": 0.4995, "step": 344 }, { "epoch": 0.03801652892561983, "grad_norm": 7.849926948547363, "learning_rate": 9.980290202341525e-06, "loss": 0.5248, "step": 345 }, { "epoch": 0.0381267217630854, "grad_norm": 15.131035804748535, "learning_rate": 9.980134804631922e-06, "loss": 0.5769, "step": 346 }, { "epoch": 0.03823691460055097, "grad_norm": 9.500950813293457, "learning_rate": 9.97997879794467e-06, "loss": 0.5525, "step": 347 }, { "epoch": 0.03834710743801653, "grad_norm": 15.258380889892578, "learning_rate": 9.979822182298843e-06, "loss": 0.5377, "step": 348 }, { "epoch": 0.038457300275482095, "grad_norm": 20.885284423828125, "learning_rate": 9.9796649577136e-06, "loss": 0.6536, "step": 349 }, { "epoch": 0.03856749311294766, "grad_norm": 12.07435417175293, "learning_rate": 9.979507124208158e-06, "loss": 0.583, "step": 350 }, { "epoch": 0.038677685950413224, "grad_norm": 12.532731056213379, "learning_rate": 9.979348681801821e-06, "loss": 0.4871, "step": 351 }, { "epoch": 0.03878787878787879, "grad_norm": 9.900496482849121, "learning_rate": 9.979189630513966e-06, "loss": 0.5749, "step": 352 }, { "epoch": 0.03889807162534435, "grad_norm": 12.795638084411621, "learning_rate": 9.979029970364038e-06, "loss": 0.5578, "step": 353 }, { "epoch": 0.039008264462809916, "grad_norm": 20.84184455871582, "learning_rate": 9.978869701371562e-06, "loss": 0.6099, "step": 354 }, { "epoch": 0.03911845730027548, "grad_norm": 8.181387901306152, "learning_rate": 9.978708823556135e-06, "loss": 0.4692, "step": 355 }, { "epoch": 0.039228650137741045, "grad_norm": 9.632551193237305, "learning_rate": 9.97854733693743e-06, "loss": 0.3728, "step": 356 }, { "epoch": 0.03933884297520661, "grad_norm": 10.69709587097168, "learning_rate": 9.978385241535194e-06, "loss": 0.5292, "step": 357 }, { "epoch": 0.03944903581267217, "grad_norm": 8.557096481323242, "learning_rate": 9.978222537369249e-06, "loss": 0.4841, "step": 358 }, { "epoch": 0.039559228650137744, "grad_norm": 9.893890380859375, "learning_rate": 9.97805922445949e-06, "loss": 0.4939, "step": 359 }, { "epoch": 0.03966942148760331, "grad_norm": 10.700851440429688, "learning_rate": 9.977895302825886e-06, "loss": 0.3914, "step": 360 }, { "epoch": 0.03977961432506887, "grad_norm": 11.766791343688965, "learning_rate": 9.977730772488483e-06, "loss": 0.5544, "step": 361 }, { "epoch": 0.03988980716253444, "grad_norm": 13.675890922546387, "learning_rate": 9.977565633467401e-06, "loss": 0.6028, "step": 362 }, { "epoch": 0.04, "grad_norm": 7.105637073516846, "learning_rate": 9.97739988578283e-06, "loss": 0.4251, "step": 363 }, { "epoch": 0.040110192837465565, "grad_norm": 15.840269088745117, "learning_rate": 9.977233529455042e-06, "loss": 0.5866, "step": 364 }, { "epoch": 0.04022038567493113, "grad_norm": 10.23617172241211, "learning_rate": 9.977066564504374e-06, "loss": 0.5122, "step": 365 }, { "epoch": 0.04033057851239669, "grad_norm": 8.172809600830078, "learning_rate": 9.976898990951249e-06, "loss": 0.4953, "step": 366 }, { "epoch": 0.04044077134986226, "grad_norm": 11.545899391174316, "learning_rate": 9.976730808816153e-06, "loss": 0.5851, "step": 367 }, { "epoch": 0.04055096418732782, "grad_norm": 10.631319999694824, "learning_rate": 9.976562018119654e-06, "loss": 0.4278, "step": 368 }, { "epoch": 0.040661157024793386, "grad_norm": 14.287742614746094, "learning_rate": 9.976392618882391e-06, "loss": 0.5445, "step": 369 }, { "epoch": 0.04077134986225895, "grad_norm": 11.023226737976074, "learning_rate": 9.976222611125079e-06, "loss": 0.4763, "step": 370 }, { "epoch": 0.04088154269972452, "grad_norm": 10.713460922241211, "learning_rate": 9.976051994868506e-06, "loss": 0.5459, "step": 371 }, { "epoch": 0.040991735537190085, "grad_norm": 9.844223976135254, "learning_rate": 9.975880770133537e-06, "loss": 0.4924, "step": 372 }, { "epoch": 0.04110192837465565, "grad_norm": 16.06312370300293, "learning_rate": 9.975708936941107e-06, "loss": 0.5548, "step": 373 }, { "epoch": 0.041212121212121214, "grad_norm": 12.113286972045898, "learning_rate": 9.97553649531223e-06, "loss": 0.4829, "step": 374 }, { "epoch": 0.04132231404958678, "grad_norm": 11.757156372070312, "learning_rate": 9.975363445267993e-06, "loss": 0.4318, "step": 375 }, { "epoch": 0.04143250688705234, "grad_norm": 14.062914848327637, "learning_rate": 9.975189786829554e-06, "loss": 0.5562, "step": 376 }, { "epoch": 0.041542699724517906, "grad_norm": 11.511475563049316, "learning_rate": 9.975015520018149e-06, "loss": 0.5945, "step": 377 }, { "epoch": 0.04165289256198347, "grad_norm": 20.82297706604004, "learning_rate": 9.974840644855091e-06, "loss": 0.5634, "step": 378 }, { "epoch": 0.041763085399449035, "grad_norm": 13.976396560668945, "learning_rate": 9.974665161361759e-06, "loss": 0.5392, "step": 379 }, { "epoch": 0.0418732782369146, "grad_norm": 12.412278175354004, "learning_rate": 9.974489069559615e-06, "loss": 0.6214, "step": 380 }, { "epoch": 0.04198347107438016, "grad_norm": 15.933039665222168, "learning_rate": 9.97431236947019e-06, "loss": 0.5612, "step": 381 }, { "epoch": 0.04209366391184573, "grad_norm": 11.01435375213623, "learning_rate": 9.974135061115091e-06, "loss": 0.4561, "step": 382 }, { "epoch": 0.04220385674931129, "grad_norm": 6.11425256729126, "learning_rate": 9.973957144516002e-06, "loss": 0.4799, "step": 383 }, { "epoch": 0.04231404958677686, "grad_norm": 9.104811668395996, "learning_rate": 9.973778619694673e-06, "loss": 0.4598, "step": 384 }, { "epoch": 0.04242424242424243, "grad_norm": 11.529860496520996, "learning_rate": 9.973599486672942e-06, "loss": 0.4746, "step": 385 }, { "epoch": 0.04253443526170799, "grad_norm": 10.589902877807617, "learning_rate": 9.973419745472708e-06, "loss": 0.448, "step": 386 }, { "epoch": 0.042644628099173555, "grad_norm": 6.479124546051025, "learning_rate": 9.973239396115952e-06, "loss": 0.5251, "step": 387 }, { "epoch": 0.04275482093663912, "grad_norm": 38.29296112060547, "learning_rate": 9.973058438624727e-06, "loss": 0.5769, "step": 388 }, { "epoch": 0.042865013774104684, "grad_norm": 11.937189102172852, "learning_rate": 9.972876873021162e-06, "loss": 0.5102, "step": 389 }, { "epoch": 0.04297520661157025, "grad_norm": 12.845355987548828, "learning_rate": 9.972694699327456e-06, "loss": 0.549, "step": 390 }, { "epoch": 0.04308539944903581, "grad_norm": 7.100241661071777, "learning_rate": 9.972511917565889e-06, "loss": 0.4335, "step": 391 }, { "epoch": 0.043195592286501376, "grad_norm": 9.807534217834473, "learning_rate": 9.97232852775881e-06, "loss": 0.5071, "step": 392 }, { "epoch": 0.04330578512396694, "grad_norm": 9.859899520874023, "learning_rate": 9.972144529928644e-06, "loss": 0.4604, "step": 393 }, { "epoch": 0.043415977961432504, "grad_norm": 16.952932357788086, "learning_rate": 9.971959924097892e-06, "loss": 0.5819, "step": 394 }, { "epoch": 0.04352617079889807, "grad_norm": 14.593550682067871, "learning_rate": 9.971774710289124e-06, "loss": 0.5367, "step": 395 }, { "epoch": 0.04363636363636364, "grad_norm": 13.790374755859375, "learning_rate": 9.971588888524993e-06, "loss": 0.4683, "step": 396 }, { "epoch": 0.043746556473829204, "grad_norm": 15.670136451721191, "learning_rate": 9.971402458828218e-06, "loss": 0.5072, "step": 397 }, { "epoch": 0.04385674931129477, "grad_norm": 16.61530113220215, "learning_rate": 9.9712154212216e-06, "loss": 0.5691, "step": 398 }, { "epoch": 0.04396694214876033, "grad_norm": 9.777688980102539, "learning_rate": 9.971027775728007e-06, "loss": 0.4844, "step": 399 }, { "epoch": 0.0440771349862259, "grad_norm": 10.27904224395752, "learning_rate": 9.970839522370383e-06, "loss": 0.5481, "step": 400 }, { "epoch": 0.04418732782369146, "grad_norm": 7.96864128112793, "learning_rate": 9.970650661171751e-06, "loss": 0.4588, "step": 401 }, { "epoch": 0.044297520661157025, "grad_norm": 8.261791229248047, "learning_rate": 9.970461192155205e-06, "loss": 0.5101, "step": 402 }, { "epoch": 0.04440771349862259, "grad_norm": 11.426614761352539, "learning_rate": 9.970271115343911e-06, "loss": 0.5948, "step": 403 }, { "epoch": 0.04451790633608815, "grad_norm": 13.521474838256836, "learning_rate": 9.970080430761116e-06, "loss": 0.5626, "step": 404 }, { "epoch": 0.04462809917355372, "grad_norm": 8.598532676696777, "learning_rate": 9.969889138430133e-06, "loss": 0.4888, "step": 405 }, { "epoch": 0.04473829201101928, "grad_norm": 13.616643905639648, "learning_rate": 9.969697238374355e-06, "loss": 0.5023, "step": 406 }, { "epoch": 0.044848484848484846, "grad_norm": 7.1386003494262695, "learning_rate": 9.969504730617248e-06, "loss": 0.4734, "step": 407 }, { "epoch": 0.04495867768595041, "grad_norm": 10.026013374328613, "learning_rate": 9.969311615182353e-06, "loss": 0.4859, "step": 408 }, { "epoch": 0.04506887052341598, "grad_norm": 15.475004196166992, "learning_rate": 9.969117892093283e-06, "loss": 0.588, "step": 409 }, { "epoch": 0.045179063360881545, "grad_norm": 15.79782772064209, "learning_rate": 9.968923561373728e-06, "loss": 0.633, "step": 410 }, { "epoch": 0.04528925619834711, "grad_norm": 22.108070373535156, "learning_rate": 9.96872862304745e-06, "loss": 0.5475, "step": 411 }, { "epoch": 0.045399449035812674, "grad_norm": 8.265995979309082, "learning_rate": 9.968533077138287e-06, "loss": 0.4666, "step": 412 }, { "epoch": 0.04550964187327824, "grad_norm": 7.072658061981201, "learning_rate": 9.96833692367015e-06, "loss": 0.4725, "step": 413 }, { "epoch": 0.0456198347107438, "grad_norm": 10.56517219543457, "learning_rate": 9.968140162667024e-06, "loss": 0.4867, "step": 414 }, { "epoch": 0.045730027548209366, "grad_norm": 16.286352157592773, "learning_rate": 9.967942794152972e-06, "loss": 0.4781, "step": 415 }, { "epoch": 0.04584022038567493, "grad_norm": 8.922318458557129, "learning_rate": 9.967744818152125e-06, "loss": 0.4878, "step": 416 }, { "epoch": 0.045950413223140495, "grad_norm": 6.937612056732178, "learning_rate": 9.967546234688694e-06, "loss": 0.5643, "step": 417 }, { "epoch": 0.04606060606060606, "grad_norm": 12.612809181213379, "learning_rate": 9.967347043786964e-06, "loss": 0.5331, "step": 418 }, { "epoch": 0.04617079889807162, "grad_norm": 7.663732051849365, "learning_rate": 9.967147245471287e-06, "loss": 0.4905, "step": 419 }, { "epoch": 0.04628099173553719, "grad_norm": 10.40149974822998, "learning_rate": 9.9669468397661e-06, "loss": 0.4748, "step": 420 }, { "epoch": 0.04639118457300275, "grad_norm": 8.454051971435547, "learning_rate": 9.966745826695905e-06, "loss": 0.3773, "step": 421 }, { "epoch": 0.04650137741046832, "grad_norm": 7.636160373687744, "learning_rate": 9.966544206285285e-06, "loss": 0.4961, "step": 422 }, { "epoch": 0.04661157024793389, "grad_norm": 6.887092113494873, "learning_rate": 9.96634197855889e-06, "loss": 0.509, "step": 423 }, { "epoch": 0.04672176308539945, "grad_norm": 9.089897155761719, "learning_rate": 9.966139143541455e-06, "loss": 0.4543, "step": 424 }, { "epoch": 0.046831955922865015, "grad_norm": 11.715934753417969, "learning_rate": 9.965935701257779e-06, "loss": 0.5041, "step": 425 }, { "epoch": 0.04694214876033058, "grad_norm": 12.82231330871582, "learning_rate": 9.96573165173274e-06, "loss": 0.5387, "step": 426 }, { "epoch": 0.04705234159779614, "grad_norm": 6.379404067993164, "learning_rate": 9.965526994991288e-06, "loss": 0.5053, "step": 427 }, { "epoch": 0.04716253443526171, "grad_norm": 11.586793899536133, "learning_rate": 9.96532173105845e-06, "loss": 0.4907, "step": 428 }, { "epoch": 0.04727272727272727, "grad_norm": 5.423346996307373, "learning_rate": 9.965115859959327e-06, "loss": 0.4862, "step": 429 }, { "epoch": 0.047382920110192836, "grad_norm": 5.6065754890441895, "learning_rate": 9.964909381719092e-06, "loss": 0.5168, "step": 430 }, { "epoch": 0.0474931129476584, "grad_norm": 11.98960018157959, "learning_rate": 9.964702296362995e-06, "loss": 0.4249, "step": 431 }, { "epoch": 0.047603305785123964, "grad_norm": 19.07823944091797, "learning_rate": 9.964494603916356e-06, "loss": 0.5719, "step": 432 }, { "epoch": 0.04771349862258953, "grad_norm": 8.011237144470215, "learning_rate": 9.964286304404573e-06, "loss": 0.4781, "step": 433 }, { "epoch": 0.0478236914600551, "grad_norm": 11.941851615905762, "learning_rate": 9.964077397853117e-06, "loss": 0.5121, "step": 434 }, { "epoch": 0.047933884297520664, "grad_norm": 12.690646171569824, "learning_rate": 9.963867884287534e-06, "loss": 0.341, "step": 435 }, { "epoch": 0.04804407713498623, "grad_norm": 9.887231826782227, "learning_rate": 9.963657763733445e-06, "loss": 0.4575, "step": 436 }, { "epoch": 0.04815426997245179, "grad_norm": 25.173494338989258, "learning_rate": 9.96344703621654e-06, "loss": 0.5522, "step": 437 }, { "epoch": 0.048264462809917356, "grad_norm": 9.900611877441406, "learning_rate": 9.963235701762591e-06, "loss": 0.4944, "step": 438 }, { "epoch": 0.04837465564738292, "grad_norm": 9.656048774719238, "learning_rate": 9.963023760397437e-06, "loss": 0.5256, "step": 439 }, { "epoch": 0.048484848484848485, "grad_norm": 18.26109504699707, "learning_rate": 9.962811212146997e-06, "loss": 0.5189, "step": 440 }, { "epoch": 0.04859504132231405, "grad_norm": 9.127241134643555, "learning_rate": 9.96259805703726e-06, "loss": 0.529, "step": 441 }, { "epoch": 0.04870523415977961, "grad_norm": 9.09994125366211, "learning_rate": 9.962384295094293e-06, "loss": 0.4587, "step": 442 }, { "epoch": 0.04881542699724518, "grad_norm": 6.427917003631592, "learning_rate": 9.962169926344231e-06, "loss": 0.4969, "step": 443 }, { "epoch": 0.04892561983471074, "grad_norm": 12.126689910888672, "learning_rate": 9.961954950813292e-06, "loss": 0.6163, "step": 444 }, { "epoch": 0.049035812672176306, "grad_norm": 13.614275932312012, "learning_rate": 9.96173936852776e-06, "loss": 0.4255, "step": 445 }, { "epoch": 0.04914600550964187, "grad_norm": 10.977042198181152, "learning_rate": 9.961523179514e-06, "loss": 0.4867, "step": 446 }, { "epoch": 0.04925619834710744, "grad_norm": 7.973937034606934, "learning_rate": 9.961306383798445e-06, "loss": 0.4506, "step": 447 }, { "epoch": 0.049366391184573005, "grad_norm": 12.840161323547363, "learning_rate": 9.961088981407607e-06, "loss": 0.6008, "step": 448 }, { "epoch": 0.04947658402203857, "grad_norm": 7.462771415710449, "learning_rate": 9.960870972368068e-06, "loss": 0.5071, "step": 449 }, { "epoch": 0.049586776859504134, "grad_norm": 8.614627838134766, "learning_rate": 9.960652356706489e-06, "loss": 0.5687, "step": 450 }, { "epoch": 0.0496969696969697, "grad_norm": 8.726750373840332, "learning_rate": 9.960433134449601e-06, "loss": 0.4525, "step": 451 }, { "epoch": 0.04980716253443526, "grad_norm": 11.282340049743652, "learning_rate": 9.960213305624211e-06, "loss": 0.5434, "step": 452 }, { "epoch": 0.049917355371900826, "grad_norm": 18.417638778686523, "learning_rate": 9.9599928702572e-06, "loss": 0.5903, "step": 453 }, { "epoch": 0.05002754820936639, "grad_norm": 11.008544921875, "learning_rate": 9.959771828375523e-06, "loss": 0.4515, "step": 454 }, { "epoch": 0.050137741046831955, "grad_norm": 7.547202110290527, "learning_rate": 9.95955018000621e-06, "loss": 0.3885, "step": 455 }, { "epoch": 0.05024793388429752, "grad_norm": 7.816680431365967, "learning_rate": 9.959327925176365e-06, "loss": 0.4768, "step": 456 }, { "epoch": 0.05035812672176308, "grad_norm": 10.285447120666504, "learning_rate": 9.959105063913164e-06, "loss": 0.501, "step": 457 }, { "epoch": 0.05046831955922865, "grad_norm": 9.738030433654785, "learning_rate": 9.95888159624386e-06, "loss": 0.5915, "step": 458 }, { "epoch": 0.05057851239669422, "grad_norm": 8.554464340209961, "learning_rate": 9.958657522195779e-06, "loss": 0.4127, "step": 459 }, { "epoch": 0.05068870523415978, "grad_norm": 13.582873344421387, "learning_rate": 9.958432841796319e-06, "loss": 0.4419, "step": 460 }, { "epoch": 0.05079889807162535, "grad_norm": 10.61713695526123, "learning_rate": 9.958207555072957e-06, "loss": 0.5171, "step": 461 }, { "epoch": 0.05090909090909091, "grad_norm": 14.762274742126465, "learning_rate": 9.957981662053239e-06, "loss": 0.4755, "step": 462 }, { "epoch": 0.051019283746556475, "grad_norm": 10.927807807922363, "learning_rate": 9.957755162764789e-06, "loss": 0.3426, "step": 463 }, { "epoch": 0.05112947658402204, "grad_norm": 9.77183723449707, "learning_rate": 9.957528057235301e-06, "loss": 0.476, "step": 464 }, { "epoch": 0.0512396694214876, "grad_norm": 9.146940231323242, "learning_rate": 9.95730034549255e-06, "loss": 0.5547, "step": 465 }, { "epoch": 0.05134986225895317, "grad_norm": 9.617725372314453, "learning_rate": 9.95707202756438e-06, "loss": 0.4616, "step": 466 }, { "epoch": 0.05146005509641873, "grad_norm": 7.482451915740967, "learning_rate": 9.956843103478709e-06, "loss": 0.399, "step": 467 }, { "epoch": 0.051570247933884296, "grad_norm": 12.296363830566406, "learning_rate": 9.95661357326353e-06, "loss": 0.4369, "step": 468 }, { "epoch": 0.05168044077134986, "grad_norm": 9.191376686096191, "learning_rate": 9.956383436946908e-06, "loss": 0.4178, "step": 469 }, { "epoch": 0.051790633608815424, "grad_norm": 24.099687576293945, "learning_rate": 9.956152694556988e-06, "loss": 0.5529, "step": 470 }, { "epoch": 0.05190082644628099, "grad_norm": 10.05209732055664, "learning_rate": 9.955921346121985e-06, "loss": 0.4258, "step": 471 }, { "epoch": 0.05201101928374656, "grad_norm": 10.024127006530762, "learning_rate": 9.955689391670188e-06, "loss": 0.4693, "step": 472 }, { "epoch": 0.052121212121212124, "grad_norm": 12.266153335571289, "learning_rate": 9.95545683122996e-06, "loss": 0.5465, "step": 473 }, { "epoch": 0.05223140495867769, "grad_norm": 6.339637279510498, "learning_rate": 9.955223664829739e-06, "loss": 0.4511, "step": 474 }, { "epoch": 0.05234159779614325, "grad_norm": 7.533390522003174, "learning_rate": 9.954989892498037e-06, "loss": 0.3883, "step": 475 }, { "epoch": 0.052451790633608816, "grad_norm": 9.525851249694824, "learning_rate": 9.954755514263442e-06, "loss": 0.4514, "step": 476 }, { "epoch": 0.05256198347107438, "grad_norm": 9.640487670898438, "learning_rate": 9.95452053015461e-06, "loss": 0.5306, "step": 477 }, { "epoch": 0.052672176308539945, "grad_norm": 11.502488136291504, "learning_rate": 9.95428494020028e-06, "loss": 0.5595, "step": 478 }, { "epoch": 0.05278236914600551, "grad_norm": 13.235862731933594, "learning_rate": 9.954048744429256e-06, "loss": 0.5212, "step": 479 }, { "epoch": 0.05289256198347107, "grad_norm": 11.967812538146973, "learning_rate": 9.953811942870422e-06, "loss": 0.6275, "step": 480 }, { "epoch": 0.05300275482093664, "grad_norm": 11.516127586364746, "learning_rate": 9.953574535552735e-06, "loss": 0.4844, "step": 481 }, { "epoch": 0.0531129476584022, "grad_norm": 7.050947666168213, "learning_rate": 9.953336522505227e-06, "loss": 0.4414, "step": 482 }, { "epoch": 0.053223140495867766, "grad_norm": 10.11103630065918, "learning_rate": 9.953097903756997e-06, "loss": 0.4532, "step": 483 }, { "epoch": 0.05333333333333334, "grad_norm": 9.065771102905273, "learning_rate": 9.95285867933723e-06, "loss": 0.4776, "step": 484 }, { "epoch": 0.0534435261707989, "grad_norm": 12.761256217956543, "learning_rate": 9.952618849275173e-06, "loss": 0.4895, "step": 485 }, { "epoch": 0.053553719008264465, "grad_norm": 9.490694999694824, "learning_rate": 9.952378413600159e-06, "loss": 0.4583, "step": 486 }, { "epoch": 0.05366391184573003, "grad_norm": 6.050409317016602, "learning_rate": 9.952137372341584e-06, "loss": 0.5119, "step": 487 }, { "epoch": 0.053774104683195594, "grad_norm": 9.085648536682129, "learning_rate": 9.951895725528924e-06, "loss": 0.4401, "step": 488 }, { "epoch": 0.05388429752066116, "grad_norm": 10.04773235321045, "learning_rate": 9.951653473191727e-06, "loss": 0.4676, "step": 489 }, { "epoch": 0.05399449035812672, "grad_norm": 14.638328552246094, "learning_rate": 9.951410615359619e-06, "loss": 0.5522, "step": 490 }, { "epoch": 0.054104683195592286, "grad_norm": 12.053609848022461, "learning_rate": 9.951167152062296e-06, "loss": 0.5225, "step": 491 }, { "epoch": 0.05421487603305785, "grad_norm": 12.283226013183594, "learning_rate": 9.950923083329525e-06, "loss": 0.4905, "step": 492 }, { "epoch": 0.054325068870523414, "grad_norm": 9.185400009155273, "learning_rate": 9.950678409191157e-06, "loss": 0.4756, "step": 493 }, { "epoch": 0.05443526170798898, "grad_norm": 7.565023899078369, "learning_rate": 9.950433129677106e-06, "loss": 0.5202, "step": 494 }, { "epoch": 0.05454545454545454, "grad_norm": 11.195167541503906, "learning_rate": 9.950187244817368e-06, "loss": 0.4532, "step": 495 }, { "epoch": 0.05465564738292011, "grad_norm": 10.009805679321289, "learning_rate": 9.94994075464201e-06, "loss": 0.509, "step": 496 }, { "epoch": 0.05476584022038568, "grad_norm": 11.064774513244629, "learning_rate": 9.949693659181175e-06, "loss": 0.534, "step": 497 }, { "epoch": 0.05487603305785124, "grad_norm": 9.216227531433105, "learning_rate": 9.949445958465074e-06, "loss": 0.4937, "step": 498 }, { "epoch": 0.054986225895316806, "grad_norm": 9.245732307434082, "learning_rate": 9.949197652523996e-06, "loss": 0.5174, "step": 499 }, { "epoch": 0.05509641873278237, "grad_norm": 15.904149055480957, "learning_rate": 9.94894874138831e-06, "loss": 0.5527, "step": 500 }, { "epoch": 0.055206611570247935, "grad_norm": 8.78382682800293, "learning_rate": 9.948699225088446e-06, "loss": 0.5693, "step": 501 }, { "epoch": 0.0553168044077135, "grad_norm": 9.095039367675781, "learning_rate": 9.94844910365492e-06, "loss": 0.5161, "step": 502 }, { "epoch": 0.05542699724517906, "grad_norm": 29.34706687927246, "learning_rate": 9.948198377118316e-06, "loss": 0.7604, "step": 503 }, { "epoch": 0.05553719008264463, "grad_norm": 11.551289558410645, "learning_rate": 9.947947045509292e-06, "loss": 0.518, "step": 504 }, { "epoch": 0.05564738292011019, "grad_norm": 12.79025936126709, "learning_rate": 9.947695108858583e-06, "loss": 0.5482, "step": 505 }, { "epoch": 0.055757575757575756, "grad_norm": 5.822357654571533, "learning_rate": 9.947442567196996e-06, "loss": 0.4895, "step": 506 }, { "epoch": 0.05586776859504132, "grad_norm": 7.41774320602417, "learning_rate": 9.94718942055541e-06, "loss": 0.4525, "step": 507 }, { "epoch": 0.055977961432506884, "grad_norm": 6.24639368057251, "learning_rate": 9.946935668964784e-06, "loss": 0.579, "step": 508 }, { "epoch": 0.05608815426997245, "grad_norm": 8.718338012695312, "learning_rate": 9.946681312456142e-06, "loss": 0.5288, "step": 509 }, { "epoch": 0.05619834710743802, "grad_norm": 8.52311897277832, "learning_rate": 9.946426351060589e-06, "loss": 0.4902, "step": 510 }, { "epoch": 0.056308539944903584, "grad_norm": 5.036555767059326, "learning_rate": 9.946170784809307e-06, "loss": 0.4719, "step": 511 }, { "epoch": 0.05641873278236915, "grad_norm": 3.8602375984191895, "learning_rate": 9.945914613733538e-06, "loss": 0.481, "step": 512 }, { "epoch": 0.05652892561983471, "grad_norm": 12.633068084716797, "learning_rate": 9.945657837864615e-06, "loss": 0.5185, "step": 513 }, { "epoch": 0.056639118457300276, "grad_norm": 9.873920440673828, "learning_rate": 9.945400457233931e-06, "loss": 0.5364, "step": 514 }, { "epoch": 0.05674931129476584, "grad_norm": 9.537657737731934, "learning_rate": 9.945142471872963e-06, "loss": 0.519, "step": 515 }, { "epoch": 0.056859504132231405, "grad_norm": 10.282795906066895, "learning_rate": 9.944883881813257e-06, "loss": 0.4402, "step": 516 }, { "epoch": 0.05696969696969697, "grad_norm": 12.182283401489258, "learning_rate": 9.94462468708643e-06, "loss": 0.437, "step": 517 }, { "epoch": 0.05707988980716253, "grad_norm": 8.637264251708984, "learning_rate": 9.944364887724182e-06, "loss": 0.4978, "step": 518 }, { "epoch": 0.0571900826446281, "grad_norm": 11.6417236328125, "learning_rate": 9.94410448375828e-06, "loss": 0.5038, "step": 519 }, { "epoch": 0.05730027548209366, "grad_norm": 9.156976699829102, "learning_rate": 9.943843475220565e-06, "loss": 0.4275, "step": 520 }, { "epoch": 0.057410468319559226, "grad_norm": 11.194124221801758, "learning_rate": 9.943581862142953e-06, "loss": 0.504, "step": 521 }, { "epoch": 0.0575206611570248, "grad_norm": 13.460176467895508, "learning_rate": 9.943319644557436e-06, "loss": 0.5085, "step": 522 }, { "epoch": 0.05763085399449036, "grad_norm": 13.589484214782715, "learning_rate": 9.94305682249608e-06, "loss": 0.4618, "step": 523 }, { "epoch": 0.057741046831955925, "grad_norm": 11.005972862243652, "learning_rate": 9.94279339599102e-06, "loss": 0.5412, "step": 524 }, { "epoch": 0.05785123966942149, "grad_norm": 12.504033088684082, "learning_rate": 9.94252936507447e-06, "loss": 0.4531, "step": 525 }, { "epoch": 0.05796143250688705, "grad_norm": 8.159317016601562, "learning_rate": 9.942264729778713e-06, "loss": 0.5107, "step": 526 }, { "epoch": 0.05807162534435262, "grad_norm": 11.655434608459473, "learning_rate": 9.941999490136114e-06, "loss": 0.5816, "step": 527 }, { "epoch": 0.05818181818181818, "grad_norm": 12.202296257019043, "learning_rate": 9.941733646179103e-06, "loss": 0.537, "step": 528 }, { "epoch": 0.058292011019283746, "grad_norm": 15.81233024597168, "learning_rate": 9.94146719794019e-06, "loss": 0.6009, "step": 529 }, { "epoch": 0.05840220385674931, "grad_norm": 17.76038932800293, "learning_rate": 9.941200145451955e-06, "loss": 0.5005, "step": 530 }, { "epoch": 0.058512396694214874, "grad_norm": 14.642497062683105, "learning_rate": 9.940932488747054e-06, "loss": 0.4706, "step": 531 }, { "epoch": 0.05862258953168044, "grad_norm": 24.03742027282715, "learning_rate": 9.940664227858218e-06, "loss": 0.6553, "step": 532 }, { "epoch": 0.058732782369146, "grad_norm": 13.785540580749512, "learning_rate": 9.940395362818249e-06, "loss": 0.5425, "step": 533 }, { "epoch": 0.05884297520661157, "grad_norm": 13.390655517578125, "learning_rate": 9.940125893660022e-06, "loss": 0.5043, "step": 534 }, { "epoch": 0.05895316804407714, "grad_norm": 8.240135192871094, "learning_rate": 9.939855820416492e-06, "loss": 0.5053, "step": 535 }, { "epoch": 0.0590633608815427, "grad_norm": 6.785595417022705, "learning_rate": 9.939585143120683e-06, "loss": 0.5584, "step": 536 }, { "epoch": 0.059173553719008266, "grad_norm": 9.69683837890625, "learning_rate": 9.93931386180569e-06, "loss": 0.4352, "step": 537 }, { "epoch": 0.05928374655647383, "grad_norm": 7.655026912689209, "learning_rate": 9.939041976504691e-06, "loss": 0.4534, "step": 538 }, { "epoch": 0.059393939393939395, "grad_norm": 5.39308500289917, "learning_rate": 9.938769487250928e-06, "loss": 0.4584, "step": 539 }, { "epoch": 0.05950413223140496, "grad_norm": 9.690875053405762, "learning_rate": 9.938496394077725e-06, "loss": 0.4614, "step": 540 }, { "epoch": 0.05961432506887052, "grad_norm": 17.464853286743164, "learning_rate": 9.938222697018475e-06, "loss": 0.4885, "step": 541 }, { "epoch": 0.05972451790633609, "grad_norm": 11.740934371948242, "learning_rate": 9.937948396106645e-06, "loss": 0.5212, "step": 542 }, { "epoch": 0.05983471074380165, "grad_norm": 12.96544361114502, "learning_rate": 9.937673491375777e-06, "loss": 0.4148, "step": 543 }, { "epoch": 0.059944903581267216, "grad_norm": 11.744980812072754, "learning_rate": 9.937397982859489e-06, "loss": 0.5195, "step": 544 }, { "epoch": 0.06005509641873278, "grad_norm": 11.595945358276367, "learning_rate": 9.937121870591469e-06, "loss": 0.4163, "step": 545 }, { "epoch": 0.060165289256198344, "grad_norm": 10.950522422790527, "learning_rate": 9.936845154605477e-06, "loss": 0.4574, "step": 546 }, { "epoch": 0.060275482093663915, "grad_norm": 8.607457160949707, "learning_rate": 9.936567834935355e-06, "loss": 0.3833, "step": 547 }, { "epoch": 0.06038567493112948, "grad_norm": 10.207286834716797, "learning_rate": 9.936289911615015e-06, "loss": 0.4857, "step": 548 }, { "epoch": 0.060495867768595044, "grad_norm": 11.0805025100708, "learning_rate": 9.936011384678437e-06, "loss": 0.4441, "step": 549 }, { "epoch": 0.06060606060606061, "grad_norm": 6.993634223937988, "learning_rate": 9.935732254159683e-06, "loss": 0.4395, "step": 550 }, { "epoch": 0.06071625344352617, "grad_norm": 9.192682266235352, "learning_rate": 9.935452520092884e-06, "loss": 0.5288, "step": 551 }, { "epoch": 0.060826446280991736, "grad_norm": 5.134696006774902, "learning_rate": 9.935172182512245e-06, "loss": 0.5139, "step": 552 }, { "epoch": 0.0609366391184573, "grad_norm": 8.106890678405762, "learning_rate": 9.93489124145205e-06, "loss": 0.4753, "step": 553 }, { "epoch": 0.061046831955922864, "grad_norm": 9.831403732299805, "learning_rate": 9.934609696946648e-06, "loss": 0.4797, "step": 554 }, { "epoch": 0.06115702479338843, "grad_norm": 16.136892318725586, "learning_rate": 9.934327549030471e-06, "loss": 0.5622, "step": 555 }, { "epoch": 0.06126721763085399, "grad_norm": 8.392415046691895, "learning_rate": 9.93404479773802e-06, "loss": 0.4825, "step": 556 }, { "epoch": 0.06137741046831956, "grad_norm": 10.084752082824707, "learning_rate": 9.933761443103868e-06, "loss": 0.4658, "step": 557 }, { "epoch": 0.06148760330578512, "grad_norm": 11.000602722167969, "learning_rate": 9.933477485162664e-06, "loss": 0.5345, "step": 558 }, { "epoch": 0.061597796143250685, "grad_norm": 9.608716011047363, "learning_rate": 9.933192923949132e-06, "loss": 0.5332, "step": 559 }, { "epoch": 0.06170798898071626, "grad_norm": 12.702254295349121, "learning_rate": 9.932907759498069e-06, "loss": 0.4959, "step": 560 }, { "epoch": 0.06181818181818182, "grad_norm": 13.748756408691406, "learning_rate": 9.932621991844342e-06, "loss": 0.5641, "step": 561 }, { "epoch": 0.061928374655647385, "grad_norm": 16.632835388183594, "learning_rate": 9.9323356210229e-06, "loss": 0.578, "step": 562 }, { "epoch": 0.06203856749311295, "grad_norm": 11.966531753540039, "learning_rate": 9.932048647068759e-06, "loss": 0.4808, "step": 563 }, { "epoch": 0.06214876033057851, "grad_norm": 7.218803405761719, "learning_rate": 9.931761070017008e-06, "loss": 0.4511, "step": 564 }, { "epoch": 0.06225895316804408, "grad_norm": 7.731607913970947, "learning_rate": 9.931472889902814e-06, "loss": 0.5155, "step": 565 }, { "epoch": 0.06236914600550964, "grad_norm": 10.343323707580566, "learning_rate": 9.931184106761419e-06, "loss": 0.5582, "step": 566 }, { "epoch": 0.062479338842975206, "grad_norm": 7.1000895500183105, "learning_rate": 9.930894720628129e-06, "loss": 0.506, "step": 567 }, { "epoch": 0.06258953168044078, "grad_norm": 9.204298973083496, "learning_rate": 9.930604731538337e-06, "loss": 0.4962, "step": 568 }, { "epoch": 0.06269972451790634, "grad_norm": 7.463597297668457, "learning_rate": 9.930314139527501e-06, "loss": 0.5024, "step": 569 }, { "epoch": 0.0628099173553719, "grad_norm": 11.228713989257812, "learning_rate": 9.930022944631155e-06, "loss": 0.4776, "step": 570 }, { "epoch": 0.06292011019283747, "grad_norm": 8.012906074523926, "learning_rate": 9.929731146884904e-06, "loss": 0.4753, "step": 571 }, { "epoch": 0.06303030303030303, "grad_norm": 8.836130142211914, "learning_rate": 9.929438746324436e-06, "loss": 0.4902, "step": 572 }, { "epoch": 0.0631404958677686, "grad_norm": 12.457344055175781, "learning_rate": 9.929145742985498e-06, "loss": 0.5347, "step": 573 }, { "epoch": 0.06325068870523416, "grad_norm": 11.039780616760254, "learning_rate": 9.928852136903926e-06, "loss": 0.4673, "step": 574 }, { "epoch": 0.06336088154269973, "grad_norm": 14.96688175201416, "learning_rate": 9.928557928115619e-06, "loss": 0.5673, "step": 575 }, { "epoch": 0.06347107438016529, "grad_norm": 35.09209060668945, "learning_rate": 9.928263116656554e-06, "loss": 0.5101, "step": 576 }, { "epoch": 0.06358126721763085, "grad_norm": 13.416524887084961, "learning_rate": 9.92796770256278e-06, "loss": 0.5174, "step": 577 }, { "epoch": 0.06369146005509642, "grad_norm": 7.597424030303955, "learning_rate": 9.92767168587042e-06, "loss": 0.507, "step": 578 }, { "epoch": 0.06380165289256198, "grad_norm": 9.764474868774414, "learning_rate": 9.927375066615674e-06, "loss": 0.5125, "step": 579 }, { "epoch": 0.06391184573002755, "grad_norm": 11.765339851379395, "learning_rate": 9.927077844834811e-06, "loss": 0.4751, "step": 580 }, { "epoch": 0.06402203856749311, "grad_norm": 8.978880882263184, "learning_rate": 9.926780020564178e-06, "loss": 0.4576, "step": 581 }, { "epoch": 0.06413223140495868, "grad_norm": 8.072242736816406, "learning_rate": 9.92648159384019e-06, "loss": 0.556, "step": 582 }, { "epoch": 0.06424242424242424, "grad_norm": 12.509285926818848, "learning_rate": 9.926182564699343e-06, "loss": 0.4711, "step": 583 }, { "epoch": 0.0643526170798898, "grad_norm": 12.04463005065918, "learning_rate": 9.925882933178199e-06, "loss": 0.4865, "step": 584 }, { "epoch": 0.06446280991735537, "grad_norm": 10.25395393371582, "learning_rate": 9.925582699313397e-06, "loss": 0.5415, "step": 585 }, { "epoch": 0.06457300275482093, "grad_norm": 6.9483442306518555, "learning_rate": 9.925281863141653e-06, "loss": 0.4399, "step": 586 }, { "epoch": 0.0646831955922865, "grad_norm": 9.354958534240723, "learning_rate": 9.924980424699754e-06, "loss": 0.4824, "step": 587 }, { "epoch": 0.06479338842975206, "grad_norm": 8.679173469543457, "learning_rate": 9.924678384024557e-06, "loss": 0.5696, "step": 588 }, { "epoch": 0.06490358126721762, "grad_norm": 7.74788761138916, "learning_rate": 9.924375741152998e-06, "loss": 0.3965, "step": 589 }, { "epoch": 0.06501377410468319, "grad_norm": 7.50650691986084, "learning_rate": 9.924072496122085e-06, "loss": 0.4672, "step": 590 }, { "epoch": 0.06512396694214877, "grad_norm": 15.767810821533203, "learning_rate": 9.9237686489689e-06, "loss": 0.5774, "step": 591 }, { "epoch": 0.06523415977961433, "grad_norm": 10.386406898498535, "learning_rate": 9.923464199730593e-06, "loss": 0.4665, "step": 592 }, { "epoch": 0.0653443526170799, "grad_norm": 12.454659461975098, "learning_rate": 9.923159148444397e-06, "loss": 0.4118, "step": 593 }, { "epoch": 0.06545454545454546, "grad_norm": 12.94180679321289, "learning_rate": 9.922853495147613e-06, "loss": 0.5064, "step": 594 }, { "epoch": 0.06556473829201102, "grad_norm": 10.078128814697266, "learning_rate": 9.922547239877617e-06, "loss": 0.4827, "step": 595 }, { "epoch": 0.06567493112947659, "grad_norm": 7.623451232910156, "learning_rate": 9.922240382671858e-06, "loss": 0.4234, "step": 596 }, { "epoch": 0.06578512396694215, "grad_norm": 12.806289672851562, "learning_rate": 9.921932923567858e-06, "loss": 0.5223, "step": 597 }, { "epoch": 0.06589531680440772, "grad_norm": 16.55751609802246, "learning_rate": 9.921624862603214e-06, "loss": 0.5422, "step": 598 }, { "epoch": 0.06600550964187328, "grad_norm": 10.228744506835938, "learning_rate": 9.921316199815597e-06, "loss": 0.4944, "step": 599 }, { "epoch": 0.06611570247933884, "grad_norm": 7.3604960441589355, "learning_rate": 9.92100693524275e-06, "loss": 0.4693, "step": 600 }, { "epoch": 0.06622589531680441, "grad_norm": 10.923355102539062, "learning_rate": 9.920697068922491e-06, "loss": 0.5234, "step": 601 }, { "epoch": 0.06633608815426997, "grad_norm": 9.79210376739502, "learning_rate": 9.92038660089271e-06, "loss": 0.4189, "step": 602 }, { "epoch": 0.06644628099173554, "grad_norm": 6.217829704284668, "learning_rate": 9.920075531191371e-06, "loss": 0.4696, "step": 603 }, { "epoch": 0.0665564738292011, "grad_norm": 9.0442533493042, "learning_rate": 9.919763859856514e-06, "loss": 0.5369, "step": 604 }, { "epoch": 0.06666666666666667, "grad_norm": 12.135570526123047, "learning_rate": 9.919451586926249e-06, "loss": 0.6227, "step": 605 }, { "epoch": 0.06677685950413223, "grad_norm": 14.060270309448242, "learning_rate": 9.91913871243876e-06, "loss": 0.5006, "step": 606 }, { "epoch": 0.0668870523415978, "grad_norm": 11.584596633911133, "learning_rate": 9.91882523643231e-06, "loss": 0.5498, "step": 607 }, { "epoch": 0.06699724517906336, "grad_norm": 8.220637321472168, "learning_rate": 9.918511158945226e-06, "loss": 0.4144, "step": 608 }, { "epoch": 0.06710743801652892, "grad_norm": 7.712873458862305, "learning_rate": 9.918196480015918e-06, "loss": 0.4731, "step": 609 }, { "epoch": 0.06721763085399449, "grad_norm": 10.395918846130371, "learning_rate": 9.917881199682864e-06, "loss": 0.5316, "step": 610 }, { "epoch": 0.06732782369146005, "grad_norm": 10.288063049316406, "learning_rate": 9.917565317984614e-06, "loss": 0.5693, "step": 611 }, { "epoch": 0.06743801652892562, "grad_norm": 6.308658599853516, "learning_rate": 9.917248834959799e-06, "loss": 0.488, "step": 612 }, { "epoch": 0.06754820936639118, "grad_norm": 7.876396179199219, "learning_rate": 9.916931750647118e-06, "loss": 0.4428, "step": 613 }, { "epoch": 0.06765840220385674, "grad_norm": 9.849668502807617, "learning_rate": 9.916614065085342e-06, "loss": 0.4908, "step": 614 }, { "epoch": 0.06776859504132231, "grad_norm": 9.0003662109375, "learning_rate": 9.91629577831332e-06, "loss": 0.4933, "step": 615 }, { "epoch": 0.06787878787878789, "grad_norm": 11.975523948669434, "learning_rate": 9.915976890369972e-06, "loss": 0.5003, "step": 616 }, { "epoch": 0.06798898071625345, "grad_norm": 7.651215553283691, "learning_rate": 9.915657401294291e-06, "loss": 0.5143, "step": 617 }, { "epoch": 0.06809917355371901, "grad_norm": 6.625883102416992, "learning_rate": 9.915337311125348e-06, "loss": 0.471, "step": 618 }, { "epoch": 0.06820936639118458, "grad_norm": 8.365671157836914, "learning_rate": 9.91501661990228e-06, "loss": 0.4172, "step": 619 }, { "epoch": 0.06831955922865014, "grad_norm": 10.616379737854004, "learning_rate": 9.914695327664306e-06, "loss": 0.5127, "step": 620 }, { "epoch": 0.0684297520661157, "grad_norm": 9.061238288879395, "learning_rate": 9.914373434450707e-06, "loss": 0.5785, "step": 621 }, { "epoch": 0.06853994490358127, "grad_norm": 7.105732440948486, "learning_rate": 9.914050940300852e-06, "loss": 0.3874, "step": 622 }, { "epoch": 0.06865013774104683, "grad_norm": 13.91950798034668, "learning_rate": 9.913727845254173e-06, "loss": 0.5533, "step": 623 }, { "epoch": 0.0687603305785124, "grad_norm": 8.040175437927246, "learning_rate": 9.913404149350177e-06, "loss": 0.4253, "step": 624 }, { "epoch": 0.06887052341597796, "grad_norm": 4.891575336456299, "learning_rate": 9.91307985262845e-06, "loss": 0.4509, "step": 625 }, { "epoch": 0.06898071625344353, "grad_norm": 5.809885025024414, "learning_rate": 9.912754955128641e-06, "loss": 0.4876, "step": 626 }, { "epoch": 0.06909090909090909, "grad_norm": 6.048772811889648, "learning_rate": 9.912429456890484e-06, "loss": 0.4972, "step": 627 }, { "epoch": 0.06920110192837466, "grad_norm": 10.386425018310547, "learning_rate": 9.912103357953782e-06, "loss": 0.5544, "step": 628 }, { "epoch": 0.06931129476584022, "grad_norm": 9.411628723144531, "learning_rate": 9.911776658358408e-06, "loss": 0.5733, "step": 629 }, { "epoch": 0.06942148760330578, "grad_norm": 10.796219825744629, "learning_rate": 9.911449358144311e-06, "loss": 0.4771, "step": 630 }, { "epoch": 0.06953168044077135, "grad_norm": 8.182342529296875, "learning_rate": 9.911121457351516e-06, "loss": 0.5043, "step": 631 }, { "epoch": 0.06964187327823691, "grad_norm": 17.62633514404297, "learning_rate": 9.910792956020119e-06, "loss": 0.6273, "step": 632 }, { "epoch": 0.06975206611570248, "grad_norm": 7.0770344734191895, "learning_rate": 9.910463854190287e-06, "loss": 0.4609, "step": 633 }, { "epoch": 0.06986225895316804, "grad_norm": 15.625011444091797, "learning_rate": 9.910134151902267e-06, "loss": 0.523, "step": 634 }, { "epoch": 0.0699724517906336, "grad_norm": 7.356435298919678, "learning_rate": 9.90980384919637e-06, "loss": 0.442, "step": 635 }, { "epoch": 0.07008264462809917, "grad_norm": 10.88286018371582, "learning_rate": 9.90947294611299e-06, "loss": 0.4141, "step": 636 }, { "epoch": 0.07019283746556473, "grad_norm": 9.952497482299805, "learning_rate": 9.909141442692592e-06, "loss": 0.4882, "step": 637 }, { "epoch": 0.0703030303030303, "grad_norm": 9.48213005065918, "learning_rate": 9.908809338975706e-06, "loss": 0.4936, "step": 638 }, { "epoch": 0.07041322314049586, "grad_norm": 18.07328987121582, "learning_rate": 9.908476635002948e-06, "loss": 0.5932, "step": 639 }, { "epoch": 0.07052341597796143, "grad_norm": 8.986546516418457, "learning_rate": 9.908143330815e-06, "loss": 0.4723, "step": 640 }, { "epoch": 0.07063360881542699, "grad_norm": 9.191007614135742, "learning_rate": 9.907809426452617e-06, "loss": 0.424, "step": 641 }, { "epoch": 0.07074380165289257, "grad_norm": 7.707602500915527, "learning_rate": 9.907474921956632e-06, "loss": 0.5195, "step": 642 }, { "epoch": 0.07085399449035813, "grad_norm": 8.087730407714844, "learning_rate": 9.907139817367948e-06, "loss": 0.5119, "step": 643 }, { "epoch": 0.0709641873278237, "grad_norm": 10.142461776733398, "learning_rate": 9.90680411272754e-06, "loss": 0.5339, "step": 644 }, { "epoch": 0.07107438016528926, "grad_norm": 8.02434253692627, "learning_rate": 9.906467808076461e-06, "loss": 0.4649, "step": 645 }, { "epoch": 0.07118457300275483, "grad_norm": 10.722000122070312, "learning_rate": 9.906130903455833e-06, "loss": 0.4916, "step": 646 }, { "epoch": 0.07129476584022039, "grad_norm": 9.784269332885742, "learning_rate": 9.905793398906853e-06, "loss": 0.4741, "step": 647 }, { "epoch": 0.07140495867768595, "grad_norm": 7.949434757232666, "learning_rate": 9.905455294470793e-06, "loss": 0.4754, "step": 648 }, { "epoch": 0.07151515151515152, "grad_norm": 14.611332893371582, "learning_rate": 9.905116590188996e-06, "loss": 0.565, "step": 649 }, { "epoch": 0.07162534435261708, "grad_norm": 13.625049591064453, "learning_rate": 9.90477728610288e-06, "loss": 0.5274, "step": 650 }, { "epoch": 0.07173553719008265, "grad_norm": 9.991351127624512, "learning_rate": 9.904437382253935e-06, "loss": 0.5799, "step": 651 }, { "epoch": 0.07184573002754821, "grad_norm": 9.200335502624512, "learning_rate": 9.904096878683724e-06, "loss": 0.5184, "step": 652 }, { "epoch": 0.07195592286501377, "grad_norm": 8.283576011657715, "learning_rate": 9.903755775433886e-06, "loss": 0.5627, "step": 653 }, { "epoch": 0.07206611570247934, "grad_norm": 8.832159996032715, "learning_rate": 9.90341407254613e-06, "loss": 0.4769, "step": 654 }, { "epoch": 0.0721763085399449, "grad_norm": 6.476308345794678, "learning_rate": 9.90307177006224e-06, "loss": 0.5206, "step": 655 }, { "epoch": 0.07228650137741047, "grad_norm": 8.467257499694824, "learning_rate": 9.902728868024075e-06, "loss": 0.391, "step": 656 }, { "epoch": 0.07239669421487603, "grad_norm": 6.937216758728027, "learning_rate": 9.902385366473564e-06, "loss": 0.479, "step": 657 }, { "epoch": 0.0725068870523416, "grad_norm": 7.917789936065674, "learning_rate": 9.90204126545271e-06, "loss": 0.5145, "step": 658 }, { "epoch": 0.07261707988980716, "grad_norm": 5.8603434562683105, "learning_rate": 9.901696565003593e-06, "loss": 0.5029, "step": 659 }, { "epoch": 0.07272727272727272, "grad_norm": 7.931520462036133, "learning_rate": 9.901351265168363e-06, "loss": 0.4938, "step": 660 }, { "epoch": 0.07283746556473829, "grad_norm": 10.546648979187012, "learning_rate": 9.901005365989241e-06, "loss": 0.4907, "step": 661 }, { "epoch": 0.07294765840220385, "grad_norm": 8.893937110900879, "learning_rate": 9.900658867508524e-06, "loss": 0.4804, "step": 662 }, { "epoch": 0.07305785123966942, "grad_norm": 7.418328762054443, "learning_rate": 9.900311769768585e-06, "loss": 0.4477, "step": 663 }, { "epoch": 0.07316804407713498, "grad_norm": 9.132783889770508, "learning_rate": 9.899964072811865e-06, "loss": 0.4554, "step": 664 }, { "epoch": 0.07327823691460054, "grad_norm": 15.406439781188965, "learning_rate": 9.899615776680885e-06, "loss": 0.4194, "step": 665 }, { "epoch": 0.07338842975206611, "grad_norm": 10.79430866241455, "learning_rate": 9.89926688141823e-06, "loss": 0.4427, "step": 666 }, { "epoch": 0.07349862258953169, "grad_norm": 7.434822082519531, "learning_rate": 9.898917387066566e-06, "loss": 0.4463, "step": 667 }, { "epoch": 0.07360881542699725, "grad_norm": 6.936038970947266, "learning_rate": 9.89856729366863e-06, "loss": 0.3917, "step": 668 }, { "epoch": 0.07371900826446282, "grad_norm": 7.734316349029541, "learning_rate": 9.898216601267232e-06, "loss": 0.468, "step": 669 }, { "epoch": 0.07382920110192838, "grad_norm": 12.051238059997559, "learning_rate": 9.897865309905254e-06, "loss": 0.4934, "step": 670 }, { "epoch": 0.07393939393939394, "grad_norm": 6.236438751220703, "learning_rate": 9.897513419625653e-06, "loss": 0.4516, "step": 671 }, { "epoch": 0.07404958677685951, "grad_norm": 8.817441940307617, "learning_rate": 9.897160930471457e-06, "loss": 0.4753, "step": 672 }, { "epoch": 0.07415977961432507, "grad_norm": 15.634927749633789, "learning_rate": 9.896807842485772e-06, "loss": 0.6153, "step": 673 }, { "epoch": 0.07426997245179064, "grad_norm": 12.055826187133789, "learning_rate": 9.896454155711771e-06, "loss": 0.6136, "step": 674 }, { "epoch": 0.0743801652892562, "grad_norm": 5.280544757843018, "learning_rate": 9.896099870192706e-06, "loss": 0.49, "step": 675 }, { "epoch": 0.07449035812672176, "grad_norm": 8.10863208770752, "learning_rate": 9.895744985971895e-06, "loss": 0.5093, "step": 676 }, { "epoch": 0.07460055096418733, "grad_norm": 5.737893104553223, "learning_rate": 9.89538950309274e-06, "loss": 0.5611, "step": 677 }, { "epoch": 0.07471074380165289, "grad_norm": 9.80198860168457, "learning_rate": 9.895033421598708e-06, "loss": 0.5501, "step": 678 }, { "epoch": 0.07482093663911846, "grad_norm": 12.883217811584473, "learning_rate": 9.894676741533337e-06, "loss": 0.5309, "step": 679 }, { "epoch": 0.07493112947658402, "grad_norm": 9.768318176269531, "learning_rate": 9.894319462940246e-06, "loss": 0.5148, "step": 680 }, { "epoch": 0.07504132231404959, "grad_norm": 6.506410598754883, "learning_rate": 9.893961585863124e-06, "loss": 0.4572, "step": 681 }, { "epoch": 0.07515151515151515, "grad_norm": 9.509140968322754, "learning_rate": 9.89360311034573e-06, "loss": 0.5298, "step": 682 }, { "epoch": 0.07526170798898071, "grad_norm": 4.768259048461914, "learning_rate": 9.893244036431901e-06, "loss": 0.4822, "step": 683 }, { "epoch": 0.07537190082644628, "grad_norm": 6.9128289222717285, "learning_rate": 9.892884364165545e-06, "loss": 0.4858, "step": 684 }, { "epoch": 0.07548209366391184, "grad_norm": 16.79252052307129, "learning_rate": 9.89252409359064e-06, "loss": 0.5622, "step": 685 }, { "epoch": 0.0755922865013774, "grad_norm": 10.07437801361084, "learning_rate": 9.892163224751245e-06, "loss": 0.4842, "step": 686 }, { "epoch": 0.07570247933884297, "grad_norm": 5.11728572845459, "learning_rate": 9.891801757691487e-06, "loss": 0.492, "step": 687 }, { "epoch": 0.07581267217630853, "grad_norm": 7.831786155700684, "learning_rate": 9.891439692455563e-06, "loss": 0.5006, "step": 688 }, { "epoch": 0.0759228650137741, "grad_norm": 12.940075874328613, "learning_rate": 9.89107702908775e-06, "loss": 0.4705, "step": 689 }, { "epoch": 0.07603305785123966, "grad_norm": 15.072134017944336, "learning_rate": 9.890713767632394e-06, "loss": 0.6851, "step": 690 }, { "epoch": 0.07614325068870523, "grad_norm": 8.534189224243164, "learning_rate": 9.890349908133914e-06, "loss": 0.4492, "step": 691 }, { "epoch": 0.0762534435261708, "grad_norm": 9.891030311584473, "learning_rate": 9.889985450636806e-06, "loss": 0.4826, "step": 692 }, { "epoch": 0.07636363636363637, "grad_norm": 18.696834564208984, "learning_rate": 9.889620395185635e-06, "loss": 0.4704, "step": 693 }, { "epoch": 0.07647382920110193, "grad_norm": 11.502964973449707, "learning_rate": 9.889254741825038e-06, "loss": 0.4521, "step": 694 }, { "epoch": 0.0765840220385675, "grad_norm": 6.212857246398926, "learning_rate": 9.888888490599731e-06, "loss": 0.5045, "step": 695 }, { "epoch": 0.07669421487603306, "grad_norm": 13.820978164672852, "learning_rate": 9.888521641554499e-06, "loss": 0.5168, "step": 696 }, { "epoch": 0.07680440771349863, "grad_norm": 10.593164443969727, "learning_rate": 9.888154194734198e-06, "loss": 0.4951, "step": 697 }, { "epoch": 0.07691460055096419, "grad_norm": 10.364981651306152, "learning_rate": 9.887786150183765e-06, "loss": 0.5434, "step": 698 }, { "epoch": 0.07702479338842975, "grad_norm": 15.26873779296875, "learning_rate": 9.8874175079482e-06, "loss": 0.5559, "step": 699 }, { "epoch": 0.07713498622589532, "grad_norm": 7.997010707855225, "learning_rate": 9.887048268072585e-06, "loss": 0.459, "step": 700 }, { "epoch": 0.07724517906336088, "grad_norm": 9.9561767578125, "learning_rate": 9.886678430602068e-06, "loss": 0.442, "step": 701 }, { "epoch": 0.07735537190082645, "grad_norm": 12.93139934539795, "learning_rate": 9.886307995581877e-06, "loss": 0.4559, "step": 702 }, { "epoch": 0.07746556473829201, "grad_norm": 8.539533615112305, "learning_rate": 9.885936963057303e-06, "loss": 0.487, "step": 703 }, { "epoch": 0.07757575757575758, "grad_norm": 5.245579242706299, "learning_rate": 9.885565333073723e-06, "loss": 0.4743, "step": 704 }, { "epoch": 0.07768595041322314, "grad_norm": 13.183752059936523, "learning_rate": 9.885193105676577e-06, "loss": 0.3537, "step": 705 }, { "epoch": 0.0777961432506887, "grad_norm": 8.59929370880127, "learning_rate": 9.884820280911383e-06, "loss": 0.4825, "step": 706 }, { "epoch": 0.07790633608815427, "grad_norm": 11.603909492492676, "learning_rate": 9.884446858823728e-06, "loss": 0.4862, "step": 707 }, { "epoch": 0.07801652892561983, "grad_norm": 11.469616889953613, "learning_rate": 9.884072839459278e-06, "loss": 0.4586, "step": 708 }, { "epoch": 0.0781267217630854, "grad_norm": 9.469440460205078, "learning_rate": 9.883698222863765e-06, "loss": 0.5705, "step": 709 }, { "epoch": 0.07823691460055096, "grad_norm": 7.1158647537231445, "learning_rate": 9.883323009083e-06, "loss": 0.3444, "step": 710 }, { "epoch": 0.07834710743801652, "grad_norm": 8.297913551330566, "learning_rate": 9.882947198162865e-06, "loss": 0.3462, "step": 711 }, { "epoch": 0.07845730027548209, "grad_norm": 13.624194145202637, "learning_rate": 9.882570790149313e-06, "loss": 0.5259, "step": 712 }, { "epoch": 0.07856749311294765, "grad_norm": 10.769474983215332, "learning_rate": 9.882193785088372e-06, "loss": 0.5351, "step": 713 }, { "epoch": 0.07867768595041322, "grad_norm": 11.514634132385254, "learning_rate": 9.881816183026145e-06, "loss": 0.4189, "step": 714 }, { "epoch": 0.07878787878787878, "grad_norm": 9.002857208251953, "learning_rate": 9.881437984008801e-06, "loss": 0.5667, "step": 715 }, { "epoch": 0.07889807162534435, "grad_norm": 10.22419548034668, "learning_rate": 9.881059188082592e-06, "loss": 0.4815, "step": 716 }, { "epoch": 0.07900826446280992, "grad_norm": 10.344335556030273, "learning_rate": 9.880679795293835e-06, "loss": 0.5151, "step": 717 }, { "epoch": 0.07911845730027549, "grad_norm": 8.122965812683105, "learning_rate": 9.880299805688922e-06, "loss": 0.514, "step": 718 }, { "epoch": 0.07922865013774105, "grad_norm": 8.082174301147461, "learning_rate": 9.87991921931432e-06, "loss": 0.4556, "step": 719 }, { "epoch": 0.07933884297520662, "grad_norm": 8.465988159179688, "learning_rate": 9.879538036216567e-06, "loss": 0.4653, "step": 720 }, { "epoch": 0.07944903581267218, "grad_norm": 12.201423645019531, "learning_rate": 9.879156256442276e-06, "loss": 0.5005, "step": 721 }, { "epoch": 0.07955922865013774, "grad_norm": 7.324477672576904, "learning_rate": 9.878773880038127e-06, "loss": 0.4988, "step": 722 }, { "epoch": 0.07966942148760331, "grad_norm": 10.313101768493652, "learning_rate": 9.878390907050882e-06, "loss": 0.5054, "step": 723 }, { "epoch": 0.07977961432506887, "grad_norm": 7.98211669921875, "learning_rate": 9.878007337527373e-06, "loss": 0.5633, "step": 724 }, { "epoch": 0.07988980716253444, "grad_norm": 7.980637073516846, "learning_rate": 9.877623171514498e-06, "loss": 0.4389, "step": 725 }, { "epoch": 0.08, "grad_norm": 6.355591773986816, "learning_rate": 9.877238409059237e-06, "loss": 0.4547, "step": 726 }, { "epoch": 0.08011019283746557, "grad_norm": 9.528438568115234, "learning_rate": 9.876853050208637e-06, "loss": 0.5235, "step": 727 }, { "epoch": 0.08022038567493113, "grad_norm": 9.217560768127441, "learning_rate": 9.876467095009823e-06, "loss": 0.4124, "step": 728 }, { "epoch": 0.0803305785123967, "grad_norm": 6.823374271392822, "learning_rate": 9.876080543509987e-06, "loss": 0.503, "step": 729 }, { "epoch": 0.08044077134986226, "grad_norm": 14.821892738342285, "learning_rate": 9.8756933957564e-06, "loss": 0.462, "step": 730 }, { "epoch": 0.08055096418732782, "grad_norm": 12.123024940490723, "learning_rate": 9.8753056517964e-06, "loss": 0.5251, "step": 731 }, { "epoch": 0.08066115702479339, "grad_norm": 8.066936492919922, "learning_rate": 9.874917311677405e-06, "loss": 0.507, "step": 732 }, { "epoch": 0.08077134986225895, "grad_norm": 9.832659721374512, "learning_rate": 9.874528375446898e-06, "loss": 0.4783, "step": 733 }, { "epoch": 0.08088154269972452, "grad_norm": 7.567358016967773, "learning_rate": 9.874138843152438e-06, "loss": 0.466, "step": 734 }, { "epoch": 0.08099173553719008, "grad_norm": 11.371232032775879, "learning_rate": 9.873748714841661e-06, "loss": 0.4871, "step": 735 }, { "epoch": 0.08110192837465564, "grad_norm": 10.023006439208984, "learning_rate": 9.873357990562272e-06, "loss": 0.4108, "step": 736 }, { "epoch": 0.08121212121212121, "grad_norm": 16.237262725830078, "learning_rate": 9.872966670362048e-06, "loss": 0.4999, "step": 737 }, { "epoch": 0.08132231404958677, "grad_norm": 10.57867431640625, "learning_rate": 9.872574754288838e-06, "loss": 0.4475, "step": 738 }, { "epoch": 0.08143250688705234, "grad_norm": 8.24145793914795, "learning_rate": 9.87218224239057e-06, "loss": 0.4632, "step": 739 }, { "epoch": 0.0815426997245179, "grad_norm": 9.584731101989746, "learning_rate": 9.87178913471524e-06, "loss": 0.5202, "step": 740 }, { "epoch": 0.08165289256198346, "grad_norm": 9.388596534729004, "learning_rate": 9.871395431310915e-06, "loss": 0.4376, "step": 741 }, { "epoch": 0.08176308539944904, "grad_norm": 10.384413719177246, "learning_rate": 9.87100113222574e-06, "loss": 0.512, "step": 742 }, { "epoch": 0.0818732782369146, "grad_norm": 13.565887451171875, "learning_rate": 9.87060623750793e-06, "loss": 0.5288, "step": 743 }, { "epoch": 0.08198347107438017, "grad_norm": 8.631324768066406, "learning_rate": 9.870210747205772e-06, "loss": 0.452, "step": 744 }, { "epoch": 0.08209366391184574, "grad_norm": 12.596695899963379, "learning_rate": 9.869814661367631e-06, "loss": 0.4553, "step": 745 }, { "epoch": 0.0822038567493113, "grad_norm": 10.302438735961914, "learning_rate": 9.869417980041937e-06, "loss": 0.5333, "step": 746 }, { "epoch": 0.08231404958677686, "grad_norm": 12.104216575622559, "learning_rate": 9.869020703277197e-06, "loss": 0.4825, "step": 747 }, { "epoch": 0.08242424242424243, "grad_norm": 8.994564056396484, "learning_rate": 9.868622831121992e-06, "loss": 0.5453, "step": 748 }, { "epoch": 0.08253443526170799, "grad_norm": 8.422258377075195, "learning_rate": 9.868224363624975e-06, "loss": 0.506, "step": 749 }, { "epoch": 0.08264462809917356, "grad_norm": 7.729913711547852, "learning_rate": 9.867825300834868e-06, "loss": 0.4059, "step": 750 }, { "epoch": 0.08275482093663912, "grad_norm": 6.395445823669434, "learning_rate": 9.867425642800473e-06, "loss": 0.4778, "step": 751 }, { "epoch": 0.08286501377410468, "grad_norm": 7.772154808044434, "learning_rate": 9.867025389570658e-06, "loss": 0.4095, "step": 752 }, { "epoch": 0.08297520661157025, "grad_norm": 7.181577205657959, "learning_rate": 9.866624541194367e-06, "loss": 0.5259, "step": 753 }, { "epoch": 0.08308539944903581, "grad_norm": 7.02896785736084, "learning_rate": 9.866223097720616e-06, "loss": 0.4161, "step": 754 }, { "epoch": 0.08319559228650138, "grad_norm": 5.862891674041748, "learning_rate": 9.865821059198494e-06, "loss": 0.4926, "step": 755 }, { "epoch": 0.08330578512396694, "grad_norm": 9.70177936553955, "learning_rate": 9.865418425677165e-06, "loss": 0.4415, "step": 756 }, { "epoch": 0.0834159779614325, "grad_norm": 9.973360061645508, "learning_rate": 9.86501519720586e-06, "loss": 0.4388, "step": 757 }, { "epoch": 0.08352617079889807, "grad_norm": 12.362749099731445, "learning_rate": 9.86461137383389e-06, "loss": 0.5289, "step": 758 }, { "epoch": 0.08363636363636363, "grad_norm": 12.198518753051758, "learning_rate": 9.864206955610632e-06, "loss": 0.4405, "step": 759 }, { "epoch": 0.0837465564738292, "grad_norm": 8.491933822631836, "learning_rate": 9.86380194258554e-06, "loss": 0.4767, "step": 760 }, { "epoch": 0.08385674931129476, "grad_norm": 13.432280540466309, "learning_rate": 9.863396334808141e-06, "loss": 0.5168, "step": 761 }, { "epoch": 0.08396694214876033, "grad_norm": 7.089749813079834, "learning_rate": 9.862990132328032e-06, "loss": 0.4916, "step": 762 }, { "epoch": 0.08407713498622589, "grad_norm": 15.24958610534668, "learning_rate": 9.862583335194882e-06, "loss": 0.5959, "step": 763 }, { "epoch": 0.08418732782369145, "grad_norm": 9.171639442443848, "learning_rate": 9.862175943458438e-06, "loss": 0.4483, "step": 764 }, { "epoch": 0.08429752066115702, "grad_norm": 13.582493782043457, "learning_rate": 9.861767957168514e-06, "loss": 0.5253, "step": 765 }, { "epoch": 0.08440771349862258, "grad_norm": 9.245279312133789, "learning_rate": 9.861359376375002e-06, "loss": 0.5328, "step": 766 }, { "epoch": 0.08451790633608816, "grad_norm": 12.219204902648926, "learning_rate": 9.86095020112786e-06, "loss": 0.5749, "step": 767 }, { "epoch": 0.08462809917355373, "grad_norm": 11.779691696166992, "learning_rate": 9.860540431477126e-06, "loss": 0.5183, "step": 768 }, { "epoch": 0.08473829201101929, "grad_norm": 5.6311187744140625, "learning_rate": 9.860130067472904e-06, "loss": 0.4032, "step": 769 }, { "epoch": 0.08484848484848485, "grad_norm": 6.74190616607666, "learning_rate": 9.859719109165376e-06, "loss": 0.5194, "step": 770 }, { "epoch": 0.08495867768595042, "grad_norm": 6.302004814147949, "learning_rate": 9.859307556604794e-06, "loss": 0.5004, "step": 771 }, { "epoch": 0.08506887052341598, "grad_norm": 5.843480110168457, "learning_rate": 9.858895409841485e-06, "loss": 0.4876, "step": 772 }, { "epoch": 0.08517906336088155, "grad_norm": 7.050701141357422, "learning_rate": 9.858482668925843e-06, "loss": 0.4691, "step": 773 }, { "epoch": 0.08528925619834711, "grad_norm": 11.953496932983398, "learning_rate": 9.858069333908341e-06, "loss": 0.4997, "step": 774 }, { "epoch": 0.08539944903581267, "grad_norm": 8.992799758911133, "learning_rate": 9.857655404839522e-06, "loss": 0.4915, "step": 775 }, { "epoch": 0.08550964187327824, "grad_norm": 7.738897323608398, "learning_rate": 9.857240881770003e-06, "loss": 0.4843, "step": 776 }, { "epoch": 0.0856198347107438, "grad_norm": 10.122507095336914, "learning_rate": 9.856825764750468e-06, "loss": 0.5273, "step": 777 }, { "epoch": 0.08573002754820937, "grad_norm": 9.262826919555664, "learning_rate": 9.856410053831685e-06, "loss": 0.5056, "step": 778 }, { "epoch": 0.08584022038567493, "grad_norm": 6.276966571807861, "learning_rate": 9.85599374906448e-06, "loss": 0.4164, "step": 779 }, { "epoch": 0.0859504132231405, "grad_norm": 5.466647148132324, "learning_rate": 9.855576850499767e-06, "loss": 0.376, "step": 780 }, { "epoch": 0.08606060606060606, "grad_norm": 7.285747051239014, "learning_rate": 9.855159358188517e-06, "loss": 0.4791, "step": 781 }, { "epoch": 0.08617079889807162, "grad_norm": 6.792855739593506, "learning_rate": 9.854741272181789e-06, "loss": 0.5095, "step": 782 }, { "epoch": 0.08628099173553719, "grad_norm": 6.635336875915527, "learning_rate": 9.854322592530702e-06, "loss": 0.5065, "step": 783 }, { "epoch": 0.08639118457300275, "grad_norm": 7.093437194824219, "learning_rate": 9.853903319286456e-06, "loss": 0.3857, "step": 784 }, { "epoch": 0.08650137741046832, "grad_norm": 7.978708267211914, "learning_rate": 9.853483452500316e-06, "loss": 0.4088, "step": 785 }, { "epoch": 0.08661157024793388, "grad_norm": 9.05033016204834, "learning_rate": 9.853062992223629e-06, "loss": 0.4697, "step": 786 }, { "epoch": 0.08672176308539944, "grad_norm": 9.65661334991455, "learning_rate": 9.852641938507806e-06, "loss": 0.4212, "step": 787 }, { "epoch": 0.08683195592286501, "grad_norm": 8.787781715393066, "learning_rate": 9.852220291404335e-06, "loss": 0.4903, "step": 788 }, { "epoch": 0.08694214876033057, "grad_norm": 10.779501914978027, "learning_rate": 9.851798050964775e-06, "loss": 0.5753, "step": 789 }, { "epoch": 0.08705234159779614, "grad_norm": 8.177962303161621, "learning_rate": 9.851375217240761e-06, "loss": 0.4619, "step": 790 }, { "epoch": 0.0871625344352617, "grad_norm": 6.387087345123291, "learning_rate": 9.850951790283993e-06, "loss": 0.4211, "step": 791 }, { "epoch": 0.08727272727272728, "grad_norm": 8.48974323272705, "learning_rate": 9.850527770146253e-06, "loss": 0.511, "step": 792 }, { "epoch": 0.08738292011019284, "grad_norm": 7.000217437744141, "learning_rate": 9.850103156879386e-06, "loss": 0.4904, "step": 793 }, { "epoch": 0.08749311294765841, "grad_norm": 9.408488273620605, "learning_rate": 9.849677950535319e-06, "loss": 0.5271, "step": 794 }, { "epoch": 0.08760330578512397, "grad_norm": 5.422463417053223, "learning_rate": 9.849252151166044e-06, "loss": 0.3922, "step": 795 }, { "epoch": 0.08771349862258954, "grad_norm": 5.552674770355225, "learning_rate": 9.848825758823629e-06, "loss": 0.4377, "step": 796 }, { "epoch": 0.0878236914600551, "grad_norm": 6.646964073181152, "learning_rate": 9.848398773560213e-06, "loss": 0.4523, "step": 797 }, { "epoch": 0.08793388429752066, "grad_norm": 9.012452125549316, "learning_rate": 9.84797119542801e-06, "loss": 0.4846, "step": 798 }, { "epoch": 0.08804407713498623, "grad_norm": 11.543864250183105, "learning_rate": 9.847543024479304e-06, "loss": 0.5324, "step": 799 }, { "epoch": 0.0881542699724518, "grad_norm": 9.367711067199707, "learning_rate": 9.847114260766451e-06, "loss": 0.3494, "step": 800 }, { "epoch": 0.08826446280991736, "grad_norm": 9.736997604370117, "learning_rate": 9.846684904341883e-06, "loss": 0.5678, "step": 801 }, { "epoch": 0.08837465564738292, "grad_norm": 12.38846206665039, "learning_rate": 9.846254955258101e-06, "loss": 0.4822, "step": 802 }, { "epoch": 0.08848484848484849, "grad_norm": 13.669387817382812, "learning_rate": 9.845824413567679e-06, "loss": 0.4647, "step": 803 }, { "epoch": 0.08859504132231405, "grad_norm": 13.912272453308105, "learning_rate": 9.845393279323268e-06, "loss": 0.4578, "step": 804 }, { "epoch": 0.08870523415977961, "grad_norm": 6.685051918029785, "learning_rate": 9.844961552577583e-06, "loss": 0.503, "step": 805 }, { "epoch": 0.08881542699724518, "grad_norm": 9.136698722839355, "learning_rate": 9.844529233383418e-06, "loss": 0.5278, "step": 806 }, { "epoch": 0.08892561983471074, "grad_norm": 7.765626430511475, "learning_rate": 9.844096321793638e-06, "loss": 0.4546, "step": 807 }, { "epoch": 0.0890358126721763, "grad_norm": 7.0093092918396, "learning_rate": 9.84366281786118e-06, "loss": 0.4084, "step": 808 }, { "epoch": 0.08914600550964187, "grad_norm": 11.89829158782959, "learning_rate": 9.843228721639053e-06, "loss": 0.4895, "step": 809 }, { "epoch": 0.08925619834710743, "grad_norm": 10.255476951599121, "learning_rate": 9.842794033180339e-06, "loss": 0.4462, "step": 810 }, { "epoch": 0.089366391184573, "grad_norm": 9.761069297790527, "learning_rate": 9.842358752538193e-06, "loss": 0.4419, "step": 811 }, { "epoch": 0.08947658402203856, "grad_norm": 9.967449188232422, "learning_rate": 9.84192287976584e-06, "loss": 0.5406, "step": 812 }, { "epoch": 0.08958677685950413, "grad_norm": 8.725493431091309, "learning_rate": 9.841486414916581e-06, "loss": 0.4606, "step": 813 }, { "epoch": 0.08969696969696969, "grad_norm": 8.53592300415039, "learning_rate": 9.841049358043787e-06, "loss": 0.451, "step": 814 }, { "epoch": 0.08980716253443526, "grad_norm": 11.516461372375488, "learning_rate": 9.8406117092009e-06, "loss": 0.5207, "step": 815 }, { "epoch": 0.08991735537190082, "grad_norm": 6.130514621734619, "learning_rate": 9.840173468441438e-06, "loss": 0.401, "step": 816 }, { "epoch": 0.09002754820936638, "grad_norm": 28.716140747070312, "learning_rate": 9.83973463581899e-06, "loss": 0.628, "step": 817 }, { "epoch": 0.09013774104683196, "grad_norm": 4.71995210647583, "learning_rate": 9.839295211387218e-06, "loss": 0.4665, "step": 818 }, { "epoch": 0.09024793388429753, "grad_norm": 12.816455841064453, "learning_rate": 9.838855195199852e-06, "loss": 0.3792, "step": 819 }, { "epoch": 0.09035812672176309, "grad_norm": 6.196542739868164, "learning_rate": 9.838414587310701e-06, "loss": 0.4371, "step": 820 }, { "epoch": 0.09046831955922865, "grad_norm": 9.20396900177002, "learning_rate": 9.837973387773642e-06, "loss": 0.4938, "step": 821 }, { "epoch": 0.09057851239669422, "grad_norm": 11.53105354309082, "learning_rate": 9.837531596642624e-06, "loss": 0.5538, "step": 822 }, { "epoch": 0.09068870523415978, "grad_norm": 6.2707085609436035, "learning_rate": 9.837089213971674e-06, "loss": 0.3766, "step": 823 }, { "epoch": 0.09079889807162535, "grad_norm": 9.616811752319336, "learning_rate": 9.836646239814883e-06, "loss": 0.446, "step": 824 }, { "epoch": 0.09090909090909091, "grad_norm": 18.91120147705078, "learning_rate": 9.836202674226418e-06, "loss": 0.4863, "step": 825 }, { "epoch": 0.09101928374655648, "grad_norm": 12.505661964416504, "learning_rate": 9.835758517260522e-06, "loss": 0.5459, "step": 826 }, { "epoch": 0.09112947658402204, "grad_norm": 14.198311805725098, "learning_rate": 9.835313768971507e-06, "loss": 0.5098, "step": 827 }, { "epoch": 0.0912396694214876, "grad_norm": 6.708476543426514, "learning_rate": 9.834868429413753e-06, "loss": 0.467, "step": 828 }, { "epoch": 0.09134986225895317, "grad_norm": 9.290276527404785, "learning_rate": 9.834422498641722e-06, "loss": 0.4402, "step": 829 }, { "epoch": 0.09146005509641873, "grad_norm": 5.819812297821045, "learning_rate": 9.833975976709942e-06, "loss": 0.4294, "step": 830 }, { "epoch": 0.0915702479338843, "grad_norm": 5.357295513153076, "learning_rate": 9.833528863673013e-06, "loss": 0.4823, "step": 831 }, { "epoch": 0.09168044077134986, "grad_norm": 7.369269371032715, "learning_rate": 9.833081159585607e-06, "loss": 0.4368, "step": 832 }, { "epoch": 0.09179063360881543, "grad_norm": 12.383874893188477, "learning_rate": 9.832632864502472e-06, "loss": 0.5421, "step": 833 }, { "epoch": 0.09190082644628099, "grad_norm": 8.38817024230957, "learning_rate": 9.832183978478426e-06, "loss": 0.5052, "step": 834 }, { "epoch": 0.09201101928374655, "grad_norm": 5.2538065910339355, "learning_rate": 9.831734501568362e-06, "loss": 0.4917, "step": 835 }, { "epoch": 0.09212121212121212, "grad_norm": 9.768592834472656, "learning_rate": 9.831284433827238e-06, "loss": 0.5308, "step": 836 }, { "epoch": 0.09223140495867768, "grad_norm": 7.479040145874023, "learning_rate": 9.83083377531009e-06, "loss": 0.5141, "step": 837 }, { "epoch": 0.09234159779614325, "grad_norm": 11.684398651123047, "learning_rate": 9.830382526072027e-06, "loss": 0.496, "step": 838 }, { "epoch": 0.09245179063360881, "grad_norm": 5.878693103790283, "learning_rate": 9.829930686168225e-06, "loss": 0.4375, "step": 839 }, { "epoch": 0.09256198347107437, "grad_norm": 6.7616963386535645, "learning_rate": 9.82947825565394e-06, "loss": 0.3976, "step": 840 }, { "epoch": 0.09267217630853994, "grad_norm": 6.9319167137146, "learning_rate": 9.829025234584493e-06, "loss": 0.4774, "step": 841 }, { "epoch": 0.0927823691460055, "grad_norm": 7.360224723815918, "learning_rate": 9.828571623015282e-06, "loss": 0.4056, "step": 842 }, { "epoch": 0.09289256198347108, "grad_norm": 10.568782806396484, "learning_rate": 9.828117421001773e-06, "loss": 0.4599, "step": 843 }, { "epoch": 0.09300275482093665, "grad_norm": 13.017199516296387, "learning_rate": 9.827662628599507e-06, "loss": 0.5339, "step": 844 }, { "epoch": 0.09311294765840221, "grad_norm": 8.242898941040039, "learning_rate": 9.827207245864097e-06, "loss": 0.5639, "step": 845 }, { "epoch": 0.09322314049586777, "grad_norm": 10.402332305908203, "learning_rate": 9.826751272851228e-06, "loss": 0.5339, "step": 846 }, { "epoch": 0.09333333333333334, "grad_norm": 7.68806266784668, "learning_rate": 9.826294709616657e-06, "loss": 0.3979, "step": 847 }, { "epoch": 0.0934435261707989, "grad_norm": 5.082879066467285, "learning_rate": 9.825837556216214e-06, "loss": 0.4721, "step": 848 }, { "epoch": 0.09355371900826447, "grad_norm": 15.04353141784668, "learning_rate": 9.825379812705797e-06, "loss": 0.6114, "step": 849 }, { "epoch": 0.09366391184573003, "grad_norm": 7.5867462158203125, "learning_rate": 9.824921479141385e-06, "loss": 0.4747, "step": 850 }, { "epoch": 0.0937741046831956, "grad_norm": 5.760891437530518, "learning_rate": 9.824462555579019e-06, "loss": 0.4118, "step": 851 }, { "epoch": 0.09388429752066116, "grad_norm": 11.626730918884277, "learning_rate": 9.824003042074818e-06, "loss": 0.5749, "step": 852 }, { "epoch": 0.09399449035812672, "grad_norm": 11.019942283630371, "learning_rate": 9.823542938684972e-06, "loss": 0.4681, "step": 853 }, { "epoch": 0.09410468319559229, "grad_norm": 6.9120402336120605, "learning_rate": 9.823082245465743e-06, "loss": 0.4679, "step": 854 }, { "epoch": 0.09421487603305785, "grad_norm": 6.156190395355225, "learning_rate": 9.822620962473466e-06, "loss": 0.4294, "step": 855 }, { "epoch": 0.09432506887052342, "grad_norm": 11.259407997131348, "learning_rate": 9.822159089764549e-06, "loss": 0.5178, "step": 856 }, { "epoch": 0.09443526170798898, "grad_norm": 9.78303050994873, "learning_rate": 9.821696627395465e-06, "loss": 0.4396, "step": 857 }, { "epoch": 0.09454545454545454, "grad_norm": 11.318187713623047, "learning_rate": 9.82123357542277e-06, "loss": 0.442, "step": 858 }, { "epoch": 0.09465564738292011, "grad_norm": 8.33297061920166, "learning_rate": 9.820769933903082e-06, "loss": 0.4594, "step": 859 }, { "epoch": 0.09476584022038567, "grad_norm": 7.704953193664551, "learning_rate": 9.8203057028931e-06, "loss": 0.4941, "step": 860 }, { "epoch": 0.09487603305785124, "grad_norm": 7.631678581237793, "learning_rate": 9.81984088244959e-06, "loss": 0.4377, "step": 861 }, { "epoch": 0.0949862258953168, "grad_norm": 8.531377792358398, "learning_rate": 9.819375472629388e-06, "loss": 0.418, "step": 862 }, { "epoch": 0.09509641873278236, "grad_norm": 7.013405799865723, "learning_rate": 9.818909473489406e-06, "loss": 0.4727, "step": 863 }, { "epoch": 0.09520661157024793, "grad_norm": 9.04765796661377, "learning_rate": 9.81844288508663e-06, "loss": 0.4904, "step": 864 }, { "epoch": 0.09531680440771349, "grad_norm": 6.089433193206787, "learning_rate": 9.817975707478111e-06, "loss": 0.5192, "step": 865 }, { "epoch": 0.09542699724517906, "grad_norm": 10.373228073120117, "learning_rate": 9.817507940720978e-06, "loss": 0.5992, "step": 866 }, { "epoch": 0.09553719008264462, "grad_norm": 12.56816577911377, "learning_rate": 9.817039584872433e-06, "loss": 0.5209, "step": 867 }, { "epoch": 0.0956473829201102, "grad_norm": 7.873315811157227, "learning_rate": 9.81657063998974e-06, "loss": 0.472, "step": 868 }, { "epoch": 0.09575757575757576, "grad_norm": 8.665975570678711, "learning_rate": 9.816101106130249e-06, "loss": 0.4502, "step": 869 }, { "epoch": 0.09586776859504133, "grad_norm": 8.683157920837402, "learning_rate": 9.815630983351372e-06, "loss": 0.5341, "step": 870 }, { "epoch": 0.09597796143250689, "grad_norm": 5.375618934631348, "learning_rate": 9.815160271710596e-06, "loss": 0.4331, "step": 871 }, { "epoch": 0.09608815426997246, "grad_norm": 8.436710357666016, "learning_rate": 9.814688971265482e-06, "loss": 0.3948, "step": 872 }, { "epoch": 0.09619834710743802, "grad_norm": 11.841187477111816, "learning_rate": 9.814217082073662e-06, "loss": 0.6072, "step": 873 }, { "epoch": 0.09630853994490358, "grad_norm": 9.360101699829102, "learning_rate": 9.813744604192836e-06, "loss": 0.4533, "step": 874 }, { "epoch": 0.09641873278236915, "grad_norm": 8.671298027038574, "learning_rate": 9.81327153768078e-06, "loss": 0.4271, "step": 875 }, { "epoch": 0.09652892561983471, "grad_norm": 7.9733052253723145, "learning_rate": 9.812797882595345e-06, "loss": 0.5111, "step": 876 }, { "epoch": 0.09663911845730028, "grad_norm": 5.683726787567139, "learning_rate": 9.812323638994446e-06, "loss": 0.4318, "step": 877 }, { "epoch": 0.09674931129476584, "grad_norm": 13.997456550598145, "learning_rate": 9.811848806936076e-06, "loss": 0.513, "step": 878 }, { "epoch": 0.0968595041322314, "grad_norm": 10.461240768432617, "learning_rate": 9.811373386478296e-06, "loss": 0.4362, "step": 879 }, { "epoch": 0.09696969696969697, "grad_norm": 10.092257499694824, "learning_rate": 9.810897377679243e-06, "loss": 0.5318, "step": 880 }, { "epoch": 0.09707988980716253, "grad_norm": 10.155533790588379, "learning_rate": 9.810420780597126e-06, "loss": 0.5287, "step": 881 }, { "epoch": 0.0971900826446281, "grad_norm": 13.351264953613281, "learning_rate": 9.809943595290219e-06, "loss": 0.6615, "step": 882 }, { "epoch": 0.09730027548209366, "grad_norm": 10.560681343078613, "learning_rate": 9.809465821816877e-06, "loss": 0.4489, "step": 883 }, { "epoch": 0.09741046831955923, "grad_norm": 6.292139530181885, "learning_rate": 9.808987460235521e-06, "loss": 0.4243, "step": 884 }, { "epoch": 0.09752066115702479, "grad_norm": 23.3513126373291, "learning_rate": 9.808508510604647e-06, "loss": 0.569, "step": 885 }, { "epoch": 0.09763085399449035, "grad_norm": 8.537981033325195, "learning_rate": 9.808028972982818e-06, "loss": 0.4615, "step": 886 }, { "epoch": 0.09774104683195592, "grad_norm": 11.562088012695312, "learning_rate": 9.807548847428678e-06, "loss": 0.4932, "step": 887 }, { "epoch": 0.09785123966942148, "grad_norm": 9.631924629211426, "learning_rate": 9.807068134000933e-06, "loss": 0.4441, "step": 888 }, { "epoch": 0.09796143250688705, "grad_norm": 6.422639846801758, "learning_rate": 9.806586832758367e-06, "loss": 0.524, "step": 889 }, { "epoch": 0.09807162534435261, "grad_norm": 8.23627758026123, "learning_rate": 9.806104943759832e-06, "loss": 0.507, "step": 890 }, { "epoch": 0.09818181818181818, "grad_norm": 7.2150654792785645, "learning_rate": 9.80562246706426e-06, "loss": 0.4919, "step": 891 }, { "epoch": 0.09829201101928374, "grad_norm": 8.93393325805664, "learning_rate": 9.805139402730641e-06, "loss": 0.5218, "step": 892 }, { "epoch": 0.09840220385674932, "grad_norm": 14.821327209472656, "learning_rate": 9.804655750818051e-06, "loss": 0.5575, "step": 893 }, { "epoch": 0.09851239669421488, "grad_norm": 8.87337875366211, "learning_rate": 9.80417151138563e-06, "loss": 0.509, "step": 894 }, { "epoch": 0.09862258953168045, "grad_norm": 7.318422317504883, "learning_rate": 9.803686684492589e-06, "loss": 0.4545, "step": 895 }, { "epoch": 0.09873278236914601, "grad_norm": 10.265007019042969, "learning_rate": 9.803201270198215e-06, "loss": 0.5321, "step": 896 }, { "epoch": 0.09884297520661157, "grad_norm": 8.000826835632324, "learning_rate": 9.802715268561867e-06, "loss": 0.4072, "step": 897 }, { "epoch": 0.09895316804407714, "grad_norm": 9.18450927734375, "learning_rate": 9.802228679642971e-06, "loss": 0.4839, "step": 898 }, { "epoch": 0.0990633608815427, "grad_norm": 8.497360229492188, "learning_rate": 9.801741503501028e-06, "loss": 0.5484, "step": 899 }, { "epoch": 0.09917355371900827, "grad_norm": 14.225048065185547, "learning_rate": 9.801253740195613e-06, "loss": 0.551, "step": 900 }, { "epoch": 0.09928374655647383, "grad_norm": 6.575657844543457, "learning_rate": 9.800765389786368e-06, "loss": 0.4461, "step": 901 }, { "epoch": 0.0993939393939394, "grad_norm": 6.189450740814209, "learning_rate": 9.80027645233301e-06, "loss": 0.4668, "step": 902 }, { "epoch": 0.09950413223140496, "grad_norm": 6.756643295288086, "learning_rate": 9.799786927895328e-06, "loss": 0.5309, "step": 903 }, { "epoch": 0.09961432506887052, "grad_norm": 7.314714431762695, "learning_rate": 9.799296816533178e-06, "loss": 0.473, "step": 904 }, { "epoch": 0.09972451790633609, "grad_norm": 5.048830032348633, "learning_rate": 9.798806118306496e-06, "loss": 0.5075, "step": 905 }, { "epoch": 0.09983471074380165, "grad_norm": 10.5199613571167, "learning_rate": 9.798314833275281e-06, "loss": 0.6008, "step": 906 }, { "epoch": 0.09994490358126722, "grad_norm": 14.887784004211426, "learning_rate": 9.797822961499614e-06, "loss": 0.4552, "step": 907 }, { "epoch": 0.10005509641873278, "grad_norm": 7.37053918838501, "learning_rate": 9.797330503039636e-06, "loss": 0.4901, "step": 908 }, { "epoch": 0.10005509641873278, "eval_loss": 0.47280246019363403, "eval_runtime": 41.9516, "eval_samples_per_second": 17.496, "eval_steps_per_second": 2.193, "step": 908 }, { "epoch": 0.10016528925619834, "grad_norm": 7.933466911315918, "learning_rate": 9.796837457955568e-06, "loss": 0.4259, "step": 909 }, { "epoch": 0.10027548209366391, "grad_norm": 5.126168727874756, "learning_rate": 9.7963438263077e-06, "loss": 0.466, "step": 910 }, { "epoch": 0.10038567493112947, "grad_norm": 9.237092018127441, "learning_rate": 9.795849608156393e-06, "loss": 0.5476, "step": 911 }, { "epoch": 0.10049586776859504, "grad_norm": 9.323186874389648, "learning_rate": 9.79535480356208e-06, "loss": 0.4387, "step": 912 }, { "epoch": 0.1006060606060606, "grad_norm": 9.716706275939941, "learning_rate": 9.79485941258527e-06, "loss": 0.4289, "step": 913 }, { "epoch": 0.10071625344352617, "grad_norm": 12.264853477478027, "learning_rate": 9.794363435286538e-06, "loss": 0.4621, "step": 914 }, { "epoch": 0.10082644628099173, "grad_norm": 10.040712356567383, "learning_rate": 9.793866871726533e-06, "loss": 0.5254, "step": 915 }, { "epoch": 0.1009366391184573, "grad_norm": 8.736623764038086, "learning_rate": 9.793369721965973e-06, "loss": 0.4578, "step": 916 }, { "epoch": 0.10104683195592286, "grad_norm": 10.712364196777344, "learning_rate": 9.792871986065653e-06, "loss": 0.4672, "step": 917 }, { "epoch": 0.10115702479338844, "grad_norm": 6.687889099121094, "learning_rate": 9.792373664086437e-06, "loss": 0.4849, "step": 918 }, { "epoch": 0.101267217630854, "grad_norm": 9.594983100891113, "learning_rate": 9.791874756089258e-06, "loss": 0.5466, "step": 919 }, { "epoch": 0.10137741046831956, "grad_norm": 14.560150146484375, "learning_rate": 9.791375262135126e-06, "loss": 0.5258, "step": 920 }, { "epoch": 0.10148760330578513, "grad_norm": 9.519214630126953, "learning_rate": 9.790875182285119e-06, "loss": 0.4361, "step": 921 }, { "epoch": 0.1015977961432507, "grad_norm": 10.702827453613281, "learning_rate": 9.790374516600384e-06, "loss": 0.4343, "step": 922 }, { "epoch": 0.10170798898071626, "grad_norm": 13.46514892578125, "learning_rate": 9.78987326514215e-06, "loss": 0.465, "step": 923 }, { "epoch": 0.10181818181818182, "grad_norm": 9.053117752075195, "learning_rate": 9.789371427971703e-06, "loss": 0.4946, "step": 924 }, { "epoch": 0.10192837465564739, "grad_norm": 8.024469375610352, "learning_rate": 9.788869005150415e-06, "loss": 0.5081, "step": 925 }, { "epoch": 0.10203856749311295, "grad_norm": 8.866372108459473, "learning_rate": 9.788365996739719e-06, "loss": 0.4919, "step": 926 }, { "epoch": 0.10214876033057851, "grad_norm": 9.30793285369873, "learning_rate": 9.787862402801125e-06, "loss": 0.5046, "step": 927 }, { "epoch": 0.10225895316804408, "grad_norm": 7.904647350311279, "learning_rate": 9.787358223396211e-06, "loss": 0.5348, "step": 928 }, { "epoch": 0.10236914600550964, "grad_norm": 6.154333591461182, "learning_rate": 9.786853458586632e-06, "loss": 0.4739, "step": 929 }, { "epoch": 0.1024793388429752, "grad_norm": 6.14103889465332, "learning_rate": 9.78634810843411e-06, "loss": 0.496, "step": 930 }, { "epoch": 0.10258953168044077, "grad_norm": 6.138935089111328, "learning_rate": 9.785842173000439e-06, "loss": 0.4887, "step": 931 }, { "epoch": 0.10269972451790634, "grad_norm": 8.407147407531738, "learning_rate": 9.785335652347485e-06, "loss": 0.4524, "step": 932 }, { "epoch": 0.1028099173553719, "grad_norm": 7.617428302764893, "learning_rate": 9.784828546537189e-06, "loss": 0.4698, "step": 933 }, { "epoch": 0.10292011019283746, "grad_norm": 6.29920768737793, "learning_rate": 9.784320855631558e-06, "loss": 0.4895, "step": 934 }, { "epoch": 0.10303030303030303, "grad_norm": 9.820000648498535, "learning_rate": 9.783812579692675e-06, "loss": 0.3639, "step": 935 }, { "epoch": 0.10314049586776859, "grad_norm": 8.362419128417969, "learning_rate": 9.78330371878269e-06, "loss": 0.432, "step": 936 }, { "epoch": 0.10325068870523416, "grad_norm": 7.344079971313477, "learning_rate": 9.782794272963829e-06, "loss": 0.4791, "step": 937 }, { "epoch": 0.10336088154269972, "grad_norm": 12.409881591796875, "learning_rate": 9.782284242298388e-06, "loss": 0.5734, "step": 938 }, { "epoch": 0.10347107438016528, "grad_norm": 8.979248046875, "learning_rate": 9.781773626848735e-06, "loss": 0.4816, "step": 939 }, { "epoch": 0.10358126721763085, "grad_norm": 13.881377220153809, "learning_rate": 9.781262426677304e-06, "loss": 0.569, "step": 940 }, { "epoch": 0.10369146005509641, "grad_norm": 8.64345645904541, "learning_rate": 9.780750641846613e-06, "loss": 0.4471, "step": 941 }, { "epoch": 0.10380165289256198, "grad_norm": 9.803803443908691, "learning_rate": 9.780238272419237e-06, "loss": 0.5504, "step": 942 }, { "epoch": 0.10391184573002755, "grad_norm": 8.662821769714355, "learning_rate": 9.779725318457833e-06, "loss": 0.4825, "step": 943 }, { "epoch": 0.10402203856749312, "grad_norm": 6.113463878631592, "learning_rate": 9.779211780025122e-06, "loss": 0.5457, "step": 944 }, { "epoch": 0.10413223140495868, "grad_norm": 7.981666564941406, "learning_rate": 9.778697657183906e-06, "loss": 0.5118, "step": 945 }, { "epoch": 0.10424242424242425, "grad_norm": 5.331852436065674, "learning_rate": 9.778182949997047e-06, "loss": 0.4881, "step": 946 }, { "epoch": 0.10435261707988981, "grad_norm": 9.32343864440918, "learning_rate": 9.777667658527487e-06, "loss": 0.4887, "step": 947 }, { "epoch": 0.10446280991735538, "grad_norm": 9.610448837280273, "learning_rate": 9.777151782838236e-06, "loss": 0.4562, "step": 948 }, { "epoch": 0.10457300275482094, "grad_norm": 8.85430908203125, "learning_rate": 9.776635322992377e-06, "loss": 0.3829, "step": 949 }, { "epoch": 0.1046831955922865, "grad_norm": 9.623668670654297, "learning_rate": 9.77611827905306e-06, "loss": 0.4647, "step": 950 }, { "epoch": 0.10479338842975207, "grad_norm": 8.303234100341797, "learning_rate": 9.775600651083511e-06, "loss": 0.455, "step": 951 }, { "epoch": 0.10490358126721763, "grad_norm": 6.702611446380615, "learning_rate": 9.77508243914703e-06, "loss": 0.4579, "step": 952 }, { "epoch": 0.1050137741046832, "grad_norm": 8.47449016571045, "learning_rate": 9.774563643306982e-06, "loss": 0.469, "step": 953 }, { "epoch": 0.10512396694214876, "grad_norm": 9.198047637939453, "learning_rate": 9.774044263626804e-06, "loss": 0.4984, "step": 954 }, { "epoch": 0.10523415977961433, "grad_norm": 9.3027925491333, "learning_rate": 9.773524300170012e-06, "loss": 0.4517, "step": 955 }, { "epoch": 0.10534435261707989, "grad_norm": 10.216512680053711, "learning_rate": 9.773003753000184e-06, "loss": 0.4854, "step": 956 }, { "epoch": 0.10545454545454545, "grad_norm": 9.105542182922363, "learning_rate": 9.77248262218097e-06, "loss": 0.5126, "step": 957 }, { "epoch": 0.10556473829201102, "grad_norm": 6.106430530548096, "learning_rate": 9.771960907776102e-06, "loss": 0.437, "step": 958 }, { "epoch": 0.10567493112947658, "grad_norm": 17.164031982421875, "learning_rate": 9.771438609849368e-06, "loss": 0.4824, "step": 959 }, { "epoch": 0.10578512396694215, "grad_norm": 8.929496765136719, "learning_rate": 9.770915728464643e-06, "loss": 0.5196, "step": 960 }, { "epoch": 0.10589531680440771, "grad_norm": 6.128411769866943, "learning_rate": 9.770392263685861e-06, "loss": 0.424, "step": 961 }, { "epoch": 0.10600550964187327, "grad_norm": 8.186613082885742, "learning_rate": 9.769868215577033e-06, "loss": 0.5163, "step": 962 }, { "epoch": 0.10611570247933884, "grad_norm": 9.785344123840332, "learning_rate": 9.76934358420224e-06, "loss": 0.388, "step": 963 }, { "epoch": 0.1062258953168044, "grad_norm": 4.475244522094727, "learning_rate": 9.768818369625635e-06, "loss": 0.3925, "step": 964 }, { "epoch": 0.10633608815426997, "grad_norm": 7.028677463531494, "learning_rate": 9.768292571911443e-06, "loss": 0.4047, "step": 965 }, { "epoch": 0.10644628099173553, "grad_norm": 7.164196968078613, "learning_rate": 9.767766191123957e-06, "loss": 0.385, "step": 966 }, { "epoch": 0.1065564738292011, "grad_norm": 9.74470329284668, "learning_rate": 9.767239227327545e-06, "loss": 0.4755, "step": 967 }, { "epoch": 0.10666666666666667, "grad_norm": 8.157772064208984, "learning_rate": 9.766711680586644e-06, "loss": 0.3744, "step": 968 }, { "epoch": 0.10677685950413224, "grad_norm": 9.312668800354004, "learning_rate": 9.766183550965767e-06, "loss": 0.5155, "step": 969 }, { "epoch": 0.1068870523415978, "grad_norm": 14.372354507446289, "learning_rate": 9.765654838529488e-06, "loss": 0.5237, "step": 970 }, { "epoch": 0.10699724517906337, "grad_norm": 8.154521942138672, "learning_rate": 9.765125543342461e-06, "loss": 0.4389, "step": 971 }, { "epoch": 0.10710743801652893, "grad_norm": 8.324005126953125, "learning_rate": 9.764595665469413e-06, "loss": 0.4308, "step": 972 }, { "epoch": 0.1072176308539945, "grad_norm": 8.264619827270508, "learning_rate": 9.764065204975132e-06, "loss": 0.4676, "step": 973 }, { "epoch": 0.10732782369146006, "grad_norm": 14.091426849365234, "learning_rate": 9.763534161924489e-06, "loss": 0.4343, "step": 974 }, { "epoch": 0.10743801652892562, "grad_norm": 14.29140567779541, "learning_rate": 9.763002536382416e-06, "loss": 0.469, "step": 975 }, { "epoch": 0.10754820936639119, "grad_norm": 7.155638217926025, "learning_rate": 9.762470328413925e-06, "loss": 0.4953, "step": 976 }, { "epoch": 0.10765840220385675, "grad_norm": 11.622149467468262, "learning_rate": 9.761937538084092e-06, "loss": 0.4917, "step": 977 }, { "epoch": 0.10776859504132232, "grad_norm": 8.021838188171387, "learning_rate": 9.761404165458068e-06, "loss": 0.4889, "step": 978 }, { "epoch": 0.10787878787878788, "grad_norm": 13.566268920898438, "learning_rate": 9.760870210601074e-06, "loss": 0.6036, "step": 979 }, { "epoch": 0.10798898071625344, "grad_norm": 15.852530479431152, "learning_rate": 9.760335673578405e-06, "loss": 0.4561, "step": 980 }, { "epoch": 0.10809917355371901, "grad_norm": 7.8272294998168945, "learning_rate": 9.759800554455424e-06, "loss": 0.4706, "step": 981 }, { "epoch": 0.10820936639118457, "grad_norm": 7.1814398765563965, "learning_rate": 9.759264853297565e-06, "loss": 0.424, "step": 982 }, { "epoch": 0.10831955922865014, "grad_norm": 7.137000560760498, "learning_rate": 9.758728570170335e-06, "loss": 0.3955, "step": 983 }, { "epoch": 0.1084297520661157, "grad_norm": 11.43804931640625, "learning_rate": 9.75819170513931e-06, "loss": 0.5115, "step": 984 }, { "epoch": 0.10853994490358126, "grad_norm": 7.88161039352417, "learning_rate": 9.757654258270141e-06, "loss": 0.4613, "step": 985 }, { "epoch": 0.10865013774104683, "grad_norm": 6.659247875213623, "learning_rate": 9.757116229628547e-06, "loss": 0.486, "step": 986 }, { "epoch": 0.10876033057851239, "grad_norm": 12.036238670349121, "learning_rate": 9.756577619280319e-06, "loss": 0.5612, "step": 987 }, { "epoch": 0.10887052341597796, "grad_norm": 12.239153861999512, "learning_rate": 9.756038427291318e-06, "loss": 0.5577, "step": 988 }, { "epoch": 0.10898071625344352, "grad_norm": 8.325892448425293, "learning_rate": 9.755498653727477e-06, "loss": 0.551, "step": 989 }, { "epoch": 0.10909090909090909, "grad_norm": 8.936971664428711, "learning_rate": 9.754958298654802e-06, "loss": 0.4942, "step": 990 }, { "epoch": 0.10920110192837465, "grad_norm": 7.798034191131592, "learning_rate": 9.754417362139366e-06, "loss": 0.5114, "step": 991 }, { "epoch": 0.10931129476584021, "grad_norm": 8.612783432006836, "learning_rate": 9.753875844247318e-06, "loss": 0.4531, "step": 992 }, { "epoch": 0.10942148760330579, "grad_norm": 8.013750076293945, "learning_rate": 9.753333745044873e-06, "loss": 0.499, "step": 993 }, { "epoch": 0.10953168044077136, "grad_norm": 8.845736503601074, "learning_rate": 9.752791064598322e-06, "loss": 0.4827, "step": 994 }, { "epoch": 0.10964187327823692, "grad_norm": 9.831127166748047, "learning_rate": 9.752247802974023e-06, "loss": 0.5425, "step": 995 }, { "epoch": 0.10975206611570248, "grad_norm": 7.089080810546875, "learning_rate": 9.751703960238408e-06, "loss": 0.3942, "step": 996 }, { "epoch": 0.10986225895316805, "grad_norm": 8.943671226501465, "learning_rate": 9.751159536457977e-06, "loss": 0.4379, "step": 997 }, { "epoch": 0.10997245179063361, "grad_norm": 7.014477729797363, "learning_rate": 9.750614531699304e-06, "loss": 0.4104, "step": 998 }, { "epoch": 0.11008264462809918, "grad_norm": 6.420469284057617, "learning_rate": 9.750068946029034e-06, "loss": 0.4419, "step": 999 }, { "epoch": 0.11019283746556474, "grad_norm": 14.385880470275879, "learning_rate": 9.749522779513883e-06, "loss": 0.5737, "step": 1000 }, { "epoch": 0.1103030303030303, "grad_norm": 19.77135467529297, "learning_rate": 9.748976032220632e-06, "loss": 0.534, "step": 1001 }, { "epoch": 0.11041322314049587, "grad_norm": 9.207255363464355, "learning_rate": 9.748428704216141e-06, "loss": 0.5468, "step": 1002 }, { "epoch": 0.11052341597796143, "grad_norm": 10.451583862304688, "learning_rate": 9.747880795567338e-06, "loss": 0.4539, "step": 1003 }, { "epoch": 0.110633608815427, "grad_norm": 10.347034454345703, "learning_rate": 9.747332306341222e-06, "loss": 0.472, "step": 1004 }, { "epoch": 0.11074380165289256, "grad_norm": 8.932332992553711, "learning_rate": 9.746783236604864e-06, "loss": 0.4288, "step": 1005 }, { "epoch": 0.11085399449035813, "grad_norm": 8.662117958068848, "learning_rate": 9.746233586425404e-06, "loss": 0.507, "step": 1006 }, { "epoch": 0.11096418732782369, "grad_norm": 7.668501377105713, "learning_rate": 9.745683355870053e-06, "loss": 0.5003, "step": 1007 }, { "epoch": 0.11107438016528925, "grad_norm": 9.935391426086426, "learning_rate": 9.745132545006096e-06, "loss": 0.4623, "step": 1008 }, { "epoch": 0.11118457300275482, "grad_norm": 9.756633758544922, "learning_rate": 9.744581153900883e-06, "loss": 0.5415, "step": 1009 }, { "epoch": 0.11129476584022038, "grad_norm": 7.257644176483154, "learning_rate": 9.744029182621845e-06, "loss": 0.4688, "step": 1010 }, { "epoch": 0.11140495867768595, "grad_norm": 11.103131294250488, "learning_rate": 9.743476631236473e-06, "loss": 0.5904, "step": 1011 }, { "epoch": 0.11151515151515151, "grad_norm": 8.020723342895508, "learning_rate": 9.742923499812335e-06, "loss": 0.4852, "step": 1012 }, { "epoch": 0.11162534435261708, "grad_norm": 8.784106254577637, "learning_rate": 9.742369788417068e-06, "loss": 0.4, "step": 1013 }, { "epoch": 0.11173553719008264, "grad_norm": 8.087194442749023, "learning_rate": 9.741815497118383e-06, "loss": 0.4862, "step": 1014 }, { "epoch": 0.1118457300275482, "grad_norm": 9.28781509399414, "learning_rate": 9.741260625984057e-06, "loss": 0.5297, "step": 1015 }, { "epoch": 0.11195592286501377, "grad_norm": 7.464603424072266, "learning_rate": 9.74070517508194e-06, "loss": 0.5161, "step": 1016 }, { "epoch": 0.11206611570247933, "grad_norm": 6.450089454650879, "learning_rate": 9.740149144479957e-06, "loss": 0.4945, "step": 1017 }, { "epoch": 0.1121763085399449, "grad_norm": 7.314095973968506, "learning_rate": 9.739592534246098e-06, "loss": 0.4962, "step": 1018 }, { "epoch": 0.11228650137741047, "grad_norm": 9.615967750549316, "learning_rate": 9.739035344448425e-06, "loss": 0.4424, "step": 1019 }, { "epoch": 0.11239669421487604, "grad_norm": 5.225827693939209, "learning_rate": 9.738477575155072e-06, "loss": 0.4328, "step": 1020 }, { "epoch": 0.1125068870523416, "grad_norm": 8.105597496032715, "learning_rate": 9.737919226434245e-06, "loss": 0.4604, "step": 1021 }, { "epoch": 0.11261707988980717, "grad_norm": 6.439534664154053, "learning_rate": 9.73736029835422e-06, "loss": 0.4346, "step": 1022 }, { "epoch": 0.11272727272727273, "grad_norm": 8.215489387512207, "learning_rate": 9.73680079098334e-06, "loss": 0.4113, "step": 1023 }, { "epoch": 0.1128374655647383, "grad_norm": 9.294937133789062, "learning_rate": 9.736240704390027e-06, "loss": 0.5111, "step": 1024 }, { "epoch": 0.11294765840220386, "grad_norm": 6.806695461273193, "learning_rate": 9.735680038642767e-06, "loss": 0.4648, "step": 1025 }, { "epoch": 0.11305785123966942, "grad_norm": 7.230663776397705, "learning_rate": 9.735118793810118e-06, "loss": 0.4098, "step": 1026 }, { "epoch": 0.11316804407713499, "grad_norm": 8.778502464294434, "learning_rate": 9.734556969960712e-06, "loss": 0.4613, "step": 1027 }, { "epoch": 0.11327823691460055, "grad_norm": 8.630878448486328, "learning_rate": 9.733994567163248e-06, "loss": 0.4937, "step": 1028 }, { "epoch": 0.11338842975206612, "grad_norm": 7.268901348114014, "learning_rate": 9.733431585486499e-06, "loss": 0.4959, "step": 1029 }, { "epoch": 0.11349862258953168, "grad_norm": 7.849322319030762, "learning_rate": 9.732868024999305e-06, "loss": 0.4461, "step": 1030 }, { "epoch": 0.11360881542699725, "grad_norm": 7.055856227874756, "learning_rate": 9.73230388577058e-06, "loss": 0.4543, "step": 1031 }, { "epoch": 0.11371900826446281, "grad_norm": 6.915589332580566, "learning_rate": 9.731739167869308e-06, "loss": 0.555, "step": 1032 }, { "epoch": 0.11382920110192837, "grad_norm": 19.499521255493164, "learning_rate": 9.731173871364542e-06, "loss": 0.5882, "step": 1033 }, { "epoch": 0.11393939393939394, "grad_norm": 13.201210975646973, "learning_rate": 9.730607996325408e-06, "loss": 0.4885, "step": 1034 }, { "epoch": 0.1140495867768595, "grad_norm": 12.455816268920898, "learning_rate": 9.730041542821105e-06, "loss": 0.4669, "step": 1035 }, { "epoch": 0.11415977961432507, "grad_norm": 5.530920028686523, "learning_rate": 9.729474510920895e-06, "loss": 0.4181, "step": 1036 }, { "epoch": 0.11426997245179063, "grad_norm": 5.375738620758057, "learning_rate": 9.728906900694117e-06, "loss": 0.2722, "step": 1037 }, { "epoch": 0.1143801652892562, "grad_norm": 8.542257308959961, "learning_rate": 9.728338712210181e-06, "loss": 0.4797, "step": 1038 }, { "epoch": 0.11449035812672176, "grad_norm": 9.09620475769043, "learning_rate": 9.727769945538563e-06, "loss": 0.466, "step": 1039 }, { "epoch": 0.11460055096418732, "grad_norm": 6.134823322296143, "learning_rate": 9.727200600748815e-06, "loss": 0.4626, "step": 1040 }, { "epoch": 0.11471074380165289, "grad_norm": 6.514139175415039, "learning_rate": 9.726630677910556e-06, "loss": 0.4281, "step": 1041 }, { "epoch": 0.11482093663911845, "grad_norm": 7.309274673461914, "learning_rate": 9.726060177093477e-06, "loss": 0.4084, "step": 1042 }, { "epoch": 0.11493112947658402, "grad_norm": 10.32424259185791, "learning_rate": 9.72548909836734e-06, "loss": 0.3898, "step": 1043 }, { "epoch": 0.1150413223140496, "grad_norm": 11.615346908569336, "learning_rate": 9.724917441801977e-06, "loss": 0.393, "step": 1044 }, { "epoch": 0.11515151515151516, "grad_norm": 6.7931389808654785, "learning_rate": 9.724345207467292e-06, "loss": 0.4691, "step": 1045 }, { "epoch": 0.11526170798898072, "grad_norm": 11.140974044799805, "learning_rate": 9.723772395433257e-06, "loss": 0.4047, "step": 1046 }, { "epoch": 0.11537190082644629, "grad_norm": 6.690062522888184, "learning_rate": 9.723199005769917e-06, "loss": 0.4866, "step": 1047 }, { "epoch": 0.11548209366391185, "grad_norm": 9.408793449401855, "learning_rate": 9.722625038547386e-06, "loss": 0.4788, "step": 1048 }, { "epoch": 0.11559228650137741, "grad_norm": 8.867337226867676, "learning_rate": 9.722050493835852e-06, "loss": 0.4866, "step": 1049 }, { "epoch": 0.11570247933884298, "grad_norm": 6.5331597328186035, "learning_rate": 9.721475371705567e-06, "loss": 0.5087, "step": 1050 }, { "epoch": 0.11581267217630854, "grad_norm": 11.793696403503418, "learning_rate": 9.720899672226863e-06, "loss": 0.4775, "step": 1051 }, { "epoch": 0.1159228650137741, "grad_norm": 16.762985229492188, "learning_rate": 9.720323395470132e-06, "loss": 0.5354, "step": 1052 }, { "epoch": 0.11603305785123967, "grad_norm": 13.210665702819824, "learning_rate": 9.719746541505844e-06, "loss": 0.5214, "step": 1053 }, { "epoch": 0.11614325068870524, "grad_norm": 5.774608135223389, "learning_rate": 9.719169110404538e-06, "loss": 0.4265, "step": 1054 }, { "epoch": 0.1162534435261708, "grad_norm": 8.105388641357422, "learning_rate": 9.718591102236823e-06, "loss": 0.5145, "step": 1055 }, { "epoch": 0.11636363636363636, "grad_norm": 5.87872314453125, "learning_rate": 9.71801251707338e-06, "loss": 0.45, "step": 1056 }, { "epoch": 0.11647382920110193, "grad_norm": 5.883322715759277, "learning_rate": 9.717433354984957e-06, "loss": 0.4363, "step": 1057 }, { "epoch": 0.11658402203856749, "grad_norm": 9.888565063476562, "learning_rate": 9.716853616042375e-06, "loss": 0.4482, "step": 1058 }, { "epoch": 0.11669421487603306, "grad_norm": 10.839025497436523, "learning_rate": 9.716273300316526e-06, "loss": 0.5161, "step": 1059 }, { "epoch": 0.11680440771349862, "grad_norm": 5.918877124786377, "learning_rate": 9.71569240787837e-06, "loss": 0.428, "step": 1060 }, { "epoch": 0.11691460055096418, "grad_norm": 7.210424900054932, "learning_rate": 9.715110938798942e-06, "loss": 0.4684, "step": 1061 }, { "epoch": 0.11702479338842975, "grad_norm": 9.123066902160645, "learning_rate": 9.714528893149343e-06, "loss": 0.4826, "step": 1062 }, { "epoch": 0.11713498622589531, "grad_norm": 7.897624969482422, "learning_rate": 9.713946271000747e-06, "loss": 0.5176, "step": 1063 }, { "epoch": 0.11724517906336088, "grad_norm": 7.387955188751221, "learning_rate": 9.713363072424398e-06, "loss": 0.3383, "step": 1064 }, { "epoch": 0.11735537190082644, "grad_norm": 5.582972526550293, "learning_rate": 9.712779297491609e-06, "loss": 0.5117, "step": 1065 }, { "epoch": 0.117465564738292, "grad_norm": 9.218341827392578, "learning_rate": 9.712194946273767e-06, "loss": 0.4588, "step": 1066 }, { "epoch": 0.11757575757575757, "grad_norm": 8.894552230834961, "learning_rate": 9.711610018842325e-06, "loss": 0.4658, "step": 1067 }, { "epoch": 0.11768595041322313, "grad_norm": 6.712638854980469, "learning_rate": 9.71102451526881e-06, "loss": 0.4981, "step": 1068 }, { "epoch": 0.11779614325068871, "grad_norm": 5.404588222503662, "learning_rate": 9.710438435624818e-06, "loss": 0.443, "step": 1069 }, { "epoch": 0.11790633608815428, "grad_norm": 7.288143634796143, "learning_rate": 9.709851779982017e-06, "loss": 0.5119, "step": 1070 }, { "epoch": 0.11801652892561984, "grad_norm": 6.820754528045654, "learning_rate": 9.709264548412141e-06, "loss": 0.463, "step": 1071 }, { "epoch": 0.1181267217630854, "grad_norm": 10.298928260803223, "learning_rate": 9.708676740986999e-06, "loss": 0.5043, "step": 1072 }, { "epoch": 0.11823691460055097, "grad_norm": 9.05601978302002, "learning_rate": 9.708088357778472e-06, "loss": 0.5091, "step": 1073 }, { "epoch": 0.11834710743801653, "grad_norm": 10.932220458984375, "learning_rate": 9.707499398858501e-06, "loss": 0.4609, "step": 1074 }, { "epoch": 0.1184573002754821, "grad_norm": 7.302265167236328, "learning_rate": 9.706909864299112e-06, "loss": 0.4627, "step": 1075 }, { "epoch": 0.11856749311294766, "grad_norm": 6.128036022186279, "learning_rate": 9.70631975417239e-06, "loss": 0.4745, "step": 1076 }, { "epoch": 0.11867768595041323, "grad_norm": 8.51804256439209, "learning_rate": 9.705729068550495e-06, "loss": 0.504, "step": 1077 }, { "epoch": 0.11878787878787879, "grad_norm": 9.401533126831055, "learning_rate": 9.70513780750566e-06, "loss": 0.3415, "step": 1078 }, { "epoch": 0.11889807162534435, "grad_norm": 9.22064208984375, "learning_rate": 9.70454597111018e-06, "loss": 0.4953, "step": 1079 }, { "epoch": 0.11900826446280992, "grad_norm": 10.248266220092773, "learning_rate": 9.703953559436429e-06, "loss": 0.5212, "step": 1080 }, { "epoch": 0.11911845730027548, "grad_norm": 7.634253978729248, "learning_rate": 9.703360572556845e-06, "loss": 0.4165, "step": 1081 }, { "epoch": 0.11922865013774105, "grad_norm": 6.5556182861328125, "learning_rate": 9.702767010543945e-06, "loss": 0.4619, "step": 1082 }, { "epoch": 0.11933884297520661, "grad_norm": 8.908390998840332, "learning_rate": 9.702172873470304e-06, "loss": 0.5006, "step": 1083 }, { "epoch": 0.11944903581267217, "grad_norm": 8.97170639038086, "learning_rate": 9.701578161408578e-06, "loss": 0.511, "step": 1084 }, { "epoch": 0.11955922865013774, "grad_norm": 7.142294883728027, "learning_rate": 9.700982874431488e-06, "loss": 0.4188, "step": 1085 }, { "epoch": 0.1196694214876033, "grad_norm": 6.893550872802734, "learning_rate": 9.700387012611827e-06, "loss": 0.4255, "step": 1086 }, { "epoch": 0.11977961432506887, "grad_norm": 7.846024513244629, "learning_rate": 9.699790576022456e-06, "loss": 0.4637, "step": 1087 }, { "epoch": 0.11988980716253443, "grad_norm": 7.985848426818848, "learning_rate": 9.699193564736308e-06, "loss": 0.4641, "step": 1088 }, { "epoch": 0.12, "grad_norm": 12.337224006652832, "learning_rate": 9.69859597882639e-06, "loss": 0.5153, "step": 1089 }, { "epoch": 0.12011019283746556, "grad_norm": 7.637415885925293, "learning_rate": 9.697997818365774e-06, "loss": 0.4042, "step": 1090 }, { "epoch": 0.12022038567493112, "grad_norm": 7.815561771392822, "learning_rate": 9.697399083427602e-06, "loss": 0.5332, "step": 1091 }, { "epoch": 0.12033057851239669, "grad_norm": 10.246246337890625, "learning_rate": 9.69679977408509e-06, "loss": 0.4325, "step": 1092 }, { "epoch": 0.12044077134986225, "grad_norm": 6.883440017700195, "learning_rate": 9.69619989041152e-06, "loss": 0.4729, "step": 1093 }, { "epoch": 0.12055096418732783, "grad_norm": 8.58263874053955, "learning_rate": 9.695599432480249e-06, "loss": 0.4118, "step": 1094 }, { "epoch": 0.1206611570247934, "grad_norm": 9.461421966552734, "learning_rate": 9.6949984003647e-06, "loss": 0.4378, "step": 1095 }, { "epoch": 0.12077134986225896, "grad_norm": 10.19222354888916, "learning_rate": 9.694396794138373e-06, "loss": 0.4299, "step": 1096 }, { "epoch": 0.12088154269972452, "grad_norm": 9.905485153198242, "learning_rate": 9.693794613874825e-06, "loss": 0.4034, "step": 1097 }, { "epoch": 0.12099173553719009, "grad_norm": 7.480471134185791, "learning_rate": 9.693191859647696e-06, "loss": 0.4503, "step": 1098 }, { "epoch": 0.12110192837465565, "grad_norm": 27.981821060180664, "learning_rate": 9.692588531530693e-06, "loss": 0.6124, "step": 1099 }, { "epoch": 0.12121212121212122, "grad_norm": 12.169344902038574, "learning_rate": 9.69198462959759e-06, "loss": 0.5834, "step": 1100 }, { "epoch": 0.12132231404958678, "grad_norm": 7.219008445739746, "learning_rate": 9.691380153922235e-06, "loss": 0.4682, "step": 1101 }, { "epoch": 0.12143250688705234, "grad_norm": 12.253581047058105, "learning_rate": 9.690775104578539e-06, "loss": 0.5751, "step": 1102 }, { "epoch": 0.12154269972451791, "grad_norm": 8.646245002746582, "learning_rate": 9.690169481640492e-06, "loss": 0.4849, "step": 1103 }, { "epoch": 0.12165289256198347, "grad_norm": 8.885684967041016, "learning_rate": 9.68956328518215e-06, "loss": 0.4056, "step": 1104 }, { "epoch": 0.12176308539944904, "grad_norm": 17.205263137817383, "learning_rate": 9.68895651527764e-06, "loss": 0.522, "step": 1105 }, { "epoch": 0.1218732782369146, "grad_norm": 9.57472038269043, "learning_rate": 9.688349172001157e-06, "loss": 0.5177, "step": 1106 }, { "epoch": 0.12198347107438016, "grad_norm": 7.7095794677734375, "learning_rate": 9.687741255426969e-06, "loss": 0.3892, "step": 1107 }, { "epoch": 0.12209366391184573, "grad_norm": 7.4513092041015625, "learning_rate": 9.687132765629412e-06, "loss": 0.5235, "step": 1108 }, { "epoch": 0.1222038567493113, "grad_norm": 9.538368225097656, "learning_rate": 9.686523702682896e-06, "loss": 0.4363, "step": 1109 }, { "epoch": 0.12231404958677686, "grad_norm": 5.501028537750244, "learning_rate": 9.685914066661893e-06, "loss": 0.3747, "step": 1110 }, { "epoch": 0.12242424242424242, "grad_norm": 8.349221229553223, "learning_rate": 9.685303857640954e-06, "loss": 0.4618, "step": 1111 }, { "epoch": 0.12253443526170799, "grad_norm": 7.227693557739258, "learning_rate": 9.684693075694696e-06, "loss": 0.4728, "step": 1112 }, { "epoch": 0.12264462809917355, "grad_norm": 7.649456024169922, "learning_rate": 9.684081720897802e-06, "loss": 0.3752, "step": 1113 }, { "epoch": 0.12275482093663911, "grad_norm": 9.91862678527832, "learning_rate": 9.683469793325036e-06, "loss": 0.4001, "step": 1114 }, { "epoch": 0.12286501377410468, "grad_norm": 9.030213356018066, "learning_rate": 9.68285729305122e-06, "loss": 0.5393, "step": 1115 }, { "epoch": 0.12297520661157024, "grad_norm": 10.885235786437988, "learning_rate": 9.682244220151253e-06, "loss": 0.4393, "step": 1116 }, { "epoch": 0.1230853994490358, "grad_norm": 5.922689914703369, "learning_rate": 9.681630574700102e-06, "loss": 0.4039, "step": 1117 }, { "epoch": 0.12319559228650137, "grad_norm": 5.201834678649902, "learning_rate": 9.681016356772805e-06, "loss": 0.3368, "step": 1118 }, { "epoch": 0.12330578512396695, "grad_norm": 11.609149932861328, "learning_rate": 9.680401566444472e-06, "loss": 0.3857, "step": 1119 }, { "epoch": 0.12341597796143251, "grad_norm": 11.552797317504883, "learning_rate": 9.679786203790276e-06, "loss": 0.4921, "step": 1120 }, { "epoch": 0.12352617079889808, "grad_norm": 6.618052959442139, "learning_rate": 9.679170268885464e-06, "loss": 0.4078, "step": 1121 }, { "epoch": 0.12363636363636364, "grad_norm": 6.38631010055542, "learning_rate": 9.67855376180536e-06, "loss": 0.4434, "step": 1122 }, { "epoch": 0.1237465564738292, "grad_norm": 7.464217662811279, "learning_rate": 9.677936682625344e-06, "loss": 0.4414, "step": 1123 }, { "epoch": 0.12385674931129477, "grad_norm": 7.782440662384033, "learning_rate": 9.677319031420875e-06, "loss": 0.4064, "step": 1124 }, { "epoch": 0.12396694214876033, "grad_norm": 7.2428154945373535, "learning_rate": 9.676700808267483e-06, "loss": 0.3844, "step": 1125 }, { "epoch": 0.1240771349862259, "grad_norm": 22.8863525390625, "learning_rate": 9.676082013240764e-06, "loss": 0.5105, "step": 1126 }, { "epoch": 0.12418732782369146, "grad_norm": 10.620203018188477, "learning_rate": 9.675462646416385e-06, "loss": 0.4406, "step": 1127 }, { "epoch": 0.12429752066115703, "grad_norm": 7.522141933441162, "learning_rate": 9.67484270787008e-06, "loss": 0.3763, "step": 1128 }, { "epoch": 0.12440771349862259, "grad_norm": 7.644880294799805, "learning_rate": 9.67422219767766e-06, "loss": 0.4832, "step": 1129 }, { "epoch": 0.12451790633608815, "grad_norm": 8.263649940490723, "learning_rate": 9.673601115915001e-06, "loss": 0.4586, "step": 1130 }, { "epoch": 0.12462809917355372, "grad_norm": 13.16008186340332, "learning_rate": 9.672979462658047e-06, "loss": 0.5466, "step": 1131 }, { "epoch": 0.12473829201101928, "grad_norm": 10.177546501159668, "learning_rate": 9.672357237982819e-06, "loss": 0.4415, "step": 1132 }, { "epoch": 0.12484848484848485, "grad_norm": 9.804990768432617, "learning_rate": 9.6717344419654e-06, "loss": 0.4931, "step": 1133 }, { "epoch": 0.12495867768595041, "grad_norm": 10.962959289550781, "learning_rate": 9.67111107468195e-06, "loss": 0.4532, "step": 1134 }, { "epoch": 0.12506887052341598, "grad_norm": 6.473902702331543, "learning_rate": 9.670487136208688e-06, "loss": 0.4059, "step": 1135 }, { "epoch": 0.12517906336088155, "grad_norm": 12.173033714294434, "learning_rate": 9.669862626621918e-06, "loss": 0.3997, "step": 1136 }, { "epoch": 0.1252892561983471, "grad_norm": 10.380783081054688, "learning_rate": 9.669237545998002e-06, "loss": 0.4907, "step": 1137 }, { "epoch": 0.12539944903581268, "grad_norm": 14.192329406738281, "learning_rate": 9.668611894413376e-06, "loss": 0.4613, "step": 1138 }, { "epoch": 0.12550964187327823, "grad_norm": 10.058757781982422, "learning_rate": 9.667985671944546e-06, "loss": 0.3814, "step": 1139 }, { "epoch": 0.1256198347107438, "grad_norm": 8.420469284057617, "learning_rate": 9.667358878668088e-06, "loss": 0.4661, "step": 1140 }, { "epoch": 0.12573002754820936, "grad_norm": 7.803491592407227, "learning_rate": 9.666731514660646e-06, "loss": 0.4629, "step": 1141 }, { "epoch": 0.12584022038567494, "grad_norm": 11.714037895202637, "learning_rate": 9.666103579998935e-06, "loss": 0.5287, "step": 1142 }, { "epoch": 0.1259504132231405, "grad_norm": 9.14343547821045, "learning_rate": 9.665475074759739e-06, "loss": 0.4649, "step": 1143 }, { "epoch": 0.12606060606060607, "grad_norm": 8.096257209777832, "learning_rate": 9.664845999019914e-06, "loss": 0.4587, "step": 1144 }, { "epoch": 0.12617079889807162, "grad_norm": 8.276843070983887, "learning_rate": 9.664216352856386e-06, "loss": 0.51, "step": 1145 }, { "epoch": 0.1262809917355372, "grad_norm": 7.214012145996094, "learning_rate": 9.663586136346143e-06, "loss": 0.4942, "step": 1146 }, { "epoch": 0.12639118457300275, "grad_norm": 9.526751518249512, "learning_rate": 9.662955349566254e-06, "loss": 0.4873, "step": 1147 }, { "epoch": 0.12650137741046832, "grad_norm": 6.470208644866943, "learning_rate": 9.662323992593852e-06, "loss": 0.4211, "step": 1148 }, { "epoch": 0.12661157024793387, "grad_norm": 6.078673362731934, "learning_rate": 9.661692065506136e-06, "loss": 0.4929, "step": 1149 }, { "epoch": 0.12672176308539945, "grad_norm": 7.904278755187988, "learning_rate": 9.661059568380384e-06, "loss": 0.4637, "step": 1150 }, { "epoch": 0.126831955922865, "grad_norm": 5.249787330627441, "learning_rate": 9.660426501293937e-06, "loss": 0.3808, "step": 1151 }, { "epoch": 0.12694214876033058, "grad_norm": 6.705897331237793, "learning_rate": 9.659792864324207e-06, "loss": 0.4353, "step": 1152 }, { "epoch": 0.12705234159779613, "grad_norm": 6.71006965637207, "learning_rate": 9.659158657548676e-06, "loss": 0.3755, "step": 1153 }, { "epoch": 0.1271625344352617, "grad_norm": 15.06179141998291, "learning_rate": 9.658523881044892e-06, "loss": 0.5519, "step": 1154 }, { "epoch": 0.12727272727272726, "grad_norm": 12.418374061584473, "learning_rate": 9.657888534890484e-06, "loss": 0.5228, "step": 1155 }, { "epoch": 0.12738292011019284, "grad_norm": 10.345029830932617, "learning_rate": 9.657252619163136e-06, "loss": 0.4888, "step": 1156 }, { "epoch": 0.12749311294765842, "grad_norm": 9.189409255981445, "learning_rate": 9.656616133940612e-06, "loss": 0.4751, "step": 1157 }, { "epoch": 0.12760330578512397, "grad_norm": 6.832703590393066, "learning_rate": 9.655979079300744e-06, "loss": 0.4348, "step": 1158 }, { "epoch": 0.12771349862258954, "grad_norm": 9.597253799438477, "learning_rate": 9.655341455321427e-06, "loss": 0.5213, "step": 1159 }, { "epoch": 0.1278236914600551, "grad_norm": 7.796607971191406, "learning_rate": 9.654703262080636e-06, "loss": 0.4712, "step": 1160 }, { "epoch": 0.12793388429752067, "grad_norm": 5.447074890136719, "learning_rate": 9.654064499656405e-06, "loss": 0.4379, "step": 1161 }, { "epoch": 0.12804407713498622, "grad_norm": 7.612880706787109, "learning_rate": 9.653425168126846e-06, "loss": 0.4716, "step": 1162 }, { "epoch": 0.1281542699724518, "grad_norm": 13.605120658874512, "learning_rate": 9.652785267570136e-06, "loss": 0.5116, "step": 1163 }, { "epoch": 0.12826446280991735, "grad_norm": 9.965607643127441, "learning_rate": 9.652144798064523e-06, "loss": 0.4869, "step": 1164 }, { "epoch": 0.12837465564738293, "grad_norm": 8.977864265441895, "learning_rate": 9.651503759688325e-06, "loss": 0.4552, "step": 1165 }, { "epoch": 0.12848484848484848, "grad_norm": 12.288677215576172, "learning_rate": 9.65086215251993e-06, "loss": 0.4344, "step": 1166 }, { "epoch": 0.12859504132231406, "grad_norm": 7.44070291519165, "learning_rate": 9.650219976637792e-06, "loss": 0.5138, "step": 1167 }, { "epoch": 0.1287052341597796, "grad_norm": 5.929866790771484, "learning_rate": 9.64957723212044e-06, "loss": 0.4361, "step": 1168 }, { "epoch": 0.12881542699724519, "grad_norm": 8.705809593200684, "learning_rate": 9.648933919046466e-06, "loss": 0.4308, "step": 1169 }, { "epoch": 0.12892561983471074, "grad_norm": 11.498370170593262, "learning_rate": 9.648290037494538e-06, "loss": 0.5356, "step": 1170 }, { "epoch": 0.12903581267217631, "grad_norm": 9.883990287780762, "learning_rate": 9.647645587543391e-06, "loss": 0.547, "step": 1171 }, { "epoch": 0.12914600550964186, "grad_norm": 11.769598007202148, "learning_rate": 9.647000569271829e-06, "loss": 0.4811, "step": 1172 }, { "epoch": 0.12925619834710744, "grad_norm": 13.798337936401367, "learning_rate": 9.646354982758724e-06, "loss": 0.5142, "step": 1173 }, { "epoch": 0.129366391184573, "grad_norm": 15.07916259765625, "learning_rate": 9.64570882808302e-06, "loss": 0.5676, "step": 1174 }, { "epoch": 0.12947658402203857, "grad_norm": 3.993927240371704, "learning_rate": 9.64506210532373e-06, "loss": 0.4158, "step": 1175 }, { "epoch": 0.12958677685950412, "grad_norm": 9.357967376708984, "learning_rate": 9.644414814559937e-06, "loss": 0.407, "step": 1176 }, { "epoch": 0.1296969696969697, "grad_norm": 11.830809593200684, "learning_rate": 9.64376695587079e-06, "loss": 0.5259, "step": 1177 }, { "epoch": 0.12980716253443525, "grad_norm": 7.312831878662109, "learning_rate": 9.643118529335514e-06, "loss": 0.4122, "step": 1178 }, { "epoch": 0.12991735537190083, "grad_norm": 7.975347518920898, "learning_rate": 9.642469535033396e-06, "loss": 0.4104, "step": 1179 }, { "epoch": 0.13002754820936638, "grad_norm": 6.080129623413086, "learning_rate": 9.641819973043796e-06, "loss": 0.4823, "step": 1180 }, { "epoch": 0.13013774104683196, "grad_norm": 7.574034690856934, "learning_rate": 9.641169843446146e-06, "loss": 0.4133, "step": 1181 }, { "epoch": 0.13024793388429753, "grad_norm": 7.223039150238037, "learning_rate": 9.640519146319941e-06, "loss": 0.4473, "step": 1182 }, { "epoch": 0.13035812672176308, "grad_norm": 20.465892791748047, "learning_rate": 9.639867881744753e-06, "loss": 0.4561, "step": 1183 }, { "epoch": 0.13046831955922866, "grad_norm": 5.304706573486328, "learning_rate": 9.63921604980022e-06, "loss": 0.4085, "step": 1184 }, { "epoch": 0.1305785123966942, "grad_norm": 13.216215133666992, "learning_rate": 9.638563650566044e-06, "loss": 0.5271, "step": 1185 }, { "epoch": 0.1306887052341598, "grad_norm": 5.724482536315918, "learning_rate": 9.637910684122003e-06, "loss": 0.4576, "step": 1186 }, { "epoch": 0.13079889807162534, "grad_norm": 9.094064712524414, "learning_rate": 9.637257150547945e-06, "loss": 0.4702, "step": 1187 }, { "epoch": 0.13090909090909092, "grad_norm": 6.847584247589111, "learning_rate": 9.636603049923783e-06, "loss": 0.5027, "step": 1188 }, { "epoch": 0.13101928374655647, "grad_norm": 5.101464748382568, "learning_rate": 9.635948382329502e-06, "loss": 0.3008, "step": 1189 }, { "epoch": 0.13112947658402205, "grad_norm": 10.623517990112305, "learning_rate": 9.635293147845156e-06, "loss": 0.4668, "step": 1190 }, { "epoch": 0.1312396694214876, "grad_norm": 9.423023223876953, "learning_rate": 9.634637346550866e-06, "loss": 0.5525, "step": 1191 }, { "epoch": 0.13134986225895318, "grad_norm": 6.9419050216674805, "learning_rate": 9.633980978526826e-06, "loss": 0.5164, "step": 1192 }, { "epoch": 0.13146005509641873, "grad_norm": 5.715521335601807, "learning_rate": 9.6333240438533e-06, "loss": 0.4139, "step": 1193 }, { "epoch": 0.1315702479338843, "grad_norm": 5.699647903442383, "learning_rate": 9.632666542610614e-06, "loss": 0.4201, "step": 1194 }, { "epoch": 0.13168044077134985, "grad_norm": 7.819087505340576, "learning_rate": 9.632008474879171e-06, "loss": 0.5354, "step": 1195 }, { "epoch": 0.13179063360881543, "grad_norm": 7.091444492340088, "learning_rate": 9.63134984073944e-06, "loss": 0.4269, "step": 1196 }, { "epoch": 0.13190082644628098, "grad_norm": 5.056596755981445, "learning_rate": 9.630690640271958e-06, "loss": 0.4025, "step": 1197 }, { "epoch": 0.13201101928374656, "grad_norm": 13.227201461791992, "learning_rate": 9.630030873557335e-06, "loss": 0.5259, "step": 1198 }, { "epoch": 0.1321212121212121, "grad_norm": 12.597100257873535, "learning_rate": 9.629370540676246e-06, "loss": 0.5291, "step": 1199 }, { "epoch": 0.1322314049586777, "grad_norm": 11.199701309204102, "learning_rate": 9.628709641709441e-06, "loss": 0.5655, "step": 1200 }, { "epoch": 0.13234159779614324, "grad_norm": 10.168828010559082, "learning_rate": 9.628048176737734e-06, "loss": 0.4606, "step": 1201 }, { "epoch": 0.13245179063360882, "grad_norm": 8.4885835647583, "learning_rate": 9.627386145842008e-06, "loss": 0.538, "step": 1202 }, { "epoch": 0.13256198347107437, "grad_norm": 5.746736526489258, "learning_rate": 9.626723549103218e-06, "loss": 0.4401, "step": 1203 }, { "epoch": 0.13267217630853995, "grad_norm": 7.922615051269531, "learning_rate": 9.62606038660239e-06, "loss": 0.4987, "step": 1204 }, { "epoch": 0.1327823691460055, "grad_norm": 6.999356269836426, "learning_rate": 9.625396658420611e-06, "loss": 0.4446, "step": 1205 }, { "epoch": 0.13289256198347107, "grad_norm": 9.101819038391113, "learning_rate": 9.624732364639046e-06, "loss": 0.5127, "step": 1206 }, { "epoch": 0.13300275482093665, "grad_norm": 7.7360358238220215, "learning_rate": 9.624067505338928e-06, "loss": 0.4511, "step": 1207 }, { "epoch": 0.1331129476584022, "grad_norm": 7.754938125610352, "learning_rate": 9.623402080601552e-06, "loss": 0.4846, "step": 1208 }, { "epoch": 0.13322314049586778, "grad_norm": 9.81610107421875, "learning_rate": 9.62273609050829e-06, "loss": 0.5006, "step": 1209 }, { "epoch": 0.13333333333333333, "grad_norm": 7.27468204498291, "learning_rate": 9.622069535140579e-06, "loss": 0.4858, "step": 1210 }, { "epoch": 0.1334435261707989, "grad_norm": 11.147741317749023, "learning_rate": 9.621402414579928e-06, "loss": 0.485, "step": 1211 }, { "epoch": 0.13355371900826446, "grad_norm": 4.764684200286865, "learning_rate": 9.620734728907912e-06, "loss": 0.4267, "step": 1212 }, { "epoch": 0.13366391184573004, "grad_norm": 8.194490432739258, "learning_rate": 9.620066478206176e-06, "loss": 0.5357, "step": 1213 }, { "epoch": 0.1337741046831956, "grad_norm": 7.495524883270264, "learning_rate": 9.619397662556434e-06, "loss": 0.4516, "step": 1214 }, { "epoch": 0.13388429752066117, "grad_norm": 7.69467306137085, "learning_rate": 9.618728282040472e-06, "loss": 0.4443, "step": 1215 }, { "epoch": 0.13399449035812672, "grad_norm": 5.7928466796875, "learning_rate": 9.618058336740144e-06, "loss": 0.4539, "step": 1216 }, { "epoch": 0.1341046831955923, "grad_norm": 6.241455554962158, "learning_rate": 9.617387826737367e-06, "loss": 0.4858, "step": 1217 }, { "epoch": 0.13421487603305784, "grad_norm": 7.450160980224609, "learning_rate": 9.616716752114135e-06, "loss": 0.453, "step": 1218 }, { "epoch": 0.13432506887052342, "grad_norm": 10.77891731262207, "learning_rate": 9.616045112952508e-06, "loss": 0.446, "step": 1219 }, { "epoch": 0.13443526170798897, "grad_norm": 6.600894451141357, "learning_rate": 9.615372909334612e-06, "loss": 0.446, "step": 1220 }, { "epoch": 0.13454545454545455, "grad_norm": 9.839051246643066, "learning_rate": 9.61470014134265e-06, "loss": 0.4126, "step": 1221 }, { "epoch": 0.1346556473829201, "grad_norm": 5.878121852874756, "learning_rate": 9.614026809058886e-06, "loss": 0.4811, "step": 1222 }, { "epoch": 0.13476584022038568, "grad_norm": 6.809379577636719, "learning_rate": 9.613352912565656e-06, "loss": 0.4785, "step": 1223 }, { "epoch": 0.13487603305785123, "grad_norm": 11.274439811706543, "learning_rate": 9.612678451945364e-06, "loss": 0.5477, "step": 1224 }, { "epoch": 0.1349862258953168, "grad_norm": 7.883140563964844, "learning_rate": 9.612003427280487e-06, "loss": 0.4922, "step": 1225 }, { "epoch": 0.13509641873278236, "grad_norm": 11.556133270263672, "learning_rate": 9.611327838653563e-06, "loss": 0.5489, "step": 1226 }, { "epoch": 0.13520661157024794, "grad_norm": 5.750260829925537, "learning_rate": 9.61065168614721e-06, "loss": 0.3844, "step": 1227 }, { "epoch": 0.1353168044077135, "grad_norm": 7.41603422164917, "learning_rate": 9.609974969844105e-06, "loss": 0.3361, "step": 1228 }, { "epoch": 0.13542699724517906, "grad_norm": 6.057075500488281, "learning_rate": 9.609297689827e-06, "loss": 0.39, "step": 1229 }, { "epoch": 0.13553719008264462, "grad_norm": 7.7945146560668945, "learning_rate": 9.608619846178711e-06, "loss": 0.4986, "step": 1230 }, { "epoch": 0.1356473829201102, "grad_norm": 7.78212833404541, "learning_rate": 9.607941438982127e-06, "loss": 0.4787, "step": 1231 }, { "epoch": 0.13575757575757577, "grad_norm": 11.500925064086914, "learning_rate": 9.607262468320205e-06, "loss": 0.4491, "step": 1232 }, { "epoch": 0.13586776859504132, "grad_norm": 8.095635414123535, "learning_rate": 9.606582934275968e-06, "loss": 0.4334, "step": 1233 }, { "epoch": 0.1359779614325069, "grad_norm": 5.727466583251953, "learning_rate": 9.605902836932514e-06, "loss": 0.3908, "step": 1234 }, { "epoch": 0.13608815426997245, "grad_norm": 8.273818016052246, "learning_rate": 9.605222176373006e-06, "loss": 0.4466, "step": 1235 }, { "epoch": 0.13619834710743803, "grad_norm": 4.12173318862915, "learning_rate": 9.604540952680672e-06, "loss": 0.3868, "step": 1236 }, { "epoch": 0.13630853994490358, "grad_norm": 7.504457950592041, "learning_rate": 9.603859165938817e-06, "loss": 0.507, "step": 1237 }, { "epoch": 0.13641873278236916, "grad_norm": 8.912880897521973, "learning_rate": 9.60317681623081e-06, "loss": 0.4625, "step": 1238 }, { "epoch": 0.1365289256198347, "grad_norm": 7.629209995269775, "learning_rate": 9.602493903640089e-06, "loss": 0.4457, "step": 1239 }, { "epoch": 0.13663911845730028, "grad_norm": 11.610555648803711, "learning_rate": 9.60181042825016e-06, "loss": 0.4087, "step": 1240 }, { "epoch": 0.13674931129476584, "grad_norm": 13.225961685180664, "learning_rate": 9.601126390144602e-06, "loss": 0.4733, "step": 1241 }, { "epoch": 0.1368595041322314, "grad_norm": 9.693376541137695, "learning_rate": 9.60044178940706e-06, "loss": 0.4659, "step": 1242 }, { "epoch": 0.13696969696969696, "grad_norm": 10.25334358215332, "learning_rate": 9.599756626121244e-06, "loss": 0.4908, "step": 1243 }, { "epoch": 0.13707988980716254, "grad_norm": 15.683958053588867, "learning_rate": 9.599070900370943e-06, "loss": 0.5867, "step": 1244 }, { "epoch": 0.1371900826446281, "grad_norm": 5.411210060119629, "learning_rate": 9.598384612240004e-06, "loss": 0.4215, "step": 1245 }, { "epoch": 0.13730027548209367, "grad_norm": 6.647460460662842, "learning_rate": 9.597697761812347e-06, "loss": 0.4412, "step": 1246 }, { "epoch": 0.13741046831955922, "grad_norm": 6.99817419052124, "learning_rate": 9.597010349171964e-06, "loss": 0.4447, "step": 1247 }, { "epoch": 0.1375206611570248, "grad_norm": 7.737760543823242, "learning_rate": 9.596322374402908e-06, "loss": 0.4342, "step": 1248 }, { "epoch": 0.13763085399449035, "grad_norm": 6.537383556365967, "learning_rate": 9.595633837589313e-06, "loss": 0.5261, "step": 1249 }, { "epoch": 0.13774104683195593, "grad_norm": 9.799365043640137, "learning_rate": 9.594944738815366e-06, "loss": 0.5021, "step": 1250 }, { "epoch": 0.13785123966942148, "grad_norm": 7.652403831481934, "learning_rate": 9.594255078165338e-06, "loss": 0.4058, "step": 1251 }, { "epoch": 0.13796143250688706, "grad_norm": 6.344549655914307, "learning_rate": 9.593564855723557e-06, "loss": 0.3838, "step": 1252 }, { "epoch": 0.1380716253443526, "grad_norm": 5.8052215576171875, "learning_rate": 9.592874071574424e-06, "loss": 0.4224, "step": 1253 }, { "epoch": 0.13818181818181818, "grad_norm": 5.907143592834473, "learning_rate": 9.592182725802412e-06, "loss": 0.4224, "step": 1254 }, { "epoch": 0.13829201101928373, "grad_norm": 8.23563289642334, "learning_rate": 9.591490818492059e-06, "loss": 0.4484, "step": 1255 }, { "epoch": 0.1384022038567493, "grad_norm": 7.861181259155273, "learning_rate": 9.590798349727972e-06, "loss": 0.531, "step": 1256 }, { "epoch": 0.1385123966942149, "grad_norm": 11.337368965148926, "learning_rate": 9.590105319594825e-06, "loss": 0.4131, "step": 1257 }, { "epoch": 0.13862258953168044, "grad_norm": 6.5620269775390625, "learning_rate": 9.589411728177367e-06, "loss": 0.4461, "step": 1258 }, { "epoch": 0.13873278236914602, "grad_norm": 14.474329948425293, "learning_rate": 9.588717575560407e-06, "loss": 0.544, "step": 1259 }, { "epoch": 0.13884297520661157, "grad_norm": 13.015029907226562, "learning_rate": 9.58802286182883e-06, "loss": 0.475, "step": 1260 }, { "epoch": 0.13895316804407715, "grad_norm": 8.727346420288086, "learning_rate": 9.587327587067583e-06, "loss": 0.5245, "step": 1261 }, { "epoch": 0.1390633608815427, "grad_norm": 17.652963638305664, "learning_rate": 9.58663175136169e-06, "loss": 0.4731, "step": 1262 }, { "epoch": 0.13917355371900827, "grad_norm": 6.671087265014648, "learning_rate": 9.585935354796235e-06, "loss": 0.47, "step": 1263 }, { "epoch": 0.13928374655647383, "grad_norm": 4.637528896331787, "learning_rate": 9.585238397456373e-06, "loss": 0.4266, "step": 1264 }, { "epoch": 0.1393939393939394, "grad_norm": 12.280485153198242, "learning_rate": 9.584540879427335e-06, "loss": 0.5641, "step": 1265 }, { "epoch": 0.13950413223140495, "grad_norm": 6.294337272644043, "learning_rate": 9.58384280079441e-06, "loss": 0.4443, "step": 1266 }, { "epoch": 0.13961432506887053, "grad_norm": 8.127622604370117, "learning_rate": 9.583144161642958e-06, "loss": 0.4848, "step": 1267 }, { "epoch": 0.13972451790633608, "grad_norm": 6.222387313842773, "learning_rate": 9.582444962058416e-06, "loss": 0.4791, "step": 1268 }, { "epoch": 0.13983471074380166, "grad_norm": 5.600038528442383, "learning_rate": 9.581745202126278e-06, "loss": 0.3918, "step": 1269 }, { "epoch": 0.1399449035812672, "grad_norm": 5.010919094085693, "learning_rate": 9.581044881932113e-06, "loss": 0.4981, "step": 1270 }, { "epoch": 0.1400550964187328, "grad_norm": 8.86003589630127, "learning_rate": 9.580344001561557e-06, "loss": 0.5298, "step": 1271 }, { "epoch": 0.14016528925619834, "grad_norm": 7.720181941986084, "learning_rate": 9.579642561100314e-06, "loss": 0.429, "step": 1272 }, { "epoch": 0.14027548209366392, "grad_norm": 9.330036163330078, "learning_rate": 9.578940560634157e-06, "loss": 0.4489, "step": 1273 }, { "epoch": 0.14038567493112947, "grad_norm": 9.135281562805176, "learning_rate": 9.578238000248931e-06, "loss": 0.3906, "step": 1274 }, { "epoch": 0.14049586776859505, "grad_norm": 8.328913688659668, "learning_rate": 9.577534880030543e-06, "loss": 0.5004, "step": 1275 }, { "epoch": 0.1406060606060606, "grad_norm": 8.757101058959961, "learning_rate": 9.576831200064972e-06, "loss": 0.5734, "step": 1276 }, { "epoch": 0.14071625344352617, "grad_norm": 7.532750606536865, "learning_rate": 9.576126960438264e-06, "loss": 0.4672, "step": 1277 }, { "epoch": 0.14082644628099172, "grad_norm": 7.0799970626831055, "learning_rate": 9.575422161236533e-06, "loss": 0.4517, "step": 1278 }, { "epoch": 0.1409366391184573, "grad_norm": 9.143688201904297, "learning_rate": 9.574716802545968e-06, "loss": 0.5049, "step": 1279 }, { "epoch": 0.14104683195592285, "grad_norm": 4.941235542297363, "learning_rate": 9.574010884452817e-06, "loss": 0.4532, "step": 1280 }, { "epoch": 0.14115702479338843, "grad_norm": 9.055264472961426, "learning_rate": 9.573304407043402e-06, "loss": 0.4694, "step": 1281 }, { "epoch": 0.14126721763085398, "grad_norm": 5.88823127746582, "learning_rate": 9.572597370404114e-06, "loss": 0.4428, "step": 1282 }, { "epoch": 0.14137741046831956, "grad_norm": 11.091938972473145, "learning_rate": 9.571889774621406e-06, "loss": 0.528, "step": 1283 }, { "epoch": 0.14148760330578514, "grad_norm": 18.7547607421875, "learning_rate": 9.571181619781806e-06, "loss": 0.4894, "step": 1284 }, { "epoch": 0.1415977961432507, "grad_norm": 18.28177833557129, "learning_rate": 9.57047290597191e-06, "loss": 0.5075, "step": 1285 }, { "epoch": 0.14170798898071627, "grad_norm": 5.0757975578308105, "learning_rate": 9.569763633278377e-06, "loss": 0.4046, "step": 1286 }, { "epoch": 0.14181818181818182, "grad_norm": 12.676048278808594, "learning_rate": 9.56905380178794e-06, "loss": 0.4505, "step": 1287 }, { "epoch": 0.1419283746556474, "grad_norm": 6.966454982757568, "learning_rate": 9.568343411587397e-06, "loss": 0.4769, "step": 1288 }, { "epoch": 0.14203856749311294, "grad_norm": 6.308920860290527, "learning_rate": 9.567632462763617e-06, "loss": 0.5722, "step": 1289 }, { "epoch": 0.14214876033057852, "grad_norm": 8.201804161071777, "learning_rate": 9.566920955403533e-06, "loss": 0.4732, "step": 1290 }, { "epoch": 0.14225895316804407, "grad_norm": 5.8679890632629395, "learning_rate": 9.566208889594154e-06, "loss": 0.5137, "step": 1291 }, { "epoch": 0.14236914600550965, "grad_norm": 8.620732307434082, "learning_rate": 9.565496265422549e-06, "loss": 0.5064, "step": 1292 }, { "epoch": 0.1424793388429752, "grad_norm": 7.340023040771484, "learning_rate": 9.564783082975856e-06, "loss": 0.5066, "step": 1293 }, { "epoch": 0.14258953168044078, "grad_norm": 8.920328140258789, "learning_rate": 9.56406934234129e-06, "loss": 0.3834, "step": 1294 }, { "epoch": 0.14269972451790633, "grad_norm": 10.822632789611816, "learning_rate": 9.563355043606124e-06, "loss": 0.5246, "step": 1295 }, { "epoch": 0.1428099173553719, "grad_norm": 8.7300443649292, "learning_rate": 9.562640186857706e-06, "loss": 0.5316, "step": 1296 }, { "epoch": 0.14292011019283746, "grad_norm": 5.638113498687744, "learning_rate": 9.561924772183446e-06, "loss": 0.3808, "step": 1297 }, { "epoch": 0.14303030303030304, "grad_norm": 4.380406856536865, "learning_rate": 9.561208799670828e-06, "loss": 0.4787, "step": 1298 }, { "epoch": 0.14314049586776859, "grad_norm": 10.42844295501709, "learning_rate": 9.560492269407405e-06, "loss": 0.4683, "step": 1299 }, { "epoch": 0.14325068870523416, "grad_norm": 6.256180286407471, "learning_rate": 9.559775181480791e-06, "loss": 0.3951, "step": 1300 }, { "epoch": 0.14336088154269971, "grad_norm": 5.697518825531006, "learning_rate": 9.559057535978673e-06, "loss": 0.4077, "step": 1301 }, { "epoch": 0.1434710743801653, "grad_norm": 7.972506523132324, "learning_rate": 9.558339332988807e-06, "loss": 0.4687, "step": 1302 }, { "epoch": 0.14358126721763084, "grad_norm": 6.847010612487793, "learning_rate": 9.557620572599015e-06, "loss": 0.4682, "step": 1303 }, { "epoch": 0.14369146005509642, "grad_norm": 8.559355735778809, "learning_rate": 9.55690125489719e-06, "loss": 0.4327, "step": 1304 }, { "epoch": 0.14380165289256197, "grad_norm": 8.214713096618652, "learning_rate": 9.55618137997129e-06, "loss": 0.5354, "step": 1305 }, { "epoch": 0.14391184573002755, "grad_norm": 9.412150382995605, "learning_rate": 9.55546094790934e-06, "loss": 0.491, "step": 1306 }, { "epoch": 0.1440220385674931, "grad_norm": 10.300345420837402, "learning_rate": 9.554739958799438e-06, "loss": 0.5273, "step": 1307 }, { "epoch": 0.14413223140495868, "grad_norm": 10.135249137878418, "learning_rate": 9.554018412729747e-06, "loss": 0.3953, "step": 1308 }, { "epoch": 0.14424242424242426, "grad_norm": 7.0443620681762695, "learning_rate": 9.553296309788498e-06, "loss": 0.4439, "step": 1309 }, { "epoch": 0.1443526170798898, "grad_norm": 7.326988220214844, "learning_rate": 9.552573650063992e-06, "loss": 0.5392, "step": 1310 }, { "epoch": 0.14446280991735538, "grad_norm": 9.309418678283691, "learning_rate": 9.551850433644596e-06, "loss": 0.4253, "step": 1311 }, { "epoch": 0.14457300275482093, "grad_norm": 11.001035690307617, "learning_rate": 9.551126660618746e-06, "loss": 0.5079, "step": 1312 }, { "epoch": 0.1446831955922865, "grad_norm": 13.418405532836914, "learning_rate": 9.550402331074945e-06, "loss": 0.4895, "step": 1313 }, { "epoch": 0.14479338842975206, "grad_norm": 8.14391040802002, "learning_rate": 9.549677445101766e-06, "loss": 0.4726, "step": 1314 }, { "epoch": 0.14490358126721764, "grad_norm": 8.856776237487793, "learning_rate": 9.54895200278785e-06, "loss": 0.4434, "step": 1315 }, { "epoch": 0.1450137741046832, "grad_norm": 7.328104019165039, "learning_rate": 9.548226004221903e-06, "loss": 0.4649, "step": 1316 }, { "epoch": 0.14512396694214877, "grad_norm": 8.873720169067383, "learning_rate": 9.547499449492701e-06, "loss": 0.4917, "step": 1317 }, { "epoch": 0.14523415977961432, "grad_norm": 8.50603199005127, "learning_rate": 9.54677233868909e-06, "loss": 0.4602, "step": 1318 }, { "epoch": 0.1453443526170799, "grad_norm": 6.062439441680908, "learning_rate": 9.546044671899982e-06, "loss": 0.4393, "step": 1319 }, { "epoch": 0.14545454545454545, "grad_norm": 10.409941673278809, "learning_rate": 9.545316449214354e-06, "loss": 0.4722, "step": 1320 }, { "epoch": 0.14556473829201103, "grad_norm": 14.732938766479492, "learning_rate": 9.54458767072126e-06, "loss": 0.4956, "step": 1321 }, { "epoch": 0.14567493112947658, "grad_norm": 7.161686897277832, "learning_rate": 9.54385833650981e-06, "loss": 0.455, "step": 1322 }, { "epoch": 0.14578512396694215, "grad_norm": 7.07647180557251, "learning_rate": 9.543128446669191e-06, "loss": 0.4994, "step": 1323 }, { "epoch": 0.1458953168044077, "grad_norm": 8.985993385314941, "learning_rate": 9.542398001288654e-06, "loss": 0.4947, "step": 1324 }, { "epoch": 0.14600550964187328, "grad_norm": 7.013548374176025, "learning_rate": 9.54166700045752e-06, "loss": 0.4678, "step": 1325 }, { "epoch": 0.14611570247933883, "grad_norm": 11.606616020202637, "learning_rate": 9.540935444265175e-06, "loss": 0.5178, "step": 1326 }, { "epoch": 0.1462258953168044, "grad_norm": 9.271910667419434, "learning_rate": 9.540203332801075e-06, "loss": 0.5439, "step": 1327 }, { "epoch": 0.14633608815426996, "grad_norm": 6.152432918548584, "learning_rate": 9.539470666154747e-06, "loss": 0.455, "step": 1328 }, { "epoch": 0.14644628099173554, "grad_norm": 6.23866081237793, "learning_rate": 9.538737444415777e-06, "loss": 0.4138, "step": 1329 }, { "epoch": 0.1465564738292011, "grad_norm": 7.096884727478027, "learning_rate": 9.538003667673828e-06, "loss": 0.5072, "step": 1330 }, { "epoch": 0.14666666666666667, "grad_norm": 6.731375694274902, "learning_rate": 9.537269336018627e-06, "loss": 0.4115, "step": 1331 }, { "epoch": 0.14677685950413222, "grad_norm": 6.049683094024658, "learning_rate": 9.536534449539966e-06, "loss": 0.472, "step": 1332 }, { "epoch": 0.1468870523415978, "grad_norm": 8.583109855651855, "learning_rate": 9.535799008327711e-06, "loss": 0.4451, "step": 1333 }, { "epoch": 0.14699724517906337, "grad_norm": 5.552403450012207, "learning_rate": 9.535063012471793e-06, "loss": 0.3823, "step": 1334 }, { "epoch": 0.14710743801652892, "grad_norm": 7.0078606605529785, "learning_rate": 9.53432646206221e-06, "loss": 0.4477, "step": 1335 }, { "epoch": 0.1472176308539945, "grad_norm": 8.977633476257324, "learning_rate": 9.533589357189026e-06, "loss": 0.4246, "step": 1336 }, { "epoch": 0.14732782369146005, "grad_norm": 11.107526779174805, "learning_rate": 9.53285169794238e-06, "loss": 0.5381, "step": 1337 }, { "epoch": 0.14743801652892563, "grad_norm": 8.975915908813477, "learning_rate": 9.532113484412468e-06, "loss": 0.4812, "step": 1338 }, { "epoch": 0.14754820936639118, "grad_norm": 7.837002277374268, "learning_rate": 9.531374716689565e-06, "loss": 0.5246, "step": 1339 }, { "epoch": 0.14765840220385676, "grad_norm": 8.374943733215332, "learning_rate": 9.530635394864006e-06, "loss": 0.4484, "step": 1340 }, { "epoch": 0.1477685950413223, "grad_norm": 7.772285461425781, "learning_rate": 9.529895519026196e-06, "loss": 0.4227, "step": 1341 }, { "epoch": 0.1478787878787879, "grad_norm": 4.9404072761535645, "learning_rate": 9.529155089266609e-06, "loss": 0.4677, "step": 1342 }, { "epoch": 0.14798898071625344, "grad_norm": 9.528486251831055, "learning_rate": 9.528414105675785e-06, "loss": 0.5177, "step": 1343 }, { "epoch": 0.14809917355371902, "grad_norm": 6.80415153503418, "learning_rate": 9.527672568344332e-06, "loss": 0.5118, "step": 1344 }, { "epoch": 0.14820936639118457, "grad_norm": 7.836904525756836, "learning_rate": 9.52693047736293e-06, "loss": 0.4326, "step": 1345 }, { "epoch": 0.14831955922865014, "grad_norm": 6.0371012687683105, "learning_rate": 9.526187832822318e-06, "loss": 0.4778, "step": 1346 }, { "epoch": 0.1484297520661157, "grad_norm": 5.394172668457031, "learning_rate": 9.52544463481331e-06, "loss": 0.436, "step": 1347 }, { "epoch": 0.14853994490358127, "grad_norm": 7.3542351722717285, "learning_rate": 9.524700883426786e-06, "loss": 0.4585, "step": 1348 }, { "epoch": 0.14865013774104682, "grad_norm": 7.725783824920654, "learning_rate": 9.523956578753688e-06, "loss": 0.4353, "step": 1349 }, { "epoch": 0.1487603305785124, "grad_norm": 9.414610862731934, "learning_rate": 9.523211720885038e-06, "loss": 0.4457, "step": 1350 }, { "epoch": 0.14887052341597795, "grad_norm": 5.255350112915039, "learning_rate": 9.522466309911913e-06, "loss": 0.4497, "step": 1351 }, { "epoch": 0.14898071625344353, "grad_norm": 5.908344745635986, "learning_rate": 9.521720345925464e-06, "loss": 0.4214, "step": 1352 }, { "epoch": 0.14909090909090908, "grad_norm": 10.101250648498535, "learning_rate": 9.52097382901691e-06, "loss": 0.4965, "step": 1353 }, { "epoch": 0.14920110192837466, "grad_norm": 5.879767417907715, "learning_rate": 9.520226759277536e-06, "loss": 0.4725, "step": 1354 }, { "epoch": 0.1493112947658402, "grad_norm": 7.752197265625, "learning_rate": 9.519479136798693e-06, "loss": 0.4929, "step": 1355 }, { "epoch": 0.14942148760330579, "grad_norm": 7.521350860595703, "learning_rate": 9.518730961671802e-06, "loss": 0.379, "step": 1356 }, { "epoch": 0.14953168044077134, "grad_norm": 6.87856912612915, "learning_rate": 9.51798223398835e-06, "loss": 0.4206, "step": 1357 }, { "epoch": 0.14964187327823691, "grad_norm": 7.299850940704346, "learning_rate": 9.517232953839894e-06, "loss": 0.4177, "step": 1358 }, { "epoch": 0.1497520661157025, "grad_norm": 14.304620742797852, "learning_rate": 9.516483121318057e-06, "loss": 0.5049, "step": 1359 }, { "epoch": 0.14986225895316804, "grad_norm": 5.096722602844238, "learning_rate": 9.515732736514526e-06, "loss": 0.4325, "step": 1360 }, { "epoch": 0.14997245179063362, "grad_norm": 5.899183750152588, "learning_rate": 9.514981799521066e-06, "loss": 0.3767, "step": 1361 }, { "epoch": 0.15008264462809917, "grad_norm": 4.823487758636475, "learning_rate": 9.514230310429498e-06, "loss": 0.3536, "step": 1362 }, { "epoch": 0.15019283746556475, "grad_norm": 11.252744674682617, "learning_rate": 9.513478269331713e-06, "loss": 0.4375, "step": 1363 }, { "epoch": 0.1503030303030303, "grad_norm": 13.921759605407715, "learning_rate": 9.512725676319677e-06, "loss": 0.496, "step": 1364 }, { "epoch": 0.15041322314049588, "grad_norm": 10.286101341247559, "learning_rate": 9.511972531485414e-06, "loss": 0.4727, "step": 1365 }, { "epoch": 0.15052341597796143, "grad_norm": 4.919619083404541, "learning_rate": 9.51121883492102e-06, "loss": 0.4386, "step": 1366 }, { "epoch": 0.150633608815427, "grad_norm": 8.442239761352539, "learning_rate": 9.51046458671866e-06, "loss": 0.4446, "step": 1367 }, { "epoch": 0.15074380165289256, "grad_norm": 9.221701622009277, "learning_rate": 9.509709786970564e-06, "loss": 0.4601, "step": 1368 }, { "epoch": 0.15085399449035813, "grad_norm": 11.813467979431152, "learning_rate": 9.50895443576903e-06, "loss": 0.5873, "step": 1369 }, { "epoch": 0.15096418732782368, "grad_norm": 5.503012657165527, "learning_rate": 9.50819853320642e-06, "loss": 0.4285, "step": 1370 }, { "epoch": 0.15107438016528926, "grad_norm": 8.24340534210205, "learning_rate": 9.507442079375171e-06, "loss": 0.4181, "step": 1371 }, { "epoch": 0.1511845730027548, "grad_norm": 6.293192386627197, "learning_rate": 9.506685074367782e-06, "loss": 0.4747, "step": 1372 }, { "epoch": 0.1512947658402204, "grad_norm": 9.944341659545898, "learning_rate": 9.505927518276821e-06, "loss": 0.4881, "step": 1373 }, { "epoch": 0.15140495867768594, "grad_norm": 7.372109889984131, "learning_rate": 9.505169411194921e-06, "loss": 0.4513, "step": 1374 }, { "epoch": 0.15151515151515152, "grad_norm": 6.490156650543213, "learning_rate": 9.504410753214786e-06, "loss": 0.356, "step": 1375 }, { "epoch": 0.15162534435261707, "grad_norm": 3.989199638366699, "learning_rate": 9.503651544429186e-06, "loss": 0.4293, "step": 1376 }, { "epoch": 0.15173553719008265, "grad_norm": 5.597970962524414, "learning_rate": 9.502891784930957e-06, "loss": 0.4755, "step": 1377 }, { "epoch": 0.1518457300275482, "grad_norm": 7.86866569519043, "learning_rate": 9.502131474813006e-06, "loss": 0.4134, "step": 1378 }, { "epoch": 0.15195592286501378, "grad_norm": 7.2889556884765625, "learning_rate": 9.5013706141683e-06, "loss": 0.4354, "step": 1379 }, { "epoch": 0.15206611570247933, "grad_norm": 15.950618743896484, "learning_rate": 9.500609203089883e-06, "loss": 0.3591, "step": 1380 }, { "epoch": 0.1521763085399449, "grad_norm": 6.034360885620117, "learning_rate": 9.499847241670857e-06, "loss": 0.3366, "step": 1381 }, { "epoch": 0.15228650137741045, "grad_norm": 13.032929420471191, "learning_rate": 9.499084730004398e-06, "loss": 0.4793, "step": 1382 }, { "epoch": 0.15239669421487603, "grad_norm": 9.218709945678711, "learning_rate": 9.498321668183749e-06, "loss": 0.4686, "step": 1383 }, { "epoch": 0.1525068870523416, "grad_norm": 12.696232795715332, "learning_rate": 9.497558056302214e-06, "loss": 0.5946, "step": 1384 }, { "epoch": 0.15261707988980716, "grad_norm": 8.161370277404785, "learning_rate": 9.496793894453171e-06, "loss": 0.4742, "step": 1385 }, { "epoch": 0.15272727272727274, "grad_norm": 11.497420310974121, "learning_rate": 9.496029182730063e-06, "loss": 0.6843, "step": 1386 }, { "epoch": 0.1528374655647383, "grad_norm": 12.493870735168457, "learning_rate": 9.495263921226399e-06, "loss": 0.5355, "step": 1387 }, { "epoch": 0.15294765840220387, "grad_norm": 4.5893473625183105, "learning_rate": 9.494498110035756e-06, "loss": 0.4677, "step": 1388 }, { "epoch": 0.15305785123966942, "grad_norm": 7.156816005706787, "learning_rate": 9.49373174925178e-06, "loss": 0.4558, "step": 1389 }, { "epoch": 0.153168044077135, "grad_norm": 9.895354270935059, "learning_rate": 9.49296483896818e-06, "loss": 0.5318, "step": 1390 }, { "epoch": 0.15327823691460055, "grad_norm": 7.410556793212891, "learning_rate": 9.492197379278738e-06, "loss": 0.4879, "step": 1391 }, { "epoch": 0.15338842975206612, "grad_norm": 4.698634624481201, "learning_rate": 9.491429370277298e-06, "loss": 0.3865, "step": 1392 }, { "epoch": 0.15349862258953167, "grad_norm": 9.832266807556152, "learning_rate": 9.490660812057772e-06, "loss": 0.4181, "step": 1393 }, { "epoch": 0.15360881542699725, "grad_norm": 6.800455093383789, "learning_rate": 9.489891704714145e-06, "loss": 0.517, "step": 1394 }, { "epoch": 0.1537190082644628, "grad_norm": 6.298616886138916, "learning_rate": 9.489122048340457e-06, "loss": 0.3812, "step": 1395 }, { "epoch": 0.15382920110192838, "grad_norm": 7.242441654205322, "learning_rate": 9.488351843030832e-06, "loss": 0.5227, "step": 1396 }, { "epoch": 0.15393939393939393, "grad_norm": 6.825589656829834, "learning_rate": 9.487581088879443e-06, "loss": 0.4433, "step": 1397 }, { "epoch": 0.1540495867768595, "grad_norm": 6.0639519691467285, "learning_rate": 9.486809785980544e-06, "loss": 0.4206, "step": 1398 }, { "epoch": 0.15415977961432506, "grad_norm": 6.887888431549072, "learning_rate": 9.486037934428451e-06, "loss": 0.5139, "step": 1399 }, { "epoch": 0.15426997245179064, "grad_norm": 9.926239967346191, "learning_rate": 9.485265534317544e-06, "loss": 0.3926, "step": 1400 }, { "epoch": 0.1543801652892562, "grad_norm": 6.610692977905273, "learning_rate": 9.484492585742275e-06, "loss": 0.3983, "step": 1401 }, { "epoch": 0.15449035812672177, "grad_norm": 6.125588893890381, "learning_rate": 9.483719088797161e-06, "loss": 0.4405, "step": 1402 }, { "epoch": 0.15460055096418732, "grad_norm": 11.86706256866455, "learning_rate": 9.482945043576787e-06, "loss": 0.4647, "step": 1403 }, { "epoch": 0.1547107438016529, "grad_norm": 10.825469017028809, "learning_rate": 9.482170450175804e-06, "loss": 0.4889, "step": 1404 }, { "epoch": 0.15482093663911844, "grad_norm": 8.743223190307617, "learning_rate": 9.481395308688928e-06, "loss": 0.524, "step": 1405 }, { "epoch": 0.15493112947658402, "grad_norm": 10.586660385131836, "learning_rate": 9.48061961921095e-06, "loss": 0.4492, "step": 1406 }, { "epoch": 0.15504132231404957, "grad_norm": 11.887472152709961, "learning_rate": 9.479843381836714e-06, "loss": 0.5481, "step": 1407 }, { "epoch": 0.15515151515151515, "grad_norm": 10.426525115966797, "learning_rate": 9.479066596661146e-06, "loss": 0.4112, "step": 1408 }, { "epoch": 0.15526170798898073, "grad_norm": 7.2527689933776855, "learning_rate": 9.47828926377923e-06, "loss": 0.4969, "step": 1409 }, { "epoch": 0.15537190082644628, "grad_norm": 5.922959804534912, "learning_rate": 9.47751138328602e-06, "loss": 0.4402, "step": 1410 }, { "epoch": 0.15548209366391186, "grad_norm": 12.400938987731934, "learning_rate": 9.476732955276637e-06, "loss": 0.4873, "step": 1411 }, { "epoch": 0.1555922865013774, "grad_norm": 9.209698677062988, "learning_rate": 9.475953979846267e-06, "loss": 0.4264, "step": 1412 }, { "epoch": 0.155702479338843, "grad_norm": 5.971480846405029, "learning_rate": 9.475174457090162e-06, "loss": 0.4584, "step": 1413 }, { "epoch": 0.15581267217630854, "grad_norm": 6.5802788734436035, "learning_rate": 9.474394387103648e-06, "loss": 0.3941, "step": 1414 }, { "epoch": 0.15592286501377411, "grad_norm": 7.03208589553833, "learning_rate": 9.473613769982108e-06, "loss": 0.3746, "step": 1415 }, { "epoch": 0.15603305785123966, "grad_norm": 9.911009788513184, "learning_rate": 9.472832605821003e-06, "loss": 0.4821, "step": 1416 }, { "epoch": 0.15614325068870524, "grad_norm": 4.675900936126709, "learning_rate": 9.472050894715849e-06, "loss": 0.3468, "step": 1417 }, { "epoch": 0.1562534435261708, "grad_norm": 8.639227867126465, "learning_rate": 9.471268636762237e-06, "loss": 0.5741, "step": 1418 }, { "epoch": 0.15636363636363637, "grad_norm": 5.883412837982178, "learning_rate": 9.470485832055822e-06, "loss": 0.4819, "step": 1419 }, { "epoch": 0.15647382920110192, "grad_norm": 7.617645740509033, "learning_rate": 9.469702480692326e-06, "loss": 0.4767, "step": 1420 }, { "epoch": 0.1565840220385675, "grad_norm": 6.042135715484619, "learning_rate": 9.46891858276754e-06, "loss": 0.481, "step": 1421 }, { "epoch": 0.15669421487603305, "grad_norm": 8.383898735046387, "learning_rate": 9.468134138377321e-06, "loss": 0.4136, "step": 1422 }, { "epoch": 0.15680440771349863, "grad_norm": 6.613709926605225, "learning_rate": 9.467349147617589e-06, "loss": 0.4291, "step": 1423 }, { "epoch": 0.15691460055096418, "grad_norm": 6.821608543395996, "learning_rate": 9.466563610584336e-06, "loss": 0.447, "step": 1424 }, { "epoch": 0.15702479338842976, "grad_norm": 8.442924499511719, "learning_rate": 9.465777527373616e-06, "loss": 0.423, "step": 1425 }, { "epoch": 0.1571349862258953, "grad_norm": 6.351988315582275, "learning_rate": 9.464990898081554e-06, "loss": 0.4263, "step": 1426 }, { "epoch": 0.15724517906336088, "grad_norm": 12.619184494018555, "learning_rate": 9.46420372280434e-06, "loss": 0.5921, "step": 1427 }, { "epoch": 0.15735537190082644, "grad_norm": 14.88255500793457, "learning_rate": 9.46341600163823e-06, "loss": 0.5316, "step": 1428 }, { "epoch": 0.157465564738292, "grad_norm": 9.838242530822754, "learning_rate": 9.46262773467955e-06, "loss": 0.507, "step": 1429 }, { "epoch": 0.15757575757575756, "grad_norm": 11.328462600708008, "learning_rate": 9.46183892202469e-06, "loss": 0.4047, "step": 1430 }, { "epoch": 0.15768595041322314, "grad_norm": 7.212308883666992, "learning_rate": 9.461049563770102e-06, "loss": 0.4398, "step": 1431 }, { "epoch": 0.1577961432506887, "grad_norm": 6.799179553985596, "learning_rate": 9.460259660012316e-06, "loss": 0.4984, "step": 1432 }, { "epoch": 0.15790633608815427, "grad_norm": 11.3588228225708, "learning_rate": 9.459469210847919e-06, "loss": 0.5055, "step": 1433 }, { "epoch": 0.15801652892561985, "grad_norm": 5.606801509857178, "learning_rate": 9.45867821637357e-06, "loss": 0.4606, "step": 1434 }, { "epoch": 0.1581267217630854, "grad_norm": 6.175350666046143, "learning_rate": 9.457886676685992e-06, "loss": 0.4075, "step": 1435 }, { "epoch": 0.15823691460055098, "grad_norm": 7.256425857543945, "learning_rate": 9.457094591881975e-06, "loss": 0.4129, "step": 1436 }, { "epoch": 0.15834710743801653, "grad_norm": 8.640177726745605, "learning_rate": 9.456301962058377e-06, "loss": 0.3757, "step": 1437 }, { "epoch": 0.1584573002754821, "grad_norm": 8.584975242614746, "learning_rate": 9.455508787312123e-06, "loss": 0.5124, "step": 1438 }, { "epoch": 0.15856749311294766, "grad_norm": 8.673279762268066, "learning_rate": 9.454715067740202e-06, "loss": 0.4775, "step": 1439 }, { "epoch": 0.15867768595041323, "grad_norm": 3.941206693649292, "learning_rate": 9.45392080343967e-06, "loss": 0.4469, "step": 1440 }, { "epoch": 0.15878787878787878, "grad_norm": 5.717761516571045, "learning_rate": 9.453125994507654e-06, "loss": 0.4708, "step": 1441 }, { "epoch": 0.15889807162534436, "grad_norm": 11.770964622497559, "learning_rate": 9.452330641041341e-06, "loss": 0.4513, "step": 1442 }, { "epoch": 0.1590082644628099, "grad_norm": 9.43468952178955, "learning_rate": 9.45153474313799e-06, "loss": 0.5151, "step": 1443 }, { "epoch": 0.1591184573002755, "grad_norm": 8.744233131408691, "learning_rate": 9.450738300894924e-06, "loss": 0.4464, "step": 1444 }, { "epoch": 0.15922865013774104, "grad_norm": 8.976760864257812, "learning_rate": 9.449941314409532e-06, "loss": 0.3869, "step": 1445 }, { "epoch": 0.15933884297520662, "grad_norm": 22.744741439819336, "learning_rate": 9.449143783779273e-06, "loss": 0.4933, "step": 1446 }, { "epoch": 0.15944903581267217, "grad_norm": 8.330665588378906, "learning_rate": 9.448345709101667e-06, "loss": 0.4662, "step": 1447 }, { "epoch": 0.15955922865013775, "grad_norm": 6.56828498840332, "learning_rate": 9.447547090474306e-06, "loss": 0.4644, "step": 1448 }, { "epoch": 0.1596694214876033, "grad_norm": 7.922265529632568, "learning_rate": 9.446747927994844e-06, "loss": 0.5257, "step": 1449 }, { "epoch": 0.15977961432506887, "grad_norm": 10.60759449005127, "learning_rate": 9.445948221761007e-06, "loss": 0.4497, "step": 1450 }, { "epoch": 0.15988980716253443, "grad_norm": 18.729232788085938, "learning_rate": 9.445147971870581e-06, "loss": 0.5753, "step": 1451 }, { "epoch": 0.16, "grad_norm": 13.113868713378906, "learning_rate": 9.444347178421423e-06, "loss": 0.4972, "step": 1452 }, { "epoch": 0.16011019283746555, "grad_norm": 5.94268798828125, "learning_rate": 9.443545841511456e-06, "loss": 0.4448, "step": 1453 }, { "epoch": 0.16022038567493113, "grad_norm": 6.834156036376953, "learning_rate": 9.442743961238665e-06, "loss": 0.4677, "step": 1454 }, { "epoch": 0.16033057851239668, "grad_norm": 5.015316486358643, "learning_rate": 9.44194153770111e-06, "loss": 0.3224, "step": 1455 }, { "epoch": 0.16044077134986226, "grad_norm": 17.945768356323242, "learning_rate": 9.44113857099691e-06, "loss": 0.4532, "step": 1456 }, { "epoch": 0.1605509641873278, "grad_norm": 12.237314224243164, "learning_rate": 9.44033506122425e-06, "loss": 0.4405, "step": 1457 }, { "epoch": 0.1606611570247934, "grad_norm": 14.865547180175781, "learning_rate": 9.439531008481392e-06, "loss": 0.534, "step": 1458 }, { "epoch": 0.16077134986225897, "grad_norm": 8.56263256072998, "learning_rate": 9.438726412866648e-06, "loss": 0.436, "step": 1459 }, { "epoch": 0.16088154269972452, "grad_norm": 6.990463733673096, "learning_rate": 9.43792127447841e-06, "loss": 0.4856, "step": 1460 }, { "epoch": 0.1609917355371901, "grad_norm": 9.773675918579102, "learning_rate": 9.437115593415129e-06, "loss": 0.5639, "step": 1461 }, { "epoch": 0.16110192837465565, "grad_norm": 7.388878345489502, "learning_rate": 9.436309369775328e-06, "loss": 0.4377, "step": 1462 }, { "epoch": 0.16121212121212122, "grad_norm": 6.132890701293945, "learning_rate": 9.43550260365759e-06, "loss": 0.4315, "step": 1463 }, { "epoch": 0.16132231404958677, "grad_norm": 9.686576843261719, "learning_rate": 9.434695295160568e-06, "loss": 0.4449, "step": 1464 }, { "epoch": 0.16143250688705235, "grad_norm": 8.422321319580078, "learning_rate": 9.433887444382982e-06, "loss": 0.498, "step": 1465 }, { "epoch": 0.1615426997245179, "grad_norm": 11.859896659851074, "learning_rate": 9.433079051423616e-06, "loss": 0.5934, "step": 1466 }, { "epoch": 0.16165289256198348, "grad_norm": 5.726517677307129, "learning_rate": 9.432270116381323e-06, "loss": 0.4092, "step": 1467 }, { "epoch": 0.16176308539944903, "grad_norm": 10.08298397064209, "learning_rate": 9.431460639355019e-06, "loss": 0.421, "step": 1468 }, { "epoch": 0.1618732782369146, "grad_norm": 8.94924259185791, "learning_rate": 9.430650620443688e-06, "loss": 0.5167, "step": 1469 }, { "epoch": 0.16198347107438016, "grad_norm": 8.591166496276855, "learning_rate": 9.42984005974638e-06, "loss": 0.5073, "step": 1470 }, { "epoch": 0.16209366391184574, "grad_norm": 7.914938926696777, "learning_rate": 9.429028957362215e-06, "loss": 0.4906, "step": 1471 }, { "epoch": 0.1622038567493113, "grad_norm": 5.708611011505127, "learning_rate": 9.428217313390371e-06, "loss": 0.5026, "step": 1472 }, { "epoch": 0.16231404958677687, "grad_norm": 7.096518039703369, "learning_rate": 9.427405127930097e-06, "loss": 0.4185, "step": 1473 }, { "epoch": 0.16242424242424242, "grad_norm": 7.8787336349487305, "learning_rate": 9.426592401080712e-06, "loss": 0.4419, "step": 1474 }, { "epoch": 0.162534435261708, "grad_norm": 8.509092330932617, "learning_rate": 9.425779132941595e-06, "loss": 0.534, "step": 1475 }, { "epoch": 0.16264462809917354, "grad_norm": 4.901118755340576, "learning_rate": 9.424965323612195e-06, "loss": 0.3961, "step": 1476 }, { "epoch": 0.16275482093663912, "grad_norm": 5.733903884887695, "learning_rate": 9.424150973192023e-06, "loss": 0.4299, "step": 1477 }, { "epoch": 0.16286501377410467, "grad_norm": 14.781890869140625, "learning_rate": 9.42333608178066e-06, "loss": 0.5192, "step": 1478 }, { "epoch": 0.16297520661157025, "grad_norm": 6.482476711273193, "learning_rate": 9.422520649477754e-06, "loss": 0.4981, "step": 1479 }, { "epoch": 0.1630853994490358, "grad_norm": 10.953235626220703, "learning_rate": 9.421704676383014e-06, "loss": 0.51, "step": 1480 }, { "epoch": 0.16319559228650138, "grad_norm": 9.130579948425293, "learning_rate": 9.420888162596221e-06, "loss": 0.4976, "step": 1481 }, { "epoch": 0.16330578512396693, "grad_norm": 6.881532192230225, "learning_rate": 9.420071108217216e-06, "loss": 0.4551, "step": 1482 }, { "epoch": 0.1634159779614325, "grad_norm": 5.993616104125977, "learning_rate": 9.419253513345916e-06, "loss": 0.4523, "step": 1483 }, { "epoch": 0.16352617079889809, "grad_norm": 8.481162071228027, "learning_rate": 9.41843537808229e-06, "loss": 0.5667, "step": 1484 }, { "epoch": 0.16363636363636364, "grad_norm": 7.955751419067383, "learning_rate": 9.417616702526387e-06, "loss": 0.4281, "step": 1485 }, { "epoch": 0.1637465564738292, "grad_norm": 5.160492420196533, "learning_rate": 9.41679748677831e-06, "loss": 0.4632, "step": 1486 }, { "epoch": 0.16385674931129476, "grad_norm": 6.2219767570495605, "learning_rate": 9.415977730938237e-06, "loss": 0.4349, "step": 1487 }, { "epoch": 0.16396694214876034, "grad_norm": 11.044354438781738, "learning_rate": 9.41515743510641e-06, "loss": 0.5752, "step": 1488 }, { "epoch": 0.1640771349862259, "grad_norm": 5.493617057800293, "learning_rate": 9.414336599383133e-06, "loss": 0.4504, "step": 1489 }, { "epoch": 0.16418732782369147, "grad_norm": 5.468198776245117, "learning_rate": 9.413515223868782e-06, "loss": 0.4254, "step": 1490 }, { "epoch": 0.16429752066115702, "grad_norm": 5.371224403381348, "learning_rate": 9.412693308663793e-06, "loss": 0.3265, "step": 1491 }, { "epoch": 0.1644077134986226, "grad_norm": 5.7264404296875, "learning_rate": 9.411870853868673e-06, "loss": 0.4068, "step": 1492 }, { "epoch": 0.16451790633608815, "grad_norm": 11.388520240783691, "learning_rate": 9.41104785958399e-06, "loss": 0.5433, "step": 1493 }, { "epoch": 0.16462809917355373, "grad_norm": 12.780411720275879, "learning_rate": 9.410224325910384e-06, "loss": 0.5371, "step": 1494 }, { "epoch": 0.16473829201101928, "grad_norm": 8.704211235046387, "learning_rate": 9.409400252948558e-06, "loss": 0.431, "step": 1495 }, { "epoch": 0.16484848484848486, "grad_norm": 7.5440826416015625, "learning_rate": 9.40857564079928e-06, "loss": 0.3368, "step": 1496 }, { "epoch": 0.1649586776859504, "grad_norm": 6.04235315322876, "learning_rate": 9.407750489563381e-06, "loss": 0.414, "step": 1497 }, { "epoch": 0.16506887052341598, "grad_norm": 7.066317081451416, "learning_rate": 9.406924799341767e-06, "loss": 0.4522, "step": 1498 }, { "epoch": 0.16517906336088153, "grad_norm": 10.957195281982422, "learning_rate": 9.406098570235402e-06, "loss": 0.402, "step": 1499 }, { "epoch": 0.1652892561983471, "grad_norm": 8.854328155517578, "learning_rate": 9.405271802345319e-06, "loss": 0.4694, "step": 1500 }, { "epoch": 0.16539944903581266, "grad_norm": 11.334278106689453, "learning_rate": 9.404444495772615e-06, "loss": 0.4365, "step": 1501 }, { "epoch": 0.16550964187327824, "grad_norm": 12.177138328552246, "learning_rate": 9.403616650618456e-06, "loss": 0.6228, "step": 1502 }, { "epoch": 0.1656198347107438, "grad_norm": 6.889142990112305, "learning_rate": 9.402788266984071e-06, "loss": 0.412, "step": 1503 }, { "epoch": 0.16573002754820937, "grad_norm": 8.409960746765137, "learning_rate": 9.401959344970756e-06, "loss": 0.393, "step": 1504 }, { "epoch": 0.16584022038567492, "grad_norm": 10.087620735168457, "learning_rate": 9.401129884679874e-06, "loss": 0.4533, "step": 1505 }, { "epoch": 0.1659504132231405, "grad_norm": 8.47231674194336, "learning_rate": 9.40029988621285e-06, "loss": 0.4001, "step": 1506 }, { "epoch": 0.16606060606060605, "grad_norm": 4.585776329040527, "learning_rate": 9.39946934967118e-06, "loss": 0.456, "step": 1507 }, { "epoch": 0.16617079889807163, "grad_norm": 5.115619659423828, "learning_rate": 9.39863827515642e-06, "loss": 0.4247, "step": 1508 }, { "epoch": 0.1662809917355372, "grad_norm": 7.151246547698975, "learning_rate": 9.397806662770198e-06, "loss": 0.2597, "step": 1509 }, { "epoch": 0.16639118457300275, "grad_norm": 6.416628360748291, "learning_rate": 9.396974512614203e-06, "loss": 0.3686, "step": 1510 }, { "epoch": 0.16650137741046833, "grad_norm": 8.886362075805664, "learning_rate": 9.396141824790193e-06, "loss": 0.499, "step": 1511 }, { "epoch": 0.16661157024793388, "grad_norm": 10.70858383178711, "learning_rate": 9.395308599399987e-06, "loss": 0.47, "step": 1512 }, { "epoch": 0.16672176308539946, "grad_norm": 9.563263893127441, "learning_rate": 9.394474836545477e-06, "loss": 0.4403, "step": 1513 }, { "epoch": 0.166831955922865, "grad_norm": 5.888492584228516, "learning_rate": 9.393640536328613e-06, "loss": 0.3563, "step": 1514 }, { "epoch": 0.1669421487603306, "grad_norm": 7.1910810470581055, "learning_rate": 9.392805698851417e-06, "loss": 0.4313, "step": 1515 }, { "epoch": 0.16705234159779614, "grad_norm": 6.051369667053223, "learning_rate": 9.391970324215973e-06, "loss": 0.4821, "step": 1516 }, { "epoch": 0.16716253443526172, "grad_norm": 6.591337203979492, "learning_rate": 9.391134412524432e-06, "loss": 0.394, "step": 1517 }, { "epoch": 0.16727272727272727, "grad_norm": 7.165696620941162, "learning_rate": 9.390297963879008e-06, "loss": 0.431, "step": 1518 }, { "epoch": 0.16738292011019285, "grad_norm": 8.706212997436523, "learning_rate": 9.38946097838199e-06, "loss": 0.4573, "step": 1519 }, { "epoch": 0.1674931129476584, "grad_norm": 6.006828784942627, "learning_rate": 9.388623456135717e-06, "loss": 0.3732, "step": 1520 }, { "epoch": 0.16760330578512397, "grad_norm": 7.291321277618408, "learning_rate": 9.387785397242608e-06, "loss": 0.4585, "step": 1521 }, { "epoch": 0.16771349862258952, "grad_norm": 8.707494735717773, "learning_rate": 9.386946801805141e-06, "loss": 0.4481, "step": 1522 }, { "epoch": 0.1678236914600551, "grad_norm": 7.811737537384033, "learning_rate": 9.386107669925858e-06, "loss": 0.4742, "step": 1523 }, { "epoch": 0.16793388429752065, "grad_norm": 6.985987663269043, "learning_rate": 9.385268001707373e-06, "loss": 0.3789, "step": 1524 }, { "epoch": 0.16804407713498623, "grad_norm": 7.202207088470459, "learning_rate": 9.38442779725236e-06, "loss": 0.4366, "step": 1525 }, { "epoch": 0.16815426997245178, "grad_norm": 7.785396575927734, "learning_rate": 9.38358705666356e-06, "loss": 0.5049, "step": 1526 }, { "epoch": 0.16826446280991736, "grad_norm": 5.80039119720459, "learning_rate": 9.38274578004378e-06, "loss": 0.4372, "step": 1527 }, { "epoch": 0.1683746556473829, "grad_norm": 8.934199333190918, "learning_rate": 9.381903967495893e-06, "loss": 0.48, "step": 1528 }, { "epoch": 0.1684848484848485, "grad_norm": 9.693472862243652, "learning_rate": 9.381061619122835e-06, "loss": 0.417, "step": 1529 }, { "epoch": 0.16859504132231404, "grad_norm": 6.487861156463623, "learning_rate": 9.380218735027614e-06, "loss": 0.4508, "step": 1530 }, { "epoch": 0.16870523415977962, "grad_norm": 11.514814376831055, "learning_rate": 9.379375315313292e-06, "loss": 0.5513, "step": 1531 }, { "epoch": 0.16881542699724517, "grad_norm": 6.065979480743408, "learning_rate": 9.378531360083011e-06, "loss": 0.4021, "step": 1532 }, { "epoch": 0.16892561983471074, "grad_norm": 12.198055267333984, "learning_rate": 9.377686869439967e-06, "loss": 0.5575, "step": 1533 }, { "epoch": 0.16903581267217632, "grad_norm": 6.628361701965332, "learning_rate": 9.376841843487427e-06, "loss": 0.4029, "step": 1534 }, { "epoch": 0.16914600550964187, "grad_norm": 7.487825393676758, "learning_rate": 9.37599628232872e-06, "loss": 0.4811, "step": 1535 }, { "epoch": 0.16925619834710745, "grad_norm": 5.830073833465576, "learning_rate": 9.375150186067243e-06, "loss": 0.3705, "step": 1536 }, { "epoch": 0.169366391184573, "grad_norm": 8.890256881713867, "learning_rate": 9.374303554806458e-06, "loss": 0.4355, "step": 1537 }, { "epoch": 0.16947658402203858, "grad_norm": 12.017580032348633, "learning_rate": 9.373456388649893e-06, "loss": 0.4822, "step": 1538 }, { "epoch": 0.16958677685950413, "grad_norm": 5.915216445922852, "learning_rate": 9.37260868770114e-06, "loss": 0.4844, "step": 1539 }, { "epoch": 0.1696969696969697, "grad_norm": 12.784900665283203, "learning_rate": 9.371760452063857e-06, "loss": 0.5914, "step": 1540 }, { "epoch": 0.16980716253443526, "grad_norm": 7.340192794799805, "learning_rate": 9.370911681841768e-06, "loss": 0.4692, "step": 1541 }, { "epoch": 0.16991735537190084, "grad_norm": 5.451550006866455, "learning_rate": 9.37006237713866e-06, "loss": 0.4002, "step": 1542 }, { "epoch": 0.17002754820936639, "grad_norm": 6.147626876831055, "learning_rate": 9.369212538058389e-06, "loss": 0.4145, "step": 1543 }, { "epoch": 0.17013774104683196, "grad_norm": 8.239680290222168, "learning_rate": 9.368362164704873e-06, "loss": 0.4248, "step": 1544 }, { "epoch": 0.17024793388429751, "grad_norm": 4.821244239807129, "learning_rate": 9.3675112571821e-06, "loss": 0.4278, "step": 1545 }, { "epoch": 0.1703581267217631, "grad_norm": 8.407042503356934, "learning_rate": 9.366659815594116e-06, "loss": 0.5224, "step": 1546 }, { "epoch": 0.17046831955922864, "grad_norm": 6.209753513336182, "learning_rate": 9.365807840045037e-06, "loss": 0.4261, "step": 1547 }, { "epoch": 0.17057851239669422, "grad_norm": 7.687352657318115, "learning_rate": 9.364955330639048e-06, "loss": 0.4171, "step": 1548 }, { "epoch": 0.17068870523415977, "grad_norm": 9.114327430725098, "learning_rate": 9.36410228748039e-06, "loss": 0.5157, "step": 1549 }, { "epoch": 0.17079889807162535, "grad_norm": 7.95366907119751, "learning_rate": 9.363248710673375e-06, "loss": 0.553, "step": 1550 }, { "epoch": 0.1709090909090909, "grad_norm": 10.192009925842285, "learning_rate": 9.362394600322384e-06, "loss": 0.424, "step": 1551 }, { "epoch": 0.17101928374655648, "grad_norm": 12.724431991577148, "learning_rate": 9.361539956531853e-06, "loss": 0.5071, "step": 1552 }, { "epoch": 0.17112947658402203, "grad_norm": 7.3048295974731445, "learning_rate": 9.360684779406294e-06, "loss": 0.4098, "step": 1553 }, { "epoch": 0.1712396694214876, "grad_norm": 5.551223278045654, "learning_rate": 9.359829069050274e-06, "loss": 0.4497, "step": 1554 }, { "epoch": 0.17134986225895316, "grad_norm": 12.205377578735352, "learning_rate": 9.358972825568436e-06, "loss": 0.4724, "step": 1555 }, { "epoch": 0.17146005509641873, "grad_norm": 9.10533332824707, "learning_rate": 9.358116049065478e-06, "loss": 0.4131, "step": 1556 }, { "epoch": 0.17157024793388428, "grad_norm": 5.6637492179870605, "learning_rate": 9.35725873964617e-06, "loss": 0.498, "step": 1557 }, { "epoch": 0.17168044077134986, "grad_norm": 6.103901386260986, "learning_rate": 9.356400897415345e-06, "loss": 0.3669, "step": 1558 }, { "epoch": 0.17179063360881544, "grad_norm": 6.015481472015381, "learning_rate": 9.3555425224779e-06, "loss": 0.3158, "step": 1559 }, { "epoch": 0.171900826446281, "grad_norm": 5.316213607788086, "learning_rate": 9.354683614938798e-06, "loss": 0.4558, "step": 1560 }, { "epoch": 0.17201101928374657, "grad_norm": 10.183836936950684, "learning_rate": 9.35382417490307e-06, "loss": 0.4287, "step": 1561 }, { "epoch": 0.17212121212121212, "grad_norm": 6.446548938751221, "learning_rate": 9.352964202475808e-06, "loss": 0.4508, "step": 1562 }, { "epoch": 0.1722314049586777, "grad_norm": 10.281747817993164, "learning_rate": 9.352103697762169e-06, "loss": 0.5018, "step": 1563 }, { "epoch": 0.17234159779614325, "grad_norm": 7.714334011077881, "learning_rate": 9.351242660867378e-06, "loss": 0.4423, "step": 1564 }, { "epoch": 0.17245179063360883, "grad_norm": 6.705694675445557, "learning_rate": 9.350381091896725e-06, "loss": 0.495, "step": 1565 }, { "epoch": 0.17256198347107438, "grad_norm": 7.069931507110596, "learning_rate": 9.349518990955561e-06, "loss": 0.4489, "step": 1566 }, { "epoch": 0.17267217630853995, "grad_norm": 4.913645267486572, "learning_rate": 9.348656358149308e-06, "loss": 0.4078, "step": 1567 }, { "epoch": 0.1727823691460055, "grad_norm": 7.224776744842529, "learning_rate": 9.34779319358345e-06, "loss": 0.4023, "step": 1568 }, { "epoch": 0.17289256198347108, "grad_norm": 7.386590003967285, "learning_rate": 9.346929497363533e-06, "loss": 0.428, "step": 1569 }, { "epoch": 0.17300275482093663, "grad_norm": 8.202399253845215, "learning_rate": 9.34606526959517e-06, "loss": 0.4701, "step": 1570 }, { "epoch": 0.1731129476584022, "grad_norm": 8.172073364257812, "learning_rate": 9.345200510384044e-06, "loss": 0.4355, "step": 1571 }, { "epoch": 0.17322314049586776, "grad_norm": 5.922333717346191, "learning_rate": 9.344335219835899e-06, "loss": 0.4772, "step": 1572 }, { "epoch": 0.17333333333333334, "grad_norm": 7.50675630569458, "learning_rate": 9.34346939805654e-06, "loss": 0.4776, "step": 1573 }, { "epoch": 0.1734435261707989, "grad_norm": 4.919284343719482, "learning_rate": 9.342603045151844e-06, "loss": 0.412, "step": 1574 }, { "epoch": 0.17355371900826447, "grad_norm": 6.294493198394775, "learning_rate": 9.341736161227749e-06, "loss": 0.4684, "step": 1575 }, { "epoch": 0.17366391184573002, "grad_norm": 8.999917984008789, "learning_rate": 9.340868746390257e-06, "loss": 0.4699, "step": 1576 }, { "epoch": 0.1737741046831956, "grad_norm": 7.87579870223999, "learning_rate": 9.34000080074544e-06, "loss": 0.4248, "step": 1577 }, { "epoch": 0.17388429752066115, "grad_norm": 15.802863121032715, "learning_rate": 9.339132324399427e-06, "loss": 0.5674, "step": 1578 }, { "epoch": 0.17399449035812672, "grad_norm": 4.844115257263184, "learning_rate": 9.338263317458422e-06, "loss": 0.4537, "step": 1579 }, { "epoch": 0.17410468319559227, "grad_norm": 11.231072425842285, "learning_rate": 9.337393780028684e-06, "loss": 0.4172, "step": 1580 }, { "epoch": 0.17421487603305785, "grad_norm": 9.87650203704834, "learning_rate": 9.336523712216545e-06, "loss": 0.54, "step": 1581 }, { "epoch": 0.1743250688705234, "grad_norm": 6.8644022941589355, "learning_rate": 9.335653114128393e-06, "loss": 0.4106, "step": 1582 }, { "epoch": 0.17443526170798898, "grad_norm": 11.86081600189209, "learning_rate": 9.33478198587069e-06, "loss": 0.6136, "step": 1583 }, { "epoch": 0.17454545454545456, "grad_norm": 7.47705602645874, "learning_rate": 9.333910327549958e-06, "loss": 0.4817, "step": 1584 }, { "epoch": 0.1746556473829201, "grad_norm": 8.04872989654541, "learning_rate": 9.333038139272783e-06, "loss": 0.4366, "step": 1585 }, { "epoch": 0.1747658402203857, "grad_norm": 9.860109329223633, "learning_rate": 9.332165421145821e-06, "loss": 0.3969, "step": 1586 }, { "epoch": 0.17487603305785124, "grad_norm": 9.992838859558105, "learning_rate": 9.331292173275783e-06, "loss": 0.4956, "step": 1587 }, { "epoch": 0.17498622589531682, "grad_norm": 5.045669078826904, "learning_rate": 9.330418395769457e-06, "loss": 0.4504, "step": 1588 }, { "epoch": 0.17509641873278237, "grad_norm": 7.532293319702148, "learning_rate": 9.329544088733686e-06, "loss": 0.3748, "step": 1589 }, { "epoch": 0.17520661157024794, "grad_norm": 6.9422926902771, "learning_rate": 9.328669252275385e-06, "loss": 0.4797, "step": 1590 }, { "epoch": 0.1753168044077135, "grad_norm": 13.932552337646484, "learning_rate": 9.327793886501526e-06, "loss": 0.4177, "step": 1591 }, { "epoch": 0.17542699724517907, "grad_norm": 4.638037204742432, "learning_rate": 9.326917991519153e-06, "loss": 0.4402, "step": 1592 }, { "epoch": 0.17553719008264462, "grad_norm": 6.164183616638184, "learning_rate": 9.326041567435368e-06, "loss": 0.4583, "step": 1593 }, { "epoch": 0.1756473829201102, "grad_norm": 5.5203118324279785, "learning_rate": 9.325164614357347e-06, "loss": 0.4683, "step": 1594 }, { "epoch": 0.17575757575757575, "grad_norm": 10.047639846801758, "learning_rate": 9.32428713239232e-06, "loss": 0.5439, "step": 1595 }, { "epoch": 0.17586776859504133, "grad_norm": 5.9899163246154785, "learning_rate": 9.323409121647588e-06, "loss": 0.3647, "step": 1596 }, { "epoch": 0.17597796143250688, "grad_norm": 7.982687950134277, "learning_rate": 9.322530582230517e-06, "loss": 0.457, "step": 1597 }, { "epoch": 0.17608815426997246, "grad_norm": 7.037700176239014, "learning_rate": 9.321651514248534e-06, "loss": 0.4356, "step": 1598 }, { "epoch": 0.176198347107438, "grad_norm": 6.525335311889648, "learning_rate": 9.320771917809134e-06, "loss": 0.4025, "step": 1599 }, { "epoch": 0.1763085399449036, "grad_norm": 5.2210493087768555, "learning_rate": 9.319891793019874e-06, "loss": 0.3519, "step": 1600 }, { "epoch": 0.17641873278236914, "grad_norm": 11.669242858886719, "learning_rate": 9.319011139988378e-06, "loss": 0.5395, "step": 1601 }, { "epoch": 0.17652892561983471, "grad_norm": 13.469799995422363, "learning_rate": 9.318129958822334e-06, "loss": 0.4634, "step": 1602 }, { "epoch": 0.17663911845730026, "grad_norm": 5.899416923522949, "learning_rate": 9.31724824962949e-06, "loss": 0.4997, "step": 1603 }, { "epoch": 0.17674931129476584, "grad_norm": 4.695652484893799, "learning_rate": 9.31636601251767e-06, "loss": 0.4003, "step": 1604 }, { "epoch": 0.1768595041322314, "grad_norm": 6.482518196105957, "learning_rate": 9.315483247594748e-06, "loss": 0.4392, "step": 1605 }, { "epoch": 0.17696969696969697, "grad_norm": 6.833725452423096, "learning_rate": 9.314599954968673e-06, "loss": 0.5056, "step": 1606 }, { "epoch": 0.17707988980716252, "grad_norm": 9.069969177246094, "learning_rate": 9.313716134747455e-06, "loss": 0.4752, "step": 1607 }, { "epoch": 0.1771900826446281, "grad_norm": 12.654067993164062, "learning_rate": 9.312831787039169e-06, "loss": 0.5105, "step": 1608 }, { "epoch": 0.17730027548209368, "grad_norm": 9.427743911743164, "learning_rate": 9.311946911951952e-06, "loss": 0.5026, "step": 1609 }, { "epoch": 0.17741046831955923, "grad_norm": 6.150701999664307, "learning_rate": 9.311061509594011e-06, "loss": 0.489, "step": 1610 }, { "epoch": 0.1775206611570248, "grad_norm": 4.896096229553223, "learning_rate": 9.31017558007361e-06, "loss": 0.4465, "step": 1611 }, { "epoch": 0.17763085399449036, "grad_norm": 7.014410495758057, "learning_rate": 9.309289123499088e-06, "loss": 0.4055, "step": 1612 }, { "epoch": 0.17774104683195593, "grad_norm": 6.980282306671143, "learning_rate": 9.308402139978836e-06, "loss": 0.4678, "step": 1613 }, { "epoch": 0.17785123966942148, "grad_norm": 7.26424503326416, "learning_rate": 9.307514629621318e-06, "loss": 0.4338, "step": 1614 }, { "epoch": 0.17796143250688706, "grad_norm": 11.836024284362793, "learning_rate": 9.30662659253506e-06, "loss": 0.5679, "step": 1615 }, { "epoch": 0.1780716253443526, "grad_norm": 7.45238733291626, "learning_rate": 9.305738028828653e-06, "loss": 0.4848, "step": 1616 }, { "epoch": 0.1781818181818182, "grad_norm": 9.33069896697998, "learning_rate": 9.30484893861075e-06, "loss": 0.4438, "step": 1617 }, { "epoch": 0.17829201101928374, "grad_norm": 10.457403182983398, "learning_rate": 9.303959321990072e-06, "loss": 0.4788, "step": 1618 }, { "epoch": 0.17840220385674932, "grad_norm": 5.067116737365723, "learning_rate": 9.303069179075402e-06, "loss": 0.3975, "step": 1619 }, { "epoch": 0.17851239669421487, "grad_norm": 6.72924280166626, "learning_rate": 9.302178509975588e-06, "loss": 0.3877, "step": 1620 }, { "epoch": 0.17862258953168045, "grad_norm": 19.54122543334961, "learning_rate": 9.30128731479954e-06, "loss": 0.4767, "step": 1621 }, { "epoch": 0.178732782369146, "grad_norm": 13.766802787780762, "learning_rate": 9.300395593656237e-06, "loss": 0.4729, "step": 1622 }, { "epoch": 0.17884297520661158, "grad_norm": 14.010546684265137, "learning_rate": 9.299503346654721e-06, "loss": 0.5181, "step": 1623 }, { "epoch": 0.17895316804407713, "grad_norm": 7.248737812042236, "learning_rate": 9.298610573904094e-06, "loss": 0.4805, "step": 1624 }, { "epoch": 0.1790633608815427, "grad_norm": 7.094707489013672, "learning_rate": 9.297717275513526e-06, "loss": 0.4545, "step": 1625 }, { "epoch": 0.17917355371900826, "grad_norm": 7.688851833343506, "learning_rate": 9.296823451592253e-06, "loss": 0.3955, "step": 1626 }, { "epoch": 0.17928374655647383, "grad_norm": 9.634920120239258, "learning_rate": 9.295929102249572e-06, "loss": 0.4597, "step": 1627 }, { "epoch": 0.17939393939393938, "grad_norm": 9.308319091796875, "learning_rate": 9.295034227594846e-06, "loss": 0.5443, "step": 1628 }, { "epoch": 0.17950413223140496, "grad_norm": 6.050390243530273, "learning_rate": 9.294138827737498e-06, "loss": 0.406, "step": 1629 }, { "epoch": 0.1796143250688705, "grad_norm": 7.437664031982422, "learning_rate": 9.293242902787023e-06, "loss": 0.4229, "step": 1630 }, { "epoch": 0.1797245179063361, "grad_norm": 13.359318733215332, "learning_rate": 9.292346452852974e-06, "loss": 0.4789, "step": 1631 }, { "epoch": 0.17983471074380164, "grad_norm": 6.764444351196289, "learning_rate": 9.291449478044968e-06, "loss": 0.4367, "step": 1632 }, { "epoch": 0.17994490358126722, "grad_norm": 7.310115337371826, "learning_rate": 9.290551978472692e-06, "loss": 0.4633, "step": 1633 }, { "epoch": 0.18005509641873277, "grad_norm": 5.299081802368164, "learning_rate": 9.289653954245892e-06, "loss": 0.4097, "step": 1634 }, { "epoch": 0.18016528925619835, "grad_norm": 8.130375862121582, "learning_rate": 9.288755405474379e-06, "loss": 0.4932, "step": 1635 }, { "epoch": 0.18027548209366392, "grad_norm": 19.780685424804688, "learning_rate": 9.28785633226803e-06, "loss": 0.4528, "step": 1636 }, { "epoch": 0.18038567493112947, "grad_norm": 6.51214599609375, "learning_rate": 9.286956734736782e-06, "loss": 0.4531, "step": 1637 }, { "epoch": 0.18049586776859505, "grad_norm": 9.116598129272461, "learning_rate": 9.286056612990644e-06, "loss": 0.4667, "step": 1638 }, { "epoch": 0.1806060606060606, "grad_norm": 7.414033889770508, "learning_rate": 9.28515596713968e-06, "loss": 0.3955, "step": 1639 }, { "epoch": 0.18071625344352618, "grad_norm": 7.945151329040527, "learning_rate": 9.284254797294025e-06, "loss": 0.3675, "step": 1640 }, { "epoch": 0.18082644628099173, "grad_norm": 7.031407833099365, "learning_rate": 9.283353103563872e-06, "loss": 0.472, "step": 1641 }, { "epoch": 0.1809366391184573, "grad_norm": 6.033279895782471, "learning_rate": 9.282450886059485e-06, "loss": 0.4593, "step": 1642 }, { "epoch": 0.18104683195592286, "grad_norm": 8.067496299743652, "learning_rate": 9.281548144891183e-06, "loss": 0.4434, "step": 1643 }, { "epoch": 0.18115702479338844, "grad_norm": 7.44157075881958, "learning_rate": 9.28064488016936e-06, "loss": 0.5043, "step": 1644 }, { "epoch": 0.181267217630854, "grad_norm": 8.112360000610352, "learning_rate": 9.279741092004469e-06, "loss": 0.489, "step": 1645 }, { "epoch": 0.18137741046831957, "grad_norm": 6.331960678100586, "learning_rate": 9.27883678050702e-06, "loss": 0.5035, "step": 1646 }, { "epoch": 0.18148760330578512, "grad_norm": 6.156603813171387, "learning_rate": 9.2779319457876e-06, "loss": 0.4342, "step": 1647 }, { "epoch": 0.1815977961432507, "grad_norm": 8.530582427978516, "learning_rate": 9.277026587956849e-06, "loss": 0.4378, "step": 1648 }, { "epoch": 0.18170798898071625, "grad_norm": 4.40578031539917, "learning_rate": 9.276120707125477e-06, "loss": 0.4722, "step": 1649 }, { "epoch": 0.18181818181818182, "grad_norm": 8.593558311462402, "learning_rate": 9.275214303404256e-06, "loss": 0.4426, "step": 1650 }, { "epoch": 0.18192837465564737, "grad_norm": 5.8893961906433105, "learning_rate": 9.274307376904023e-06, "loss": 0.4641, "step": 1651 }, { "epoch": 0.18203856749311295, "grad_norm": 5.242407321929932, "learning_rate": 9.273399927735679e-06, "loss": 0.4018, "step": 1652 }, { "epoch": 0.1821487603305785, "grad_norm": 6.03156042098999, "learning_rate": 9.272491956010185e-06, "loss": 0.4375, "step": 1653 }, { "epoch": 0.18225895316804408, "grad_norm": 4.4334845542907715, "learning_rate": 9.271583461838573e-06, "loss": 0.4105, "step": 1654 }, { "epoch": 0.18236914600550963, "grad_norm": 5.8249897956848145, "learning_rate": 9.270674445331932e-06, "loss": 0.46, "step": 1655 }, { "epoch": 0.1824793388429752, "grad_norm": 6.178356647491455, "learning_rate": 9.269764906601419e-06, "loss": 0.456, "step": 1656 }, { "epoch": 0.18258953168044076, "grad_norm": 5.149049282073975, "learning_rate": 9.268854845758254e-06, "loss": 0.331, "step": 1657 }, { "epoch": 0.18269972451790634, "grad_norm": 5.921223163604736, "learning_rate": 9.26794426291372e-06, "loss": 0.3743, "step": 1658 }, { "epoch": 0.1828099173553719, "grad_norm": 6.545019149780273, "learning_rate": 9.267033158179162e-06, "loss": 0.5212, "step": 1659 }, { "epoch": 0.18292011019283747, "grad_norm": 9.221651077270508, "learning_rate": 9.266121531665994e-06, "loss": 0.49, "step": 1660 }, { "epoch": 0.18303030303030304, "grad_norm": 8.458298683166504, "learning_rate": 9.265209383485692e-06, "loss": 0.4456, "step": 1661 }, { "epoch": 0.1831404958677686, "grad_norm": 6.811192035675049, "learning_rate": 9.26429671374979e-06, "loss": 0.4542, "step": 1662 }, { "epoch": 0.18325068870523417, "grad_norm": 10.922497749328613, "learning_rate": 9.263383522569896e-06, "loss": 0.5119, "step": 1663 }, { "epoch": 0.18336088154269972, "grad_norm": 10.04183292388916, "learning_rate": 9.26246981005767e-06, "loss": 0.4339, "step": 1664 }, { "epoch": 0.1834710743801653, "grad_norm": 5.958286285400391, "learning_rate": 9.26155557632485e-06, "loss": 0.4527, "step": 1665 }, { "epoch": 0.18358126721763085, "grad_norm": 7.782229900360107, "learning_rate": 9.260640821483222e-06, "loss": 0.4145, "step": 1666 }, { "epoch": 0.18369146005509643, "grad_norm": 7.860006332397461, "learning_rate": 9.259725545644649e-06, "loss": 0.4499, "step": 1667 }, { "epoch": 0.18380165289256198, "grad_norm": 7.201568603515625, "learning_rate": 9.25880974892105e-06, "loss": 0.4353, "step": 1668 }, { "epoch": 0.18391184573002756, "grad_norm": 5.932833194732666, "learning_rate": 9.257893431424408e-06, "loss": 0.436, "step": 1669 }, { "epoch": 0.1840220385674931, "grad_norm": 8.118557929992676, "learning_rate": 9.256976593266774e-06, "loss": 0.441, "step": 1670 }, { "epoch": 0.18413223140495869, "grad_norm": 6.2253313064575195, "learning_rate": 9.25605923456026e-06, "loss": 0.4287, "step": 1671 }, { "epoch": 0.18424242424242424, "grad_norm": 7.285488605499268, "learning_rate": 9.255141355417042e-06, "loss": 0.4498, "step": 1672 }, { "epoch": 0.1843526170798898, "grad_norm": 6.142855644226074, "learning_rate": 9.254222955949359e-06, "loss": 0.4375, "step": 1673 }, { "epoch": 0.18446280991735536, "grad_norm": 9.95976448059082, "learning_rate": 9.253304036269513e-06, "loss": 0.4994, "step": 1674 }, { "epoch": 0.18457300275482094, "grad_norm": 8.551350593566895, "learning_rate": 9.252384596489874e-06, "loss": 0.5382, "step": 1675 }, { "epoch": 0.1846831955922865, "grad_norm": 6.680281639099121, "learning_rate": 9.251464636722868e-06, "loss": 0.3891, "step": 1676 }, { "epoch": 0.18479338842975207, "grad_norm": 7.300052165985107, "learning_rate": 9.250544157080992e-06, "loss": 0.4391, "step": 1677 }, { "epoch": 0.18490358126721762, "grad_norm": 6.606064796447754, "learning_rate": 9.249623157676804e-06, "loss": 0.4586, "step": 1678 }, { "epoch": 0.1850137741046832, "grad_norm": 6.0628461837768555, "learning_rate": 9.248701638622921e-06, "loss": 0.4395, "step": 1679 }, { "epoch": 0.18512396694214875, "grad_norm": 8.079413414001465, "learning_rate": 9.247779600032032e-06, "loss": 0.435, "step": 1680 }, { "epoch": 0.18523415977961433, "grad_norm": 10.073164939880371, "learning_rate": 9.246857042016883e-06, "loss": 0.4077, "step": 1681 }, { "epoch": 0.18534435261707988, "grad_norm": 7.541385173797607, "learning_rate": 9.245933964690288e-06, "loss": 0.3633, "step": 1682 }, { "epoch": 0.18545454545454546, "grad_norm": 6.038715839385986, "learning_rate": 9.245010368165118e-06, "loss": 0.4782, "step": 1683 }, { "epoch": 0.185564738292011, "grad_norm": 14.045479774475098, "learning_rate": 9.244086252554313e-06, "loss": 0.478, "step": 1684 }, { "epoch": 0.18567493112947658, "grad_norm": 5.501763343811035, "learning_rate": 9.24316161797088e-06, "loss": 0.4166, "step": 1685 }, { "epoch": 0.18578512396694216, "grad_norm": 6.4724555015563965, "learning_rate": 9.242236464527877e-06, "loss": 0.4311, "step": 1686 }, { "epoch": 0.1858953168044077, "grad_norm": 5.08769416809082, "learning_rate": 9.241310792338439e-06, "loss": 0.3645, "step": 1687 }, { "epoch": 0.1860055096418733, "grad_norm": 8.996004104614258, "learning_rate": 9.240384601515753e-06, "loss": 0.4901, "step": 1688 }, { "epoch": 0.18611570247933884, "grad_norm": 6.509644985198975, "learning_rate": 9.23945789217308e-06, "loss": 0.3987, "step": 1689 }, { "epoch": 0.18622589531680442, "grad_norm": 9.581886291503906, "learning_rate": 9.238530664423737e-06, "loss": 0.4781, "step": 1690 }, { "epoch": 0.18633608815426997, "grad_norm": 11.073467254638672, "learning_rate": 9.237602918381107e-06, "loss": 0.5167, "step": 1691 }, { "epoch": 0.18644628099173555, "grad_norm": 10.145783424377441, "learning_rate": 9.236674654158637e-06, "loss": 0.4679, "step": 1692 }, { "epoch": 0.1865564738292011, "grad_norm": 9.544154167175293, "learning_rate": 9.235745871869834e-06, "loss": 0.4067, "step": 1693 }, { "epoch": 0.18666666666666668, "grad_norm": 6.963332653045654, "learning_rate": 9.23481657162827e-06, "loss": 0.4524, "step": 1694 }, { "epoch": 0.18677685950413223, "grad_norm": 6.609317302703857, "learning_rate": 9.233886753547588e-06, "loss": 0.4254, "step": 1695 }, { "epoch": 0.1868870523415978, "grad_norm": 6.9944376945495605, "learning_rate": 9.232956417741478e-06, "loss": 0.4096, "step": 1696 }, { "epoch": 0.18699724517906335, "grad_norm": 10.562012672424316, "learning_rate": 9.23202556432371e-06, "loss": 0.4147, "step": 1697 }, { "epoch": 0.18710743801652893, "grad_norm": 6.0145134925842285, "learning_rate": 9.231094193408107e-06, "loss": 0.4501, "step": 1698 }, { "epoch": 0.18721763085399448, "grad_norm": 7.005607604980469, "learning_rate": 9.230162305108558e-06, "loss": 0.4592, "step": 1699 }, { "epoch": 0.18732782369146006, "grad_norm": 10.507290840148926, "learning_rate": 9.229229899539018e-06, "loss": 0.5106, "step": 1700 }, { "epoch": 0.1874380165289256, "grad_norm": 8.026799201965332, "learning_rate": 9.2282969768135e-06, "loss": 0.4859, "step": 1701 }, { "epoch": 0.1875482093663912, "grad_norm": 11.039427757263184, "learning_rate": 9.227363537046083e-06, "loss": 0.4278, "step": 1702 }, { "epoch": 0.18765840220385674, "grad_norm": 10.61426830291748, "learning_rate": 9.22642958035091e-06, "loss": 0.4875, "step": 1703 }, { "epoch": 0.18776859504132232, "grad_norm": 11.166681289672852, "learning_rate": 9.225495106842188e-06, "loss": 0.4511, "step": 1704 }, { "epoch": 0.18787878787878787, "grad_norm": 7.04632568359375, "learning_rate": 9.224560116634184e-06, "loss": 0.3696, "step": 1705 }, { "epoch": 0.18798898071625345, "grad_norm": 13.060858726501465, "learning_rate": 9.223624609841232e-06, "loss": 0.4687, "step": 1706 }, { "epoch": 0.188099173553719, "grad_norm": 12.081947326660156, "learning_rate": 9.222688586577724e-06, "loss": 0.5184, "step": 1707 }, { "epoch": 0.18820936639118457, "grad_norm": 13.18545913696289, "learning_rate": 9.221752046958122e-06, "loss": 0.4923, "step": 1708 }, { "epoch": 0.18831955922865012, "grad_norm": 7.636499404907227, "learning_rate": 9.220814991096943e-06, "loss": 0.487, "step": 1709 }, { "epoch": 0.1884297520661157, "grad_norm": 4.343592166900635, "learning_rate": 9.219877419108773e-06, "loss": 0.434, "step": 1710 }, { "epoch": 0.18853994490358128, "grad_norm": 10.054169654846191, "learning_rate": 9.218939331108261e-06, "loss": 0.5029, "step": 1711 }, { "epoch": 0.18865013774104683, "grad_norm": 7.799861431121826, "learning_rate": 9.218000727210115e-06, "loss": 0.4746, "step": 1712 }, { "epoch": 0.1887603305785124, "grad_norm": 8.256338119506836, "learning_rate": 9.217061607529111e-06, "loss": 0.5506, "step": 1713 }, { "epoch": 0.18887052341597796, "grad_norm": 9.735845565795898, "learning_rate": 9.216121972180087e-06, "loss": 0.4109, "step": 1714 }, { "epoch": 0.18898071625344354, "grad_norm": 7.928435802459717, "learning_rate": 9.21518182127794e-06, "loss": 0.5023, "step": 1715 }, { "epoch": 0.1890909090909091, "grad_norm": 15.789814949035645, "learning_rate": 9.214241154937635e-06, "loss": 0.6017, "step": 1716 }, { "epoch": 0.18920110192837467, "grad_norm": 5.908916473388672, "learning_rate": 9.213299973274197e-06, "loss": 0.496, "step": 1717 }, { "epoch": 0.18931129476584022, "grad_norm": 8.4912691116333, "learning_rate": 9.212358276402716e-06, "loss": 0.5218, "step": 1718 }, { "epoch": 0.1894214876033058, "grad_norm": 9.090007781982422, "learning_rate": 9.211416064438342e-06, "loss": 0.4659, "step": 1719 }, { "epoch": 0.18953168044077134, "grad_norm": 9.479738235473633, "learning_rate": 9.210473337496289e-06, "loss": 0.4745, "step": 1720 }, { "epoch": 0.18964187327823692, "grad_norm": 7.842554569244385, "learning_rate": 9.209530095691839e-06, "loss": 0.4553, "step": 1721 }, { "epoch": 0.18975206611570247, "grad_norm": 5.904482364654541, "learning_rate": 9.20858633914033e-06, "loss": 0.3563, "step": 1722 }, { "epoch": 0.18986225895316805, "grad_norm": 8.66420841217041, "learning_rate": 9.207642067957168e-06, "loss": 0.5095, "step": 1723 }, { "epoch": 0.1899724517906336, "grad_norm": 5.422713279724121, "learning_rate": 9.206697282257817e-06, "loss": 0.4117, "step": 1724 }, { "epoch": 0.19008264462809918, "grad_norm": 8.360159873962402, "learning_rate": 9.20575198215781e-06, "loss": 0.4775, "step": 1725 }, { "epoch": 0.19019283746556473, "grad_norm": 9.186153411865234, "learning_rate": 9.204806167772734e-06, "loss": 0.5071, "step": 1726 }, { "epoch": 0.1903030303030303, "grad_norm": 14.28857135772705, "learning_rate": 9.20385983921825e-06, "loss": 0.6133, "step": 1727 }, { "epoch": 0.19041322314049586, "grad_norm": 5.504502773284912, "learning_rate": 9.202912996610076e-06, "loss": 0.4208, "step": 1728 }, { "epoch": 0.19052341597796144, "grad_norm": 9.405652046203613, "learning_rate": 9.20196564006399e-06, "loss": 0.4877, "step": 1729 }, { "epoch": 0.19063360881542699, "grad_norm": 7.375481605529785, "learning_rate": 9.201017769695838e-06, "loss": 0.4022, "step": 1730 }, { "epoch": 0.19074380165289256, "grad_norm": 15.241425514221191, "learning_rate": 9.200069385621528e-06, "loss": 0.4715, "step": 1731 }, { "epoch": 0.19085399449035811, "grad_norm": 11.097306251525879, "learning_rate": 9.199120487957027e-06, "loss": 0.3701, "step": 1732 }, { "epoch": 0.1909641873278237, "grad_norm": 7.114978313446045, "learning_rate": 9.198171076818368e-06, "loss": 0.3833, "step": 1733 }, { "epoch": 0.19107438016528924, "grad_norm": 9.978882789611816, "learning_rate": 9.197221152321648e-06, "loss": 0.4948, "step": 1734 }, { "epoch": 0.19118457300275482, "grad_norm": 7.850578308105469, "learning_rate": 9.196270714583024e-06, "loss": 0.4935, "step": 1735 }, { "epoch": 0.1912947658402204, "grad_norm": 7.1560869216918945, "learning_rate": 9.195319763718717e-06, "loss": 0.5109, "step": 1736 }, { "epoch": 0.19140495867768595, "grad_norm": 5.753309726715088, "learning_rate": 9.194368299845012e-06, "loss": 0.4673, "step": 1737 }, { "epoch": 0.19151515151515153, "grad_norm": 9.460782051086426, "learning_rate": 9.193416323078252e-06, "loss": 0.4019, "step": 1738 }, { "epoch": 0.19162534435261708, "grad_norm": 10.26589584350586, "learning_rate": 9.192463833534848e-06, "loss": 0.4986, "step": 1739 }, { "epoch": 0.19173553719008266, "grad_norm": 10.60505199432373, "learning_rate": 9.191510831331271e-06, "loss": 0.4664, "step": 1740 }, { "epoch": 0.1918457300275482, "grad_norm": 4.823023796081543, "learning_rate": 9.190557316584057e-06, "loss": 0.4589, "step": 1741 }, { "epoch": 0.19195592286501378, "grad_norm": 17.65654754638672, "learning_rate": 9.189603289409802e-06, "loss": 0.5577, "step": 1742 }, { "epoch": 0.19206611570247933, "grad_norm": 8.283257484436035, "learning_rate": 9.188648749925165e-06, "loss": 0.4656, "step": 1743 }, { "epoch": 0.1921763085399449, "grad_norm": 8.1823148727417, "learning_rate": 9.18769369824687e-06, "loss": 0.4467, "step": 1744 }, { "epoch": 0.19228650137741046, "grad_norm": 6.611006259918213, "learning_rate": 9.1867381344917e-06, "loss": 0.3631, "step": 1745 }, { "epoch": 0.19239669421487604, "grad_norm": 6.430954933166504, "learning_rate": 9.185782058776504e-06, "loss": 0.3733, "step": 1746 }, { "epoch": 0.1925068870523416, "grad_norm": 8.7865629196167, "learning_rate": 9.184825471218193e-06, "loss": 0.4406, "step": 1747 }, { "epoch": 0.19261707988980717, "grad_norm": 6.636740684509277, "learning_rate": 9.18386837193374e-06, "loss": 0.4312, "step": 1748 }, { "epoch": 0.19272727272727272, "grad_norm": 6.261380195617676, "learning_rate": 9.182910761040177e-06, "loss": 0.4475, "step": 1749 }, { "epoch": 0.1928374655647383, "grad_norm": 6.949450492858887, "learning_rate": 9.181952638654604e-06, "loss": 0.4866, "step": 1750 }, { "epoch": 0.19294765840220385, "grad_norm": 8.320935249328613, "learning_rate": 9.180994004894184e-06, "loss": 0.5215, "step": 1751 }, { "epoch": 0.19305785123966943, "grad_norm": 6.757778167724609, "learning_rate": 9.180034859876135e-06, "loss": 0.4685, "step": 1752 }, { "epoch": 0.19316804407713498, "grad_norm": 7.9120097160339355, "learning_rate": 9.179075203717746e-06, "loss": 0.4522, "step": 1753 }, { "epoch": 0.19327823691460055, "grad_norm": 8.696321487426758, "learning_rate": 9.178115036536365e-06, "loss": 0.4557, "step": 1754 }, { "epoch": 0.1933884297520661, "grad_norm": 29.67196273803711, "learning_rate": 9.177154358449403e-06, "loss": 0.4847, "step": 1755 }, { "epoch": 0.19349862258953168, "grad_norm": 8.443095207214355, "learning_rate": 9.176193169574332e-06, "loss": 0.5122, "step": 1756 }, { "epoch": 0.19360881542699723, "grad_norm": 8.4793062210083, "learning_rate": 9.175231470028685e-06, "loss": 0.4016, "step": 1757 }, { "epoch": 0.1937190082644628, "grad_norm": 6.380115509033203, "learning_rate": 9.174269259930064e-06, "loss": 0.4166, "step": 1758 }, { "epoch": 0.19382920110192836, "grad_norm": 10.314026832580566, "learning_rate": 9.173306539396128e-06, "loss": 0.4556, "step": 1759 }, { "epoch": 0.19393939393939394, "grad_norm": 7.1314239501953125, "learning_rate": 9.172343308544598e-06, "loss": 0.4413, "step": 1760 }, { "epoch": 0.19404958677685952, "grad_norm": 6.917520999908447, "learning_rate": 9.171379567493261e-06, "loss": 0.4119, "step": 1761 }, { "epoch": 0.19415977961432507, "grad_norm": 6.420129776000977, "learning_rate": 9.170415316359966e-06, "loss": 0.4111, "step": 1762 }, { "epoch": 0.19426997245179065, "grad_norm": 6.119045734405518, "learning_rate": 9.16945055526262e-06, "loss": 0.3883, "step": 1763 }, { "epoch": 0.1943801652892562, "grad_norm": 9.331291198730469, "learning_rate": 9.168485284319195e-06, "loss": 0.4095, "step": 1764 }, { "epoch": 0.19449035812672177, "grad_norm": 7.21309232711792, "learning_rate": 9.167519503647729e-06, "loss": 0.4418, "step": 1765 }, { "epoch": 0.19460055096418732, "grad_norm": 11.516107559204102, "learning_rate": 9.166553213366316e-06, "loss": 0.5187, "step": 1766 }, { "epoch": 0.1947107438016529, "grad_norm": 17.424747467041016, "learning_rate": 9.165586413593118e-06, "loss": 0.5168, "step": 1767 }, { "epoch": 0.19482093663911845, "grad_norm": 6.920599937438965, "learning_rate": 9.164619104446354e-06, "loss": 0.4324, "step": 1768 }, { "epoch": 0.19493112947658403, "grad_norm": 10.168035507202148, "learning_rate": 9.163651286044308e-06, "loss": 0.5094, "step": 1769 }, { "epoch": 0.19504132231404958, "grad_norm": 8.317641258239746, "learning_rate": 9.16268295850533e-06, "loss": 0.3652, "step": 1770 }, { "epoch": 0.19515151515151516, "grad_norm": 6.406209468841553, "learning_rate": 9.161714121947822e-06, "loss": 0.431, "step": 1771 }, { "epoch": 0.1952617079889807, "grad_norm": 7.211702346801758, "learning_rate": 9.160744776490258e-06, "loss": 0.4585, "step": 1772 }, { "epoch": 0.1953719008264463, "grad_norm": 8.599669456481934, "learning_rate": 9.159774922251173e-06, "loss": 0.4845, "step": 1773 }, { "epoch": 0.19548209366391184, "grad_norm": 7.3090057373046875, "learning_rate": 9.158804559349158e-06, "loss": 0.3688, "step": 1774 }, { "epoch": 0.19559228650137742, "grad_norm": 6.770844459533691, "learning_rate": 9.157833687902872e-06, "loss": 0.4231, "step": 1775 }, { "epoch": 0.19570247933884297, "grad_norm": 8.398971557617188, "learning_rate": 9.156862308031037e-06, "loss": 0.3886, "step": 1776 }, { "epoch": 0.19581267217630854, "grad_norm": 7.647953033447266, "learning_rate": 9.155890419852432e-06, "loss": 0.3672, "step": 1777 }, { "epoch": 0.1959228650137741, "grad_norm": 11.409097671508789, "learning_rate": 9.154918023485901e-06, "loss": 0.5574, "step": 1778 }, { "epoch": 0.19603305785123967, "grad_norm": 14.514908790588379, "learning_rate": 9.153945119050349e-06, "loss": 0.4787, "step": 1779 }, { "epoch": 0.19614325068870522, "grad_norm": 6.008779525756836, "learning_rate": 9.152971706664745e-06, "loss": 0.4832, "step": 1780 }, { "epoch": 0.1962534435261708, "grad_norm": 5.2900710105896, "learning_rate": 9.151997786448123e-06, "loss": 0.395, "step": 1781 }, { "epoch": 0.19636363636363635, "grad_norm": 13.773778915405273, "learning_rate": 9.151023358519569e-06, "loss": 0.4648, "step": 1782 }, { "epoch": 0.19647382920110193, "grad_norm": 6.885417938232422, "learning_rate": 9.15004842299824e-06, "loss": 0.3971, "step": 1783 }, { "epoch": 0.19658402203856748, "grad_norm": 14.44425106048584, "learning_rate": 9.149072980003354e-06, "loss": 0.5117, "step": 1784 }, { "epoch": 0.19669421487603306, "grad_norm": 6.065454483032227, "learning_rate": 9.148097029654186e-06, "loss": 0.4183, "step": 1785 }, { "epoch": 0.19680440771349864, "grad_norm": 5.698667049407959, "learning_rate": 9.14712057207008e-06, "loss": 0.4628, "step": 1786 }, { "epoch": 0.1969146005509642, "grad_norm": 6.582290172576904, "learning_rate": 9.146143607370436e-06, "loss": 0.4249, "step": 1787 }, { "epoch": 0.19702479338842976, "grad_norm": 8.020262718200684, "learning_rate": 9.14516613567472e-06, "loss": 0.5491, "step": 1788 }, { "epoch": 0.19713498622589531, "grad_norm": 6.097904682159424, "learning_rate": 9.14418815710246e-06, "loss": 0.4536, "step": 1789 }, { "epoch": 0.1972451790633609, "grad_norm": 6.816990375518799, "learning_rate": 9.14320967177324e-06, "loss": 0.4149, "step": 1790 }, { "epoch": 0.19735537190082644, "grad_norm": 12.400146484375, "learning_rate": 9.142230679806716e-06, "loss": 0.5054, "step": 1791 }, { "epoch": 0.19746556473829202, "grad_norm": 6.677716255187988, "learning_rate": 9.141251181322597e-06, "loss": 0.3187, "step": 1792 }, { "epoch": 0.19757575757575757, "grad_norm": 13.185030937194824, "learning_rate": 9.140271176440658e-06, "loss": 0.4739, "step": 1793 }, { "epoch": 0.19768595041322315, "grad_norm": 11.510183334350586, "learning_rate": 9.139290665280736e-06, "loss": 0.4501, "step": 1794 }, { "epoch": 0.1977961432506887, "grad_norm": 9.512884140014648, "learning_rate": 9.138309647962729e-06, "loss": 0.5343, "step": 1795 }, { "epoch": 0.19790633608815428, "grad_norm": 5.829929828643799, "learning_rate": 9.137328124606596e-06, "loss": 0.4693, "step": 1796 }, { "epoch": 0.19801652892561983, "grad_norm": 6.9572529792785645, "learning_rate": 9.13634609533236e-06, "loss": 0.4533, "step": 1797 }, { "epoch": 0.1981267217630854, "grad_norm": 11.407742500305176, "learning_rate": 9.135363560260105e-06, "loss": 0.508, "step": 1798 }, { "epoch": 0.19823691460055096, "grad_norm": 9.053552627563477, "learning_rate": 9.134380519509976e-06, "loss": 0.4438, "step": 1799 }, { "epoch": 0.19834710743801653, "grad_norm": 9.490631103515625, "learning_rate": 9.133396973202181e-06, "loss": 0.4095, "step": 1800 }, { "epoch": 0.19845730027548208, "grad_norm": 12.728962898254395, "learning_rate": 9.13241292145699e-06, "loss": 0.4589, "step": 1801 }, { "epoch": 0.19856749311294766, "grad_norm": 7.135961055755615, "learning_rate": 9.131428364394735e-06, "loss": 0.361, "step": 1802 }, { "epoch": 0.1986776859504132, "grad_norm": 13.275293350219727, "learning_rate": 9.130443302135804e-06, "loss": 0.4761, "step": 1803 }, { "epoch": 0.1987878787878788, "grad_norm": 10.185133934020996, "learning_rate": 9.129457734800659e-06, "loss": 0.5152, "step": 1804 }, { "epoch": 0.19889807162534434, "grad_norm": 5.512964725494385, "learning_rate": 9.128471662509811e-06, "loss": 0.3932, "step": 1805 }, { "epoch": 0.19900826446280992, "grad_norm": 5.863252639770508, "learning_rate": 9.127485085383841e-06, "loss": 0.4458, "step": 1806 }, { "epoch": 0.19911845730027547, "grad_norm": 7.236845016479492, "learning_rate": 9.126498003543387e-06, "loss": 0.42, "step": 1807 }, { "epoch": 0.19922865013774105, "grad_norm": 4.469096660614014, "learning_rate": 9.125510417109152e-06, "loss": 0.445, "step": 1808 }, { "epoch": 0.1993388429752066, "grad_norm": 7.3282341957092285, "learning_rate": 9.1245223262019e-06, "loss": 0.4747, "step": 1809 }, { "epoch": 0.19944903581267218, "grad_norm": 4.438247203826904, "learning_rate": 9.123533730942456e-06, "loss": 0.3745, "step": 1810 }, { "epoch": 0.19955922865013775, "grad_norm": 8.355430603027344, "learning_rate": 9.122544631451703e-06, "loss": 0.4216, "step": 1811 }, { "epoch": 0.1996694214876033, "grad_norm": 6.029199600219727, "learning_rate": 9.121555027850597e-06, "loss": 0.4154, "step": 1812 }, { "epoch": 0.19977961432506888, "grad_norm": 4.891571998596191, "learning_rate": 9.12056492026014e-06, "loss": 0.4122, "step": 1813 }, { "epoch": 0.19988980716253443, "grad_norm": 8.599620819091797, "learning_rate": 9.11957430880141e-06, "loss": 0.4457, "step": 1814 }, { "epoch": 0.2, "grad_norm": 6.806952953338623, "learning_rate": 9.118583193595536e-06, "loss": 0.4634, "step": 1815 }, { "epoch": 0.20011019283746556, "grad_norm": 7.122880935668945, "learning_rate": 9.117591574763714e-06, "loss": 0.4144, "step": 1816 }, { "epoch": 0.20011019283746556, "eval_loss": 0.45613893866539, "eval_runtime": 41.9691, "eval_samples_per_second": 17.489, "eval_steps_per_second": 2.192, "step": 1816 }, { "epoch": 0.20022038567493114, "grad_norm": 8.090353965759277, "learning_rate": 9.116599452427201e-06, "loss": 0.4032, "step": 1817 }, { "epoch": 0.2003305785123967, "grad_norm": 6.1090593338012695, "learning_rate": 9.115606826707317e-06, "loss": 0.4011, "step": 1818 }, { "epoch": 0.20044077134986227, "grad_norm": 12.8853120803833, "learning_rate": 9.114613697725438e-06, "loss": 0.4132, "step": 1819 }, { "epoch": 0.20055096418732782, "grad_norm": 5.833970546722412, "learning_rate": 9.113620065603008e-06, "loss": 0.4547, "step": 1820 }, { "epoch": 0.2006611570247934, "grad_norm": 6.591116428375244, "learning_rate": 9.112625930461528e-06, "loss": 0.3708, "step": 1821 }, { "epoch": 0.20077134986225895, "grad_norm": 15.06023120880127, "learning_rate": 9.111631292422562e-06, "loss": 0.4409, "step": 1822 }, { "epoch": 0.20088154269972452, "grad_norm": 13.558335304260254, "learning_rate": 9.11063615160774e-06, "loss": 0.4381, "step": 1823 }, { "epoch": 0.20099173553719007, "grad_norm": 6.851561069488525, "learning_rate": 9.109640508138742e-06, "loss": 0.4188, "step": 1824 }, { "epoch": 0.20110192837465565, "grad_norm": 7.203578948974609, "learning_rate": 9.10864436213732e-06, "loss": 0.4574, "step": 1825 }, { "epoch": 0.2012121212121212, "grad_norm": 6.365950107574463, "learning_rate": 9.107647713725287e-06, "loss": 0.4895, "step": 1826 }, { "epoch": 0.20132231404958678, "grad_norm": 6.2642388343811035, "learning_rate": 9.10665056302451e-06, "loss": 0.4239, "step": 1827 }, { "epoch": 0.20143250688705233, "grad_norm": 10.329350471496582, "learning_rate": 9.105652910156924e-06, "loss": 0.4563, "step": 1828 }, { "epoch": 0.2015426997245179, "grad_norm": 7.78670597076416, "learning_rate": 9.104654755244524e-06, "loss": 0.396, "step": 1829 }, { "epoch": 0.20165289256198346, "grad_norm": 9.769115447998047, "learning_rate": 9.103656098409364e-06, "loss": 0.4818, "step": 1830 }, { "epoch": 0.20176308539944904, "grad_norm": 5.7953009605407715, "learning_rate": 9.102656939773561e-06, "loss": 0.481, "step": 1831 }, { "epoch": 0.2018732782369146, "grad_norm": 6.397300720214844, "learning_rate": 9.101657279459297e-06, "loss": 0.4416, "step": 1832 }, { "epoch": 0.20198347107438017, "grad_norm": 6.988760948181152, "learning_rate": 9.10065711758881e-06, "loss": 0.4666, "step": 1833 }, { "epoch": 0.20209366391184572, "grad_norm": 5.883671760559082, "learning_rate": 9.099656454284396e-06, "loss": 0.4417, "step": 1834 }, { "epoch": 0.2022038567493113, "grad_norm": 7.063892841339111, "learning_rate": 9.098655289668426e-06, "loss": 0.4467, "step": 1835 }, { "epoch": 0.20231404958677687, "grad_norm": 5.584319591522217, "learning_rate": 9.097653623863319e-06, "loss": 0.4041, "step": 1836 }, { "epoch": 0.20242424242424242, "grad_norm": 8.371783256530762, "learning_rate": 9.09665145699156e-06, "loss": 0.5629, "step": 1837 }, { "epoch": 0.202534435261708, "grad_norm": 4.433701992034912, "learning_rate": 9.095648789175695e-06, "loss": 0.4506, "step": 1838 }, { "epoch": 0.20264462809917355, "grad_norm": 6.148390293121338, "learning_rate": 9.094645620538334e-06, "loss": 0.5054, "step": 1839 }, { "epoch": 0.20275482093663913, "grad_norm": 5.67417573928833, "learning_rate": 9.093641951202143e-06, "loss": 0.393, "step": 1840 }, { "epoch": 0.20286501377410468, "grad_norm": 12.673815727233887, "learning_rate": 9.092637781289856e-06, "loss": 0.4929, "step": 1841 }, { "epoch": 0.20297520661157026, "grad_norm": 5.681674003601074, "learning_rate": 9.09163311092426e-06, "loss": 0.3919, "step": 1842 }, { "epoch": 0.2030853994490358, "grad_norm": 5.172900676727295, "learning_rate": 9.090627940228211e-06, "loss": 0.502, "step": 1843 }, { "epoch": 0.2031955922865014, "grad_norm": 14.66736888885498, "learning_rate": 9.089622269324619e-06, "loss": 0.4077, "step": 1844 }, { "epoch": 0.20330578512396694, "grad_norm": 8.680099487304688, "learning_rate": 9.088616098336461e-06, "loss": 0.4933, "step": 1845 }, { "epoch": 0.20341597796143251, "grad_norm": 6.403596878051758, "learning_rate": 9.087609427386774e-06, "loss": 0.4898, "step": 1846 }, { "epoch": 0.20352617079889807, "grad_norm": 15.408370018005371, "learning_rate": 9.086602256598654e-06, "loss": 0.5043, "step": 1847 }, { "epoch": 0.20363636363636364, "grad_norm": 9.622623443603516, "learning_rate": 9.085594586095256e-06, "loss": 0.4478, "step": 1848 }, { "epoch": 0.2037465564738292, "grad_norm": 6.941022872924805, "learning_rate": 9.084586415999804e-06, "loss": 0.4172, "step": 1849 }, { "epoch": 0.20385674931129477, "grad_norm": 5.249694347381592, "learning_rate": 9.083577746435577e-06, "loss": 0.4757, "step": 1850 }, { "epoch": 0.20396694214876032, "grad_norm": 8.428592681884766, "learning_rate": 9.082568577525916e-06, "loss": 0.4937, "step": 1851 }, { "epoch": 0.2040771349862259, "grad_norm": 9.229290008544922, "learning_rate": 9.081558909394223e-06, "loss": 0.4867, "step": 1852 }, { "epoch": 0.20418732782369145, "grad_norm": 6.427026748657227, "learning_rate": 9.080548742163963e-06, "loss": 0.4162, "step": 1853 }, { "epoch": 0.20429752066115703, "grad_norm": 5.985427379608154, "learning_rate": 9.079538075958661e-06, "loss": 0.456, "step": 1854 }, { "epoch": 0.20440771349862258, "grad_norm": 6.108369827270508, "learning_rate": 9.0785269109019e-06, "loss": 0.3918, "step": 1855 }, { "epoch": 0.20451790633608816, "grad_norm": 10.243864059448242, "learning_rate": 9.077515247117329e-06, "loss": 0.3983, "step": 1856 }, { "epoch": 0.2046280991735537, "grad_norm": 7.947391033172607, "learning_rate": 9.076503084728655e-06, "loss": 0.4175, "step": 1857 }, { "epoch": 0.20473829201101928, "grad_norm": 6.859309196472168, "learning_rate": 9.075490423859645e-06, "loss": 0.3996, "step": 1858 }, { "epoch": 0.20484848484848484, "grad_norm": 9.099756240844727, "learning_rate": 9.074477264634131e-06, "loss": 0.4904, "step": 1859 }, { "epoch": 0.2049586776859504, "grad_norm": 8.364721298217773, "learning_rate": 9.073463607176003e-06, "loss": 0.3858, "step": 1860 }, { "epoch": 0.205068870523416, "grad_norm": 6.850193977355957, "learning_rate": 9.072449451609211e-06, "loss": 0.4272, "step": 1861 }, { "epoch": 0.20517906336088154, "grad_norm": 9.688082695007324, "learning_rate": 9.071434798057767e-06, "loss": 0.4758, "step": 1862 }, { "epoch": 0.20528925619834712, "grad_norm": 7.483328819274902, "learning_rate": 9.070419646645747e-06, "loss": 0.4202, "step": 1863 }, { "epoch": 0.20539944903581267, "grad_norm": 7.1225199699401855, "learning_rate": 9.069403997497283e-06, "loss": 0.4982, "step": 1864 }, { "epoch": 0.20550964187327825, "grad_norm": 11.751843452453613, "learning_rate": 9.068387850736572e-06, "loss": 0.4909, "step": 1865 }, { "epoch": 0.2056198347107438, "grad_norm": 15.210118293762207, "learning_rate": 9.067371206487867e-06, "loss": 0.477, "step": 1866 }, { "epoch": 0.20573002754820938, "grad_norm": 9.114174842834473, "learning_rate": 9.066354064875486e-06, "loss": 0.4353, "step": 1867 }, { "epoch": 0.20584022038567493, "grad_norm": 9.445465087890625, "learning_rate": 9.065336426023806e-06, "loss": 0.435, "step": 1868 }, { "epoch": 0.2059504132231405, "grad_norm": 10.78199291229248, "learning_rate": 9.064318290057266e-06, "loss": 0.458, "step": 1869 }, { "epoch": 0.20606060606060606, "grad_norm": 6.4792656898498535, "learning_rate": 9.063299657100363e-06, "loss": 0.4008, "step": 1870 }, { "epoch": 0.20617079889807163, "grad_norm": 9.852862358093262, "learning_rate": 9.06228052727766e-06, "loss": 0.4086, "step": 1871 }, { "epoch": 0.20628099173553718, "grad_norm": 5.965283393859863, "learning_rate": 9.061260900713777e-06, "loss": 0.4791, "step": 1872 }, { "epoch": 0.20639118457300276, "grad_norm": 6.138115882873535, "learning_rate": 9.060240777533394e-06, "loss": 0.4356, "step": 1873 }, { "epoch": 0.2065013774104683, "grad_norm": 5.7559614181518555, "learning_rate": 9.059220157861252e-06, "loss": 0.4752, "step": 1874 }, { "epoch": 0.2066115702479339, "grad_norm": 4.700835704803467, "learning_rate": 9.058199041822155e-06, "loss": 0.4183, "step": 1875 }, { "epoch": 0.20672176308539944, "grad_norm": 6.3946075439453125, "learning_rate": 9.057177429540969e-06, "loss": 0.4635, "step": 1876 }, { "epoch": 0.20683195592286502, "grad_norm": 5.3536458015441895, "learning_rate": 9.056155321142615e-06, "loss": 0.3633, "step": 1877 }, { "epoch": 0.20694214876033057, "grad_norm": 9.934891700744629, "learning_rate": 9.055132716752077e-06, "loss": 0.5228, "step": 1878 }, { "epoch": 0.20705234159779615, "grad_norm": 7.625913619995117, "learning_rate": 9.054109616494403e-06, "loss": 0.4418, "step": 1879 }, { "epoch": 0.2071625344352617, "grad_norm": 6.623529434204102, "learning_rate": 9.053086020494697e-06, "loss": 0.3677, "step": 1880 }, { "epoch": 0.20727272727272728, "grad_norm": 4.113265037536621, "learning_rate": 9.052061928878128e-06, "loss": 0.478, "step": 1881 }, { "epoch": 0.20738292011019283, "grad_norm": 8.305842399597168, "learning_rate": 9.051037341769923e-06, "loss": 0.4773, "step": 1882 }, { "epoch": 0.2074931129476584, "grad_norm": 6.318554878234863, "learning_rate": 9.050012259295368e-06, "loss": 0.3304, "step": 1883 }, { "epoch": 0.20760330578512395, "grad_norm": 11.366905212402344, "learning_rate": 9.048986681579814e-06, "loss": 0.4307, "step": 1884 }, { "epoch": 0.20771349862258953, "grad_norm": 11.362711906433105, "learning_rate": 9.047960608748667e-06, "loss": 0.5466, "step": 1885 }, { "epoch": 0.2078236914600551, "grad_norm": 10.7440824508667, "learning_rate": 9.046934040927398e-06, "loss": 0.493, "step": 1886 }, { "epoch": 0.20793388429752066, "grad_norm": 7.004652500152588, "learning_rate": 9.045906978241538e-06, "loss": 0.3823, "step": 1887 }, { "epoch": 0.20804407713498624, "grad_norm": 7.787408828735352, "learning_rate": 9.044879420816676e-06, "loss": 0.4317, "step": 1888 }, { "epoch": 0.2081542699724518, "grad_norm": 10.921992301940918, "learning_rate": 9.043851368778464e-06, "loss": 0.4119, "step": 1889 }, { "epoch": 0.20826446280991737, "grad_norm": 6.97713565826416, "learning_rate": 9.042822822252615e-06, "loss": 0.4869, "step": 1890 }, { "epoch": 0.20837465564738292, "grad_norm": 5.4596428871154785, "learning_rate": 9.041793781364898e-06, "loss": 0.4907, "step": 1891 }, { "epoch": 0.2084848484848485, "grad_norm": 7.856202602386475, "learning_rate": 9.040764246241148e-06, "loss": 0.48, "step": 1892 }, { "epoch": 0.20859504132231405, "grad_norm": 6.265694618225098, "learning_rate": 9.039734217007258e-06, "loss": 0.4903, "step": 1893 }, { "epoch": 0.20870523415977962, "grad_norm": 8.078190803527832, "learning_rate": 9.03870369378918e-06, "loss": 0.3312, "step": 1894 }, { "epoch": 0.20881542699724517, "grad_norm": 15.330924987792969, "learning_rate": 9.037672676712928e-06, "loss": 0.3968, "step": 1895 }, { "epoch": 0.20892561983471075, "grad_norm": 12.826431274414062, "learning_rate": 9.036641165904575e-06, "loss": 0.5816, "step": 1896 }, { "epoch": 0.2090358126721763, "grad_norm": 8.1292142868042, "learning_rate": 9.035609161490258e-06, "loss": 0.3688, "step": 1897 }, { "epoch": 0.20914600550964188, "grad_norm": 14.929841995239258, "learning_rate": 9.034576663596171e-06, "loss": 0.5262, "step": 1898 }, { "epoch": 0.20925619834710743, "grad_norm": 7.926431655883789, "learning_rate": 9.03354367234857e-06, "loss": 0.4719, "step": 1899 }, { "epoch": 0.209366391184573, "grad_norm": 7.180224418640137, "learning_rate": 9.032510187873769e-06, "loss": 0.4552, "step": 1900 }, { "epoch": 0.20947658402203856, "grad_norm": 8.119571685791016, "learning_rate": 9.031476210298144e-06, "loss": 0.4513, "step": 1901 }, { "epoch": 0.20958677685950414, "grad_norm": 6.797088146209717, "learning_rate": 9.030441739748133e-06, "loss": 0.4122, "step": 1902 }, { "epoch": 0.2096969696969697, "grad_norm": 15.355571746826172, "learning_rate": 9.029406776350232e-06, "loss": 0.4267, "step": 1903 }, { "epoch": 0.20980716253443527, "grad_norm": 9.266566276550293, "learning_rate": 9.028371320230996e-06, "loss": 0.4733, "step": 1904 }, { "epoch": 0.20991735537190082, "grad_norm": 10.276440620422363, "learning_rate": 9.027335371517041e-06, "loss": 0.4745, "step": 1905 }, { "epoch": 0.2100275482093664, "grad_norm": 7.7341485023498535, "learning_rate": 9.02629893033505e-06, "loss": 0.4525, "step": 1906 }, { "epoch": 0.21013774104683194, "grad_norm": 7.565461158752441, "learning_rate": 9.025261996811752e-06, "loss": 0.5141, "step": 1907 }, { "epoch": 0.21024793388429752, "grad_norm": 10.702898025512695, "learning_rate": 9.024224571073953e-06, "loss": 0.4816, "step": 1908 }, { "epoch": 0.21035812672176307, "grad_norm": 4.73010778427124, "learning_rate": 9.023186653248506e-06, "loss": 0.426, "step": 1909 }, { "epoch": 0.21046831955922865, "grad_norm": 8.425341606140137, "learning_rate": 9.02214824346233e-06, "loss": 0.5173, "step": 1910 }, { "epoch": 0.21057851239669423, "grad_norm": 6.806413173675537, "learning_rate": 9.021109341842403e-06, "loss": 0.431, "step": 1911 }, { "epoch": 0.21068870523415978, "grad_norm": 8.63928508758545, "learning_rate": 9.020069948515764e-06, "loss": 0.4368, "step": 1912 }, { "epoch": 0.21079889807162536, "grad_norm": 9.580857276916504, "learning_rate": 9.01903006360951e-06, "loss": 0.3569, "step": 1913 }, { "epoch": 0.2109090909090909, "grad_norm": 6.788583755493164, "learning_rate": 9.0179896872508e-06, "loss": 0.4989, "step": 1914 }, { "epoch": 0.21101928374655649, "grad_norm": 7.399003982543945, "learning_rate": 9.016948819566855e-06, "loss": 0.4294, "step": 1915 }, { "epoch": 0.21112947658402204, "grad_norm": 9.5567045211792, "learning_rate": 9.01590746068495e-06, "loss": 0.4322, "step": 1916 }, { "epoch": 0.2112396694214876, "grad_norm": 4.259730339050293, "learning_rate": 9.014865610732429e-06, "loss": 0.3784, "step": 1917 }, { "epoch": 0.21134986225895316, "grad_norm": 8.83730697631836, "learning_rate": 9.013823269836683e-06, "loss": 0.4126, "step": 1918 }, { "epoch": 0.21146005509641874, "grad_norm": 6.528851509094238, "learning_rate": 9.012780438125178e-06, "loss": 0.4805, "step": 1919 }, { "epoch": 0.2115702479338843, "grad_norm": 15.830177307128906, "learning_rate": 9.01173711572543e-06, "loss": 0.45, "step": 1920 }, { "epoch": 0.21168044077134987, "grad_norm": 7.5151872634887695, "learning_rate": 9.010693302765018e-06, "loss": 0.4472, "step": 1921 }, { "epoch": 0.21179063360881542, "grad_norm": 7.604508399963379, "learning_rate": 9.009648999371581e-06, "loss": 0.489, "step": 1922 }, { "epoch": 0.211900826446281, "grad_norm": 5.260583400726318, "learning_rate": 9.008604205672818e-06, "loss": 0.4191, "step": 1923 }, { "epoch": 0.21201101928374655, "grad_norm": 9.22535228729248, "learning_rate": 9.007558921796487e-06, "loss": 0.3623, "step": 1924 }, { "epoch": 0.21212121212121213, "grad_norm": 11.922446250915527, "learning_rate": 9.006513147870406e-06, "loss": 0.4881, "step": 1925 }, { "epoch": 0.21223140495867768, "grad_norm": 6.315326690673828, "learning_rate": 9.005466884022457e-06, "loss": 0.488, "step": 1926 }, { "epoch": 0.21234159779614326, "grad_norm": 9.142075538635254, "learning_rate": 9.004420130380576e-06, "loss": 0.4033, "step": 1927 }, { "epoch": 0.2124517906336088, "grad_norm": 7.655306339263916, "learning_rate": 9.003372887072761e-06, "loss": 0.3743, "step": 1928 }, { "epoch": 0.21256198347107438, "grad_norm": 4.443755149841309, "learning_rate": 9.002325154227073e-06, "loss": 0.3184, "step": 1929 }, { "epoch": 0.21267217630853993, "grad_norm": 7.230881690979004, "learning_rate": 9.001276931971628e-06, "loss": 0.388, "step": 1930 }, { "epoch": 0.2127823691460055, "grad_norm": 9.036759376525879, "learning_rate": 9.000228220434604e-06, "loss": 0.4088, "step": 1931 }, { "epoch": 0.21289256198347106, "grad_norm": 8.434646606445312, "learning_rate": 8.999179019744239e-06, "loss": 0.4923, "step": 1932 }, { "epoch": 0.21300275482093664, "grad_norm": 9.558442115783691, "learning_rate": 8.998129330028833e-06, "loss": 0.4631, "step": 1933 }, { "epoch": 0.2131129476584022, "grad_norm": 7.401463031768799, "learning_rate": 8.99707915141674e-06, "loss": 0.3914, "step": 1934 }, { "epoch": 0.21322314049586777, "grad_norm": 7.793752670288086, "learning_rate": 8.99602848403638e-06, "loss": 0.3831, "step": 1935 }, { "epoch": 0.21333333333333335, "grad_norm": 9.429425239562988, "learning_rate": 8.994977328016226e-06, "loss": 0.4538, "step": 1936 }, { "epoch": 0.2134435261707989, "grad_norm": 8.313745498657227, "learning_rate": 8.993925683484821e-06, "loss": 0.3812, "step": 1937 }, { "epoch": 0.21355371900826448, "grad_norm": 11.815062522888184, "learning_rate": 8.992873550570758e-06, "loss": 0.5145, "step": 1938 }, { "epoch": 0.21366391184573003, "grad_norm": 10.813628196716309, "learning_rate": 8.991820929402692e-06, "loss": 0.5057, "step": 1939 }, { "epoch": 0.2137741046831956, "grad_norm": 11.73471736907959, "learning_rate": 8.990767820109341e-06, "loss": 0.5166, "step": 1940 }, { "epoch": 0.21388429752066115, "grad_norm": 12.178422927856445, "learning_rate": 8.989714222819479e-06, "loss": 0.5423, "step": 1941 }, { "epoch": 0.21399449035812673, "grad_norm": 6.593087673187256, "learning_rate": 8.988660137661942e-06, "loss": 0.5057, "step": 1942 }, { "epoch": 0.21410468319559228, "grad_norm": 5.574822902679443, "learning_rate": 8.987605564765628e-06, "loss": 0.4922, "step": 1943 }, { "epoch": 0.21421487603305786, "grad_norm": 10.696877479553223, "learning_rate": 8.986550504259487e-06, "loss": 0.4365, "step": 1944 }, { "epoch": 0.2143250688705234, "grad_norm": 10.210994720458984, "learning_rate": 8.985494956272536e-06, "loss": 0.508, "step": 1945 }, { "epoch": 0.214435261707989, "grad_norm": 6.919703483581543, "learning_rate": 8.984438920933847e-06, "loss": 0.443, "step": 1946 }, { "epoch": 0.21454545454545454, "grad_norm": 14.901093482971191, "learning_rate": 8.983382398372555e-06, "loss": 0.4736, "step": 1947 }, { "epoch": 0.21465564738292012, "grad_norm": 9.82280158996582, "learning_rate": 8.982325388717853e-06, "loss": 0.3837, "step": 1948 }, { "epoch": 0.21476584022038567, "grad_norm": 4.7476301193237305, "learning_rate": 8.981267892098993e-06, "loss": 0.3799, "step": 1949 }, { "epoch": 0.21487603305785125, "grad_norm": 5.6726555824279785, "learning_rate": 8.980209908645286e-06, "loss": 0.4045, "step": 1950 }, { "epoch": 0.2149862258953168, "grad_norm": 7.372026443481445, "learning_rate": 8.979151438486105e-06, "loss": 0.419, "step": 1951 }, { "epoch": 0.21509641873278237, "grad_norm": 14.011527061462402, "learning_rate": 8.978092481750883e-06, "loss": 0.4625, "step": 1952 }, { "epoch": 0.21520661157024792, "grad_norm": 8.68128490447998, "learning_rate": 8.977033038569106e-06, "loss": 0.4081, "step": 1953 }, { "epoch": 0.2153168044077135, "grad_norm": 4.661523342132568, "learning_rate": 8.975973109070328e-06, "loss": 0.3807, "step": 1954 }, { "epoch": 0.21542699724517905, "grad_norm": 8.467016220092773, "learning_rate": 8.974912693384156e-06, "loss": 0.4912, "step": 1955 }, { "epoch": 0.21553719008264463, "grad_norm": 10.955033302307129, "learning_rate": 8.973851791640262e-06, "loss": 0.5152, "step": 1956 }, { "epoch": 0.21564738292011018, "grad_norm": 12.558293342590332, "learning_rate": 8.972790403968374e-06, "loss": 0.5096, "step": 1957 }, { "epoch": 0.21575757575757576, "grad_norm": 6.917547225952148, "learning_rate": 8.971728530498276e-06, "loss": 0.4213, "step": 1958 }, { "epoch": 0.2158677685950413, "grad_norm": 13.767768859863281, "learning_rate": 8.970666171359821e-06, "loss": 0.4437, "step": 1959 }, { "epoch": 0.2159779614325069, "grad_norm": 7.672903060913086, "learning_rate": 8.969603326682911e-06, "loss": 0.5086, "step": 1960 }, { "epoch": 0.21608815426997247, "grad_norm": 7.62104606628418, "learning_rate": 8.968539996597514e-06, "loss": 0.4778, "step": 1961 }, { "epoch": 0.21619834710743802, "grad_norm": 9.937533378601074, "learning_rate": 8.967476181233656e-06, "loss": 0.5214, "step": 1962 }, { "epoch": 0.2163085399449036, "grad_norm": 15.488443374633789, "learning_rate": 8.966411880721422e-06, "loss": 0.6193, "step": 1963 }, { "epoch": 0.21641873278236914, "grad_norm": 6.0433735847473145, "learning_rate": 8.965347095190956e-06, "loss": 0.4644, "step": 1964 }, { "epoch": 0.21652892561983472, "grad_norm": 9.517822265625, "learning_rate": 8.964281824772458e-06, "loss": 0.3872, "step": 1965 }, { "epoch": 0.21663911845730027, "grad_norm": 9.647416114807129, "learning_rate": 8.963216069596197e-06, "loss": 0.5504, "step": 1966 }, { "epoch": 0.21674931129476585, "grad_norm": 7.2650532722473145, "learning_rate": 8.962149829792489e-06, "loss": 0.3576, "step": 1967 }, { "epoch": 0.2168595041322314, "grad_norm": 6.231147289276123, "learning_rate": 8.961083105491718e-06, "loss": 0.4159, "step": 1968 }, { "epoch": 0.21696969696969698, "grad_norm": 7.0674567222595215, "learning_rate": 8.960015896824324e-06, "loss": 0.4666, "step": 1969 }, { "epoch": 0.21707988980716253, "grad_norm": 9.527992248535156, "learning_rate": 8.958948203920808e-06, "loss": 0.3766, "step": 1970 }, { "epoch": 0.2171900826446281, "grad_norm": 9.872891426086426, "learning_rate": 8.957880026911727e-06, "loss": 0.4628, "step": 1971 }, { "epoch": 0.21730027548209366, "grad_norm": 8.264708518981934, "learning_rate": 8.956811365927702e-06, "loss": 0.3907, "step": 1972 }, { "epoch": 0.21741046831955924, "grad_norm": 4.303475379943848, "learning_rate": 8.955742221099405e-06, "loss": 0.3753, "step": 1973 }, { "epoch": 0.21752066115702479, "grad_norm": 7.1407318115234375, "learning_rate": 8.954672592557578e-06, "loss": 0.4638, "step": 1974 }, { "epoch": 0.21763085399449036, "grad_norm": 6.071427822113037, "learning_rate": 8.953602480433016e-06, "loss": 0.4365, "step": 1975 }, { "epoch": 0.21774104683195591, "grad_norm": 4.606165409088135, "learning_rate": 8.95253188485657e-06, "loss": 0.4439, "step": 1976 }, { "epoch": 0.2178512396694215, "grad_norm": 8.184374809265137, "learning_rate": 8.951460805959159e-06, "loss": 0.5013, "step": 1977 }, { "epoch": 0.21796143250688704, "grad_norm": 4.882009983062744, "learning_rate": 8.95038924387175e-06, "loss": 0.3109, "step": 1978 }, { "epoch": 0.21807162534435262, "grad_norm": 10.992392539978027, "learning_rate": 8.949317198725379e-06, "loss": 0.5092, "step": 1979 }, { "epoch": 0.21818181818181817, "grad_norm": 11.858559608459473, "learning_rate": 8.948244670651137e-06, "loss": 0.5491, "step": 1980 }, { "epoch": 0.21829201101928375, "grad_norm": 5.792891502380371, "learning_rate": 8.947171659780172e-06, "loss": 0.4797, "step": 1981 }, { "epoch": 0.2184022038567493, "grad_norm": 8.452299118041992, "learning_rate": 8.946098166243696e-06, "loss": 0.4986, "step": 1982 }, { "epoch": 0.21851239669421488, "grad_norm": 4.906167507171631, "learning_rate": 8.945024190172975e-06, "loss": 0.4088, "step": 1983 }, { "epoch": 0.21862258953168043, "grad_norm": 7.811947345733643, "learning_rate": 8.943949731699337e-06, "loss": 0.4175, "step": 1984 }, { "epoch": 0.218732782369146, "grad_norm": 6.3078083992004395, "learning_rate": 8.94287479095417e-06, "loss": 0.4038, "step": 1985 }, { "epoch": 0.21884297520661158, "grad_norm": 8.789570808410645, "learning_rate": 8.941799368068916e-06, "loss": 0.469, "step": 1986 }, { "epoch": 0.21895316804407713, "grad_norm": 7.786570072174072, "learning_rate": 8.940723463175083e-06, "loss": 0.4931, "step": 1987 }, { "epoch": 0.2190633608815427, "grad_norm": 6.052577972412109, "learning_rate": 8.93964707640423e-06, "loss": 0.4266, "step": 1988 }, { "epoch": 0.21917355371900826, "grad_norm": 11.581875801086426, "learning_rate": 8.938570207887981e-06, "loss": 0.4935, "step": 1989 }, { "epoch": 0.21928374655647384, "grad_norm": 7.972194671630859, "learning_rate": 8.93749285775802e-06, "loss": 0.5139, "step": 1990 }, { "epoch": 0.2193939393939394, "grad_norm": 6.227068901062012, "learning_rate": 8.93641502614608e-06, "loss": 0.4265, "step": 1991 }, { "epoch": 0.21950413223140497, "grad_norm": 8.917132377624512, "learning_rate": 8.935336713183965e-06, "loss": 0.4938, "step": 1992 }, { "epoch": 0.21961432506887052, "grad_norm": 11.940751075744629, "learning_rate": 8.934257919003532e-06, "loss": 0.5562, "step": 1993 }, { "epoch": 0.2197245179063361, "grad_norm": 7.603494644165039, "learning_rate": 8.933178643736696e-06, "loss": 0.4801, "step": 1994 }, { "epoch": 0.21983471074380165, "grad_norm": 7.002358913421631, "learning_rate": 8.932098887515432e-06, "loss": 0.4775, "step": 1995 }, { "epoch": 0.21994490358126723, "grad_norm": 7.376613616943359, "learning_rate": 8.931018650471775e-06, "loss": 0.4974, "step": 1996 }, { "epoch": 0.22005509641873278, "grad_norm": 6.262222766876221, "learning_rate": 8.929937932737818e-06, "loss": 0.4584, "step": 1997 }, { "epoch": 0.22016528925619835, "grad_norm": 3.9996824264526367, "learning_rate": 8.928856734445712e-06, "loss": 0.4186, "step": 1998 }, { "epoch": 0.2202754820936639, "grad_norm": 7.933164119720459, "learning_rate": 8.927775055727668e-06, "loss": 0.4332, "step": 1999 }, { "epoch": 0.22038567493112948, "grad_norm": 7.3600382804870605, "learning_rate": 8.926692896715955e-06, "loss": 0.5018, "step": 2000 }, { "epoch": 0.22049586776859503, "grad_norm": 6.7328410148620605, "learning_rate": 8.9256102575429e-06, "loss": 0.421, "step": 2001 }, { "epoch": 0.2206060606060606, "grad_norm": 10.002666473388672, "learning_rate": 8.92452713834089e-06, "loss": 0.4698, "step": 2002 }, { "epoch": 0.22071625344352616, "grad_norm": 6.482827186584473, "learning_rate": 8.923443539242371e-06, "loss": 0.4616, "step": 2003 }, { "epoch": 0.22082644628099174, "grad_norm": 12.589330673217773, "learning_rate": 8.922359460379848e-06, "loss": 0.4643, "step": 2004 }, { "epoch": 0.2209366391184573, "grad_norm": 5.59968376159668, "learning_rate": 8.92127490188588e-06, "loss": 0.4637, "step": 2005 }, { "epoch": 0.22104683195592287, "grad_norm": 4.8191609382629395, "learning_rate": 8.920189863893092e-06, "loss": 0.385, "step": 2006 }, { "epoch": 0.22115702479338842, "grad_norm": 8.757711410522461, "learning_rate": 8.919104346534162e-06, "loss": 0.4544, "step": 2007 }, { "epoch": 0.221267217630854, "grad_norm": 8.402291297912598, "learning_rate": 8.918018349941829e-06, "loss": 0.4519, "step": 2008 }, { "epoch": 0.22137741046831955, "grad_norm": 17.948301315307617, "learning_rate": 8.916931874248889e-06, "loss": 0.4922, "step": 2009 }, { "epoch": 0.22148760330578512, "grad_norm": 8.477241516113281, "learning_rate": 8.9158449195882e-06, "loss": 0.4376, "step": 2010 }, { "epoch": 0.22159779614325067, "grad_norm": 10.58144760131836, "learning_rate": 8.914757486092676e-06, "loss": 0.4337, "step": 2011 }, { "epoch": 0.22170798898071625, "grad_norm": 5.602427005767822, "learning_rate": 8.913669573895285e-06, "loss": 0.467, "step": 2012 }, { "epoch": 0.22181818181818183, "grad_norm": 7.449388027191162, "learning_rate": 8.912581183129067e-06, "loss": 0.4084, "step": 2013 }, { "epoch": 0.22192837465564738, "grad_norm": 8.887518882751465, "learning_rate": 8.911492313927104e-06, "loss": 0.4231, "step": 2014 }, { "epoch": 0.22203856749311296, "grad_norm": 7.231354713439941, "learning_rate": 8.910402966422549e-06, "loss": 0.3867, "step": 2015 }, { "epoch": 0.2221487603305785, "grad_norm": 10.660465240478516, "learning_rate": 8.909313140748607e-06, "loss": 0.4835, "step": 2016 }, { "epoch": 0.2222589531680441, "grad_norm": 11.831543922424316, "learning_rate": 8.908222837038545e-06, "loss": 0.3749, "step": 2017 }, { "epoch": 0.22236914600550964, "grad_norm": 7.562001705169678, "learning_rate": 8.907132055425685e-06, "loss": 0.5091, "step": 2018 }, { "epoch": 0.22247933884297522, "grad_norm": 5.9218597412109375, "learning_rate": 8.906040796043409e-06, "loss": 0.403, "step": 2019 }, { "epoch": 0.22258953168044077, "grad_norm": 7.158297538757324, "learning_rate": 8.904949059025158e-06, "loss": 0.4251, "step": 2020 }, { "epoch": 0.22269972451790634, "grad_norm": 7.283941268920898, "learning_rate": 8.903856844504435e-06, "loss": 0.4382, "step": 2021 }, { "epoch": 0.2228099173553719, "grad_norm": 14.837738037109375, "learning_rate": 8.902764152614792e-06, "loss": 0.5329, "step": 2022 }, { "epoch": 0.22292011019283747, "grad_norm": 7.059011459350586, "learning_rate": 8.901670983489848e-06, "loss": 0.4003, "step": 2023 }, { "epoch": 0.22303030303030302, "grad_norm": 17.28598976135254, "learning_rate": 8.900577337263274e-06, "loss": 0.4001, "step": 2024 }, { "epoch": 0.2231404958677686, "grad_norm": 6.224469184875488, "learning_rate": 8.899483214068807e-06, "loss": 0.4522, "step": 2025 }, { "epoch": 0.22325068870523415, "grad_norm": 7.4331374168396, "learning_rate": 8.898388614040235e-06, "loss": 0.4645, "step": 2026 }, { "epoch": 0.22336088154269973, "grad_norm": 9.30374813079834, "learning_rate": 8.897293537311408e-06, "loss": 0.4421, "step": 2027 }, { "epoch": 0.22347107438016528, "grad_norm": 11.287163734436035, "learning_rate": 8.896197984016233e-06, "loss": 0.4098, "step": 2028 }, { "epoch": 0.22358126721763086, "grad_norm": 9.276102066040039, "learning_rate": 8.895101954288675e-06, "loss": 0.463, "step": 2029 }, { "epoch": 0.2236914600550964, "grad_norm": 9.553190231323242, "learning_rate": 8.894005448262762e-06, "loss": 0.4344, "step": 2030 }, { "epoch": 0.223801652892562, "grad_norm": 4.667308807373047, "learning_rate": 8.89290846607257e-06, "loss": 0.3673, "step": 2031 }, { "epoch": 0.22391184573002754, "grad_norm": 8.357771873474121, "learning_rate": 8.891811007852245e-06, "loss": 0.4999, "step": 2032 }, { "epoch": 0.22402203856749311, "grad_norm": 6.221590995788574, "learning_rate": 8.890713073735983e-06, "loss": 0.3871, "step": 2033 }, { "epoch": 0.22413223140495867, "grad_norm": 5.789371967315674, "learning_rate": 8.889614663858041e-06, "loss": 0.4755, "step": 2034 }, { "epoch": 0.22424242424242424, "grad_norm": 12.528146743774414, "learning_rate": 8.888515778352735e-06, "loss": 0.4535, "step": 2035 }, { "epoch": 0.2243526170798898, "grad_norm": 8.438453674316406, "learning_rate": 8.887416417354437e-06, "loss": 0.5561, "step": 2036 }, { "epoch": 0.22446280991735537, "grad_norm": 10.135050773620605, "learning_rate": 8.886316580997578e-06, "loss": 0.448, "step": 2037 }, { "epoch": 0.22457300275482095, "grad_norm": 5.991178512573242, "learning_rate": 8.88521626941665e-06, "loss": 0.4274, "step": 2038 }, { "epoch": 0.2246831955922865, "grad_norm": 7.5936055183410645, "learning_rate": 8.884115482746199e-06, "loss": 0.4437, "step": 2039 }, { "epoch": 0.22479338842975208, "grad_norm": 6.577114105224609, "learning_rate": 8.883014221120829e-06, "loss": 0.446, "step": 2040 }, { "epoch": 0.22490358126721763, "grad_norm": 12.858460426330566, "learning_rate": 8.881912484675207e-06, "loss": 0.5337, "step": 2041 }, { "epoch": 0.2250137741046832, "grad_norm": 5.964365005493164, "learning_rate": 8.880810273544053e-06, "loss": 0.4245, "step": 2042 }, { "epoch": 0.22512396694214876, "grad_norm": 6.543353080749512, "learning_rate": 8.879707587862148e-06, "loss": 0.4069, "step": 2043 }, { "epoch": 0.22523415977961433, "grad_norm": 7.215315341949463, "learning_rate": 8.878604427764326e-06, "loss": 0.4115, "step": 2044 }, { "epoch": 0.22534435261707988, "grad_norm": 6.376750469207764, "learning_rate": 8.87750079338549e-06, "loss": 0.3905, "step": 2045 }, { "epoch": 0.22545454545454546, "grad_norm": 10.623973846435547, "learning_rate": 8.876396684860588e-06, "loss": 0.4954, "step": 2046 }, { "epoch": 0.225564738292011, "grad_norm": 9.294370651245117, "learning_rate": 8.875292102324634e-06, "loss": 0.5065, "step": 2047 }, { "epoch": 0.2256749311294766, "grad_norm": 7.6773271560668945, "learning_rate": 8.874187045912697e-06, "loss": 0.4181, "step": 2048 }, { "epoch": 0.22578512396694214, "grad_norm": 8.934910774230957, "learning_rate": 8.873081515759908e-06, "loss": 0.4274, "step": 2049 }, { "epoch": 0.22589531680440772, "grad_norm": 5.578096866607666, "learning_rate": 8.871975512001448e-06, "loss": 0.387, "step": 2050 }, { "epoch": 0.22600550964187327, "grad_norm": 7.378517150878906, "learning_rate": 8.870869034772563e-06, "loss": 0.3857, "step": 2051 }, { "epoch": 0.22611570247933885, "grad_norm": 8.191890716552734, "learning_rate": 8.869762084208553e-06, "loss": 0.4835, "step": 2052 }, { "epoch": 0.2262258953168044, "grad_norm": 12.504240989685059, "learning_rate": 8.86865466044478e-06, "loss": 0.5257, "step": 2053 }, { "epoch": 0.22633608815426998, "grad_norm": 8.958276748657227, "learning_rate": 8.867546763616662e-06, "loss": 0.4435, "step": 2054 }, { "epoch": 0.22644628099173553, "grad_norm": 15.308467864990234, "learning_rate": 8.86643839385967e-06, "loss": 0.6179, "step": 2055 }, { "epoch": 0.2265564738292011, "grad_norm": 7.645038604736328, "learning_rate": 8.865329551309338e-06, "loss": 0.4695, "step": 2056 }, { "epoch": 0.22666666666666666, "grad_norm": 7.3347368240356445, "learning_rate": 8.864220236101261e-06, "loss": 0.3826, "step": 2057 }, { "epoch": 0.22677685950413223, "grad_norm": 7.0317769050598145, "learning_rate": 8.863110448371082e-06, "loss": 0.3772, "step": 2058 }, { "epoch": 0.22688705234159778, "grad_norm": 7.45179557800293, "learning_rate": 8.862000188254512e-06, "loss": 0.5059, "step": 2059 }, { "epoch": 0.22699724517906336, "grad_norm": 3.892314910888672, "learning_rate": 8.860889455887312e-06, "loss": 0.4543, "step": 2060 }, { "epoch": 0.2271074380165289, "grad_norm": 6.642572402954102, "learning_rate": 8.859778251405304e-06, "loss": 0.4144, "step": 2061 }, { "epoch": 0.2272176308539945, "grad_norm": 6.904155254364014, "learning_rate": 8.85866657494437e-06, "loss": 0.4312, "step": 2062 }, { "epoch": 0.22732782369146007, "grad_norm": 8.384150505065918, "learning_rate": 8.857554426640445e-06, "loss": 0.4316, "step": 2063 }, { "epoch": 0.22743801652892562, "grad_norm": 9.582276344299316, "learning_rate": 8.856441806629524e-06, "loss": 0.5277, "step": 2064 }, { "epoch": 0.2275482093663912, "grad_norm": 7.464855194091797, "learning_rate": 8.855328715047662e-06, "loss": 0.4663, "step": 2065 }, { "epoch": 0.22765840220385675, "grad_norm": 10.291353225708008, "learning_rate": 8.854215152030966e-06, "loss": 0.4829, "step": 2066 }, { "epoch": 0.22776859504132232, "grad_norm": 7.483318328857422, "learning_rate": 8.853101117715609e-06, "loss": 0.418, "step": 2067 }, { "epoch": 0.22787878787878788, "grad_norm": 12.582602500915527, "learning_rate": 8.851986612237809e-06, "loss": 0.5183, "step": 2068 }, { "epoch": 0.22798898071625345, "grad_norm": 9.944835662841797, "learning_rate": 8.850871635733856e-06, "loss": 0.4539, "step": 2069 }, { "epoch": 0.228099173553719, "grad_norm": 7.1502532958984375, "learning_rate": 8.849756188340089e-06, "loss": 0.4384, "step": 2070 }, { "epoch": 0.22820936639118458, "grad_norm": 7.528603553771973, "learning_rate": 8.848640270192903e-06, "loss": 0.4036, "step": 2071 }, { "epoch": 0.22831955922865013, "grad_norm": 6.340047359466553, "learning_rate": 8.84752388142876e-06, "loss": 0.4259, "step": 2072 }, { "epoch": 0.2284297520661157, "grad_norm": 6.14491605758667, "learning_rate": 8.846407022184169e-06, "loss": 0.4458, "step": 2073 }, { "epoch": 0.22853994490358126, "grad_norm": 8.84805965423584, "learning_rate": 8.845289692595703e-06, "loss": 0.4089, "step": 2074 }, { "epoch": 0.22865013774104684, "grad_norm": 5.433241367340088, "learning_rate": 8.84417189279999e-06, "loss": 0.4659, "step": 2075 }, { "epoch": 0.2287603305785124, "grad_norm": 6.685497283935547, "learning_rate": 8.843053622933716e-06, "loss": 0.4496, "step": 2076 }, { "epoch": 0.22887052341597797, "grad_norm": 6.667261123657227, "learning_rate": 8.841934883133624e-06, "loss": 0.4822, "step": 2077 }, { "epoch": 0.22898071625344352, "grad_norm": 5.1992926597595215, "learning_rate": 8.840815673536518e-06, "loss": 0.3928, "step": 2078 }, { "epoch": 0.2290909090909091, "grad_norm": 8.6360502243042, "learning_rate": 8.839695994279253e-06, "loss": 0.3625, "step": 2079 }, { "epoch": 0.22920110192837465, "grad_norm": 11.109519958496094, "learning_rate": 8.838575845498744e-06, "loss": 0.4753, "step": 2080 }, { "epoch": 0.22931129476584022, "grad_norm": 4.837209701538086, "learning_rate": 8.83745522733197e-06, "loss": 0.434, "step": 2081 }, { "epoch": 0.22942148760330577, "grad_norm": 6.786084175109863, "learning_rate": 8.836334139915957e-06, "loss": 0.4481, "step": 2082 }, { "epoch": 0.22953168044077135, "grad_norm": 8.713823318481445, "learning_rate": 8.835212583387794e-06, "loss": 0.4768, "step": 2083 }, { "epoch": 0.2296418732782369, "grad_norm": 7.901473045349121, "learning_rate": 8.834090557884625e-06, "loss": 0.4223, "step": 2084 }, { "epoch": 0.22975206611570248, "grad_norm": 5.997265815734863, "learning_rate": 8.832968063543657e-06, "loss": 0.2915, "step": 2085 }, { "epoch": 0.22986225895316803, "grad_norm": 10.123592376708984, "learning_rate": 8.831845100502145e-06, "loss": 0.4141, "step": 2086 }, { "epoch": 0.2299724517906336, "grad_norm": 7.859190940856934, "learning_rate": 8.830721668897411e-06, "loss": 0.4084, "step": 2087 }, { "epoch": 0.2300826446280992, "grad_norm": 7.773241996765137, "learning_rate": 8.829597768866827e-06, "loss": 0.405, "step": 2088 }, { "epoch": 0.23019283746556474, "grad_norm": 6.04829216003418, "learning_rate": 8.828473400547825e-06, "loss": 0.3989, "step": 2089 }, { "epoch": 0.23030303030303031, "grad_norm": 10.272017478942871, "learning_rate": 8.827348564077897e-06, "loss": 0.5083, "step": 2090 }, { "epoch": 0.23041322314049587, "grad_norm": 11.257768630981445, "learning_rate": 8.826223259594587e-06, "loss": 0.5208, "step": 2091 }, { "epoch": 0.23052341597796144, "grad_norm": 20.224720001220703, "learning_rate": 8.8250974872355e-06, "loss": 0.481, "step": 2092 }, { "epoch": 0.230633608815427, "grad_norm": 6.637451648712158, "learning_rate": 8.823971247138296e-06, "loss": 0.4079, "step": 2093 }, { "epoch": 0.23074380165289257, "grad_norm": 10.840591430664062, "learning_rate": 8.822844539440693e-06, "loss": 0.4461, "step": 2094 }, { "epoch": 0.23085399449035812, "grad_norm": 7.156713962554932, "learning_rate": 8.821717364280467e-06, "loss": 0.4566, "step": 2095 }, { "epoch": 0.2309641873278237, "grad_norm": 16.366134643554688, "learning_rate": 8.820589721795451e-06, "loss": 0.4134, "step": 2096 }, { "epoch": 0.23107438016528925, "grad_norm": 10.64284896850586, "learning_rate": 8.819461612123532e-06, "loss": 0.4492, "step": 2097 }, { "epoch": 0.23118457300275483, "grad_norm": 5.6441802978515625, "learning_rate": 8.81833303540266e-06, "loss": 0.456, "step": 2098 }, { "epoch": 0.23129476584022038, "grad_norm": 7.566303730010986, "learning_rate": 8.81720399177084e-06, "loss": 0.4573, "step": 2099 }, { "epoch": 0.23140495867768596, "grad_norm": 10.060577392578125, "learning_rate": 8.816074481366128e-06, "loss": 0.4211, "step": 2100 }, { "epoch": 0.2315151515151515, "grad_norm": 7.7261643409729, "learning_rate": 8.814944504326645e-06, "loss": 0.4656, "step": 2101 }, { "epoch": 0.23162534435261709, "grad_norm": 6.533786296844482, "learning_rate": 8.813814060790567e-06, "loss": 0.3302, "step": 2102 }, { "epoch": 0.23173553719008264, "grad_norm": 7.083710193634033, "learning_rate": 8.812683150896126e-06, "loss": 0.3828, "step": 2103 }, { "epoch": 0.2318457300275482, "grad_norm": 10.25291919708252, "learning_rate": 8.811551774781608e-06, "loss": 0.5114, "step": 2104 }, { "epoch": 0.23195592286501376, "grad_norm": 12.033116340637207, "learning_rate": 8.810419932585362e-06, "loss": 0.588, "step": 2105 }, { "epoch": 0.23206611570247934, "grad_norm": 15.983132362365723, "learning_rate": 8.809287624445792e-06, "loss": 0.679, "step": 2106 }, { "epoch": 0.2321763085399449, "grad_norm": 13.827681541442871, "learning_rate": 8.808154850501356e-06, "loss": 0.4535, "step": 2107 }, { "epoch": 0.23228650137741047, "grad_norm": 5.498868465423584, "learning_rate": 8.807021610890571e-06, "loss": 0.4278, "step": 2108 }, { "epoch": 0.23239669421487602, "grad_norm": 7.119879722595215, "learning_rate": 8.805887905752015e-06, "loss": 0.3981, "step": 2109 }, { "epoch": 0.2325068870523416, "grad_norm": 6.168232440948486, "learning_rate": 8.804753735224312e-06, "loss": 0.4268, "step": 2110 }, { "epoch": 0.23261707988980715, "grad_norm": 5.046046257019043, "learning_rate": 8.803619099446157e-06, "loss": 0.4618, "step": 2111 }, { "epoch": 0.23272727272727273, "grad_norm": 4.919585227966309, "learning_rate": 8.80248399855629e-06, "loss": 0.4289, "step": 2112 }, { "epoch": 0.2328374655647383, "grad_norm": 8.216880798339844, "learning_rate": 8.801348432693518e-06, "loss": 0.5318, "step": 2113 }, { "epoch": 0.23294765840220386, "grad_norm": 11.217514991760254, "learning_rate": 8.800212401996692e-06, "loss": 0.456, "step": 2114 }, { "epoch": 0.23305785123966943, "grad_norm": 10.725457191467285, "learning_rate": 8.799075906604732e-06, "loss": 0.5251, "step": 2115 }, { "epoch": 0.23316804407713498, "grad_norm": 5.096673011779785, "learning_rate": 8.797938946656612e-06, "loss": 0.4617, "step": 2116 }, { "epoch": 0.23327823691460056, "grad_norm": 6.513415336608887, "learning_rate": 8.796801522291357e-06, "loss": 0.4937, "step": 2117 }, { "epoch": 0.2333884297520661, "grad_norm": 4.589240074157715, "learning_rate": 8.795663633648055e-06, "loss": 0.4308, "step": 2118 }, { "epoch": 0.2334986225895317, "grad_norm": 4.732120990753174, "learning_rate": 8.794525280865846e-06, "loss": 0.4112, "step": 2119 }, { "epoch": 0.23360881542699724, "grad_norm": 14.606962203979492, "learning_rate": 8.793386464083932e-06, "loss": 0.424, "step": 2120 }, { "epoch": 0.23371900826446282, "grad_norm": 11.576171875, "learning_rate": 8.792247183441572e-06, "loss": 0.5453, "step": 2121 }, { "epoch": 0.23382920110192837, "grad_norm": 10.318958282470703, "learning_rate": 8.79110743907807e-06, "loss": 0.4187, "step": 2122 }, { "epoch": 0.23393939393939395, "grad_norm": 7.910745143890381, "learning_rate": 8.789967231132805e-06, "loss": 0.4416, "step": 2123 }, { "epoch": 0.2340495867768595, "grad_norm": 8.218287467956543, "learning_rate": 8.788826559745197e-06, "loss": 0.4353, "step": 2124 }, { "epoch": 0.23415977961432508, "grad_norm": 6.097707271575928, "learning_rate": 8.787685425054729e-06, "loss": 0.4175, "step": 2125 }, { "epoch": 0.23426997245179063, "grad_norm": 9.37572956085205, "learning_rate": 8.786543827200944e-06, "loss": 0.427, "step": 2126 }, { "epoch": 0.2343801652892562, "grad_norm": 9.844120025634766, "learning_rate": 8.785401766323437e-06, "loss": 0.5319, "step": 2127 }, { "epoch": 0.23449035812672175, "grad_norm": 5.837325572967529, "learning_rate": 8.784259242561858e-06, "loss": 0.4415, "step": 2128 }, { "epoch": 0.23460055096418733, "grad_norm": 9.294026374816895, "learning_rate": 8.78311625605592e-06, "loss": 0.4406, "step": 2129 }, { "epoch": 0.23471074380165288, "grad_norm": 6.600424289703369, "learning_rate": 8.781972806945385e-06, "loss": 0.4437, "step": 2130 }, { "epoch": 0.23482093663911846, "grad_norm": 6.484173774719238, "learning_rate": 8.78082889537008e-06, "loss": 0.4426, "step": 2131 }, { "epoch": 0.234931129476584, "grad_norm": 5.145078659057617, "learning_rate": 8.779684521469882e-06, "loss": 0.4433, "step": 2132 }, { "epoch": 0.2350413223140496, "grad_norm": 7.097766399383545, "learning_rate": 8.778539685384726e-06, "loss": 0.4173, "step": 2133 }, { "epoch": 0.23515151515151514, "grad_norm": 11.189253807067871, "learning_rate": 8.777394387254604e-06, "loss": 0.4288, "step": 2134 }, { "epoch": 0.23526170798898072, "grad_norm": 7.566659450531006, "learning_rate": 8.776248627219566e-06, "loss": 0.4971, "step": 2135 }, { "epoch": 0.23537190082644627, "grad_norm": 6.61622953414917, "learning_rate": 8.775102405419717e-06, "loss": 0.4815, "step": 2136 }, { "epoch": 0.23548209366391185, "grad_norm": 5.723613262176514, "learning_rate": 8.773955721995217e-06, "loss": 0.487, "step": 2137 }, { "epoch": 0.23559228650137742, "grad_norm": 5.3530049324035645, "learning_rate": 8.772808577086285e-06, "loss": 0.4057, "step": 2138 }, { "epoch": 0.23570247933884297, "grad_norm": 12.398603439331055, "learning_rate": 8.771660970833194e-06, "loss": 0.6096, "step": 2139 }, { "epoch": 0.23581267217630855, "grad_norm": 6.584178924560547, "learning_rate": 8.770512903376277e-06, "loss": 0.4814, "step": 2140 }, { "epoch": 0.2359228650137741, "grad_norm": 6.775885581970215, "learning_rate": 8.769364374855923e-06, "loss": 0.4945, "step": 2141 }, { "epoch": 0.23603305785123968, "grad_norm": 9.103728294372559, "learning_rate": 8.76821538541257e-06, "loss": 0.5035, "step": 2142 }, { "epoch": 0.23614325068870523, "grad_norm": 13.791357040405273, "learning_rate": 8.767065935186723e-06, "loss": 0.5347, "step": 2143 }, { "epoch": 0.2362534435261708, "grad_norm": 11.703580856323242, "learning_rate": 8.765916024318935e-06, "loss": 0.4276, "step": 2144 }, { "epoch": 0.23636363636363636, "grad_norm": 6.4634623527526855, "learning_rate": 8.76476565294982e-06, "loss": 0.4393, "step": 2145 }, { "epoch": 0.23647382920110194, "grad_norm": 8.717704772949219, "learning_rate": 8.763614821220047e-06, "loss": 0.3366, "step": 2146 }, { "epoch": 0.2365840220385675, "grad_norm": 6.30748987197876, "learning_rate": 8.762463529270341e-06, "loss": 0.4848, "step": 2147 }, { "epoch": 0.23669421487603307, "grad_norm": 9.31478500366211, "learning_rate": 8.761311777241485e-06, "loss": 0.5241, "step": 2148 }, { "epoch": 0.23680440771349862, "grad_norm": 5.8757452964782715, "learning_rate": 8.760159565274316e-06, "loss": 0.3795, "step": 2149 }, { "epoch": 0.2369146005509642, "grad_norm": 6.232647895812988, "learning_rate": 8.759006893509726e-06, "loss": 0.4356, "step": 2150 }, { "epoch": 0.23702479338842974, "grad_norm": 5.690622806549072, "learning_rate": 8.757853762088671e-06, "loss": 0.4515, "step": 2151 }, { "epoch": 0.23713498622589532, "grad_norm": 4.078435897827148, "learning_rate": 8.756700171152149e-06, "loss": 0.4059, "step": 2152 }, { "epoch": 0.23724517906336087, "grad_norm": 7.967753887176514, "learning_rate": 8.755546120841229e-06, "loss": 0.4858, "step": 2153 }, { "epoch": 0.23735537190082645, "grad_norm": 5.991828918457031, "learning_rate": 8.754391611297026e-06, "loss": 0.4147, "step": 2154 }, { "epoch": 0.237465564738292, "grad_norm": 8.03773021697998, "learning_rate": 8.753236642660719e-06, "loss": 0.3884, "step": 2155 }, { "epoch": 0.23757575757575758, "grad_norm": 7.971235275268555, "learning_rate": 8.752081215073536e-06, "loss": 0.4222, "step": 2156 }, { "epoch": 0.23768595041322313, "grad_norm": 7.698493957519531, "learning_rate": 8.750925328676766e-06, "loss": 0.41, "step": 2157 }, { "epoch": 0.2377961432506887, "grad_norm": 8.206059455871582, "learning_rate": 8.749768983611751e-06, "loss": 0.4505, "step": 2158 }, { "epoch": 0.23790633608815426, "grad_norm": 6.654517650604248, "learning_rate": 8.748612180019893e-06, "loss": 0.4268, "step": 2159 }, { "epoch": 0.23801652892561984, "grad_norm": 7.217126846313477, "learning_rate": 8.747454918042645e-06, "loss": 0.4172, "step": 2160 }, { "epoch": 0.23812672176308539, "grad_norm": 6.2145586013793945, "learning_rate": 8.746297197821516e-06, "loss": 0.4729, "step": 2161 }, { "epoch": 0.23823691460055096, "grad_norm": 8.109328269958496, "learning_rate": 8.745139019498079e-06, "loss": 0.3934, "step": 2162 }, { "epoch": 0.23834710743801654, "grad_norm": 5.94710636138916, "learning_rate": 8.743980383213956e-06, "loss": 0.4261, "step": 2163 }, { "epoch": 0.2384573002754821, "grad_norm": 8.663981437683105, "learning_rate": 8.742821289110825e-06, "loss": 0.3978, "step": 2164 }, { "epoch": 0.23856749311294767, "grad_norm": 8.286674499511719, "learning_rate": 8.741661737330425e-06, "loss": 0.4266, "step": 2165 }, { "epoch": 0.23867768595041322, "grad_norm": 9.72221851348877, "learning_rate": 8.740501728014543e-06, "loss": 0.4871, "step": 2166 }, { "epoch": 0.2387878787878788, "grad_norm": 4.6482415199279785, "learning_rate": 8.73934126130503e-06, "loss": 0.395, "step": 2167 }, { "epoch": 0.23889807162534435, "grad_norm": 5.311193466186523, "learning_rate": 8.738180337343788e-06, "loss": 0.4496, "step": 2168 }, { "epoch": 0.23900826446280993, "grad_norm": 7.596034049987793, "learning_rate": 8.737018956272774e-06, "loss": 0.3588, "step": 2169 }, { "epoch": 0.23911845730027548, "grad_norm": 9.180919647216797, "learning_rate": 8.735857118234008e-06, "loss": 0.4978, "step": 2170 }, { "epoch": 0.23922865013774106, "grad_norm": 11.98186206817627, "learning_rate": 8.734694823369559e-06, "loss": 0.4307, "step": 2171 }, { "epoch": 0.2393388429752066, "grad_norm": 8.02349853515625, "learning_rate": 8.733532071821553e-06, "loss": 0.3434, "step": 2172 }, { "epoch": 0.23944903581267218, "grad_norm": 7.891348838806152, "learning_rate": 8.732368863732175e-06, "loss": 0.5116, "step": 2173 }, { "epoch": 0.23955922865013773, "grad_norm": 8.01990032196045, "learning_rate": 8.731205199243661e-06, "loss": 0.455, "step": 2174 }, { "epoch": 0.2396694214876033, "grad_norm": 5.899777889251709, "learning_rate": 8.730041078498307e-06, "loss": 0.4518, "step": 2175 }, { "epoch": 0.23977961432506886, "grad_norm": 7.121553421020508, "learning_rate": 8.728876501638464e-06, "loss": 0.5194, "step": 2176 }, { "epoch": 0.23988980716253444, "grad_norm": 7.066677093505859, "learning_rate": 8.727711468806537e-06, "loss": 0.4485, "step": 2177 }, { "epoch": 0.24, "grad_norm": 7.9821624755859375, "learning_rate": 8.726545980144988e-06, "loss": 0.4904, "step": 2178 }, { "epoch": 0.24011019283746557, "grad_norm": 5.491579532623291, "learning_rate": 8.725380035796334e-06, "loss": 0.4295, "step": 2179 }, { "epoch": 0.24022038567493112, "grad_norm": 6.961433410644531, "learning_rate": 8.724213635903149e-06, "loss": 0.4909, "step": 2180 }, { "epoch": 0.2403305785123967, "grad_norm": 4.592325210571289, "learning_rate": 8.723046780608061e-06, "loss": 0.424, "step": 2181 }, { "epoch": 0.24044077134986225, "grad_norm": 12.062141418457031, "learning_rate": 8.721879470053758e-06, "loss": 0.5157, "step": 2182 }, { "epoch": 0.24055096418732783, "grad_norm": 8.86048698425293, "learning_rate": 8.720711704382976e-06, "loss": 0.4732, "step": 2183 }, { "epoch": 0.24066115702479338, "grad_norm": 16.834884643554688, "learning_rate": 8.719543483738513e-06, "loss": 0.4236, "step": 2184 }, { "epoch": 0.24077134986225895, "grad_norm": 5.7272443771362305, "learning_rate": 8.718374808263221e-06, "loss": 0.4204, "step": 2185 }, { "epoch": 0.2408815426997245, "grad_norm": 7.467537879943848, "learning_rate": 8.717205678100004e-06, "loss": 0.5018, "step": 2186 }, { "epoch": 0.24099173553719008, "grad_norm": 5.631223201751709, "learning_rate": 8.71603609339183e-06, "loss": 0.424, "step": 2187 }, { "epoch": 0.24110192837465566, "grad_norm": 8.913368225097656, "learning_rate": 8.714866054281714e-06, "loss": 0.4667, "step": 2188 }, { "epoch": 0.2412121212121212, "grad_norm": 5.795899391174316, "learning_rate": 8.71369556091273e-06, "loss": 0.3866, "step": 2189 }, { "epoch": 0.2413223140495868, "grad_norm": 5.947133541107178, "learning_rate": 8.712524613428009e-06, "loss": 0.4024, "step": 2190 }, { "epoch": 0.24143250688705234, "grad_norm": 7.146727085113525, "learning_rate": 8.711353211970734e-06, "loss": 0.4076, "step": 2191 }, { "epoch": 0.24154269972451792, "grad_norm": 11.562288284301758, "learning_rate": 8.710181356684149e-06, "loss": 0.5004, "step": 2192 }, { "epoch": 0.24165289256198347, "grad_norm": 9.729698181152344, "learning_rate": 8.709009047711547e-06, "loss": 0.4714, "step": 2193 }, { "epoch": 0.24176308539944905, "grad_norm": 6.6752400398254395, "learning_rate": 8.707836285196281e-06, "loss": 0.3855, "step": 2194 }, { "epoch": 0.2418732782369146, "grad_norm": 11.59221363067627, "learning_rate": 8.706663069281755e-06, "loss": 0.5057, "step": 2195 }, { "epoch": 0.24198347107438017, "grad_norm": 9.448982238769531, "learning_rate": 8.705489400111437e-06, "loss": 0.5545, "step": 2196 }, { "epoch": 0.24209366391184572, "grad_norm": 7.867563724517822, "learning_rate": 8.70431527782884e-06, "loss": 0.4745, "step": 2197 }, { "epoch": 0.2422038567493113, "grad_norm": 7.003779888153076, "learning_rate": 8.703140702577539e-06, "loss": 0.4257, "step": 2198 }, { "epoch": 0.24231404958677685, "grad_norm": 7.950750350952148, "learning_rate": 8.701965674501162e-06, "loss": 0.4784, "step": 2199 }, { "epoch": 0.24242424242424243, "grad_norm": 6.600383281707764, "learning_rate": 8.700790193743395e-06, "loss": 0.4715, "step": 2200 }, { "epoch": 0.24253443526170798, "grad_norm": 11.465103149414062, "learning_rate": 8.699614260447976e-06, "loss": 0.4216, "step": 2201 }, { "epoch": 0.24264462809917356, "grad_norm": 6.537487030029297, "learning_rate": 8.698437874758701e-06, "loss": 0.4132, "step": 2202 }, { "epoch": 0.2427548209366391, "grad_norm": 6.177744388580322, "learning_rate": 8.697261036819417e-06, "loss": 0.4844, "step": 2203 }, { "epoch": 0.2428650137741047, "grad_norm": 7.86129903793335, "learning_rate": 8.696083746774031e-06, "loss": 0.4171, "step": 2204 }, { "epoch": 0.24297520661157024, "grad_norm": 10.450275421142578, "learning_rate": 8.694906004766505e-06, "loss": 0.515, "step": 2205 }, { "epoch": 0.24308539944903582, "grad_norm": 6.404592990875244, "learning_rate": 8.693727810940853e-06, "loss": 0.4126, "step": 2206 }, { "epoch": 0.24319559228650137, "grad_norm": 6.1069865226745605, "learning_rate": 8.692549165441146e-06, "loss": 0.3866, "step": 2207 }, { "epoch": 0.24330578512396694, "grad_norm": 6.747042179107666, "learning_rate": 8.691370068411513e-06, "loss": 0.4579, "step": 2208 }, { "epoch": 0.2434159779614325, "grad_norm": 6.704173564910889, "learning_rate": 8.690190519996133e-06, "loss": 0.4246, "step": 2209 }, { "epoch": 0.24352617079889807, "grad_norm": 5.756845951080322, "learning_rate": 8.689010520339245e-06, "loss": 0.4082, "step": 2210 }, { "epoch": 0.24363636363636362, "grad_norm": 10.910465240478516, "learning_rate": 8.687830069585138e-06, "loss": 0.4536, "step": 2211 }, { "epoch": 0.2437465564738292, "grad_norm": 7.12267541885376, "learning_rate": 8.68664916787816e-06, "loss": 0.3574, "step": 2212 }, { "epoch": 0.24385674931129478, "grad_norm": 10.122380256652832, "learning_rate": 8.685467815362714e-06, "loss": 0.432, "step": 2213 }, { "epoch": 0.24396694214876033, "grad_norm": 11.504134178161621, "learning_rate": 8.684286012183259e-06, "loss": 0.4656, "step": 2214 }, { "epoch": 0.2440771349862259, "grad_norm": 8.178743362426758, "learning_rate": 8.683103758484304e-06, "loss": 0.4386, "step": 2215 }, { "epoch": 0.24418732782369146, "grad_norm": 11.35354232788086, "learning_rate": 8.68192105441042e-06, "loss": 0.5298, "step": 2216 }, { "epoch": 0.24429752066115704, "grad_norm": 7.735526084899902, "learning_rate": 8.680737900106227e-06, "loss": 0.4393, "step": 2217 }, { "epoch": 0.2444077134986226, "grad_norm": 6.364700794219971, "learning_rate": 8.679554295716403e-06, "loss": 0.4907, "step": 2218 }, { "epoch": 0.24451790633608816, "grad_norm": 6.380803108215332, "learning_rate": 8.678370241385683e-06, "loss": 0.4302, "step": 2219 }, { "epoch": 0.24462809917355371, "grad_norm": 36.914852142333984, "learning_rate": 8.677185737258854e-06, "loss": 0.4743, "step": 2220 }, { "epoch": 0.2447382920110193, "grad_norm": 19.26903533935547, "learning_rate": 8.676000783480758e-06, "loss": 0.5094, "step": 2221 }, { "epoch": 0.24484848484848484, "grad_norm": 6.038789749145508, "learning_rate": 8.674815380196291e-06, "loss": 0.5194, "step": 2222 }, { "epoch": 0.24495867768595042, "grad_norm": 7.294865608215332, "learning_rate": 8.673629527550409e-06, "loss": 0.4517, "step": 2223 }, { "epoch": 0.24506887052341597, "grad_norm": 8.526937484741211, "learning_rate": 8.672443225688117e-06, "loss": 0.4507, "step": 2224 }, { "epoch": 0.24517906336088155, "grad_norm": 11.056537628173828, "learning_rate": 8.67125647475448e-06, "loss": 0.5316, "step": 2225 }, { "epoch": 0.2452892561983471, "grad_norm": 8.909558296203613, "learning_rate": 8.670069274894613e-06, "loss": 0.523, "step": 2226 }, { "epoch": 0.24539944903581268, "grad_norm": 9.7420654296875, "learning_rate": 8.668881626253692e-06, "loss": 0.4595, "step": 2227 }, { "epoch": 0.24550964187327823, "grad_norm": 6.577530384063721, "learning_rate": 8.667693528976938e-06, "loss": 0.5038, "step": 2228 }, { "epoch": 0.2456198347107438, "grad_norm": 7.473630428314209, "learning_rate": 8.666504983209641e-06, "loss": 0.4432, "step": 2229 }, { "epoch": 0.24573002754820936, "grad_norm": 7.230944633483887, "learning_rate": 8.665315989097135e-06, "loss": 0.4073, "step": 2230 }, { "epoch": 0.24584022038567493, "grad_norm": 6.275236129760742, "learning_rate": 8.664126546784808e-06, "loss": 0.4011, "step": 2231 }, { "epoch": 0.24595041322314048, "grad_norm": 7.976849555969238, "learning_rate": 8.662936656418111e-06, "loss": 0.5406, "step": 2232 }, { "epoch": 0.24606060606060606, "grad_norm": 5.796950817108154, "learning_rate": 8.661746318142544e-06, "loss": 0.4271, "step": 2233 }, { "epoch": 0.2461707988980716, "grad_norm": 5.247142314910889, "learning_rate": 8.660555532103663e-06, "loss": 0.4404, "step": 2234 }, { "epoch": 0.2462809917355372, "grad_norm": 8.870091438293457, "learning_rate": 8.659364298447079e-06, "loss": 0.4442, "step": 2235 }, { "epoch": 0.24639118457300274, "grad_norm": 8.041191101074219, "learning_rate": 8.658172617318457e-06, "loss": 0.3981, "step": 2236 }, { "epoch": 0.24650137741046832, "grad_norm": 6.859613418579102, "learning_rate": 8.65698048886352e-06, "loss": 0.456, "step": 2237 }, { "epoch": 0.2466115702479339, "grad_norm": 6.626808166503906, "learning_rate": 8.65578791322804e-06, "loss": 0.4438, "step": 2238 }, { "epoch": 0.24672176308539945, "grad_norm": 5.693751335144043, "learning_rate": 8.654594890557847e-06, "loss": 0.4119, "step": 2239 }, { "epoch": 0.24683195592286503, "grad_norm": 7.935238838195801, "learning_rate": 8.653401420998831e-06, "loss": 0.4847, "step": 2240 }, { "epoch": 0.24694214876033058, "grad_norm": 6.543578624725342, "learning_rate": 8.652207504696922e-06, "loss": 0.519, "step": 2241 }, { "epoch": 0.24705234159779615, "grad_norm": 5.309280872344971, "learning_rate": 8.651013141798121e-06, "loss": 0.3942, "step": 2242 }, { "epoch": 0.2471625344352617, "grad_norm": 7.597204685211182, "learning_rate": 8.649818332448472e-06, "loss": 0.4737, "step": 2243 }, { "epoch": 0.24727272727272728, "grad_norm": 4.780295372009277, "learning_rate": 8.64862307679408e-06, "loss": 0.461, "step": 2244 }, { "epoch": 0.24738292011019283, "grad_norm": 10.477855682373047, "learning_rate": 8.647427374981101e-06, "loss": 0.4543, "step": 2245 }, { "epoch": 0.2474931129476584, "grad_norm": 15.061379432678223, "learning_rate": 8.646231227155751e-06, "loss": 0.4741, "step": 2246 }, { "epoch": 0.24760330578512396, "grad_norm": 8.661005973815918, "learning_rate": 8.645034633464292e-06, "loss": 0.4148, "step": 2247 }, { "epoch": 0.24771349862258954, "grad_norm": 8.025259017944336, "learning_rate": 8.643837594053045e-06, "loss": 0.5376, "step": 2248 }, { "epoch": 0.2478236914600551, "grad_norm": 10.17959213256836, "learning_rate": 8.64264010906839e-06, "loss": 0.4585, "step": 2249 }, { "epoch": 0.24793388429752067, "grad_norm": 9.788511276245117, "learning_rate": 8.641442178656752e-06, "loss": 0.4379, "step": 2250 }, { "epoch": 0.24804407713498622, "grad_norm": 5.2130279541015625, "learning_rate": 8.64024380296462e-06, "loss": 0.3961, "step": 2251 }, { "epoch": 0.2481542699724518, "grad_norm": 14.40140438079834, "learning_rate": 8.63904498213853e-06, "loss": 0.4997, "step": 2252 }, { "epoch": 0.24826446280991735, "grad_norm": 7.163171768188477, "learning_rate": 8.637845716325076e-06, "loss": 0.4464, "step": 2253 }, { "epoch": 0.24837465564738292, "grad_norm": 6.978706359863281, "learning_rate": 8.636646005670908e-06, "loss": 0.3715, "step": 2254 }, { "epoch": 0.24848484848484848, "grad_norm": 6.105304718017578, "learning_rate": 8.635445850322725e-06, "loss": 0.3892, "step": 2255 }, { "epoch": 0.24859504132231405, "grad_norm": 9.44076156616211, "learning_rate": 8.634245250427286e-06, "loss": 0.4035, "step": 2256 }, { "epoch": 0.2487052341597796, "grad_norm": 6.398384094238281, "learning_rate": 8.633044206131401e-06, "loss": 0.4568, "step": 2257 }, { "epoch": 0.24881542699724518, "grad_norm": 6.775662899017334, "learning_rate": 8.631842717581934e-06, "loss": 0.3772, "step": 2258 }, { "epoch": 0.24892561983471073, "grad_norm": 5.58888578414917, "learning_rate": 8.630640784925808e-06, "loss": 0.465, "step": 2259 }, { "epoch": 0.2490358126721763, "grad_norm": 6.437169075012207, "learning_rate": 8.629438408309994e-06, "loss": 0.5361, "step": 2260 }, { "epoch": 0.24914600550964186, "grad_norm": 9.29654312133789, "learning_rate": 8.628235587881522e-06, "loss": 0.5343, "step": 2261 }, { "epoch": 0.24925619834710744, "grad_norm": 9.664660453796387, "learning_rate": 8.627032323787473e-06, "loss": 0.4821, "step": 2262 }, { "epoch": 0.24936639118457302, "grad_norm": 5.591101169586182, "learning_rate": 8.625828616174984e-06, "loss": 0.4363, "step": 2263 }, { "epoch": 0.24947658402203857, "grad_norm": 7.769825458526611, "learning_rate": 8.62462446519125e-06, "loss": 0.4363, "step": 2264 }, { "epoch": 0.24958677685950414, "grad_norm": 7.316049575805664, "learning_rate": 8.62341987098351e-06, "loss": 0.4827, "step": 2265 }, { "epoch": 0.2496969696969697, "grad_norm": 12.398195266723633, "learning_rate": 8.622214833699067e-06, "loss": 0.5515, "step": 2266 }, { "epoch": 0.24980716253443527, "grad_norm": 5.563145637512207, "learning_rate": 8.621009353485272e-06, "loss": 0.4688, "step": 2267 }, { "epoch": 0.24991735537190082, "grad_norm": 5.417816162109375, "learning_rate": 8.619803430489537e-06, "loss": 0.4319, "step": 2268 }, { "epoch": 0.2500275482093664, "grad_norm": 5.636494159698486, "learning_rate": 8.618597064859321e-06, "loss": 0.4469, "step": 2269 }, { "epoch": 0.25013774104683195, "grad_norm": 6.015632152557373, "learning_rate": 8.617390256742142e-06, "loss": 0.4153, "step": 2270 }, { "epoch": 0.2502479338842975, "grad_norm": 6.239927768707275, "learning_rate": 8.616183006285566e-06, "loss": 0.3865, "step": 2271 }, { "epoch": 0.2503581267217631, "grad_norm": 11.210003852844238, "learning_rate": 8.61497531363722e-06, "loss": 0.4984, "step": 2272 }, { "epoch": 0.25046831955922866, "grad_norm": 14.910019874572754, "learning_rate": 8.613767178944784e-06, "loss": 0.4554, "step": 2273 }, { "epoch": 0.2505785123966942, "grad_norm": 10.103205680847168, "learning_rate": 8.612558602355988e-06, "loss": 0.4278, "step": 2274 }, { "epoch": 0.25068870523415976, "grad_norm": 8.149327278137207, "learning_rate": 8.611349584018618e-06, "loss": 0.4078, "step": 2275 }, { "epoch": 0.25079889807162536, "grad_norm": 7.881106376647949, "learning_rate": 8.610140124080515e-06, "loss": 0.4312, "step": 2276 }, { "epoch": 0.2509090909090909, "grad_norm": 8.182812690734863, "learning_rate": 8.608930222689575e-06, "loss": 0.5198, "step": 2277 }, { "epoch": 0.25101928374655647, "grad_norm": 7.4772796630859375, "learning_rate": 8.607719879993745e-06, "loss": 0.4961, "step": 2278 }, { "epoch": 0.251129476584022, "grad_norm": 7.813830852508545, "learning_rate": 8.606509096141027e-06, "loss": 0.4212, "step": 2279 }, { "epoch": 0.2512396694214876, "grad_norm": 7.752506732940674, "learning_rate": 8.605297871279478e-06, "loss": 0.3529, "step": 2280 }, { "epoch": 0.25134986225895317, "grad_norm": 5.12895393371582, "learning_rate": 8.604086205557206e-06, "loss": 0.4413, "step": 2281 }, { "epoch": 0.2514600550964187, "grad_norm": 5.124846935272217, "learning_rate": 8.60287409912238e-06, "loss": 0.4424, "step": 2282 }, { "epoch": 0.2515702479338843, "grad_norm": 5.033202648162842, "learning_rate": 8.601661552123215e-06, "loss": 0.3773, "step": 2283 }, { "epoch": 0.2516804407713499, "grad_norm": 9.819452285766602, "learning_rate": 8.600448564707982e-06, "loss": 0.4828, "step": 2284 }, { "epoch": 0.25179063360881543, "grad_norm": 11.069389343261719, "learning_rate": 8.599235137025007e-06, "loss": 0.4828, "step": 2285 }, { "epoch": 0.251900826446281, "grad_norm": 5.964832305908203, "learning_rate": 8.598021269222672e-06, "loss": 0.4449, "step": 2286 }, { "epoch": 0.25201101928374653, "grad_norm": 7.644845008850098, "learning_rate": 8.59680696144941e-06, "loss": 0.3804, "step": 2287 }, { "epoch": 0.25212121212121213, "grad_norm": 9.03024673461914, "learning_rate": 8.595592213853702e-06, "loss": 0.441, "step": 2288 }, { "epoch": 0.2522314049586777, "grad_norm": 10.455665588378906, "learning_rate": 8.594377026584098e-06, "loss": 0.2856, "step": 2289 }, { "epoch": 0.25234159779614324, "grad_norm": 6.502602577209473, "learning_rate": 8.593161399789188e-06, "loss": 0.3827, "step": 2290 }, { "epoch": 0.25245179063360884, "grad_norm": 5.824034214019775, "learning_rate": 8.591945333617622e-06, "loss": 0.3888, "step": 2291 }, { "epoch": 0.2525619834710744, "grad_norm": 5.943631649017334, "learning_rate": 8.5907288282181e-06, "loss": 0.4399, "step": 2292 }, { "epoch": 0.25267217630853994, "grad_norm": 9.132189750671387, "learning_rate": 8.589511883739379e-06, "loss": 0.462, "step": 2293 }, { "epoch": 0.2527823691460055, "grad_norm": 6.748810768127441, "learning_rate": 8.58829450033027e-06, "loss": 0.454, "step": 2294 }, { "epoch": 0.2528925619834711, "grad_norm": 9.523448944091797, "learning_rate": 8.587076678139635e-06, "loss": 0.4171, "step": 2295 }, { "epoch": 0.25300275482093665, "grad_norm": 7.962812900543213, "learning_rate": 8.585858417316391e-06, "loss": 0.51, "step": 2296 }, { "epoch": 0.2531129476584022, "grad_norm": 5.420536994934082, "learning_rate": 8.584639718009508e-06, "loss": 0.4154, "step": 2297 }, { "epoch": 0.25322314049586775, "grad_norm": 5.080401420593262, "learning_rate": 8.583420580368013e-06, "loss": 0.469, "step": 2298 }, { "epoch": 0.25333333333333335, "grad_norm": 6.994992733001709, "learning_rate": 8.58220100454098e-06, "loss": 0.4188, "step": 2299 }, { "epoch": 0.2534435261707989, "grad_norm": 6.936811923980713, "learning_rate": 8.580980990677543e-06, "loss": 0.3953, "step": 2300 }, { "epoch": 0.25355371900826446, "grad_norm": 5.2105793952941895, "learning_rate": 8.579760538926887e-06, "loss": 0.3762, "step": 2301 }, { "epoch": 0.25366391184573, "grad_norm": 6.676947593688965, "learning_rate": 8.57853964943825e-06, "loss": 0.4554, "step": 2302 }, { "epoch": 0.2537741046831956, "grad_norm": 6.580031871795654, "learning_rate": 8.577318322360922e-06, "loss": 0.4174, "step": 2303 }, { "epoch": 0.25388429752066116, "grad_norm": 9.767011642456055, "learning_rate": 8.57609655784425e-06, "loss": 0.5647, "step": 2304 }, { "epoch": 0.2539944903581267, "grad_norm": 8.932231903076172, "learning_rate": 8.574874356037635e-06, "loss": 0.4578, "step": 2305 }, { "epoch": 0.25410468319559226, "grad_norm": 5.41628885269165, "learning_rate": 8.573651717090526e-06, "loss": 0.4322, "step": 2306 }, { "epoch": 0.25421487603305787, "grad_norm": 7.599315643310547, "learning_rate": 8.572428641152432e-06, "loss": 0.3949, "step": 2307 }, { "epoch": 0.2543250688705234, "grad_norm": 11.129815101623535, "learning_rate": 8.57120512837291e-06, "loss": 0.4471, "step": 2308 }, { "epoch": 0.25443526170798897, "grad_norm": 7.195977210998535, "learning_rate": 8.569981178901575e-06, "loss": 0.3736, "step": 2309 }, { "epoch": 0.2545454545454545, "grad_norm": 6.795074462890625, "learning_rate": 8.568756792888092e-06, "loss": 0.4654, "step": 2310 }, { "epoch": 0.2546556473829201, "grad_norm": 5.778489589691162, "learning_rate": 8.56753197048218e-06, "loss": 0.39, "step": 2311 }, { "epoch": 0.2547658402203857, "grad_norm": 3.8139307498931885, "learning_rate": 8.566306711833613e-06, "loss": 0.3855, "step": 2312 }, { "epoch": 0.2548760330578512, "grad_norm": 6.988625526428223, "learning_rate": 8.565081017092217e-06, "loss": 0.4758, "step": 2313 }, { "epoch": 0.25498622589531683, "grad_norm": 16.93379783630371, "learning_rate": 8.563854886407872e-06, "loss": 0.6658, "step": 2314 }, { "epoch": 0.2550964187327824, "grad_norm": 9.037361145019531, "learning_rate": 8.56262831993051e-06, "loss": 0.4437, "step": 2315 }, { "epoch": 0.25520661157024793, "grad_norm": 8.213639259338379, "learning_rate": 8.561401317810118e-06, "loss": 0.481, "step": 2316 }, { "epoch": 0.2553168044077135, "grad_norm": 13.289407730102539, "learning_rate": 8.560173880196734e-06, "loss": 0.4594, "step": 2317 }, { "epoch": 0.2554269972451791, "grad_norm": 6.225028991699219, "learning_rate": 8.558946007240452e-06, "loss": 0.3196, "step": 2318 }, { "epoch": 0.25553719008264464, "grad_norm": 6.35842227935791, "learning_rate": 8.557717699091419e-06, "loss": 0.4687, "step": 2319 }, { "epoch": 0.2556473829201102, "grad_norm": 5.915284156799316, "learning_rate": 8.556488955899833e-06, "loss": 0.4458, "step": 2320 }, { "epoch": 0.25575757575757574, "grad_norm": 10.873257637023926, "learning_rate": 8.555259777815946e-06, "loss": 0.4865, "step": 2321 }, { "epoch": 0.25586776859504134, "grad_norm": 6.701557636260986, "learning_rate": 8.554030164990063e-06, "loss": 0.4758, "step": 2322 }, { "epoch": 0.2559779614325069, "grad_norm": 4.650301456451416, "learning_rate": 8.552800117572546e-06, "loss": 0.4689, "step": 2323 }, { "epoch": 0.25608815426997245, "grad_norm": 6.8133440017700195, "learning_rate": 8.551569635713804e-06, "loss": 0.4373, "step": 2324 }, { "epoch": 0.256198347107438, "grad_norm": 6.845016956329346, "learning_rate": 8.550338719564301e-06, "loss": 0.4943, "step": 2325 }, { "epoch": 0.2563085399449036, "grad_norm": 9.78427791595459, "learning_rate": 8.549107369274559e-06, "loss": 0.4677, "step": 2326 }, { "epoch": 0.25641873278236915, "grad_norm": 4.695962429046631, "learning_rate": 8.547875584995146e-06, "loss": 0.3793, "step": 2327 }, { "epoch": 0.2565289256198347, "grad_norm": 5.9332451820373535, "learning_rate": 8.546643366876686e-06, "loss": 0.4679, "step": 2328 }, { "epoch": 0.25663911845730025, "grad_norm": 6.372260093688965, "learning_rate": 8.545410715069858e-06, "loss": 0.4294, "step": 2329 }, { "epoch": 0.25674931129476586, "grad_norm": 4.364634990692139, "learning_rate": 8.544177629725393e-06, "loss": 0.4569, "step": 2330 }, { "epoch": 0.2568595041322314, "grad_norm": 6.3348164558410645, "learning_rate": 8.542944110994072e-06, "loss": 0.4367, "step": 2331 }, { "epoch": 0.25696969696969696, "grad_norm": 7.209807872772217, "learning_rate": 8.541710159026733e-06, "loss": 0.4896, "step": 2332 }, { "epoch": 0.2570798898071625, "grad_norm": 9.143774032592773, "learning_rate": 8.540475773974264e-06, "loss": 0.4798, "step": 2333 }, { "epoch": 0.2571900826446281, "grad_norm": 9.026727676391602, "learning_rate": 8.539240955987609e-06, "loss": 0.5241, "step": 2334 }, { "epoch": 0.25730027548209367, "grad_norm": 4.808675765991211, "learning_rate": 8.538005705217762e-06, "loss": 0.4511, "step": 2335 }, { "epoch": 0.2574104683195592, "grad_norm": 11.612485885620117, "learning_rate": 8.536770021815771e-06, "loss": 0.4771, "step": 2336 }, { "epoch": 0.25752066115702477, "grad_norm": 7.632146835327148, "learning_rate": 8.535533905932739e-06, "loss": 0.496, "step": 2337 }, { "epoch": 0.25763085399449037, "grad_norm": 4.97585391998291, "learning_rate": 8.534297357719816e-06, "loss": 0.407, "step": 2338 }, { "epoch": 0.2577410468319559, "grad_norm": 6.8093743324279785, "learning_rate": 8.533060377328213e-06, "loss": 0.4495, "step": 2339 }, { "epoch": 0.2578512396694215, "grad_norm": 6.089529037475586, "learning_rate": 8.531822964909188e-06, "loss": 0.3967, "step": 2340 }, { "epoch": 0.2579614325068871, "grad_norm": 5.844442367553711, "learning_rate": 8.530585120614053e-06, "loss": 0.4217, "step": 2341 }, { "epoch": 0.25807162534435263, "grad_norm": 10.744218826293945, "learning_rate": 8.529346844594172e-06, "loss": 0.4048, "step": 2342 }, { "epoch": 0.2581818181818182, "grad_norm": 7.852959632873535, "learning_rate": 8.528108137000968e-06, "loss": 0.4425, "step": 2343 }, { "epoch": 0.25829201101928373, "grad_norm": 8.47117805480957, "learning_rate": 8.526868997985905e-06, "loss": 0.5659, "step": 2344 }, { "epoch": 0.25840220385674934, "grad_norm": 6.843001842498779, "learning_rate": 8.525629427700513e-06, "loss": 0.4097, "step": 2345 }, { "epoch": 0.2585123966942149, "grad_norm": 9.291862487792969, "learning_rate": 8.524389426296364e-06, "loss": 0.5022, "step": 2346 }, { "epoch": 0.25862258953168044, "grad_norm": 4.559871196746826, "learning_rate": 8.523148993925089e-06, "loss": 0.5074, "step": 2347 }, { "epoch": 0.258732782369146, "grad_norm": 6.633461952209473, "learning_rate": 8.521908130738369e-06, "loss": 0.4084, "step": 2348 }, { "epoch": 0.2588429752066116, "grad_norm": 6.937253952026367, "learning_rate": 8.520666836887939e-06, "loss": 0.4127, "step": 2349 }, { "epoch": 0.25895316804407714, "grad_norm": 11.475006103515625, "learning_rate": 8.519425112525586e-06, "loss": 0.4464, "step": 2350 }, { "epoch": 0.2590633608815427, "grad_norm": 7.735561847686768, "learning_rate": 8.518182957803149e-06, "loss": 0.3952, "step": 2351 }, { "epoch": 0.25917355371900824, "grad_norm": 7.2295756340026855, "learning_rate": 8.51694037287252e-06, "loss": 0.3981, "step": 2352 }, { "epoch": 0.25928374655647385, "grad_norm": 4.578884601593018, "learning_rate": 8.515697357885648e-06, "loss": 0.4478, "step": 2353 }, { "epoch": 0.2593939393939394, "grad_norm": 10.609971046447754, "learning_rate": 8.514453912994524e-06, "loss": 0.4542, "step": 2354 }, { "epoch": 0.25950413223140495, "grad_norm": 5.635560035705566, "learning_rate": 8.513210038351203e-06, "loss": 0.3859, "step": 2355 }, { "epoch": 0.2596143250688705, "grad_norm": 42.48790740966797, "learning_rate": 8.511965734107787e-06, "loss": 0.5248, "step": 2356 }, { "epoch": 0.2597245179063361, "grad_norm": 5.832466125488281, "learning_rate": 8.51072100041643e-06, "loss": 0.3818, "step": 2357 }, { "epoch": 0.25983471074380166, "grad_norm": 4.7486252784729, "learning_rate": 8.509475837429339e-06, "loss": 0.2956, "step": 2358 }, { "epoch": 0.2599449035812672, "grad_norm": 5.074799060821533, "learning_rate": 8.508230245298778e-06, "loss": 0.3235, "step": 2359 }, { "epoch": 0.26005509641873276, "grad_norm": 10.534666061401367, "learning_rate": 8.506984224177056e-06, "loss": 0.4491, "step": 2360 }, { "epoch": 0.26016528925619836, "grad_norm": 7.45255184173584, "learning_rate": 8.505737774216539e-06, "loss": 0.4088, "step": 2361 }, { "epoch": 0.2602754820936639, "grad_norm": 11.84439468383789, "learning_rate": 8.504490895569645e-06, "loss": 0.4307, "step": 2362 }, { "epoch": 0.26038567493112946, "grad_norm": 7.551827430725098, "learning_rate": 8.503243588388843e-06, "loss": 0.3964, "step": 2363 }, { "epoch": 0.26049586776859507, "grad_norm": 7.464683532714844, "learning_rate": 8.501995852826658e-06, "loss": 0.4345, "step": 2364 }, { "epoch": 0.2606060606060606, "grad_norm": 5.859431266784668, "learning_rate": 8.500747689035663e-06, "loss": 0.4822, "step": 2365 }, { "epoch": 0.26071625344352617, "grad_norm": 6.196678161621094, "learning_rate": 8.499499097168485e-06, "loss": 0.4524, "step": 2366 }, { "epoch": 0.2608264462809917, "grad_norm": 5.3284759521484375, "learning_rate": 8.498250077377803e-06, "loss": 0.4404, "step": 2367 }, { "epoch": 0.2609366391184573, "grad_norm": 9.878118515014648, "learning_rate": 8.49700062981635e-06, "loss": 0.4371, "step": 2368 }, { "epoch": 0.2610468319559229, "grad_norm": 7.886068344116211, "learning_rate": 8.495750754636909e-06, "loss": 0.4588, "step": 2369 }, { "epoch": 0.2611570247933884, "grad_norm": 9.894314765930176, "learning_rate": 8.494500451992318e-06, "loss": 0.4524, "step": 2370 }, { "epoch": 0.261267217630854, "grad_norm": 9.895092964172363, "learning_rate": 8.493249722035464e-06, "loss": 0.5528, "step": 2371 }, { "epoch": 0.2613774104683196, "grad_norm": 7.3613386154174805, "learning_rate": 8.49199856491929e-06, "loss": 0.3853, "step": 2372 }, { "epoch": 0.26148760330578513, "grad_norm": 5.514358043670654, "learning_rate": 8.490746980796787e-06, "loss": 0.3426, "step": 2373 }, { "epoch": 0.2615977961432507, "grad_norm": 7.374489784240723, "learning_rate": 8.489494969821004e-06, "loss": 0.4179, "step": 2374 }, { "epoch": 0.26170798898071623, "grad_norm": 6.119042873382568, "learning_rate": 8.488242532145035e-06, "loss": 0.4394, "step": 2375 }, { "epoch": 0.26181818181818184, "grad_norm": 5.095778465270996, "learning_rate": 8.48698966792203e-06, "loss": 0.3747, "step": 2376 }, { "epoch": 0.2619283746556474, "grad_norm": 11.36087703704834, "learning_rate": 8.485736377305191e-06, "loss": 0.451, "step": 2377 }, { "epoch": 0.26203856749311294, "grad_norm": 8.019950866699219, "learning_rate": 8.484482660447775e-06, "loss": 0.4427, "step": 2378 }, { "epoch": 0.2621487603305785, "grad_norm": 6.189932823181152, "learning_rate": 8.483228517503085e-06, "loss": 0.4113, "step": 2379 }, { "epoch": 0.2622589531680441, "grad_norm": 10.068389892578125, "learning_rate": 8.48197394862448e-06, "loss": 0.4674, "step": 2380 }, { "epoch": 0.26236914600550965, "grad_norm": 10.848103523254395, "learning_rate": 8.48071895396537e-06, "loss": 0.4179, "step": 2381 }, { "epoch": 0.2624793388429752, "grad_norm": 7.704885005950928, "learning_rate": 8.47946353367922e-06, "loss": 0.4916, "step": 2382 }, { "epoch": 0.26258953168044075, "grad_norm": 18.67957878112793, "learning_rate": 8.478207687919542e-06, "loss": 0.4747, "step": 2383 }, { "epoch": 0.26269972451790635, "grad_norm": 12.184995651245117, "learning_rate": 8.476951416839904e-06, "loss": 0.4911, "step": 2384 }, { "epoch": 0.2628099173553719, "grad_norm": 5.447048664093018, "learning_rate": 8.475694720593923e-06, "loss": 0.4661, "step": 2385 }, { "epoch": 0.26292011019283745, "grad_norm": 10.587584495544434, "learning_rate": 8.47443759933527e-06, "loss": 0.3728, "step": 2386 }, { "epoch": 0.263030303030303, "grad_norm": 6.572682857513428, "learning_rate": 8.47318005321767e-06, "loss": 0.4758, "step": 2387 }, { "epoch": 0.2631404958677686, "grad_norm": 9.376725196838379, "learning_rate": 8.471922082394892e-06, "loss": 0.4961, "step": 2388 }, { "epoch": 0.26325068870523416, "grad_norm": 5.249814033508301, "learning_rate": 8.470663687020769e-06, "loss": 0.463, "step": 2389 }, { "epoch": 0.2633608815426997, "grad_norm": 8.13807201385498, "learning_rate": 8.469404867249172e-06, "loss": 0.5362, "step": 2390 }, { "epoch": 0.2634710743801653, "grad_norm": 7.37142276763916, "learning_rate": 8.468145623234036e-06, "loss": 0.4614, "step": 2391 }, { "epoch": 0.26358126721763087, "grad_norm": 6.937657833099365, "learning_rate": 8.466885955129345e-06, "loss": 0.4759, "step": 2392 }, { "epoch": 0.2636914600550964, "grad_norm": 9.085661888122559, "learning_rate": 8.465625863089128e-06, "loss": 0.4831, "step": 2393 }, { "epoch": 0.26380165289256197, "grad_norm": 5.514589786529541, "learning_rate": 8.464365347267473e-06, "loss": 0.3941, "step": 2394 }, { "epoch": 0.26391184573002757, "grad_norm": 5.748258113861084, "learning_rate": 8.463104407818518e-06, "loss": 0.424, "step": 2395 }, { "epoch": 0.2640220385674931, "grad_norm": 5.270674228668213, "learning_rate": 8.461843044896451e-06, "loss": 0.4682, "step": 2396 }, { "epoch": 0.2641322314049587, "grad_norm": 7.216822624206543, "learning_rate": 8.460581258655515e-06, "loss": 0.4456, "step": 2397 }, { "epoch": 0.2642424242424242, "grad_norm": 4.967555999755859, "learning_rate": 8.459319049250001e-06, "loss": 0.4191, "step": 2398 }, { "epoch": 0.26435261707988983, "grad_norm": 5.243597984313965, "learning_rate": 8.458056416834255e-06, "loss": 0.4045, "step": 2399 }, { "epoch": 0.2644628099173554, "grad_norm": 6.000222206115723, "learning_rate": 8.456793361562674e-06, "loss": 0.4342, "step": 2400 }, { "epoch": 0.26457300275482093, "grad_norm": 3.6822104454040527, "learning_rate": 8.455529883589703e-06, "loss": 0.4325, "step": 2401 }, { "epoch": 0.2646831955922865, "grad_norm": 6.141874313354492, "learning_rate": 8.454265983069848e-06, "loss": 0.4673, "step": 2402 }, { "epoch": 0.2647933884297521, "grad_norm": 8.271032333374023, "learning_rate": 8.453001660157653e-06, "loss": 0.4486, "step": 2403 }, { "epoch": 0.26490358126721764, "grad_norm": 4.449370861053467, "learning_rate": 8.451736915007725e-06, "loss": 0.3562, "step": 2404 }, { "epoch": 0.2650137741046832, "grad_norm": 7.038038730621338, "learning_rate": 8.450471747774718e-06, "loss": 0.4834, "step": 2405 }, { "epoch": 0.26512396694214874, "grad_norm": 12.406217575073242, "learning_rate": 8.449206158613338e-06, "loss": 0.5946, "step": 2406 }, { "epoch": 0.26523415977961434, "grad_norm": 6.856632709503174, "learning_rate": 8.447940147678346e-06, "loss": 0.5554, "step": 2407 }, { "epoch": 0.2653443526170799, "grad_norm": 7.30984354019165, "learning_rate": 8.446673715124548e-06, "loss": 0.5054, "step": 2408 }, { "epoch": 0.26545454545454544, "grad_norm": 4.488925457000732, "learning_rate": 8.445406861106806e-06, "loss": 0.4447, "step": 2409 }, { "epoch": 0.265564738292011, "grad_norm": 6.564624786376953, "learning_rate": 8.444139585780034e-06, "loss": 0.4441, "step": 2410 }, { "epoch": 0.2656749311294766, "grad_norm": 8.532626152038574, "learning_rate": 8.442871889299194e-06, "loss": 0.4346, "step": 2411 }, { "epoch": 0.26578512396694215, "grad_norm": 9.0402193069458, "learning_rate": 8.441603771819302e-06, "loss": 0.4117, "step": 2412 }, { "epoch": 0.2658953168044077, "grad_norm": 7.293003082275391, "learning_rate": 8.440335233495428e-06, "loss": 0.3776, "step": 2413 }, { "epoch": 0.2660055096418733, "grad_norm": 6.154115676879883, "learning_rate": 8.439066274482687e-06, "loss": 0.4284, "step": 2414 }, { "epoch": 0.26611570247933886, "grad_norm": 5.79887580871582, "learning_rate": 8.43779689493625e-06, "loss": 0.4172, "step": 2415 }, { "epoch": 0.2662258953168044, "grad_norm": 9.61327838897705, "learning_rate": 8.43652709501134e-06, "loss": 0.4877, "step": 2416 }, { "epoch": 0.26633608815426996, "grad_norm": 12.206517219543457, "learning_rate": 8.43525687486323e-06, "loss": 0.5253, "step": 2417 }, { "epoch": 0.26644628099173556, "grad_norm": 5.559622764587402, "learning_rate": 8.43398623464724e-06, "loss": 0.5098, "step": 2418 }, { "epoch": 0.2665564738292011, "grad_norm": 14.062077522277832, "learning_rate": 8.43271517451875e-06, "loss": 0.5241, "step": 2419 }, { "epoch": 0.26666666666666666, "grad_norm": 6.438302993774414, "learning_rate": 8.431443694633187e-06, "loss": 0.4567, "step": 2420 }, { "epoch": 0.2667768595041322, "grad_norm": 9.327301979064941, "learning_rate": 8.430171795146025e-06, "loss": 0.4578, "step": 2421 }, { "epoch": 0.2668870523415978, "grad_norm": 5.0606818199157715, "learning_rate": 8.428899476212798e-06, "loss": 0.4326, "step": 2422 }, { "epoch": 0.26699724517906337, "grad_norm": 5.966927528381348, "learning_rate": 8.427626737989085e-06, "loss": 0.4125, "step": 2423 }, { "epoch": 0.2671074380165289, "grad_norm": 6.134056568145752, "learning_rate": 8.426353580630519e-06, "loss": 0.3967, "step": 2424 }, { "epoch": 0.26721763085399447, "grad_norm": 6.048746585845947, "learning_rate": 8.425080004292782e-06, "loss": 0.4668, "step": 2425 }, { "epoch": 0.2673278236914601, "grad_norm": 10.070021629333496, "learning_rate": 8.42380600913161e-06, "loss": 0.4439, "step": 2426 }, { "epoch": 0.2674380165289256, "grad_norm": 6.099440097808838, "learning_rate": 8.42253159530279e-06, "loss": 0.4487, "step": 2427 }, { "epoch": 0.2675482093663912, "grad_norm": 10.026384353637695, "learning_rate": 8.421256762962156e-06, "loss": 0.4665, "step": 2428 }, { "epoch": 0.2676584022038567, "grad_norm": 5.96007776260376, "learning_rate": 8.419981512265596e-06, "loss": 0.3705, "step": 2429 }, { "epoch": 0.26776859504132233, "grad_norm": 8.469380378723145, "learning_rate": 8.418705843369055e-06, "loss": 0.4131, "step": 2430 }, { "epoch": 0.2678787878787879, "grad_norm": 5.206123352050781, "learning_rate": 8.417429756428517e-06, "loss": 0.4689, "step": 2431 }, { "epoch": 0.26798898071625343, "grad_norm": 4.994175910949707, "learning_rate": 8.416153251600026e-06, "loss": 0.4722, "step": 2432 }, { "epoch": 0.268099173553719, "grad_norm": 5.6883225440979, "learning_rate": 8.414876329039675e-06, "loss": 0.5019, "step": 2433 }, { "epoch": 0.2682093663911846, "grad_norm": 5.061646938323975, "learning_rate": 8.41359898890361e-06, "loss": 0.4814, "step": 2434 }, { "epoch": 0.26831955922865014, "grad_norm": 7.3997368812561035, "learning_rate": 8.412321231348022e-06, "loss": 0.4339, "step": 2435 }, { "epoch": 0.2684297520661157, "grad_norm": 7.136973857879639, "learning_rate": 8.411043056529158e-06, "loss": 0.4767, "step": 2436 }, { "epoch": 0.26853994490358124, "grad_norm": 4.894901275634766, "learning_rate": 8.409764464603316e-06, "loss": 0.4344, "step": 2437 }, { "epoch": 0.26865013774104685, "grad_norm": 7.450186252593994, "learning_rate": 8.408485455726844e-06, "loss": 0.4932, "step": 2438 }, { "epoch": 0.2687603305785124, "grad_norm": 12.19626235961914, "learning_rate": 8.40720603005614e-06, "loss": 0.4221, "step": 2439 }, { "epoch": 0.26887052341597795, "grad_norm": 7.356841087341309, "learning_rate": 8.405926187747658e-06, "loss": 0.4903, "step": 2440 }, { "epoch": 0.26898071625344355, "grad_norm": 5.755402565002441, "learning_rate": 8.404645928957891e-06, "loss": 0.3814, "step": 2441 }, { "epoch": 0.2690909090909091, "grad_norm": 9.090841293334961, "learning_rate": 8.403365253843397e-06, "loss": 0.4033, "step": 2442 }, { "epoch": 0.26920110192837465, "grad_norm": 5.735386848449707, "learning_rate": 8.402084162560776e-06, "loss": 0.3967, "step": 2443 }, { "epoch": 0.2693112947658402, "grad_norm": 7.3089704513549805, "learning_rate": 8.400802655266682e-06, "loss": 0.5323, "step": 2444 }, { "epoch": 0.2694214876033058, "grad_norm": 7.761696815490723, "learning_rate": 8.39952073211782e-06, "loss": 0.4866, "step": 2445 }, { "epoch": 0.26953168044077136, "grad_norm": 6.164813041687012, "learning_rate": 8.398238393270946e-06, "loss": 0.3989, "step": 2446 }, { "epoch": 0.2696418732782369, "grad_norm": 6.546687602996826, "learning_rate": 8.396955638882864e-06, "loss": 0.3693, "step": 2447 }, { "epoch": 0.26975206611570246, "grad_norm": 7.708877086639404, "learning_rate": 8.395672469110433e-06, "loss": 0.4088, "step": 2448 }, { "epoch": 0.26986225895316807, "grad_norm": 8.575736999511719, "learning_rate": 8.394388884110559e-06, "loss": 0.4556, "step": 2449 }, { "epoch": 0.2699724517906336, "grad_norm": 9.647896766662598, "learning_rate": 8.393104884040202e-06, "loss": 0.4624, "step": 2450 }, { "epoch": 0.27008264462809917, "grad_norm": 6.133730411529541, "learning_rate": 8.391820469056371e-06, "loss": 0.3986, "step": 2451 }, { "epoch": 0.2701928374655647, "grad_norm": 7.404541015625, "learning_rate": 8.390535639316124e-06, "loss": 0.3624, "step": 2452 }, { "epoch": 0.2703030303030303, "grad_norm": 8.516987800598145, "learning_rate": 8.389250394976575e-06, "loss": 0.4818, "step": 2453 }, { "epoch": 0.2704132231404959, "grad_norm": 8.884702682495117, "learning_rate": 8.387964736194884e-06, "loss": 0.4364, "step": 2454 }, { "epoch": 0.2705234159779614, "grad_norm": 11.272823333740234, "learning_rate": 8.386678663128263e-06, "loss": 0.4891, "step": 2455 }, { "epoch": 0.270633608815427, "grad_norm": 7.883791446685791, "learning_rate": 8.385392175933974e-06, "loss": 0.4202, "step": 2456 }, { "epoch": 0.2707438016528926, "grad_norm": 8.155399322509766, "learning_rate": 8.384105274769331e-06, "loss": 0.433, "step": 2457 }, { "epoch": 0.27085399449035813, "grad_norm": 8.103219032287598, "learning_rate": 8.3828179597917e-06, "loss": 0.4528, "step": 2458 }, { "epoch": 0.2709641873278237, "grad_norm": 5.788285255432129, "learning_rate": 8.381530231158493e-06, "loss": 0.3991, "step": 2459 }, { "epoch": 0.27107438016528923, "grad_norm": 10.736172676086426, "learning_rate": 8.380242089027174e-06, "loss": 0.4536, "step": 2460 }, { "epoch": 0.27118457300275484, "grad_norm": 7.713866710662842, "learning_rate": 8.378953533555261e-06, "loss": 0.4379, "step": 2461 }, { "epoch": 0.2712947658402204, "grad_norm": 6.560085773468018, "learning_rate": 8.377664564900322e-06, "loss": 0.4984, "step": 2462 }, { "epoch": 0.27140495867768594, "grad_norm": 7.5404372215271, "learning_rate": 8.376375183219972e-06, "loss": 0.3497, "step": 2463 }, { "epoch": 0.27151515151515154, "grad_norm": 7.938744068145752, "learning_rate": 8.375085388671877e-06, "loss": 0.4557, "step": 2464 }, { "epoch": 0.2716253443526171, "grad_norm": 5.309293746948242, "learning_rate": 8.373795181413757e-06, "loss": 0.4525, "step": 2465 }, { "epoch": 0.27173553719008264, "grad_norm": 3.6332757472991943, "learning_rate": 8.372504561603379e-06, "loss": 0.4396, "step": 2466 }, { "epoch": 0.2718457300275482, "grad_norm": 5.703434944152832, "learning_rate": 8.371213529398561e-06, "loss": 0.4696, "step": 2467 }, { "epoch": 0.2719559228650138, "grad_norm": 9.001897811889648, "learning_rate": 8.369922084957174e-06, "loss": 0.4306, "step": 2468 }, { "epoch": 0.27206611570247935, "grad_norm": 6.7894415855407715, "learning_rate": 8.368630228437137e-06, "loss": 0.4258, "step": 2469 }, { "epoch": 0.2721763085399449, "grad_norm": 4.18749475479126, "learning_rate": 8.36733795999642e-06, "loss": 0.3744, "step": 2470 }, { "epoch": 0.27228650137741045, "grad_norm": 7.989779949188232, "learning_rate": 8.366045279793042e-06, "loss": 0.4467, "step": 2471 }, { "epoch": 0.27239669421487606, "grad_norm": 8.87408447265625, "learning_rate": 8.364752187985077e-06, "loss": 0.4525, "step": 2472 }, { "epoch": 0.2725068870523416, "grad_norm": 10.908501625061035, "learning_rate": 8.363458684730642e-06, "loss": 0.4576, "step": 2473 }, { "epoch": 0.27261707988980716, "grad_norm": 6.029660224914551, "learning_rate": 8.36216477018791e-06, "loss": 0.4239, "step": 2474 }, { "epoch": 0.2727272727272727, "grad_norm": 7.507077217102051, "learning_rate": 8.360870444515104e-06, "loss": 0.3579, "step": 2475 }, { "epoch": 0.2728374655647383, "grad_norm": 6.4907097816467285, "learning_rate": 8.359575707870495e-06, "loss": 0.4084, "step": 2476 }, { "epoch": 0.27294765840220386, "grad_norm": 8.459695816040039, "learning_rate": 8.358280560412403e-06, "loss": 0.4789, "step": 2477 }, { "epoch": 0.2730578512396694, "grad_norm": 7.87397575378418, "learning_rate": 8.356985002299205e-06, "loss": 0.4314, "step": 2478 }, { "epoch": 0.27316804407713496, "grad_norm": 8.098743438720703, "learning_rate": 8.355689033689321e-06, "loss": 0.482, "step": 2479 }, { "epoch": 0.27327823691460057, "grad_norm": 9.823105812072754, "learning_rate": 8.35439265474122e-06, "loss": 0.4553, "step": 2480 }, { "epoch": 0.2733884297520661, "grad_norm": 7.965592861175537, "learning_rate": 8.353095865613433e-06, "loss": 0.4911, "step": 2481 }, { "epoch": 0.27349862258953167, "grad_norm": 6.894061088562012, "learning_rate": 8.351798666464527e-06, "loss": 0.432, "step": 2482 }, { "epoch": 0.2736088154269972, "grad_norm": 6.235803127288818, "learning_rate": 8.350501057453127e-06, "loss": 0.4212, "step": 2483 }, { "epoch": 0.2737190082644628, "grad_norm": 9.930096626281738, "learning_rate": 8.349203038737904e-06, "loss": 0.43, "step": 2484 }, { "epoch": 0.2738292011019284, "grad_norm": 10.2173433303833, "learning_rate": 8.347904610477588e-06, "loss": 0.5353, "step": 2485 }, { "epoch": 0.2739393939393939, "grad_norm": 7.106429100036621, "learning_rate": 8.346605772830946e-06, "loss": 0.512, "step": 2486 }, { "epoch": 0.2740495867768595, "grad_norm": 9.963252067565918, "learning_rate": 8.345306525956807e-06, "loss": 0.5617, "step": 2487 }, { "epoch": 0.2741597796143251, "grad_norm": 5.965456008911133, "learning_rate": 8.344006870014039e-06, "loss": 0.3919, "step": 2488 }, { "epoch": 0.27426997245179063, "grad_norm": 5.027235507965088, "learning_rate": 8.34270680516157e-06, "loss": 0.431, "step": 2489 }, { "epoch": 0.2743801652892562, "grad_norm": 7.0271992683410645, "learning_rate": 8.341406331558373e-06, "loss": 0.4549, "step": 2490 }, { "epoch": 0.2744903581267218, "grad_norm": 7.3226518630981445, "learning_rate": 8.34010544936347e-06, "loss": 0.4255, "step": 2491 }, { "epoch": 0.27460055096418734, "grad_norm": 6.1253862380981445, "learning_rate": 8.338804158735934e-06, "loss": 0.4528, "step": 2492 }, { "epoch": 0.2747107438016529, "grad_norm": 6.783452033996582, "learning_rate": 8.337502459834892e-06, "loss": 0.4496, "step": 2493 }, { "epoch": 0.27482093663911844, "grad_norm": 5.832024097442627, "learning_rate": 8.336200352819514e-06, "loss": 0.3755, "step": 2494 }, { "epoch": 0.27493112947658405, "grad_norm": 5.6729736328125, "learning_rate": 8.334897837849027e-06, "loss": 0.4363, "step": 2495 }, { "epoch": 0.2750413223140496, "grad_norm": 7.188202857971191, "learning_rate": 8.333594915082701e-06, "loss": 0.4692, "step": 2496 }, { "epoch": 0.27515151515151515, "grad_norm": 6.959208965301514, "learning_rate": 8.33229158467986e-06, "loss": 0.4514, "step": 2497 }, { "epoch": 0.2752617079889807, "grad_norm": 7.196948051452637, "learning_rate": 8.330987846799881e-06, "loss": 0.4127, "step": 2498 }, { "epoch": 0.2753719008264463, "grad_norm": 9.837214469909668, "learning_rate": 8.32968370160218e-06, "loss": 0.4987, "step": 2499 }, { "epoch": 0.27548209366391185, "grad_norm": 10.358495712280273, "learning_rate": 8.328379149246234e-06, "loss": 0.3694, "step": 2500 }, { "epoch": 0.2755922865013774, "grad_norm": 13.23357105255127, "learning_rate": 8.327074189891564e-06, "loss": 0.5031, "step": 2501 }, { "epoch": 0.27570247933884295, "grad_norm": 8.414778709411621, "learning_rate": 8.325768823697743e-06, "loss": 0.4738, "step": 2502 }, { "epoch": 0.27581267217630856, "grad_norm": 7.910693645477295, "learning_rate": 8.324463050824394e-06, "loss": 0.4116, "step": 2503 }, { "epoch": 0.2759228650137741, "grad_norm": 8.252347946166992, "learning_rate": 8.323156871431186e-06, "loss": 0.3922, "step": 2504 }, { "epoch": 0.27603305785123966, "grad_norm": 8.44232177734375, "learning_rate": 8.321850285677842e-06, "loss": 0.4281, "step": 2505 }, { "epoch": 0.2761432506887052, "grad_norm": 6.815550804138184, "learning_rate": 8.320543293724133e-06, "loss": 0.4442, "step": 2506 }, { "epoch": 0.2762534435261708, "grad_norm": 13.305642127990723, "learning_rate": 8.319235895729878e-06, "loss": 0.4527, "step": 2507 }, { "epoch": 0.27636363636363637, "grad_norm": 5.294179916381836, "learning_rate": 8.31792809185495e-06, "loss": 0.4298, "step": 2508 }, { "epoch": 0.2764738292011019, "grad_norm": 7.0405097007751465, "learning_rate": 8.316619882259268e-06, "loss": 0.4452, "step": 2509 }, { "epoch": 0.27658402203856747, "grad_norm": 6.023338794708252, "learning_rate": 8.315311267102802e-06, "loss": 0.3899, "step": 2510 }, { "epoch": 0.2766942148760331, "grad_norm": 10.251482963562012, "learning_rate": 8.31400224654557e-06, "loss": 0.5171, "step": 2511 }, { "epoch": 0.2768044077134986, "grad_norm": 6.565580368041992, "learning_rate": 8.312692820747644e-06, "loss": 0.3834, "step": 2512 }, { "epoch": 0.2769146005509642, "grad_norm": 6.3364577293396, "learning_rate": 8.311382989869137e-06, "loss": 0.4053, "step": 2513 }, { "epoch": 0.2770247933884298, "grad_norm": 5.889908313751221, "learning_rate": 8.310072754070223e-06, "loss": 0.4285, "step": 2514 }, { "epoch": 0.27713498622589533, "grad_norm": 5.778831958770752, "learning_rate": 8.308762113511112e-06, "loss": 0.4834, "step": 2515 }, { "epoch": 0.2772451790633609, "grad_norm": 5.996057987213135, "learning_rate": 8.307451068352078e-06, "loss": 0.4179, "step": 2516 }, { "epoch": 0.27735537190082643, "grad_norm": 8.068934440612793, "learning_rate": 8.306139618753434e-06, "loss": 0.4559, "step": 2517 }, { "epoch": 0.27746556473829204, "grad_norm": 7.231866836547852, "learning_rate": 8.304827764875547e-06, "loss": 0.4152, "step": 2518 }, { "epoch": 0.2775757575757576, "grad_norm": 5.904922962188721, "learning_rate": 8.303515506878833e-06, "loss": 0.3983, "step": 2519 }, { "epoch": 0.27768595041322314, "grad_norm": 10.93997573852539, "learning_rate": 8.302202844923753e-06, "loss": 0.5144, "step": 2520 }, { "epoch": 0.2777961432506887, "grad_norm": 8.458961486816406, "learning_rate": 8.300889779170824e-06, "loss": 0.3975, "step": 2521 }, { "epoch": 0.2779063360881543, "grad_norm": 8.097896575927734, "learning_rate": 8.29957630978061e-06, "loss": 0.3939, "step": 2522 }, { "epoch": 0.27801652892561984, "grad_norm": 6.291425704956055, "learning_rate": 8.298262436913722e-06, "loss": 0.3961, "step": 2523 }, { "epoch": 0.2781267217630854, "grad_norm": 4.316643714904785, "learning_rate": 8.296948160730822e-06, "loss": 0.4179, "step": 2524 }, { "epoch": 0.27823691460055094, "grad_norm": 8.72903823852539, "learning_rate": 8.295633481392625e-06, "loss": 0.4965, "step": 2525 }, { "epoch": 0.27834710743801655, "grad_norm": 6.223062515258789, "learning_rate": 8.294318399059888e-06, "loss": 0.4358, "step": 2526 }, { "epoch": 0.2784573002754821, "grad_norm": 10.879405975341797, "learning_rate": 8.293002913893422e-06, "loss": 0.4807, "step": 2527 }, { "epoch": 0.27856749311294765, "grad_norm": 4.838342189788818, "learning_rate": 8.291687026054086e-06, "loss": 0.3958, "step": 2528 }, { "epoch": 0.2786776859504132, "grad_norm": 7.267243385314941, "learning_rate": 8.290370735702791e-06, "loss": 0.49, "step": 2529 }, { "epoch": 0.2787878787878788, "grad_norm": 9.317253112792969, "learning_rate": 8.28905404300049e-06, "loss": 0.4144, "step": 2530 }, { "epoch": 0.27889807162534436, "grad_norm": 5.159974098205566, "learning_rate": 8.287736948108197e-06, "loss": 0.4972, "step": 2531 }, { "epoch": 0.2790082644628099, "grad_norm": 8.115497589111328, "learning_rate": 8.28641945118696e-06, "loss": 0.3957, "step": 2532 }, { "epoch": 0.27911845730027546, "grad_norm": 5.645632743835449, "learning_rate": 8.285101552397892e-06, "loss": 0.4226, "step": 2533 }, { "epoch": 0.27922865013774106, "grad_norm": 7.425549030303955, "learning_rate": 8.28378325190214e-06, "loss": 0.4331, "step": 2534 }, { "epoch": 0.2793388429752066, "grad_norm": 7.270996570587158, "learning_rate": 8.282464549860915e-06, "loss": 0.4335, "step": 2535 }, { "epoch": 0.27944903581267216, "grad_norm": 11.127833366394043, "learning_rate": 8.281145446435463e-06, "loss": 0.4832, "step": 2536 }, { "epoch": 0.2795592286501377, "grad_norm": 5.004961013793945, "learning_rate": 8.27982594178709e-06, "loss": 0.4377, "step": 2537 }, { "epoch": 0.2796694214876033, "grad_norm": 7.438329219818115, "learning_rate": 8.278506036077146e-06, "loss": 0.4642, "step": 2538 }, { "epoch": 0.27977961432506887, "grad_norm": 6.2618937492370605, "learning_rate": 8.27718572946703e-06, "loss": 0.4354, "step": 2539 }, { "epoch": 0.2798898071625344, "grad_norm": 4.871488094329834, "learning_rate": 8.275865022118193e-06, "loss": 0.3845, "step": 2540 }, { "epoch": 0.28, "grad_norm": 6.332246780395508, "learning_rate": 8.274543914192129e-06, "loss": 0.4488, "step": 2541 }, { "epoch": 0.2801101928374656, "grad_norm": 6.855973243713379, "learning_rate": 8.273222405850388e-06, "loss": 0.4389, "step": 2542 }, { "epoch": 0.2802203856749311, "grad_norm": 10.263267517089844, "learning_rate": 8.271900497254566e-06, "loss": 0.4875, "step": 2543 }, { "epoch": 0.2803305785123967, "grad_norm": 4.8031415939331055, "learning_rate": 8.270578188566305e-06, "loss": 0.3905, "step": 2544 }, { "epoch": 0.2804407713498623, "grad_norm": 3.894521713256836, "learning_rate": 8.2692554799473e-06, "loss": 0.3452, "step": 2545 }, { "epoch": 0.28055096418732783, "grad_norm": 7.412913799285889, "learning_rate": 8.267932371559297e-06, "loss": 0.3899, "step": 2546 }, { "epoch": 0.2806611570247934, "grad_norm": 4.74569845199585, "learning_rate": 8.266608863564083e-06, "loss": 0.4083, "step": 2547 }, { "epoch": 0.28077134986225893, "grad_norm": 5.828033924102783, "learning_rate": 8.265284956123498e-06, "loss": 0.3356, "step": 2548 }, { "epoch": 0.28088154269972454, "grad_norm": 5.4041948318481445, "learning_rate": 8.263960649399437e-06, "loss": 0.4093, "step": 2549 }, { "epoch": 0.2809917355371901, "grad_norm": 7.65022611618042, "learning_rate": 8.26263594355383e-06, "loss": 0.3738, "step": 2550 }, { "epoch": 0.28110192837465564, "grad_norm": 6.111780643463135, "learning_rate": 8.261310838748671e-06, "loss": 0.3792, "step": 2551 }, { "epoch": 0.2812121212121212, "grad_norm": 7.705667495727539, "learning_rate": 8.25998533514599e-06, "loss": 0.4507, "step": 2552 }, { "epoch": 0.2813223140495868, "grad_norm": 6.605652332305908, "learning_rate": 8.258659432907877e-06, "loss": 0.3666, "step": 2553 }, { "epoch": 0.28143250688705235, "grad_norm": 7.341992378234863, "learning_rate": 8.257333132196461e-06, "loss": 0.4708, "step": 2554 }, { "epoch": 0.2815426997245179, "grad_norm": 4.790026664733887, "learning_rate": 8.256006433173925e-06, "loss": 0.3901, "step": 2555 }, { "epoch": 0.28165289256198345, "grad_norm": 8.660374641418457, "learning_rate": 8.254679336002498e-06, "loss": 0.398, "step": 2556 }, { "epoch": 0.28176308539944905, "grad_norm": 7.649413108825684, "learning_rate": 8.25335184084446e-06, "loss": 0.4353, "step": 2557 }, { "epoch": 0.2818732782369146, "grad_norm": 17.531639099121094, "learning_rate": 8.252023947862143e-06, "loss": 0.5034, "step": 2558 }, { "epoch": 0.28198347107438015, "grad_norm": 8.118510246276855, "learning_rate": 8.250695657217919e-06, "loss": 0.4191, "step": 2559 }, { "epoch": 0.2820936639118457, "grad_norm": 8.229848861694336, "learning_rate": 8.249366969074215e-06, "loss": 0.4461, "step": 2560 }, { "epoch": 0.2822038567493113, "grad_norm": 9.5634765625, "learning_rate": 8.248037883593502e-06, "loss": 0.395, "step": 2561 }, { "epoch": 0.28231404958677686, "grad_norm": 4.810074806213379, "learning_rate": 8.246708400938306e-06, "loss": 0.4922, "step": 2562 }, { "epoch": 0.2824242424242424, "grad_norm": 16.95096778869629, "learning_rate": 8.245378521271196e-06, "loss": 0.4841, "step": 2563 }, { "epoch": 0.28253443526170796, "grad_norm": 7.1187896728515625, "learning_rate": 8.244048244754792e-06, "loss": 0.3845, "step": 2564 }, { "epoch": 0.28264462809917357, "grad_norm": 4.641600608825684, "learning_rate": 8.242717571551763e-06, "loss": 0.4276, "step": 2565 }, { "epoch": 0.2827548209366391, "grad_norm": 7.558225631713867, "learning_rate": 8.241386501824824e-06, "loss": 0.4607, "step": 2566 }, { "epoch": 0.28286501377410467, "grad_norm": 7.707322597503662, "learning_rate": 8.24005503573674e-06, "loss": 0.474, "step": 2567 }, { "epoch": 0.2829752066115703, "grad_norm": 12.64372730255127, "learning_rate": 8.238723173450326e-06, "loss": 0.4801, "step": 2568 }, { "epoch": 0.2830853994490358, "grad_norm": 5.532975673675537, "learning_rate": 8.237390915128443e-06, "loss": 0.4439, "step": 2569 }, { "epoch": 0.2831955922865014, "grad_norm": 6.830770969390869, "learning_rate": 8.236058260934e-06, "loss": 0.4411, "step": 2570 }, { "epoch": 0.2833057851239669, "grad_norm": 7.833096981048584, "learning_rate": 8.234725211029957e-06, "loss": 0.4083, "step": 2571 }, { "epoch": 0.28341597796143253, "grad_norm": 6.798170566558838, "learning_rate": 8.233391765579323e-06, "loss": 0.4258, "step": 2572 }, { "epoch": 0.2835261707988981, "grad_norm": 5.830904006958008, "learning_rate": 8.232057924745152e-06, "loss": 0.3322, "step": 2573 }, { "epoch": 0.28363636363636363, "grad_norm": 9.897858619689941, "learning_rate": 8.230723688690546e-06, "loss": 0.511, "step": 2574 }, { "epoch": 0.2837465564738292, "grad_norm": 9.8292818069458, "learning_rate": 8.22938905757866e-06, "loss": 0.5303, "step": 2575 }, { "epoch": 0.2838567493112948, "grad_norm": 5.095852851867676, "learning_rate": 8.228054031572692e-06, "loss": 0.4417, "step": 2576 }, { "epoch": 0.28396694214876034, "grad_norm": 7.318708419799805, "learning_rate": 8.226718610835894e-06, "loss": 0.4943, "step": 2577 }, { "epoch": 0.2840771349862259, "grad_norm": 5.821365833282471, "learning_rate": 8.225382795531558e-06, "loss": 0.479, "step": 2578 }, { "epoch": 0.28418732782369144, "grad_norm": 5.989372253417969, "learning_rate": 8.224046585823035e-06, "loss": 0.3857, "step": 2579 }, { "epoch": 0.28429752066115704, "grad_norm": 8.840926170349121, "learning_rate": 8.222709981873716e-06, "loss": 0.4709, "step": 2580 }, { "epoch": 0.2844077134986226, "grad_norm": 10.142492294311523, "learning_rate": 8.221372983847043e-06, "loss": 0.5157, "step": 2581 }, { "epoch": 0.28451790633608814, "grad_norm": 11.084004402160645, "learning_rate": 8.220035591906505e-06, "loss": 0.5256, "step": 2582 }, { "epoch": 0.2846280991735537, "grad_norm": 7.02859354019165, "learning_rate": 8.21869780621564e-06, "loss": 0.4386, "step": 2583 }, { "epoch": 0.2847382920110193, "grad_norm": 7.2068939208984375, "learning_rate": 8.217359626938037e-06, "loss": 0.4258, "step": 2584 }, { "epoch": 0.28484848484848485, "grad_norm": 6.029386520385742, "learning_rate": 8.216021054237329e-06, "loss": 0.4751, "step": 2585 }, { "epoch": 0.2849586776859504, "grad_norm": 5.800614833831787, "learning_rate": 8.214682088277195e-06, "loss": 0.3853, "step": 2586 }, { "epoch": 0.28506887052341595, "grad_norm": 5.438484191894531, "learning_rate": 8.21334272922137e-06, "loss": 0.4587, "step": 2587 }, { "epoch": 0.28517906336088156, "grad_norm": 5.722385883331299, "learning_rate": 8.212002977233632e-06, "loss": 0.3503, "step": 2588 }, { "epoch": 0.2852892561983471, "grad_norm": 5.973930835723877, "learning_rate": 8.210662832477806e-06, "loss": 0.3744, "step": 2589 }, { "epoch": 0.28539944903581266, "grad_norm": 5.600560665130615, "learning_rate": 8.209322295117768e-06, "loss": 0.468, "step": 2590 }, { "epoch": 0.28550964187327826, "grad_norm": 7.9106831550598145, "learning_rate": 8.20798136531744e-06, "loss": 0.4752, "step": 2591 }, { "epoch": 0.2856198347107438, "grad_norm": 9.829314231872559, "learning_rate": 8.206640043240793e-06, "loss": 0.3866, "step": 2592 }, { "epoch": 0.28573002754820936, "grad_norm": 7.940008640289307, "learning_rate": 8.205298329051845e-06, "loss": 0.434, "step": 2593 }, { "epoch": 0.2858402203856749, "grad_norm": 8.637011528015137, "learning_rate": 8.203956222914665e-06, "loss": 0.4344, "step": 2594 }, { "epoch": 0.2859504132231405, "grad_norm": 5.530643463134766, "learning_rate": 8.202613724993364e-06, "loss": 0.3966, "step": 2595 }, { "epoch": 0.28606060606060607, "grad_norm": 4.837628364562988, "learning_rate": 8.201270835452108e-06, "loss": 0.375, "step": 2596 }, { "epoch": 0.2861707988980716, "grad_norm": 4.013152122497559, "learning_rate": 8.199927554455106e-06, "loss": 0.3959, "step": 2597 }, { "epoch": 0.28628099173553717, "grad_norm": 5.780310153961182, "learning_rate": 8.198583882166613e-06, "loss": 0.4421, "step": 2598 }, { "epoch": 0.2863911845730028, "grad_norm": 6.987085342407227, "learning_rate": 8.197239818750942e-06, "loss": 0.4186, "step": 2599 }, { "epoch": 0.2865013774104683, "grad_norm": 4.513391017913818, "learning_rate": 8.19589536437244e-06, "loss": 0.4329, "step": 2600 }, { "epoch": 0.2866115702479339, "grad_norm": 5.8561787605285645, "learning_rate": 8.194550519195512e-06, "loss": 0.4044, "step": 2601 }, { "epoch": 0.28672176308539943, "grad_norm": 11.793116569519043, "learning_rate": 8.193205283384608e-06, "loss": 0.4767, "step": 2602 }, { "epoch": 0.28683195592286503, "grad_norm": 9.649199485778809, "learning_rate": 8.191859657104225e-06, "loss": 0.4942, "step": 2603 }, { "epoch": 0.2869421487603306, "grad_norm": 10.381978034973145, "learning_rate": 8.190513640518906e-06, "loss": 0.4311, "step": 2604 }, { "epoch": 0.28705234159779613, "grad_norm": 7.879476070404053, "learning_rate": 8.189167233793248e-06, "loss": 0.3897, "step": 2605 }, { "epoch": 0.2871625344352617, "grad_norm": 8.143092155456543, "learning_rate": 8.187820437091885e-06, "loss": 0.4834, "step": 2606 }, { "epoch": 0.2872727272727273, "grad_norm": 8.470446586608887, "learning_rate": 8.18647325057951e-06, "loss": 0.578, "step": 2607 }, { "epoch": 0.28738292011019284, "grad_norm": 5.060790538787842, "learning_rate": 8.185125674420857e-06, "loss": 0.4648, "step": 2608 }, { "epoch": 0.2874931129476584, "grad_norm": 8.086231231689453, "learning_rate": 8.18377770878071e-06, "loss": 0.4915, "step": 2609 }, { "epoch": 0.28760330578512394, "grad_norm": 9.0546293258667, "learning_rate": 8.182429353823901e-06, "loss": 0.4771, "step": 2610 }, { "epoch": 0.28771349862258955, "grad_norm": 9.38154125213623, "learning_rate": 8.181080609715309e-06, "loss": 0.4612, "step": 2611 }, { "epoch": 0.2878236914600551, "grad_norm": 9.869471549987793, "learning_rate": 8.179731476619858e-06, "loss": 0.5246, "step": 2612 }, { "epoch": 0.28793388429752065, "grad_norm": 6.299503803253174, "learning_rate": 8.178381954702521e-06, "loss": 0.3416, "step": 2613 }, { "epoch": 0.2880440771349862, "grad_norm": 5.722007751464844, "learning_rate": 8.177032044128323e-06, "loss": 0.4092, "step": 2614 }, { "epoch": 0.2881542699724518, "grad_norm": 8.35212516784668, "learning_rate": 8.175681745062329e-06, "loss": 0.378, "step": 2615 }, { "epoch": 0.28826446280991735, "grad_norm": 5.700195789337158, "learning_rate": 8.17433105766966e-06, "loss": 0.4921, "step": 2616 }, { "epoch": 0.2883746556473829, "grad_norm": 7.712435245513916, "learning_rate": 8.172979982115477e-06, "loss": 0.5392, "step": 2617 }, { "epoch": 0.2884848484848485, "grad_norm": 4.76263952255249, "learning_rate": 8.17162851856499e-06, "loss": 0.4437, "step": 2618 }, { "epoch": 0.28859504132231406, "grad_norm": 6.987651348114014, "learning_rate": 8.170276667183461e-06, "loss": 0.2997, "step": 2619 }, { "epoch": 0.2887052341597796, "grad_norm": 9.844829559326172, "learning_rate": 8.168924428136193e-06, "loss": 0.4491, "step": 2620 }, { "epoch": 0.28881542699724516, "grad_norm": 6.718286514282227, "learning_rate": 8.167571801588542e-06, "loss": 0.4666, "step": 2621 }, { "epoch": 0.28892561983471077, "grad_norm": 8.184840202331543, "learning_rate": 8.16621878770591e-06, "loss": 0.474, "step": 2622 }, { "epoch": 0.2890358126721763, "grad_norm": 5.251203536987305, "learning_rate": 8.164865386653743e-06, "loss": 0.3551, "step": 2623 }, { "epoch": 0.28914600550964187, "grad_norm": 4.9718475341796875, "learning_rate": 8.163511598597537e-06, "loss": 0.415, "step": 2624 }, { "epoch": 0.2892561983471074, "grad_norm": 5.431325435638428, "learning_rate": 8.162157423702836e-06, "loss": 0.3606, "step": 2625 }, { "epoch": 0.289366391184573, "grad_norm": 7.744053840637207, "learning_rate": 8.16080286213523e-06, "loss": 0.503, "step": 2626 }, { "epoch": 0.2894765840220386, "grad_norm": 5.625741004943848, "learning_rate": 8.159447914060357e-06, "loss": 0.3628, "step": 2627 }, { "epoch": 0.2895867768595041, "grad_norm": 6.573037624359131, "learning_rate": 8.1580925796439e-06, "loss": 0.4738, "step": 2628 }, { "epoch": 0.2896969696969697, "grad_norm": 10.216695785522461, "learning_rate": 8.156736859051592e-06, "loss": 0.4733, "step": 2629 }, { "epoch": 0.2898071625344353, "grad_norm": 6.306691646575928, "learning_rate": 8.155380752449213e-06, "loss": 0.4845, "step": 2630 }, { "epoch": 0.28991735537190083, "grad_norm": 5.268336296081543, "learning_rate": 8.154024260002592e-06, "loss": 0.4264, "step": 2631 }, { "epoch": 0.2900275482093664, "grad_norm": 8.599966049194336, "learning_rate": 8.152667381877596e-06, "loss": 0.4741, "step": 2632 }, { "epoch": 0.29013774104683193, "grad_norm": 6.03732967376709, "learning_rate": 8.151310118240155e-06, "loss": 0.4128, "step": 2633 }, { "epoch": 0.29024793388429754, "grad_norm": 4.368444919586182, "learning_rate": 8.149952469256228e-06, "loss": 0.43, "step": 2634 }, { "epoch": 0.2903581267217631, "grad_norm": 5.164309024810791, "learning_rate": 8.148594435091837e-06, "loss": 0.3865, "step": 2635 }, { "epoch": 0.29046831955922864, "grad_norm": 11.640958786010742, "learning_rate": 8.14723601591304e-06, "loss": 0.5386, "step": 2636 }, { "epoch": 0.2905785123966942, "grad_norm": 4.805738925933838, "learning_rate": 8.145877211885949e-06, "loss": 0.3438, "step": 2637 }, { "epoch": 0.2906887052341598, "grad_norm": 5.049983501434326, "learning_rate": 8.144518023176718e-06, "loss": 0.4094, "step": 2638 }, { "epoch": 0.29079889807162534, "grad_norm": 4.325822353363037, "learning_rate": 8.14315844995155e-06, "loss": 0.447, "step": 2639 }, { "epoch": 0.2909090909090909, "grad_norm": 5.957462787628174, "learning_rate": 8.141798492376702e-06, "loss": 0.47, "step": 2640 }, { "epoch": 0.2910192837465565, "grad_norm": 9.82888126373291, "learning_rate": 8.140438150618463e-06, "loss": 0.4671, "step": 2641 }, { "epoch": 0.29112947658402205, "grad_norm": 7.724475383758545, "learning_rate": 8.139077424843183e-06, "loss": 0.4302, "step": 2642 }, { "epoch": 0.2912396694214876, "grad_norm": 5.9646100997924805, "learning_rate": 8.13771631521725e-06, "loss": 0.488, "step": 2643 }, { "epoch": 0.29134986225895315, "grad_norm": 5.967033863067627, "learning_rate": 8.136354821907104e-06, "loss": 0.4025, "step": 2644 }, { "epoch": 0.29146005509641876, "grad_norm": 9.786425590515137, "learning_rate": 8.13499294507923e-06, "loss": 0.4533, "step": 2645 }, { "epoch": 0.2915702479338843, "grad_norm": 9.216123580932617, "learning_rate": 8.133630684900162e-06, "loss": 0.5309, "step": 2646 }, { "epoch": 0.29168044077134986, "grad_norm": 12.148828506469727, "learning_rate": 8.132268041536476e-06, "loss": 0.4823, "step": 2647 }, { "epoch": 0.2917906336088154, "grad_norm": 7.568973064422607, "learning_rate": 8.130905015154799e-06, "loss": 0.5063, "step": 2648 }, { "epoch": 0.291900826446281, "grad_norm": 6.7498779296875, "learning_rate": 8.129541605921803e-06, "loss": 0.4117, "step": 2649 }, { "epoch": 0.29201101928374656, "grad_norm": 5.429599761962891, "learning_rate": 8.12817781400421e-06, "loss": 0.4413, "step": 2650 }, { "epoch": 0.2921212121212121, "grad_norm": 6.940309047698975, "learning_rate": 8.126813639568783e-06, "loss": 0.4633, "step": 2651 }, { "epoch": 0.29223140495867767, "grad_norm": 11.138673782348633, "learning_rate": 8.125449082782337e-06, "loss": 0.4829, "step": 2652 }, { "epoch": 0.29234159779614327, "grad_norm": 6.361445903778076, "learning_rate": 8.124084143811732e-06, "loss": 0.3944, "step": 2653 }, { "epoch": 0.2924517906336088, "grad_norm": 5.243053436279297, "learning_rate": 8.122718822823877e-06, "loss": 0.493, "step": 2654 }, { "epoch": 0.29256198347107437, "grad_norm": 5.032866954803467, "learning_rate": 8.12135311998572e-06, "loss": 0.4236, "step": 2655 }, { "epoch": 0.2926721763085399, "grad_norm": 9.60926628112793, "learning_rate": 8.119987035464263e-06, "loss": 0.4285, "step": 2656 }, { "epoch": 0.29278236914600553, "grad_norm": 5.861676216125488, "learning_rate": 8.118620569426554e-06, "loss": 0.4199, "step": 2657 }, { "epoch": 0.2928925619834711, "grad_norm": 9.698776245117188, "learning_rate": 8.117253722039686e-06, "loss": 0.4075, "step": 2658 }, { "epoch": 0.29300275482093663, "grad_norm": 5.61128044128418, "learning_rate": 8.115886493470797e-06, "loss": 0.3813, "step": 2659 }, { "epoch": 0.2931129476584022, "grad_norm": 6.095882415771484, "learning_rate": 8.114518883887076e-06, "loss": 0.3672, "step": 2660 }, { "epoch": 0.2932231404958678, "grad_norm": 6.09226655960083, "learning_rate": 8.113150893455756e-06, "loss": 0.4277, "step": 2661 }, { "epoch": 0.29333333333333333, "grad_norm": 5.465144634246826, "learning_rate": 8.111782522344114e-06, "loss": 0.3789, "step": 2662 }, { "epoch": 0.2934435261707989, "grad_norm": 7.330477714538574, "learning_rate": 8.11041377071948e-06, "loss": 0.3919, "step": 2663 }, { "epoch": 0.29355371900826444, "grad_norm": 6.909572124481201, "learning_rate": 8.109044638749224e-06, "loss": 0.4748, "step": 2664 }, { "epoch": 0.29366391184573004, "grad_norm": 5.801069736480713, "learning_rate": 8.107675126600767e-06, "loss": 0.4622, "step": 2665 }, { "epoch": 0.2937741046831956, "grad_norm": 7.1224188804626465, "learning_rate": 8.106305234441573e-06, "loss": 0.4052, "step": 2666 }, { "epoch": 0.29388429752066114, "grad_norm": 5.903815746307373, "learning_rate": 8.104934962439157e-06, "loss": 0.3944, "step": 2667 }, { "epoch": 0.29399449035812675, "grad_norm": 4.441305637359619, "learning_rate": 8.103564310761077e-06, "loss": 0.4419, "step": 2668 }, { "epoch": 0.2941046831955923, "grad_norm": 11.590902328491211, "learning_rate": 8.102193279574935e-06, "loss": 0.4373, "step": 2669 }, { "epoch": 0.29421487603305785, "grad_norm": 5.8332953453063965, "learning_rate": 8.100821869048385e-06, "loss": 0.4732, "step": 2670 }, { "epoch": 0.2943250688705234, "grad_norm": 5.280449867248535, "learning_rate": 8.099450079349124e-06, "loss": 0.4443, "step": 2671 }, { "epoch": 0.294435261707989, "grad_norm": 10.186681747436523, "learning_rate": 8.098077910644901e-06, "loss": 0.4394, "step": 2672 }, { "epoch": 0.29454545454545455, "grad_norm": 8.85413932800293, "learning_rate": 8.096705363103499e-06, "loss": 0.4828, "step": 2673 }, { "epoch": 0.2946556473829201, "grad_norm": 9.561735153198242, "learning_rate": 8.095332436892761e-06, "loss": 0.3634, "step": 2674 }, { "epoch": 0.29476584022038566, "grad_norm": 10.236101150512695, "learning_rate": 8.093959132180567e-06, "loss": 0.4397, "step": 2675 }, { "epoch": 0.29487603305785126, "grad_norm": 7.617365837097168, "learning_rate": 8.092585449134848e-06, "loss": 0.4184, "step": 2676 }, { "epoch": 0.2949862258953168, "grad_norm": 4.575974941253662, "learning_rate": 8.091211387923578e-06, "loss": 0.4487, "step": 2677 }, { "epoch": 0.29509641873278236, "grad_norm": 7.067541122436523, "learning_rate": 8.089836948714782e-06, "loss": 0.4459, "step": 2678 }, { "epoch": 0.2952066115702479, "grad_norm": 7.422731876373291, "learning_rate": 8.088462131676527e-06, "loss": 0.4099, "step": 2679 }, { "epoch": 0.2953168044077135, "grad_norm": 7.212629795074463, "learning_rate": 8.087086936976927e-06, "loss": 0.3918, "step": 2680 }, { "epoch": 0.29542699724517907, "grad_norm": 4.199789524078369, "learning_rate": 8.08571136478414e-06, "loss": 0.3815, "step": 2681 }, { "epoch": 0.2955371900826446, "grad_norm": 9.775740623474121, "learning_rate": 8.084335415266382e-06, "loss": 0.4704, "step": 2682 }, { "epoch": 0.29564738292011017, "grad_norm": 9.210233688354492, "learning_rate": 8.082959088591896e-06, "loss": 0.3792, "step": 2683 }, { "epoch": 0.2957575757575758, "grad_norm": 5.724307537078857, "learning_rate": 8.081582384928983e-06, "loss": 0.441, "step": 2684 }, { "epoch": 0.2958677685950413, "grad_norm": 6.285418510437012, "learning_rate": 8.080205304445992e-06, "loss": 0.4501, "step": 2685 }, { "epoch": 0.2959779614325069, "grad_norm": 5.50066614151001, "learning_rate": 8.078827847311313e-06, "loss": 0.4617, "step": 2686 }, { "epoch": 0.2960881542699724, "grad_norm": 4.148617267608643, "learning_rate": 8.077450013693382e-06, "loss": 0.4431, "step": 2687 }, { "epoch": 0.29619834710743803, "grad_norm": 8.889013290405273, "learning_rate": 8.076071803760683e-06, "loss": 0.4252, "step": 2688 }, { "epoch": 0.2963085399449036, "grad_norm": 7.282867431640625, "learning_rate": 8.074693217681747e-06, "loss": 0.3735, "step": 2689 }, { "epoch": 0.29641873278236913, "grad_norm": 9.445550918579102, "learning_rate": 8.073314255625144e-06, "loss": 0.5431, "step": 2690 }, { "epoch": 0.29652892561983474, "grad_norm": 12.855957984924316, "learning_rate": 8.071934917759502e-06, "loss": 0.5333, "step": 2691 }, { "epoch": 0.2966391184573003, "grad_norm": 6.184736728668213, "learning_rate": 8.070555204253485e-06, "loss": 0.477, "step": 2692 }, { "epoch": 0.29674931129476584, "grad_norm": 10.550787925720215, "learning_rate": 8.069175115275808e-06, "loss": 0.4769, "step": 2693 }, { "epoch": 0.2968595041322314, "grad_norm": 8.451855659484863, "learning_rate": 8.067794650995226e-06, "loss": 0.4841, "step": 2694 }, { "epoch": 0.296969696969697, "grad_norm": 12.774663925170898, "learning_rate": 8.066413811580548e-06, "loss": 0.4728, "step": 2695 }, { "epoch": 0.29707988980716254, "grad_norm": 12.18591594696045, "learning_rate": 8.065032597200624e-06, "loss": 0.5774, "step": 2696 }, { "epoch": 0.2971900826446281, "grad_norm": 10.858301162719727, "learning_rate": 8.063651008024351e-06, "loss": 0.5422, "step": 2697 }, { "epoch": 0.29730027548209365, "grad_norm": 5.995025634765625, "learning_rate": 8.06226904422067e-06, "loss": 0.4087, "step": 2698 }, { "epoch": 0.29741046831955925, "grad_norm": 8.615156173706055, "learning_rate": 8.06088670595857e-06, "loss": 0.4563, "step": 2699 }, { "epoch": 0.2975206611570248, "grad_norm": 8.634101867675781, "learning_rate": 8.05950399340709e-06, "loss": 0.4195, "step": 2700 }, { "epoch": 0.29763085399449035, "grad_norm": 5.616157531738281, "learning_rate": 8.058120906735304e-06, "loss": 0.3747, "step": 2701 }, { "epoch": 0.2977410468319559, "grad_norm": 4.679882526397705, "learning_rate": 8.056737446112338e-06, "loss": 0.3641, "step": 2702 }, { "epoch": 0.2978512396694215, "grad_norm": 5.062495231628418, "learning_rate": 8.055353611707364e-06, "loss": 0.5009, "step": 2703 }, { "epoch": 0.29796143250688706, "grad_norm": 4.492841720581055, "learning_rate": 8.0539694036896e-06, "loss": 0.3847, "step": 2704 }, { "epoch": 0.2980716253443526, "grad_norm": 7.658977508544922, "learning_rate": 8.052584822228312e-06, "loss": 0.4554, "step": 2705 }, { "epoch": 0.29818181818181816, "grad_norm": 7.011204719543457, "learning_rate": 8.051199867492803e-06, "loss": 0.3854, "step": 2706 }, { "epoch": 0.29829201101928376, "grad_norm": 6.54036283493042, "learning_rate": 8.04981453965243e-06, "loss": 0.4347, "step": 2707 }, { "epoch": 0.2984022038567493, "grad_norm": 4.983130931854248, "learning_rate": 8.048428838876593e-06, "loss": 0.4395, "step": 2708 }, { "epoch": 0.29851239669421487, "grad_norm": 4.8552374839782715, "learning_rate": 8.047042765334737e-06, "loss": 0.4574, "step": 2709 }, { "epoch": 0.2986225895316804, "grad_norm": 4.62309455871582, "learning_rate": 8.045656319196351e-06, "loss": 0.467, "step": 2710 }, { "epoch": 0.298732782369146, "grad_norm": 8.67222785949707, "learning_rate": 8.044269500630975e-06, "loss": 0.503, "step": 2711 }, { "epoch": 0.29884297520661157, "grad_norm": 6.187382698059082, "learning_rate": 8.042882309808187e-06, "loss": 0.3817, "step": 2712 }, { "epoch": 0.2989531680440771, "grad_norm": 7.065738201141357, "learning_rate": 8.041494746897618e-06, "loss": 0.4222, "step": 2713 }, { "epoch": 0.2990633608815427, "grad_norm": 10.733210563659668, "learning_rate": 8.040106812068943e-06, "loss": 0.3959, "step": 2714 }, { "epoch": 0.2991735537190083, "grad_norm": 8.270208358764648, "learning_rate": 8.03871850549187e-06, "loss": 0.4313, "step": 2715 }, { "epoch": 0.29928374655647383, "grad_norm": 8.04257583618164, "learning_rate": 8.037329827336176e-06, "loss": 0.4242, "step": 2716 }, { "epoch": 0.2993939393939394, "grad_norm": 6.929685592651367, "learning_rate": 8.035940777771664e-06, "loss": 0.5107, "step": 2717 }, { "epoch": 0.299504132231405, "grad_norm": 6.822888374328613, "learning_rate": 8.03455135696819e-06, "loss": 0.3517, "step": 2718 }, { "epoch": 0.29961432506887054, "grad_norm": 7.220570087432861, "learning_rate": 8.033161565095654e-06, "loss": 0.4018, "step": 2719 }, { "epoch": 0.2997245179063361, "grad_norm": 8.224783897399902, "learning_rate": 8.031771402324001e-06, "loss": 0.4423, "step": 2720 }, { "epoch": 0.29983471074380164, "grad_norm": 5.0163254737854, "learning_rate": 8.030380868823224e-06, "loss": 0.4316, "step": 2721 }, { "epoch": 0.29994490358126724, "grad_norm": 7.272629737854004, "learning_rate": 8.028989964763356e-06, "loss": 0.3644, "step": 2722 }, { "epoch": 0.3000550964187328, "grad_norm": 9.349287033081055, "learning_rate": 8.027598690314481e-06, "loss": 0.431, "step": 2723 }, { "epoch": 0.30016528925619834, "grad_norm": 7.657289505004883, "learning_rate": 8.026207045646728e-06, "loss": 0.3597, "step": 2724 }, { "epoch": 0.30016528925619834, "eval_loss": 0.44847550988197327, "eval_runtime": 41.9343, "eval_samples_per_second": 17.504, "eval_steps_per_second": 2.194, "step": 2724 }, { "epoch": 0.3002754820936639, "grad_norm": 9.78979778289795, "learning_rate": 8.024815030930264e-06, "loss": 0.4198, "step": 2725 }, { "epoch": 0.3003856749311295, "grad_norm": 6.471761226654053, "learning_rate": 8.023422646335311e-06, "loss": 0.4108, "step": 2726 }, { "epoch": 0.30049586776859505, "grad_norm": 9.229776382446289, "learning_rate": 8.022029892032128e-06, "loss": 0.5366, "step": 2727 }, { "epoch": 0.3006060606060606, "grad_norm": 12.983948707580566, "learning_rate": 8.020636768191027e-06, "loss": 0.516, "step": 2728 }, { "epoch": 0.30071625344352615, "grad_norm": 5.297845363616943, "learning_rate": 8.019243274982357e-06, "loss": 0.3972, "step": 2729 }, { "epoch": 0.30082644628099175, "grad_norm": 8.642130851745605, "learning_rate": 8.017849412576517e-06, "loss": 0.4109, "step": 2730 }, { "epoch": 0.3009366391184573, "grad_norm": 6.278255462646484, "learning_rate": 8.016455181143954e-06, "loss": 0.4478, "step": 2731 }, { "epoch": 0.30104683195592286, "grad_norm": 5.403011798858643, "learning_rate": 8.015060580855154e-06, "loss": 0.3878, "step": 2732 }, { "epoch": 0.3011570247933884, "grad_norm": 6.093563556671143, "learning_rate": 8.01366561188065e-06, "loss": 0.3991, "step": 2733 }, { "epoch": 0.301267217630854, "grad_norm": 5.804359436035156, "learning_rate": 8.012270274391022e-06, "loss": 0.4626, "step": 2734 }, { "epoch": 0.30137741046831956, "grad_norm": 11.595686912536621, "learning_rate": 8.010874568556892e-06, "loss": 0.5503, "step": 2735 }, { "epoch": 0.3014876033057851, "grad_norm": 5.565887928009033, "learning_rate": 8.00947849454893e-06, "loss": 0.4032, "step": 2736 }, { "epoch": 0.30159779614325066, "grad_norm": 10.746644020080566, "learning_rate": 8.008082052537848e-06, "loss": 0.4069, "step": 2737 }, { "epoch": 0.30170798898071627, "grad_norm": 6.30381965637207, "learning_rate": 8.006685242694409e-06, "loss": 0.3726, "step": 2738 }, { "epoch": 0.3018181818181818, "grad_norm": 10.491253852844238, "learning_rate": 8.005288065189414e-06, "loss": 0.5036, "step": 2739 }, { "epoch": 0.30192837465564737, "grad_norm": 6.20374870300293, "learning_rate": 8.003890520193711e-06, "loss": 0.4082, "step": 2740 }, { "epoch": 0.302038567493113, "grad_norm": 6.9139227867126465, "learning_rate": 8.002492607878197e-06, "loss": 0.4841, "step": 2741 }, { "epoch": 0.3021487603305785, "grad_norm": 8.514464378356934, "learning_rate": 8.001094328413807e-06, "loss": 0.4927, "step": 2742 }, { "epoch": 0.3022589531680441, "grad_norm": 10.513345718383789, "learning_rate": 7.999695681971525e-06, "loss": 0.4676, "step": 2743 }, { "epoch": 0.3023691460055096, "grad_norm": 8.520428657531738, "learning_rate": 7.998296668722381e-06, "loss": 0.442, "step": 2744 }, { "epoch": 0.30247933884297523, "grad_norm": 7.687136173248291, "learning_rate": 7.996897288837449e-06, "loss": 0.3255, "step": 2745 }, { "epoch": 0.3025895316804408, "grad_norm": 8.961156845092773, "learning_rate": 7.995497542487845e-06, "loss": 0.4728, "step": 2746 }, { "epoch": 0.30269972451790633, "grad_norm": 14.33635139465332, "learning_rate": 7.994097429844732e-06, "loss": 0.5018, "step": 2747 }, { "epoch": 0.3028099173553719, "grad_norm": 5.116520404815674, "learning_rate": 7.992696951079318e-06, "loss": 0.4029, "step": 2748 }, { "epoch": 0.3029201101928375, "grad_norm": 11.656856536865234, "learning_rate": 7.991296106362855e-06, "loss": 0.5388, "step": 2749 }, { "epoch": 0.30303030303030304, "grad_norm": 7.711641311645508, "learning_rate": 7.989894895866643e-06, "loss": 0.4161, "step": 2750 }, { "epoch": 0.3031404958677686, "grad_norm": 6.066614627838135, "learning_rate": 7.988493319762018e-06, "loss": 0.4009, "step": 2751 }, { "epoch": 0.30325068870523414, "grad_norm": 5.1315717697143555, "learning_rate": 7.987091378220376e-06, "loss": 0.4176, "step": 2752 }, { "epoch": 0.30336088154269975, "grad_norm": 6.4421844482421875, "learning_rate": 7.985689071413138e-06, "loss": 0.4541, "step": 2753 }, { "epoch": 0.3034710743801653, "grad_norm": 9.538857460021973, "learning_rate": 7.984286399511786e-06, "loss": 0.3356, "step": 2754 }, { "epoch": 0.30358126721763085, "grad_norm": 11.166560173034668, "learning_rate": 7.982883362687839e-06, "loss": 0.384, "step": 2755 }, { "epoch": 0.3036914600550964, "grad_norm": 5.413555145263672, "learning_rate": 7.981479961112863e-06, "loss": 0.4505, "step": 2756 }, { "epoch": 0.303801652892562, "grad_norm": 6.005543231964111, "learning_rate": 7.980076194958468e-06, "loss": 0.462, "step": 2757 }, { "epoch": 0.30391184573002755, "grad_norm": 9.219489097595215, "learning_rate": 7.978672064396307e-06, "loss": 0.4541, "step": 2758 }, { "epoch": 0.3040220385674931, "grad_norm": 5.687552452087402, "learning_rate": 7.977267569598082e-06, "loss": 0.3804, "step": 2759 }, { "epoch": 0.30413223140495865, "grad_norm": 7.908405780792236, "learning_rate": 7.975862710735531e-06, "loss": 0.481, "step": 2760 }, { "epoch": 0.30424242424242426, "grad_norm": 8.924123764038086, "learning_rate": 7.974457487980447e-06, "loss": 0.3272, "step": 2761 }, { "epoch": 0.3043526170798898, "grad_norm": 7.286680698394775, "learning_rate": 7.97305190150466e-06, "loss": 0.4686, "step": 2762 }, { "epoch": 0.30446280991735536, "grad_norm": 13.811619758605957, "learning_rate": 7.97164595148005e-06, "loss": 0.478, "step": 2763 }, { "epoch": 0.3045730027548209, "grad_norm": 4.992403030395508, "learning_rate": 7.970239638078536e-06, "loss": 0.3503, "step": 2764 }, { "epoch": 0.3046831955922865, "grad_norm": 6.27252721786499, "learning_rate": 7.968832961472084e-06, "loss": 0.3845, "step": 2765 }, { "epoch": 0.30479338842975207, "grad_norm": 7.079043865203857, "learning_rate": 7.967425921832705e-06, "loss": 0.3657, "step": 2766 }, { "epoch": 0.3049035812672176, "grad_norm": 9.025582313537598, "learning_rate": 7.966018519332453e-06, "loss": 0.4906, "step": 2767 }, { "epoch": 0.3050137741046832, "grad_norm": 5.517977237701416, "learning_rate": 7.964610754143427e-06, "loss": 0.4919, "step": 2768 }, { "epoch": 0.30512396694214877, "grad_norm": 5.134316921234131, "learning_rate": 7.96320262643777e-06, "loss": 0.4195, "step": 2769 }, { "epoch": 0.3052341597796143, "grad_norm": 7.691980361938477, "learning_rate": 7.961794136387672e-06, "loss": 0.5515, "step": 2770 }, { "epoch": 0.3053443526170799, "grad_norm": 6.8551435470581055, "learning_rate": 7.960385284165364e-06, "loss": 0.4347, "step": 2771 }, { "epoch": 0.3054545454545455, "grad_norm": 7.076082229614258, "learning_rate": 7.958976069943123e-06, "loss": 0.3281, "step": 2772 }, { "epoch": 0.30556473829201103, "grad_norm": 6.586147308349609, "learning_rate": 7.957566493893268e-06, "loss": 0.3514, "step": 2773 }, { "epoch": 0.3056749311294766, "grad_norm": 10.270943641662598, "learning_rate": 7.956156556188166e-06, "loss": 0.4335, "step": 2774 }, { "epoch": 0.30578512396694213, "grad_norm": 5.657565116882324, "learning_rate": 7.954746257000223e-06, "loss": 0.4522, "step": 2775 }, { "epoch": 0.30589531680440774, "grad_norm": 7.81183385848999, "learning_rate": 7.953335596501892e-06, "loss": 0.4627, "step": 2776 }, { "epoch": 0.3060055096418733, "grad_norm": 8.615931510925293, "learning_rate": 7.951924574865677e-06, "loss": 0.5076, "step": 2777 }, { "epoch": 0.30611570247933884, "grad_norm": 8.567231178283691, "learning_rate": 7.950513192264112e-06, "loss": 0.4647, "step": 2778 }, { "epoch": 0.3062258953168044, "grad_norm": 13.009634971618652, "learning_rate": 7.949101448869787e-06, "loss": 0.4996, "step": 2779 }, { "epoch": 0.30633608815427, "grad_norm": 9.427029609680176, "learning_rate": 7.947689344855331e-06, "loss": 0.5276, "step": 2780 }, { "epoch": 0.30644628099173554, "grad_norm": 6.252676010131836, "learning_rate": 7.946276880393418e-06, "loss": 0.4461, "step": 2781 }, { "epoch": 0.3065564738292011, "grad_norm": 27.598886489868164, "learning_rate": 7.944864055656765e-06, "loss": 0.5092, "step": 2782 }, { "epoch": 0.30666666666666664, "grad_norm": 6.648087024688721, "learning_rate": 7.943450870818137e-06, "loss": 0.4466, "step": 2783 }, { "epoch": 0.30677685950413225, "grad_norm": 6.301035404205322, "learning_rate": 7.942037326050336e-06, "loss": 0.4298, "step": 2784 }, { "epoch": 0.3068870523415978, "grad_norm": 5.16077995300293, "learning_rate": 7.940623421526217e-06, "loss": 0.4579, "step": 2785 }, { "epoch": 0.30699724517906335, "grad_norm": 10.42927074432373, "learning_rate": 7.939209157418669e-06, "loss": 0.4618, "step": 2786 }, { "epoch": 0.3071074380165289, "grad_norm": 4.532710552215576, "learning_rate": 7.937794533900634e-06, "loss": 0.3699, "step": 2787 }, { "epoch": 0.3072176308539945, "grad_norm": 17.369544982910156, "learning_rate": 7.936379551145092e-06, "loss": 0.4519, "step": 2788 }, { "epoch": 0.30732782369146006, "grad_norm": 4.231653213500977, "learning_rate": 7.934964209325071e-06, "loss": 0.3588, "step": 2789 }, { "epoch": 0.3074380165289256, "grad_norm": 6.629281997680664, "learning_rate": 7.933548508613638e-06, "loss": 0.4241, "step": 2790 }, { "epoch": 0.3075482093663912, "grad_norm": 5.726194381713867, "learning_rate": 7.932132449183912e-06, "loss": 0.4725, "step": 2791 }, { "epoch": 0.30765840220385676, "grad_norm": 5.600101947784424, "learning_rate": 7.930716031209043e-06, "loss": 0.4293, "step": 2792 }, { "epoch": 0.3077685950413223, "grad_norm": 12.127617835998535, "learning_rate": 7.929299254862239e-06, "loss": 0.539, "step": 2793 }, { "epoch": 0.30787878787878786, "grad_norm": 6.142868995666504, "learning_rate": 7.927882120316744e-06, "loss": 0.3327, "step": 2794 }, { "epoch": 0.30798898071625347, "grad_norm": 5.726541042327881, "learning_rate": 7.926464627745844e-06, "loss": 0.3919, "step": 2795 }, { "epoch": 0.308099173553719, "grad_norm": 6.68948221206665, "learning_rate": 7.925046777322873e-06, "loss": 0.4414, "step": 2796 }, { "epoch": 0.30820936639118457, "grad_norm": 10.885160446166992, "learning_rate": 7.92362856922121e-06, "loss": 0.4975, "step": 2797 }, { "epoch": 0.3083195592286501, "grad_norm": 6.794032096862793, "learning_rate": 7.922210003614277e-06, "loss": 0.4873, "step": 2798 }, { "epoch": 0.3084297520661157, "grad_norm": 7.296762943267822, "learning_rate": 7.920791080675534e-06, "loss": 0.4076, "step": 2799 }, { "epoch": 0.3085399449035813, "grad_norm": 9.808841705322266, "learning_rate": 7.919371800578489e-06, "loss": 0.463, "step": 2800 }, { "epoch": 0.3086501377410468, "grad_norm": 7.044501304626465, "learning_rate": 7.917952163496695e-06, "loss": 0.4042, "step": 2801 }, { "epoch": 0.3087603305785124, "grad_norm": 8.651405334472656, "learning_rate": 7.916532169603745e-06, "loss": 0.4401, "step": 2802 }, { "epoch": 0.308870523415978, "grad_norm": 8.043083190917969, "learning_rate": 7.915111819073282e-06, "loss": 0.4283, "step": 2803 }, { "epoch": 0.30898071625344353, "grad_norm": 6.39584493637085, "learning_rate": 7.913691112078985e-06, "loss": 0.4579, "step": 2804 }, { "epoch": 0.3090909090909091, "grad_norm": 9.391702651977539, "learning_rate": 7.912270048794582e-06, "loss": 0.5076, "step": 2805 }, { "epoch": 0.30920110192837463, "grad_norm": 9.41595458984375, "learning_rate": 7.910848629393842e-06, "loss": 0.435, "step": 2806 }, { "epoch": 0.30931129476584024, "grad_norm": 6.67035436630249, "learning_rate": 7.909426854050575e-06, "loss": 0.3859, "step": 2807 }, { "epoch": 0.3094214876033058, "grad_norm": 6.468634605407715, "learning_rate": 7.908004722938643e-06, "loss": 0.4393, "step": 2808 }, { "epoch": 0.30953168044077134, "grad_norm": 5.794666290283203, "learning_rate": 7.906582236231942e-06, "loss": 0.4383, "step": 2809 }, { "epoch": 0.3096418732782369, "grad_norm": 6.282586574554443, "learning_rate": 7.905159394104416e-06, "loss": 0.3891, "step": 2810 }, { "epoch": 0.3097520661157025, "grad_norm": 7.2533650398254395, "learning_rate": 7.903736196730053e-06, "loss": 0.3635, "step": 2811 }, { "epoch": 0.30986225895316805, "grad_norm": 8.431426048278809, "learning_rate": 7.902312644282886e-06, "loss": 0.4154, "step": 2812 }, { "epoch": 0.3099724517906336, "grad_norm": 8.024810791015625, "learning_rate": 7.900888736936983e-06, "loss": 0.4344, "step": 2813 }, { "epoch": 0.31008264462809915, "grad_norm": 5.503087043762207, "learning_rate": 7.899464474866466e-06, "loss": 0.4049, "step": 2814 }, { "epoch": 0.31019283746556475, "grad_norm": 12.236490249633789, "learning_rate": 7.898039858245496e-06, "loss": 0.4411, "step": 2815 }, { "epoch": 0.3103030303030303, "grad_norm": 7.569835186004639, "learning_rate": 7.896614887248276e-06, "loss": 0.4911, "step": 2816 }, { "epoch": 0.31041322314049585, "grad_norm": 12.798169136047363, "learning_rate": 7.895189562049051e-06, "loss": 0.5159, "step": 2817 }, { "epoch": 0.31052341597796146, "grad_norm": 5.333597183227539, "learning_rate": 7.893763882822115e-06, "loss": 0.4846, "step": 2818 }, { "epoch": 0.310633608815427, "grad_norm": 6.715102195739746, "learning_rate": 7.8923378497418e-06, "loss": 0.3047, "step": 2819 }, { "epoch": 0.31074380165289256, "grad_norm": 3.3703551292419434, "learning_rate": 7.890911462982482e-06, "loss": 0.3674, "step": 2820 }, { "epoch": 0.3108539944903581, "grad_norm": 6.084781169891357, "learning_rate": 7.889484722718586e-06, "loss": 0.4677, "step": 2821 }, { "epoch": 0.3109641873278237, "grad_norm": 5.297740459442139, "learning_rate": 7.888057629124573e-06, "loss": 0.3949, "step": 2822 }, { "epoch": 0.31107438016528927, "grad_norm": 3.961949348449707, "learning_rate": 7.886630182374947e-06, "loss": 0.3817, "step": 2823 }, { "epoch": 0.3111845730027548, "grad_norm": 8.566644668579102, "learning_rate": 7.885202382644265e-06, "loss": 0.4124, "step": 2824 }, { "epoch": 0.31129476584022037, "grad_norm": 5.797860622406006, "learning_rate": 7.883774230107115e-06, "loss": 0.4373, "step": 2825 }, { "epoch": 0.311404958677686, "grad_norm": 3.940842628479004, "learning_rate": 7.882345724938134e-06, "loss": 0.3944, "step": 2826 }, { "epoch": 0.3115151515151515, "grad_norm": 7.025946617126465, "learning_rate": 7.880916867312003e-06, "loss": 0.4132, "step": 2827 }, { "epoch": 0.3116253443526171, "grad_norm": 10.542343139648438, "learning_rate": 7.879487657403445e-06, "loss": 0.4763, "step": 2828 }, { "epoch": 0.3117355371900826, "grad_norm": 4.249691009521484, "learning_rate": 7.878058095387225e-06, "loss": 0.4395, "step": 2829 }, { "epoch": 0.31184573002754823, "grad_norm": 6.69081974029541, "learning_rate": 7.87662818143815e-06, "loss": 0.2579, "step": 2830 }, { "epoch": 0.3119559228650138, "grad_norm": 11.511882781982422, "learning_rate": 7.875197915731076e-06, "loss": 0.3756, "step": 2831 }, { "epoch": 0.31206611570247933, "grad_norm": 6.804051876068115, "learning_rate": 7.873767298440894e-06, "loss": 0.3955, "step": 2832 }, { "epoch": 0.3121763085399449, "grad_norm": 5.571722507476807, "learning_rate": 7.872336329742543e-06, "loss": 0.4043, "step": 2833 }, { "epoch": 0.3122865013774105, "grad_norm": 8.094839096069336, "learning_rate": 7.870905009811003e-06, "loss": 0.4749, "step": 2834 }, { "epoch": 0.31239669421487604, "grad_norm": 7.383030891418457, "learning_rate": 7.869473338821298e-06, "loss": 0.395, "step": 2835 }, { "epoch": 0.3125068870523416, "grad_norm": 7.508321285247803, "learning_rate": 7.868041316948498e-06, "loss": 0.4089, "step": 2836 }, { "epoch": 0.31261707988980714, "grad_norm": 7.432014465332031, "learning_rate": 7.86660894436771e-06, "loss": 0.5163, "step": 2837 }, { "epoch": 0.31272727272727274, "grad_norm": 12.985380172729492, "learning_rate": 7.865176221254084e-06, "loss": 0.4646, "step": 2838 }, { "epoch": 0.3128374655647383, "grad_norm": 8.084906578063965, "learning_rate": 7.863743147782819e-06, "loss": 0.4098, "step": 2839 }, { "epoch": 0.31294765840220384, "grad_norm": 12.373218536376953, "learning_rate": 7.862309724129152e-06, "loss": 0.4765, "step": 2840 }, { "epoch": 0.31305785123966945, "grad_norm": 7.3020429611206055, "learning_rate": 7.860875950468363e-06, "loss": 0.4874, "step": 2841 }, { "epoch": 0.313168044077135, "grad_norm": 12.162711143493652, "learning_rate": 7.859441826975776e-06, "loss": 0.4098, "step": 2842 }, { "epoch": 0.31327823691460055, "grad_norm": 6.177431106567383, "learning_rate": 7.858007353826759e-06, "loss": 0.4592, "step": 2843 }, { "epoch": 0.3133884297520661, "grad_norm": 6.27357292175293, "learning_rate": 7.856572531196722e-06, "loss": 0.3901, "step": 2844 }, { "epoch": 0.3134986225895317, "grad_norm": 7.773303508758545, "learning_rate": 7.855137359261115e-06, "loss": 0.3877, "step": 2845 }, { "epoch": 0.31360881542699726, "grad_norm": 10.020936965942383, "learning_rate": 7.853701838195432e-06, "loss": 0.5103, "step": 2846 }, { "epoch": 0.3137190082644628, "grad_norm": 6.017946243286133, "learning_rate": 7.852265968175215e-06, "loss": 0.4733, "step": 2847 }, { "epoch": 0.31382920110192836, "grad_norm": 6.867127418518066, "learning_rate": 7.850829749376037e-06, "loss": 0.4119, "step": 2848 }, { "epoch": 0.31393939393939396, "grad_norm": 7.664295673370361, "learning_rate": 7.849393181973527e-06, "loss": 0.4249, "step": 2849 }, { "epoch": 0.3140495867768595, "grad_norm": 11.663252830505371, "learning_rate": 7.847956266143349e-06, "loss": 0.4911, "step": 2850 }, { "epoch": 0.31415977961432506, "grad_norm": 4.299872398376465, "learning_rate": 7.846519002061208e-06, "loss": 0.4928, "step": 2851 }, { "epoch": 0.3142699724517906, "grad_norm": 8.30838680267334, "learning_rate": 7.845081389902857e-06, "loss": 0.4668, "step": 2852 }, { "epoch": 0.3143801652892562, "grad_norm": 9.995965957641602, "learning_rate": 7.84364342984409e-06, "loss": 0.4979, "step": 2853 }, { "epoch": 0.31449035812672177, "grad_norm": 5.872107982635498, "learning_rate": 7.842205122060742e-06, "loss": 0.4263, "step": 2854 }, { "epoch": 0.3146005509641873, "grad_norm": 12.73546028137207, "learning_rate": 7.84076646672869e-06, "loss": 0.412, "step": 2855 }, { "epoch": 0.31471074380165287, "grad_norm": 11.293656349182129, "learning_rate": 7.839327464023856e-06, "loss": 0.4555, "step": 2856 }, { "epoch": 0.3148209366391185, "grad_norm": 4.955324649810791, "learning_rate": 7.837888114122203e-06, "loss": 0.4784, "step": 2857 }, { "epoch": 0.314931129476584, "grad_norm": 10.688766479492188, "learning_rate": 7.836448417199735e-06, "loss": 0.5122, "step": 2858 }, { "epoch": 0.3150413223140496, "grad_norm": 4.592641830444336, "learning_rate": 7.835008373432504e-06, "loss": 0.4596, "step": 2859 }, { "epoch": 0.3151515151515151, "grad_norm": 11.006119728088379, "learning_rate": 7.833567982996598e-06, "loss": 0.445, "step": 2860 }, { "epoch": 0.31526170798898073, "grad_norm": 13.519087791442871, "learning_rate": 7.832127246068148e-06, "loss": 0.4218, "step": 2861 }, { "epoch": 0.3153719008264463, "grad_norm": 4.565635681152344, "learning_rate": 7.830686162823332e-06, "loss": 0.4107, "step": 2862 }, { "epoch": 0.31548209366391183, "grad_norm": 3.7139227390289307, "learning_rate": 7.829244733438368e-06, "loss": 0.43, "step": 2863 }, { "epoch": 0.3155922865013774, "grad_norm": 8.965399742126465, "learning_rate": 7.827802958089514e-06, "loss": 0.4718, "step": 2864 }, { "epoch": 0.315702479338843, "grad_norm": 7.846672058105469, "learning_rate": 7.826360836953073e-06, "loss": 0.43, "step": 2865 }, { "epoch": 0.31581267217630854, "grad_norm": 7.4430766105651855, "learning_rate": 7.82491837020539e-06, "loss": 0.4056, "step": 2866 }, { "epoch": 0.3159228650137741, "grad_norm": 7.6877946853637695, "learning_rate": 7.82347555802285e-06, "loss": 0.5108, "step": 2867 }, { "epoch": 0.3160330578512397, "grad_norm": 5.180397033691406, "learning_rate": 7.822032400581886e-06, "loss": 0.3812, "step": 2868 }, { "epoch": 0.31614325068870525, "grad_norm": 10.081506729125977, "learning_rate": 7.820588898058966e-06, "loss": 0.4718, "step": 2869 }, { "epoch": 0.3162534435261708, "grad_norm": 4.8489837646484375, "learning_rate": 7.819145050630602e-06, "loss": 0.4374, "step": 2870 }, { "epoch": 0.31636363636363635, "grad_norm": 6.359596252441406, "learning_rate": 7.817700858473353e-06, "loss": 0.4696, "step": 2871 }, { "epoch": 0.31647382920110195, "grad_norm": 4.26806640625, "learning_rate": 7.816256321763818e-06, "loss": 0.4418, "step": 2872 }, { "epoch": 0.3165840220385675, "grad_norm": 7.734675884246826, "learning_rate": 7.814811440678632e-06, "loss": 0.4507, "step": 2873 }, { "epoch": 0.31669421487603305, "grad_norm": 8.76938247680664, "learning_rate": 7.813366215394479e-06, "loss": 0.4429, "step": 2874 }, { "epoch": 0.3168044077134986, "grad_norm": 7.27657413482666, "learning_rate": 7.811920646088084e-06, "loss": 0.4837, "step": 2875 }, { "epoch": 0.3169146005509642, "grad_norm": 5.2841644287109375, "learning_rate": 7.810474732936213e-06, "loss": 0.3626, "step": 2876 }, { "epoch": 0.31702479338842976, "grad_norm": 6.800135135650635, "learning_rate": 7.809028476115674e-06, "loss": 0.4375, "step": 2877 }, { "epoch": 0.3171349862258953, "grad_norm": 9.077492713928223, "learning_rate": 7.807581875803318e-06, "loss": 0.4649, "step": 2878 }, { "epoch": 0.31724517906336086, "grad_norm": 10.60534954071045, "learning_rate": 7.806134932176038e-06, "loss": 0.4719, "step": 2879 }, { "epoch": 0.31735537190082647, "grad_norm": 10.929094314575195, "learning_rate": 7.804687645410764e-06, "loss": 0.446, "step": 2880 }, { "epoch": 0.317465564738292, "grad_norm": 15.249067306518555, "learning_rate": 7.803240015684475e-06, "loss": 0.3954, "step": 2881 }, { "epoch": 0.31757575757575757, "grad_norm": 10.716742515563965, "learning_rate": 7.80179204317419e-06, "loss": 0.4967, "step": 2882 }, { "epoch": 0.3176859504132231, "grad_norm": 7.304423809051514, "learning_rate": 7.800343728056968e-06, "loss": 0.4558, "step": 2883 }, { "epoch": 0.3177961432506887, "grad_norm": 8.222415924072266, "learning_rate": 7.79889507050991e-06, "loss": 0.4145, "step": 2884 }, { "epoch": 0.3179063360881543, "grad_norm": 5.5766191482543945, "learning_rate": 7.797446070710161e-06, "loss": 0.3608, "step": 2885 }, { "epoch": 0.3180165289256198, "grad_norm": 8.35886001586914, "learning_rate": 7.795996728834909e-06, "loss": 0.459, "step": 2886 }, { "epoch": 0.3181267217630854, "grad_norm": 6.088986873626709, "learning_rate": 7.794547045061375e-06, "loss": 0.464, "step": 2887 }, { "epoch": 0.318236914600551, "grad_norm": 20.60193634033203, "learning_rate": 7.793097019566836e-06, "loss": 0.5473, "step": 2888 }, { "epoch": 0.31834710743801653, "grad_norm": 9.807446479797363, "learning_rate": 7.791646652528598e-06, "loss": 0.4344, "step": 2889 }, { "epoch": 0.3184573002754821, "grad_norm": 10.306923866271973, "learning_rate": 7.790195944124014e-06, "loss": 0.581, "step": 2890 }, { "epoch": 0.3185674931129477, "grad_norm": 6.767941951751709, "learning_rate": 7.788744894530482e-06, "loss": 0.4147, "step": 2891 }, { "epoch": 0.31867768595041324, "grad_norm": 7.338228702545166, "learning_rate": 7.787293503925435e-06, "loss": 0.3658, "step": 2892 }, { "epoch": 0.3187878787878788, "grad_norm": 6.129321575164795, "learning_rate": 7.785841772486353e-06, "loss": 0.3595, "step": 2893 }, { "epoch": 0.31889807162534434, "grad_norm": 7.529963970184326, "learning_rate": 7.784389700390754e-06, "loss": 0.4386, "step": 2894 }, { "epoch": 0.31900826446280994, "grad_norm": 9.463605880737305, "learning_rate": 7.782937287816199e-06, "loss": 0.5608, "step": 2895 }, { "epoch": 0.3191184573002755, "grad_norm": 7.19361686706543, "learning_rate": 7.781484534940295e-06, "loss": 0.4318, "step": 2896 }, { "epoch": 0.31922865013774104, "grad_norm": 6.3514533042907715, "learning_rate": 7.780031441940682e-06, "loss": 0.3292, "step": 2897 }, { "epoch": 0.3193388429752066, "grad_norm": 4.113743305206299, "learning_rate": 7.778578008995048e-06, "loss": 0.4203, "step": 2898 }, { "epoch": 0.3194490358126722, "grad_norm": 6.209310054779053, "learning_rate": 7.777124236281122e-06, "loss": 0.4466, "step": 2899 }, { "epoch": 0.31955922865013775, "grad_norm": 6.49799919128418, "learning_rate": 7.77567012397667e-06, "loss": 0.4178, "step": 2900 }, { "epoch": 0.3196694214876033, "grad_norm": 4.771322250366211, "learning_rate": 7.774215672259506e-06, "loss": 0.3765, "step": 2901 }, { "epoch": 0.31977961432506885, "grad_norm": 4.73738431930542, "learning_rate": 7.772760881307482e-06, "loss": 0.367, "step": 2902 }, { "epoch": 0.31988980716253446, "grad_norm": 6.7683634757995605, "learning_rate": 7.77130575129849e-06, "loss": 0.4299, "step": 2903 }, { "epoch": 0.32, "grad_norm": 12.15935230255127, "learning_rate": 7.769850282410466e-06, "loss": 0.5786, "step": 2904 }, { "epoch": 0.32011019283746556, "grad_norm": 11.941153526306152, "learning_rate": 7.768394474821388e-06, "loss": 0.5455, "step": 2905 }, { "epoch": 0.3202203856749311, "grad_norm": 9.834383010864258, "learning_rate": 7.766938328709273e-06, "loss": 0.5029, "step": 2906 }, { "epoch": 0.3203305785123967, "grad_norm": 15.463730812072754, "learning_rate": 7.76548184425218e-06, "loss": 0.5836, "step": 2907 }, { "epoch": 0.32044077134986226, "grad_norm": 10.36178970336914, "learning_rate": 7.764025021628211e-06, "loss": 0.4246, "step": 2908 }, { "epoch": 0.3205509641873278, "grad_norm": 9.731084823608398, "learning_rate": 7.762567861015507e-06, "loss": 0.3565, "step": 2909 }, { "epoch": 0.32066115702479336, "grad_norm": 8.840417861938477, "learning_rate": 7.761110362592253e-06, "loss": 0.3893, "step": 2910 }, { "epoch": 0.32077134986225897, "grad_norm": 6.205860137939453, "learning_rate": 7.759652526536675e-06, "loss": 0.4306, "step": 2911 }, { "epoch": 0.3208815426997245, "grad_norm": 6.984999656677246, "learning_rate": 7.758194353027034e-06, "loss": 0.4017, "step": 2912 }, { "epoch": 0.32099173553719007, "grad_norm": 4.921543598175049, "learning_rate": 7.756735842241643e-06, "loss": 0.3812, "step": 2913 }, { "epoch": 0.3211019283746556, "grad_norm": 5.417006015777588, "learning_rate": 7.755276994358847e-06, "loss": 0.3356, "step": 2914 }, { "epoch": 0.3212121212121212, "grad_norm": 7.116862773895264, "learning_rate": 7.75381780955704e-06, "loss": 0.402, "step": 2915 }, { "epoch": 0.3213223140495868, "grad_norm": 6.648025989532471, "learning_rate": 7.752358288014649e-06, "loss": 0.4288, "step": 2916 }, { "epoch": 0.3214325068870523, "grad_norm": 7.639983177185059, "learning_rate": 7.750898429910148e-06, "loss": 0.4618, "step": 2917 }, { "epoch": 0.32154269972451793, "grad_norm": 4.660123348236084, "learning_rate": 7.749438235422048e-06, "loss": 0.4205, "step": 2918 }, { "epoch": 0.3216528925619835, "grad_norm": 7.626750469207764, "learning_rate": 7.747977704728908e-06, "loss": 0.4429, "step": 2919 }, { "epoch": 0.32176308539944903, "grad_norm": 10.333895683288574, "learning_rate": 7.746516838009324e-06, "loss": 0.5305, "step": 2920 }, { "epoch": 0.3218732782369146, "grad_norm": 7.846880912780762, "learning_rate": 7.745055635441927e-06, "loss": 0.508, "step": 2921 }, { "epoch": 0.3219834710743802, "grad_norm": 4.909427642822266, "learning_rate": 7.743594097205398e-06, "loss": 0.3761, "step": 2922 }, { "epoch": 0.32209366391184574, "grad_norm": 6.739473342895508, "learning_rate": 7.742132223478458e-06, "loss": 0.4365, "step": 2923 }, { "epoch": 0.3222038567493113, "grad_norm": 6.599324703216553, "learning_rate": 7.740670014439863e-06, "loss": 0.3779, "step": 2924 }, { "epoch": 0.32231404958677684, "grad_norm": 9.731310844421387, "learning_rate": 7.739207470268418e-06, "loss": 0.4568, "step": 2925 }, { "epoch": 0.32242424242424245, "grad_norm": 31.03965950012207, "learning_rate": 7.73774459114296e-06, "loss": 0.5713, "step": 2926 }, { "epoch": 0.322534435261708, "grad_norm": 5.701545715332031, "learning_rate": 7.736281377242376e-06, "loss": 0.4541, "step": 2927 }, { "epoch": 0.32264462809917355, "grad_norm": 8.6945219039917, "learning_rate": 7.73481782874559e-06, "loss": 0.5208, "step": 2928 }, { "epoch": 0.3227548209366391, "grad_norm": 6.546955585479736, "learning_rate": 7.73335394583156e-06, "loss": 0.3758, "step": 2929 }, { "epoch": 0.3228650137741047, "grad_norm": 6.786363124847412, "learning_rate": 7.731889728679301e-06, "loss": 0.3913, "step": 2930 }, { "epoch": 0.32297520661157025, "grad_norm": 5.942480564117432, "learning_rate": 7.730425177467854e-06, "loss": 0.4056, "step": 2931 }, { "epoch": 0.3230853994490358, "grad_norm": 7.159365177154541, "learning_rate": 7.728960292376306e-06, "loss": 0.5533, "step": 2932 }, { "epoch": 0.32319559228650135, "grad_norm": 4.457771301269531, "learning_rate": 7.727495073583788e-06, "loss": 0.3981, "step": 2933 }, { "epoch": 0.32330578512396696, "grad_norm": 5.632359504699707, "learning_rate": 7.72602952126947e-06, "loss": 0.3993, "step": 2934 }, { "epoch": 0.3234159779614325, "grad_norm": 5.7494001388549805, "learning_rate": 7.724563635612554e-06, "loss": 0.3922, "step": 2935 }, { "epoch": 0.32352617079889806, "grad_norm": 9.414168357849121, "learning_rate": 7.723097416792298e-06, "loss": 0.4567, "step": 2936 }, { "epoch": 0.3236363636363636, "grad_norm": 6.057540416717529, "learning_rate": 7.721630864987992e-06, "loss": 0.4211, "step": 2937 }, { "epoch": 0.3237465564738292, "grad_norm": 8.234047889709473, "learning_rate": 7.720163980378966e-06, "loss": 0.4801, "step": 2938 }, { "epoch": 0.32385674931129477, "grad_norm": 5.774929046630859, "learning_rate": 7.718696763144593e-06, "loss": 0.4454, "step": 2939 }, { "epoch": 0.3239669421487603, "grad_norm": 5.500660419464111, "learning_rate": 7.717229213464287e-06, "loss": 0.4491, "step": 2940 }, { "epoch": 0.32407713498622587, "grad_norm": 7.193687438964844, "learning_rate": 7.715761331517501e-06, "loss": 0.4146, "step": 2941 }, { "epoch": 0.3241873278236915, "grad_norm": 7.698046684265137, "learning_rate": 7.714293117483732e-06, "loss": 0.4115, "step": 2942 }, { "epoch": 0.324297520661157, "grad_norm": 5.544668197631836, "learning_rate": 7.712824571542512e-06, "loss": 0.4522, "step": 2943 }, { "epoch": 0.3244077134986226, "grad_norm": 5.389462471008301, "learning_rate": 7.71135569387342e-06, "loss": 0.5141, "step": 2944 }, { "epoch": 0.3245179063360882, "grad_norm": 7.950774669647217, "learning_rate": 7.709886484656071e-06, "loss": 0.4294, "step": 2945 }, { "epoch": 0.32462809917355373, "grad_norm": 9.045058250427246, "learning_rate": 7.70841694407012e-06, "loss": 0.4393, "step": 2946 }, { "epoch": 0.3247382920110193, "grad_norm": 8.255905151367188, "learning_rate": 7.706947072295266e-06, "loss": 0.4806, "step": 2947 }, { "epoch": 0.32484848484848483, "grad_norm": 5.852301597595215, "learning_rate": 7.705476869511249e-06, "loss": 0.4002, "step": 2948 }, { "epoch": 0.32495867768595044, "grad_norm": 10.095605850219727, "learning_rate": 7.704006335897843e-06, "loss": 0.4809, "step": 2949 }, { "epoch": 0.325068870523416, "grad_norm": 8.605550765991211, "learning_rate": 7.70253547163487e-06, "loss": 0.3596, "step": 2950 }, { "epoch": 0.32517906336088154, "grad_norm": 5.249491214752197, "learning_rate": 7.70106427690219e-06, "loss": 0.4228, "step": 2951 }, { "epoch": 0.3252892561983471, "grad_norm": 4.563051700592041, "learning_rate": 7.699592751879698e-06, "loss": 0.4318, "step": 2952 }, { "epoch": 0.3253994490358127, "grad_norm": 6.423548698425293, "learning_rate": 7.69812089674734e-06, "loss": 0.4508, "step": 2953 }, { "epoch": 0.32550964187327824, "grad_norm": 6.7021260261535645, "learning_rate": 7.696648711685093e-06, "loss": 0.4523, "step": 2954 }, { "epoch": 0.3256198347107438, "grad_norm": 7.479311466217041, "learning_rate": 7.69517619687298e-06, "loss": 0.4932, "step": 2955 }, { "epoch": 0.32573002754820934, "grad_norm": 5.966842174530029, "learning_rate": 7.693703352491057e-06, "loss": 0.3899, "step": 2956 }, { "epoch": 0.32584022038567495, "grad_norm": 4.4799346923828125, "learning_rate": 7.692230178719431e-06, "loss": 0.3957, "step": 2957 }, { "epoch": 0.3259504132231405, "grad_norm": 10.163111686706543, "learning_rate": 7.690756675738242e-06, "loss": 0.4173, "step": 2958 }, { "epoch": 0.32606060606060605, "grad_norm": 8.435184478759766, "learning_rate": 7.689282843727672e-06, "loss": 0.4158, "step": 2959 }, { "epoch": 0.3261707988980716, "grad_norm": 5.7857160568237305, "learning_rate": 7.687808682867943e-06, "loss": 0.4333, "step": 2960 }, { "epoch": 0.3262809917355372, "grad_norm": 7.384538173675537, "learning_rate": 7.686334193339315e-06, "loss": 0.3412, "step": 2961 }, { "epoch": 0.32639118457300276, "grad_norm": 5.766459941864014, "learning_rate": 7.684859375322094e-06, "loss": 0.4108, "step": 2962 }, { "epoch": 0.3265013774104683, "grad_norm": 4.282750129699707, "learning_rate": 7.683384228996624e-06, "loss": 0.4139, "step": 2963 }, { "epoch": 0.32661157024793386, "grad_norm": 6.017995357513428, "learning_rate": 7.681908754543282e-06, "loss": 0.3988, "step": 2964 }, { "epoch": 0.32672176308539946, "grad_norm": 5.02705192565918, "learning_rate": 7.680432952142497e-06, "loss": 0.4682, "step": 2965 }, { "epoch": 0.326831955922865, "grad_norm": 8.65324592590332, "learning_rate": 7.678956821974728e-06, "loss": 0.5517, "step": 2966 }, { "epoch": 0.32694214876033056, "grad_norm": 5.790050029754639, "learning_rate": 7.677480364220479e-06, "loss": 0.4394, "step": 2967 }, { "epoch": 0.32705234159779617, "grad_norm": 12.062713623046875, "learning_rate": 7.676003579060295e-06, "loss": 0.4488, "step": 2968 }, { "epoch": 0.3271625344352617, "grad_norm": 4.4486236572265625, "learning_rate": 7.67452646667476e-06, "loss": 0.4409, "step": 2969 }, { "epoch": 0.32727272727272727, "grad_norm": 6.7604241371154785, "learning_rate": 7.673049027244493e-06, "loss": 0.4054, "step": 2970 }, { "epoch": 0.3273829201101928, "grad_norm": 5.401520252227783, "learning_rate": 7.671571260950162e-06, "loss": 0.3751, "step": 2971 }, { "epoch": 0.3274931129476584, "grad_norm": 4.630212306976318, "learning_rate": 7.670093167972468e-06, "loss": 0.3902, "step": 2972 }, { "epoch": 0.327603305785124, "grad_norm": 12.318961143493652, "learning_rate": 7.668614748492154e-06, "loss": 0.4432, "step": 2973 }, { "epoch": 0.3277134986225895, "grad_norm": 6.725958347320557, "learning_rate": 7.667136002690004e-06, "loss": 0.3127, "step": 2974 }, { "epoch": 0.3278236914600551, "grad_norm": 5.85172176361084, "learning_rate": 7.66565693074684e-06, "loss": 0.4157, "step": 2975 }, { "epoch": 0.3279338842975207, "grad_norm": 12.489062309265137, "learning_rate": 7.664177532843525e-06, "loss": 0.4157, "step": 2976 }, { "epoch": 0.32804407713498623, "grad_norm": 6.006800174713135, "learning_rate": 7.662697809160963e-06, "loss": 0.4164, "step": 2977 }, { "epoch": 0.3281542699724518, "grad_norm": 21.75774574279785, "learning_rate": 7.661217759880095e-06, "loss": 0.3391, "step": 2978 }, { "epoch": 0.32826446280991733, "grad_norm": 7.501448631286621, "learning_rate": 7.659737385181907e-06, "loss": 0.4868, "step": 2979 }, { "epoch": 0.32837465564738294, "grad_norm": 7.795146465301514, "learning_rate": 7.658256685247415e-06, "loss": 0.4861, "step": 2980 }, { "epoch": 0.3284848484848485, "grad_norm": 9.895896911621094, "learning_rate": 7.656775660257689e-06, "loss": 0.4656, "step": 2981 }, { "epoch": 0.32859504132231404, "grad_norm": 7.72867488861084, "learning_rate": 7.655294310393822e-06, "loss": 0.3939, "step": 2982 }, { "epoch": 0.3287052341597796, "grad_norm": 6.340445041656494, "learning_rate": 7.653812635836963e-06, "loss": 0.4287, "step": 2983 }, { "epoch": 0.3288154269972452, "grad_norm": 11.97193431854248, "learning_rate": 7.652330636768289e-06, "loss": 0.417, "step": 2984 }, { "epoch": 0.32892561983471075, "grad_norm": 11.652042388916016, "learning_rate": 7.650848313369022e-06, "loss": 0.4848, "step": 2985 }, { "epoch": 0.3290358126721763, "grad_norm": 5.396225452423096, "learning_rate": 7.649365665820424e-06, "loss": 0.4078, "step": 2986 }, { "epoch": 0.32914600550964185, "grad_norm": 8.694803237915039, "learning_rate": 7.64788269430379e-06, "loss": 0.4594, "step": 2987 }, { "epoch": 0.32925619834710745, "grad_norm": 7.699805736541748, "learning_rate": 7.646399399000466e-06, "loss": 0.4293, "step": 2988 }, { "epoch": 0.329366391184573, "grad_norm": 5.9591755867004395, "learning_rate": 7.644915780091828e-06, "loss": 0.4216, "step": 2989 }, { "epoch": 0.32947658402203855, "grad_norm": 6.445276260375977, "learning_rate": 7.643431837759295e-06, "loss": 0.3882, "step": 2990 }, { "epoch": 0.3295867768595041, "grad_norm": 5.513563632965088, "learning_rate": 7.641947572184328e-06, "loss": 0.417, "step": 2991 }, { "epoch": 0.3296969696969697, "grad_norm": 5.847497463226318, "learning_rate": 7.64046298354842e-06, "loss": 0.4099, "step": 2992 }, { "epoch": 0.32980716253443526, "grad_norm": 7.973586082458496, "learning_rate": 7.638978072033114e-06, "loss": 0.398, "step": 2993 }, { "epoch": 0.3299173553719008, "grad_norm": 6.221728324890137, "learning_rate": 7.637492837819986e-06, "loss": 0.3629, "step": 2994 }, { "epoch": 0.3300275482093664, "grad_norm": 9.28332233428955, "learning_rate": 7.636007281090647e-06, "loss": 0.4382, "step": 2995 }, { "epoch": 0.33013774104683197, "grad_norm": 19.20231819152832, "learning_rate": 7.63452140202676e-06, "loss": 0.4491, "step": 2996 }, { "epoch": 0.3302479338842975, "grad_norm": 5.971023082733154, "learning_rate": 7.633035200810018e-06, "loss": 0.4333, "step": 2997 }, { "epoch": 0.33035812672176307, "grad_norm": 5.851497173309326, "learning_rate": 7.631548677622152e-06, "loss": 0.4213, "step": 2998 }, { "epoch": 0.3304683195592287, "grad_norm": 7.684570789337158, "learning_rate": 7.630061832644942e-06, "loss": 0.3773, "step": 2999 }, { "epoch": 0.3305785123966942, "grad_norm": 8.930782318115234, "learning_rate": 7.628574666060198e-06, "loss": 0.4315, "step": 3000 }, { "epoch": 0.3306887052341598, "grad_norm": 9.754863739013672, "learning_rate": 7.6270871780497726e-06, "loss": 0.4276, "step": 3001 }, { "epoch": 0.3307988980716253, "grad_norm": 13.1202392578125, "learning_rate": 7.625599368795558e-06, "loss": 0.4971, "step": 3002 }, { "epoch": 0.33090909090909093, "grad_norm": 7.650578498840332, "learning_rate": 7.624111238479486e-06, "loss": 0.3925, "step": 3003 }, { "epoch": 0.3310192837465565, "grad_norm": 5.505433082580566, "learning_rate": 7.622622787283528e-06, "loss": 0.3684, "step": 3004 }, { "epoch": 0.33112947658402203, "grad_norm": 9.210761070251465, "learning_rate": 7.621134015389693e-06, "loss": 0.5293, "step": 3005 }, { "epoch": 0.3312396694214876, "grad_norm": 9.177719116210938, "learning_rate": 7.61964492298003e-06, "loss": 0.4088, "step": 3006 }, { "epoch": 0.3313498622589532, "grad_norm": 8.678670883178711, "learning_rate": 7.618155510236627e-06, "loss": 0.4763, "step": 3007 }, { "epoch": 0.33146005509641874, "grad_norm": 4.918095588684082, "learning_rate": 7.616665777341612e-06, "loss": 0.4408, "step": 3008 }, { "epoch": 0.3315702479338843, "grad_norm": 18.14687156677246, "learning_rate": 7.6151757244771514e-06, "loss": 0.5224, "step": 3009 }, { "epoch": 0.33168044077134984, "grad_norm": 7.2442498207092285, "learning_rate": 7.613685351825451e-06, "loss": 0.3909, "step": 3010 }, { "epoch": 0.33179063360881544, "grad_norm": 6.038534164428711, "learning_rate": 7.612194659568755e-06, "loss": 0.3934, "step": 3011 }, { "epoch": 0.331900826446281, "grad_norm": 8.074752807617188, "learning_rate": 7.610703647889348e-06, "loss": 0.3914, "step": 3012 }, { "epoch": 0.33201101928374654, "grad_norm": 8.205302238464355, "learning_rate": 7.609212316969553e-06, "loss": 0.4561, "step": 3013 }, { "epoch": 0.3321212121212121, "grad_norm": 10.075181007385254, "learning_rate": 7.607720666991733e-06, "loss": 0.4963, "step": 3014 }, { "epoch": 0.3322314049586777, "grad_norm": 11.816518783569336, "learning_rate": 7.606228698138285e-06, "loss": 0.4361, "step": 3015 }, { "epoch": 0.33234159779614325, "grad_norm": 7.756158351898193, "learning_rate": 7.604736410591651e-06, "loss": 0.371, "step": 3016 }, { "epoch": 0.3324517906336088, "grad_norm": 7.583977699279785, "learning_rate": 7.603243804534313e-06, "loss": 0.3731, "step": 3017 }, { "epoch": 0.3325619834710744, "grad_norm": 4.773258686065674, "learning_rate": 7.601750880148786e-06, "loss": 0.4677, "step": 3018 }, { "epoch": 0.33267217630853996, "grad_norm": 13.50361442565918, "learning_rate": 7.600257637617627e-06, "loss": 0.3665, "step": 3019 }, { "epoch": 0.3327823691460055, "grad_norm": 11.342306137084961, "learning_rate": 7.5987640771234305e-06, "loss": 0.4426, "step": 3020 }, { "epoch": 0.33289256198347106, "grad_norm": 12.209716796875, "learning_rate": 7.597270198848834e-06, "loss": 0.492, "step": 3021 }, { "epoch": 0.33300275482093666, "grad_norm": 3.5088915824890137, "learning_rate": 7.5957760029765106e-06, "loss": 0.3931, "step": 3022 }, { "epoch": 0.3331129476584022, "grad_norm": 4.468617916107178, "learning_rate": 7.594281489689169e-06, "loss": 0.3579, "step": 3023 }, { "epoch": 0.33322314049586776, "grad_norm": 6.926708698272705, "learning_rate": 7.5927866591695645e-06, "loss": 0.4393, "step": 3024 }, { "epoch": 0.3333333333333333, "grad_norm": 6.028176784515381, "learning_rate": 7.591291511600485e-06, "loss": 0.4507, "step": 3025 }, { "epoch": 0.3334435261707989, "grad_norm": 6.006053447723389, "learning_rate": 7.589796047164759e-06, "loss": 0.4414, "step": 3026 }, { "epoch": 0.33355371900826447, "grad_norm": 6.254835605621338, "learning_rate": 7.588300266045255e-06, "loss": 0.4751, "step": 3027 }, { "epoch": 0.33366391184573, "grad_norm": 6.107983112335205, "learning_rate": 7.586804168424879e-06, "loss": 0.5331, "step": 3028 }, { "epoch": 0.33377410468319557, "grad_norm": 7.126374244689941, "learning_rate": 7.585307754486575e-06, "loss": 0.3972, "step": 3029 }, { "epoch": 0.3338842975206612, "grad_norm": 5.774569988250732, "learning_rate": 7.583811024413328e-06, "loss": 0.4707, "step": 3030 }, { "epoch": 0.3339944903581267, "grad_norm": 9.53724193572998, "learning_rate": 7.582313978388156e-06, "loss": 0.4184, "step": 3031 }, { "epoch": 0.3341046831955923, "grad_norm": 14.912829399108887, "learning_rate": 7.580816616594126e-06, "loss": 0.5473, "step": 3032 }, { "epoch": 0.33421487603305783, "grad_norm": 5.911381721496582, "learning_rate": 7.579318939214334e-06, "loss": 0.3691, "step": 3033 }, { "epoch": 0.33432506887052343, "grad_norm": 8.710693359375, "learning_rate": 7.577820946431918e-06, "loss": 0.3979, "step": 3034 }, { "epoch": 0.334435261707989, "grad_norm": 11.323448181152344, "learning_rate": 7.5763226384300555e-06, "loss": 0.447, "step": 3035 }, { "epoch": 0.33454545454545453, "grad_norm": 5.685377597808838, "learning_rate": 7.5748240153919605e-06, "loss": 0.3796, "step": 3036 }, { "epoch": 0.3346556473829201, "grad_norm": 8.896135330200195, "learning_rate": 7.573325077500888e-06, "loss": 0.4009, "step": 3037 }, { "epoch": 0.3347658402203857, "grad_norm": 7.273399829864502, "learning_rate": 7.571825824940129e-06, "loss": 0.3532, "step": 3038 }, { "epoch": 0.33487603305785124, "grad_norm": 7.050100326538086, "learning_rate": 7.570326257893015e-06, "loss": 0.4094, "step": 3039 }, { "epoch": 0.3349862258953168, "grad_norm": 5.257210731506348, "learning_rate": 7.5688263765429145e-06, "loss": 0.5058, "step": 3040 }, { "epoch": 0.33509641873278234, "grad_norm": 4.360664367675781, "learning_rate": 7.567326181073235e-06, "loss": 0.4412, "step": 3041 }, { "epoch": 0.33520661157024795, "grad_norm": 5.903068542480469, "learning_rate": 7.565825671667423e-06, "loss": 0.4536, "step": 3042 }, { "epoch": 0.3353168044077135, "grad_norm": 6.916237831115723, "learning_rate": 7.564324848508963e-06, "loss": 0.4919, "step": 3043 }, { "epoch": 0.33542699724517905, "grad_norm": 9.67279052734375, "learning_rate": 7.562823711781375e-06, "loss": 0.3965, "step": 3044 }, { "epoch": 0.33553719008264465, "grad_norm": 7.412188529968262, "learning_rate": 7.561322261668224e-06, "loss": 0.392, "step": 3045 }, { "epoch": 0.3356473829201102, "grad_norm": 11.482963562011719, "learning_rate": 7.55982049835311e-06, "loss": 0.4967, "step": 3046 }, { "epoch": 0.33575757575757575, "grad_norm": 7.3784708976745605, "learning_rate": 7.558318422019666e-06, "loss": 0.4075, "step": 3047 }, { "epoch": 0.3358677685950413, "grad_norm": 6.242224216461182, "learning_rate": 7.556816032851568e-06, "loss": 0.4608, "step": 3048 }, { "epoch": 0.3359779614325069, "grad_norm": 4.955140590667725, "learning_rate": 7.555313331032537e-06, "loss": 0.3645, "step": 3049 }, { "epoch": 0.33608815426997246, "grad_norm": 9.901180267333984, "learning_rate": 7.5538103167463175e-06, "loss": 0.4818, "step": 3050 }, { "epoch": 0.336198347107438, "grad_norm": 5.590670108795166, "learning_rate": 7.552306990176704e-06, "loss": 0.4435, "step": 3051 }, { "epoch": 0.33630853994490356, "grad_norm": 4.528062343597412, "learning_rate": 7.550803351507525e-06, "loss": 0.4076, "step": 3052 }, { "epoch": 0.33641873278236917, "grad_norm": 4.979361057281494, "learning_rate": 7.549299400922647e-06, "loss": 0.407, "step": 3053 }, { "epoch": 0.3365289256198347, "grad_norm": 6.397597789764404, "learning_rate": 7.547795138605976e-06, "loss": 0.4511, "step": 3054 }, { "epoch": 0.33663911845730027, "grad_norm": 5.850242614746094, "learning_rate": 7.546290564741454e-06, "loss": 0.4237, "step": 3055 }, { "epoch": 0.3367493112947658, "grad_norm": 4.40457010269165, "learning_rate": 7.544785679513064e-06, "loss": 0.4449, "step": 3056 }, { "epoch": 0.3368595041322314, "grad_norm": 14.284875869750977, "learning_rate": 7.543280483104824e-06, "loss": 0.5181, "step": 3057 }, { "epoch": 0.336969696969697, "grad_norm": 6.017383098602295, "learning_rate": 7.541774975700791e-06, "loss": 0.4222, "step": 3058 }, { "epoch": 0.3370798898071625, "grad_norm": 9.808992385864258, "learning_rate": 7.540269157485061e-06, "loss": 0.3855, "step": 3059 }, { "epoch": 0.3371900826446281, "grad_norm": 5.58088493347168, "learning_rate": 7.5387630286417705e-06, "loss": 0.3747, "step": 3060 }, { "epoch": 0.3373002754820937, "grad_norm": 9.518733024597168, "learning_rate": 7.537256589355085e-06, "loss": 0.5106, "step": 3061 }, { "epoch": 0.33741046831955923, "grad_norm": 8.29925537109375, "learning_rate": 7.535749839809219e-06, "loss": 0.4678, "step": 3062 }, { "epoch": 0.3375206611570248, "grad_norm": 5.418189525604248, "learning_rate": 7.534242780188419e-06, "loss": 0.4085, "step": 3063 }, { "epoch": 0.33763085399449033, "grad_norm": 14.174312591552734, "learning_rate": 7.532735410676968e-06, "loss": 0.5511, "step": 3064 }, { "epoch": 0.33774104683195594, "grad_norm": 4.7229695320129395, "learning_rate": 7.53122773145919e-06, "loss": 0.4061, "step": 3065 }, { "epoch": 0.3378512396694215, "grad_norm": 7.452611446380615, "learning_rate": 7.529719742719447e-06, "loss": 0.442, "step": 3066 }, { "epoch": 0.33796143250688704, "grad_norm": 4.89311408996582, "learning_rate": 7.528211444642138e-06, "loss": 0.3748, "step": 3067 }, { "epoch": 0.33807162534435264, "grad_norm": 7.314619541168213, "learning_rate": 7.526702837411699e-06, "loss": 0.3605, "step": 3068 }, { "epoch": 0.3381818181818182, "grad_norm": 8.761941909790039, "learning_rate": 7.525193921212606e-06, "loss": 0.3487, "step": 3069 }, { "epoch": 0.33829201101928374, "grad_norm": 8.771902084350586, "learning_rate": 7.52368469622937e-06, "loss": 0.3674, "step": 3070 }, { "epoch": 0.3384022038567493, "grad_norm": 9.572006225585938, "learning_rate": 7.52217516264654e-06, "loss": 0.4249, "step": 3071 }, { "epoch": 0.3385123966942149, "grad_norm": 9.799403190612793, "learning_rate": 7.520665320648705e-06, "loss": 0.4328, "step": 3072 }, { "epoch": 0.33862258953168045, "grad_norm": 4.222426414489746, "learning_rate": 7.5191551704204915e-06, "loss": 0.4301, "step": 3073 }, { "epoch": 0.338732782369146, "grad_norm": 6.847049236297607, "learning_rate": 7.5176447121465615e-06, "loss": 0.3948, "step": 3074 }, { "epoch": 0.33884297520661155, "grad_norm": 4.570529460906982, "learning_rate": 7.516133946011617e-06, "loss": 0.3965, "step": 3075 }, { "epoch": 0.33895316804407716, "grad_norm": 8.942601203918457, "learning_rate": 7.514622872200394e-06, "loss": 0.4789, "step": 3076 }, { "epoch": 0.3390633608815427, "grad_norm": 9.412450790405273, "learning_rate": 7.513111490897672e-06, "loss": 0.4786, "step": 3077 }, { "epoch": 0.33917355371900826, "grad_norm": 5.571948528289795, "learning_rate": 7.511599802288263e-06, "loss": 0.4435, "step": 3078 }, { "epoch": 0.3392837465564738, "grad_norm": 10.787137031555176, "learning_rate": 7.5100878065570185e-06, "loss": 0.4989, "step": 3079 }, { "epoch": 0.3393939393939394, "grad_norm": 6.516151428222656, "learning_rate": 7.5085755038888266e-06, "loss": 0.3717, "step": 3080 }, { "epoch": 0.33950413223140496, "grad_norm": 4.707610607147217, "learning_rate": 7.507062894468615e-06, "loss": 0.4428, "step": 3081 }, { "epoch": 0.3396143250688705, "grad_norm": 8.729148864746094, "learning_rate": 7.505549978481345e-06, "loss": 0.4132, "step": 3082 }, { "epoch": 0.33972451790633607, "grad_norm": 7.110918998718262, "learning_rate": 7.504036756112023e-06, "loss": 0.4446, "step": 3083 }, { "epoch": 0.33983471074380167, "grad_norm": 4.316981315612793, "learning_rate": 7.502523227545686e-06, "loss": 0.4446, "step": 3084 }, { "epoch": 0.3399449035812672, "grad_norm": 7.524450778961182, "learning_rate": 7.5010093929674065e-06, "loss": 0.4093, "step": 3085 }, { "epoch": 0.34005509641873277, "grad_norm": 6.450577259063721, "learning_rate": 7.499495252562303e-06, "loss": 0.4341, "step": 3086 }, { "epoch": 0.3401652892561983, "grad_norm": 5.516423225402832, "learning_rate": 7.497980806515524e-06, "loss": 0.3102, "step": 3087 }, { "epoch": 0.34027548209366393, "grad_norm": 4.545234203338623, "learning_rate": 7.4964660550122595e-06, "loss": 0.4379, "step": 3088 }, { "epoch": 0.3403856749311295, "grad_norm": 9.267948150634766, "learning_rate": 7.494950998237733e-06, "loss": 0.5489, "step": 3089 }, { "epoch": 0.34049586776859503, "grad_norm": 7.114536762237549, "learning_rate": 7.493435636377211e-06, "loss": 0.4453, "step": 3090 }, { "epoch": 0.3406060606060606, "grad_norm": 5.326347351074219, "learning_rate": 7.491919969615993e-06, "loss": 0.369, "step": 3091 }, { "epoch": 0.3407162534435262, "grad_norm": 6.879369735717773, "learning_rate": 7.490403998139414e-06, "loss": 0.3687, "step": 3092 }, { "epoch": 0.34082644628099174, "grad_norm": 8.012075424194336, "learning_rate": 7.488887722132853e-06, "loss": 0.3982, "step": 3093 }, { "epoch": 0.3409366391184573, "grad_norm": 4.556394100189209, "learning_rate": 7.487371141781718e-06, "loss": 0.444, "step": 3094 }, { "epoch": 0.3410468319559229, "grad_norm": 5.1188459396362305, "learning_rate": 7.485854257271463e-06, "loss": 0.3585, "step": 3095 }, { "epoch": 0.34115702479338844, "grad_norm": 5.984025955200195, "learning_rate": 7.484337068787574e-06, "loss": 0.4125, "step": 3096 }, { "epoch": 0.341267217630854, "grad_norm": 23.55314826965332, "learning_rate": 7.482819576515571e-06, "loss": 0.4087, "step": 3097 }, { "epoch": 0.34137741046831954, "grad_norm": 7.297911643981934, "learning_rate": 7.481301780641019e-06, "loss": 0.3901, "step": 3098 }, { "epoch": 0.34148760330578515, "grad_norm": 6.8249921798706055, "learning_rate": 7.479783681349515e-06, "loss": 0.4114, "step": 3099 }, { "epoch": 0.3415977961432507, "grad_norm": 12.885574340820312, "learning_rate": 7.478265278826693e-06, "loss": 0.5381, "step": 3100 }, { "epoch": 0.34170798898071625, "grad_norm": 9.255385398864746, "learning_rate": 7.476746573258227e-06, "loss": 0.5265, "step": 3101 }, { "epoch": 0.3418181818181818, "grad_norm": 9.08926773071289, "learning_rate": 7.475227564829826e-06, "loss": 0.437, "step": 3102 }, { "epoch": 0.3419283746556474, "grad_norm": 10.180606842041016, "learning_rate": 7.473708253727234e-06, "loss": 0.5285, "step": 3103 }, { "epoch": 0.34203856749311295, "grad_norm": 7.8228840827941895, "learning_rate": 7.472188640136239e-06, "loss": 0.454, "step": 3104 }, { "epoch": 0.3421487603305785, "grad_norm": 6.170624256134033, "learning_rate": 7.470668724242658e-06, "loss": 0.4702, "step": 3105 }, { "epoch": 0.34225895316804406, "grad_norm": 7.054677963256836, "learning_rate": 7.46914850623235e-06, "loss": 0.5059, "step": 3106 }, { "epoch": 0.34236914600550966, "grad_norm": 8.298323631286621, "learning_rate": 7.467627986291207e-06, "loss": 0.4698, "step": 3107 }, { "epoch": 0.3424793388429752, "grad_norm": 7.792571544647217, "learning_rate": 7.466107164605163e-06, "loss": 0.4285, "step": 3108 }, { "epoch": 0.34258953168044076, "grad_norm": 7.682173728942871, "learning_rate": 7.464586041360186e-06, "loss": 0.3875, "step": 3109 }, { "epoch": 0.3426997245179063, "grad_norm": 9.861799240112305, "learning_rate": 7.463064616742278e-06, "loss": 0.4637, "step": 3110 }, { "epoch": 0.3428099173553719, "grad_norm": 5.392634868621826, "learning_rate": 7.461542890937484e-06, "loss": 0.4372, "step": 3111 }, { "epoch": 0.34292011019283747, "grad_norm": 8.271360397338867, "learning_rate": 7.460020864131883e-06, "loss": 0.4836, "step": 3112 }, { "epoch": 0.343030303030303, "grad_norm": 7.254205703735352, "learning_rate": 7.458498536511587e-06, "loss": 0.4199, "step": 3113 }, { "epoch": 0.34314049586776857, "grad_norm": 9.592790603637695, "learning_rate": 7.4569759082627515e-06, "loss": 0.464, "step": 3114 }, { "epoch": 0.3432506887052342, "grad_norm": 9.89777946472168, "learning_rate": 7.455452979571562e-06, "loss": 0.4837, "step": 3115 }, { "epoch": 0.3433608815426997, "grad_norm": 7.666777610778809, "learning_rate": 7.453929750624249e-06, "loss": 0.3973, "step": 3116 }, { "epoch": 0.3434710743801653, "grad_norm": 7.6917405128479, "learning_rate": 7.452406221607073e-06, "loss": 0.4681, "step": 3117 }, { "epoch": 0.3435812672176309, "grad_norm": 6.429231643676758, "learning_rate": 7.450882392706332e-06, "loss": 0.4451, "step": 3118 }, { "epoch": 0.34369146005509643, "grad_norm": 6.396045207977295, "learning_rate": 7.449358264108365e-06, "loss": 0.4587, "step": 3119 }, { "epoch": 0.343801652892562, "grad_norm": 8.26191234588623, "learning_rate": 7.4478338359995405e-06, "loss": 0.3897, "step": 3120 }, { "epoch": 0.34391184573002753, "grad_norm": 4.853174686431885, "learning_rate": 7.44630910856627e-06, "loss": 0.4505, "step": 3121 }, { "epoch": 0.34402203856749314, "grad_norm": 5.741095066070557, "learning_rate": 7.444784081994998e-06, "loss": 0.3397, "step": 3122 }, { "epoch": 0.3441322314049587, "grad_norm": 8.979782104492188, "learning_rate": 7.443258756472207e-06, "loss": 0.4242, "step": 3123 }, { "epoch": 0.34424242424242424, "grad_norm": 7.540767192840576, "learning_rate": 7.4417331321844174e-06, "loss": 0.4519, "step": 3124 }, { "epoch": 0.3443526170798898, "grad_norm": 5.847742557525635, "learning_rate": 7.440207209318183e-06, "loss": 0.3724, "step": 3125 }, { "epoch": 0.3444628099173554, "grad_norm": 8.169764518737793, "learning_rate": 7.4386809880600975e-06, "loss": 0.4189, "step": 3126 }, { "epoch": 0.34457300275482095, "grad_norm": 9.309894561767578, "learning_rate": 7.437154468596788e-06, "loss": 0.4038, "step": 3127 }, { "epoch": 0.3446831955922865, "grad_norm": 7.024919033050537, "learning_rate": 7.435627651114919e-06, "loss": 0.3992, "step": 3128 }, { "epoch": 0.34479338842975205, "grad_norm": 13.71273422241211, "learning_rate": 7.434100535801192e-06, "loss": 0.4638, "step": 3129 }, { "epoch": 0.34490358126721765, "grad_norm": 6.8807830810546875, "learning_rate": 7.432573122842346e-06, "loss": 0.4508, "step": 3130 }, { "epoch": 0.3450137741046832, "grad_norm": 10.221870422363281, "learning_rate": 7.431045412425153e-06, "loss": 0.4322, "step": 3131 }, { "epoch": 0.34512396694214875, "grad_norm": 4.576849460601807, "learning_rate": 7.429517404736426e-06, "loss": 0.3949, "step": 3132 }, { "epoch": 0.3452341597796143, "grad_norm": 6.163484573364258, "learning_rate": 7.42798909996301e-06, "loss": 0.3809, "step": 3133 }, { "epoch": 0.3453443526170799, "grad_norm": 8.160926818847656, "learning_rate": 7.42646049829179e-06, "loss": 0.432, "step": 3134 }, { "epoch": 0.34545454545454546, "grad_norm": 7.556317329406738, "learning_rate": 7.424931599909682e-06, "loss": 0.4434, "step": 3135 }, { "epoch": 0.345564738292011, "grad_norm": 5.177813529968262, "learning_rate": 7.423402405003645e-06, "loss": 0.3683, "step": 3136 }, { "epoch": 0.34567493112947656, "grad_norm": 8.282599449157715, "learning_rate": 7.421872913760671e-06, "loss": 0.4316, "step": 3137 }, { "epoch": 0.34578512396694217, "grad_norm": 4.865157604217529, "learning_rate": 7.420343126367785e-06, "loss": 0.4248, "step": 3138 }, { "epoch": 0.3458953168044077, "grad_norm": 6.126284599304199, "learning_rate": 7.418813043012057e-06, "loss": 0.4936, "step": 3139 }, { "epoch": 0.34600550964187327, "grad_norm": 8.956912994384766, "learning_rate": 7.417282663880582e-06, "loss": 0.4199, "step": 3140 }, { "epoch": 0.3461157024793388, "grad_norm": 4.355523586273193, "learning_rate": 7.415751989160499e-06, "loss": 0.3804, "step": 3141 }, { "epoch": 0.3462258953168044, "grad_norm": 6.272829055786133, "learning_rate": 7.414221019038983e-06, "loss": 0.3715, "step": 3142 }, { "epoch": 0.34633608815426997, "grad_norm": 7.41179895401001, "learning_rate": 7.4126897537032396e-06, "loss": 0.4484, "step": 3143 }, { "epoch": 0.3464462809917355, "grad_norm": 9.036253929138184, "learning_rate": 7.411158193340517e-06, "loss": 0.4919, "step": 3144 }, { "epoch": 0.34655647382920113, "grad_norm": 9.739233016967773, "learning_rate": 7.409626338138096e-06, "loss": 0.3531, "step": 3145 }, { "epoch": 0.3466666666666667, "grad_norm": 5.984482288360596, "learning_rate": 7.408094188283291e-06, "loss": 0.381, "step": 3146 }, { "epoch": 0.34677685950413223, "grad_norm": 5.840088844299316, "learning_rate": 7.40656174396346e-06, "loss": 0.5005, "step": 3147 }, { "epoch": 0.3468870523415978, "grad_norm": 5.8981475830078125, "learning_rate": 7.405029005365989e-06, "loss": 0.3878, "step": 3148 }, { "epoch": 0.3469972451790634, "grad_norm": 7.689345359802246, "learning_rate": 7.403495972678303e-06, "loss": 0.4888, "step": 3149 }, { "epoch": 0.34710743801652894, "grad_norm": 7.819447994232178, "learning_rate": 7.401962646087867e-06, "loss": 0.3577, "step": 3150 }, { "epoch": 0.3472176308539945, "grad_norm": 7.267233848571777, "learning_rate": 7.400429025782174e-06, "loss": 0.5349, "step": 3151 }, { "epoch": 0.34732782369146004, "grad_norm": 8.139227867126465, "learning_rate": 7.398895111948761e-06, "loss": 0.5192, "step": 3152 }, { "epoch": 0.34743801652892564, "grad_norm": 5.7755608558654785, "learning_rate": 7.397360904775193e-06, "loss": 0.4215, "step": 3153 }, { "epoch": 0.3475482093663912, "grad_norm": 8.518473625183105, "learning_rate": 7.395826404449078e-06, "loss": 0.4111, "step": 3154 }, { "epoch": 0.34765840220385674, "grad_norm": 9.734333038330078, "learning_rate": 7.394291611158056e-06, "loss": 0.5013, "step": 3155 }, { "epoch": 0.3477685950413223, "grad_norm": 5.6584296226501465, "learning_rate": 7.392756525089804e-06, "loss": 0.4243, "step": 3156 }, { "epoch": 0.3478787878787879, "grad_norm": 5.8375020027160645, "learning_rate": 7.3912211464320324e-06, "loss": 0.4266, "step": 3157 }, { "epoch": 0.34798898071625345, "grad_norm": 4.557214260101318, "learning_rate": 7.3896854753724926e-06, "loss": 0.4037, "step": 3158 }, { "epoch": 0.348099173553719, "grad_norm": 5.716039657592773, "learning_rate": 7.3881495120989644e-06, "loss": 0.3868, "step": 3159 }, { "epoch": 0.34820936639118455, "grad_norm": 9.541804313659668, "learning_rate": 7.3866132567992725e-06, "loss": 0.4031, "step": 3160 }, { "epoch": 0.34831955922865016, "grad_norm": 7.745457649230957, "learning_rate": 7.385076709661268e-06, "loss": 0.4666, "step": 3161 }, { "epoch": 0.3484297520661157, "grad_norm": 5.802303314208984, "learning_rate": 7.3835398708728434e-06, "loss": 0.4277, "step": 3162 }, { "epoch": 0.34853994490358126, "grad_norm": 8.434252738952637, "learning_rate": 7.382002740621927e-06, "loss": 0.432, "step": 3163 }, { "epoch": 0.3486501377410468, "grad_norm": 6.1905999183654785, "learning_rate": 7.380465319096478e-06, "loss": 0.4522, "step": 3164 }, { "epoch": 0.3487603305785124, "grad_norm": 8.930573463439941, "learning_rate": 7.3789276064845e-06, "loss": 0.5147, "step": 3165 }, { "epoch": 0.34887052341597796, "grad_norm": 4.414763450622559, "learning_rate": 7.3773896029740185e-06, "loss": 0.3796, "step": 3166 }, { "epoch": 0.3489807162534435, "grad_norm": 5.990714073181152, "learning_rate": 7.375851308753109e-06, "loss": 0.4047, "step": 3167 }, { "epoch": 0.3490909090909091, "grad_norm": 13.556224822998047, "learning_rate": 7.3743127240098746e-06, "loss": 0.3992, "step": 3168 }, { "epoch": 0.34920110192837467, "grad_norm": 7.954684734344482, "learning_rate": 7.3727738489324545e-06, "loss": 0.4949, "step": 3169 }, { "epoch": 0.3493112947658402, "grad_norm": 9.754575729370117, "learning_rate": 7.371234683709025e-06, "loss": 0.5573, "step": 3170 }, { "epoch": 0.34942148760330577, "grad_norm": 6.305998802185059, "learning_rate": 7.369695228527796e-06, "loss": 0.4003, "step": 3171 }, { "epoch": 0.3495316804407714, "grad_norm": 5.422273635864258, "learning_rate": 7.368155483577017e-06, "loss": 0.4312, "step": 3172 }, { "epoch": 0.3496418732782369, "grad_norm": 7.9743781089782715, "learning_rate": 7.366615449044969e-06, "loss": 0.3525, "step": 3173 }, { "epoch": 0.3497520661157025, "grad_norm": 3.8766064643859863, "learning_rate": 7.365075125119969e-06, "loss": 0.4194, "step": 3174 }, { "epoch": 0.349862258953168, "grad_norm": 5.437851905822754, "learning_rate": 7.36353451199037e-06, "loss": 0.4043, "step": 3175 }, { "epoch": 0.34997245179063363, "grad_norm": 6.247857570648193, "learning_rate": 7.36199360984456e-06, "loss": 0.3724, "step": 3176 }, { "epoch": 0.3500826446280992, "grad_norm": 5.689133167266846, "learning_rate": 7.3604524188709625e-06, "loss": 0.4144, "step": 3177 }, { "epoch": 0.35019283746556473, "grad_norm": 7.601550579071045, "learning_rate": 7.358910939258038e-06, "loss": 0.4944, "step": 3178 }, { "epoch": 0.3503030303030303, "grad_norm": 4.168118000030518, "learning_rate": 7.35736917119428e-06, "loss": 0.3522, "step": 3179 }, { "epoch": 0.3504132231404959, "grad_norm": 5.866237640380859, "learning_rate": 7.355827114868216e-06, "loss": 0.3146, "step": 3180 }, { "epoch": 0.35052341597796144, "grad_norm": 8.666057586669922, "learning_rate": 7.354284770468411e-06, "loss": 0.4484, "step": 3181 }, { "epoch": 0.350633608815427, "grad_norm": 17.02338981628418, "learning_rate": 7.35274213818347e-06, "loss": 0.5835, "step": 3182 }, { "epoch": 0.35074380165289254, "grad_norm": 5.194803237915039, "learning_rate": 7.351199218202023e-06, "loss": 0.4193, "step": 3183 }, { "epoch": 0.35085399449035815, "grad_norm": 9.60494613647461, "learning_rate": 7.3496560107127405e-06, "loss": 0.4136, "step": 3184 }, { "epoch": 0.3509641873278237, "grad_norm": 7.146261692047119, "learning_rate": 7.348112515904331e-06, "loss": 0.4132, "step": 3185 }, { "epoch": 0.35107438016528925, "grad_norm": 5.76173210144043, "learning_rate": 7.346568733965534e-06, "loss": 0.4593, "step": 3186 }, { "epoch": 0.3511845730027548, "grad_norm": 5.917609691619873, "learning_rate": 7.345024665085121e-06, "loss": 0.4321, "step": 3187 }, { "epoch": 0.3512947658402204, "grad_norm": 7.550190448760986, "learning_rate": 7.3434803094519096e-06, "loss": 0.4516, "step": 3188 }, { "epoch": 0.35140495867768595, "grad_norm": 6.343799114227295, "learning_rate": 7.3419356672547425e-06, "loss": 0.4434, "step": 3189 }, { "epoch": 0.3515151515151515, "grad_norm": 9.271021842956543, "learning_rate": 7.3403907386824995e-06, "loss": 0.4578, "step": 3190 }, { "epoch": 0.35162534435261705, "grad_norm": 5.087544918060303, "learning_rate": 7.3388455239240986e-06, "loss": 0.3618, "step": 3191 }, { "epoch": 0.35173553719008266, "grad_norm": 5.713773727416992, "learning_rate": 7.33730002316849e-06, "loss": 0.4621, "step": 3192 }, { "epoch": 0.3518457300275482, "grad_norm": 7.254720687866211, "learning_rate": 7.335754236604661e-06, "loss": 0.415, "step": 3193 }, { "epoch": 0.35195592286501376, "grad_norm": 5.747312068939209, "learning_rate": 7.33420816442163e-06, "loss": 0.4226, "step": 3194 }, { "epoch": 0.35206611570247937, "grad_norm": 6.117341041564941, "learning_rate": 7.332661806808452e-06, "loss": 0.5094, "step": 3195 }, { "epoch": 0.3521763085399449, "grad_norm": 5.964598655700684, "learning_rate": 7.331115163954223e-06, "loss": 0.4295, "step": 3196 }, { "epoch": 0.35228650137741047, "grad_norm": 7.39686393737793, "learning_rate": 7.329568236048064e-06, "loss": 0.5029, "step": 3197 }, { "epoch": 0.352396694214876, "grad_norm": 8.714959144592285, "learning_rate": 7.328021023279136e-06, "loss": 0.4611, "step": 3198 }, { "epoch": 0.3525068870523416, "grad_norm": 10.739441871643066, "learning_rate": 7.326473525836635e-06, "loss": 0.4679, "step": 3199 }, { "epoch": 0.3526170798898072, "grad_norm": 4.571356773376465, "learning_rate": 7.324925743909792e-06, "loss": 0.4637, "step": 3200 }, { "epoch": 0.3527272727272727, "grad_norm": 5.35941743850708, "learning_rate": 7.323377677687871e-06, "loss": 0.4359, "step": 3201 }, { "epoch": 0.3528374655647383, "grad_norm": 6.5187482833862305, "learning_rate": 7.32182932736017e-06, "loss": 0.4485, "step": 3202 }, { "epoch": 0.3529476584022039, "grad_norm": 5.340867519378662, "learning_rate": 7.320280693116027e-06, "loss": 0.4535, "step": 3203 }, { "epoch": 0.35305785123966943, "grad_norm": 7.739713668823242, "learning_rate": 7.3187317751448076e-06, "loss": 0.3692, "step": 3204 }, { "epoch": 0.353168044077135, "grad_norm": 5.844455242156982, "learning_rate": 7.317182573635917e-06, "loss": 0.3821, "step": 3205 }, { "epoch": 0.35327823691460053, "grad_norm": 9.835358619689941, "learning_rate": 7.315633088778794e-06, "loss": 0.4196, "step": 3206 }, { "epoch": 0.35338842975206614, "grad_norm": 10.456611633300781, "learning_rate": 7.314083320762913e-06, "loss": 0.4098, "step": 3207 }, { "epoch": 0.3534986225895317, "grad_norm": 10.098512649536133, "learning_rate": 7.312533269777777e-06, "loss": 0.5049, "step": 3208 }, { "epoch": 0.35360881542699724, "grad_norm": 4.391416072845459, "learning_rate": 7.310982936012933e-06, "loss": 0.4541, "step": 3209 }, { "epoch": 0.3537190082644628, "grad_norm": 11.859990119934082, "learning_rate": 7.309432319657957e-06, "loss": 0.5331, "step": 3210 }, { "epoch": 0.3538292011019284, "grad_norm": 9.845305442810059, "learning_rate": 7.307881420902461e-06, "loss": 0.3995, "step": 3211 }, { "epoch": 0.35393939393939394, "grad_norm": 6.509954452514648, "learning_rate": 7.3063302399360865e-06, "loss": 0.405, "step": 3212 }, { "epoch": 0.3540495867768595, "grad_norm": 6.005599498748779, "learning_rate": 7.30477877694852e-06, "loss": 0.3491, "step": 3213 }, { "epoch": 0.35415977961432504, "grad_norm": 4.321752548217773, "learning_rate": 7.303227032129474e-06, "loss": 0.4173, "step": 3214 }, { "epoch": 0.35426997245179065, "grad_norm": 9.19875431060791, "learning_rate": 7.301675005668697e-06, "loss": 0.4207, "step": 3215 }, { "epoch": 0.3543801652892562, "grad_norm": 4.377025127410889, "learning_rate": 7.300122697755974e-06, "loss": 0.3493, "step": 3216 }, { "epoch": 0.35449035812672175, "grad_norm": 4.898425579071045, "learning_rate": 7.298570108581123e-06, "loss": 0.4039, "step": 3217 }, { "epoch": 0.35460055096418736, "grad_norm": 11.626812934875488, "learning_rate": 7.297017238333997e-06, "loss": 0.4473, "step": 3218 }, { "epoch": 0.3547107438016529, "grad_norm": 5.8675994873046875, "learning_rate": 7.295464087204483e-06, "loss": 0.3992, "step": 3219 }, { "epoch": 0.35482093663911846, "grad_norm": 11.243317604064941, "learning_rate": 7.293910655382501e-06, "loss": 0.4998, "step": 3220 }, { "epoch": 0.354931129476584, "grad_norm": 8.46310043334961, "learning_rate": 7.292356943058011e-06, "loss": 0.4724, "step": 3221 }, { "epoch": 0.3550413223140496, "grad_norm": 10.018592834472656, "learning_rate": 7.290802950420998e-06, "loss": 0.4469, "step": 3222 }, { "epoch": 0.35515151515151516, "grad_norm": 6.223527431488037, "learning_rate": 7.289248677661488e-06, "loss": 0.5229, "step": 3223 }, { "epoch": 0.3552617079889807, "grad_norm": 9.454200744628906, "learning_rate": 7.287694124969542e-06, "loss": 0.4397, "step": 3224 }, { "epoch": 0.35537190082644626, "grad_norm": 4.261337757110596, "learning_rate": 7.286139292535249e-06, "loss": 0.4521, "step": 3225 }, { "epoch": 0.35548209366391187, "grad_norm": 12.140992164611816, "learning_rate": 7.28458418054874e-06, "loss": 0.5055, "step": 3226 }, { "epoch": 0.3555922865013774, "grad_norm": 10.759506225585938, "learning_rate": 7.2830287892001705e-06, "loss": 0.5065, "step": 3227 }, { "epoch": 0.35570247933884297, "grad_norm": 9.743803024291992, "learning_rate": 7.281473118679743e-06, "loss": 0.5222, "step": 3228 }, { "epoch": 0.3558126721763085, "grad_norm": 5.370841026306152, "learning_rate": 7.2799171691776816e-06, "loss": 0.4211, "step": 3229 }, { "epoch": 0.3559228650137741, "grad_norm": 4.253917217254639, "learning_rate": 7.278360940884252e-06, "loss": 0.4418, "step": 3230 }, { "epoch": 0.3560330578512397, "grad_norm": 5.469326496124268, "learning_rate": 7.276804433989753e-06, "loss": 0.4074, "step": 3231 }, { "epoch": 0.3561432506887052, "grad_norm": 6.347237586975098, "learning_rate": 7.275247648684514e-06, "loss": 0.4117, "step": 3232 }, { "epoch": 0.3562534435261708, "grad_norm": 10.482932090759277, "learning_rate": 7.273690585158901e-06, "loss": 0.4599, "step": 3233 }, { "epoch": 0.3563636363636364, "grad_norm": 5.696188449859619, "learning_rate": 7.272133243603317e-06, "loss": 0.4533, "step": 3234 }, { "epoch": 0.35647382920110193, "grad_norm": 8.62669849395752, "learning_rate": 7.270575624208192e-06, "loss": 0.4449, "step": 3235 }, { "epoch": 0.3565840220385675, "grad_norm": 8.257272720336914, "learning_rate": 7.269017727163995e-06, "loss": 0.3958, "step": 3236 }, { "epoch": 0.35669421487603303, "grad_norm": 5.441015720367432, "learning_rate": 7.267459552661229e-06, "loss": 0.4107, "step": 3237 }, { "epoch": 0.35680440771349864, "grad_norm": 7.645590782165527, "learning_rate": 7.26590110089043e-06, "loss": 0.4335, "step": 3238 }, { "epoch": 0.3569146005509642, "grad_norm": 5.621476650238037, "learning_rate": 7.264342372042165e-06, "loss": 0.4162, "step": 3239 }, { "epoch": 0.35702479338842974, "grad_norm": 7.011815547943115, "learning_rate": 7.2627833663070394e-06, "loss": 0.4693, "step": 3240 }, { "epoch": 0.3571349862258953, "grad_norm": 5.472735404968262, "learning_rate": 7.261224083875688e-06, "loss": 0.485, "step": 3241 }, { "epoch": 0.3572451790633609, "grad_norm": 4.558168888092041, "learning_rate": 7.2596645249387876e-06, "loss": 0.4939, "step": 3242 }, { "epoch": 0.35735537190082645, "grad_norm": 5.512816905975342, "learning_rate": 7.258104689687038e-06, "loss": 0.4622, "step": 3243 }, { "epoch": 0.357465564738292, "grad_norm": 10.716435432434082, "learning_rate": 7.25654457831118e-06, "loss": 0.503, "step": 3244 }, { "epoch": 0.3575757575757576, "grad_norm": 5.423465251922607, "learning_rate": 7.254984191001986e-06, "loss": 0.4033, "step": 3245 }, { "epoch": 0.35768595041322315, "grad_norm": 7.701810836791992, "learning_rate": 7.253423527950259e-06, "loss": 0.4288, "step": 3246 }, { "epoch": 0.3577961432506887, "grad_norm": 6.4743242263793945, "learning_rate": 7.251862589346845e-06, "loss": 0.389, "step": 3247 }, { "epoch": 0.35790633608815425, "grad_norm": 6.115100383758545, "learning_rate": 7.2503013753826135e-06, "loss": 0.41, "step": 3248 }, { "epoch": 0.35801652892561986, "grad_norm": 4.2141289710998535, "learning_rate": 7.248739886248475e-06, "loss": 0.4381, "step": 3249 }, { "epoch": 0.3581267217630854, "grad_norm": 5.334897994995117, "learning_rate": 7.247178122135368e-06, "loss": 0.3945, "step": 3250 }, { "epoch": 0.35823691460055096, "grad_norm": 6.416749000549316, "learning_rate": 7.245616083234266e-06, "loss": 0.4502, "step": 3251 }, { "epoch": 0.3583471074380165, "grad_norm": 6.545506954193115, "learning_rate": 7.244053769736181e-06, "loss": 0.4508, "step": 3252 }, { "epoch": 0.3584573002754821, "grad_norm": 5.6274003982543945, "learning_rate": 7.242491181832151e-06, "loss": 0.4527, "step": 3253 }, { "epoch": 0.35856749311294767, "grad_norm": 6.930521488189697, "learning_rate": 7.240928319713253e-06, "loss": 0.367, "step": 3254 }, { "epoch": 0.3586776859504132, "grad_norm": 12.740245819091797, "learning_rate": 7.239365183570597e-06, "loss": 0.431, "step": 3255 }, { "epoch": 0.35878787878787877, "grad_norm": 5.199628829956055, "learning_rate": 7.237801773595325e-06, "loss": 0.4571, "step": 3256 }, { "epoch": 0.3588980716253444, "grad_norm": 8.44864559173584, "learning_rate": 7.236238089978613e-06, "loss": 0.3375, "step": 3257 }, { "epoch": 0.3590082644628099, "grad_norm": 4.241014003753662, "learning_rate": 7.234674132911668e-06, "loss": 0.4201, "step": 3258 }, { "epoch": 0.3591184573002755, "grad_norm": 10.057315826416016, "learning_rate": 7.233109902585735e-06, "loss": 0.42, "step": 3259 }, { "epoch": 0.359228650137741, "grad_norm": 6.158026218414307, "learning_rate": 7.23154539919209e-06, "loss": 0.4335, "step": 3260 }, { "epoch": 0.35933884297520663, "grad_norm": 7.874216079711914, "learning_rate": 7.2299806229220416e-06, "loss": 0.5172, "step": 3261 }, { "epoch": 0.3594490358126722, "grad_norm": 5.500450134277344, "learning_rate": 7.228415573966934e-06, "loss": 0.383, "step": 3262 }, { "epoch": 0.35955922865013773, "grad_norm": 6.57933235168457, "learning_rate": 7.226850252518144e-06, "loss": 0.4605, "step": 3263 }, { "epoch": 0.3596694214876033, "grad_norm": 5.754207611083984, "learning_rate": 7.225284658767077e-06, "loss": 0.482, "step": 3264 }, { "epoch": 0.3597796143250689, "grad_norm": 8.27151870727539, "learning_rate": 7.223718792905183e-06, "loss": 0.4389, "step": 3265 }, { "epoch": 0.35988980716253444, "grad_norm": 6.193888187408447, "learning_rate": 7.222152655123933e-06, "loss": 0.4712, "step": 3266 }, { "epoch": 0.36, "grad_norm": 5.438370227813721, "learning_rate": 7.220586245614838e-06, "loss": 0.372, "step": 3267 }, { "epoch": 0.36011019283746554, "grad_norm": 5.420002460479736, "learning_rate": 7.219019564569441e-06, "loss": 0.394, "step": 3268 }, { "epoch": 0.36022038567493114, "grad_norm": 8.265101432800293, "learning_rate": 7.217452612179314e-06, "loss": 0.4221, "step": 3269 }, { "epoch": 0.3603305785123967, "grad_norm": 8.280856132507324, "learning_rate": 7.215885388636075e-06, "loss": 0.4387, "step": 3270 }, { "epoch": 0.36044077134986224, "grad_norm": 9.710977554321289, "learning_rate": 7.214317894131357e-06, "loss": 0.4793, "step": 3271 }, { "epoch": 0.36055096418732785, "grad_norm": 4.9738688468933105, "learning_rate": 7.212750128856839e-06, "loss": 0.399, "step": 3272 }, { "epoch": 0.3606611570247934, "grad_norm": 18.947298049926758, "learning_rate": 7.211182093004231e-06, "loss": 0.4313, "step": 3273 }, { "epoch": 0.36077134986225895, "grad_norm": 6.143950939178467, "learning_rate": 7.209613786765272e-06, "loss": 0.3856, "step": 3274 }, { "epoch": 0.3608815426997245, "grad_norm": 5.4570465087890625, "learning_rate": 7.208045210331738e-06, "loss": 0.3718, "step": 3275 }, { "epoch": 0.3609917355371901, "grad_norm": 5.805082321166992, "learning_rate": 7.206476363895436e-06, "loss": 0.3773, "step": 3276 }, { "epoch": 0.36110192837465566, "grad_norm": 11.431785583496094, "learning_rate": 7.204907247648207e-06, "loss": 0.4925, "step": 3277 }, { "epoch": 0.3612121212121212, "grad_norm": 13.084609031677246, "learning_rate": 7.203337861781926e-06, "loss": 0.3695, "step": 3278 }, { "epoch": 0.36132231404958676, "grad_norm": 6.078426361083984, "learning_rate": 7.201768206488498e-06, "loss": 0.4312, "step": 3279 }, { "epoch": 0.36143250688705236, "grad_norm": 8.034586906433105, "learning_rate": 7.200198281959863e-06, "loss": 0.4721, "step": 3280 }, { "epoch": 0.3615426997245179, "grad_norm": 6.784212589263916, "learning_rate": 7.198628088387992e-06, "loss": 0.4449, "step": 3281 }, { "epoch": 0.36165289256198346, "grad_norm": 6.045942306518555, "learning_rate": 7.197057625964892e-06, "loss": 0.3875, "step": 3282 }, { "epoch": 0.361763085399449, "grad_norm": 5.697310924530029, "learning_rate": 7.195486894882602e-06, "loss": 0.3944, "step": 3283 }, { "epoch": 0.3618732782369146, "grad_norm": 7.050083637237549, "learning_rate": 7.193915895333192e-06, "loss": 0.4038, "step": 3284 }, { "epoch": 0.36198347107438017, "grad_norm": 9.088546752929688, "learning_rate": 7.192344627508767e-06, "loss": 0.5356, "step": 3285 }, { "epoch": 0.3620936639118457, "grad_norm": 9.426963806152344, "learning_rate": 7.190773091601461e-06, "loss": 0.5076, "step": 3286 }, { "epoch": 0.36220385674931127, "grad_norm": 5.8113813400268555, "learning_rate": 7.189201287803447e-06, "loss": 0.4346, "step": 3287 }, { "epoch": 0.3623140495867769, "grad_norm": 7.260709285736084, "learning_rate": 7.187629216306925e-06, "loss": 0.4405, "step": 3288 }, { "epoch": 0.3624242424242424, "grad_norm": 7.138723850250244, "learning_rate": 7.18605687730413e-06, "loss": 0.3697, "step": 3289 }, { "epoch": 0.362534435261708, "grad_norm": 8.327974319458008, "learning_rate": 7.184484270987333e-06, "loss": 0.4503, "step": 3290 }, { "epoch": 0.3626446280991735, "grad_norm": 7.641798496246338, "learning_rate": 7.182911397548831e-06, "loss": 0.3697, "step": 3291 }, { "epoch": 0.36275482093663913, "grad_norm": 6.12933349609375, "learning_rate": 7.181338257180956e-06, "loss": 0.4458, "step": 3292 }, { "epoch": 0.3628650137741047, "grad_norm": 5.816046714782715, "learning_rate": 7.179764850076078e-06, "loss": 0.438, "step": 3293 }, { "epoch": 0.36297520661157023, "grad_norm": 8.879964828491211, "learning_rate": 7.178191176426594e-06, "loss": 0.3965, "step": 3294 }, { "epoch": 0.36308539944903584, "grad_norm": 7.10797643661499, "learning_rate": 7.176617236424932e-06, "loss": 0.3342, "step": 3295 }, { "epoch": 0.3631955922865014, "grad_norm": 6.46264123916626, "learning_rate": 7.17504303026356e-06, "loss": 0.491, "step": 3296 }, { "epoch": 0.36330578512396694, "grad_norm": 6.991508960723877, "learning_rate": 7.173468558134969e-06, "loss": 0.3825, "step": 3297 }, { "epoch": 0.3634159779614325, "grad_norm": 6.524596214294434, "learning_rate": 7.171893820231693e-06, "loss": 0.4312, "step": 3298 }, { "epoch": 0.3635261707988981, "grad_norm": 25.75640296936035, "learning_rate": 7.170318816746289e-06, "loss": 0.4901, "step": 3299 }, { "epoch": 0.36363636363636365, "grad_norm": 9.401847839355469, "learning_rate": 7.168743547871353e-06, "loss": 0.4732, "step": 3300 }, { "epoch": 0.3637465564738292, "grad_norm": 6.0926666259765625, "learning_rate": 7.167168013799509e-06, "loss": 0.4114, "step": 3301 }, { "epoch": 0.36385674931129475, "grad_norm": 4.8055901527404785, "learning_rate": 7.165592214723416e-06, "loss": 0.3633, "step": 3302 }, { "epoch": 0.36396694214876035, "grad_norm": 7.8886847496032715, "learning_rate": 7.164016150835766e-06, "loss": 0.4952, "step": 3303 }, { "epoch": 0.3640771349862259, "grad_norm": 6.765124797821045, "learning_rate": 7.162439822329282e-06, "loss": 0.3733, "step": 3304 }, { "epoch": 0.36418732782369145, "grad_norm": 7.276710033416748, "learning_rate": 7.160863229396719e-06, "loss": 0.4238, "step": 3305 }, { "epoch": 0.364297520661157, "grad_norm": 5.627334117889404, "learning_rate": 7.159286372230865e-06, "loss": 0.3867, "step": 3306 }, { "epoch": 0.3644077134986226, "grad_norm": 6.506944179534912, "learning_rate": 7.157709251024539e-06, "loss": 0.4296, "step": 3307 }, { "epoch": 0.36451790633608816, "grad_norm": 6.304716110229492, "learning_rate": 7.156131865970597e-06, "loss": 0.4234, "step": 3308 }, { "epoch": 0.3646280991735537, "grad_norm": 9.835976600646973, "learning_rate": 7.154554217261921e-06, "loss": 0.4426, "step": 3309 }, { "epoch": 0.36473829201101926, "grad_norm": 8.039626121520996, "learning_rate": 7.152976305091427e-06, "loss": 0.4893, "step": 3310 }, { "epoch": 0.36484848484848487, "grad_norm": 4.799577236175537, "learning_rate": 7.151398129652067e-06, "loss": 0.4305, "step": 3311 }, { "epoch": 0.3649586776859504, "grad_norm": 6.675243854522705, "learning_rate": 7.149819691136822e-06, "loss": 0.3778, "step": 3312 }, { "epoch": 0.36506887052341597, "grad_norm": 7.772543907165527, "learning_rate": 7.148240989738705e-06, "loss": 0.4963, "step": 3313 }, { "epoch": 0.3651790633608815, "grad_norm": 12.201492309570312, "learning_rate": 7.1466620256507605e-06, "loss": 0.5121, "step": 3314 }, { "epoch": 0.3652892561983471, "grad_norm": 7.570444583892822, "learning_rate": 7.145082799066067e-06, "loss": 0.5187, "step": 3315 }, { "epoch": 0.3653994490358127, "grad_norm": 6.17292594909668, "learning_rate": 7.143503310177737e-06, "loss": 0.4445, "step": 3316 }, { "epoch": 0.3655096418732782, "grad_norm": 9.412154197692871, "learning_rate": 7.141923559178909e-06, "loss": 0.4234, "step": 3317 }, { "epoch": 0.3656198347107438, "grad_norm": 6.886417388916016, "learning_rate": 7.14034354626276e-06, "loss": 0.4243, "step": 3318 }, { "epoch": 0.3657300275482094, "grad_norm": 6.714693546295166, "learning_rate": 7.138763271622494e-06, "loss": 0.3521, "step": 3319 }, { "epoch": 0.36584022038567493, "grad_norm": 6.7171711921691895, "learning_rate": 7.137182735451349e-06, "loss": 0.4254, "step": 3320 }, { "epoch": 0.3659504132231405, "grad_norm": 8.124762535095215, "learning_rate": 7.135601937942598e-06, "loss": 0.4031, "step": 3321 }, { "epoch": 0.3660606060606061, "grad_norm": 3.9232611656188965, "learning_rate": 7.13402087928954e-06, "loss": 0.4102, "step": 3322 }, { "epoch": 0.36617079889807164, "grad_norm": 8.025232315063477, "learning_rate": 7.13243955968551e-06, "loss": 0.394, "step": 3323 }, { "epoch": 0.3662809917355372, "grad_norm": 6.724118232727051, "learning_rate": 7.130857979323875e-06, "loss": 0.428, "step": 3324 }, { "epoch": 0.36639118457300274, "grad_norm": 5.150669097900391, "learning_rate": 7.12927613839803e-06, "loss": 0.4169, "step": 3325 }, { "epoch": 0.36650137741046834, "grad_norm": 9.238730430603027, "learning_rate": 7.127694037101409e-06, "loss": 0.4994, "step": 3326 }, { "epoch": 0.3666115702479339, "grad_norm": 7.163279056549072, "learning_rate": 7.126111675627469e-06, "loss": 0.4505, "step": 3327 }, { "epoch": 0.36672176308539944, "grad_norm": 7.309119701385498, "learning_rate": 7.124529054169705e-06, "loss": 0.3855, "step": 3328 }, { "epoch": 0.366831955922865, "grad_norm": 7.18917179107666, "learning_rate": 7.122946172921644e-06, "loss": 0.4642, "step": 3329 }, { "epoch": 0.3669421487603306, "grad_norm": 4.3568572998046875, "learning_rate": 7.12136303207684e-06, "loss": 0.401, "step": 3330 }, { "epoch": 0.36705234159779615, "grad_norm": 7.7685980796813965, "learning_rate": 7.119779631828882e-06, "loss": 0.3312, "step": 3331 }, { "epoch": 0.3671625344352617, "grad_norm": 6.122189998626709, "learning_rate": 7.1181959723713935e-06, "loss": 0.4262, "step": 3332 }, { "epoch": 0.36727272727272725, "grad_norm": 3.9495091438293457, "learning_rate": 7.116612053898022e-06, "loss": 0.3853, "step": 3333 }, { "epoch": 0.36738292011019286, "grad_norm": 3.9788098335266113, "learning_rate": 7.115027876602456e-06, "loss": 0.4517, "step": 3334 }, { "epoch": 0.3674931129476584, "grad_norm": 4.041481971740723, "learning_rate": 7.113443440678406e-06, "loss": 0.4503, "step": 3335 }, { "epoch": 0.36760330578512396, "grad_norm": 18.059675216674805, "learning_rate": 7.111858746319622e-06, "loss": 0.5433, "step": 3336 }, { "epoch": 0.3677134986225895, "grad_norm": 9.006142616271973, "learning_rate": 7.110273793719882e-06, "loss": 0.5144, "step": 3337 }, { "epoch": 0.3678236914600551, "grad_norm": 4.624941825866699, "learning_rate": 7.108688583072996e-06, "loss": 0.4708, "step": 3338 }, { "epoch": 0.36793388429752066, "grad_norm": 7.720061302185059, "learning_rate": 7.107103114572805e-06, "loss": 0.4989, "step": 3339 }, { "epoch": 0.3680440771349862, "grad_norm": 5.653432846069336, "learning_rate": 7.1055173884131835e-06, "loss": 0.4084, "step": 3340 }, { "epoch": 0.36815426997245176, "grad_norm": 13.400155067443848, "learning_rate": 7.103931404788034e-06, "loss": 0.4775, "step": 3341 }, { "epoch": 0.36826446280991737, "grad_norm": 9.454264640808105, "learning_rate": 7.102345163891297e-06, "loss": 0.448, "step": 3342 }, { "epoch": 0.3683746556473829, "grad_norm": 5.640964984893799, "learning_rate": 7.100758665916938e-06, "loss": 0.4596, "step": 3343 }, { "epoch": 0.36848484848484847, "grad_norm": 8.44428825378418, "learning_rate": 7.099171911058954e-06, "loss": 0.4497, "step": 3344 }, { "epoch": 0.3685950413223141, "grad_norm": 5.792271137237549, "learning_rate": 7.0975848995113775e-06, "loss": 0.4347, "step": 3345 }, { "epoch": 0.3687052341597796, "grad_norm": 7.476680755615234, "learning_rate": 7.09599763146827e-06, "loss": 0.3456, "step": 3346 }, { "epoch": 0.3688154269972452, "grad_norm": 5.843869209289551, "learning_rate": 7.094410107123726e-06, "loss": 0.4177, "step": 3347 }, { "epoch": 0.3689256198347107, "grad_norm": 7.796284198760986, "learning_rate": 7.092822326671867e-06, "loss": 0.4709, "step": 3348 }, { "epoch": 0.36903581267217633, "grad_norm": 14.648262023925781, "learning_rate": 7.091234290306853e-06, "loss": 0.4764, "step": 3349 }, { "epoch": 0.3691460055096419, "grad_norm": 6.055613994598389, "learning_rate": 7.08964599822287e-06, "loss": 0.3784, "step": 3350 }, { "epoch": 0.36925619834710743, "grad_norm": 4.72700834274292, "learning_rate": 7.088057450614133e-06, "loss": 0.4069, "step": 3351 }, { "epoch": 0.369366391184573, "grad_norm": 6.382406234741211, "learning_rate": 7.0864686476748965e-06, "loss": 0.3952, "step": 3352 }, { "epoch": 0.3694765840220386, "grad_norm": 7.267900466918945, "learning_rate": 7.084879589599439e-06, "loss": 0.4146, "step": 3353 }, { "epoch": 0.36958677685950414, "grad_norm": 14.743758201599121, "learning_rate": 7.083290276582075e-06, "loss": 0.3825, "step": 3354 }, { "epoch": 0.3696969696969697, "grad_norm": 5.35316276550293, "learning_rate": 7.0817007088171445e-06, "loss": 0.459, "step": 3355 }, { "epoch": 0.36980716253443524, "grad_norm": 4.526485919952393, "learning_rate": 7.080110886499023e-06, "loss": 0.3998, "step": 3356 }, { "epoch": 0.36991735537190085, "grad_norm": 8.61453628540039, "learning_rate": 7.078520809822118e-06, "loss": 0.433, "step": 3357 }, { "epoch": 0.3700275482093664, "grad_norm": 4.0356125831604, "learning_rate": 7.076930478980865e-06, "loss": 0.3755, "step": 3358 }, { "epoch": 0.37013774104683195, "grad_norm": 12.406144142150879, "learning_rate": 7.07533989416973e-06, "loss": 0.4947, "step": 3359 }, { "epoch": 0.3702479338842975, "grad_norm": 7.419103145599365, "learning_rate": 7.0737490555832155e-06, "loss": 0.4703, "step": 3360 }, { "epoch": 0.3703581267217631, "grad_norm": 6.0058112144470215, "learning_rate": 7.072157963415849e-06, "loss": 0.4211, "step": 3361 }, { "epoch": 0.37046831955922865, "grad_norm": 18.789775848388672, "learning_rate": 7.070566617862192e-06, "loss": 0.466, "step": 3362 }, { "epoch": 0.3705785123966942, "grad_norm": 9.18820858001709, "learning_rate": 7.068975019116836e-06, "loss": 0.3948, "step": 3363 }, { "epoch": 0.37068870523415975, "grad_norm": 6.847883701324463, "learning_rate": 7.067383167374405e-06, "loss": 0.4324, "step": 3364 }, { "epoch": 0.37079889807162536, "grad_norm": 8.323562622070312, "learning_rate": 7.065791062829552e-06, "loss": 0.4534, "step": 3365 }, { "epoch": 0.3709090909090909, "grad_norm": 5.818692684173584, "learning_rate": 7.064198705676961e-06, "loss": 0.4155, "step": 3366 }, { "epoch": 0.37101928374655646, "grad_norm": 11.364190101623535, "learning_rate": 7.0626060961113484e-06, "loss": 0.5086, "step": 3367 }, { "epoch": 0.371129476584022, "grad_norm": 7.733094692230225, "learning_rate": 7.061013234327461e-06, "loss": 0.4434, "step": 3368 }, { "epoch": 0.3712396694214876, "grad_norm": 5.380969047546387, "learning_rate": 7.059420120520076e-06, "loss": 0.4551, "step": 3369 }, { "epoch": 0.37134986225895317, "grad_norm": 5.875665664672852, "learning_rate": 7.057826754884001e-06, "loss": 0.4119, "step": 3370 }, { "epoch": 0.3714600550964187, "grad_norm": 8.532805442810059, "learning_rate": 7.056233137614075e-06, "loss": 0.4052, "step": 3371 }, { "epoch": 0.3715702479338843, "grad_norm": 5.982562065124512, "learning_rate": 7.054639268905168e-06, "loss": 0.3746, "step": 3372 }, { "epoch": 0.3716804407713499, "grad_norm": 11.36665153503418, "learning_rate": 7.05304514895218e-06, "loss": 0.5034, "step": 3373 }, { "epoch": 0.3717906336088154, "grad_norm": 8.417113304138184, "learning_rate": 7.051450777950042e-06, "loss": 0.3379, "step": 3374 }, { "epoch": 0.371900826446281, "grad_norm": 9.102882385253906, "learning_rate": 7.049856156093717e-06, "loss": 0.5195, "step": 3375 }, { "epoch": 0.3720110192837466, "grad_norm": 7.243648052215576, "learning_rate": 7.048261283578196e-06, "loss": 0.4003, "step": 3376 }, { "epoch": 0.37212121212121213, "grad_norm": 6.981441974639893, "learning_rate": 7.046666160598504e-06, "loss": 0.4477, "step": 3377 }, { "epoch": 0.3722314049586777, "grad_norm": 7.513751983642578, "learning_rate": 7.045070787349694e-06, "loss": 0.4497, "step": 3378 }, { "epoch": 0.37234159779614323, "grad_norm": 6.859706878662109, "learning_rate": 7.043475164026848e-06, "loss": 0.4278, "step": 3379 }, { "epoch": 0.37245179063360884, "grad_norm": 7.869282245635986, "learning_rate": 7.041879290825086e-06, "loss": 0.5427, "step": 3380 }, { "epoch": 0.3725619834710744, "grad_norm": 11.550929069519043, "learning_rate": 7.040283167939548e-06, "loss": 0.4197, "step": 3381 }, { "epoch": 0.37267217630853994, "grad_norm": 6.503086090087891, "learning_rate": 7.038686795565414e-06, "loss": 0.5284, "step": 3382 }, { "epoch": 0.3727823691460055, "grad_norm": 4.888057231903076, "learning_rate": 7.037090173897889e-06, "loss": 0.392, "step": 3383 }, { "epoch": 0.3728925619834711, "grad_norm": 7.019505023956299, "learning_rate": 7.035493303132211e-06, "loss": 0.3921, "step": 3384 }, { "epoch": 0.37300275482093664, "grad_norm": 4.9363203048706055, "learning_rate": 7.033896183463648e-06, "loss": 0.4847, "step": 3385 }, { "epoch": 0.3731129476584022, "grad_norm": 7.932403564453125, "learning_rate": 7.032298815087495e-06, "loss": 0.4259, "step": 3386 }, { "epoch": 0.37322314049586774, "grad_norm": 5.2227091789245605, "learning_rate": 7.030701198199081e-06, "loss": 0.3965, "step": 3387 }, { "epoch": 0.37333333333333335, "grad_norm": 7.936646938323975, "learning_rate": 7.0291033329937695e-06, "loss": 0.438, "step": 3388 }, { "epoch": 0.3734435261707989, "grad_norm": 5.427149295806885, "learning_rate": 7.027505219666945e-06, "loss": 0.4556, "step": 3389 }, { "epoch": 0.37355371900826445, "grad_norm": 5.356679439544678, "learning_rate": 7.025906858414028e-06, "loss": 0.3175, "step": 3390 }, { "epoch": 0.37366391184573, "grad_norm": 5.085716247558594, "learning_rate": 7.024308249430467e-06, "loss": 0.4417, "step": 3391 }, { "epoch": 0.3737741046831956, "grad_norm": 5.447735786437988, "learning_rate": 7.022709392911745e-06, "loss": 0.4662, "step": 3392 }, { "epoch": 0.37388429752066116, "grad_norm": 8.673369407653809, "learning_rate": 7.0211102890533715e-06, "loss": 0.4313, "step": 3393 }, { "epoch": 0.3739944903581267, "grad_norm": 5.703742504119873, "learning_rate": 7.019510938050884e-06, "loss": 0.4111, "step": 3394 }, { "epoch": 0.3741046831955923, "grad_norm": 5.098775863647461, "learning_rate": 7.017911340099858e-06, "loss": 0.399, "step": 3395 }, { "epoch": 0.37421487603305786, "grad_norm": 5.0954484939575195, "learning_rate": 7.016311495395891e-06, "loss": 0.4055, "step": 3396 }, { "epoch": 0.3743250688705234, "grad_norm": 8.749516487121582, "learning_rate": 7.014711404134616e-06, "loss": 0.3872, "step": 3397 }, { "epoch": 0.37443526170798896, "grad_norm": 10.231471061706543, "learning_rate": 7.013111066511694e-06, "loss": 0.4641, "step": 3398 }, { "epoch": 0.37454545454545457, "grad_norm": 13.29430866241455, "learning_rate": 7.011510482722817e-06, "loss": 0.4989, "step": 3399 }, { "epoch": 0.3746556473829201, "grad_norm": 7.456292152404785, "learning_rate": 7.009909652963704e-06, "loss": 0.5007, "step": 3400 }, { "epoch": 0.37476584022038567, "grad_norm": 5.273218154907227, "learning_rate": 7.0083085774301085e-06, "loss": 0.2961, "step": 3401 }, { "epoch": 0.3748760330578512, "grad_norm": 7.514196395874023, "learning_rate": 7.006707256317813e-06, "loss": 0.4883, "step": 3402 }, { "epoch": 0.3749862258953168, "grad_norm": 8.081720352172852, "learning_rate": 7.005105689822629e-06, "loss": 0.5013, "step": 3403 }, { "epoch": 0.3750964187327824, "grad_norm": 12.821732521057129, "learning_rate": 7.003503878140396e-06, "loss": 0.4893, "step": 3404 }, { "epoch": 0.3752066115702479, "grad_norm": 6.560337066650391, "learning_rate": 7.001901821466988e-06, "loss": 0.4052, "step": 3405 }, { "epoch": 0.3753168044077135, "grad_norm": 9.20208740234375, "learning_rate": 7.000299519998307e-06, "loss": 0.4203, "step": 3406 }, { "epoch": 0.3754269972451791, "grad_norm": 4.343939304351807, "learning_rate": 6.998696973930282e-06, "loss": 0.4376, "step": 3407 }, { "epoch": 0.37553719008264463, "grad_norm": 3.986074209213257, "learning_rate": 6.997094183458877e-06, "loss": 0.409, "step": 3408 }, { "epoch": 0.3756473829201102, "grad_norm": 7.913437366485596, "learning_rate": 6.995491148780082e-06, "loss": 0.426, "step": 3409 }, { "epoch": 0.37575757575757573, "grad_norm": 9.18805980682373, "learning_rate": 6.993887870089918e-06, "loss": 0.4564, "step": 3410 }, { "epoch": 0.37586776859504134, "grad_norm": 4.529977321624756, "learning_rate": 6.992284347584438e-06, "loss": 0.4254, "step": 3411 }, { "epoch": 0.3759779614325069, "grad_norm": 6.054936408996582, "learning_rate": 6.990680581459721e-06, "loss": 0.4183, "step": 3412 }, { "epoch": 0.37608815426997244, "grad_norm": 9.715087890625, "learning_rate": 6.9890765719118805e-06, "loss": 0.4464, "step": 3413 }, { "epoch": 0.376198347107438, "grad_norm": 5.770445346832275, "learning_rate": 6.987472319137052e-06, "loss": 0.3828, "step": 3414 }, { "epoch": 0.3763085399449036, "grad_norm": 8.281036376953125, "learning_rate": 6.9858678233314094e-06, "loss": 0.4281, "step": 3415 }, { "epoch": 0.37641873278236915, "grad_norm": 4.728235244750977, "learning_rate": 6.984263084691153e-06, "loss": 0.4528, "step": 3416 }, { "epoch": 0.3765289256198347, "grad_norm": 4.969788551330566, "learning_rate": 6.98265810341251e-06, "loss": 0.4187, "step": 3417 }, { "epoch": 0.37663911845730025, "grad_norm": 6.069950103759766, "learning_rate": 6.981052879691742e-06, "loss": 0.3874, "step": 3418 }, { "epoch": 0.37674931129476585, "grad_norm": 5.942756175994873, "learning_rate": 6.979447413725136e-06, "loss": 0.4353, "step": 3419 }, { "epoch": 0.3768595041322314, "grad_norm": 10.28392505645752, "learning_rate": 6.977841705709012e-06, "loss": 0.4416, "step": 3420 }, { "epoch": 0.37696969696969695, "grad_norm": 8.235113143920898, "learning_rate": 6.9762357558397176e-06, "loss": 0.4165, "step": 3421 }, { "epoch": 0.37707988980716256, "grad_norm": 5.41470193862915, "learning_rate": 6.974629564313629e-06, "loss": 0.3751, "step": 3422 }, { "epoch": 0.3771900826446281, "grad_norm": 6.02674674987793, "learning_rate": 6.9730231313271565e-06, "loss": 0.4191, "step": 3423 }, { "epoch": 0.37730027548209366, "grad_norm": 8.15639591217041, "learning_rate": 6.971416457076736e-06, "loss": 0.4016, "step": 3424 }, { "epoch": 0.3774104683195592, "grad_norm": 8.789399147033691, "learning_rate": 6.969809541758832e-06, "loss": 0.5495, "step": 3425 }, { "epoch": 0.3775206611570248, "grad_norm": 5.830241680145264, "learning_rate": 6.968202385569942e-06, "loss": 0.4577, "step": 3426 }, { "epoch": 0.37763085399449037, "grad_norm": 8.189421653747559, "learning_rate": 6.966594988706591e-06, "loss": 0.4584, "step": 3427 }, { "epoch": 0.3777410468319559, "grad_norm": 5.825428009033203, "learning_rate": 6.964987351365332e-06, "loss": 0.4046, "step": 3428 }, { "epoch": 0.37785123966942147, "grad_norm": 4.946706771850586, "learning_rate": 6.963379473742752e-06, "loss": 0.3461, "step": 3429 }, { "epoch": 0.3779614325068871, "grad_norm": 6.065890789031982, "learning_rate": 6.961771356035462e-06, "loss": 0.4715, "step": 3430 }, { "epoch": 0.3780716253443526, "grad_norm": 6.467423439025879, "learning_rate": 6.960162998440108e-06, "loss": 0.4751, "step": 3431 }, { "epoch": 0.3781818181818182, "grad_norm": 11.82904052734375, "learning_rate": 6.958554401153357e-06, "loss": 0.4042, "step": 3432 }, { "epoch": 0.3782920110192837, "grad_norm": 6.656799793243408, "learning_rate": 6.956945564371915e-06, "loss": 0.4459, "step": 3433 }, { "epoch": 0.37840220385674933, "grad_norm": 5.858724117279053, "learning_rate": 6.955336488292511e-06, "loss": 0.4509, "step": 3434 }, { "epoch": 0.3785123966942149, "grad_norm": 4.343075752258301, "learning_rate": 6.9537271731119034e-06, "loss": 0.3628, "step": 3435 }, { "epoch": 0.37862258953168043, "grad_norm": 5.194216251373291, "learning_rate": 6.952117619026886e-06, "loss": 0.4156, "step": 3436 }, { "epoch": 0.378732782369146, "grad_norm": 6.092804908752441, "learning_rate": 6.9505078262342715e-06, "loss": 0.4287, "step": 3437 }, { "epoch": 0.3788429752066116, "grad_norm": 7.648767471313477, "learning_rate": 6.948897794930914e-06, "loss": 0.4498, "step": 3438 }, { "epoch": 0.37895316804407714, "grad_norm": 6.104161739349365, "learning_rate": 6.947287525313685e-06, "loss": 0.4257, "step": 3439 }, { "epoch": 0.3790633608815427, "grad_norm": 5.8925580978393555, "learning_rate": 6.945677017579491e-06, "loss": 0.4253, "step": 3440 }, { "epoch": 0.37917355371900824, "grad_norm": 7.46693754196167, "learning_rate": 6.94406627192527e-06, "loss": 0.3619, "step": 3441 }, { "epoch": 0.37928374655647384, "grad_norm": 10.42000961303711, "learning_rate": 6.942455288547984e-06, "loss": 0.4174, "step": 3442 }, { "epoch": 0.3793939393939394, "grad_norm": 6.869128227233887, "learning_rate": 6.940844067644626e-06, "loss": 0.4107, "step": 3443 }, { "epoch": 0.37950413223140494, "grad_norm": 5.960266590118408, "learning_rate": 6.939232609412221e-06, "loss": 0.4108, "step": 3444 }, { "epoch": 0.37961432506887055, "grad_norm": 6.284941673278809, "learning_rate": 6.937620914047818e-06, "loss": 0.4197, "step": 3445 }, { "epoch": 0.3797245179063361, "grad_norm": 8.592612266540527, "learning_rate": 6.936008981748496e-06, "loss": 0.4845, "step": 3446 }, { "epoch": 0.37983471074380165, "grad_norm": 9.778707504272461, "learning_rate": 6.934396812711367e-06, "loss": 0.4354, "step": 3447 }, { "epoch": 0.3799449035812672, "grad_norm": 8.552468299865723, "learning_rate": 6.9327844071335684e-06, "loss": 0.5189, "step": 3448 }, { "epoch": 0.3800550964187328, "grad_norm": 5.014615058898926, "learning_rate": 6.931171765212267e-06, "loss": 0.4423, "step": 3449 }, { "epoch": 0.38016528925619836, "grad_norm": 10.290054321289062, "learning_rate": 6.929558887144657e-06, "loss": 0.4075, "step": 3450 }, { "epoch": 0.3802754820936639, "grad_norm": 9.373320579528809, "learning_rate": 6.927945773127967e-06, "loss": 0.5347, "step": 3451 }, { "epoch": 0.38038567493112946, "grad_norm": 4.781619548797607, "learning_rate": 6.92633242335945e-06, "loss": 0.3897, "step": 3452 }, { "epoch": 0.38049586776859506, "grad_norm": 7.904768466949463, "learning_rate": 6.924718838036385e-06, "loss": 0.4536, "step": 3453 }, { "epoch": 0.3806060606060606, "grad_norm": 5.603475570678711, "learning_rate": 6.923105017356087e-06, "loss": 0.4637, "step": 3454 }, { "epoch": 0.38071625344352616, "grad_norm": 7.653232574462891, "learning_rate": 6.921490961515897e-06, "loss": 0.4554, "step": 3455 }, { "epoch": 0.3808264462809917, "grad_norm": 10.0427827835083, "learning_rate": 6.91987667071318e-06, "loss": 0.4982, "step": 3456 }, { "epoch": 0.3809366391184573, "grad_norm": 10.740218162536621, "learning_rate": 6.918262145145336e-06, "loss": 0.482, "step": 3457 }, { "epoch": 0.38104683195592287, "grad_norm": 5.421781063079834, "learning_rate": 6.916647385009791e-06, "loss": 0.384, "step": 3458 }, { "epoch": 0.3811570247933884, "grad_norm": 6.068757057189941, "learning_rate": 6.915032390504003e-06, "loss": 0.4189, "step": 3459 }, { "epoch": 0.38126721763085397, "grad_norm": 7.270913600921631, "learning_rate": 6.913417161825449e-06, "loss": 0.3768, "step": 3460 }, { "epoch": 0.3813774104683196, "grad_norm": 5.31407356262207, "learning_rate": 6.911801699171648e-06, "loss": 0.4006, "step": 3461 }, { "epoch": 0.38148760330578513, "grad_norm": 4.115329742431641, "learning_rate": 6.9101860027401376e-06, "loss": 0.4165, "step": 3462 }, { "epoch": 0.3815977961432507, "grad_norm": 9.908076286315918, "learning_rate": 6.908570072728487e-06, "loss": 0.4513, "step": 3463 }, { "epoch": 0.38170798898071623, "grad_norm": 6.62583589553833, "learning_rate": 6.906953909334297e-06, "loss": 0.396, "step": 3464 }, { "epoch": 0.38181818181818183, "grad_norm": 9.345455169677734, "learning_rate": 6.905337512755191e-06, "loss": 0.4403, "step": 3465 }, { "epoch": 0.3819283746556474, "grad_norm": 4.885265827178955, "learning_rate": 6.903720883188827e-06, "loss": 0.429, "step": 3466 }, { "epoch": 0.38203856749311293, "grad_norm": 9.006327629089355, "learning_rate": 6.9021040208328885e-06, "loss": 0.4824, "step": 3467 }, { "epoch": 0.3821487603305785, "grad_norm": 9.241366386413574, "learning_rate": 6.9004869258850835e-06, "loss": 0.4928, "step": 3468 }, { "epoch": 0.3822589531680441, "grad_norm": 6.628201484680176, "learning_rate": 6.898869598543158e-06, "loss": 0.4649, "step": 3469 }, { "epoch": 0.38236914600550964, "grad_norm": 7.430251121520996, "learning_rate": 6.897252039004879e-06, "loss": 0.4449, "step": 3470 }, { "epoch": 0.3824793388429752, "grad_norm": 7.064650058746338, "learning_rate": 6.8956342474680415e-06, "loss": 0.4136, "step": 3471 }, { "epoch": 0.3825895316804408, "grad_norm": 9.308135032653809, "learning_rate": 6.894016224130475e-06, "loss": 0.4978, "step": 3472 }, { "epoch": 0.38269972451790635, "grad_norm": 10.293804168701172, "learning_rate": 6.892397969190031e-06, "loss": 0.3802, "step": 3473 }, { "epoch": 0.3828099173553719, "grad_norm": 5.387298107147217, "learning_rate": 6.890779482844592e-06, "loss": 0.3775, "step": 3474 }, { "epoch": 0.38292011019283745, "grad_norm": 8.136940956115723, "learning_rate": 6.889160765292071e-06, "loss": 0.4118, "step": 3475 }, { "epoch": 0.38303030303030305, "grad_norm": 6.427221775054932, "learning_rate": 6.887541816730406e-06, "loss": 0.4748, "step": 3476 }, { "epoch": 0.3831404958677686, "grad_norm": 7.789750576019287, "learning_rate": 6.8859226373575625e-06, "loss": 0.399, "step": 3477 }, { "epoch": 0.38325068870523415, "grad_norm": 4.99204158782959, "learning_rate": 6.884303227371536e-06, "loss": 0.3875, "step": 3478 }, { "epoch": 0.3833608815426997, "grad_norm": 6.423731327056885, "learning_rate": 6.882683586970352e-06, "loss": 0.4032, "step": 3479 }, { "epoch": 0.3834710743801653, "grad_norm": 5.279046535491943, "learning_rate": 6.8810637163520635e-06, "loss": 0.3844, "step": 3480 }, { "epoch": 0.38358126721763086, "grad_norm": 8.075996398925781, "learning_rate": 6.879443615714746e-06, "loss": 0.4386, "step": 3481 }, { "epoch": 0.3836914600550964, "grad_norm": 5.527756690979004, "learning_rate": 6.877823285256512e-06, "loss": 0.3602, "step": 3482 }, { "epoch": 0.38380165289256196, "grad_norm": 3.9500184059143066, "learning_rate": 6.876202725175495e-06, "loss": 0.361, "step": 3483 }, { "epoch": 0.38391184573002757, "grad_norm": 4.0957489013671875, "learning_rate": 6.8745819356698595e-06, "loss": 0.4082, "step": 3484 }, { "epoch": 0.3840220385674931, "grad_norm": 7.210476875305176, "learning_rate": 6.8729609169377995e-06, "loss": 0.5056, "step": 3485 }, { "epoch": 0.38413223140495867, "grad_norm": 8.166796684265137, "learning_rate": 6.871339669177535e-06, "loss": 0.4193, "step": 3486 }, { "epoch": 0.3842424242424242, "grad_norm": 10.03830337524414, "learning_rate": 6.869718192587313e-06, "loss": 0.5178, "step": 3487 }, { "epoch": 0.3843526170798898, "grad_norm": 4.627344131469727, "learning_rate": 6.868096487365411e-06, "loss": 0.4505, "step": 3488 }, { "epoch": 0.3844628099173554, "grad_norm": 4.49107551574707, "learning_rate": 6.866474553710132e-06, "loss": 0.4003, "step": 3489 }, { "epoch": 0.3845730027548209, "grad_norm": 5.724297523498535, "learning_rate": 6.864852391819812e-06, "loss": 0.3477, "step": 3490 }, { "epoch": 0.3846831955922865, "grad_norm": 6.456491947174072, "learning_rate": 6.8632300018928046e-06, "loss": 0.4606, "step": 3491 }, { "epoch": 0.3847933884297521, "grad_norm": 7.052715301513672, "learning_rate": 6.861607384127504e-06, "loss": 0.3989, "step": 3492 }, { "epoch": 0.38490358126721763, "grad_norm": 5.147886753082275, "learning_rate": 6.859984538722322e-06, "loss": 0.4748, "step": 3493 }, { "epoch": 0.3850137741046832, "grad_norm": 7.9086594581604, "learning_rate": 6.8583614658757056e-06, "loss": 0.4413, "step": 3494 }, { "epoch": 0.3851239669421488, "grad_norm": 7.838868618011475, "learning_rate": 6.8567381657861255e-06, "loss": 0.4008, "step": 3495 }, { "epoch": 0.38523415977961434, "grad_norm": 8.721397399902344, "learning_rate": 6.855114638652079e-06, "loss": 0.4771, "step": 3496 }, { "epoch": 0.3853443526170799, "grad_norm": 12.337827682495117, "learning_rate": 6.853490884672094e-06, "loss": 0.454, "step": 3497 }, { "epoch": 0.38545454545454544, "grad_norm": 7.4478840827941895, "learning_rate": 6.851866904044727e-06, "loss": 0.3743, "step": 3498 }, { "epoch": 0.38556473829201104, "grad_norm": 12.162842750549316, "learning_rate": 6.850242696968558e-06, "loss": 0.5879, "step": 3499 }, { "epoch": 0.3856749311294766, "grad_norm": 5.296576499938965, "learning_rate": 6.848618263642201e-06, "loss": 0.3823, "step": 3500 }, { "epoch": 0.38578512396694215, "grad_norm": 7.909220218658447, "learning_rate": 6.84699360426429e-06, "loss": 0.3704, "step": 3501 }, { "epoch": 0.3858953168044077, "grad_norm": 7.7088236808776855, "learning_rate": 6.845368719033493e-06, "loss": 0.4408, "step": 3502 }, { "epoch": 0.3860055096418733, "grad_norm": 5.983971118927002, "learning_rate": 6.843743608148502e-06, "loss": 0.3819, "step": 3503 }, { "epoch": 0.38611570247933885, "grad_norm": 5.924647331237793, "learning_rate": 6.842118271808038e-06, "loss": 0.45, "step": 3504 }, { "epoch": 0.3862258953168044, "grad_norm": 6.25740909576416, "learning_rate": 6.84049271021085e-06, "loss": 0.3646, "step": 3505 }, { "epoch": 0.38633608815426995, "grad_norm": 5.766270637512207, "learning_rate": 6.838866923555712e-06, "loss": 0.4103, "step": 3506 }, { "epoch": 0.38644628099173556, "grad_norm": 8.451875686645508, "learning_rate": 6.83724091204143e-06, "loss": 0.5397, "step": 3507 }, { "epoch": 0.3865564738292011, "grad_norm": 8.257237434387207, "learning_rate": 6.835614675866834e-06, "loss": 0.4081, "step": 3508 }, { "epoch": 0.38666666666666666, "grad_norm": 7.948094367980957, "learning_rate": 6.83398821523078e-06, "loss": 0.4358, "step": 3509 }, { "epoch": 0.3867768595041322, "grad_norm": 5.9864091873168945, "learning_rate": 6.832361530332158e-06, "loss": 0.3864, "step": 3510 }, { "epoch": 0.3868870523415978, "grad_norm": 6.935376167297363, "learning_rate": 6.830734621369878e-06, "loss": 0.3885, "step": 3511 }, { "epoch": 0.38699724517906336, "grad_norm": 8.323222160339355, "learning_rate": 6.829107488542881e-06, "loss": 0.4655, "step": 3512 }, { "epoch": 0.3871074380165289, "grad_norm": 6.112347602844238, "learning_rate": 6.827480132050137e-06, "loss": 0.4251, "step": 3513 }, { "epoch": 0.38721763085399447, "grad_norm": 5.319746494293213, "learning_rate": 6.825852552090639e-06, "loss": 0.4231, "step": 3514 }, { "epoch": 0.38732782369146007, "grad_norm": 11.083109855651855, "learning_rate": 6.824224748863411e-06, "loss": 0.4576, "step": 3515 }, { "epoch": 0.3874380165289256, "grad_norm": 5.660933494567871, "learning_rate": 6.822596722567504e-06, "loss": 0.3755, "step": 3516 }, { "epoch": 0.38754820936639117, "grad_norm": 9.598067283630371, "learning_rate": 6.820968473401992e-06, "loss": 0.4294, "step": 3517 }, { "epoch": 0.3876584022038567, "grad_norm": 12.287252426147461, "learning_rate": 6.819340001565984e-06, "loss": 0.5338, "step": 3518 }, { "epoch": 0.38776859504132233, "grad_norm": 6.616037845611572, "learning_rate": 6.817711307258608e-06, "loss": 0.4248, "step": 3519 }, { "epoch": 0.3878787878787879, "grad_norm": 5.660268783569336, "learning_rate": 6.816082390679023e-06, "loss": 0.3798, "step": 3520 }, { "epoch": 0.38798898071625343, "grad_norm": 16.88709259033203, "learning_rate": 6.814453252026417e-06, "loss": 0.4722, "step": 3521 }, { "epoch": 0.38809917355371903, "grad_norm": 7.495926380157471, "learning_rate": 6.812823891500004e-06, "loss": 0.395, "step": 3522 }, { "epoch": 0.3882093663911846, "grad_norm": 8.984149932861328, "learning_rate": 6.811194309299023e-06, "loss": 0.5225, "step": 3523 }, { "epoch": 0.38831955922865014, "grad_norm": 6.24255895614624, "learning_rate": 6.80956450562274e-06, "loss": 0.3885, "step": 3524 }, { "epoch": 0.3884297520661157, "grad_norm": 5.3379597663879395, "learning_rate": 6.807934480670451e-06, "loss": 0.5105, "step": 3525 }, { "epoch": 0.3885399449035813, "grad_norm": 6.046708583831787, "learning_rate": 6.8063042346414795e-06, "loss": 0.4401, "step": 3526 }, { "epoch": 0.38865013774104684, "grad_norm": 6.267719745635986, "learning_rate": 6.8046737677351726e-06, "loss": 0.4637, "step": 3527 }, { "epoch": 0.3887603305785124, "grad_norm": 5.851125717163086, "learning_rate": 6.803043080150905e-06, "loss": 0.4352, "step": 3528 }, { "epoch": 0.38887052341597794, "grad_norm": 4.543891906738281, "learning_rate": 6.801412172088081e-06, "loss": 0.3847, "step": 3529 }, { "epoch": 0.38898071625344355, "grad_norm": 8.075711250305176, "learning_rate": 6.799781043746129e-06, "loss": 0.3763, "step": 3530 }, { "epoch": 0.3890909090909091, "grad_norm": 7.912997245788574, "learning_rate": 6.7981496953245065e-06, "loss": 0.428, "step": 3531 }, { "epoch": 0.38920110192837465, "grad_norm": 7.018909454345703, "learning_rate": 6.7965181270226965e-06, "loss": 0.3968, "step": 3532 }, { "epoch": 0.3893112947658402, "grad_norm": 7.459453105926514, "learning_rate": 6.79488633904021e-06, "loss": 0.4546, "step": 3533 }, { "epoch": 0.3894214876033058, "grad_norm": 5.769484519958496, "learning_rate": 6.793254331576583e-06, "loss": 0.3698, "step": 3534 }, { "epoch": 0.38953168044077136, "grad_norm": 5.911960124969482, "learning_rate": 6.7916221048313815e-06, "loss": 0.4182, "step": 3535 }, { "epoch": 0.3896418732782369, "grad_norm": 6.263228416442871, "learning_rate": 6.7899896590041954e-06, "loss": 0.3739, "step": 3536 }, { "epoch": 0.38975206611570246, "grad_norm": 11.059060096740723, "learning_rate": 6.788356994294642e-06, "loss": 0.4189, "step": 3537 }, { "epoch": 0.38986225895316806, "grad_norm": 8.340376853942871, "learning_rate": 6.7867241109023656e-06, "loss": 0.3487, "step": 3538 }, { "epoch": 0.3899724517906336, "grad_norm": 10.862907409667969, "learning_rate": 6.7850910090270385e-06, "loss": 0.5064, "step": 3539 }, { "epoch": 0.39008264462809916, "grad_norm": 11.00351619720459, "learning_rate": 6.783457688868356e-06, "loss": 0.5565, "step": 3540 }, { "epoch": 0.3901928374655647, "grad_norm": 6.101572036743164, "learning_rate": 6.7818241506260486e-06, "loss": 0.4638, "step": 3541 }, { "epoch": 0.3903030303030303, "grad_norm": 5.2540459632873535, "learning_rate": 6.78019039449986e-06, "loss": 0.4955, "step": 3542 }, { "epoch": 0.39041322314049587, "grad_norm": 4.800851345062256, "learning_rate": 6.778556420689573e-06, "loss": 0.4166, "step": 3543 }, { "epoch": 0.3905234159779614, "grad_norm": 4.908843517303467, "learning_rate": 6.776922229394992e-06, "loss": 0.4068, "step": 3544 }, { "epoch": 0.390633608815427, "grad_norm": 4.958993434906006, "learning_rate": 6.775287820815946e-06, "loss": 0.4352, "step": 3545 }, { "epoch": 0.3907438016528926, "grad_norm": 5.4108076095581055, "learning_rate": 6.7736531951522955e-06, "loss": 0.4019, "step": 3546 }, { "epoch": 0.3908539944903581, "grad_norm": 9.655213356018066, "learning_rate": 6.772018352603922e-06, "loss": 0.4683, "step": 3547 }, { "epoch": 0.3909641873278237, "grad_norm": 10.1361665725708, "learning_rate": 6.770383293370734e-06, "loss": 0.4672, "step": 3548 }, { "epoch": 0.3910743801652893, "grad_norm": 4.456143856048584, "learning_rate": 6.768748017652676e-06, "loss": 0.3904, "step": 3549 }, { "epoch": 0.39118457300275483, "grad_norm": 8.810110092163086, "learning_rate": 6.7671125256497086e-06, "loss": 0.483, "step": 3550 }, { "epoch": 0.3912947658402204, "grad_norm": 6.227502346038818, "learning_rate": 6.765476817561819e-06, "loss": 0.4536, "step": 3551 }, { "epoch": 0.39140495867768593, "grad_norm": 7.060149192810059, "learning_rate": 6.763840893589025e-06, "loss": 0.453, "step": 3552 }, { "epoch": 0.39151515151515154, "grad_norm": 5.1962714195251465, "learning_rate": 6.762204753931373e-06, "loss": 0.4527, "step": 3553 }, { "epoch": 0.3916253443526171, "grad_norm": 7.3671441078186035, "learning_rate": 6.760568398788929e-06, "loss": 0.4007, "step": 3554 }, { "epoch": 0.39173553719008264, "grad_norm": 5.331968784332275, "learning_rate": 6.75893182836179e-06, "loss": 0.4689, "step": 3555 }, { "epoch": 0.3918457300275482, "grad_norm": 4.352341651916504, "learning_rate": 6.757295042850077e-06, "loss": 0.4117, "step": 3556 }, { "epoch": 0.3919559228650138, "grad_norm": 6.480555057525635, "learning_rate": 6.75565804245394e-06, "loss": 0.416, "step": 3557 }, { "epoch": 0.39206611570247935, "grad_norm": 6.7451653480529785, "learning_rate": 6.754020827373551e-06, "loss": 0.3993, "step": 3558 }, { "epoch": 0.3921763085399449, "grad_norm": 7.922951698303223, "learning_rate": 6.752383397809114e-06, "loss": 0.4779, "step": 3559 }, { "epoch": 0.39228650137741045, "grad_norm": 8.732173919677734, "learning_rate": 6.750745753960855e-06, "loss": 0.4077, "step": 3560 }, { "epoch": 0.39239669421487605, "grad_norm": 10.558531761169434, "learning_rate": 6.749107896029027e-06, "loss": 0.4473, "step": 3561 }, { "epoch": 0.3925068870523416, "grad_norm": 6.761999607086182, "learning_rate": 6.747469824213909e-06, "loss": 0.4593, "step": 3562 }, { "epoch": 0.39261707988980715, "grad_norm": 6.093954563140869, "learning_rate": 6.745831538715807e-06, "loss": 0.4687, "step": 3563 }, { "epoch": 0.3927272727272727, "grad_norm": 6.497241497039795, "learning_rate": 6.744193039735054e-06, "loss": 0.3646, "step": 3564 }, { "epoch": 0.3928374655647383, "grad_norm": 5.9658613204956055, "learning_rate": 6.742554327472006e-06, "loss": 0.4484, "step": 3565 }, { "epoch": 0.39294765840220386, "grad_norm": 5.079550266265869, "learning_rate": 6.740915402127048e-06, "loss": 0.4256, "step": 3566 }, { "epoch": 0.3930578512396694, "grad_norm": 6.680603504180908, "learning_rate": 6.739276263900591e-06, "loss": 0.352, "step": 3567 }, { "epoch": 0.39316804407713496, "grad_norm": 9.222851753234863, "learning_rate": 6.737636912993067e-06, "loss": 0.4939, "step": 3568 }, { "epoch": 0.39327823691460057, "grad_norm": 5.447881698608398, "learning_rate": 6.735997349604943e-06, "loss": 0.4103, "step": 3569 }, { "epoch": 0.3933884297520661, "grad_norm": 5.519157886505127, "learning_rate": 6.734357573936705e-06, "loss": 0.4654, "step": 3570 }, { "epoch": 0.39349862258953167, "grad_norm": 6.907510757446289, "learning_rate": 6.732717586188866e-06, "loss": 0.3764, "step": 3571 }, { "epoch": 0.39360881542699727, "grad_norm": 7.562971115112305, "learning_rate": 6.731077386561968e-06, "loss": 0.4075, "step": 3572 }, { "epoch": 0.3937190082644628, "grad_norm": 10.390762329101562, "learning_rate": 6.729436975256575e-06, "loss": 0.4435, "step": 3573 }, { "epoch": 0.3938292011019284, "grad_norm": 4.922039985656738, "learning_rate": 6.727796352473279e-06, "loss": 0.4382, "step": 3574 }, { "epoch": 0.3939393939393939, "grad_norm": 6.189228057861328, "learning_rate": 6.726155518412701e-06, "loss": 0.3901, "step": 3575 }, { "epoch": 0.39404958677685953, "grad_norm": 5.172933578491211, "learning_rate": 6.724514473275477e-06, "loss": 0.3882, "step": 3576 }, { "epoch": 0.3941597796143251, "grad_norm": 11.044160842895508, "learning_rate": 6.722873217262283e-06, "loss": 0.3986, "step": 3577 }, { "epoch": 0.39426997245179063, "grad_norm": 6.9071125984191895, "learning_rate": 6.721231750573813e-06, "loss": 0.4145, "step": 3578 }, { "epoch": 0.3943801652892562, "grad_norm": 5.4316935539245605, "learning_rate": 6.719590073410785e-06, "loss": 0.3772, "step": 3579 }, { "epoch": 0.3944903581267218, "grad_norm": 6.2734527587890625, "learning_rate": 6.717948185973946e-06, "loss": 0.4587, "step": 3580 }, { "epoch": 0.39460055096418734, "grad_norm": 8.695782661437988, "learning_rate": 6.71630608846407e-06, "loss": 0.444, "step": 3581 }, { "epoch": 0.3947107438016529, "grad_norm": 10.9119873046875, "learning_rate": 6.714663781081956e-06, "loss": 0.3928, "step": 3582 }, { "epoch": 0.39482093663911844, "grad_norm": 8.182262420654297, "learning_rate": 6.713021264028423e-06, "loss": 0.4551, "step": 3583 }, { "epoch": 0.39493112947658404, "grad_norm": 8.505237579345703, "learning_rate": 6.711378537504324e-06, "loss": 0.4165, "step": 3584 }, { "epoch": 0.3950413223140496, "grad_norm": 4.6055521965026855, "learning_rate": 6.709735601710533e-06, "loss": 0.3815, "step": 3585 }, { "epoch": 0.39515151515151514, "grad_norm": 6.0570220947265625, "learning_rate": 6.70809245684795e-06, "loss": 0.4094, "step": 3586 }, { "epoch": 0.3952617079889807, "grad_norm": 8.43146800994873, "learning_rate": 6.7064491031175e-06, "loss": 0.5455, "step": 3587 }, { "epoch": 0.3953719008264463, "grad_norm": 13.617354393005371, "learning_rate": 6.704805540720139e-06, "loss": 0.4753, "step": 3588 }, { "epoch": 0.39548209366391185, "grad_norm": 6.508190155029297, "learning_rate": 6.703161769856837e-06, "loss": 0.4257, "step": 3589 }, { "epoch": 0.3955922865013774, "grad_norm": 6.021367073059082, "learning_rate": 6.701517790728602e-06, "loss": 0.4027, "step": 3590 }, { "epoch": 0.39570247933884295, "grad_norm": 4.801262855529785, "learning_rate": 6.699873603536459e-06, "loss": 0.4364, "step": 3591 }, { "epoch": 0.39581267217630856, "grad_norm": 5.468711853027344, "learning_rate": 6.698229208481465e-06, "loss": 0.4166, "step": 3592 }, { "epoch": 0.3959228650137741, "grad_norm": 11.384525299072266, "learning_rate": 6.696584605764694e-06, "loss": 0.3336, "step": 3593 }, { "epoch": 0.39603305785123966, "grad_norm": 8.580153465270996, "learning_rate": 6.694939795587254e-06, "loss": 0.5833, "step": 3594 }, { "epoch": 0.39614325068870526, "grad_norm": 6.473851203918457, "learning_rate": 6.693294778150276e-06, "loss": 0.4507, "step": 3595 }, { "epoch": 0.3962534435261708, "grad_norm": 6.248347759246826, "learning_rate": 6.691649553654909e-06, "loss": 0.3913, "step": 3596 }, { "epoch": 0.39636363636363636, "grad_norm": 6.736072063446045, "learning_rate": 6.690004122302337e-06, "loss": 0.47, "step": 3597 }, { "epoch": 0.3964738292011019, "grad_norm": 7.859832286834717, "learning_rate": 6.688358484293765e-06, "loss": 0.3682, "step": 3598 }, { "epoch": 0.3965840220385675, "grad_norm": 8.565985679626465, "learning_rate": 6.686712639830426e-06, "loss": 0.4206, "step": 3599 }, { "epoch": 0.39669421487603307, "grad_norm": 5.582574367523193, "learning_rate": 6.685066589113573e-06, "loss": 0.3417, "step": 3600 }, { "epoch": 0.3968044077134986, "grad_norm": 6.459357738494873, "learning_rate": 6.683420332344489e-06, "loss": 0.3929, "step": 3601 }, { "epoch": 0.39691460055096417, "grad_norm": 9.563920974731445, "learning_rate": 6.68177386972448e-06, "loss": 0.3584, "step": 3602 }, { "epoch": 0.3970247933884298, "grad_norm": 6.491499423980713, "learning_rate": 6.6801272014548775e-06, "loss": 0.4232, "step": 3603 }, { "epoch": 0.3971349862258953, "grad_norm": 8.638758659362793, "learning_rate": 6.678480327737039e-06, "loss": 0.4587, "step": 3604 }, { "epoch": 0.3972451790633609, "grad_norm": 3.47772216796875, "learning_rate": 6.6768332487723455e-06, "loss": 0.3784, "step": 3605 }, { "epoch": 0.3973553719008264, "grad_norm": 6.206127166748047, "learning_rate": 6.6751859647622055e-06, "loss": 0.3892, "step": 3606 }, { "epoch": 0.39746556473829203, "grad_norm": 9.063240051269531, "learning_rate": 6.6735384759080494e-06, "loss": 0.4438, "step": 3607 }, { "epoch": 0.3975757575757576, "grad_norm": 6.677006721496582, "learning_rate": 6.6718907824113355e-06, "loss": 0.4103, "step": 3608 }, { "epoch": 0.39768595041322313, "grad_norm": 5.7193450927734375, "learning_rate": 6.670242884473546e-06, "loss": 0.4243, "step": 3609 }, { "epoch": 0.3977961432506887, "grad_norm": 8.467764854431152, "learning_rate": 6.668594782296187e-06, "loss": 0.3908, "step": 3610 }, { "epoch": 0.3979063360881543, "grad_norm": 5.584086894989014, "learning_rate": 6.666946476080791e-06, "loss": 0.4517, "step": 3611 }, { "epoch": 0.39801652892561984, "grad_norm": 5.541923522949219, "learning_rate": 6.665297966028918e-06, "loss": 0.4184, "step": 3612 }, { "epoch": 0.3981267217630854, "grad_norm": 13.841939926147461, "learning_rate": 6.663649252342146e-06, "loss": 0.5451, "step": 3613 }, { "epoch": 0.39823691460055094, "grad_norm": 7.07025671005249, "learning_rate": 6.662000335222083e-06, "loss": 0.4008, "step": 3614 }, { "epoch": 0.39834710743801655, "grad_norm": 13.290519714355469, "learning_rate": 6.660351214870362e-06, "loss": 0.4857, "step": 3615 }, { "epoch": 0.3984573002754821, "grad_norm": 5.437166213989258, "learning_rate": 6.658701891488639e-06, "loss": 0.4092, "step": 3616 }, { "epoch": 0.39856749311294765, "grad_norm": 6.242116451263428, "learning_rate": 6.657052365278596e-06, "loss": 0.4404, "step": 3617 }, { "epoch": 0.3986776859504132, "grad_norm": 5.129985332489014, "learning_rate": 6.655402636441937e-06, "loss": 0.4212, "step": 3618 }, { "epoch": 0.3987878787878788, "grad_norm": 5.419785976409912, "learning_rate": 6.653752705180396e-06, "loss": 0.3966, "step": 3619 }, { "epoch": 0.39889807162534435, "grad_norm": 5.731557846069336, "learning_rate": 6.652102571695729e-06, "loss": 0.4348, "step": 3620 }, { "epoch": 0.3990082644628099, "grad_norm": 5.96975564956665, "learning_rate": 6.650452236189715e-06, "loss": 0.4983, "step": 3621 }, { "epoch": 0.3991184573002755, "grad_norm": 6.626498222351074, "learning_rate": 6.648801698864159e-06, "loss": 0.4551, "step": 3622 }, { "epoch": 0.39922865013774106, "grad_norm": 7.082576274871826, "learning_rate": 6.6471509599208935e-06, "loss": 0.4722, "step": 3623 }, { "epoch": 0.3993388429752066, "grad_norm": 10.073821067810059, "learning_rate": 6.645500019561768e-06, "loss": 0.4255, "step": 3624 }, { "epoch": 0.39944903581267216, "grad_norm": 7.216056823730469, "learning_rate": 6.643848877988668e-06, "loss": 0.4229, "step": 3625 }, { "epoch": 0.39955922865013777, "grad_norm": 6.034587860107422, "learning_rate": 6.6421975354034915e-06, "loss": 0.4199, "step": 3626 }, { "epoch": 0.3996694214876033, "grad_norm": 6.705302715301514, "learning_rate": 6.6405459920081715e-06, "loss": 0.43, "step": 3627 }, { "epoch": 0.39977961432506887, "grad_norm": 9.449960708618164, "learning_rate": 6.638894248004659e-06, "loss": 0.4597, "step": 3628 }, { "epoch": 0.3998898071625344, "grad_norm": 5.67280912399292, "learning_rate": 6.637242303594931e-06, "loss": 0.3125, "step": 3629 }, { "epoch": 0.4, "grad_norm": 9.817313194274902, "learning_rate": 6.63559015898099e-06, "loss": 0.502, "step": 3630 }, { "epoch": 0.4001101928374656, "grad_norm": 8.02699089050293, "learning_rate": 6.633937814364864e-06, "loss": 0.4656, "step": 3631 }, { "epoch": 0.4002203856749311, "grad_norm": 4.558982849121094, "learning_rate": 6.6322852699486e-06, "loss": 0.4041, "step": 3632 }, { "epoch": 0.4002203856749311, "eval_loss": 0.4264827072620392, "eval_runtime": 41.9502, "eval_samples_per_second": 17.497, "eval_steps_per_second": 2.193, "step": 3632 }, { "epoch": 0.4003305785123967, "grad_norm": 4.743308067321777, "learning_rate": 6.630632525934277e-06, "loss": 0.4498, "step": 3633 }, { "epoch": 0.4004407713498623, "grad_norm": 5.383522033691406, "learning_rate": 6.628979582523995e-06, "loss": 0.4034, "step": 3634 }, { "epoch": 0.40055096418732783, "grad_norm": 6.133452415466309, "learning_rate": 6.627326439919875e-06, "loss": 0.4272, "step": 3635 }, { "epoch": 0.4006611570247934, "grad_norm": 8.080024719238281, "learning_rate": 6.62567309832407e-06, "loss": 0.4828, "step": 3636 }, { "epoch": 0.40077134986225893, "grad_norm": 9.888001441955566, "learning_rate": 6.624019557938749e-06, "loss": 0.4499, "step": 3637 }, { "epoch": 0.40088154269972454, "grad_norm": 5.559149742126465, "learning_rate": 6.62236581896611e-06, "loss": 0.3778, "step": 3638 }, { "epoch": 0.4009917355371901, "grad_norm": 4.956784248352051, "learning_rate": 6.620711881608375e-06, "loss": 0.4253, "step": 3639 }, { "epoch": 0.40110192837465564, "grad_norm": 8.00571346282959, "learning_rate": 6.6190577460677894e-06, "loss": 0.4107, "step": 3640 }, { "epoch": 0.4012121212121212, "grad_norm": 10.162068367004395, "learning_rate": 6.617403412546625e-06, "loss": 0.514, "step": 3641 }, { "epoch": 0.4013223140495868, "grad_norm": 3.738342761993408, "learning_rate": 6.615748881247172e-06, "loss": 0.3903, "step": 3642 }, { "epoch": 0.40143250688705234, "grad_norm": 8.659130096435547, "learning_rate": 6.6140941523717525e-06, "loss": 0.4029, "step": 3643 }, { "epoch": 0.4015426997245179, "grad_norm": 5.845111846923828, "learning_rate": 6.6124392261227065e-06, "loss": 0.4482, "step": 3644 }, { "epoch": 0.40165289256198344, "grad_norm": 6.543630123138428, "learning_rate": 6.6107841027024025e-06, "loss": 0.4043, "step": 3645 }, { "epoch": 0.40176308539944905, "grad_norm": 7.756674766540527, "learning_rate": 6.60912878231323e-06, "loss": 0.4155, "step": 3646 }, { "epoch": 0.4018732782369146, "grad_norm": 5.7334980964660645, "learning_rate": 6.607473265157604e-06, "loss": 0.4492, "step": 3647 }, { "epoch": 0.40198347107438015, "grad_norm": 4.604136943817139, "learning_rate": 6.605817551437963e-06, "loss": 0.4511, "step": 3648 }, { "epoch": 0.40209366391184576, "grad_norm": 8.115760803222656, "learning_rate": 6.604161641356772e-06, "loss": 0.4525, "step": 3649 }, { "epoch": 0.4022038567493113, "grad_norm": 5.188201904296875, "learning_rate": 6.6025055351165155e-06, "loss": 0.4432, "step": 3650 }, { "epoch": 0.40231404958677686, "grad_norm": 9.08431339263916, "learning_rate": 6.600849232919707e-06, "loss": 0.4567, "step": 3651 }, { "epoch": 0.4024242424242424, "grad_norm": 4.568078994750977, "learning_rate": 6.599192734968878e-06, "loss": 0.4409, "step": 3652 }, { "epoch": 0.402534435261708, "grad_norm": 6.599457740783691, "learning_rate": 6.597536041466589e-06, "loss": 0.4305, "step": 3653 }, { "epoch": 0.40264462809917356, "grad_norm": 4.810308933258057, "learning_rate": 6.595879152615423e-06, "loss": 0.3766, "step": 3654 }, { "epoch": 0.4027548209366391, "grad_norm": 4.524010181427002, "learning_rate": 6.594222068617988e-06, "loss": 0.4594, "step": 3655 }, { "epoch": 0.40286501377410466, "grad_norm": 5.130374431610107, "learning_rate": 6.592564789676912e-06, "loss": 0.4735, "step": 3656 }, { "epoch": 0.40297520661157027, "grad_norm": 4.945376873016357, "learning_rate": 6.590907315994849e-06, "loss": 0.457, "step": 3657 }, { "epoch": 0.4030853994490358, "grad_norm": 7.4924726486206055, "learning_rate": 6.589249647774479e-06, "loss": 0.4096, "step": 3658 }, { "epoch": 0.40319559228650137, "grad_norm": 8.88684368133545, "learning_rate": 6.587591785218504e-06, "loss": 0.4629, "step": 3659 }, { "epoch": 0.4033057851239669, "grad_norm": 4.510934829711914, "learning_rate": 6.5859337285296474e-06, "loss": 0.4635, "step": 3660 }, { "epoch": 0.4034159779614325, "grad_norm": 5.477586269378662, "learning_rate": 6.584275477910662e-06, "loss": 0.4797, "step": 3661 }, { "epoch": 0.4035261707988981, "grad_norm": 10.872736930847168, "learning_rate": 6.582617033564319e-06, "loss": 0.4547, "step": 3662 }, { "epoch": 0.4036363636363636, "grad_norm": 6.2118048667907715, "learning_rate": 6.580958395693414e-06, "loss": 0.4346, "step": 3663 }, { "epoch": 0.4037465564738292, "grad_norm": 5.804343223571777, "learning_rate": 6.5792995645007705e-06, "loss": 0.4222, "step": 3664 }, { "epoch": 0.4038567493112948, "grad_norm": 5.338511943817139, "learning_rate": 6.577640540189229e-06, "loss": 0.3204, "step": 3665 }, { "epoch": 0.40396694214876033, "grad_norm": 7.646331787109375, "learning_rate": 6.575981322961662e-06, "loss": 0.4007, "step": 3666 }, { "epoch": 0.4040771349862259, "grad_norm": 5.6988630294799805, "learning_rate": 6.574321913020956e-06, "loss": 0.378, "step": 3667 }, { "epoch": 0.40418732782369143, "grad_norm": 10.288561820983887, "learning_rate": 6.572662310570027e-06, "loss": 0.4368, "step": 3668 }, { "epoch": 0.40429752066115704, "grad_norm": 4.442179203033447, "learning_rate": 6.571002515811818e-06, "loss": 0.4269, "step": 3669 }, { "epoch": 0.4044077134986226, "grad_norm": 14.73851490020752, "learning_rate": 6.569342528949284e-06, "loss": 0.4558, "step": 3670 }, { "epoch": 0.40451790633608814, "grad_norm": 10.834920883178711, "learning_rate": 6.567682350185416e-06, "loss": 0.4896, "step": 3671 }, { "epoch": 0.40462809917355375, "grad_norm": 4.4984283447265625, "learning_rate": 6.566021979723219e-06, "loss": 0.4421, "step": 3672 }, { "epoch": 0.4047382920110193, "grad_norm": 6.032543182373047, "learning_rate": 6.564361417765727e-06, "loss": 0.4236, "step": 3673 }, { "epoch": 0.40484848484848485, "grad_norm": 5.807493686676025, "learning_rate": 6.562700664515998e-06, "loss": 0.4265, "step": 3674 }, { "epoch": 0.4049586776859504, "grad_norm": 4.196972846984863, "learning_rate": 6.561039720177107e-06, "loss": 0.4221, "step": 3675 }, { "epoch": 0.405068870523416, "grad_norm": 5.8588032722473145, "learning_rate": 6.5593785849521595e-06, "loss": 0.3816, "step": 3676 }, { "epoch": 0.40517906336088155, "grad_norm": 13.153230667114258, "learning_rate": 6.55771725904428e-06, "loss": 0.5647, "step": 3677 }, { "epoch": 0.4052892561983471, "grad_norm": 7.3144989013671875, "learning_rate": 6.556055742656619e-06, "loss": 0.4653, "step": 3678 }, { "epoch": 0.40539944903581265, "grad_norm": 6.19295597076416, "learning_rate": 6.554394035992348e-06, "loss": 0.4183, "step": 3679 }, { "epoch": 0.40550964187327826, "grad_norm": 8.30089282989502, "learning_rate": 6.552732139254662e-06, "loss": 0.4574, "step": 3680 }, { "epoch": 0.4056198347107438, "grad_norm": 5.111574172973633, "learning_rate": 6.55107005264678e-06, "loss": 0.318, "step": 3681 }, { "epoch": 0.40573002754820936, "grad_norm": 7.422606945037842, "learning_rate": 6.549407776371946e-06, "loss": 0.4245, "step": 3682 }, { "epoch": 0.4058402203856749, "grad_norm": 15.394242286682129, "learning_rate": 6.547745310633425e-06, "loss": 0.5091, "step": 3683 }, { "epoch": 0.4059504132231405, "grad_norm": 9.286622047424316, "learning_rate": 6.546082655634505e-06, "loss": 0.4621, "step": 3684 }, { "epoch": 0.40606060606060607, "grad_norm": 5.918662071228027, "learning_rate": 6.544419811578498e-06, "loss": 0.4551, "step": 3685 }, { "epoch": 0.4061707988980716, "grad_norm": 4.8866496086120605, "learning_rate": 6.5427567786687376e-06, "loss": 0.4349, "step": 3686 }, { "epoch": 0.40628099173553717, "grad_norm": 7.229520320892334, "learning_rate": 6.541093557108583e-06, "loss": 0.4098, "step": 3687 }, { "epoch": 0.4063911845730028, "grad_norm": 8.540162086486816, "learning_rate": 6.539430147101414e-06, "loss": 0.3744, "step": 3688 }, { "epoch": 0.4065013774104683, "grad_norm": 6.524184226989746, "learning_rate": 6.537766548850637e-06, "loss": 0.4935, "step": 3689 }, { "epoch": 0.4066115702479339, "grad_norm": 11.073874473571777, "learning_rate": 6.5361027625596775e-06, "loss": 0.4058, "step": 3690 }, { "epoch": 0.4067217630853994, "grad_norm": 5.637555122375488, "learning_rate": 6.534438788431984e-06, "loss": 0.4091, "step": 3691 }, { "epoch": 0.40683195592286503, "grad_norm": 7.77812385559082, "learning_rate": 6.532774626671033e-06, "loss": 0.4041, "step": 3692 }, { "epoch": 0.4069421487603306, "grad_norm": 4.523971080780029, "learning_rate": 6.531110277480317e-06, "loss": 0.3751, "step": 3693 }, { "epoch": 0.40705234159779613, "grad_norm": 5.27182674407959, "learning_rate": 6.529445741063356e-06, "loss": 0.326, "step": 3694 }, { "epoch": 0.4071625344352617, "grad_norm": 5.846972465515137, "learning_rate": 6.5277810176236946e-06, "loss": 0.3946, "step": 3695 }, { "epoch": 0.4072727272727273, "grad_norm": 7.611056804656982, "learning_rate": 6.526116107364893e-06, "loss": 0.4223, "step": 3696 }, { "epoch": 0.40738292011019284, "grad_norm": 4.998781204223633, "learning_rate": 6.524451010490542e-06, "loss": 0.4046, "step": 3697 }, { "epoch": 0.4074931129476584, "grad_norm": 8.281871795654297, "learning_rate": 6.52278572720425e-06, "loss": 0.4542, "step": 3698 }, { "epoch": 0.407603305785124, "grad_norm": 11.4763822555542, "learning_rate": 6.52112025770965e-06, "loss": 0.4554, "step": 3699 }, { "epoch": 0.40771349862258954, "grad_norm": 4.127493858337402, "learning_rate": 6.519454602210402e-06, "loss": 0.4009, "step": 3700 }, { "epoch": 0.4078236914600551, "grad_norm": 7.102976322174072, "learning_rate": 6.517788760910178e-06, "loss": 0.4302, "step": 3701 }, { "epoch": 0.40793388429752064, "grad_norm": 4.179760456085205, "learning_rate": 6.516122734012684e-06, "loss": 0.3512, "step": 3702 }, { "epoch": 0.40804407713498625, "grad_norm": 5.761562347412109, "learning_rate": 6.514456521721642e-06, "loss": 0.4264, "step": 3703 }, { "epoch": 0.4081542699724518, "grad_norm": 6.149220943450928, "learning_rate": 6.5127901242407995e-06, "loss": 0.3949, "step": 3704 }, { "epoch": 0.40826446280991735, "grad_norm": 8.804180145263672, "learning_rate": 6.511123541773926e-06, "loss": 0.4283, "step": 3705 }, { "epoch": 0.4083746556473829, "grad_norm": 6.940871715545654, "learning_rate": 6.509456774524812e-06, "loss": 0.426, "step": 3706 }, { "epoch": 0.4084848484848485, "grad_norm": 10.35230827331543, "learning_rate": 6.5077898226972745e-06, "loss": 0.4789, "step": 3707 }, { "epoch": 0.40859504132231406, "grad_norm": 6.280311107635498, "learning_rate": 6.506122686495149e-06, "loss": 0.4017, "step": 3708 }, { "epoch": 0.4087052341597796, "grad_norm": 12.406330108642578, "learning_rate": 6.504455366122296e-06, "loss": 0.622, "step": 3709 }, { "epoch": 0.40881542699724516, "grad_norm": 8.450384140014648, "learning_rate": 6.5027878617825955e-06, "loss": 0.4474, "step": 3710 }, { "epoch": 0.40892561983471076, "grad_norm": 11.929401397705078, "learning_rate": 6.501120173679955e-06, "loss": 0.5489, "step": 3711 }, { "epoch": 0.4090358126721763, "grad_norm": 10.01643180847168, "learning_rate": 6.499452302018302e-06, "loss": 0.4799, "step": 3712 }, { "epoch": 0.40914600550964186, "grad_norm": 7.758121490478516, "learning_rate": 6.497784247001583e-06, "loss": 0.4074, "step": 3713 }, { "epoch": 0.4092561983471074, "grad_norm": 6.807549476623535, "learning_rate": 6.496116008833773e-06, "loss": 0.4317, "step": 3714 }, { "epoch": 0.409366391184573, "grad_norm": 5.175261974334717, "learning_rate": 6.494447587718864e-06, "loss": 0.403, "step": 3715 }, { "epoch": 0.40947658402203857, "grad_norm": 6.163679122924805, "learning_rate": 6.492778983860873e-06, "loss": 0.4818, "step": 3716 }, { "epoch": 0.4095867768595041, "grad_norm": 11.304265022277832, "learning_rate": 6.491110197463842e-06, "loss": 0.4254, "step": 3717 }, { "epoch": 0.40969696969696967, "grad_norm": 21.56818389892578, "learning_rate": 6.48944122873183e-06, "loss": 0.4987, "step": 3718 }, { "epoch": 0.4098071625344353, "grad_norm": 5.828494548797607, "learning_rate": 6.487772077868921e-06, "loss": 0.4672, "step": 3719 }, { "epoch": 0.4099173553719008, "grad_norm": 8.28695297241211, "learning_rate": 6.486102745079223e-06, "loss": 0.4679, "step": 3720 }, { "epoch": 0.4100275482093664, "grad_norm": 6.2719268798828125, "learning_rate": 6.484433230566861e-06, "loss": 0.4214, "step": 3721 }, { "epoch": 0.410137741046832, "grad_norm": 16.524568557739258, "learning_rate": 6.4827635345359864e-06, "loss": 0.5679, "step": 3722 }, { "epoch": 0.41024793388429753, "grad_norm": 9.619969367980957, "learning_rate": 6.4810936571907745e-06, "loss": 0.4052, "step": 3723 }, { "epoch": 0.4103581267217631, "grad_norm": 19.248586654663086, "learning_rate": 6.479423598735417e-06, "loss": 0.3896, "step": 3724 }, { "epoch": 0.41046831955922863, "grad_norm": 10.976968765258789, "learning_rate": 6.4777533593741336e-06, "loss": 0.445, "step": 3725 }, { "epoch": 0.41057851239669424, "grad_norm": 7.226747512817383, "learning_rate": 6.4760829393111615e-06, "loss": 0.4403, "step": 3726 }, { "epoch": 0.4106887052341598, "grad_norm": 5.23472785949707, "learning_rate": 6.474412338750762e-06, "loss": 0.4736, "step": 3727 }, { "epoch": 0.41079889807162534, "grad_norm": 8.113616943359375, "learning_rate": 6.472741557897219e-06, "loss": 0.5052, "step": 3728 }, { "epoch": 0.4109090909090909, "grad_norm": 7.333268165588379, "learning_rate": 6.4710705969548385e-06, "loss": 0.3841, "step": 3729 }, { "epoch": 0.4110192837465565, "grad_norm": 6.259932041168213, "learning_rate": 6.469399456127947e-06, "loss": 0.4402, "step": 3730 }, { "epoch": 0.41112947658402205, "grad_norm": 5.707891941070557, "learning_rate": 6.467728135620892e-06, "loss": 0.4271, "step": 3731 }, { "epoch": 0.4112396694214876, "grad_norm": 5.641848087310791, "learning_rate": 6.46605663563805e-06, "loss": 0.4079, "step": 3732 }, { "epoch": 0.41134986225895315, "grad_norm": 5.5837507247924805, "learning_rate": 6.4643849563838105e-06, "loss": 0.4462, "step": 3733 }, { "epoch": 0.41146005509641875, "grad_norm": 3.8371849060058594, "learning_rate": 6.462713098062587e-06, "loss": 0.3939, "step": 3734 }, { "epoch": 0.4115702479338843, "grad_norm": 18.237119674682617, "learning_rate": 6.461041060878821e-06, "loss": 0.4489, "step": 3735 }, { "epoch": 0.41168044077134985, "grad_norm": 9.150449752807617, "learning_rate": 6.4593688450369695e-06, "loss": 0.5537, "step": 3736 }, { "epoch": 0.4117906336088154, "grad_norm": 5.526957035064697, "learning_rate": 6.457696450741512e-06, "loss": 0.4463, "step": 3737 }, { "epoch": 0.411900826446281, "grad_norm": 7.520010948181152, "learning_rate": 6.456023878196953e-06, "loss": 0.4618, "step": 3738 }, { "epoch": 0.41201101928374656, "grad_norm": 4.8241424560546875, "learning_rate": 6.454351127607817e-06, "loss": 0.452, "step": 3739 }, { "epoch": 0.4121212121212121, "grad_norm": 7.352964878082275, "learning_rate": 6.452678199178649e-06, "loss": 0.3904, "step": 3740 }, { "epoch": 0.41223140495867766, "grad_norm": 5.8515753746032715, "learning_rate": 6.451005093114018e-06, "loss": 0.4451, "step": 3741 }, { "epoch": 0.41234159779614327, "grad_norm": 8.789145469665527, "learning_rate": 6.4493318096185135e-06, "loss": 0.3964, "step": 3742 }, { "epoch": 0.4124517906336088, "grad_norm": 10.589065551757812, "learning_rate": 6.4476583488967455e-06, "loss": 0.426, "step": 3743 }, { "epoch": 0.41256198347107437, "grad_norm": 8.067483901977539, "learning_rate": 6.445984711153348e-06, "loss": 0.4201, "step": 3744 }, { "epoch": 0.4126721763085399, "grad_norm": 8.293087005615234, "learning_rate": 6.444310896592978e-06, "loss": 0.4547, "step": 3745 }, { "epoch": 0.4127823691460055, "grad_norm": 8.98918342590332, "learning_rate": 6.442636905420307e-06, "loss": 0.4654, "step": 3746 }, { "epoch": 0.4128925619834711, "grad_norm": 6.305483341217041, "learning_rate": 6.440962737840038e-06, "loss": 0.501, "step": 3747 }, { "epoch": 0.4130027548209366, "grad_norm": 6.1845502853393555, "learning_rate": 6.439288394056886e-06, "loss": 0.4221, "step": 3748 }, { "epoch": 0.41311294765840223, "grad_norm": 9.436981201171875, "learning_rate": 6.437613874275596e-06, "loss": 0.4046, "step": 3749 }, { "epoch": 0.4132231404958678, "grad_norm": 10.030034065246582, "learning_rate": 6.435939178700926e-06, "loss": 0.4202, "step": 3750 }, { "epoch": 0.41333333333333333, "grad_norm": 6.672735214233398, "learning_rate": 6.434264307537664e-06, "loss": 0.4676, "step": 3751 }, { "epoch": 0.4134435261707989, "grad_norm": 12.742806434631348, "learning_rate": 6.432589260990614e-06, "loss": 0.4334, "step": 3752 }, { "epoch": 0.4135537190082645, "grad_norm": 6.345670223236084, "learning_rate": 6.430914039264604e-06, "loss": 0.4058, "step": 3753 }, { "epoch": 0.41366391184573004, "grad_norm": 5.737221717834473, "learning_rate": 6.429238642564482e-06, "loss": 0.3692, "step": 3754 }, { "epoch": 0.4137741046831956, "grad_norm": 8.014758110046387, "learning_rate": 6.4275630710951155e-06, "loss": 0.5336, "step": 3755 }, { "epoch": 0.41388429752066114, "grad_norm": 5.608326435089111, "learning_rate": 6.4258873250614e-06, "loss": 0.3423, "step": 3756 }, { "epoch": 0.41399449035812674, "grad_norm": 5.42350435256958, "learning_rate": 6.4242114046682435e-06, "loss": 0.4464, "step": 3757 }, { "epoch": 0.4141046831955923, "grad_norm": 4.1355156898498535, "learning_rate": 6.422535310120583e-06, "loss": 0.4116, "step": 3758 }, { "epoch": 0.41421487603305784, "grad_norm": 5.403732776641846, "learning_rate": 6.420859041623371e-06, "loss": 0.4441, "step": 3759 }, { "epoch": 0.4143250688705234, "grad_norm": 6.440338134765625, "learning_rate": 6.419182599381586e-06, "loss": 0.4527, "step": 3760 }, { "epoch": 0.414435261707989, "grad_norm": 12.544288635253906, "learning_rate": 6.417505983600226e-06, "loss": 0.5211, "step": 3761 }, { "epoch": 0.41454545454545455, "grad_norm": 9.335394859313965, "learning_rate": 6.4158291944843075e-06, "loss": 0.4746, "step": 3762 }, { "epoch": 0.4146556473829201, "grad_norm": 5.873477935791016, "learning_rate": 6.4141522322388725e-06, "loss": 0.4255, "step": 3763 }, { "epoch": 0.41476584022038565, "grad_norm": 9.662142753601074, "learning_rate": 6.412475097068982e-06, "loss": 0.5154, "step": 3764 }, { "epoch": 0.41487603305785126, "grad_norm": 4.658679485321045, "learning_rate": 6.410797789179717e-06, "loss": 0.4081, "step": 3765 }, { "epoch": 0.4149862258953168, "grad_norm": 5.880003929138184, "learning_rate": 6.409120308776182e-06, "loss": 0.3921, "step": 3766 }, { "epoch": 0.41509641873278236, "grad_norm": 5.686578273773193, "learning_rate": 6.4074426560635025e-06, "loss": 0.4204, "step": 3767 }, { "epoch": 0.4152066115702479, "grad_norm": 5.895596027374268, "learning_rate": 6.405764831246823e-06, "loss": 0.4568, "step": 3768 }, { "epoch": 0.4153168044077135, "grad_norm": 8.551445960998535, "learning_rate": 6.404086834531309e-06, "loss": 0.4531, "step": 3769 }, { "epoch": 0.41542699724517906, "grad_norm": 5.391382694244385, "learning_rate": 6.402408666122152e-06, "loss": 0.3608, "step": 3770 }, { "epoch": 0.4155371900826446, "grad_norm": 15.061893463134766, "learning_rate": 6.4007303262245566e-06, "loss": 0.407, "step": 3771 }, { "epoch": 0.4156473829201102, "grad_norm": 8.456413269042969, "learning_rate": 6.399051815043754e-06, "loss": 0.4365, "step": 3772 }, { "epoch": 0.41575757575757577, "grad_norm": 4.756795406341553, "learning_rate": 6.397373132784995e-06, "loss": 0.4001, "step": 3773 }, { "epoch": 0.4158677685950413, "grad_norm": 7.5593342781066895, "learning_rate": 6.395694279653553e-06, "loss": 0.4053, "step": 3774 }, { "epoch": 0.41597796143250687, "grad_norm": 7.961740493774414, "learning_rate": 6.394015255854717e-06, "loss": 0.4265, "step": 3775 }, { "epoch": 0.4160881542699725, "grad_norm": 6.529910087585449, "learning_rate": 6.392336061593802e-06, "loss": 0.4168, "step": 3776 }, { "epoch": 0.416198347107438, "grad_norm": 11.598061561584473, "learning_rate": 6.390656697076143e-06, "loss": 0.4635, "step": 3777 }, { "epoch": 0.4163085399449036, "grad_norm": 7.832777976989746, "learning_rate": 6.3889771625070925e-06, "loss": 0.4787, "step": 3778 }, { "epoch": 0.4164187327823691, "grad_norm": 6.925414085388184, "learning_rate": 6.38729745809203e-06, "loss": 0.5039, "step": 3779 }, { "epoch": 0.41652892561983473, "grad_norm": 5.661531448364258, "learning_rate": 6.385617584036348e-06, "loss": 0.4309, "step": 3780 }, { "epoch": 0.4166391184573003, "grad_norm": 7.859992027282715, "learning_rate": 6.3839375405454666e-06, "loss": 0.4091, "step": 3781 }, { "epoch": 0.41674931129476583, "grad_norm": 6.426723480224609, "learning_rate": 6.3822573278248235e-06, "loss": 0.439, "step": 3782 }, { "epoch": 0.4168595041322314, "grad_norm": 8.279650688171387, "learning_rate": 6.380576946079875e-06, "loss": 0.4073, "step": 3783 }, { "epoch": 0.416969696969697, "grad_norm": 4.793420314788818, "learning_rate": 6.3788963955161046e-06, "loss": 0.4326, "step": 3784 }, { "epoch": 0.41707988980716254, "grad_norm": 8.681497573852539, "learning_rate": 6.377215676339007e-06, "loss": 0.3881, "step": 3785 }, { "epoch": 0.4171900826446281, "grad_norm": 7.378214359283447, "learning_rate": 6.375534788754106e-06, "loss": 0.4035, "step": 3786 }, { "epoch": 0.41730027548209364, "grad_norm": 6.326870441436768, "learning_rate": 6.373853732966944e-06, "loss": 0.4634, "step": 3787 }, { "epoch": 0.41741046831955925, "grad_norm": 9.407134056091309, "learning_rate": 6.372172509183082e-06, "loss": 0.4246, "step": 3788 }, { "epoch": 0.4175206611570248, "grad_norm": 5.874998092651367, "learning_rate": 6.370491117608101e-06, "loss": 0.3218, "step": 3789 }, { "epoch": 0.41763085399449035, "grad_norm": 6.414131164550781, "learning_rate": 6.368809558447603e-06, "loss": 0.5007, "step": 3790 }, { "epoch": 0.4177410468319559, "grad_norm": 6.991122245788574, "learning_rate": 6.367127831907214e-06, "loss": 0.4957, "step": 3791 }, { "epoch": 0.4178512396694215, "grad_norm": 4.127720832824707, "learning_rate": 6.3654459381925785e-06, "loss": 0.395, "step": 3792 }, { "epoch": 0.41796143250688705, "grad_norm": 8.374381065368652, "learning_rate": 6.363763877509355e-06, "loss": 0.3983, "step": 3793 }, { "epoch": 0.4180716253443526, "grad_norm": 4.763553142547607, "learning_rate": 6.362081650063234e-06, "loss": 0.3837, "step": 3794 }, { "epoch": 0.41818181818181815, "grad_norm": 6.920061111450195, "learning_rate": 6.360399256059919e-06, "loss": 0.3299, "step": 3795 }, { "epoch": 0.41829201101928376, "grad_norm": 6.663188457489014, "learning_rate": 6.358716695705135e-06, "loss": 0.3495, "step": 3796 }, { "epoch": 0.4184022038567493, "grad_norm": 6.809614181518555, "learning_rate": 6.357033969204628e-06, "loss": 0.3879, "step": 3797 }, { "epoch": 0.41851239669421486, "grad_norm": 7.749216556549072, "learning_rate": 6.355351076764164e-06, "loss": 0.4378, "step": 3798 }, { "epoch": 0.41862258953168047, "grad_norm": 5.883305072784424, "learning_rate": 6.353668018589527e-06, "loss": 0.3725, "step": 3799 }, { "epoch": 0.418732782369146, "grad_norm": 7.1863250732421875, "learning_rate": 6.3519847948865284e-06, "loss": 0.3712, "step": 3800 }, { "epoch": 0.41884297520661157, "grad_norm": 11.583418846130371, "learning_rate": 6.350301405860991e-06, "loss": 0.4467, "step": 3801 }, { "epoch": 0.4189531680440771, "grad_norm": 6.89532995223999, "learning_rate": 6.348617851718766e-06, "loss": 0.4527, "step": 3802 }, { "epoch": 0.4190633608815427, "grad_norm": 6.568483352661133, "learning_rate": 6.346934132665716e-06, "loss": 0.3707, "step": 3803 }, { "epoch": 0.4191735537190083, "grad_norm": 8.508829116821289, "learning_rate": 6.345250248907731e-06, "loss": 0.5115, "step": 3804 }, { "epoch": 0.4192837465564738, "grad_norm": 6.044707775115967, "learning_rate": 6.3435662006507194e-06, "loss": 0.3654, "step": 3805 }, { "epoch": 0.4193939393939394, "grad_norm": 6.409003734588623, "learning_rate": 6.341881988100605e-06, "loss": 0.4392, "step": 3806 }, { "epoch": 0.419504132231405, "grad_norm": 8.158562660217285, "learning_rate": 6.340197611463341e-06, "loss": 0.4119, "step": 3807 }, { "epoch": 0.41961432506887053, "grad_norm": 6.024563312530518, "learning_rate": 6.338513070944891e-06, "loss": 0.3328, "step": 3808 }, { "epoch": 0.4197245179063361, "grad_norm": 9.549670219421387, "learning_rate": 6.336828366751245e-06, "loss": 0.4517, "step": 3809 }, { "epoch": 0.41983471074380163, "grad_norm": 9.08705997467041, "learning_rate": 6.335143499088412e-06, "loss": 0.4371, "step": 3810 }, { "epoch": 0.41994490358126724, "grad_norm": 15.335102081298828, "learning_rate": 6.333458468162415e-06, "loss": 0.3911, "step": 3811 }, { "epoch": 0.4200550964187328, "grad_norm": 9.106855392456055, "learning_rate": 6.33177327417931e-06, "loss": 0.4405, "step": 3812 }, { "epoch": 0.42016528925619834, "grad_norm": 6.226790904998779, "learning_rate": 6.330087917345156e-06, "loss": 0.4018, "step": 3813 }, { "epoch": 0.4202754820936639, "grad_norm": 7.737644672393799, "learning_rate": 6.328402397866045e-06, "loss": 0.4866, "step": 3814 }, { "epoch": 0.4203856749311295, "grad_norm": 6.267590522766113, "learning_rate": 6.3267167159480845e-06, "loss": 0.363, "step": 3815 }, { "epoch": 0.42049586776859504, "grad_norm": 3.8961129188537598, "learning_rate": 6.325030871797403e-06, "loss": 0.4197, "step": 3816 }, { "epoch": 0.4206060606060606, "grad_norm": 11.12862491607666, "learning_rate": 6.323344865620147e-06, "loss": 0.4586, "step": 3817 }, { "epoch": 0.42071625344352614, "grad_norm": 4.104209899902344, "learning_rate": 6.3216586976224815e-06, "loss": 0.4427, "step": 3818 }, { "epoch": 0.42082644628099175, "grad_norm": 8.506538391113281, "learning_rate": 6.3199723680105966e-06, "loss": 0.4049, "step": 3819 }, { "epoch": 0.4209366391184573, "grad_norm": 8.029024124145508, "learning_rate": 6.318285876990697e-06, "loss": 0.4489, "step": 3820 }, { "epoch": 0.42104683195592285, "grad_norm": 5.868107318878174, "learning_rate": 6.316599224769008e-06, "loss": 0.3651, "step": 3821 }, { "epoch": 0.42115702479338846, "grad_norm": 6.64792013168335, "learning_rate": 6.314912411551779e-06, "loss": 0.4023, "step": 3822 }, { "epoch": 0.421267217630854, "grad_norm": 6.665269374847412, "learning_rate": 6.313225437545274e-06, "loss": 0.3859, "step": 3823 }, { "epoch": 0.42137741046831956, "grad_norm": 12.29218578338623, "learning_rate": 6.311538302955778e-06, "loss": 0.5835, "step": 3824 }, { "epoch": 0.4214876033057851, "grad_norm": 5.8684539794921875, "learning_rate": 6.309851007989598e-06, "loss": 0.3665, "step": 3825 }, { "epoch": 0.4215977961432507, "grad_norm": 4.377128601074219, "learning_rate": 6.308163552853057e-06, "loss": 0.374, "step": 3826 }, { "epoch": 0.42170798898071626, "grad_norm": 4.379380702972412, "learning_rate": 6.3064759377525e-06, "loss": 0.3751, "step": 3827 }, { "epoch": 0.4218181818181818, "grad_norm": 6.47163724899292, "learning_rate": 6.304788162894291e-06, "loss": 0.4254, "step": 3828 }, { "epoch": 0.42192837465564736, "grad_norm": 7.115240573883057, "learning_rate": 6.3031002284848106e-06, "loss": 0.4518, "step": 3829 }, { "epoch": 0.42203856749311297, "grad_norm": 5.516519069671631, "learning_rate": 6.301412134730468e-06, "loss": 0.4202, "step": 3830 }, { "epoch": 0.4221487603305785, "grad_norm": 5.217955589294434, "learning_rate": 6.299723881837678e-06, "loss": 0.3879, "step": 3831 }, { "epoch": 0.42225895316804407, "grad_norm": 8.340252876281738, "learning_rate": 6.298035470012889e-06, "loss": 0.4044, "step": 3832 }, { "epoch": 0.4223691460055096, "grad_norm": 5.796931743621826, "learning_rate": 6.296346899462559e-06, "loss": 0.429, "step": 3833 }, { "epoch": 0.4224793388429752, "grad_norm": 9.1382417678833, "learning_rate": 6.294658170393169e-06, "loss": 0.3954, "step": 3834 }, { "epoch": 0.4225895316804408, "grad_norm": 8.257237434387207, "learning_rate": 6.292969283011219e-06, "loss": 0.3601, "step": 3835 }, { "epoch": 0.42269972451790633, "grad_norm": 7.257051467895508, "learning_rate": 6.291280237523227e-06, "loss": 0.4497, "step": 3836 }, { "epoch": 0.4228099173553719, "grad_norm": 4.841982364654541, "learning_rate": 6.2895910341357355e-06, "loss": 0.384, "step": 3837 }, { "epoch": 0.4229201101928375, "grad_norm": 9.36284351348877, "learning_rate": 6.287901673055301e-06, "loss": 0.5032, "step": 3838 }, { "epoch": 0.42303030303030303, "grad_norm": 6.163784503936768, "learning_rate": 6.2862121544885e-06, "loss": 0.4299, "step": 3839 }, { "epoch": 0.4231404958677686, "grad_norm": 10.357873916625977, "learning_rate": 6.28452247864193e-06, "loss": 0.4065, "step": 3840 }, { "epoch": 0.42325068870523413, "grad_norm": 6.996313571929932, "learning_rate": 6.282832645722206e-06, "loss": 0.3963, "step": 3841 }, { "epoch": 0.42336088154269974, "grad_norm": 7.793999671936035, "learning_rate": 6.281142655935963e-06, "loss": 0.5049, "step": 3842 }, { "epoch": 0.4234710743801653, "grad_norm": 5.729565143585205, "learning_rate": 6.279452509489856e-06, "loss": 0.3909, "step": 3843 }, { "epoch": 0.42358126721763084, "grad_norm": 9.760586738586426, "learning_rate": 6.277762206590559e-06, "loss": 0.4008, "step": 3844 }, { "epoch": 0.4236914600550964, "grad_norm": 5.855377674102783, "learning_rate": 6.276071747444763e-06, "loss": 0.4294, "step": 3845 }, { "epoch": 0.423801652892562, "grad_norm": 5.786755561828613, "learning_rate": 6.27438113225918e-06, "loss": 0.4266, "step": 3846 }, { "epoch": 0.42391184573002755, "grad_norm": 6.315412521362305, "learning_rate": 6.272690361240542e-06, "loss": 0.3893, "step": 3847 }, { "epoch": 0.4240220385674931, "grad_norm": 7.395302772521973, "learning_rate": 6.270999434595598e-06, "loss": 0.4622, "step": 3848 }, { "epoch": 0.4241322314049587, "grad_norm": 13.069974899291992, "learning_rate": 6.269308352531116e-06, "loss": 0.5344, "step": 3849 }, { "epoch": 0.42424242424242425, "grad_norm": 5.600767135620117, "learning_rate": 6.267617115253885e-06, "loss": 0.4751, "step": 3850 }, { "epoch": 0.4243526170798898, "grad_norm": 5.95487642288208, "learning_rate": 6.26592572297071e-06, "loss": 0.3983, "step": 3851 }, { "epoch": 0.42446280991735535, "grad_norm": 7.793905258178711, "learning_rate": 6.264234175888418e-06, "loss": 0.4072, "step": 3852 }, { "epoch": 0.42457300275482096, "grad_norm": 9.293682098388672, "learning_rate": 6.262542474213855e-06, "loss": 0.462, "step": 3853 }, { "epoch": 0.4246831955922865, "grad_norm": 3.852151393890381, "learning_rate": 6.260850618153883e-06, "loss": 0.4374, "step": 3854 }, { "epoch": 0.42479338842975206, "grad_norm": 6.262339115142822, "learning_rate": 6.259158607915385e-06, "loss": 0.3165, "step": 3855 }, { "epoch": 0.4249035812672176, "grad_norm": 3.88421630859375, "learning_rate": 6.257466443705261e-06, "loss": 0.3914, "step": 3856 }, { "epoch": 0.4250137741046832, "grad_norm": 5.283646583557129, "learning_rate": 6.255774125730432e-06, "loss": 0.3919, "step": 3857 }, { "epoch": 0.42512396694214877, "grad_norm": 4.296477317810059, "learning_rate": 6.254081654197839e-06, "loss": 0.4203, "step": 3858 }, { "epoch": 0.4252341597796143, "grad_norm": 5.580582141876221, "learning_rate": 6.252389029314436e-06, "loss": 0.4347, "step": 3859 }, { "epoch": 0.42534435261707987, "grad_norm": 18.328691482543945, "learning_rate": 6.2506962512872e-06, "loss": 0.4846, "step": 3860 }, { "epoch": 0.4254545454545455, "grad_norm": 9.269140243530273, "learning_rate": 6.249003320323131e-06, "loss": 0.5153, "step": 3861 }, { "epoch": 0.425564738292011, "grad_norm": 3.998983144760132, "learning_rate": 6.2473102366292385e-06, "loss": 0.365, "step": 3862 }, { "epoch": 0.4256749311294766, "grad_norm": 9.478604316711426, "learning_rate": 6.245617000412555e-06, "loss": 0.3878, "step": 3863 }, { "epoch": 0.4257851239669421, "grad_norm": 10.491778373718262, "learning_rate": 6.2439236118801314e-06, "loss": 0.4833, "step": 3864 }, { "epoch": 0.42589531680440773, "grad_norm": 4.075682163238525, "learning_rate": 6.242230071239042e-06, "loss": 0.4464, "step": 3865 }, { "epoch": 0.4260055096418733, "grad_norm": 8.175837516784668, "learning_rate": 6.240536378696371e-06, "loss": 0.4427, "step": 3866 }, { "epoch": 0.42611570247933883, "grad_norm": 5.808559417724609, "learning_rate": 6.238842534459224e-06, "loss": 0.4144, "step": 3867 }, { "epoch": 0.4262258953168044, "grad_norm": 8.47080135345459, "learning_rate": 6.237148538734732e-06, "loss": 0.4799, "step": 3868 }, { "epoch": 0.42633608815427, "grad_norm": 6.882425308227539, "learning_rate": 6.235454391730035e-06, "loss": 0.491, "step": 3869 }, { "epoch": 0.42644628099173554, "grad_norm": 10.453054428100586, "learning_rate": 6.233760093652297e-06, "loss": 0.3773, "step": 3870 }, { "epoch": 0.4265564738292011, "grad_norm": 10.358419418334961, "learning_rate": 6.232065644708698e-06, "loss": 0.4692, "step": 3871 }, { "epoch": 0.4266666666666667, "grad_norm": 4.853580474853516, "learning_rate": 6.23037104510644e-06, "loss": 0.4267, "step": 3872 }, { "epoch": 0.42677685950413224, "grad_norm": 6.97660493850708, "learning_rate": 6.228676295052738e-06, "loss": 0.406, "step": 3873 }, { "epoch": 0.4268870523415978, "grad_norm": 6.049506187438965, "learning_rate": 6.22698139475483e-06, "loss": 0.4359, "step": 3874 }, { "epoch": 0.42699724517906334, "grad_norm": 11.104652404785156, "learning_rate": 6.225286344419971e-06, "loss": 0.42, "step": 3875 }, { "epoch": 0.42710743801652895, "grad_norm": 5.031164646148682, "learning_rate": 6.223591144255433e-06, "loss": 0.428, "step": 3876 }, { "epoch": 0.4272176308539945, "grad_norm": 5.761896133422852, "learning_rate": 6.221895794468508e-06, "loss": 0.3766, "step": 3877 }, { "epoch": 0.42732782369146005, "grad_norm": 5.83917760848999, "learning_rate": 6.2202002952665054e-06, "loss": 0.3782, "step": 3878 }, { "epoch": 0.4274380165289256, "grad_norm": 6.0388102531433105, "learning_rate": 6.2185046468567535e-06, "loss": 0.4874, "step": 3879 }, { "epoch": 0.4275482093663912, "grad_norm": 4.570846080780029, "learning_rate": 6.216808849446596e-06, "loss": 0.3824, "step": 3880 }, { "epoch": 0.42765840220385676, "grad_norm": 6.330661773681641, "learning_rate": 6.2151129032434024e-06, "loss": 0.3905, "step": 3881 }, { "epoch": 0.4277685950413223, "grad_norm": 6.117094993591309, "learning_rate": 6.2134168084545506e-06, "loss": 0.4565, "step": 3882 }, { "epoch": 0.42787878787878786, "grad_norm": 10.915534973144531, "learning_rate": 6.211720565287443e-06, "loss": 0.4279, "step": 3883 }, { "epoch": 0.42798898071625346, "grad_norm": 9.58615779876709, "learning_rate": 6.2100241739495e-06, "loss": 0.4818, "step": 3884 }, { "epoch": 0.428099173553719, "grad_norm": 4.800282955169678, "learning_rate": 6.208327634648157e-06, "loss": 0.4052, "step": 3885 }, { "epoch": 0.42820936639118456, "grad_norm": 9.476829528808594, "learning_rate": 6.2066309475908696e-06, "loss": 0.4348, "step": 3886 }, { "epoch": 0.4283195592286501, "grad_norm": 11.474186897277832, "learning_rate": 6.20493411298511e-06, "loss": 0.5148, "step": 3887 }, { "epoch": 0.4284297520661157, "grad_norm": 8.32387638092041, "learning_rate": 6.203237131038371e-06, "loss": 0.4234, "step": 3888 }, { "epoch": 0.42853994490358127, "grad_norm": 7.900044918060303, "learning_rate": 6.201540001958163e-06, "loss": 0.471, "step": 3889 }, { "epoch": 0.4286501377410468, "grad_norm": 5.806334495544434, "learning_rate": 6.199842725952008e-06, "loss": 0.4014, "step": 3890 }, { "epoch": 0.42876033057851237, "grad_norm": 7.8750081062316895, "learning_rate": 6.198145303227456e-06, "loss": 0.4524, "step": 3891 }, { "epoch": 0.428870523415978, "grad_norm": 5.8703436851501465, "learning_rate": 6.1964477339920695e-06, "loss": 0.433, "step": 3892 }, { "epoch": 0.42898071625344353, "grad_norm": 8.025110244750977, "learning_rate": 6.194750018453428e-06, "loss": 0.3659, "step": 3893 }, { "epoch": 0.4290909090909091, "grad_norm": 5.678608417510986, "learning_rate": 6.193052156819132e-06, "loss": 0.4667, "step": 3894 }, { "epoch": 0.42920110192837463, "grad_norm": 4.612313270568848, "learning_rate": 6.191354149296798e-06, "loss": 0.4118, "step": 3895 }, { "epoch": 0.42931129476584023, "grad_norm": 4.925156593322754, "learning_rate": 6.189655996094059e-06, "loss": 0.4305, "step": 3896 }, { "epoch": 0.4294214876033058, "grad_norm": 5.6468586921691895, "learning_rate": 6.187957697418571e-06, "loss": 0.4594, "step": 3897 }, { "epoch": 0.42953168044077134, "grad_norm": 9.24137020111084, "learning_rate": 6.186259253478e-06, "loss": 0.4387, "step": 3898 }, { "epoch": 0.42964187327823694, "grad_norm": 5.610344409942627, "learning_rate": 6.184560664480036e-06, "loss": 0.4119, "step": 3899 }, { "epoch": 0.4297520661157025, "grad_norm": 5.931612014770508, "learning_rate": 6.182861930632387e-06, "loss": 0.4762, "step": 3900 }, { "epoch": 0.42986225895316804, "grad_norm": 7.208206653594971, "learning_rate": 6.181163052142771e-06, "loss": 0.3654, "step": 3901 }, { "epoch": 0.4299724517906336, "grad_norm": 5.230598449707031, "learning_rate": 6.179464029218936e-06, "loss": 0.4699, "step": 3902 }, { "epoch": 0.4300826446280992, "grad_norm": 7.095353126525879, "learning_rate": 6.177764862068636e-06, "loss": 0.4829, "step": 3903 }, { "epoch": 0.43019283746556475, "grad_norm": 6.133082389831543, "learning_rate": 6.176065550899648e-06, "loss": 0.3976, "step": 3904 }, { "epoch": 0.4303030303030303, "grad_norm": 5.761401176452637, "learning_rate": 6.174366095919767e-06, "loss": 0.4073, "step": 3905 }, { "epoch": 0.43041322314049585, "grad_norm": 8.578031539916992, "learning_rate": 6.172666497336804e-06, "loss": 0.3761, "step": 3906 }, { "epoch": 0.43052341597796145, "grad_norm": 7.943502902984619, "learning_rate": 6.170966755358592e-06, "loss": 0.3873, "step": 3907 }, { "epoch": 0.430633608815427, "grad_norm": 4.049300670623779, "learning_rate": 6.169266870192972e-06, "loss": 0.4109, "step": 3908 }, { "epoch": 0.43074380165289256, "grad_norm": 5.710516452789307, "learning_rate": 6.1675668420478114e-06, "loss": 0.4388, "step": 3909 }, { "epoch": 0.4308539944903581, "grad_norm": 8.189253807067871, "learning_rate": 6.165866671130992e-06, "loss": 0.5286, "step": 3910 }, { "epoch": 0.4309641873278237, "grad_norm": 6.19651985168457, "learning_rate": 6.16416635765041e-06, "loss": 0.3811, "step": 3911 }, { "epoch": 0.43107438016528926, "grad_norm": 7.109768867492676, "learning_rate": 6.162465901813987e-06, "loss": 0.4204, "step": 3912 }, { "epoch": 0.4311845730027548, "grad_norm": 4.812902450561523, "learning_rate": 6.160765303829653e-06, "loss": 0.4346, "step": 3913 }, { "epoch": 0.43129476584022036, "grad_norm": 12.155776023864746, "learning_rate": 6.1590645639053625e-06, "loss": 0.4088, "step": 3914 }, { "epoch": 0.43140495867768597, "grad_norm": 9.650174140930176, "learning_rate": 6.157363682249081e-06, "loss": 0.3858, "step": 3915 }, { "epoch": 0.4315151515151515, "grad_norm": 9.240128517150879, "learning_rate": 6.155662659068797e-06, "loss": 0.4106, "step": 3916 }, { "epoch": 0.43162534435261707, "grad_norm": 5.920962333679199, "learning_rate": 6.153961494572515e-06, "loss": 0.4427, "step": 3917 }, { "epoch": 0.4317355371900826, "grad_norm": 5.3680620193481445, "learning_rate": 6.152260188968251e-06, "loss": 0.3361, "step": 3918 }, { "epoch": 0.4318457300275482, "grad_norm": 6.636050224304199, "learning_rate": 6.150558742464047e-06, "loss": 0.4551, "step": 3919 }, { "epoch": 0.4319559228650138, "grad_norm": 6.805266380310059, "learning_rate": 6.1488571552679566e-06, "loss": 0.3813, "step": 3920 }, { "epoch": 0.4320661157024793, "grad_norm": 5.0186285972595215, "learning_rate": 6.147155427588054e-06, "loss": 0.4109, "step": 3921 }, { "epoch": 0.43217630853994493, "grad_norm": 5.153963565826416, "learning_rate": 6.1454535596324275e-06, "loss": 0.3662, "step": 3922 }, { "epoch": 0.4322865013774105, "grad_norm": 10.273711204528809, "learning_rate": 6.1437515516091815e-06, "loss": 0.5663, "step": 3923 }, { "epoch": 0.43239669421487603, "grad_norm": 5.787929534912109, "learning_rate": 6.142049403726445e-06, "loss": 0.3786, "step": 3924 }, { "epoch": 0.4325068870523416, "grad_norm": 5.409564971923828, "learning_rate": 6.140347116192354e-06, "loss": 0.4063, "step": 3925 }, { "epoch": 0.4326170798898072, "grad_norm": 5.356973648071289, "learning_rate": 6.138644689215068e-06, "loss": 0.385, "step": 3926 }, { "epoch": 0.43272727272727274, "grad_norm": 4.582985877990723, "learning_rate": 6.136942123002765e-06, "loss": 0.3817, "step": 3927 }, { "epoch": 0.4328374655647383, "grad_norm": 5.866171360015869, "learning_rate": 6.135239417763634e-06, "loss": 0.3633, "step": 3928 }, { "epoch": 0.43294765840220384, "grad_norm": 11.208274841308594, "learning_rate": 6.133536573705885e-06, "loss": 0.4202, "step": 3929 }, { "epoch": 0.43305785123966944, "grad_norm": 7.416287422180176, "learning_rate": 6.131833591037744e-06, "loss": 0.4555, "step": 3930 }, { "epoch": 0.433168044077135, "grad_norm": 6.879089832305908, "learning_rate": 6.130130469967453e-06, "loss": 0.4877, "step": 3931 }, { "epoch": 0.43327823691460055, "grad_norm": 6.2986931800842285, "learning_rate": 6.1284272107032735e-06, "loss": 0.4587, "step": 3932 }, { "epoch": 0.4333884297520661, "grad_norm": 6.2256059646606445, "learning_rate": 6.126723813453484e-06, "loss": 0.449, "step": 3933 }, { "epoch": 0.4334986225895317, "grad_norm": 6.348203182220459, "learning_rate": 6.1250202784263725e-06, "loss": 0.3939, "step": 3934 }, { "epoch": 0.43360881542699725, "grad_norm": 5.402763843536377, "learning_rate": 6.123316605830256e-06, "loss": 0.4556, "step": 3935 }, { "epoch": 0.4337190082644628, "grad_norm": 5.364984512329102, "learning_rate": 6.121612795873457e-06, "loss": 0.4256, "step": 3936 }, { "epoch": 0.43382920110192835, "grad_norm": 9.048213958740234, "learning_rate": 6.119908848764323e-06, "loss": 0.4534, "step": 3937 }, { "epoch": 0.43393939393939396, "grad_norm": 6.247734069824219, "learning_rate": 6.118204764711214e-06, "loss": 0.3265, "step": 3938 }, { "epoch": 0.4340495867768595, "grad_norm": 5.927489757537842, "learning_rate": 6.116500543922507e-06, "loss": 0.402, "step": 3939 }, { "epoch": 0.43415977961432506, "grad_norm": 6.853671073913574, "learning_rate": 6.1147961866065965e-06, "loss": 0.3861, "step": 3940 }, { "epoch": 0.4342699724517906, "grad_norm": 10.130403518676758, "learning_rate": 6.113091692971894e-06, "loss": 0.4626, "step": 3941 }, { "epoch": 0.4343801652892562, "grad_norm": 5.313806533813477, "learning_rate": 6.111387063226828e-06, "loss": 0.419, "step": 3942 }, { "epoch": 0.43449035812672177, "grad_norm": 6.938016891479492, "learning_rate": 6.109682297579842e-06, "loss": 0.3902, "step": 3943 }, { "epoch": 0.4346005509641873, "grad_norm": 7.420373439788818, "learning_rate": 6.1079773962393965e-06, "loss": 0.438, "step": 3944 }, { "epoch": 0.43471074380165287, "grad_norm": 10.01765251159668, "learning_rate": 6.106272359413972e-06, "loss": 0.4614, "step": 3945 }, { "epoch": 0.43482093663911847, "grad_norm": 5.062164306640625, "learning_rate": 6.104567187312058e-06, "loss": 0.4132, "step": 3946 }, { "epoch": 0.434931129476584, "grad_norm": 5.256128311157227, "learning_rate": 6.102861880142169e-06, "loss": 0.455, "step": 3947 }, { "epoch": 0.43504132231404957, "grad_norm": 13.423139572143555, "learning_rate": 6.101156438112832e-06, "loss": 0.4443, "step": 3948 }, { "epoch": 0.4351515151515152, "grad_norm": 5.345340251922607, "learning_rate": 6.09945086143259e-06, "loss": 0.4338, "step": 3949 }, { "epoch": 0.43526170798898073, "grad_norm": 4.3182477951049805, "learning_rate": 6.097745150310002e-06, "loss": 0.3966, "step": 3950 }, { "epoch": 0.4353719008264463, "grad_norm": 8.046113967895508, "learning_rate": 6.096039304953646e-06, "loss": 0.3909, "step": 3951 }, { "epoch": 0.43548209366391183, "grad_norm": 6.949949264526367, "learning_rate": 6.094333325572116e-06, "loss": 0.3925, "step": 3952 }, { "epoch": 0.43559228650137743, "grad_norm": 8.936685562133789, "learning_rate": 6.092627212374019e-06, "loss": 0.4698, "step": 3953 }, { "epoch": 0.435702479338843, "grad_norm": 8.110991477966309, "learning_rate": 6.090920965567983e-06, "loss": 0.4106, "step": 3954 }, { "epoch": 0.43581267217630854, "grad_norm": 5.9730682373046875, "learning_rate": 6.08921458536265e-06, "loss": 0.3644, "step": 3955 }, { "epoch": 0.4359228650137741, "grad_norm": 6.58652400970459, "learning_rate": 6.087508071966678e-06, "loss": 0.409, "step": 3956 }, { "epoch": 0.4360330578512397, "grad_norm": 6.210163116455078, "learning_rate": 6.085801425588741e-06, "loss": 0.378, "step": 3957 }, { "epoch": 0.43614325068870524, "grad_norm": 8.399636268615723, "learning_rate": 6.084094646437531e-06, "loss": 0.4283, "step": 3958 }, { "epoch": 0.4362534435261708, "grad_norm": 4.2696404457092285, "learning_rate": 6.082387734721755e-06, "loss": 0.3491, "step": 3959 }, { "epoch": 0.43636363636363634, "grad_norm": 5.721973419189453, "learning_rate": 6.080680690650136e-06, "loss": 0.4111, "step": 3960 }, { "epoch": 0.43647382920110195, "grad_norm": 8.701297760009766, "learning_rate": 6.078973514431415e-06, "loss": 0.4477, "step": 3961 }, { "epoch": 0.4365840220385675, "grad_norm": 6.38549280166626, "learning_rate": 6.077266206274346e-06, "loss": 0.4517, "step": 3962 }, { "epoch": 0.43669421487603305, "grad_norm": 9.18921947479248, "learning_rate": 6.075558766387704e-06, "loss": 0.4909, "step": 3963 }, { "epoch": 0.4368044077134986, "grad_norm": 4.652480125427246, "learning_rate": 6.073851194980274e-06, "loss": 0.3808, "step": 3964 }, { "epoch": 0.4369146005509642, "grad_norm": 7.874028205871582, "learning_rate": 6.07214349226086e-06, "loss": 0.394, "step": 3965 }, { "epoch": 0.43702479338842976, "grad_norm": 4.699254989624023, "learning_rate": 6.070435658438285e-06, "loss": 0.4416, "step": 3966 }, { "epoch": 0.4371349862258953, "grad_norm": 5.5780487060546875, "learning_rate": 6.068727693721384e-06, "loss": 0.4461, "step": 3967 }, { "epoch": 0.43724517906336086, "grad_norm": 6.561910152435303, "learning_rate": 6.067019598319007e-06, "loss": 0.4214, "step": 3968 }, { "epoch": 0.43735537190082646, "grad_norm": 6.581684589385986, "learning_rate": 6.065311372440025e-06, "loss": 0.3745, "step": 3969 }, { "epoch": 0.437465564738292, "grad_norm": 14.880142211914062, "learning_rate": 6.063603016293321e-06, "loss": 0.4787, "step": 3970 }, { "epoch": 0.43757575757575756, "grad_norm": 6.389845848083496, "learning_rate": 6.0618945300877964e-06, "loss": 0.4239, "step": 3971 }, { "epoch": 0.43768595041322317, "grad_norm": 7.486594200134277, "learning_rate": 6.060185914032365e-06, "loss": 0.4018, "step": 3972 }, { "epoch": 0.4377961432506887, "grad_norm": 21.0837459564209, "learning_rate": 6.058477168335961e-06, "loss": 0.521, "step": 3973 }, { "epoch": 0.43790633608815427, "grad_norm": 13.081880569458008, "learning_rate": 6.05676829320753e-06, "loss": 0.398, "step": 3974 }, { "epoch": 0.4380165289256198, "grad_norm": 4.662603378295898, "learning_rate": 6.0550592888560365e-06, "loss": 0.4188, "step": 3975 }, { "epoch": 0.4381267217630854, "grad_norm": 5.609738826751709, "learning_rate": 6.053350155490462e-06, "loss": 0.4515, "step": 3976 }, { "epoch": 0.438236914600551, "grad_norm": 8.3709135055542, "learning_rate": 6.051640893319798e-06, "loss": 0.4255, "step": 3977 }, { "epoch": 0.4383471074380165, "grad_norm": 7.281825542449951, "learning_rate": 6.049931502553058e-06, "loss": 0.4306, "step": 3978 }, { "epoch": 0.4384573002754821, "grad_norm": 3.8517487049102783, "learning_rate": 6.0482219833992665e-06, "loss": 0.3779, "step": 3979 }, { "epoch": 0.4385674931129477, "grad_norm": 5.827889919281006, "learning_rate": 6.046512336067467e-06, "loss": 0.3965, "step": 3980 }, { "epoch": 0.43867768595041323, "grad_norm": 13.511173248291016, "learning_rate": 6.044802560766718e-06, "loss": 0.5418, "step": 3981 }, { "epoch": 0.4387878787878788, "grad_norm": 7.127382278442383, "learning_rate": 6.043092657706092e-06, "loss": 0.4385, "step": 3982 }, { "epoch": 0.43889807162534433, "grad_norm": 5.266600608825684, "learning_rate": 6.0413826270946806e-06, "loss": 0.4144, "step": 3983 }, { "epoch": 0.43900826446280994, "grad_norm": 12.07368278503418, "learning_rate": 6.0396724691415866e-06, "loss": 0.5229, "step": 3984 }, { "epoch": 0.4391184573002755, "grad_norm": 7.233647346496582, "learning_rate": 6.037962184055928e-06, "loss": 0.4541, "step": 3985 }, { "epoch": 0.43922865013774104, "grad_norm": 10.408295631408691, "learning_rate": 6.036251772046847e-06, "loss": 0.4293, "step": 3986 }, { "epoch": 0.4393388429752066, "grad_norm": 5.630067348480225, "learning_rate": 6.034541233323491e-06, "loss": 0.4029, "step": 3987 }, { "epoch": 0.4394490358126722, "grad_norm": 5.803105354309082, "learning_rate": 6.032830568095027e-06, "loss": 0.4252, "step": 3988 }, { "epoch": 0.43955922865013775, "grad_norm": 9.329225540161133, "learning_rate": 6.031119776570639e-06, "loss": 0.3607, "step": 3989 }, { "epoch": 0.4396694214876033, "grad_norm": 6.282469272613525, "learning_rate": 6.029408858959522e-06, "loss": 0.4765, "step": 3990 }, { "epoch": 0.43977961432506885, "grad_norm": 5.0228424072265625, "learning_rate": 6.0276978154708945e-06, "loss": 0.4113, "step": 3991 }, { "epoch": 0.43988980716253445, "grad_norm": 6.176638126373291, "learning_rate": 6.0259866463139795e-06, "loss": 0.4589, "step": 3992 }, { "epoch": 0.44, "grad_norm": 5.363448143005371, "learning_rate": 6.024275351698024e-06, "loss": 0.4199, "step": 3993 }, { "epoch": 0.44011019283746555, "grad_norm": 5.49215841293335, "learning_rate": 6.022563931832289e-06, "loss": 0.3592, "step": 3994 }, { "epoch": 0.4402203856749311, "grad_norm": 5.0736083984375, "learning_rate": 6.020852386926046e-06, "loss": 0.3744, "step": 3995 }, { "epoch": 0.4403305785123967, "grad_norm": 6.1613850593566895, "learning_rate": 6.0191407171885875e-06, "loss": 0.4618, "step": 3996 }, { "epoch": 0.44044077134986226, "grad_norm": 6.350560188293457, "learning_rate": 6.017428922829216e-06, "loss": 0.3987, "step": 3997 }, { "epoch": 0.4405509641873278, "grad_norm": 6.247224807739258, "learning_rate": 6.0157170040572545e-06, "loss": 0.4572, "step": 3998 }, { "epoch": 0.4406611570247934, "grad_norm": 6.238313674926758, "learning_rate": 6.0140049610820386e-06, "loss": 0.3761, "step": 3999 }, { "epoch": 0.44077134986225897, "grad_norm": 8.729419708251953, "learning_rate": 6.012292794112917e-06, "loss": 0.4835, "step": 4000 }, { "epoch": 0.4408815426997245, "grad_norm": 4.114143371582031, "learning_rate": 6.01058050335926e-06, "loss": 0.4602, "step": 4001 }, { "epoch": 0.44099173553719007, "grad_norm": 6.828856945037842, "learning_rate": 6.008868089030445e-06, "loss": 0.4266, "step": 4002 }, { "epoch": 0.44110192837465567, "grad_norm": 5.825451374053955, "learning_rate": 6.007155551335869e-06, "loss": 0.3877, "step": 4003 }, { "epoch": 0.4412121212121212, "grad_norm": 19.42027473449707, "learning_rate": 6.005442890484945e-06, "loss": 0.5371, "step": 4004 }, { "epoch": 0.4413223140495868, "grad_norm": 5.937627792358398, "learning_rate": 6.003730106687099e-06, "loss": 0.325, "step": 4005 }, { "epoch": 0.4414325068870523, "grad_norm": 6.736237525939941, "learning_rate": 6.0020172001517705e-06, "loss": 0.3434, "step": 4006 }, { "epoch": 0.44154269972451793, "grad_norm": 5.677096843719482, "learning_rate": 6.00030417108842e-06, "loss": 0.4118, "step": 4007 }, { "epoch": 0.4416528925619835, "grad_norm": 4.171231269836426, "learning_rate": 5.9985910197065154e-06, "loss": 0.3589, "step": 4008 }, { "epoch": 0.44176308539944903, "grad_norm": 8.049046516418457, "learning_rate": 5.996877746215545e-06, "loss": 0.4109, "step": 4009 }, { "epoch": 0.4418732782369146, "grad_norm": 11.192566871643066, "learning_rate": 5.995164350825008e-06, "loss": 0.4301, "step": 4010 }, { "epoch": 0.4419834710743802, "grad_norm": 8.543044090270996, "learning_rate": 5.993450833744424e-06, "loss": 0.4741, "step": 4011 }, { "epoch": 0.44209366391184574, "grad_norm": 7.122586727142334, "learning_rate": 5.991737195183323e-06, "loss": 0.3592, "step": 4012 }, { "epoch": 0.4422038567493113, "grad_norm": 10.358314514160156, "learning_rate": 5.990023435351249e-06, "loss": 0.4666, "step": 4013 }, { "epoch": 0.44231404958677684, "grad_norm": 9.374655723571777, "learning_rate": 5.988309554457765e-06, "loss": 0.4063, "step": 4014 }, { "epoch": 0.44242424242424244, "grad_norm": 12.979504585266113, "learning_rate": 5.9865955527124466e-06, "loss": 0.5649, "step": 4015 }, { "epoch": 0.442534435261708, "grad_norm": 5.430773735046387, "learning_rate": 5.984881430324883e-06, "loss": 0.4055, "step": 4016 }, { "epoch": 0.44264462809917354, "grad_norm": 5.780434608459473, "learning_rate": 5.983167187504681e-06, "loss": 0.3895, "step": 4017 }, { "epoch": 0.4427548209366391, "grad_norm": 8.516437530517578, "learning_rate": 5.98145282446146e-06, "loss": 0.3959, "step": 4018 }, { "epoch": 0.4428650137741047, "grad_norm": 5.609032154083252, "learning_rate": 5.9797383414048535e-06, "loss": 0.4287, "step": 4019 }, { "epoch": 0.44297520661157025, "grad_norm": 8.30771255493164, "learning_rate": 5.978023738544514e-06, "loss": 0.4973, "step": 4020 }, { "epoch": 0.4430853994490358, "grad_norm": 5.688619136810303, "learning_rate": 5.9763090160901e-06, "loss": 0.4544, "step": 4021 }, { "epoch": 0.44319559228650135, "grad_norm": 4.770142078399658, "learning_rate": 5.974594174251297e-06, "loss": 0.3571, "step": 4022 }, { "epoch": 0.44330578512396696, "grad_norm": 11.235660552978516, "learning_rate": 5.972879213237791e-06, "loss": 0.48, "step": 4023 }, { "epoch": 0.4434159779614325, "grad_norm": 5.956013202667236, "learning_rate": 5.971164133259295e-06, "loss": 0.4469, "step": 4024 }, { "epoch": 0.44352617079889806, "grad_norm": 6.475219249725342, "learning_rate": 5.96944893452553e-06, "loss": 0.4272, "step": 4025 }, { "epoch": 0.44363636363636366, "grad_norm": 5.644810676574707, "learning_rate": 5.9677336172462316e-06, "loss": 0.4083, "step": 4026 }, { "epoch": 0.4437465564738292, "grad_norm": 8.617670059204102, "learning_rate": 5.966018181631152e-06, "loss": 0.4128, "step": 4027 }, { "epoch": 0.44385674931129476, "grad_norm": 7.4977593421936035, "learning_rate": 5.964302627890057e-06, "loss": 0.4083, "step": 4028 }, { "epoch": 0.4439669421487603, "grad_norm": 6.7644853591918945, "learning_rate": 5.962586956232727e-06, "loss": 0.4154, "step": 4029 }, { "epoch": 0.4440771349862259, "grad_norm": 6.908641338348389, "learning_rate": 5.9608711668689565e-06, "loss": 0.4706, "step": 4030 }, { "epoch": 0.44418732782369147, "grad_norm": 8.456871032714844, "learning_rate": 5.959155260008554e-06, "loss": 0.4842, "step": 4031 }, { "epoch": 0.444297520661157, "grad_norm": 9.610672950744629, "learning_rate": 5.9574392358613445e-06, "loss": 0.3758, "step": 4032 }, { "epoch": 0.44440771349862257, "grad_norm": 13.6271333694458, "learning_rate": 5.955723094637163e-06, "loss": 0.5054, "step": 4033 }, { "epoch": 0.4445179063360882, "grad_norm": 5.952396869659424, "learning_rate": 5.954006836545864e-06, "loss": 0.3694, "step": 4034 }, { "epoch": 0.4446280991735537, "grad_norm": 5.047083854675293, "learning_rate": 5.952290461797314e-06, "loss": 0.4463, "step": 4035 }, { "epoch": 0.4447382920110193, "grad_norm": 6.901847839355469, "learning_rate": 5.950573970601392e-06, "loss": 0.3828, "step": 4036 }, { "epoch": 0.4448484848484848, "grad_norm": 4.849525451660156, "learning_rate": 5.948857363167995e-06, "loss": 0.413, "step": 4037 }, { "epoch": 0.44495867768595043, "grad_norm": 7.4211745262146, "learning_rate": 5.9471406397070285e-06, "loss": 0.3549, "step": 4038 }, { "epoch": 0.445068870523416, "grad_norm": 6.657160758972168, "learning_rate": 5.945423800428419e-06, "loss": 0.4122, "step": 4039 }, { "epoch": 0.44517906336088153, "grad_norm": 4.9380388259887695, "learning_rate": 5.943706845542103e-06, "loss": 0.4272, "step": 4040 }, { "epoch": 0.4452892561983471, "grad_norm": 6.010152339935303, "learning_rate": 5.941989775258032e-06, "loss": 0.3934, "step": 4041 }, { "epoch": 0.4453994490358127, "grad_norm": 6.57681131362915, "learning_rate": 5.940272589786172e-06, "loss": 0.4489, "step": 4042 }, { "epoch": 0.44550964187327824, "grad_norm": 8.034668922424316, "learning_rate": 5.938555289336503e-06, "loss": 0.4549, "step": 4043 }, { "epoch": 0.4456198347107438, "grad_norm": 6.683331489562988, "learning_rate": 5.936837874119017e-06, "loss": 0.3653, "step": 4044 }, { "epoch": 0.44573002754820934, "grad_norm": 7.716479301452637, "learning_rate": 5.935120344343724e-06, "loss": 0.437, "step": 4045 }, { "epoch": 0.44584022038567495, "grad_norm": 6.004525661468506, "learning_rate": 5.933402700220645e-06, "loss": 0.4325, "step": 4046 }, { "epoch": 0.4459504132231405, "grad_norm": 7.189431667327881, "learning_rate": 5.931684941959814e-06, "loss": 0.4682, "step": 4047 }, { "epoch": 0.44606060606060605, "grad_norm": 6.102496147155762, "learning_rate": 5.929967069771285e-06, "loss": 0.43, "step": 4048 }, { "epoch": 0.44617079889807165, "grad_norm": 5.6079511642456055, "learning_rate": 5.9282490838651185e-06, "loss": 0.4937, "step": 4049 }, { "epoch": 0.4462809917355372, "grad_norm": 7.230101585388184, "learning_rate": 5.926530984451395e-06, "loss": 0.4118, "step": 4050 }, { "epoch": 0.44639118457300275, "grad_norm": 5.412363529205322, "learning_rate": 5.924812771740201e-06, "loss": 0.4016, "step": 4051 }, { "epoch": 0.4465013774104683, "grad_norm": 7.132895469665527, "learning_rate": 5.9230944459416475e-06, "loss": 0.4153, "step": 4052 }, { "epoch": 0.4466115702479339, "grad_norm": 5.968504428863525, "learning_rate": 5.921376007265851e-06, "loss": 0.3692, "step": 4053 }, { "epoch": 0.44672176308539946, "grad_norm": 7.628453254699707, "learning_rate": 5.919657455922944e-06, "loss": 0.4621, "step": 4054 }, { "epoch": 0.446831955922865, "grad_norm": 8.419864654541016, "learning_rate": 5.9179387921230745e-06, "loss": 0.4084, "step": 4055 }, { "epoch": 0.44694214876033056, "grad_norm": 7.555700778961182, "learning_rate": 5.9162200160764015e-06, "loss": 0.477, "step": 4056 }, { "epoch": 0.44705234159779617, "grad_norm": 6.322028636932373, "learning_rate": 5.914501127993102e-06, "loss": 0.3713, "step": 4057 }, { "epoch": 0.4471625344352617, "grad_norm": 6.326248645782471, "learning_rate": 5.912782128083361e-06, "loss": 0.4339, "step": 4058 }, { "epoch": 0.44727272727272727, "grad_norm": 6.034976959228516, "learning_rate": 5.911063016557381e-06, "loss": 0.4335, "step": 4059 }, { "epoch": 0.4473829201101928, "grad_norm": 8.301700592041016, "learning_rate": 5.909343793625379e-06, "loss": 0.3955, "step": 4060 }, { "epoch": 0.4474931129476584, "grad_norm": 8.64501667022705, "learning_rate": 5.907624459497584e-06, "loss": 0.4309, "step": 4061 }, { "epoch": 0.447603305785124, "grad_norm": 4.960073471069336, "learning_rate": 5.905905014384235e-06, "loss": 0.4167, "step": 4062 }, { "epoch": 0.4477134986225895, "grad_norm": 10.823214530944824, "learning_rate": 5.904185458495592e-06, "loss": 0.4695, "step": 4063 }, { "epoch": 0.4478236914600551, "grad_norm": 4.7425408363342285, "learning_rate": 5.902465792041922e-06, "loss": 0.4547, "step": 4064 }, { "epoch": 0.4479338842975207, "grad_norm": 8.464041709899902, "learning_rate": 5.900746015233507e-06, "loss": 0.5264, "step": 4065 }, { "epoch": 0.44804407713498623, "grad_norm": 9.00203800201416, "learning_rate": 5.89902612828065e-06, "loss": 0.3918, "step": 4066 }, { "epoch": 0.4481542699724518, "grad_norm": 6.427272796630859, "learning_rate": 5.897306131393654e-06, "loss": 0.4185, "step": 4067 }, { "epoch": 0.44826446280991733, "grad_norm": 5.438120365142822, "learning_rate": 5.8955860247828465e-06, "loss": 0.4098, "step": 4068 }, { "epoch": 0.44837465564738294, "grad_norm": 3.8153905868530273, "learning_rate": 5.893865808658562e-06, "loss": 0.4449, "step": 4069 }, { "epoch": 0.4484848484848485, "grad_norm": 6.233038902282715, "learning_rate": 5.892145483231153e-06, "loss": 0.3843, "step": 4070 }, { "epoch": 0.44859504132231404, "grad_norm": 6.0744709968566895, "learning_rate": 5.890425048710982e-06, "loss": 0.4148, "step": 4071 }, { "epoch": 0.4487052341597796, "grad_norm": 8.636625289916992, "learning_rate": 5.8887045053084265e-06, "loss": 0.4608, "step": 4072 }, { "epoch": 0.4488154269972452, "grad_norm": 4.762270927429199, "learning_rate": 5.886983853233879e-06, "loss": 0.3427, "step": 4073 }, { "epoch": 0.44892561983471074, "grad_norm": 6.674958229064941, "learning_rate": 5.88526309269774e-06, "loss": 0.4495, "step": 4074 }, { "epoch": 0.4490358126721763, "grad_norm": 5.687108516693115, "learning_rate": 5.883542223910426e-06, "loss": 0.4071, "step": 4075 }, { "epoch": 0.4491460055096419, "grad_norm": 8.375948905944824, "learning_rate": 5.8818212470823696e-06, "loss": 0.4192, "step": 4076 }, { "epoch": 0.44925619834710745, "grad_norm": 4.817061901092529, "learning_rate": 5.880100162424013e-06, "loss": 0.3792, "step": 4077 }, { "epoch": 0.449366391184573, "grad_norm": 5.390269756317139, "learning_rate": 5.878378970145813e-06, "loss": 0.3867, "step": 4078 }, { "epoch": 0.44947658402203855, "grad_norm": 8.200571060180664, "learning_rate": 5.87665767045824e-06, "loss": 0.4738, "step": 4079 }, { "epoch": 0.44958677685950416, "grad_norm": 8.864215850830078, "learning_rate": 5.874936263571775e-06, "loss": 0.4256, "step": 4080 }, { "epoch": 0.4496969696969697, "grad_norm": 8.43858814239502, "learning_rate": 5.873214749696918e-06, "loss": 0.4767, "step": 4081 }, { "epoch": 0.44980716253443526, "grad_norm": 7.963834762573242, "learning_rate": 5.871493129044172e-06, "loss": 0.4067, "step": 4082 }, { "epoch": 0.4499173553719008, "grad_norm": 8.85851001739502, "learning_rate": 5.869771401824065e-06, "loss": 0.4339, "step": 4083 }, { "epoch": 0.4500275482093664, "grad_norm": 5.0043110847473145, "learning_rate": 5.868049568247128e-06, "loss": 0.408, "step": 4084 }, { "epoch": 0.45013774104683196, "grad_norm": 10.033919334411621, "learning_rate": 5.866327628523911e-06, "loss": 0.4837, "step": 4085 }, { "epoch": 0.4502479338842975, "grad_norm": 6.435369491577148, "learning_rate": 5.864605582864975e-06, "loss": 0.4075, "step": 4086 }, { "epoch": 0.45035812672176306, "grad_norm": 8.440646171569824, "learning_rate": 5.862883431480894e-06, "loss": 0.4751, "step": 4087 }, { "epoch": 0.45046831955922867, "grad_norm": 6.353564739227295, "learning_rate": 5.861161174582254e-06, "loss": 0.5333, "step": 4088 }, { "epoch": 0.4505785123966942, "grad_norm": 7.430014610290527, "learning_rate": 5.859438812379656e-06, "loss": 0.4208, "step": 4089 }, { "epoch": 0.45068870523415977, "grad_norm": 5.450559139251709, "learning_rate": 5.857716345083712e-06, "loss": 0.4442, "step": 4090 }, { "epoch": 0.4507988980716253, "grad_norm": 7.066483497619629, "learning_rate": 5.855993772905051e-06, "loss": 0.3885, "step": 4091 }, { "epoch": 0.4509090909090909, "grad_norm": 7.168652057647705, "learning_rate": 5.854271096054307e-06, "loss": 0.5197, "step": 4092 }, { "epoch": 0.4510192837465565, "grad_norm": 6.303380966186523, "learning_rate": 5.852548314742131e-06, "loss": 0.4214, "step": 4093 }, { "epoch": 0.451129476584022, "grad_norm": 5.621610164642334, "learning_rate": 5.850825429179192e-06, "loss": 0.3097, "step": 4094 }, { "epoch": 0.4512396694214876, "grad_norm": 6.062560558319092, "learning_rate": 5.849102439576163e-06, "loss": 0.4208, "step": 4095 }, { "epoch": 0.4513498622589532, "grad_norm": 7.787949085235596, "learning_rate": 5.847379346143734e-06, "loss": 0.5181, "step": 4096 }, { "epoch": 0.45146005509641873, "grad_norm": 6.1522297859191895, "learning_rate": 5.845656149092607e-06, "loss": 0.3853, "step": 4097 }, { "epoch": 0.4515702479338843, "grad_norm": 5.82921838760376, "learning_rate": 5.843932848633497e-06, "loss": 0.4241, "step": 4098 }, { "epoch": 0.4516804407713499, "grad_norm": 4.390358924865723, "learning_rate": 5.8422094449771335e-06, "loss": 0.4511, "step": 4099 }, { "epoch": 0.45179063360881544, "grad_norm": 5.280516147613525, "learning_rate": 5.8404859383342534e-06, "loss": 0.3394, "step": 4100 }, { "epoch": 0.451900826446281, "grad_norm": 4.667827606201172, "learning_rate": 5.838762328915613e-06, "loss": 0.4015, "step": 4101 }, { "epoch": 0.45201101928374654, "grad_norm": 5.000226974487305, "learning_rate": 5.837038616931975e-06, "loss": 0.3944, "step": 4102 }, { "epoch": 0.45212121212121215, "grad_norm": 4.7028045654296875, "learning_rate": 5.8353148025941165e-06, "loss": 0.4614, "step": 4103 }, { "epoch": 0.4522314049586777, "grad_norm": 12.146053314208984, "learning_rate": 5.833590886112831e-06, "loss": 0.3995, "step": 4104 }, { "epoch": 0.45234159779614325, "grad_norm": 7.792434215545654, "learning_rate": 5.831866867698918e-06, "loss": 0.4203, "step": 4105 }, { "epoch": 0.4524517906336088, "grad_norm": 9.011277198791504, "learning_rate": 5.830142747563195e-06, "loss": 0.4962, "step": 4106 }, { "epoch": 0.4525619834710744, "grad_norm": 8.330543518066406, "learning_rate": 5.828418525916491e-06, "loss": 0.3836, "step": 4107 }, { "epoch": 0.45267217630853995, "grad_norm": 5.687896251678467, "learning_rate": 5.826694202969641e-06, "loss": 0.3428, "step": 4108 }, { "epoch": 0.4527823691460055, "grad_norm": 6.441923141479492, "learning_rate": 5.824969778933504e-06, "loss": 0.4435, "step": 4109 }, { "epoch": 0.45289256198347105, "grad_norm": 5.4831438064575195, "learning_rate": 5.823245254018941e-06, "loss": 0.3995, "step": 4110 }, { "epoch": 0.45300275482093666, "grad_norm": 8.866961479187012, "learning_rate": 5.82152062843683e-06, "loss": 0.4861, "step": 4111 }, { "epoch": 0.4531129476584022, "grad_norm": 5.998145580291748, "learning_rate": 5.81979590239806e-06, "loss": 0.4596, "step": 4112 }, { "epoch": 0.45322314049586776, "grad_norm": 6.52558708190918, "learning_rate": 5.818071076113534e-06, "loss": 0.3853, "step": 4113 }, { "epoch": 0.4533333333333333, "grad_norm": 4.526134014129639, "learning_rate": 5.8163461497941655e-06, "loss": 0.3947, "step": 4114 }, { "epoch": 0.4534435261707989, "grad_norm": 11.384407043457031, "learning_rate": 5.8146211236508794e-06, "loss": 0.4874, "step": 4115 }, { "epoch": 0.45355371900826447, "grad_norm": 9.417316436767578, "learning_rate": 5.812895997894617e-06, "loss": 0.4166, "step": 4116 }, { "epoch": 0.45366391184573, "grad_norm": 9.920056343078613, "learning_rate": 5.811170772736329e-06, "loss": 0.3421, "step": 4117 }, { "epoch": 0.45377410468319557, "grad_norm": 22.426549911499023, "learning_rate": 5.809445448386976e-06, "loss": 0.5263, "step": 4118 }, { "epoch": 0.4538842975206612, "grad_norm": 6.549997806549072, "learning_rate": 5.8077200250575334e-06, "loss": 0.3492, "step": 4119 }, { "epoch": 0.4539944903581267, "grad_norm": 8.752823829650879, "learning_rate": 5.80599450295899e-06, "loss": 0.4303, "step": 4120 }, { "epoch": 0.4541046831955923, "grad_norm": 9.733783721923828, "learning_rate": 5.804268882302343e-06, "loss": 0.3882, "step": 4121 }, { "epoch": 0.4542148760330578, "grad_norm": 4.425025939941406, "learning_rate": 5.802543163298605e-06, "loss": 0.3804, "step": 4122 }, { "epoch": 0.45432506887052343, "grad_norm": 8.211994171142578, "learning_rate": 5.800817346158799e-06, "loss": 0.4269, "step": 4123 }, { "epoch": 0.454435261707989, "grad_norm": 9.320842742919922, "learning_rate": 5.7990914310939605e-06, "loss": 0.4989, "step": 4124 }, { "epoch": 0.45454545454545453, "grad_norm": 5.90273904800415, "learning_rate": 5.7973654183151355e-06, "loss": 0.3813, "step": 4125 }, { "epoch": 0.45465564738292014, "grad_norm": 6.839073181152344, "learning_rate": 5.795639308033383e-06, "loss": 0.4971, "step": 4126 }, { "epoch": 0.4547658402203857, "grad_norm": 4.444808483123779, "learning_rate": 5.793913100459778e-06, "loss": 0.4208, "step": 4127 }, { "epoch": 0.45487603305785124, "grad_norm": 8.782885551452637, "learning_rate": 5.792186795805399e-06, "loss": 0.4249, "step": 4128 }, { "epoch": 0.4549862258953168, "grad_norm": 4.603721618652344, "learning_rate": 5.790460394281343e-06, "loss": 0.4146, "step": 4129 }, { "epoch": 0.4550964187327824, "grad_norm": 9.251736640930176, "learning_rate": 5.788733896098716e-06, "loss": 0.4833, "step": 4130 }, { "epoch": 0.45520661157024794, "grad_norm": 4.481989860534668, "learning_rate": 5.787007301468637e-06, "loss": 0.4099, "step": 4131 }, { "epoch": 0.4553168044077135, "grad_norm": 5.976316928863525, "learning_rate": 5.7852806106022354e-06, "loss": 0.4435, "step": 4132 }, { "epoch": 0.45542699724517904, "grad_norm": 5.873013019561768, "learning_rate": 5.783553823710654e-06, "loss": 0.4555, "step": 4133 }, { "epoch": 0.45553719008264465, "grad_norm": 7.329885482788086, "learning_rate": 5.781826941005048e-06, "loss": 0.479, "step": 4134 }, { "epoch": 0.4556473829201102, "grad_norm": 7.070761680603027, "learning_rate": 5.78009996269658e-06, "loss": 0.4293, "step": 4135 }, { "epoch": 0.45575757575757575, "grad_norm": 8.647218704223633, "learning_rate": 5.77837288899643e-06, "loss": 0.4293, "step": 4136 }, { "epoch": 0.4558677685950413, "grad_norm": 6.377957820892334, "learning_rate": 5.776645720115787e-06, "loss": 0.4042, "step": 4137 }, { "epoch": 0.4559779614325069, "grad_norm": 5.826206207275391, "learning_rate": 5.774918456265848e-06, "loss": 0.3931, "step": 4138 }, { "epoch": 0.45608815426997246, "grad_norm": 6.398508548736572, "learning_rate": 5.773191097657827e-06, "loss": 0.4291, "step": 4139 }, { "epoch": 0.456198347107438, "grad_norm": 3.5530846118927, "learning_rate": 5.771463644502951e-06, "loss": 0.3721, "step": 4140 }, { "epoch": 0.45630853994490356, "grad_norm": 7.899833679199219, "learning_rate": 5.769736097012451e-06, "loss": 0.3802, "step": 4141 }, { "epoch": 0.45641873278236916, "grad_norm": 7.081460952758789, "learning_rate": 5.7680084553975765e-06, "loss": 0.3913, "step": 4142 }, { "epoch": 0.4565289256198347, "grad_norm": 7.769568920135498, "learning_rate": 5.766280719869584e-06, "loss": 0.3456, "step": 4143 }, { "epoch": 0.45663911845730026, "grad_norm": 7.003978252410889, "learning_rate": 5.764552890639744e-06, "loss": 0.4205, "step": 4144 }, { "epoch": 0.4567493112947658, "grad_norm": 8.077564239501953, "learning_rate": 5.76282496791934e-06, "loss": 0.4132, "step": 4145 }, { "epoch": 0.4568595041322314, "grad_norm": 11.824240684509277, "learning_rate": 5.7610969519196595e-06, "loss": 0.4437, "step": 4146 }, { "epoch": 0.45696969696969697, "grad_norm": 12.949878692626953, "learning_rate": 5.7593688428520115e-06, "loss": 0.5133, "step": 4147 }, { "epoch": 0.4570798898071625, "grad_norm": 4.608225345611572, "learning_rate": 5.757640640927711e-06, "loss": 0.3576, "step": 4148 }, { "epoch": 0.4571900826446281, "grad_norm": 6.741049766540527, "learning_rate": 5.755912346358081e-06, "loss": 0.4643, "step": 4149 }, { "epoch": 0.4573002754820937, "grad_norm": 4.472276210784912, "learning_rate": 5.7541839593544645e-06, "loss": 0.4487, "step": 4150 }, { "epoch": 0.4574104683195592, "grad_norm": 7.069968223571777, "learning_rate": 5.752455480128209e-06, "loss": 0.4236, "step": 4151 }, { "epoch": 0.4575206611570248, "grad_norm": 7.472801685333252, "learning_rate": 5.750726908890675e-06, "loss": 0.4297, "step": 4152 }, { "epoch": 0.4576308539944904, "grad_norm": 6.52254056930542, "learning_rate": 5.748998245853235e-06, "loss": 0.4283, "step": 4153 }, { "epoch": 0.45774104683195593, "grad_norm": 5.164422512054443, "learning_rate": 5.747269491227271e-06, "loss": 0.419, "step": 4154 }, { "epoch": 0.4578512396694215, "grad_norm": 10.188724517822266, "learning_rate": 5.74554064522418e-06, "loss": 0.4748, "step": 4155 }, { "epoch": 0.45796143250688703, "grad_norm": 9.821293830871582, "learning_rate": 5.743811708055364e-06, "loss": 0.3994, "step": 4156 }, { "epoch": 0.45807162534435264, "grad_norm": 4.618627071380615, "learning_rate": 5.7420826799322445e-06, "loss": 0.4188, "step": 4157 }, { "epoch": 0.4581818181818182, "grad_norm": 6.881746768951416, "learning_rate": 5.740353561066246e-06, "loss": 0.5055, "step": 4158 }, { "epoch": 0.45829201101928374, "grad_norm": 5.866497993469238, "learning_rate": 5.738624351668808e-06, "loss": 0.415, "step": 4159 }, { "epoch": 0.4584022038567493, "grad_norm": 5.360278606414795, "learning_rate": 5.736895051951382e-06, "loss": 0.4482, "step": 4160 }, { "epoch": 0.4585123966942149, "grad_norm": 6.263329982757568, "learning_rate": 5.735165662125426e-06, "loss": 0.3392, "step": 4161 }, { "epoch": 0.45862258953168045, "grad_norm": 6.2176313400268555, "learning_rate": 5.733436182402416e-06, "loss": 0.4368, "step": 4162 }, { "epoch": 0.458732782369146, "grad_norm": 7.075268268585205, "learning_rate": 5.7317066129938335e-06, "loss": 0.402, "step": 4163 }, { "epoch": 0.45884297520661155, "grad_norm": 11.643766403198242, "learning_rate": 5.729976954111171e-06, "loss": 0.4219, "step": 4164 }, { "epoch": 0.45895316804407715, "grad_norm": 5.848720550537109, "learning_rate": 5.728247205965936e-06, "loss": 0.3966, "step": 4165 }, { "epoch": 0.4590633608815427, "grad_norm": 10.475896835327148, "learning_rate": 5.726517368769644e-06, "loss": 0.496, "step": 4166 }, { "epoch": 0.45917355371900825, "grad_norm": 6.278916358947754, "learning_rate": 5.724787442733819e-06, "loss": 0.3825, "step": 4167 }, { "epoch": 0.4592837465564738, "grad_norm": 4.7572760581970215, "learning_rate": 5.723057428070003e-06, "loss": 0.3911, "step": 4168 }, { "epoch": 0.4593939393939394, "grad_norm": 9.122440338134766, "learning_rate": 5.721327324989743e-06, "loss": 0.4123, "step": 4169 }, { "epoch": 0.45950413223140496, "grad_norm": 4.627889156341553, "learning_rate": 5.719597133704597e-06, "loss": 0.4247, "step": 4170 }, { "epoch": 0.4596143250688705, "grad_norm": 10.681262969970703, "learning_rate": 5.717866854426135e-06, "loss": 0.3771, "step": 4171 }, { "epoch": 0.45972451790633606, "grad_norm": 10.229944229125977, "learning_rate": 5.7161364873659395e-06, "loss": 0.565, "step": 4172 }, { "epoch": 0.45983471074380167, "grad_norm": 9.267426490783691, "learning_rate": 5.714406032735602e-06, "loss": 0.38, "step": 4173 }, { "epoch": 0.4599449035812672, "grad_norm": 9.422104835510254, "learning_rate": 5.712675490746724e-06, "loss": 0.441, "step": 4174 }, { "epoch": 0.46005509641873277, "grad_norm": 4.980631351470947, "learning_rate": 5.710944861610919e-06, "loss": 0.3582, "step": 4175 }, { "epoch": 0.4601652892561984, "grad_norm": 7.520429611206055, "learning_rate": 5.709214145539811e-06, "loss": 0.3643, "step": 4176 }, { "epoch": 0.4602754820936639, "grad_norm": 8.083974838256836, "learning_rate": 5.707483342745032e-06, "loss": 0.4682, "step": 4177 }, { "epoch": 0.4603856749311295, "grad_norm": 5.137016296386719, "learning_rate": 5.705752453438231e-06, "loss": 0.3894, "step": 4178 }, { "epoch": 0.460495867768595, "grad_norm": 10.261585235595703, "learning_rate": 5.704021477831062e-06, "loss": 0.5258, "step": 4179 }, { "epoch": 0.46060606060606063, "grad_norm": 6.102723598480225, "learning_rate": 5.7022904161351886e-06, "loss": 0.3917, "step": 4180 }, { "epoch": 0.4607162534435262, "grad_norm": 4.687762260437012, "learning_rate": 5.70055926856229e-06, "loss": 0.4548, "step": 4181 }, { "epoch": 0.46082644628099173, "grad_norm": 9.035147666931152, "learning_rate": 5.698828035324051e-06, "loss": 0.432, "step": 4182 }, { "epoch": 0.4609366391184573, "grad_norm": 8.2828369140625, "learning_rate": 5.697096716632173e-06, "loss": 0.4721, "step": 4183 }, { "epoch": 0.4610468319559229, "grad_norm": 10.927952766418457, "learning_rate": 5.69536531269836e-06, "loss": 0.4251, "step": 4184 }, { "epoch": 0.46115702479338844, "grad_norm": 10.918237686157227, "learning_rate": 5.693633823734331e-06, "loss": 0.4486, "step": 4185 }, { "epoch": 0.461267217630854, "grad_norm": 9.484095573425293, "learning_rate": 5.69190224995182e-06, "loss": 0.4049, "step": 4186 }, { "epoch": 0.46137741046831954, "grad_norm": 5.832151412963867, "learning_rate": 5.690170591562557e-06, "loss": 0.4209, "step": 4187 }, { "epoch": 0.46148760330578514, "grad_norm": 8.604377746582031, "learning_rate": 5.6884388487782995e-06, "loss": 0.4892, "step": 4188 }, { "epoch": 0.4615977961432507, "grad_norm": 6.541306972503662, "learning_rate": 5.686707021810802e-06, "loss": 0.4282, "step": 4189 }, { "epoch": 0.46170798898071624, "grad_norm": 5.040317058563232, "learning_rate": 5.6849751108718395e-06, "loss": 0.4091, "step": 4190 }, { "epoch": 0.4618181818181818, "grad_norm": 6.780949592590332, "learning_rate": 5.68324311617319e-06, "loss": 0.4909, "step": 4191 }, { "epoch": 0.4619283746556474, "grad_norm": 9.71782112121582, "learning_rate": 5.681511037926643e-06, "loss": 0.4777, "step": 4192 }, { "epoch": 0.46203856749311295, "grad_norm": 12.609463691711426, "learning_rate": 5.679778876344001e-06, "loss": 0.4661, "step": 4193 }, { "epoch": 0.4621487603305785, "grad_norm": 7.244626522064209, "learning_rate": 5.678046631637074e-06, "loss": 0.4415, "step": 4194 }, { "epoch": 0.46225895316804405, "grad_norm": 8.343194007873535, "learning_rate": 5.676314304017684e-06, "loss": 0.3958, "step": 4195 }, { "epoch": 0.46236914600550966, "grad_norm": 5.753417015075684, "learning_rate": 5.674581893697663e-06, "loss": 0.3926, "step": 4196 }, { "epoch": 0.4624793388429752, "grad_norm": 6.718925952911377, "learning_rate": 5.6728494008888516e-06, "loss": 0.4558, "step": 4197 }, { "epoch": 0.46258953168044076, "grad_norm": 7.640260696411133, "learning_rate": 5.6711168258031e-06, "loss": 0.3741, "step": 4198 }, { "epoch": 0.46269972451790636, "grad_norm": 9.677931785583496, "learning_rate": 5.6693841686522734e-06, "loss": 0.3595, "step": 4199 }, { "epoch": 0.4628099173553719, "grad_norm": 6.4213948249816895, "learning_rate": 5.66765142964824e-06, "loss": 0.4104, "step": 4200 }, { "epoch": 0.46292011019283746, "grad_norm": 4.397367000579834, "learning_rate": 5.665918609002884e-06, "loss": 0.3712, "step": 4201 }, { "epoch": 0.463030303030303, "grad_norm": 7.3970746994018555, "learning_rate": 5.664185706928094e-06, "loss": 0.4252, "step": 4202 }, { "epoch": 0.4631404958677686, "grad_norm": 11.200392723083496, "learning_rate": 5.6624527236357754e-06, "loss": 0.4969, "step": 4203 }, { "epoch": 0.46325068870523417, "grad_norm": 6.623799800872803, "learning_rate": 5.6607196593378375e-06, "loss": 0.3434, "step": 4204 }, { "epoch": 0.4633608815426997, "grad_norm": 8.250639915466309, "learning_rate": 5.658986514246202e-06, "loss": 0.4288, "step": 4205 }, { "epoch": 0.46347107438016527, "grad_norm": 8.903459548950195, "learning_rate": 5.6572532885728e-06, "loss": 0.418, "step": 4206 }, { "epoch": 0.4635812672176309, "grad_norm": 6.643313884735107, "learning_rate": 5.655519982529574e-06, "loss": 0.3834, "step": 4207 }, { "epoch": 0.4636914600550964, "grad_norm": 6.704066276550293, "learning_rate": 5.653786596328472e-06, "loss": 0.3128, "step": 4208 }, { "epoch": 0.463801652892562, "grad_norm": 6.213980674743652, "learning_rate": 5.6520531301814595e-06, "loss": 0.447, "step": 4209 }, { "epoch": 0.46391184573002753, "grad_norm": 4.845897674560547, "learning_rate": 5.650319584300503e-06, "loss": 0.3957, "step": 4210 }, { "epoch": 0.46402203856749313, "grad_norm": 6.639408111572266, "learning_rate": 5.648585958897585e-06, "loss": 0.4439, "step": 4211 }, { "epoch": 0.4641322314049587, "grad_norm": 9.47696590423584, "learning_rate": 5.646852254184695e-06, "loss": 0.4725, "step": 4212 }, { "epoch": 0.46424242424242423, "grad_norm": 5.440971851348877, "learning_rate": 5.645118470373832e-06, "loss": 0.4025, "step": 4213 }, { "epoch": 0.4643526170798898, "grad_norm": 5.227582931518555, "learning_rate": 5.643384607677007e-06, "loss": 0.4273, "step": 4214 }, { "epoch": 0.4644628099173554, "grad_norm": 7.226681232452393, "learning_rate": 5.641650666306237e-06, "loss": 0.4511, "step": 4215 }, { "epoch": 0.46457300275482094, "grad_norm": 6.690056800842285, "learning_rate": 5.639916646473554e-06, "loss": 0.357, "step": 4216 }, { "epoch": 0.4646831955922865, "grad_norm": 9.158060073852539, "learning_rate": 5.6381825483909916e-06, "loss": 0.4276, "step": 4217 }, { "epoch": 0.46479338842975204, "grad_norm": 6.537561416625977, "learning_rate": 5.636448372270602e-06, "loss": 0.335, "step": 4218 }, { "epoch": 0.46490358126721765, "grad_norm": 6.055027484893799, "learning_rate": 5.634714118324442e-06, "loss": 0.4111, "step": 4219 }, { "epoch": 0.4650137741046832, "grad_norm": 18.14972496032715, "learning_rate": 5.6329797867645746e-06, "loss": 0.5513, "step": 4220 }, { "epoch": 0.46512396694214875, "grad_norm": 6.206079006195068, "learning_rate": 5.6312453778030806e-06, "loss": 0.4031, "step": 4221 }, { "epoch": 0.4652341597796143, "grad_norm": 7.26116943359375, "learning_rate": 5.629510891652045e-06, "loss": 0.4924, "step": 4222 }, { "epoch": 0.4653443526170799, "grad_norm": 5.4720377922058105, "learning_rate": 5.62777632852356e-06, "loss": 0.3969, "step": 4223 }, { "epoch": 0.46545454545454545, "grad_norm": 6.136850833892822, "learning_rate": 5.6260416886297356e-06, "loss": 0.4204, "step": 4224 }, { "epoch": 0.465564738292011, "grad_norm": 7.384100914001465, "learning_rate": 5.624306972182681e-06, "loss": 0.4294, "step": 4225 }, { "epoch": 0.4656749311294766, "grad_norm": 11.933074951171875, "learning_rate": 5.6225721793945235e-06, "loss": 0.4586, "step": 4226 }, { "epoch": 0.46578512396694216, "grad_norm": 11.574447631835938, "learning_rate": 5.6208373104773925e-06, "loss": 0.4163, "step": 4227 }, { "epoch": 0.4658953168044077, "grad_norm": 7.786038875579834, "learning_rate": 5.619102365643434e-06, "loss": 0.4239, "step": 4228 }, { "epoch": 0.46600550964187326, "grad_norm": 7.837298393249512, "learning_rate": 5.617367345104796e-06, "loss": 0.4201, "step": 4229 }, { "epoch": 0.46611570247933887, "grad_norm": 5.018138408660889, "learning_rate": 5.615632249073641e-06, "loss": 0.4, "step": 4230 }, { "epoch": 0.4662258953168044, "grad_norm": 7.820041179656982, "learning_rate": 5.613897077762136e-06, "loss": 0.3829, "step": 4231 }, { "epoch": 0.46633608815426997, "grad_norm": 6.930209159851074, "learning_rate": 5.612161831382465e-06, "loss": 0.3486, "step": 4232 }, { "epoch": 0.4664462809917355, "grad_norm": 5.2343974113464355, "learning_rate": 5.610426510146814e-06, "loss": 0.3918, "step": 4233 }, { "epoch": 0.4665564738292011, "grad_norm": 5.706419467926025, "learning_rate": 5.608691114267379e-06, "loss": 0.3262, "step": 4234 }, { "epoch": 0.4666666666666667, "grad_norm": 4.098259449005127, "learning_rate": 5.606955643956368e-06, "loss": 0.3802, "step": 4235 }, { "epoch": 0.4667768595041322, "grad_norm": 6.685352802276611, "learning_rate": 5.605220099425995e-06, "loss": 0.4617, "step": 4236 }, { "epoch": 0.4668870523415978, "grad_norm": 8.390470504760742, "learning_rate": 5.603484480888488e-06, "loss": 0.4822, "step": 4237 }, { "epoch": 0.4669972451790634, "grad_norm": 5.249129295349121, "learning_rate": 5.6017487885560784e-06, "loss": 0.4082, "step": 4238 }, { "epoch": 0.46710743801652893, "grad_norm": 5.53339958190918, "learning_rate": 5.600013022641009e-06, "loss": 0.4677, "step": 4239 }, { "epoch": 0.4672176308539945, "grad_norm": 8.485630989074707, "learning_rate": 5.598277183355533e-06, "loss": 0.4946, "step": 4240 }, { "epoch": 0.46732782369146003, "grad_norm": 4.0238189697265625, "learning_rate": 5.5965412709119094e-06, "loss": 0.3379, "step": 4241 }, { "epoch": 0.46743801652892564, "grad_norm": 5.816349029541016, "learning_rate": 5.594805285522411e-06, "loss": 0.3848, "step": 4242 }, { "epoch": 0.4675482093663912, "grad_norm": 13.12299633026123, "learning_rate": 5.593069227399312e-06, "loss": 0.5288, "step": 4243 }, { "epoch": 0.46765840220385674, "grad_norm": 6.124564170837402, "learning_rate": 5.591333096754903e-06, "loss": 0.3349, "step": 4244 }, { "epoch": 0.4677685950413223, "grad_norm": 5.709305286407471, "learning_rate": 5.589596893801479e-06, "loss": 0.398, "step": 4245 }, { "epoch": 0.4678787878787879, "grad_norm": 7.738457202911377, "learning_rate": 5.587860618751347e-06, "loss": 0.4516, "step": 4246 }, { "epoch": 0.46798898071625344, "grad_norm": 7.816864013671875, "learning_rate": 5.58612427181682e-06, "loss": 0.4699, "step": 4247 }, { "epoch": 0.468099173553719, "grad_norm": 9.974682807922363, "learning_rate": 5.58438785321022e-06, "loss": 0.355, "step": 4248 }, { "epoch": 0.4682093663911846, "grad_norm": 9.548007011413574, "learning_rate": 5.58265136314388e-06, "loss": 0.3784, "step": 4249 }, { "epoch": 0.46831955922865015, "grad_norm": 8.929972648620605, "learning_rate": 5.580914801830141e-06, "loss": 0.4947, "step": 4250 }, { "epoch": 0.4684297520661157, "grad_norm": 9.959912300109863, "learning_rate": 5.579178169481348e-06, "loss": 0.4054, "step": 4251 }, { "epoch": 0.46853994490358125, "grad_norm": 7.277345180511475, "learning_rate": 5.577441466309865e-06, "loss": 0.4526, "step": 4252 }, { "epoch": 0.46865013774104686, "grad_norm": 6.184348106384277, "learning_rate": 5.575704692528053e-06, "loss": 0.4462, "step": 4253 }, { "epoch": 0.4687603305785124, "grad_norm": 5.845827579498291, "learning_rate": 5.5739678483482895e-06, "loss": 0.3542, "step": 4254 }, { "epoch": 0.46887052341597796, "grad_norm": 5.441403865814209, "learning_rate": 5.572230933982958e-06, "loss": 0.3858, "step": 4255 }, { "epoch": 0.4689807162534435, "grad_norm": 7.008370876312256, "learning_rate": 5.570493949644452e-06, "loss": 0.3452, "step": 4256 }, { "epoch": 0.4690909090909091, "grad_norm": 9.76735782623291, "learning_rate": 5.56875689554517e-06, "loss": 0.4842, "step": 4257 }, { "epoch": 0.46920110192837466, "grad_norm": 8.114686965942383, "learning_rate": 5.56701977189752e-06, "loss": 0.4102, "step": 4258 }, { "epoch": 0.4693112947658402, "grad_norm": 7.080952167510986, "learning_rate": 5.565282578913924e-06, "loss": 0.4561, "step": 4259 }, { "epoch": 0.46942148760330576, "grad_norm": 4.5083794593811035, "learning_rate": 5.563545316806808e-06, "loss": 0.3941, "step": 4260 }, { "epoch": 0.46953168044077137, "grad_norm": 7.826779365539551, "learning_rate": 5.561807985788603e-06, "loss": 0.4642, "step": 4261 }, { "epoch": 0.4696418732782369, "grad_norm": 8.584476470947266, "learning_rate": 5.560070586071755e-06, "loss": 0.5064, "step": 4262 }, { "epoch": 0.46975206611570247, "grad_norm": 6.538086414337158, "learning_rate": 5.558333117868715e-06, "loss": 0.3951, "step": 4263 }, { "epoch": 0.469862258953168, "grad_norm": 6.985914707183838, "learning_rate": 5.556595581391941e-06, "loss": 0.3504, "step": 4264 }, { "epoch": 0.4699724517906336, "grad_norm": 5.304479122161865, "learning_rate": 5.554857976853907e-06, "loss": 0.3632, "step": 4265 }, { "epoch": 0.4700826446280992, "grad_norm": 4.482560157775879, "learning_rate": 5.553120304467082e-06, "loss": 0.3665, "step": 4266 }, { "epoch": 0.47019283746556473, "grad_norm": 8.54191780090332, "learning_rate": 5.551382564443958e-06, "loss": 0.4603, "step": 4267 }, { "epoch": 0.4703030303030303, "grad_norm": 8.821993827819824, "learning_rate": 5.549644756997023e-06, "loss": 0.423, "step": 4268 }, { "epoch": 0.4704132231404959, "grad_norm": 11.837578773498535, "learning_rate": 5.547906882338782e-06, "loss": 0.5689, "step": 4269 }, { "epoch": 0.47052341597796143, "grad_norm": 8.223700523376465, "learning_rate": 5.546168940681743e-06, "loss": 0.4629, "step": 4270 }, { "epoch": 0.470633608815427, "grad_norm": 6.382428169250488, "learning_rate": 5.544430932238423e-06, "loss": 0.4461, "step": 4271 }, { "epoch": 0.47074380165289254, "grad_norm": 8.722525596618652, "learning_rate": 5.542692857221348e-06, "loss": 0.3981, "step": 4272 }, { "epoch": 0.47085399449035814, "grad_norm": 5.7968010902404785, "learning_rate": 5.540954715843055e-06, "loss": 0.4261, "step": 4273 }, { "epoch": 0.4709641873278237, "grad_norm": 5.978255271911621, "learning_rate": 5.539216508316085e-06, "loss": 0.3708, "step": 4274 }, { "epoch": 0.47107438016528924, "grad_norm": 3.8172013759613037, "learning_rate": 5.537478234852988e-06, "loss": 0.4553, "step": 4275 }, { "epoch": 0.47118457300275485, "grad_norm": 7.099826335906982, "learning_rate": 5.535739895666321e-06, "loss": 0.3988, "step": 4276 }, { "epoch": 0.4712947658402204, "grad_norm": 6.397146701812744, "learning_rate": 5.5340014909686525e-06, "loss": 0.391, "step": 4277 }, { "epoch": 0.47140495867768595, "grad_norm": 6.066586494445801, "learning_rate": 5.532263020972556e-06, "loss": 0.3247, "step": 4278 }, { "epoch": 0.4715151515151515, "grad_norm": 6.35976505279541, "learning_rate": 5.530524485890614e-06, "loss": 0.4055, "step": 4279 }, { "epoch": 0.4716253443526171, "grad_norm": 4.074512958526611, "learning_rate": 5.528785885935418e-06, "loss": 0.3659, "step": 4280 }, { "epoch": 0.47173553719008265, "grad_norm": 6.480283737182617, "learning_rate": 5.527047221319566e-06, "loss": 0.3655, "step": 4281 }, { "epoch": 0.4718457300275482, "grad_norm": 3.880749225616455, "learning_rate": 5.525308492255662e-06, "loss": 0.4007, "step": 4282 }, { "epoch": 0.47195592286501376, "grad_norm": 5.906920909881592, "learning_rate": 5.523569698956324e-06, "loss": 0.3596, "step": 4283 }, { "epoch": 0.47206611570247936, "grad_norm": 4.660150527954102, "learning_rate": 5.521830841634172e-06, "loss": 0.386, "step": 4284 }, { "epoch": 0.4721763085399449, "grad_norm": 7.394484996795654, "learning_rate": 5.520091920501833e-06, "loss": 0.422, "step": 4285 }, { "epoch": 0.47228650137741046, "grad_norm": 7.227637767791748, "learning_rate": 5.51835293577195e-06, "loss": 0.4269, "step": 4286 }, { "epoch": 0.472396694214876, "grad_norm": 5.268207550048828, "learning_rate": 5.516613887657165e-06, "loss": 0.4106, "step": 4287 }, { "epoch": 0.4725068870523416, "grad_norm": 9.578835487365723, "learning_rate": 5.514874776370133e-06, "loss": 0.4801, "step": 4288 }, { "epoch": 0.47261707988980717, "grad_norm": 4.544835090637207, "learning_rate": 5.5131356021235135e-06, "loss": 0.3645, "step": 4289 }, { "epoch": 0.4727272727272727, "grad_norm": 8.72737979888916, "learning_rate": 5.511396365129975e-06, "loss": 0.4129, "step": 4290 }, { "epoch": 0.47283746556473827, "grad_norm": 8.959371566772461, "learning_rate": 5.509657065602197e-06, "loss": 0.4078, "step": 4291 }, { "epoch": 0.4729476584022039, "grad_norm": 5.4538702964782715, "learning_rate": 5.507917703752856e-06, "loss": 0.3698, "step": 4292 }, { "epoch": 0.4730578512396694, "grad_norm": 18.069997787475586, "learning_rate": 5.506178279794652e-06, "loss": 0.5317, "step": 4293 }, { "epoch": 0.473168044077135, "grad_norm": 12.249960899353027, "learning_rate": 5.5044387939402775e-06, "loss": 0.4586, "step": 4294 }, { "epoch": 0.4732782369146005, "grad_norm": 5.215136528015137, "learning_rate": 5.502699246402444e-06, "loss": 0.4257, "step": 4295 }, { "epoch": 0.47338842975206613, "grad_norm": 9.20561695098877, "learning_rate": 5.500959637393865e-06, "loss": 0.3994, "step": 4296 }, { "epoch": 0.4734986225895317, "grad_norm": 6.768822193145752, "learning_rate": 5.499219967127258e-06, "loss": 0.4311, "step": 4297 }, { "epoch": 0.47360881542699723, "grad_norm": 7.707272529602051, "learning_rate": 5.497480235815356e-06, "loss": 0.5008, "step": 4298 }, { "epoch": 0.47371900826446284, "grad_norm": 5.857189178466797, "learning_rate": 5.4957404436708975e-06, "loss": 0.3601, "step": 4299 }, { "epoch": 0.4738292011019284, "grad_norm": 9.859488487243652, "learning_rate": 5.494000590906622e-06, "loss": 0.5111, "step": 4300 }, { "epoch": 0.47393939393939394, "grad_norm": 6.382084369659424, "learning_rate": 5.492260677735284e-06, "loss": 0.427, "step": 4301 }, { "epoch": 0.4740495867768595, "grad_norm": 8.878406524658203, "learning_rate": 5.490520704369642e-06, "loss": 0.4398, "step": 4302 }, { "epoch": 0.4741597796143251, "grad_norm": 7.349577903747559, "learning_rate": 5.488780671022461e-06, "loss": 0.4158, "step": 4303 }, { "epoch": 0.47426997245179064, "grad_norm": 7.9784064292907715, "learning_rate": 5.487040577906515e-06, "loss": 0.4526, "step": 4304 }, { "epoch": 0.4743801652892562, "grad_norm": 13.937430381774902, "learning_rate": 5.485300425234587e-06, "loss": 0.4784, "step": 4305 }, { "epoch": 0.47449035812672175, "grad_norm": 8.099891662597656, "learning_rate": 5.483560213219464e-06, "loss": 0.4843, "step": 4306 }, { "epoch": 0.47460055096418735, "grad_norm": 5.307769298553467, "learning_rate": 5.4818199420739395e-06, "loss": 0.382, "step": 4307 }, { "epoch": 0.4747107438016529, "grad_norm": 13.030486106872559, "learning_rate": 5.480079612010819e-06, "loss": 0.5393, "step": 4308 }, { "epoch": 0.47482093663911845, "grad_norm": 6.650481700897217, "learning_rate": 5.478339223242912e-06, "loss": 0.4695, "step": 4309 }, { "epoch": 0.474931129476584, "grad_norm": 8.314249038696289, "learning_rate": 5.476598775983033e-06, "loss": 0.4093, "step": 4310 }, { "epoch": 0.4750413223140496, "grad_norm": 13.993112564086914, "learning_rate": 5.47485827044401e-06, "loss": 0.5179, "step": 4311 }, { "epoch": 0.47515151515151516, "grad_norm": 5.659000396728516, "learning_rate": 5.473117706838673e-06, "loss": 0.3622, "step": 4312 }, { "epoch": 0.4752617079889807, "grad_norm": 4.3064470291137695, "learning_rate": 5.471377085379858e-06, "loss": 0.3936, "step": 4313 }, { "epoch": 0.47537190082644626, "grad_norm": 6.03898811340332, "learning_rate": 5.469636406280416e-06, "loss": 0.4387, "step": 4314 }, { "epoch": 0.47548209366391186, "grad_norm": 4.539555549621582, "learning_rate": 5.467895669753194e-06, "loss": 0.2841, "step": 4315 }, { "epoch": 0.4755922865013774, "grad_norm": 6.276854038238525, "learning_rate": 5.466154876011055e-06, "loss": 0.4324, "step": 4316 }, { "epoch": 0.47570247933884297, "grad_norm": 9.656164169311523, "learning_rate": 5.464414025266863e-06, "loss": 0.4452, "step": 4317 }, { "epoch": 0.4758126721763085, "grad_norm": 4.501316070556641, "learning_rate": 5.462673117733493e-06, "loss": 0.4771, "step": 4318 }, { "epoch": 0.4759228650137741, "grad_norm": 9.54035758972168, "learning_rate": 5.460932153623829e-06, "loss": 0.3968, "step": 4319 }, { "epoch": 0.47603305785123967, "grad_norm": 5.434879779815674, "learning_rate": 5.459191133150753e-06, "loss": 0.3633, "step": 4320 }, { "epoch": 0.4761432506887052, "grad_norm": 5.331169128417969, "learning_rate": 5.457450056527162e-06, "loss": 0.432, "step": 4321 }, { "epoch": 0.47625344352617077, "grad_norm": 6.110133647918701, "learning_rate": 5.455708923965954e-06, "loss": 0.3912, "step": 4322 }, { "epoch": 0.4763636363636364, "grad_norm": 5.7163405418396, "learning_rate": 5.453967735680044e-06, "loss": 0.391, "step": 4323 }, { "epoch": 0.47647382920110193, "grad_norm": 4.875898361206055, "learning_rate": 5.4522264918823395e-06, "loss": 0.4557, "step": 4324 }, { "epoch": 0.4765840220385675, "grad_norm": 5.104773044586182, "learning_rate": 5.4504851927857664e-06, "loss": 0.4269, "step": 4325 }, { "epoch": 0.4766942148760331, "grad_norm": 4.523794651031494, "learning_rate": 5.448743838603252e-06, "loss": 0.3935, "step": 4326 }, { "epoch": 0.47680440771349863, "grad_norm": 7.606765270233154, "learning_rate": 5.447002429547732e-06, "loss": 0.4277, "step": 4327 }, { "epoch": 0.4769146005509642, "grad_norm": 4.95655632019043, "learning_rate": 5.445260965832146e-06, "loss": 0.4097, "step": 4328 }, { "epoch": 0.47702479338842974, "grad_norm": 4.617556571960449, "learning_rate": 5.443519447669445e-06, "loss": 0.443, "step": 4329 }, { "epoch": 0.47713498622589534, "grad_norm": 6.024190902709961, "learning_rate": 5.441777875272585e-06, "loss": 0.4358, "step": 4330 }, { "epoch": 0.4772451790633609, "grad_norm": 5.942342758178711, "learning_rate": 5.440036248854525e-06, "loss": 0.337, "step": 4331 }, { "epoch": 0.47735537190082644, "grad_norm": 6.1515398025512695, "learning_rate": 5.438294568628235e-06, "loss": 0.4493, "step": 4332 }, { "epoch": 0.477465564738292, "grad_norm": 5.799715042114258, "learning_rate": 5.43655283480669e-06, "loss": 0.3944, "step": 4333 }, { "epoch": 0.4775757575757576, "grad_norm": 5.636984348297119, "learning_rate": 5.4348110476028715e-06, "loss": 0.4278, "step": 4334 }, { "epoch": 0.47768595041322315, "grad_norm": 7.268592357635498, "learning_rate": 5.4330692072297665e-06, "loss": 0.3753, "step": 4335 }, { "epoch": 0.4777961432506887, "grad_norm": 5.62819766998291, "learning_rate": 5.431327313900371e-06, "loss": 0.3748, "step": 4336 }, { "epoch": 0.47790633608815425, "grad_norm": 5.308300971984863, "learning_rate": 5.4295853678276855e-06, "loss": 0.4211, "step": 4337 }, { "epoch": 0.47801652892561985, "grad_norm": 4.91331148147583, "learning_rate": 5.427843369224718e-06, "loss": 0.4165, "step": 4338 }, { "epoch": 0.4781267217630854, "grad_norm": 7.938823699951172, "learning_rate": 5.426101318304482e-06, "loss": 0.4165, "step": 4339 }, { "epoch": 0.47823691460055096, "grad_norm": 8.069595336914062, "learning_rate": 5.424359215279999e-06, "loss": 0.3445, "step": 4340 }, { "epoch": 0.4783471074380165, "grad_norm": 7.100213527679443, "learning_rate": 5.422617060364293e-06, "loss": 0.3952, "step": 4341 }, { "epoch": 0.4784573002754821, "grad_norm": 9.818364143371582, "learning_rate": 5.4208748537703995e-06, "loss": 0.3307, "step": 4342 }, { "epoch": 0.47856749311294766, "grad_norm": 9.175378799438477, "learning_rate": 5.419132595711357e-06, "loss": 0.4935, "step": 4343 }, { "epoch": 0.4786776859504132, "grad_norm": 6.158470630645752, "learning_rate": 5.417390286400213e-06, "loss": 0.3144, "step": 4344 }, { "epoch": 0.47878787878787876, "grad_norm": 10.078022003173828, "learning_rate": 5.415647926050016e-06, "loss": 0.4913, "step": 4345 }, { "epoch": 0.47889807162534437, "grad_norm": 6.890070915222168, "learning_rate": 5.413905514873825e-06, "loss": 0.4608, "step": 4346 }, { "epoch": 0.4790082644628099, "grad_norm": 6.087070465087891, "learning_rate": 5.412163053084709e-06, "loss": 0.3743, "step": 4347 }, { "epoch": 0.47911845730027547, "grad_norm": 6.422086238861084, "learning_rate": 5.410420540895731e-06, "loss": 0.4777, "step": 4348 }, { "epoch": 0.4792286501377411, "grad_norm": 7.705680847167969, "learning_rate": 5.408677978519975e-06, "loss": 0.4404, "step": 4349 }, { "epoch": 0.4793388429752066, "grad_norm": 8.673178672790527, "learning_rate": 5.406935366170518e-06, "loss": 0.5056, "step": 4350 }, { "epoch": 0.4794490358126722, "grad_norm": 6.674211502075195, "learning_rate": 5.405192704060454e-06, "loss": 0.4401, "step": 4351 }, { "epoch": 0.4795592286501377, "grad_norm": 11.007810592651367, "learning_rate": 5.403449992402875e-06, "loss": 0.3834, "step": 4352 }, { "epoch": 0.47966942148760333, "grad_norm": 6.436962604522705, "learning_rate": 5.401707231410881e-06, "loss": 0.3686, "step": 4353 }, { "epoch": 0.4797796143250689, "grad_norm": 8.79969596862793, "learning_rate": 5.399964421297583e-06, "loss": 0.4908, "step": 4354 }, { "epoch": 0.47988980716253443, "grad_norm": 9.806441307067871, "learning_rate": 5.398221562276092e-06, "loss": 0.3788, "step": 4355 }, { "epoch": 0.48, "grad_norm": 8.346846580505371, "learning_rate": 5.396478654559527e-06, "loss": 0.4135, "step": 4356 }, { "epoch": 0.4801101928374656, "grad_norm": 9.595311164855957, "learning_rate": 5.394735698361015e-06, "loss": 0.3901, "step": 4357 }, { "epoch": 0.48022038567493114, "grad_norm": 5.470604419708252, "learning_rate": 5.392992693893684e-06, "loss": 0.4604, "step": 4358 }, { "epoch": 0.4803305785123967, "grad_norm": 9.696976661682129, "learning_rate": 5.391249641370673e-06, "loss": 0.4023, "step": 4359 }, { "epoch": 0.48044077134986224, "grad_norm": 4.56251335144043, "learning_rate": 5.389506541005125e-06, "loss": 0.3888, "step": 4360 }, { "epoch": 0.48055096418732784, "grad_norm": 11.087410926818848, "learning_rate": 5.387763393010187e-06, "loss": 0.3778, "step": 4361 }, { "epoch": 0.4806611570247934, "grad_norm": 6.750298500061035, "learning_rate": 5.386020197599016e-06, "loss": 0.4241, "step": 4362 }, { "epoch": 0.48077134986225895, "grad_norm": 4.112381458282471, "learning_rate": 5.384276954984769e-06, "loss": 0.2738, "step": 4363 }, { "epoch": 0.4808815426997245, "grad_norm": 8.764153480529785, "learning_rate": 5.3825336653806144e-06, "loss": 0.4023, "step": 4364 }, { "epoch": 0.4809917355371901, "grad_norm": 4.686344146728516, "learning_rate": 5.380790328999726e-06, "loss": 0.3965, "step": 4365 }, { "epoch": 0.48110192837465565, "grad_norm": 6.162919998168945, "learning_rate": 5.379046946055276e-06, "loss": 0.3832, "step": 4366 }, { "epoch": 0.4812121212121212, "grad_norm": 8.729318618774414, "learning_rate": 5.3773035167604516e-06, "loss": 0.5119, "step": 4367 }, { "epoch": 0.48132231404958675, "grad_norm": 6.998101711273193, "learning_rate": 5.375560041328441e-06, "loss": 0.4262, "step": 4368 }, { "epoch": 0.48143250688705236, "grad_norm": 13.333380699157715, "learning_rate": 5.373816519972438e-06, "loss": 0.4887, "step": 4369 }, { "epoch": 0.4815426997245179, "grad_norm": 5.41016960144043, "learning_rate": 5.3720729529056425e-06, "loss": 0.3149, "step": 4370 }, { "epoch": 0.48165289256198346, "grad_norm": 11.839273452758789, "learning_rate": 5.370329340341261e-06, "loss": 0.5802, "step": 4371 }, { "epoch": 0.481763085399449, "grad_norm": 9.279288291931152, "learning_rate": 5.3685856824925066e-06, "loss": 0.4462, "step": 4372 }, { "epoch": 0.4818732782369146, "grad_norm": 5.733633041381836, "learning_rate": 5.3668419795725925e-06, "loss": 0.4337, "step": 4373 }, { "epoch": 0.48198347107438017, "grad_norm": 6.996150016784668, "learning_rate": 5.365098231794743e-06, "loss": 0.4301, "step": 4374 }, { "epoch": 0.4820936639118457, "grad_norm": 5.667147159576416, "learning_rate": 5.363354439372188e-06, "loss": 0.3247, "step": 4375 }, { "epoch": 0.4822038567493113, "grad_norm": 5.164324760437012, "learning_rate": 5.361610602518156e-06, "loss": 0.3992, "step": 4376 }, { "epoch": 0.48231404958677687, "grad_norm": 8.832839012145996, "learning_rate": 5.3598667214458875e-06, "loss": 0.4427, "step": 4377 }, { "epoch": 0.4824242424242424, "grad_norm": 9.516709327697754, "learning_rate": 5.35812279636863e-06, "loss": 0.4116, "step": 4378 }, { "epoch": 0.482534435261708, "grad_norm": 6.538336277008057, "learning_rate": 5.35637882749963e-06, "loss": 0.4716, "step": 4379 }, { "epoch": 0.4826446280991736, "grad_norm": 8.16253662109375, "learning_rate": 5.354634815052142e-06, "loss": 0.3644, "step": 4380 }, { "epoch": 0.48275482093663913, "grad_norm": 5.015005111694336, "learning_rate": 5.3528907592394275e-06, "loss": 0.3886, "step": 4381 }, { "epoch": 0.4828650137741047, "grad_norm": 5.4582319259643555, "learning_rate": 5.351146660274751e-06, "loss": 0.4306, "step": 4382 }, { "epoch": 0.48297520661157023, "grad_norm": 8.287875175476074, "learning_rate": 5.349402518371385e-06, "loss": 0.483, "step": 4383 }, { "epoch": 0.48308539944903583, "grad_norm": 5.849573612213135, "learning_rate": 5.347658333742604e-06, "loss": 0.4077, "step": 4384 }, { "epoch": 0.4831955922865014, "grad_norm": 7.98061466217041, "learning_rate": 5.34591410660169e-06, "loss": 0.3696, "step": 4385 }, { "epoch": 0.48330578512396694, "grad_norm": 7.349559783935547, "learning_rate": 5.344169837161929e-06, "loss": 0.3966, "step": 4386 }, { "epoch": 0.4834159779614325, "grad_norm": 13.274238586425781, "learning_rate": 5.3424255256366105e-06, "loss": 0.4603, "step": 4387 }, { "epoch": 0.4835261707988981, "grad_norm": 8.205038070678711, "learning_rate": 5.340681172239037e-06, "loss": 0.43, "step": 4388 }, { "epoch": 0.48363636363636364, "grad_norm": 5.981983184814453, "learning_rate": 5.3389367771825065e-06, "loss": 0.38, "step": 4389 }, { "epoch": 0.4837465564738292, "grad_norm": 7.728749752044678, "learning_rate": 5.337192340680325e-06, "loss": 0.4007, "step": 4390 }, { "epoch": 0.48385674931129474, "grad_norm": 8.33792781829834, "learning_rate": 5.335447862945806e-06, "loss": 0.407, "step": 4391 }, { "epoch": 0.48396694214876035, "grad_norm": 5.418024063110352, "learning_rate": 5.333703344192267e-06, "loss": 0.2849, "step": 4392 }, { "epoch": 0.4840771349862259, "grad_norm": 7.042039394378662, "learning_rate": 5.331958784633031e-06, "loss": 0.4277, "step": 4393 }, { "epoch": 0.48418732782369145, "grad_norm": 8.459491729736328, "learning_rate": 5.330214184481422e-06, "loss": 0.3693, "step": 4394 }, { "epoch": 0.484297520661157, "grad_norm": 7.763633728027344, "learning_rate": 5.328469543950776e-06, "loss": 0.3237, "step": 4395 }, { "epoch": 0.4844077134986226, "grad_norm": 9.964437484741211, "learning_rate": 5.326724863254428e-06, "loss": 0.414, "step": 4396 }, { "epoch": 0.48451790633608816, "grad_norm": 8.47101879119873, "learning_rate": 5.324980142605718e-06, "loss": 0.4962, "step": 4397 }, { "epoch": 0.4846280991735537, "grad_norm": 25.347455978393555, "learning_rate": 5.323235382217995e-06, "loss": 0.6602, "step": 4398 }, { "epoch": 0.48473829201101926, "grad_norm": 7.682060718536377, "learning_rate": 5.3214905823046106e-06, "loss": 0.4277, "step": 4399 }, { "epoch": 0.48484848484848486, "grad_norm": 12.011445999145508, "learning_rate": 5.319745743078922e-06, "loss": 0.4978, "step": 4400 }, { "epoch": 0.4849586776859504, "grad_norm": 6.436575412750244, "learning_rate": 5.318000864754289e-06, "loss": 0.3284, "step": 4401 }, { "epoch": 0.48506887052341596, "grad_norm": 8.122916221618652, "learning_rate": 5.316255947544078e-06, "loss": 0.3877, "step": 4402 }, { "epoch": 0.48517906336088157, "grad_norm": 7.619511127471924, "learning_rate": 5.314510991661662e-06, "loss": 0.3095, "step": 4403 }, { "epoch": 0.4852892561983471, "grad_norm": 4.5794997215271, "learning_rate": 5.312765997320413e-06, "loss": 0.4292, "step": 4404 }, { "epoch": 0.48539944903581267, "grad_norm": 5.267623424530029, "learning_rate": 5.311020964733712e-06, "loss": 0.4613, "step": 4405 }, { "epoch": 0.4855096418732782, "grad_norm": 7.349124431610107, "learning_rate": 5.309275894114947e-06, "loss": 0.3512, "step": 4406 }, { "epoch": 0.4856198347107438, "grad_norm": 5.92105770111084, "learning_rate": 5.307530785677505e-06, "loss": 0.2938, "step": 4407 }, { "epoch": 0.4857300275482094, "grad_norm": 6.2049360275268555, "learning_rate": 5.30578563963478e-06, "loss": 0.4101, "step": 4408 }, { "epoch": 0.4858402203856749, "grad_norm": 8.178729057312012, "learning_rate": 5.304040456200172e-06, "loss": 0.3573, "step": 4409 }, { "epoch": 0.4859504132231405, "grad_norm": 5.729934215545654, "learning_rate": 5.302295235587085e-06, "loss": 0.3723, "step": 4410 }, { "epoch": 0.4860606060606061, "grad_norm": 7.468725204467773, "learning_rate": 5.300549978008925e-06, "loss": 0.4322, "step": 4411 }, { "epoch": 0.48617079889807163, "grad_norm": 7.304121494293213, "learning_rate": 5.298804683679105e-06, "loss": 0.3723, "step": 4412 }, { "epoch": 0.4862809917355372, "grad_norm": 5.810751438140869, "learning_rate": 5.297059352811044e-06, "loss": 0.439, "step": 4413 }, { "epoch": 0.48639118457300273, "grad_norm": 6.385992527008057, "learning_rate": 5.29531398561816e-06, "loss": 0.3924, "step": 4414 }, { "epoch": 0.48650137741046834, "grad_norm": 8.34052562713623, "learning_rate": 5.293568582313882e-06, "loss": 0.4946, "step": 4415 }, { "epoch": 0.4866115702479339, "grad_norm": 10.94664192199707, "learning_rate": 5.291823143111639e-06, "loss": 0.431, "step": 4416 }, { "epoch": 0.48672176308539944, "grad_norm": 6.250720500946045, "learning_rate": 5.290077668224865e-06, "loss": 0.447, "step": 4417 }, { "epoch": 0.486831955922865, "grad_norm": 9.11233901977539, "learning_rate": 5.288332157866999e-06, "loss": 0.4224, "step": 4418 }, { "epoch": 0.4869421487603306, "grad_norm": 7.473569393157959, "learning_rate": 5.286586612251485e-06, "loss": 0.5118, "step": 4419 }, { "epoch": 0.48705234159779615, "grad_norm": 5.473938465118408, "learning_rate": 5.284841031591772e-06, "loss": 0.4349, "step": 4420 }, { "epoch": 0.4871625344352617, "grad_norm": 7.55420446395874, "learning_rate": 5.283095416101312e-06, "loss": 0.3926, "step": 4421 }, { "epoch": 0.48727272727272725, "grad_norm": 6.22142219543457, "learning_rate": 5.2813497659935575e-06, "loss": 0.3938, "step": 4422 }, { "epoch": 0.48738292011019285, "grad_norm": 4.999783515930176, "learning_rate": 5.279604081481973e-06, "loss": 0.436, "step": 4423 }, { "epoch": 0.4874931129476584, "grad_norm": 6.7879533767700195, "learning_rate": 5.27785836278002e-06, "loss": 0.4117, "step": 4424 }, { "epoch": 0.48760330578512395, "grad_norm": 3.9674224853515625, "learning_rate": 5.27611261010117e-06, "loss": 0.3627, "step": 4425 }, { "epoch": 0.48771349862258956, "grad_norm": 7.521617889404297, "learning_rate": 5.274366823658895e-06, "loss": 0.4763, "step": 4426 }, { "epoch": 0.4878236914600551, "grad_norm": 12.487852096557617, "learning_rate": 5.272621003666671e-06, "loss": 0.3876, "step": 4427 }, { "epoch": 0.48793388429752066, "grad_norm": 10.022464752197266, "learning_rate": 5.270875150337982e-06, "loss": 0.4421, "step": 4428 }, { "epoch": 0.4880440771349862, "grad_norm": 13.987154960632324, "learning_rate": 5.269129263886312e-06, "loss": 0.4146, "step": 4429 }, { "epoch": 0.4881542699724518, "grad_norm": 5.220799922943115, "learning_rate": 5.267383344525148e-06, "loss": 0.4125, "step": 4430 }, { "epoch": 0.48826446280991737, "grad_norm": 9.151785850524902, "learning_rate": 5.265637392467986e-06, "loss": 0.4678, "step": 4431 }, { "epoch": 0.4883746556473829, "grad_norm": 7.962405681610107, "learning_rate": 5.263891407928324e-06, "loss": 0.3437, "step": 4432 }, { "epoch": 0.48848484848484847, "grad_norm": 5.536421298980713, "learning_rate": 5.26214539111966e-06, "loss": 0.3515, "step": 4433 }, { "epoch": 0.48859504132231407, "grad_norm": 8.647902488708496, "learning_rate": 5.260399342255504e-06, "loss": 0.502, "step": 4434 }, { "epoch": 0.4887052341597796, "grad_norm": 5.995055198669434, "learning_rate": 5.258653261549363e-06, "loss": 0.4266, "step": 4435 }, { "epoch": 0.4888154269972452, "grad_norm": 8.032722473144531, "learning_rate": 5.2569071492147474e-06, "loss": 0.4477, "step": 4436 }, { "epoch": 0.4889256198347107, "grad_norm": 7.697789669036865, "learning_rate": 5.255161005465177e-06, "loss": 0.4765, "step": 4437 }, { "epoch": 0.48903581267217633, "grad_norm": 6.443562030792236, "learning_rate": 5.253414830514174e-06, "loss": 0.3692, "step": 4438 }, { "epoch": 0.4891460055096419, "grad_norm": 4.5295586585998535, "learning_rate": 5.2516686245752605e-06, "loss": 0.3371, "step": 4439 }, { "epoch": 0.48925619834710743, "grad_norm": 8.297006607055664, "learning_rate": 5.249922387861964e-06, "loss": 0.4633, "step": 4440 }, { "epoch": 0.489366391184573, "grad_norm": 4.661348342895508, "learning_rate": 5.248176120587821e-06, "loss": 0.3234, "step": 4441 }, { "epoch": 0.4894765840220386, "grad_norm": 5.500077247619629, "learning_rate": 5.246429822966363e-06, "loss": 0.4505, "step": 4442 }, { "epoch": 0.48958677685950414, "grad_norm": 6.7624006271362305, "learning_rate": 5.244683495211132e-06, "loss": 0.3474, "step": 4443 }, { "epoch": 0.4896969696969697, "grad_norm": 7.952180862426758, "learning_rate": 5.242937137535672e-06, "loss": 0.415, "step": 4444 }, { "epoch": 0.48980716253443524, "grad_norm": 8.214971542358398, "learning_rate": 5.2411907501535285e-06, "loss": 0.4879, "step": 4445 }, { "epoch": 0.48991735537190084, "grad_norm": 7.120847702026367, "learning_rate": 5.239444333278251e-06, "loss": 0.5096, "step": 4446 }, { "epoch": 0.4900275482093664, "grad_norm": 10.56857681274414, "learning_rate": 5.237697887123396e-06, "loss": 0.4134, "step": 4447 }, { "epoch": 0.49013774104683194, "grad_norm": 7.67206335067749, "learning_rate": 5.23595141190252e-06, "loss": 0.5067, "step": 4448 }, { "epoch": 0.4902479338842975, "grad_norm": 4.152846336364746, "learning_rate": 5.234204907829187e-06, "loss": 0.3521, "step": 4449 }, { "epoch": 0.4903581267217631, "grad_norm": 11.548025131225586, "learning_rate": 5.232458375116956e-06, "loss": 0.472, "step": 4450 }, { "epoch": 0.49046831955922865, "grad_norm": 14.106864929199219, "learning_rate": 5.2307118139794015e-06, "loss": 0.4055, "step": 4451 }, { "epoch": 0.4905785123966942, "grad_norm": 7.866242408752441, "learning_rate": 5.228965224630094e-06, "loss": 0.4024, "step": 4452 }, { "epoch": 0.4906887052341598, "grad_norm": 7.196980953216553, "learning_rate": 5.227218607282606e-06, "loss": 0.4326, "step": 4453 }, { "epoch": 0.49079889807162536, "grad_norm": 9.129240989685059, "learning_rate": 5.225471962150519e-06, "loss": 0.4973, "step": 4454 }, { "epoch": 0.4909090909090909, "grad_norm": 7.125060558319092, "learning_rate": 5.223725289447413e-06, "loss": 0.4516, "step": 4455 }, { "epoch": 0.49101928374655646, "grad_norm": 6.00028657913208, "learning_rate": 5.221978589386876e-06, "loss": 0.4101, "step": 4456 }, { "epoch": 0.49112947658402206, "grad_norm": 4.266380786895752, "learning_rate": 5.220231862182495e-06, "loss": 0.3677, "step": 4457 }, { "epoch": 0.4912396694214876, "grad_norm": 6.081855297088623, "learning_rate": 5.218485108047862e-06, "loss": 0.2898, "step": 4458 }, { "epoch": 0.49134986225895316, "grad_norm": 7.620930194854736, "learning_rate": 5.2167383271965745e-06, "loss": 0.5123, "step": 4459 }, { "epoch": 0.4914600550964187, "grad_norm": 5.6011247634887695, "learning_rate": 5.21499151984223e-06, "loss": 0.3554, "step": 4460 }, { "epoch": 0.4915702479338843, "grad_norm": 9.127986907958984, "learning_rate": 5.2132446861984285e-06, "loss": 0.5321, "step": 4461 }, { "epoch": 0.49168044077134987, "grad_norm": 9.018638610839844, "learning_rate": 5.21149782647878e-06, "loss": 0.496, "step": 4462 }, { "epoch": 0.4917906336088154, "grad_norm": 5.467581272125244, "learning_rate": 5.2097509408968884e-06, "loss": 0.4314, "step": 4463 }, { "epoch": 0.49190082644628097, "grad_norm": 3.5782599449157715, "learning_rate": 5.208004029666366e-06, "loss": 0.4047, "step": 4464 }, { "epoch": 0.4920110192837466, "grad_norm": 6.758794784545898, "learning_rate": 5.20625709300083e-06, "loss": 0.4573, "step": 4465 }, { "epoch": 0.4921212121212121, "grad_norm": 6.822628974914551, "learning_rate": 5.204510131113896e-06, "loss": 0.4063, "step": 4466 }, { "epoch": 0.4922314049586777, "grad_norm": 7.487907409667969, "learning_rate": 5.202763144219185e-06, "loss": 0.4272, "step": 4467 }, { "epoch": 0.4923415977961432, "grad_norm": 8.487168312072754, "learning_rate": 5.20101613253032e-06, "loss": 0.4357, "step": 4468 }, { "epoch": 0.49245179063360883, "grad_norm": 11.44509506225586, "learning_rate": 5.199269096260932e-06, "loss": 0.5114, "step": 4469 }, { "epoch": 0.4925619834710744, "grad_norm": 10.665582656860352, "learning_rate": 5.197522035624647e-06, "loss": 0.3898, "step": 4470 }, { "epoch": 0.49267217630853993, "grad_norm": 6.364475727081299, "learning_rate": 5.195774950835098e-06, "loss": 0.4094, "step": 4471 }, { "epoch": 0.4927823691460055, "grad_norm": 6.148874759674072, "learning_rate": 5.194027842105923e-06, "loss": 0.4306, "step": 4472 }, { "epoch": 0.4928925619834711, "grad_norm": 11.46756362915039, "learning_rate": 5.192280709650761e-06, "loss": 0.4999, "step": 4473 }, { "epoch": 0.49300275482093664, "grad_norm": 9.750408172607422, "learning_rate": 5.19053355368325e-06, "loss": 0.3532, "step": 4474 }, { "epoch": 0.4931129476584022, "grad_norm": 5.653137683868408, "learning_rate": 5.188786374417039e-06, "loss": 0.3562, "step": 4475 }, { "epoch": 0.4932231404958678, "grad_norm": 6.068719387054443, "learning_rate": 5.187039172065773e-06, "loss": 0.4124, "step": 4476 }, { "epoch": 0.49333333333333335, "grad_norm": 7.310559272766113, "learning_rate": 5.185291946843104e-06, "loss": 0.4666, "step": 4477 }, { "epoch": 0.4934435261707989, "grad_norm": 7.6926589012146, "learning_rate": 5.183544698962685e-06, "loss": 0.4709, "step": 4478 }, { "epoch": 0.49355371900826445, "grad_norm": 4.070844650268555, "learning_rate": 5.181797428638168e-06, "loss": 0.4124, "step": 4479 }, { "epoch": 0.49366391184573005, "grad_norm": 4.346336841583252, "learning_rate": 5.180050136083218e-06, "loss": 0.341, "step": 4480 }, { "epoch": 0.4937741046831956, "grad_norm": 6.669896125793457, "learning_rate": 5.178302821511489e-06, "loss": 0.4988, "step": 4481 }, { "epoch": 0.49388429752066115, "grad_norm": 11.357707023620605, "learning_rate": 5.176555485136652e-06, "loss": 0.4435, "step": 4482 }, { "epoch": 0.4939944903581267, "grad_norm": 5.121276378631592, "learning_rate": 5.174808127172367e-06, "loss": 0.3446, "step": 4483 }, { "epoch": 0.4941046831955923, "grad_norm": 7.8835530281066895, "learning_rate": 5.1730607478323095e-06, "loss": 0.3309, "step": 4484 }, { "epoch": 0.49421487603305786, "grad_norm": 4.873608589172363, "learning_rate": 5.171313347330148e-06, "loss": 0.4474, "step": 4485 }, { "epoch": 0.4943250688705234, "grad_norm": 5.673107147216797, "learning_rate": 5.169565925879557e-06, "loss": 0.4362, "step": 4486 }, { "epoch": 0.49443526170798896, "grad_norm": 9.590445518493652, "learning_rate": 5.167818483694216e-06, "loss": 0.428, "step": 4487 }, { "epoch": 0.49454545454545457, "grad_norm": 5.7733001708984375, "learning_rate": 5.166071020987802e-06, "loss": 0.3635, "step": 4488 }, { "epoch": 0.4946556473829201, "grad_norm": 11.6152925491333, "learning_rate": 5.164323537973996e-06, "loss": 0.3634, "step": 4489 }, { "epoch": 0.49476584022038567, "grad_norm": 7.276978015899658, "learning_rate": 5.162576034866486e-06, "loss": 0.5251, "step": 4490 }, { "epoch": 0.4948760330578512, "grad_norm": 9.533393859863281, "learning_rate": 5.160828511878959e-06, "loss": 0.459, "step": 4491 }, { "epoch": 0.4949862258953168, "grad_norm": 6.698780536651611, "learning_rate": 5.159080969225101e-06, "loss": 0.432, "step": 4492 }, { "epoch": 0.4950964187327824, "grad_norm": 5.994406223297119, "learning_rate": 5.157333407118608e-06, "loss": 0.4019, "step": 4493 }, { "epoch": 0.4952066115702479, "grad_norm": 7.2543625831604, "learning_rate": 5.155585825773172e-06, "loss": 0.4129, "step": 4494 }, { "epoch": 0.4953168044077135, "grad_norm": 8.09793472290039, "learning_rate": 5.153838225402489e-06, "loss": 0.4901, "step": 4495 }, { "epoch": 0.4954269972451791, "grad_norm": 4.807717800140381, "learning_rate": 5.152090606220258e-06, "loss": 0.4073, "step": 4496 }, { "epoch": 0.49553719008264463, "grad_norm": 7.383471965789795, "learning_rate": 5.150342968440181e-06, "loss": 0.4761, "step": 4497 }, { "epoch": 0.4956473829201102, "grad_norm": 5.512248516082764, "learning_rate": 5.148595312275964e-06, "loss": 0.4348, "step": 4498 }, { "epoch": 0.49575757575757573, "grad_norm": 7.76236629486084, "learning_rate": 5.146847637941308e-06, "loss": 0.3897, "step": 4499 }, { "epoch": 0.49586776859504134, "grad_norm": 9.837713241577148, "learning_rate": 5.145099945649925e-06, "loss": 0.4142, "step": 4500 }, { "epoch": 0.4959779614325069, "grad_norm": 6.959941864013672, "learning_rate": 5.143352235615523e-06, "loss": 0.4046, "step": 4501 }, { "epoch": 0.49608815426997244, "grad_norm": 7.133990287780762, "learning_rate": 5.141604508051814e-06, "loss": 0.467, "step": 4502 }, { "epoch": 0.49619834710743804, "grad_norm": 4.255684852600098, "learning_rate": 5.139856763172515e-06, "loss": 0.4422, "step": 4503 }, { "epoch": 0.4963085399449036, "grad_norm": 7.363885402679443, "learning_rate": 5.13810900119134e-06, "loss": 0.4305, "step": 4504 }, { "epoch": 0.49641873278236914, "grad_norm": 7.709430694580078, "learning_rate": 5.136361222322009e-06, "loss": 0.371, "step": 4505 }, { "epoch": 0.4965289256198347, "grad_norm": 4.840433120727539, "learning_rate": 5.134613426778242e-06, "loss": 0.3803, "step": 4506 }, { "epoch": 0.4966391184573003, "grad_norm": 7.726146697998047, "learning_rate": 5.1328656147737625e-06, "loss": 0.4255, "step": 4507 }, { "epoch": 0.49674931129476585, "grad_norm": 3.6370415687561035, "learning_rate": 5.131117786522296e-06, "loss": 0.386, "step": 4508 }, { "epoch": 0.4968595041322314, "grad_norm": 5.870304107666016, "learning_rate": 5.129369942237567e-06, "loss": 0.481, "step": 4509 }, { "epoch": 0.49696969696969695, "grad_norm": 6.260837554931641, "learning_rate": 5.127622082133307e-06, "loss": 0.3926, "step": 4510 }, { "epoch": 0.49707988980716256, "grad_norm": 7.427427291870117, "learning_rate": 5.125874206423245e-06, "loss": 0.4743, "step": 4511 }, { "epoch": 0.4971900826446281, "grad_norm": 4.559499263763428, "learning_rate": 5.124126315321114e-06, "loss": 0.3297, "step": 4512 }, { "epoch": 0.49730027548209366, "grad_norm": 5.440631866455078, "learning_rate": 5.122378409040649e-06, "loss": 0.3712, "step": 4513 }, { "epoch": 0.4974104683195592, "grad_norm": 6.5577712059021, "learning_rate": 5.120630487795585e-06, "loss": 0.399, "step": 4514 }, { "epoch": 0.4975206611570248, "grad_norm": 4.975412845611572, "learning_rate": 5.118882551799662e-06, "loss": 0.3959, "step": 4515 }, { "epoch": 0.49763085399449036, "grad_norm": 4.3721537590026855, "learning_rate": 5.117134601266619e-06, "loss": 0.42, "step": 4516 }, { "epoch": 0.4977410468319559, "grad_norm": 5.379798412322998, "learning_rate": 5.1153866364101964e-06, "loss": 0.414, "step": 4517 }, { "epoch": 0.49785123966942146, "grad_norm": 7.518587589263916, "learning_rate": 5.113638657444141e-06, "loss": 0.4546, "step": 4518 }, { "epoch": 0.49796143250688707, "grad_norm": 8.444965362548828, "learning_rate": 5.111890664582196e-06, "loss": 0.4242, "step": 4519 }, { "epoch": 0.4980716253443526, "grad_norm": 7.5318989753723145, "learning_rate": 5.110142658038107e-06, "loss": 0.4095, "step": 4520 }, { "epoch": 0.49818181818181817, "grad_norm": 4.474093914031982, "learning_rate": 5.108394638025626e-06, "loss": 0.3704, "step": 4521 }, { "epoch": 0.4982920110192837, "grad_norm": 5.056334495544434, "learning_rate": 5.106646604758501e-06, "loss": 0.3922, "step": 4522 }, { "epoch": 0.4984022038567493, "grad_norm": 8.53176498413086, "learning_rate": 5.104898558450484e-06, "loss": 0.4607, "step": 4523 }, { "epoch": 0.4985123966942149, "grad_norm": 6.229320526123047, "learning_rate": 5.103150499315328e-06, "loss": 0.3709, "step": 4524 }, { "epoch": 0.4986225895316804, "grad_norm": 4.518656253814697, "learning_rate": 5.101402427566789e-06, "loss": 0.3943, "step": 4525 }, { "epoch": 0.49873278236914603, "grad_norm": 6.313726425170898, "learning_rate": 5.0996543434186254e-06, "loss": 0.4078, "step": 4526 }, { "epoch": 0.4988429752066116, "grad_norm": 7.320578575134277, "learning_rate": 5.097906247084592e-06, "loss": 0.4082, "step": 4527 }, { "epoch": 0.49895316804407713, "grad_norm": 7.043966293334961, "learning_rate": 5.0961581387784495e-06, "loss": 0.3463, "step": 4528 }, { "epoch": 0.4990633608815427, "grad_norm": 10.093509674072266, "learning_rate": 5.09441001871396e-06, "loss": 0.499, "step": 4529 }, { "epoch": 0.4991735537190083, "grad_norm": 9.356034278869629, "learning_rate": 5.092661887104883e-06, "loss": 0.4536, "step": 4530 }, { "epoch": 0.49928374655647384, "grad_norm": 5.936094284057617, "learning_rate": 5.090913744164987e-06, "loss": 0.3736, "step": 4531 }, { "epoch": 0.4993939393939394, "grad_norm": 5.618441104888916, "learning_rate": 5.0891655901080325e-06, "loss": 0.408, "step": 4532 }, { "epoch": 0.49950413223140494, "grad_norm": 5.921712875366211, "learning_rate": 5.08741742514779e-06, "loss": 0.4293, "step": 4533 }, { "epoch": 0.49961432506887055, "grad_norm": 6.156184196472168, "learning_rate": 5.085669249498027e-06, "loss": 0.3517, "step": 4534 }, { "epoch": 0.4997245179063361, "grad_norm": 6.982524871826172, "learning_rate": 5.08392106337251e-06, "loss": 0.3507, "step": 4535 }, { "epoch": 0.49983471074380165, "grad_norm": 13.732891082763672, "learning_rate": 5.082172866985014e-06, "loss": 0.5273, "step": 4536 }, { "epoch": 0.4999449035812672, "grad_norm": 5.866691589355469, "learning_rate": 5.080424660549305e-06, "loss": 0.449, "step": 4537 }, { "epoch": 0.5000550964187328, "grad_norm": 6.930387496948242, "learning_rate": 5.07867644427916e-06, "loss": 0.4369, "step": 4538 }, { "epoch": 0.5001652892561983, "grad_norm": 17.0075740814209, "learning_rate": 5.076928218388353e-06, "loss": 0.4197, "step": 4539 }, { "epoch": 0.5002754820936639, "grad_norm": 6.532504081726074, "learning_rate": 5.07517998309066e-06, "loss": 0.3714, "step": 4540 }, { "epoch": 0.5002754820936639, "eval_loss": 0.4173244833946228, "eval_runtime": 41.943, "eval_samples_per_second": 17.5, "eval_steps_per_second": 2.193, "step": 4540 }, { "epoch": 0.5003856749311295, "grad_norm": 6.452209949493408, "learning_rate": 5.073431738599855e-06, "loss": 0.3411, "step": 4541 }, { "epoch": 0.500495867768595, "grad_norm": 6.462887763977051, "learning_rate": 5.071683485129718e-06, "loss": 0.3818, "step": 4542 }, { "epoch": 0.5006060606060606, "grad_norm": 11.5089693069458, "learning_rate": 5.069935222894027e-06, "loss": 0.4426, "step": 4543 }, { "epoch": 0.5007162534435262, "grad_norm": 9.659563064575195, "learning_rate": 5.068186952106562e-06, "loss": 0.4759, "step": 4544 }, { "epoch": 0.5008264462809917, "grad_norm": 5.877937316894531, "learning_rate": 5.066438672981103e-06, "loss": 0.3361, "step": 4545 }, { "epoch": 0.5009366391184573, "grad_norm": 6.2046074867248535, "learning_rate": 5.064690385731434e-06, "loss": 0.4354, "step": 4546 }, { "epoch": 0.5010468319559228, "grad_norm": 4.185536861419678, "learning_rate": 5.062942090571337e-06, "loss": 0.4153, "step": 4547 }, { "epoch": 0.5011570247933884, "grad_norm": 5.724506378173828, "learning_rate": 5.061193787714595e-06, "loss": 0.3996, "step": 4548 }, { "epoch": 0.501267217630854, "grad_norm": 6.192051410675049, "learning_rate": 5.059445477374992e-06, "loss": 0.3987, "step": 4549 }, { "epoch": 0.5013774104683195, "grad_norm": 8.699459075927734, "learning_rate": 5.057697159766319e-06, "loss": 0.4474, "step": 4550 }, { "epoch": 0.5014876033057851, "grad_norm": 13.547306060791016, "learning_rate": 5.055948835102354e-06, "loss": 0.5094, "step": 4551 }, { "epoch": 0.5015977961432507, "grad_norm": 7.65572452545166, "learning_rate": 5.054200503596894e-06, "loss": 0.4359, "step": 4552 }, { "epoch": 0.5017079889807162, "grad_norm": 6.122793674468994, "learning_rate": 5.0524521654637195e-06, "loss": 0.4099, "step": 4553 }, { "epoch": 0.5018181818181818, "grad_norm": 8.056415557861328, "learning_rate": 5.050703820916626e-06, "loss": 0.4108, "step": 4554 }, { "epoch": 0.5019283746556474, "grad_norm": 6.604487895965576, "learning_rate": 5.048955470169398e-06, "loss": 0.3795, "step": 4555 }, { "epoch": 0.5020385674931129, "grad_norm": 6.377932071685791, "learning_rate": 5.04720711343583e-06, "loss": 0.4065, "step": 4556 }, { "epoch": 0.5021487603305785, "grad_norm": 6.709017276763916, "learning_rate": 5.045458750929712e-06, "loss": 0.3861, "step": 4557 }, { "epoch": 0.502258953168044, "grad_norm": 7.791820049285889, "learning_rate": 5.0437103828648345e-06, "loss": 0.3611, "step": 4558 }, { "epoch": 0.5023691460055096, "grad_norm": 3.928077220916748, "learning_rate": 5.041962009454994e-06, "loss": 0.3614, "step": 4559 }, { "epoch": 0.5024793388429752, "grad_norm": 5.483081817626953, "learning_rate": 5.04021363091398e-06, "loss": 0.4157, "step": 4560 }, { "epoch": 0.5025895316804407, "grad_norm": 5.403481960296631, "learning_rate": 5.038465247455591e-06, "loss": 0.4341, "step": 4561 }, { "epoch": 0.5026997245179063, "grad_norm": 7.353564739227295, "learning_rate": 5.036716859293618e-06, "loss": 0.5812, "step": 4562 }, { "epoch": 0.502809917355372, "grad_norm": 6.972169399261475, "learning_rate": 5.034968466641856e-06, "loss": 0.4056, "step": 4563 }, { "epoch": 0.5029201101928374, "grad_norm": 6.7498955726623535, "learning_rate": 5.033220069714104e-06, "loss": 0.4436, "step": 4564 }, { "epoch": 0.503030303030303, "grad_norm": 10.816003799438477, "learning_rate": 5.031471668724158e-06, "loss": 0.4928, "step": 4565 }, { "epoch": 0.5031404958677685, "grad_norm": 7.438859939575195, "learning_rate": 5.0297232638858115e-06, "loss": 0.3682, "step": 4566 }, { "epoch": 0.5032506887052342, "grad_norm": 7.910082817077637, "learning_rate": 5.027974855412865e-06, "loss": 0.4335, "step": 4567 }, { "epoch": 0.5033608815426998, "grad_norm": 4.845508098602295, "learning_rate": 5.026226443519116e-06, "loss": 0.4097, "step": 4568 }, { "epoch": 0.5034710743801653, "grad_norm": 11.199101448059082, "learning_rate": 5.02447802841836e-06, "loss": 0.4512, "step": 4569 }, { "epoch": 0.5035812672176309, "grad_norm": 10.137066841125488, "learning_rate": 5.022729610324398e-06, "loss": 0.3955, "step": 4570 }, { "epoch": 0.5036914600550965, "grad_norm": 5.9792938232421875, "learning_rate": 5.020981189451028e-06, "loss": 0.4271, "step": 4571 }, { "epoch": 0.503801652892562, "grad_norm": 5.189902305603027, "learning_rate": 5.019232766012052e-06, "loss": 0.4371, "step": 4572 }, { "epoch": 0.5039118457300276, "grad_norm": 7.720902919769287, "learning_rate": 5.017484340221265e-06, "loss": 0.3973, "step": 4573 }, { "epoch": 0.5040220385674931, "grad_norm": 5.592770099639893, "learning_rate": 5.015735912292469e-06, "loss": 0.3591, "step": 4574 }, { "epoch": 0.5041322314049587, "grad_norm": 8.448620796203613, "learning_rate": 5.013987482439465e-06, "loss": 0.4779, "step": 4575 }, { "epoch": 0.5042424242424243, "grad_norm": 4.631404399871826, "learning_rate": 5.012239050876052e-06, "loss": 0.4248, "step": 4576 }, { "epoch": 0.5043526170798898, "grad_norm": 10.913233757019043, "learning_rate": 5.010490617816033e-06, "loss": 0.3743, "step": 4577 }, { "epoch": 0.5044628099173554, "grad_norm": 5.6780314445495605, "learning_rate": 5.008742183473205e-06, "loss": 0.4088, "step": 4578 }, { "epoch": 0.504573002754821, "grad_norm": 7.929697513580322, "learning_rate": 5.006993748061372e-06, "loss": 0.3323, "step": 4579 }, { "epoch": 0.5046831955922865, "grad_norm": 8.058935165405273, "learning_rate": 5.005245311794334e-06, "loss": 0.4651, "step": 4580 }, { "epoch": 0.5047933884297521, "grad_norm": 4.103596210479736, "learning_rate": 5.0034968748858905e-06, "loss": 0.3993, "step": 4581 }, { "epoch": 0.5049035812672177, "grad_norm": 5.556540012359619, "learning_rate": 5.001748437549847e-06, "loss": 0.3532, "step": 4582 }, { "epoch": 0.5050137741046832, "grad_norm": 10.806697845458984, "learning_rate": 5e-06, "loss": 0.4152, "step": 4583 }, { "epoch": 0.5051239669421488, "grad_norm": 5.479221820831299, "learning_rate": 4.998251562450155e-06, "loss": 0.4651, "step": 4584 }, { "epoch": 0.5052341597796143, "grad_norm": 8.868670463562012, "learning_rate": 4.9965031251141095e-06, "loss": 0.3818, "step": 4585 }, { "epoch": 0.5053443526170799, "grad_norm": 4.519674301147461, "learning_rate": 4.994754688205667e-06, "loss": 0.4236, "step": 4586 }, { "epoch": 0.5054545454545455, "grad_norm": 7.915712833404541, "learning_rate": 4.99300625193863e-06, "loss": 0.4449, "step": 4587 }, { "epoch": 0.505564738292011, "grad_norm": 7.476759910583496, "learning_rate": 4.9912578165267955e-06, "loss": 0.3903, "step": 4588 }, { "epoch": 0.5056749311294766, "grad_norm": 6.582509517669678, "learning_rate": 4.989509382183969e-06, "loss": 0.4416, "step": 4589 }, { "epoch": 0.5057851239669422, "grad_norm": 5.545627593994141, "learning_rate": 4.98776094912395e-06, "loss": 0.3882, "step": 4590 }, { "epoch": 0.5058953168044077, "grad_norm": 6.2511749267578125, "learning_rate": 4.986012517560536e-06, "loss": 0.4118, "step": 4591 }, { "epoch": 0.5060055096418733, "grad_norm": 6.856733798980713, "learning_rate": 4.984264087707532e-06, "loss": 0.4525, "step": 4592 }, { "epoch": 0.5061157024793388, "grad_norm": 4.939981460571289, "learning_rate": 4.982515659778736e-06, "loss": 0.4167, "step": 4593 }, { "epoch": 0.5062258953168044, "grad_norm": 6.003458499908447, "learning_rate": 4.9807672339879494e-06, "loss": 0.4306, "step": 4594 }, { "epoch": 0.50633608815427, "grad_norm": 6.502118110656738, "learning_rate": 4.9790188105489734e-06, "loss": 0.3594, "step": 4595 }, { "epoch": 0.5064462809917355, "grad_norm": 4.995030403137207, "learning_rate": 4.977270389675603e-06, "loss": 0.3798, "step": 4596 }, { "epoch": 0.5065564738292011, "grad_norm": 7.406594276428223, "learning_rate": 4.975521971581641e-06, "loss": 0.4716, "step": 4597 }, { "epoch": 0.5066666666666667, "grad_norm": 4.951489448547363, "learning_rate": 4.973773556480887e-06, "loss": 0.4043, "step": 4598 }, { "epoch": 0.5067768595041322, "grad_norm": 5.4200239181518555, "learning_rate": 4.972025144587135e-06, "loss": 0.4161, "step": 4599 }, { "epoch": 0.5068870523415978, "grad_norm": 4.892787456512451, "learning_rate": 4.97027673611419e-06, "loss": 0.3647, "step": 4600 }, { "epoch": 0.5069972451790634, "grad_norm": 5.32407808303833, "learning_rate": 4.968528331275845e-06, "loss": 0.4264, "step": 4601 }, { "epoch": 0.5071074380165289, "grad_norm": 5.645662784576416, "learning_rate": 4.966779930285897e-06, "loss": 0.3807, "step": 4602 }, { "epoch": 0.5072176308539945, "grad_norm": 8.279090881347656, "learning_rate": 4.965031533358145e-06, "loss": 0.4578, "step": 4603 }, { "epoch": 0.50732782369146, "grad_norm": 6.70154333114624, "learning_rate": 4.9632831407063855e-06, "loss": 0.4058, "step": 4604 }, { "epoch": 0.5074380165289256, "grad_norm": 7.6447248458862305, "learning_rate": 4.9615347525444115e-06, "loss": 0.3833, "step": 4605 }, { "epoch": 0.5075482093663912, "grad_norm": 8.844406127929688, "learning_rate": 4.959786369086022e-06, "loss": 0.4539, "step": 4606 }, { "epoch": 0.5076584022038567, "grad_norm": 8.162901878356934, "learning_rate": 4.958037990545008e-06, "loss": 0.4531, "step": 4607 }, { "epoch": 0.5077685950413223, "grad_norm": 12.129911422729492, "learning_rate": 4.956289617135166e-06, "loss": 0.4316, "step": 4608 }, { "epoch": 0.5078787878787879, "grad_norm": 6.629499912261963, "learning_rate": 4.954541249070291e-06, "loss": 0.4399, "step": 4609 }, { "epoch": 0.5079889807162534, "grad_norm": 7.2819952964782715, "learning_rate": 4.9527928865641714e-06, "loss": 0.398, "step": 4610 }, { "epoch": 0.508099173553719, "grad_norm": 5.846472263336182, "learning_rate": 4.951044529830603e-06, "loss": 0.4775, "step": 4611 }, { "epoch": 0.5082093663911845, "grad_norm": 6.285477638244629, "learning_rate": 4.949296179083377e-06, "loss": 0.4359, "step": 4612 }, { "epoch": 0.5083195592286501, "grad_norm": 4.833187580108643, "learning_rate": 4.9475478345362804e-06, "loss": 0.4372, "step": 4613 }, { "epoch": 0.5084297520661157, "grad_norm": 7.467216491699219, "learning_rate": 4.945799496403108e-06, "loss": 0.3121, "step": 4614 }, { "epoch": 0.5085399449035812, "grad_norm": 4.804136753082275, "learning_rate": 4.944051164897647e-06, "loss": 0.3873, "step": 4615 }, { "epoch": 0.5086501377410468, "grad_norm": 5.1879472732543945, "learning_rate": 4.942302840233684e-06, "loss": 0.4423, "step": 4616 }, { "epoch": 0.5087603305785124, "grad_norm": 11.147826194763184, "learning_rate": 4.940554522625008e-06, "loss": 0.5191, "step": 4617 }, { "epoch": 0.5088705234159779, "grad_norm": 5.908621311187744, "learning_rate": 4.938806212285408e-06, "loss": 0.4041, "step": 4618 }, { "epoch": 0.5089807162534435, "grad_norm": 5.038792610168457, "learning_rate": 4.937057909428665e-06, "loss": 0.428, "step": 4619 }, { "epoch": 0.509090909090909, "grad_norm": 4.840710163116455, "learning_rate": 4.935309614268567e-06, "loss": 0.4918, "step": 4620 }, { "epoch": 0.5092011019283746, "grad_norm": 9.819032669067383, "learning_rate": 4.933561327018897e-06, "loss": 0.4938, "step": 4621 }, { "epoch": 0.5093112947658403, "grad_norm": 5.381676197052002, "learning_rate": 4.93181304789344e-06, "loss": 0.4242, "step": 4622 }, { "epoch": 0.5094214876033057, "grad_norm": 7.184305191040039, "learning_rate": 4.930064777105976e-06, "loss": 0.4407, "step": 4623 }, { "epoch": 0.5095316804407714, "grad_norm": 5.1411333084106445, "learning_rate": 4.928316514870283e-06, "loss": 0.3675, "step": 4624 }, { "epoch": 0.509641873278237, "grad_norm": 6.193539142608643, "learning_rate": 4.9265682614001455e-06, "loss": 0.3727, "step": 4625 }, { "epoch": 0.5097520661157025, "grad_norm": 8.644025802612305, "learning_rate": 4.924820016909343e-06, "loss": 0.4193, "step": 4626 }, { "epoch": 0.5098622589531681, "grad_norm": 5.382510185241699, "learning_rate": 4.923071781611647e-06, "loss": 0.3977, "step": 4627 }, { "epoch": 0.5099724517906337, "grad_norm": 6.619536399841309, "learning_rate": 4.921323555720842e-06, "loss": 0.4694, "step": 4628 }, { "epoch": 0.5100826446280992, "grad_norm": 10.856200218200684, "learning_rate": 4.919575339450698e-06, "loss": 0.4868, "step": 4629 }, { "epoch": 0.5101928374655648, "grad_norm": 6.595972537994385, "learning_rate": 4.917827133014988e-06, "loss": 0.4259, "step": 4630 }, { "epoch": 0.5103030303030303, "grad_norm": 6.2144904136657715, "learning_rate": 4.916078936627492e-06, "loss": 0.4727, "step": 4631 }, { "epoch": 0.5104132231404959, "grad_norm": 4.708610534667969, "learning_rate": 4.914330750501975e-06, "loss": 0.3465, "step": 4632 }, { "epoch": 0.5105234159779615, "grad_norm": 4.370340347290039, "learning_rate": 4.912582574852211e-06, "loss": 0.3769, "step": 4633 }, { "epoch": 0.510633608815427, "grad_norm": 5.362609386444092, "learning_rate": 4.910834409891968e-06, "loss": 0.4057, "step": 4634 }, { "epoch": 0.5107438016528926, "grad_norm": 5.957698345184326, "learning_rate": 4.909086255835015e-06, "loss": 0.4122, "step": 4635 }, { "epoch": 0.5108539944903582, "grad_norm": 5.449244022369385, "learning_rate": 4.907338112895118e-06, "loss": 0.4174, "step": 4636 }, { "epoch": 0.5109641873278237, "grad_norm": 4.519969940185547, "learning_rate": 4.905589981286043e-06, "loss": 0.3028, "step": 4637 }, { "epoch": 0.5110743801652893, "grad_norm": 10.036328315734863, "learning_rate": 4.903841861221552e-06, "loss": 0.4588, "step": 4638 }, { "epoch": 0.5111845730027548, "grad_norm": 7.442993640899658, "learning_rate": 4.90209375291541e-06, "loss": 0.4371, "step": 4639 }, { "epoch": 0.5112947658402204, "grad_norm": 6.990756034851074, "learning_rate": 4.900345656581377e-06, "loss": 0.4603, "step": 4640 }, { "epoch": 0.511404958677686, "grad_norm": 6.148458480834961, "learning_rate": 4.898597572433212e-06, "loss": 0.4531, "step": 4641 }, { "epoch": 0.5115151515151515, "grad_norm": 6.159834861755371, "learning_rate": 4.896849500684673e-06, "loss": 0.43, "step": 4642 }, { "epoch": 0.5116253443526171, "grad_norm": 5.136331558227539, "learning_rate": 4.895101441549518e-06, "loss": 0.4275, "step": 4643 }, { "epoch": 0.5117355371900827, "grad_norm": 5.321721076965332, "learning_rate": 4.8933533952415e-06, "loss": 0.3909, "step": 4644 }, { "epoch": 0.5118457300275482, "grad_norm": 3.884355306625366, "learning_rate": 4.891605361974375e-06, "loss": 0.3781, "step": 4645 }, { "epoch": 0.5119559228650138, "grad_norm": 5.795078754425049, "learning_rate": 4.889857341961894e-06, "loss": 0.4706, "step": 4646 }, { "epoch": 0.5120661157024793, "grad_norm": 7.165544033050537, "learning_rate": 4.888109335417805e-06, "loss": 0.5056, "step": 4647 }, { "epoch": 0.5121763085399449, "grad_norm": 4.470412731170654, "learning_rate": 4.88636134255586e-06, "loss": 0.3842, "step": 4648 }, { "epoch": 0.5122865013774105, "grad_norm": 5.512759208679199, "learning_rate": 4.8846133635898035e-06, "loss": 0.4169, "step": 4649 }, { "epoch": 0.512396694214876, "grad_norm": 4.522556781768799, "learning_rate": 4.882865398733383e-06, "loss": 0.3372, "step": 4650 }, { "epoch": 0.5125068870523416, "grad_norm": 7.432344913482666, "learning_rate": 4.881117448200339e-06, "loss": 0.4508, "step": 4651 }, { "epoch": 0.5126170798898072, "grad_norm": 5.732212066650391, "learning_rate": 4.879369512204415e-06, "loss": 0.4417, "step": 4652 }, { "epoch": 0.5127272727272727, "grad_norm": 6.637563228607178, "learning_rate": 4.877621590959352e-06, "loss": 0.4064, "step": 4653 }, { "epoch": 0.5128374655647383, "grad_norm": 8.682241439819336, "learning_rate": 4.8758736846788885e-06, "loss": 0.4144, "step": 4654 }, { "epoch": 0.5129476584022039, "grad_norm": 6.463537216186523, "learning_rate": 4.874125793576755e-06, "loss": 0.4625, "step": 4655 }, { "epoch": 0.5130578512396694, "grad_norm": 6.958169460296631, "learning_rate": 4.872377917866695e-06, "loss": 0.4418, "step": 4656 }, { "epoch": 0.513168044077135, "grad_norm": 5.541899681091309, "learning_rate": 4.870630057762435e-06, "loss": 0.3645, "step": 4657 }, { "epoch": 0.5132782369146005, "grad_norm": 3.5046939849853516, "learning_rate": 4.868882213477704e-06, "loss": 0.4038, "step": 4658 }, { "epoch": 0.5133884297520661, "grad_norm": 10.378771781921387, "learning_rate": 4.867134385226239e-06, "loss": 0.3873, "step": 4659 }, { "epoch": 0.5134986225895317, "grad_norm": 5.14128303527832, "learning_rate": 4.86538657322176e-06, "loss": 0.3369, "step": 4660 }, { "epoch": 0.5136088154269972, "grad_norm": 11.744321823120117, "learning_rate": 4.863638777677993e-06, "loss": 0.4452, "step": 4661 }, { "epoch": 0.5137190082644628, "grad_norm": 5.557402610778809, "learning_rate": 4.861890998808662e-06, "loss": 0.4282, "step": 4662 }, { "epoch": 0.5138292011019284, "grad_norm": 4.960723400115967, "learning_rate": 4.860143236827485e-06, "loss": 0.3699, "step": 4663 }, { "epoch": 0.5139393939393939, "grad_norm": 10.539837837219238, "learning_rate": 4.858395491948187e-06, "loss": 0.4571, "step": 4664 }, { "epoch": 0.5140495867768595, "grad_norm": 5.027210712432861, "learning_rate": 4.8566477643844795e-06, "loss": 0.4328, "step": 4665 }, { "epoch": 0.514159779614325, "grad_norm": 7.645861625671387, "learning_rate": 4.8549000543500765e-06, "loss": 0.4807, "step": 4666 }, { "epoch": 0.5142699724517906, "grad_norm": 6.753965377807617, "learning_rate": 4.853152362058693e-06, "loss": 0.4608, "step": 4667 }, { "epoch": 0.5143801652892562, "grad_norm": 6.281160354614258, "learning_rate": 4.851404687724038e-06, "loss": 0.4345, "step": 4668 }, { "epoch": 0.5144903581267217, "grad_norm": 10.625886917114258, "learning_rate": 4.849657031559819e-06, "loss": 0.5259, "step": 4669 }, { "epoch": 0.5146005509641873, "grad_norm": 7.434563159942627, "learning_rate": 4.847909393779743e-06, "loss": 0.3596, "step": 4670 }, { "epoch": 0.5147107438016529, "grad_norm": 5.95920467376709, "learning_rate": 4.846161774597514e-06, "loss": 0.4121, "step": 4671 }, { "epoch": 0.5148209366391184, "grad_norm": 7.267666816711426, "learning_rate": 4.84441417422683e-06, "loss": 0.4547, "step": 4672 }, { "epoch": 0.514931129476584, "grad_norm": 4.895453453063965, "learning_rate": 4.842666592881394e-06, "loss": 0.4458, "step": 4673 }, { "epoch": 0.5150413223140495, "grad_norm": 7.7665205001831055, "learning_rate": 4.8409190307749e-06, "loss": 0.4, "step": 4674 }, { "epoch": 0.5151515151515151, "grad_norm": 6.711982727050781, "learning_rate": 4.839171488121042e-06, "loss": 0.4023, "step": 4675 }, { "epoch": 0.5152617079889807, "grad_norm": 6.800289630889893, "learning_rate": 4.8374239651335145e-06, "loss": 0.4279, "step": 4676 }, { "epoch": 0.5153719008264462, "grad_norm": 6.929649353027344, "learning_rate": 4.835676462026004e-06, "loss": 0.4309, "step": 4677 }, { "epoch": 0.5154820936639118, "grad_norm": 7.900669574737549, "learning_rate": 4.8339289790122004e-06, "loss": 0.3919, "step": 4678 }, { "epoch": 0.5155922865013775, "grad_norm": 3.486950397491455, "learning_rate": 4.832181516305786e-06, "loss": 0.4277, "step": 4679 }, { "epoch": 0.515702479338843, "grad_norm": 5.796440124511719, "learning_rate": 4.830434074120443e-06, "loss": 0.4239, "step": 4680 }, { "epoch": 0.5158126721763086, "grad_norm": 6.509128570556641, "learning_rate": 4.828686652669853e-06, "loss": 0.4457, "step": 4681 }, { "epoch": 0.5159228650137742, "grad_norm": 4.2097487449646, "learning_rate": 4.826939252167693e-06, "loss": 0.369, "step": 4682 }, { "epoch": 0.5160330578512397, "grad_norm": 7.689331531524658, "learning_rate": 4.825191872827633e-06, "loss": 0.3942, "step": 4683 }, { "epoch": 0.5161432506887053, "grad_norm": 7.9642839431762695, "learning_rate": 4.82344451486335e-06, "loss": 0.4656, "step": 4684 }, { "epoch": 0.5162534435261708, "grad_norm": 10.46125602722168, "learning_rate": 4.821697178488513e-06, "loss": 0.5403, "step": 4685 }, { "epoch": 0.5163636363636364, "grad_norm": 6.136967182159424, "learning_rate": 4.819949863916784e-06, "loss": 0.4271, "step": 4686 }, { "epoch": 0.516473829201102, "grad_norm": 5.80336332321167, "learning_rate": 4.818202571361834e-06, "loss": 0.3986, "step": 4687 }, { "epoch": 0.5165840220385675, "grad_norm": 7.430635929107666, "learning_rate": 4.8164553010373185e-06, "loss": 0.3748, "step": 4688 }, { "epoch": 0.5166942148760331, "grad_norm": 4.121450424194336, "learning_rate": 4.814708053156897e-06, "loss": 0.3597, "step": 4689 }, { "epoch": 0.5168044077134987, "grad_norm": 5.318500995635986, "learning_rate": 4.812960827934228e-06, "loss": 0.483, "step": 4690 }, { "epoch": 0.5169146005509642, "grad_norm": 4.661088466644287, "learning_rate": 4.811213625582961e-06, "loss": 0.4194, "step": 4691 }, { "epoch": 0.5170247933884298, "grad_norm": 5.725320816040039, "learning_rate": 4.809466446316751e-06, "loss": 0.3859, "step": 4692 }, { "epoch": 0.5171349862258953, "grad_norm": 5.202004432678223, "learning_rate": 4.807719290349242e-06, "loss": 0.4025, "step": 4693 }, { "epoch": 0.5172451790633609, "grad_norm": 5.964555740356445, "learning_rate": 4.805972157894078e-06, "loss": 0.3745, "step": 4694 }, { "epoch": 0.5173553719008265, "grad_norm": 8.678040504455566, "learning_rate": 4.804225049164903e-06, "loss": 0.5042, "step": 4695 }, { "epoch": 0.517465564738292, "grad_norm": 6.35433292388916, "learning_rate": 4.802477964375356e-06, "loss": 0.3602, "step": 4696 }, { "epoch": 0.5175757575757576, "grad_norm": 7.269749164581299, "learning_rate": 4.80073090373907e-06, "loss": 0.455, "step": 4697 }, { "epoch": 0.5176859504132232, "grad_norm": 4.803569316864014, "learning_rate": 4.798983867469681e-06, "loss": 0.4071, "step": 4698 }, { "epoch": 0.5177961432506887, "grad_norm": 5.32893180847168, "learning_rate": 4.797236855780818e-06, "loss": 0.3967, "step": 4699 }, { "epoch": 0.5179063360881543, "grad_norm": 5.919497013092041, "learning_rate": 4.795489868886106e-06, "loss": 0.3981, "step": 4700 }, { "epoch": 0.5180165289256199, "grad_norm": 6.551672458648682, "learning_rate": 4.793742906999172e-06, "loss": 0.3516, "step": 4701 }, { "epoch": 0.5181267217630854, "grad_norm": 13.528398513793945, "learning_rate": 4.791995970333636e-06, "loss": 0.4038, "step": 4702 }, { "epoch": 0.518236914600551, "grad_norm": 6.5294318199157715, "learning_rate": 4.790249059103113e-06, "loss": 0.3319, "step": 4703 }, { "epoch": 0.5183471074380165, "grad_norm": 6.170133113861084, "learning_rate": 4.788502173521222e-06, "loss": 0.4436, "step": 4704 }, { "epoch": 0.5184573002754821, "grad_norm": 5.018182277679443, "learning_rate": 4.7867553138015715e-06, "loss": 0.3969, "step": 4705 }, { "epoch": 0.5185674931129477, "grad_norm": 7.418249607086182, "learning_rate": 4.785008480157772e-06, "loss": 0.3985, "step": 4706 }, { "epoch": 0.5186776859504132, "grad_norm": 5.478403568267822, "learning_rate": 4.783261672803427e-06, "loss": 0.3892, "step": 4707 }, { "epoch": 0.5187878787878788, "grad_norm": 12.865488052368164, "learning_rate": 4.7815148919521385e-06, "loss": 0.5772, "step": 4708 }, { "epoch": 0.5188980716253444, "grad_norm": 4.450212001800537, "learning_rate": 4.7797681378175065e-06, "loss": 0.3204, "step": 4709 }, { "epoch": 0.5190082644628099, "grad_norm": 6.353028774261475, "learning_rate": 4.778021410613126e-06, "loss": 0.4429, "step": 4710 }, { "epoch": 0.5191184573002755, "grad_norm": 4.610799312591553, "learning_rate": 4.776274710552588e-06, "loss": 0.353, "step": 4711 }, { "epoch": 0.519228650137741, "grad_norm": 6.441452503204346, "learning_rate": 4.774528037849482e-06, "loss": 0.4051, "step": 4712 }, { "epoch": 0.5193388429752066, "grad_norm": 14.416350364685059, "learning_rate": 4.772781392717397e-06, "loss": 0.3931, "step": 4713 }, { "epoch": 0.5194490358126722, "grad_norm": 11.664390563964844, "learning_rate": 4.771034775369907e-06, "loss": 0.3659, "step": 4714 }, { "epoch": 0.5195592286501377, "grad_norm": 5.152129173278809, "learning_rate": 4.769288186020599e-06, "loss": 0.3688, "step": 4715 }, { "epoch": 0.5196694214876033, "grad_norm": 6.308830261230469, "learning_rate": 4.767541624883045e-06, "loss": 0.4702, "step": 4716 }, { "epoch": 0.5197796143250689, "grad_norm": 10.4258394241333, "learning_rate": 4.765795092170815e-06, "loss": 0.4772, "step": 4717 }, { "epoch": 0.5198898071625344, "grad_norm": 12.771700859069824, "learning_rate": 4.764048588097482e-06, "loss": 0.4561, "step": 4718 }, { "epoch": 0.52, "grad_norm": 7.97894811630249, "learning_rate": 4.762302112876605e-06, "loss": 0.484, "step": 4719 }, { "epoch": 0.5201101928374655, "grad_norm": 15.211674690246582, "learning_rate": 4.7605556667217505e-06, "loss": 0.5272, "step": 4720 }, { "epoch": 0.5202203856749311, "grad_norm": 7.436028480529785, "learning_rate": 4.758809249846475e-06, "loss": 0.4014, "step": 4721 }, { "epoch": 0.5203305785123967, "grad_norm": 6.159117221832275, "learning_rate": 4.757062862464328e-06, "loss": 0.4912, "step": 4722 }, { "epoch": 0.5204407713498622, "grad_norm": 6.888415336608887, "learning_rate": 4.755316504788869e-06, "loss": 0.381, "step": 4723 }, { "epoch": 0.5205509641873278, "grad_norm": 9.91435718536377, "learning_rate": 4.7535701770336385e-06, "loss": 0.4835, "step": 4724 }, { "epoch": 0.5206611570247934, "grad_norm": 8.647757530212402, "learning_rate": 4.7518238794121805e-06, "loss": 0.5128, "step": 4725 }, { "epoch": 0.5207713498622589, "grad_norm": 7.840506076812744, "learning_rate": 4.750077612138037e-06, "loss": 0.4447, "step": 4726 }, { "epoch": 0.5208815426997245, "grad_norm": 8.255101203918457, "learning_rate": 4.748331375424742e-06, "loss": 0.4715, "step": 4727 }, { "epoch": 0.5209917355371901, "grad_norm": 6.768373489379883, "learning_rate": 4.746585169485828e-06, "loss": 0.4443, "step": 4728 }, { "epoch": 0.5211019283746556, "grad_norm": 6.331600666046143, "learning_rate": 4.744838994534824e-06, "loss": 0.3851, "step": 4729 }, { "epoch": 0.5212121212121212, "grad_norm": 6.041339874267578, "learning_rate": 4.743092850785255e-06, "loss": 0.4211, "step": 4730 }, { "epoch": 0.5213223140495867, "grad_norm": 10.140803337097168, "learning_rate": 4.74134673845064e-06, "loss": 0.5106, "step": 4731 }, { "epoch": 0.5214325068870523, "grad_norm": 5.4736456871032715, "learning_rate": 4.7396006577444976e-06, "loss": 0.4362, "step": 4732 }, { "epoch": 0.5215426997245179, "grad_norm": 6.823335647583008, "learning_rate": 4.73785460888034e-06, "loss": 0.3611, "step": 4733 }, { "epoch": 0.5216528925619834, "grad_norm": 6.919731140136719, "learning_rate": 4.736108592071678e-06, "loss": 0.4689, "step": 4734 }, { "epoch": 0.521763085399449, "grad_norm": 6.117292404174805, "learning_rate": 4.734362607532015e-06, "loss": 0.3818, "step": 4735 }, { "epoch": 0.5218732782369147, "grad_norm": 5.677313327789307, "learning_rate": 4.732616655474853e-06, "loss": 0.4574, "step": 4736 }, { "epoch": 0.5219834710743801, "grad_norm": 14.314566612243652, "learning_rate": 4.73087073611369e-06, "loss": 0.3608, "step": 4737 }, { "epoch": 0.5220936639118458, "grad_norm": 7.583749771118164, "learning_rate": 4.72912484966202e-06, "loss": 0.4606, "step": 4738 }, { "epoch": 0.5222038567493112, "grad_norm": 4.8243865966796875, "learning_rate": 4.72737899633333e-06, "loss": 0.4029, "step": 4739 }, { "epoch": 0.5223140495867769, "grad_norm": 5.845529079437256, "learning_rate": 4.725633176341107e-06, "loss": 0.425, "step": 4740 }, { "epoch": 0.5224242424242425, "grad_norm": 6.174137115478516, "learning_rate": 4.723887389898833e-06, "loss": 0.4219, "step": 4741 }, { "epoch": 0.522534435261708, "grad_norm": 8.151043891906738, "learning_rate": 4.722141637219981e-06, "loss": 0.4189, "step": 4742 }, { "epoch": 0.5226446280991736, "grad_norm": 5.855828762054443, "learning_rate": 4.72039591851803e-06, "loss": 0.3719, "step": 4743 }, { "epoch": 0.5227548209366392, "grad_norm": 5.095317363739014, "learning_rate": 4.718650234006446e-06, "loss": 0.422, "step": 4744 }, { "epoch": 0.5228650137741047, "grad_norm": 5.047654628753662, "learning_rate": 4.71690458389869e-06, "loss": 0.3451, "step": 4745 }, { "epoch": 0.5229752066115703, "grad_norm": 5.244139671325684, "learning_rate": 4.71515896840823e-06, "loss": 0.3126, "step": 4746 }, { "epoch": 0.5230853994490358, "grad_norm": 6.747472286224365, "learning_rate": 4.7134133877485146e-06, "loss": 0.4296, "step": 4747 }, { "epoch": 0.5231955922865014, "grad_norm": 6.123372554779053, "learning_rate": 4.711667842133003e-06, "loss": 0.4063, "step": 4748 }, { "epoch": 0.523305785123967, "grad_norm": 8.187582015991211, "learning_rate": 4.709922331775138e-06, "loss": 0.4062, "step": 4749 }, { "epoch": 0.5234159779614325, "grad_norm": 11.04067325592041, "learning_rate": 4.708176856888362e-06, "loss": 0.4127, "step": 4750 }, { "epoch": 0.5235261707988981, "grad_norm": 7.561423301696777, "learning_rate": 4.706431417686121e-06, "loss": 0.4394, "step": 4751 }, { "epoch": 0.5236363636363637, "grad_norm": 5.916963577270508, "learning_rate": 4.704686014381842e-06, "loss": 0.4553, "step": 4752 }, { "epoch": 0.5237465564738292, "grad_norm": 7.415162086486816, "learning_rate": 4.702940647188958e-06, "loss": 0.3999, "step": 4753 }, { "epoch": 0.5238567493112948, "grad_norm": 3.885070323944092, "learning_rate": 4.701195316320897e-06, "loss": 0.3886, "step": 4754 }, { "epoch": 0.5239669421487604, "grad_norm": 7.7326459884643555, "learning_rate": 4.699450021991078e-06, "loss": 0.4516, "step": 4755 }, { "epoch": 0.5240771349862259, "grad_norm": 8.060406684875488, "learning_rate": 4.697704764412917e-06, "loss": 0.4573, "step": 4756 }, { "epoch": 0.5241873278236915, "grad_norm": 5.327139854431152, "learning_rate": 4.695959543799829e-06, "loss": 0.4009, "step": 4757 }, { "epoch": 0.524297520661157, "grad_norm": 4.9981513023376465, "learning_rate": 4.6942143603652214e-06, "loss": 0.3951, "step": 4758 }, { "epoch": 0.5244077134986226, "grad_norm": 6.298994064331055, "learning_rate": 4.692469214322497e-06, "loss": 0.4261, "step": 4759 }, { "epoch": 0.5245179063360882, "grad_norm": 8.435798645019531, "learning_rate": 4.690724105885055e-06, "loss": 0.4001, "step": 4760 }, { "epoch": 0.5246280991735537, "grad_norm": 7.095319747924805, "learning_rate": 4.688979035266288e-06, "loss": 0.41, "step": 4761 }, { "epoch": 0.5247382920110193, "grad_norm": 7.2349700927734375, "learning_rate": 4.687234002679589e-06, "loss": 0.461, "step": 4762 }, { "epoch": 0.5248484848484849, "grad_norm": 9.026803970336914, "learning_rate": 4.68548900833834e-06, "loss": 0.4939, "step": 4763 }, { "epoch": 0.5249586776859504, "grad_norm": 5.365535259246826, "learning_rate": 4.683744052455922e-06, "loss": 0.4438, "step": 4764 }, { "epoch": 0.525068870523416, "grad_norm": 6.975589752197266, "learning_rate": 4.681999135245712e-06, "loss": 0.3798, "step": 4765 }, { "epoch": 0.5251790633608815, "grad_norm": 11.416638374328613, "learning_rate": 4.68025425692108e-06, "loss": 0.4842, "step": 4766 }, { "epoch": 0.5252892561983471, "grad_norm": 8.365035057067871, "learning_rate": 4.678509417695389e-06, "loss": 0.4478, "step": 4767 }, { "epoch": 0.5253994490358127, "grad_norm": 5.908651351928711, "learning_rate": 4.676764617782006e-06, "loss": 0.4277, "step": 4768 }, { "epoch": 0.5255096418732782, "grad_norm": 7.646467208862305, "learning_rate": 4.675019857394285e-06, "loss": 0.3445, "step": 4769 }, { "epoch": 0.5256198347107438, "grad_norm": 5.816633701324463, "learning_rate": 4.673275136745574e-06, "loss": 0.4133, "step": 4770 }, { "epoch": 0.5257300275482094, "grad_norm": 6.1945366859436035, "learning_rate": 4.671530456049225e-06, "loss": 0.3272, "step": 4771 }, { "epoch": 0.5258402203856749, "grad_norm": 5.908159255981445, "learning_rate": 4.66978581551858e-06, "loss": 0.4196, "step": 4772 }, { "epoch": 0.5259504132231405, "grad_norm": 5.871518611907959, "learning_rate": 4.6680412153669695e-06, "loss": 0.3546, "step": 4773 }, { "epoch": 0.526060606060606, "grad_norm": 5.87644624710083, "learning_rate": 4.666296655807735e-06, "loss": 0.4596, "step": 4774 }, { "epoch": 0.5261707988980716, "grad_norm": 6.668318271636963, "learning_rate": 4.664552137054194e-06, "loss": 0.3977, "step": 4775 }, { "epoch": 0.5262809917355372, "grad_norm": 12.507335662841797, "learning_rate": 4.662807659319676e-06, "loss": 0.3817, "step": 4776 }, { "epoch": 0.5263911845730027, "grad_norm": 5.290792465209961, "learning_rate": 4.661063222817497e-06, "loss": 0.3623, "step": 4777 }, { "epoch": 0.5265013774104683, "grad_norm": 14.073418617248535, "learning_rate": 4.659318827760964e-06, "loss": 0.4862, "step": 4778 }, { "epoch": 0.5266115702479339, "grad_norm": 6.559229850769043, "learning_rate": 4.65757447436339e-06, "loss": 0.4044, "step": 4779 }, { "epoch": 0.5267217630853994, "grad_norm": 6.175151348114014, "learning_rate": 4.655830162838074e-06, "loss": 0.3925, "step": 4780 }, { "epoch": 0.526831955922865, "grad_norm": 5.202744483947754, "learning_rate": 4.654085893398312e-06, "loss": 0.3941, "step": 4781 }, { "epoch": 0.5269421487603306, "grad_norm": 10.19947338104248, "learning_rate": 4.652341666257398e-06, "loss": 0.496, "step": 4782 }, { "epoch": 0.5270523415977961, "grad_norm": 9.504716873168945, "learning_rate": 4.650597481628617e-06, "loss": 0.4463, "step": 4783 }, { "epoch": 0.5271625344352617, "grad_norm": 6.426805019378662, "learning_rate": 4.64885333972525e-06, "loss": 0.436, "step": 4784 }, { "epoch": 0.5272727272727272, "grad_norm": 6.865482330322266, "learning_rate": 4.647109240760574e-06, "loss": 0.4941, "step": 4785 }, { "epoch": 0.5273829201101928, "grad_norm": 5.157108306884766, "learning_rate": 4.645365184947861e-06, "loss": 0.3815, "step": 4786 }, { "epoch": 0.5274931129476584, "grad_norm": 5.182690143585205, "learning_rate": 4.643621172500372e-06, "loss": 0.3838, "step": 4787 }, { "epoch": 0.5276033057851239, "grad_norm": 10.033770561218262, "learning_rate": 4.6418772036313716e-06, "loss": 0.4114, "step": 4788 }, { "epoch": 0.5277134986225895, "grad_norm": 5.148810863494873, "learning_rate": 4.6401332785541125e-06, "loss": 0.3935, "step": 4789 }, { "epoch": 0.5278236914600551, "grad_norm": 4.2936482429504395, "learning_rate": 4.6383893974818464e-06, "loss": 0.4128, "step": 4790 }, { "epoch": 0.5279338842975206, "grad_norm": 7.7514543533325195, "learning_rate": 4.636645560627815e-06, "loss": 0.4589, "step": 4791 }, { "epoch": 0.5280440771349862, "grad_norm": 6.3132805824279785, "learning_rate": 4.634901768205257e-06, "loss": 0.3303, "step": 4792 }, { "epoch": 0.5281542699724517, "grad_norm": 6.861356258392334, "learning_rate": 4.633158020427408e-06, "loss": 0.3197, "step": 4793 }, { "epoch": 0.5282644628099173, "grad_norm": 8.240154266357422, "learning_rate": 4.631414317507495e-06, "loss": 0.3735, "step": 4794 }, { "epoch": 0.528374655647383, "grad_norm": 6.859533786773682, "learning_rate": 4.629670659658739e-06, "loss": 0.3824, "step": 4795 }, { "epoch": 0.5284848484848484, "grad_norm": 11.968518257141113, "learning_rate": 4.627927047094358e-06, "loss": 0.4285, "step": 4796 }, { "epoch": 0.528595041322314, "grad_norm": 10.671728134155273, "learning_rate": 4.626183480027564e-06, "loss": 0.5099, "step": 4797 }, { "epoch": 0.5287052341597797, "grad_norm": 8.56689167022705, "learning_rate": 4.62443995867156e-06, "loss": 0.3508, "step": 4798 }, { "epoch": 0.5288154269972452, "grad_norm": 5.919952392578125, "learning_rate": 4.622696483239549e-06, "loss": 0.4478, "step": 4799 }, { "epoch": 0.5289256198347108, "grad_norm": 7.455764293670654, "learning_rate": 4.6209530539447265e-06, "loss": 0.4184, "step": 4800 }, { "epoch": 0.5290358126721763, "grad_norm": 11.0386381149292, "learning_rate": 4.619209671000276e-06, "loss": 0.4672, "step": 4801 }, { "epoch": 0.5291460055096419, "grad_norm": 7.714338779449463, "learning_rate": 4.617466334619387e-06, "loss": 0.3995, "step": 4802 }, { "epoch": 0.5292561983471075, "grad_norm": 3.729965925216675, "learning_rate": 4.6157230450152315e-06, "loss": 0.3921, "step": 4803 }, { "epoch": 0.529366391184573, "grad_norm": 8.325998306274414, "learning_rate": 4.613979802400986e-06, "loss": 0.412, "step": 4804 }, { "epoch": 0.5294765840220386, "grad_norm": 6.081939697265625, "learning_rate": 4.612236606989815e-06, "loss": 0.3978, "step": 4805 }, { "epoch": 0.5295867768595042, "grad_norm": 9.263617515563965, "learning_rate": 4.610493458994876e-06, "loss": 0.4348, "step": 4806 }, { "epoch": 0.5296969696969697, "grad_norm": 8.411460876464844, "learning_rate": 4.608750358629329e-06, "loss": 0.4951, "step": 4807 }, { "epoch": 0.5298071625344353, "grad_norm": 9.316654205322266, "learning_rate": 4.607007306106318e-06, "loss": 0.43, "step": 4808 }, { "epoch": 0.5299173553719009, "grad_norm": 6.742702484130859, "learning_rate": 4.605264301638986e-06, "loss": 0.4105, "step": 4809 }, { "epoch": 0.5300275482093664, "grad_norm": 6.03303861618042, "learning_rate": 4.603521345440474e-06, "loss": 0.457, "step": 4810 }, { "epoch": 0.530137741046832, "grad_norm": 5.3249735832214355, "learning_rate": 4.60177843772391e-06, "loss": 0.384, "step": 4811 }, { "epoch": 0.5302479338842975, "grad_norm": 10.849640846252441, "learning_rate": 4.600035578702418e-06, "loss": 0.3535, "step": 4812 }, { "epoch": 0.5303581267217631, "grad_norm": 5.814062118530273, "learning_rate": 4.5982927685891196e-06, "loss": 0.4242, "step": 4813 }, { "epoch": 0.5304683195592287, "grad_norm": 4.485311508178711, "learning_rate": 4.596550007597128e-06, "loss": 0.4173, "step": 4814 }, { "epoch": 0.5305785123966942, "grad_norm": 9.369732856750488, "learning_rate": 4.594807295939548e-06, "loss": 0.4045, "step": 4815 }, { "epoch": 0.5306887052341598, "grad_norm": 4.321104526519775, "learning_rate": 4.593064633829483e-06, "loss": 0.3466, "step": 4816 }, { "epoch": 0.5307988980716254, "grad_norm": 5.65838098526001, "learning_rate": 4.591322021480027e-06, "loss": 0.4088, "step": 4817 }, { "epoch": 0.5309090909090909, "grad_norm": 7.255067348480225, "learning_rate": 4.58957945910427e-06, "loss": 0.3777, "step": 4818 }, { "epoch": 0.5310192837465565, "grad_norm": 5.063383102416992, "learning_rate": 4.587836946915294e-06, "loss": 0.3763, "step": 4819 }, { "epoch": 0.531129476584022, "grad_norm": 8.16020393371582, "learning_rate": 4.586094485126175e-06, "loss": 0.5009, "step": 4820 }, { "epoch": 0.5312396694214876, "grad_norm": 5.291675567626953, "learning_rate": 4.584352073949986e-06, "loss": 0.4175, "step": 4821 }, { "epoch": 0.5313498622589532, "grad_norm": 7.2242536544799805, "learning_rate": 4.58260971359979e-06, "loss": 0.3968, "step": 4822 }, { "epoch": 0.5314600550964187, "grad_norm": 6.445919513702393, "learning_rate": 4.580867404288644e-06, "loss": 0.3964, "step": 4823 }, { "epoch": 0.5315702479338843, "grad_norm": 8.131845474243164, "learning_rate": 4.579125146229601e-06, "loss": 0.4372, "step": 4824 }, { "epoch": 0.5316804407713499, "grad_norm": 3.518427610397339, "learning_rate": 4.577382939635709e-06, "loss": 0.4025, "step": 4825 }, { "epoch": 0.5317906336088154, "grad_norm": 6.3140082359313965, "learning_rate": 4.575640784720003e-06, "loss": 0.4022, "step": 4826 }, { "epoch": 0.531900826446281, "grad_norm": 5.330982208251953, "learning_rate": 4.573898681695519e-06, "loss": 0.3742, "step": 4827 }, { "epoch": 0.5320110192837466, "grad_norm": 10.08835220336914, "learning_rate": 4.572156630775285e-06, "loss": 0.4913, "step": 4828 }, { "epoch": 0.5321212121212121, "grad_norm": 4.119600772857666, "learning_rate": 4.570414632172315e-06, "loss": 0.3523, "step": 4829 }, { "epoch": 0.5322314049586777, "grad_norm": 6.530597686767578, "learning_rate": 4.568672686099631e-06, "loss": 0.3307, "step": 4830 }, { "epoch": 0.5323415977961432, "grad_norm": 6.231881618499756, "learning_rate": 4.566930792770234e-06, "loss": 0.3645, "step": 4831 }, { "epoch": 0.5324517906336088, "grad_norm": 5.70702600479126, "learning_rate": 4.56518895239713e-06, "loss": 0.4019, "step": 4832 }, { "epoch": 0.5325619834710744, "grad_norm": 9.638153076171875, "learning_rate": 4.5634471651933125e-06, "loss": 0.4779, "step": 4833 }, { "epoch": 0.5326721763085399, "grad_norm": 7.994751930236816, "learning_rate": 4.561705431371766e-06, "loss": 0.3918, "step": 4834 }, { "epoch": 0.5327823691460055, "grad_norm": 9.847349166870117, "learning_rate": 4.559963751145477e-06, "loss": 0.4104, "step": 4835 }, { "epoch": 0.5328925619834711, "grad_norm": 6.911519527435303, "learning_rate": 4.5582221247274175e-06, "loss": 0.4688, "step": 4836 }, { "epoch": 0.5330027548209366, "grad_norm": 6.8758463859558105, "learning_rate": 4.556480552330555e-06, "loss": 0.3543, "step": 4837 }, { "epoch": 0.5331129476584022, "grad_norm": 6.75565767288208, "learning_rate": 4.554739034167855e-06, "loss": 0.4163, "step": 4838 }, { "epoch": 0.5332231404958677, "grad_norm": 6.917755126953125, "learning_rate": 4.552997570452271e-06, "loss": 0.4408, "step": 4839 }, { "epoch": 0.5333333333333333, "grad_norm": 5.163933277130127, "learning_rate": 4.551256161396749e-06, "loss": 0.4325, "step": 4840 }, { "epoch": 0.5334435261707989, "grad_norm": 9.991934776306152, "learning_rate": 4.549514807214235e-06, "loss": 0.4224, "step": 4841 }, { "epoch": 0.5335537190082644, "grad_norm": 5.259696006774902, "learning_rate": 4.547773508117663e-06, "loss": 0.379, "step": 4842 }, { "epoch": 0.53366391184573, "grad_norm": 4.730058193206787, "learning_rate": 4.5460322643199586e-06, "loss": 0.3877, "step": 4843 }, { "epoch": 0.5337741046831956, "grad_norm": 7.9191412925720215, "learning_rate": 4.5442910760340466e-06, "loss": 0.424, "step": 4844 }, { "epoch": 0.5338842975206611, "grad_norm": 5.3816704750061035, "learning_rate": 4.542549943472841e-06, "loss": 0.4266, "step": 4845 }, { "epoch": 0.5339944903581267, "grad_norm": 5.254730224609375, "learning_rate": 4.540808866849249e-06, "loss": 0.3668, "step": 4846 }, { "epoch": 0.5341046831955922, "grad_norm": 13.787521362304688, "learning_rate": 4.539067846376173e-06, "loss": 0.5494, "step": 4847 }, { "epoch": 0.5342148760330578, "grad_norm": 8.365334510803223, "learning_rate": 4.537326882266506e-06, "loss": 0.3826, "step": 4848 }, { "epoch": 0.5343250688705234, "grad_norm": 7.230132102966309, "learning_rate": 4.535585974733138e-06, "loss": 0.3738, "step": 4849 }, { "epoch": 0.5344352617079889, "grad_norm": 6.887158393859863, "learning_rate": 4.5338451239889465e-06, "loss": 0.4763, "step": 4850 }, { "epoch": 0.5345454545454545, "grad_norm": 8.59938907623291, "learning_rate": 4.532104330246807e-06, "loss": 0.4696, "step": 4851 }, { "epoch": 0.5346556473829202, "grad_norm": 7.57608699798584, "learning_rate": 4.530363593719585e-06, "loss": 0.4282, "step": 4852 }, { "epoch": 0.5347658402203856, "grad_norm": 6.073071479797363, "learning_rate": 4.5286229146201425e-06, "loss": 0.4261, "step": 4853 }, { "epoch": 0.5348760330578513, "grad_norm": 5.064709186553955, "learning_rate": 4.526882293161328e-06, "loss": 0.4004, "step": 4854 }, { "epoch": 0.5349862258953169, "grad_norm": 4.467386245727539, "learning_rate": 4.525141729555991e-06, "loss": 0.3556, "step": 4855 }, { "epoch": 0.5350964187327824, "grad_norm": 6.954026699066162, "learning_rate": 4.5234012240169686e-06, "loss": 0.4263, "step": 4856 }, { "epoch": 0.535206611570248, "grad_norm": 8.104146957397461, "learning_rate": 4.521660776757089e-06, "loss": 0.3759, "step": 4857 }, { "epoch": 0.5353168044077135, "grad_norm": 6.06421422958374, "learning_rate": 4.519920387989182e-06, "loss": 0.3673, "step": 4858 }, { "epoch": 0.5354269972451791, "grad_norm": 6.338195323944092, "learning_rate": 4.518180057926061e-06, "loss": 0.4627, "step": 4859 }, { "epoch": 0.5355371900826447, "grad_norm": 7.764068603515625, "learning_rate": 4.516439786780538e-06, "loss": 0.3891, "step": 4860 }, { "epoch": 0.5356473829201102, "grad_norm": 6.029236793518066, "learning_rate": 4.514699574765415e-06, "loss": 0.4353, "step": 4861 }, { "epoch": 0.5357575757575758, "grad_norm": 6.852285385131836, "learning_rate": 4.5129594220934856e-06, "loss": 0.379, "step": 4862 }, { "epoch": 0.5358677685950414, "grad_norm": 6.268723964691162, "learning_rate": 4.511219328977541e-06, "loss": 0.3621, "step": 4863 }, { "epoch": 0.5359779614325069, "grad_norm": 10.014451026916504, "learning_rate": 4.509479295630362e-06, "loss": 0.438, "step": 4864 }, { "epoch": 0.5360881542699725, "grad_norm": 11.356203079223633, "learning_rate": 4.507739322264717e-06, "loss": 0.4089, "step": 4865 }, { "epoch": 0.536198347107438, "grad_norm": 10.65330982208252, "learning_rate": 4.50599940909338e-06, "loss": 0.4433, "step": 4866 }, { "epoch": 0.5363085399449036, "grad_norm": 9.675298690795898, "learning_rate": 4.504259556329105e-06, "loss": 0.4201, "step": 4867 }, { "epoch": 0.5364187327823692, "grad_norm": 8.174824714660645, "learning_rate": 4.5025197641846445e-06, "loss": 0.4172, "step": 4868 }, { "epoch": 0.5365289256198347, "grad_norm": 13.778491973876953, "learning_rate": 4.5007800328727435e-06, "loss": 0.5102, "step": 4869 }, { "epoch": 0.5366391184573003, "grad_norm": 7.946699619293213, "learning_rate": 4.499040362606139e-06, "loss": 0.41, "step": 4870 }, { "epoch": 0.5367493112947659, "grad_norm": 4.874997615814209, "learning_rate": 4.497300753597557e-06, "loss": 0.425, "step": 4871 }, { "epoch": 0.5368595041322314, "grad_norm": 8.760504722595215, "learning_rate": 4.495561206059723e-06, "loss": 0.4714, "step": 4872 }, { "epoch": 0.536969696969697, "grad_norm": 5.017094612121582, "learning_rate": 4.49382172020535e-06, "loss": 0.4442, "step": 4873 }, { "epoch": 0.5370798898071625, "grad_norm": 5.726899147033691, "learning_rate": 4.492082296247145e-06, "loss": 0.4706, "step": 4874 }, { "epoch": 0.5371900826446281, "grad_norm": 5.397989273071289, "learning_rate": 4.490342934397807e-06, "loss": 0.3746, "step": 4875 }, { "epoch": 0.5373002754820937, "grad_norm": 4.507936477661133, "learning_rate": 4.488603634870026e-06, "loss": 0.3411, "step": 4876 }, { "epoch": 0.5374104683195592, "grad_norm": 5.176347255706787, "learning_rate": 4.486864397876488e-06, "loss": 0.4165, "step": 4877 }, { "epoch": 0.5375206611570248, "grad_norm": 12.070116996765137, "learning_rate": 4.485125223629868e-06, "loss": 0.5233, "step": 4878 }, { "epoch": 0.5376308539944904, "grad_norm": 11.164689064025879, "learning_rate": 4.4833861123428355e-06, "loss": 0.4604, "step": 4879 }, { "epoch": 0.5377410468319559, "grad_norm": 7.950995445251465, "learning_rate": 4.481647064228051e-06, "loss": 0.402, "step": 4880 }, { "epoch": 0.5378512396694215, "grad_norm": 8.600005149841309, "learning_rate": 4.479908079498168e-06, "loss": 0.4309, "step": 4881 }, { "epoch": 0.5379614325068871, "grad_norm": 6.681096076965332, "learning_rate": 4.47816915836583e-06, "loss": 0.4075, "step": 4882 }, { "epoch": 0.5380716253443526, "grad_norm": 6.604003429412842, "learning_rate": 4.476430301043678e-06, "loss": 0.4222, "step": 4883 }, { "epoch": 0.5381818181818182, "grad_norm": 5.344298362731934, "learning_rate": 4.474691507744339e-06, "loss": 0.4041, "step": 4884 }, { "epoch": 0.5382920110192837, "grad_norm": 7.361419200897217, "learning_rate": 4.472952778680436e-06, "loss": 0.427, "step": 4885 }, { "epoch": 0.5384022038567493, "grad_norm": 5.2496514320373535, "learning_rate": 4.4712141140645835e-06, "loss": 0.4253, "step": 4886 }, { "epoch": 0.5385123966942149, "grad_norm": 6.846745491027832, "learning_rate": 4.469475514109387e-06, "loss": 0.3627, "step": 4887 }, { "epoch": 0.5386225895316804, "grad_norm": 5.707091808319092, "learning_rate": 4.467736979027445e-06, "loss": 0.4314, "step": 4888 }, { "epoch": 0.538732782369146, "grad_norm": 5.808233737945557, "learning_rate": 4.46599850903135e-06, "loss": 0.4623, "step": 4889 }, { "epoch": 0.5388429752066116, "grad_norm": 5.639697551727295, "learning_rate": 4.46426010433368e-06, "loss": 0.3758, "step": 4890 }, { "epoch": 0.5389531680440771, "grad_norm": 7.901959419250488, "learning_rate": 4.462521765147014e-06, "loss": 0.341, "step": 4891 }, { "epoch": 0.5390633608815427, "grad_norm": 5.190210819244385, "learning_rate": 4.460783491683917e-06, "loss": 0.4943, "step": 4892 }, { "epoch": 0.5391735537190082, "grad_norm": 5.680449485778809, "learning_rate": 4.4590452841569446e-06, "loss": 0.383, "step": 4893 }, { "epoch": 0.5392837465564738, "grad_norm": 6.7000250816345215, "learning_rate": 4.457307142778653e-06, "loss": 0.396, "step": 4894 }, { "epoch": 0.5393939393939394, "grad_norm": 5.925076961517334, "learning_rate": 4.45556906776158e-06, "loss": 0.4445, "step": 4895 }, { "epoch": 0.5395041322314049, "grad_norm": 8.168147087097168, "learning_rate": 4.453831059318259e-06, "loss": 0.327, "step": 4896 }, { "epoch": 0.5396143250688705, "grad_norm": 5.0076165199279785, "learning_rate": 4.452093117661221e-06, "loss": 0.402, "step": 4897 }, { "epoch": 0.5397245179063361, "grad_norm": 6.923157215118408, "learning_rate": 4.450355243002979e-06, "loss": 0.4016, "step": 4898 }, { "epoch": 0.5398347107438016, "grad_norm": 4.7264838218688965, "learning_rate": 4.448617435556044e-06, "loss": 0.3793, "step": 4899 }, { "epoch": 0.5399449035812672, "grad_norm": 5.012410640716553, "learning_rate": 4.446879695532919e-06, "loss": 0.3114, "step": 4900 }, { "epoch": 0.5400550964187327, "grad_norm": 9.185056686401367, "learning_rate": 4.445142023146095e-06, "loss": 0.4989, "step": 4901 }, { "epoch": 0.5401652892561983, "grad_norm": 7.158074855804443, "learning_rate": 4.44340441860806e-06, "loss": 0.4307, "step": 4902 }, { "epoch": 0.5402754820936639, "grad_norm": 5.751270771026611, "learning_rate": 4.441666882131288e-06, "loss": 0.3339, "step": 4903 }, { "epoch": 0.5403856749311294, "grad_norm": 4.243267059326172, "learning_rate": 4.439929413928247e-06, "loss": 0.401, "step": 4904 }, { "epoch": 0.540495867768595, "grad_norm": 5.914028167724609, "learning_rate": 4.438192014211398e-06, "loss": 0.3814, "step": 4905 }, { "epoch": 0.5406060606060606, "grad_norm": 6.544396877288818, "learning_rate": 4.4364546831931945e-06, "loss": 0.3849, "step": 4906 }, { "epoch": 0.5407162534435261, "grad_norm": 6.030860424041748, "learning_rate": 4.434717421086076e-06, "loss": 0.4617, "step": 4907 }, { "epoch": 0.5408264462809917, "grad_norm": 5.7662248611450195, "learning_rate": 4.43298022810248e-06, "loss": 0.425, "step": 4908 }, { "epoch": 0.5409366391184574, "grad_norm": 4.316056251525879, "learning_rate": 4.431243104454833e-06, "loss": 0.3816, "step": 4909 }, { "epoch": 0.5410468319559228, "grad_norm": 10.407288551330566, "learning_rate": 4.42950605035555e-06, "loss": 0.4001, "step": 4910 }, { "epoch": 0.5411570247933885, "grad_norm": 8.184249877929688, "learning_rate": 4.427769066017043e-06, "loss": 0.4545, "step": 4911 }, { "epoch": 0.541267217630854, "grad_norm": 7.201406478881836, "learning_rate": 4.426032151651712e-06, "loss": 0.5168, "step": 4912 }, { "epoch": 0.5413774104683196, "grad_norm": 6.947605133056641, "learning_rate": 4.424295307471948e-06, "loss": 0.413, "step": 4913 }, { "epoch": 0.5414876033057852, "grad_norm": 5.5525102615356445, "learning_rate": 4.422558533690136e-06, "loss": 0.392, "step": 4914 }, { "epoch": 0.5415977961432507, "grad_norm": 7.134677410125732, "learning_rate": 4.420821830518652e-06, "loss": 0.4661, "step": 4915 }, { "epoch": 0.5417079889807163, "grad_norm": 6.866796016693115, "learning_rate": 4.419085198169861e-06, "loss": 0.4207, "step": 4916 }, { "epoch": 0.5418181818181819, "grad_norm": 6.162034511566162, "learning_rate": 4.417348636856121e-06, "loss": 0.4213, "step": 4917 }, { "epoch": 0.5419283746556474, "grad_norm": 11.50009536743164, "learning_rate": 4.415612146789781e-06, "loss": 0.4765, "step": 4918 }, { "epoch": 0.542038567493113, "grad_norm": 6.429296016693115, "learning_rate": 4.413875728183181e-06, "loss": 0.4699, "step": 4919 }, { "epoch": 0.5421487603305785, "grad_norm": 8.378180503845215, "learning_rate": 4.412139381248655e-06, "loss": 0.5049, "step": 4920 }, { "epoch": 0.5422589531680441, "grad_norm": 5.784651279449463, "learning_rate": 4.410403106198521e-06, "loss": 0.3182, "step": 4921 }, { "epoch": 0.5423691460055097, "grad_norm": 8.84086799621582, "learning_rate": 4.408666903245098e-06, "loss": 0.4641, "step": 4922 }, { "epoch": 0.5424793388429752, "grad_norm": 7.057933807373047, "learning_rate": 4.406930772600691e-06, "loss": 0.4934, "step": 4923 }, { "epoch": 0.5425895316804408, "grad_norm": 6.431195259094238, "learning_rate": 4.40519471447759e-06, "loss": 0.3925, "step": 4924 }, { "epoch": 0.5426997245179064, "grad_norm": 6.534265041351318, "learning_rate": 4.403458729088092e-06, "loss": 0.3092, "step": 4925 }, { "epoch": 0.5428099173553719, "grad_norm": 7.719228744506836, "learning_rate": 4.4017228166444696e-06, "loss": 0.3748, "step": 4926 }, { "epoch": 0.5429201101928375, "grad_norm": 8.462837219238281, "learning_rate": 4.399986977358992e-06, "loss": 0.4354, "step": 4927 }, { "epoch": 0.5430303030303031, "grad_norm": 5.98094367980957, "learning_rate": 4.398251211443923e-06, "loss": 0.3639, "step": 4928 }, { "epoch": 0.5431404958677686, "grad_norm": 6.449747085571289, "learning_rate": 4.396515519111512e-06, "loss": 0.471, "step": 4929 }, { "epoch": 0.5432506887052342, "grad_norm": 4.866857051849365, "learning_rate": 4.3947799005740065e-06, "loss": 0.3987, "step": 4930 }, { "epoch": 0.5433608815426997, "grad_norm": 5.388254642486572, "learning_rate": 4.3930443560436346e-06, "loss": 0.4468, "step": 4931 }, { "epoch": 0.5434710743801653, "grad_norm": 5.047433376312256, "learning_rate": 4.391308885732622e-06, "loss": 0.406, "step": 4932 }, { "epoch": 0.5435812672176309, "grad_norm": 5.697027683258057, "learning_rate": 4.3895734898531885e-06, "loss": 0.4061, "step": 4933 }, { "epoch": 0.5436914600550964, "grad_norm": 6.802773475646973, "learning_rate": 4.387838168617536e-06, "loss": 0.3757, "step": 4934 }, { "epoch": 0.543801652892562, "grad_norm": 5.0467705726623535, "learning_rate": 4.386102922237864e-06, "loss": 0.4016, "step": 4935 }, { "epoch": 0.5439118457300276, "grad_norm": 4.891822814941406, "learning_rate": 4.384367750926362e-06, "loss": 0.3805, "step": 4936 }, { "epoch": 0.5440220385674931, "grad_norm": 6.10658597946167, "learning_rate": 4.382632654895206e-06, "loss": 0.4592, "step": 4937 }, { "epoch": 0.5441322314049587, "grad_norm": 13.864315032958984, "learning_rate": 4.380897634356567e-06, "loss": 0.4804, "step": 4938 }, { "epoch": 0.5442424242424242, "grad_norm": 5.945678234100342, "learning_rate": 4.379162689522608e-06, "loss": 0.3606, "step": 4939 }, { "epoch": 0.5443526170798898, "grad_norm": 8.377779960632324, "learning_rate": 4.377427820605479e-06, "loss": 0.3898, "step": 4940 }, { "epoch": 0.5444628099173554, "grad_norm": 6.678970813751221, "learning_rate": 4.3756930278173196e-06, "loss": 0.4064, "step": 4941 }, { "epoch": 0.5445730027548209, "grad_norm": 4.300466060638428, "learning_rate": 4.373958311370266e-06, "loss": 0.4041, "step": 4942 }, { "epoch": 0.5446831955922865, "grad_norm": 6.053644180297852, "learning_rate": 4.37222367147644e-06, "loss": 0.3838, "step": 4943 }, { "epoch": 0.5447933884297521, "grad_norm": 5.004225730895996, "learning_rate": 4.3704891083479575e-06, "loss": 0.4211, "step": 4944 }, { "epoch": 0.5449035812672176, "grad_norm": 8.41515064239502, "learning_rate": 4.368754622196921e-06, "loss": 0.416, "step": 4945 }, { "epoch": 0.5450137741046832, "grad_norm": 7.956199645996094, "learning_rate": 4.367020213235426e-06, "loss": 0.4295, "step": 4946 }, { "epoch": 0.5451239669421487, "grad_norm": 5.374857425689697, "learning_rate": 4.365285881675561e-06, "loss": 0.4256, "step": 4947 }, { "epoch": 0.5452341597796143, "grad_norm": 7.09722900390625, "learning_rate": 4.3635516277294e-06, "loss": 0.3855, "step": 4948 }, { "epoch": 0.5453443526170799, "grad_norm": 4.6613030433654785, "learning_rate": 4.361817451609008e-06, "loss": 0.3554, "step": 4949 }, { "epoch": 0.5454545454545454, "grad_norm": 4.342301368713379, "learning_rate": 4.360083353526447e-06, "loss": 0.4192, "step": 4950 }, { "epoch": 0.545564738292011, "grad_norm": 5.399786949157715, "learning_rate": 4.358349333693765e-06, "loss": 0.4142, "step": 4951 }, { "epoch": 0.5456749311294766, "grad_norm": 8.767022132873535, "learning_rate": 4.356615392322994e-06, "loss": 0.411, "step": 4952 }, { "epoch": 0.5457851239669421, "grad_norm": 9.512101173400879, "learning_rate": 4.35488152962617e-06, "loss": 0.4414, "step": 4953 }, { "epoch": 0.5458953168044077, "grad_norm": 8.008020401000977, "learning_rate": 4.353147745815308e-06, "loss": 0.3263, "step": 4954 }, { "epoch": 0.5460055096418733, "grad_norm": 6.651403903961182, "learning_rate": 4.3514140411024156e-06, "loss": 0.3493, "step": 4955 }, { "epoch": 0.5461157024793388, "grad_norm": 6.7860331535339355, "learning_rate": 4.349680415699499e-06, "loss": 0.402, "step": 4956 }, { "epoch": 0.5462258953168044, "grad_norm": 5.144118785858154, "learning_rate": 4.347946869818541e-06, "loss": 0.411, "step": 4957 }, { "epoch": 0.5463360881542699, "grad_norm": 12.622493743896484, "learning_rate": 4.346213403671529e-06, "loss": 0.501, "step": 4958 }, { "epoch": 0.5464462809917355, "grad_norm": 8.2738618850708, "learning_rate": 4.344480017470429e-06, "loss": 0.3893, "step": 4959 }, { "epoch": 0.5465564738292011, "grad_norm": 6.189448356628418, "learning_rate": 4.342746711427202e-06, "loss": 0.388, "step": 4960 }, { "epoch": 0.5466666666666666, "grad_norm": 6.918076515197754, "learning_rate": 4.3410134857538e-06, "loss": 0.4298, "step": 4961 }, { "epoch": 0.5467768595041322, "grad_norm": 5.94843053817749, "learning_rate": 4.339280340662165e-06, "loss": 0.3492, "step": 4962 }, { "epoch": 0.5468870523415978, "grad_norm": 12.531344413757324, "learning_rate": 4.337547276364225e-06, "loss": 0.5248, "step": 4963 }, { "epoch": 0.5469972451790633, "grad_norm": 5.890338897705078, "learning_rate": 4.335814293071907e-06, "loss": 0.3161, "step": 4964 }, { "epoch": 0.547107438016529, "grad_norm": 5.140368938446045, "learning_rate": 4.334081390997119e-06, "loss": 0.4451, "step": 4965 }, { "epoch": 0.5472176308539944, "grad_norm": 5.555690288543701, "learning_rate": 4.332348570351761e-06, "loss": 0.4427, "step": 4966 }, { "epoch": 0.54732782369146, "grad_norm": 6.685448169708252, "learning_rate": 4.330615831347729e-06, "loss": 0.3855, "step": 4967 }, { "epoch": 0.5474380165289257, "grad_norm": 5.776802062988281, "learning_rate": 4.328883174196901e-06, "loss": 0.3634, "step": 4968 }, { "epoch": 0.5475482093663911, "grad_norm": 6.580652713775635, "learning_rate": 4.32715059911115e-06, "loss": 0.4399, "step": 4969 }, { "epoch": 0.5476584022038568, "grad_norm": 5.1589813232421875, "learning_rate": 4.325418106302339e-06, "loss": 0.3577, "step": 4970 }, { "epoch": 0.5477685950413224, "grad_norm": 5.44032621383667, "learning_rate": 4.323685695982316e-06, "loss": 0.4031, "step": 4971 }, { "epoch": 0.5478787878787879, "grad_norm": 8.49659252166748, "learning_rate": 4.321953368362927e-06, "loss": 0.4534, "step": 4972 }, { "epoch": 0.5479889807162535, "grad_norm": 7.460810661315918, "learning_rate": 4.320221123656001e-06, "loss": 0.4431, "step": 4973 }, { "epoch": 0.548099173553719, "grad_norm": 9.8892183303833, "learning_rate": 4.318488962073358e-06, "loss": 0.4066, "step": 4974 }, { "epoch": 0.5482093663911846, "grad_norm": 6.912549018859863, "learning_rate": 4.316756883826811e-06, "loss": 0.4044, "step": 4975 }, { "epoch": 0.5483195592286502, "grad_norm": 4.6018757820129395, "learning_rate": 4.315024889128161e-06, "loss": 0.3947, "step": 4976 }, { "epoch": 0.5484297520661157, "grad_norm": 4.4889726638793945, "learning_rate": 4.313292978189197e-06, "loss": 0.4163, "step": 4977 }, { "epoch": 0.5485399449035813, "grad_norm": 5.36277437210083, "learning_rate": 4.311561151221702e-06, "loss": 0.416, "step": 4978 }, { "epoch": 0.5486501377410469, "grad_norm": 6.793168067932129, "learning_rate": 4.309829408437446e-06, "loss": 0.3331, "step": 4979 }, { "epoch": 0.5487603305785124, "grad_norm": 6.599664688110352, "learning_rate": 4.308097750048183e-06, "loss": 0.3934, "step": 4980 }, { "epoch": 0.548870523415978, "grad_norm": 7.5874714851379395, "learning_rate": 4.30636617626567e-06, "loss": 0.4842, "step": 4981 }, { "epoch": 0.5489807162534436, "grad_norm": 4.51835298538208, "learning_rate": 4.304634687301642e-06, "loss": 0.3492, "step": 4982 }, { "epoch": 0.5490909090909091, "grad_norm": 5.6787519454956055, "learning_rate": 4.302903283367828e-06, "loss": 0.4191, "step": 4983 }, { "epoch": 0.5492011019283747, "grad_norm": 7.3988542556762695, "learning_rate": 4.30117196467595e-06, "loss": 0.3969, "step": 4984 }, { "epoch": 0.5493112947658402, "grad_norm": 7.988712787628174, "learning_rate": 4.2994407314377105e-06, "loss": 0.5196, "step": 4985 }, { "epoch": 0.5494214876033058, "grad_norm": 4.772488594055176, "learning_rate": 4.297709583864813e-06, "loss": 0.3467, "step": 4986 }, { "epoch": 0.5495316804407714, "grad_norm": 4.936898231506348, "learning_rate": 4.2959785221689406e-06, "loss": 0.3985, "step": 4987 }, { "epoch": 0.5496418732782369, "grad_norm": 8.552613258361816, "learning_rate": 4.294247546561768e-06, "loss": 0.3919, "step": 4988 }, { "epoch": 0.5497520661157025, "grad_norm": 4.918736457824707, "learning_rate": 4.2925166572549685e-06, "loss": 0.3695, "step": 4989 }, { "epoch": 0.5498622589531681, "grad_norm": 13.8289213180542, "learning_rate": 4.290785854460191e-06, "loss": 0.4322, "step": 4990 }, { "epoch": 0.5499724517906336, "grad_norm": 7.58026647567749, "learning_rate": 4.289055138389082e-06, "loss": 0.3489, "step": 4991 }, { "epoch": 0.5500826446280992, "grad_norm": 9.824568748474121, "learning_rate": 4.2873245092532776e-06, "loss": 0.435, "step": 4992 }, { "epoch": 0.5501928374655647, "grad_norm": 7.381153583526611, "learning_rate": 4.2855939672644e-06, "loss": 0.4866, "step": 4993 }, { "epoch": 0.5503030303030303, "grad_norm": 12.453740119934082, "learning_rate": 4.283863512634061e-06, "loss": 0.4905, "step": 4994 }, { "epoch": 0.5504132231404959, "grad_norm": 6.226518154144287, "learning_rate": 4.282133145573867e-06, "loss": 0.39, "step": 4995 }, { "epoch": 0.5505234159779614, "grad_norm": 6.088111400604248, "learning_rate": 4.280402866295406e-06, "loss": 0.3693, "step": 4996 }, { "epoch": 0.550633608815427, "grad_norm": 6.662594318389893, "learning_rate": 4.2786726750102596e-06, "loss": 0.3761, "step": 4997 }, { "epoch": 0.5507438016528926, "grad_norm": 4.674248695373535, "learning_rate": 4.276942571929998e-06, "loss": 0.3635, "step": 4998 }, { "epoch": 0.5508539944903581, "grad_norm": 11.55190658569336, "learning_rate": 4.275212557266181e-06, "loss": 0.4362, "step": 4999 }, { "epoch": 0.5509641873278237, "grad_norm": 4.43834924697876, "learning_rate": 4.273482631230358e-06, "loss": 0.3354, "step": 5000 }, { "epoch": 0.5510743801652892, "grad_norm": 9.896891593933105, "learning_rate": 4.271752794034065e-06, "loss": 0.4346, "step": 5001 }, { "epoch": 0.5511845730027548, "grad_norm": 5.692206382751465, "learning_rate": 4.270023045888829e-06, "loss": 0.3872, "step": 5002 }, { "epoch": 0.5512947658402204, "grad_norm": 4.49644660949707, "learning_rate": 4.268293387006168e-06, "loss": 0.4012, "step": 5003 }, { "epoch": 0.5514049586776859, "grad_norm": 6.0624518394470215, "learning_rate": 4.2665638175975854e-06, "loss": 0.4274, "step": 5004 }, { "epoch": 0.5515151515151515, "grad_norm": 5.555050373077393, "learning_rate": 4.264834337874574e-06, "loss": 0.389, "step": 5005 }, { "epoch": 0.5516253443526171, "grad_norm": 7.829788684844971, "learning_rate": 4.263104948048619e-06, "loss": 0.3417, "step": 5006 }, { "epoch": 0.5517355371900826, "grad_norm": 5.923528671264648, "learning_rate": 4.261375648331194e-06, "loss": 0.3599, "step": 5007 }, { "epoch": 0.5518457300275482, "grad_norm": 9.991193771362305, "learning_rate": 4.259646438933755e-06, "loss": 0.4564, "step": 5008 }, { "epoch": 0.5519559228650138, "grad_norm": 4.449314594268799, "learning_rate": 4.257917320067756e-06, "loss": 0.3436, "step": 5009 }, { "epoch": 0.5520661157024793, "grad_norm": 5.419429779052734, "learning_rate": 4.256188291944637e-06, "loss": 0.4088, "step": 5010 }, { "epoch": 0.5521763085399449, "grad_norm": 7.611926555633545, "learning_rate": 4.2544593547758214e-06, "loss": 0.3403, "step": 5011 }, { "epoch": 0.5522865013774104, "grad_norm": 7.113373279571533, "learning_rate": 4.252730508772731e-06, "loss": 0.3637, "step": 5012 }, { "epoch": 0.552396694214876, "grad_norm": 5.663134574890137, "learning_rate": 4.251001754146766e-06, "loss": 0.4462, "step": 5013 }, { "epoch": 0.5525068870523416, "grad_norm": 6.283078193664551, "learning_rate": 4.249273091109327e-06, "loss": 0.3731, "step": 5014 }, { "epoch": 0.5526170798898071, "grad_norm": 7.763297080993652, "learning_rate": 4.247544519871793e-06, "loss": 0.4351, "step": 5015 }, { "epoch": 0.5527272727272727, "grad_norm": 4.79278564453125, "learning_rate": 4.2458160406455355e-06, "loss": 0.3428, "step": 5016 }, { "epoch": 0.5528374655647383, "grad_norm": 8.19612979888916, "learning_rate": 4.24408765364192e-06, "loss": 0.366, "step": 5017 }, { "epoch": 0.5529476584022038, "grad_norm": 8.124463081359863, "learning_rate": 4.2423593590722925e-06, "loss": 0.3977, "step": 5018 }, { "epoch": 0.5530578512396694, "grad_norm": 7.299890041351318, "learning_rate": 4.240631157147989e-06, "loss": 0.3687, "step": 5019 }, { "epoch": 0.5531680440771349, "grad_norm": 6.376744747161865, "learning_rate": 4.238903048080342e-06, "loss": 0.3768, "step": 5020 }, { "epoch": 0.5532782369146005, "grad_norm": 10.836827278137207, "learning_rate": 4.237175032080664e-06, "loss": 0.4017, "step": 5021 }, { "epoch": 0.5533884297520661, "grad_norm": 5.046148777008057, "learning_rate": 4.235447109360257e-06, "loss": 0.3483, "step": 5022 }, { "epoch": 0.5534986225895316, "grad_norm": 5.973788261413574, "learning_rate": 4.233719280130418e-06, "loss": 0.4942, "step": 5023 }, { "epoch": 0.5536088154269972, "grad_norm": 9.961180686950684, "learning_rate": 4.231991544602426e-06, "loss": 0.3557, "step": 5024 }, { "epoch": 0.5537190082644629, "grad_norm": 9.15896224975586, "learning_rate": 4.23026390298755e-06, "loss": 0.3715, "step": 5025 }, { "epoch": 0.5538292011019283, "grad_norm": 5.100748062133789, "learning_rate": 4.228536355497051e-06, "loss": 0.4171, "step": 5026 }, { "epoch": 0.553939393939394, "grad_norm": 7.906282424926758, "learning_rate": 4.226808902342174e-06, "loss": 0.4004, "step": 5027 }, { "epoch": 0.5540495867768596, "grad_norm": 6.108539581298828, "learning_rate": 4.225081543734153e-06, "loss": 0.3568, "step": 5028 }, { "epoch": 0.554159779614325, "grad_norm": 9.712579727172852, "learning_rate": 4.223354279884216e-06, "loss": 0.4134, "step": 5029 }, { "epoch": 0.5542699724517907, "grad_norm": 4.735379219055176, "learning_rate": 4.221627111003571e-06, "loss": 0.4288, "step": 5030 }, { "epoch": 0.5543801652892562, "grad_norm": 7.070766925811768, "learning_rate": 4.219900037303421e-06, "loss": 0.3591, "step": 5031 }, { "epoch": 0.5544903581267218, "grad_norm": 5.8401031494140625, "learning_rate": 4.2181730589949546e-06, "loss": 0.4314, "step": 5032 }, { "epoch": 0.5546005509641874, "grad_norm": 7.939309597015381, "learning_rate": 4.216446176289346e-06, "loss": 0.4832, "step": 5033 }, { "epoch": 0.5547107438016529, "grad_norm": 5.133927345275879, "learning_rate": 4.214719389397766e-06, "loss": 0.3338, "step": 5034 }, { "epoch": 0.5548209366391185, "grad_norm": 8.18437671661377, "learning_rate": 4.212992698531366e-06, "loss": 0.4691, "step": 5035 }, { "epoch": 0.5549311294765841, "grad_norm": 9.384045600891113, "learning_rate": 4.211266103901286e-06, "loss": 0.4659, "step": 5036 }, { "epoch": 0.5550413223140496, "grad_norm": 23.223127365112305, "learning_rate": 4.209539605718659e-06, "loss": 0.412, "step": 5037 }, { "epoch": 0.5551515151515152, "grad_norm": 6.536337375640869, "learning_rate": 4.207813204194604e-06, "loss": 0.3964, "step": 5038 }, { "epoch": 0.5552617079889807, "grad_norm": 10.770186424255371, "learning_rate": 4.2060868995402235e-06, "loss": 0.5334, "step": 5039 }, { "epoch": 0.5553719008264463, "grad_norm": 6.0237040519714355, "learning_rate": 4.204360691966618e-06, "loss": 0.4517, "step": 5040 }, { "epoch": 0.5554820936639119, "grad_norm": 7.980887413024902, "learning_rate": 4.202634581684865e-06, "loss": 0.4244, "step": 5041 }, { "epoch": 0.5555922865013774, "grad_norm": 5.497826099395752, "learning_rate": 4.200908568906041e-06, "loss": 0.4697, "step": 5042 }, { "epoch": 0.555702479338843, "grad_norm": 7.60978364944458, "learning_rate": 4.199182653841203e-06, "loss": 0.3848, "step": 5043 }, { "epoch": 0.5558126721763086, "grad_norm": 5.508256435394287, "learning_rate": 4.1974568367013955e-06, "loss": 0.3393, "step": 5044 }, { "epoch": 0.5559228650137741, "grad_norm": 9.76912784576416, "learning_rate": 4.195731117697659e-06, "loss": 0.4678, "step": 5045 }, { "epoch": 0.5560330578512397, "grad_norm": 11.024624824523926, "learning_rate": 4.194005497041012e-06, "loss": 0.4008, "step": 5046 }, { "epoch": 0.5561432506887052, "grad_norm": 9.174602508544922, "learning_rate": 4.192279974942468e-06, "loss": 0.4359, "step": 5047 }, { "epoch": 0.5562534435261708, "grad_norm": 4.278285503387451, "learning_rate": 4.190554551613027e-06, "loss": 0.3259, "step": 5048 }, { "epoch": 0.5563636363636364, "grad_norm": 5.89907169342041, "learning_rate": 4.188829227263674e-06, "loss": 0.4437, "step": 5049 }, { "epoch": 0.5564738292011019, "grad_norm": 5.54419469833374, "learning_rate": 4.187104002105384e-06, "loss": 0.3755, "step": 5050 }, { "epoch": 0.5565840220385675, "grad_norm": 7.27056884765625, "learning_rate": 4.185378876349121e-06, "loss": 0.3301, "step": 5051 }, { "epoch": 0.5566942148760331, "grad_norm": 4.663144111633301, "learning_rate": 4.183653850205837e-06, "loss": 0.48, "step": 5052 }, { "epoch": 0.5568044077134986, "grad_norm": 7.8643975257873535, "learning_rate": 4.181928923886468e-06, "loss": 0.4575, "step": 5053 }, { "epoch": 0.5569146005509642, "grad_norm": 10.424306869506836, "learning_rate": 4.1802040976019424e-06, "loss": 0.4388, "step": 5054 }, { "epoch": 0.5570247933884298, "grad_norm": 4.337207794189453, "learning_rate": 4.178479371563172e-06, "loss": 0.468, "step": 5055 }, { "epoch": 0.5571349862258953, "grad_norm": 9.239445686340332, "learning_rate": 4.176754745981061e-06, "loss": 0.4442, "step": 5056 }, { "epoch": 0.5572451790633609, "grad_norm": 6.625512599945068, "learning_rate": 4.175030221066497e-06, "loss": 0.3327, "step": 5057 }, { "epoch": 0.5573553719008264, "grad_norm": 6.2668070793151855, "learning_rate": 4.173305797030359e-06, "loss": 0.4486, "step": 5058 }, { "epoch": 0.557465564738292, "grad_norm": 5.827762603759766, "learning_rate": 4.171581474083511e-06, "loss": 0.3978, "step": 5059 }, { "epoch": 0.5575757575757576, "grad_norm": 6.255558490753174, "learning_rate": 4.169857252436806e-06, "loss": 0.367, "step": 5060 }, { "epoch": 0.5576859504132231, "grad_norm": 9.605900764465332, "learning_rate": 4.168133132301082e-06, "loss": 0.5043, "step": 5061 }, { "epoch": 0.5577961432506887, "grad_norm": 5.620938301086426, "learning_rate": 4.16640911388717e-06, "loss": 0.3788, "step": 5062 }, { "epoch": 0.5579063360881543, "grad_norm": 4.651265621185303, "learning_rate": 4.164685197405884e-06, "loss": 0.3796, "step": 5063 }, { "epoch": 0.5580165289256198, "grad_norm": 5.623221397399902, "learning_rate": 4.162961383068027e-06, "loss": 0.3577, "step": 5064 }, { "epoch": 0.5581267217630854, "grad_norm": 9.15078067779541, "learning_rate": 4.161237671084388e-06, "loss": 0.45, "step": 5065 }, { "epoch": 0.5582369146005509, "grad_norm": 4.805917739868164, "learning_rate": 4.159514061665748e-06, "loss": 0.3958, "step": 5066 }, { "epoch": 0.5583471074380165, "grad_norm": 6.629087448120117, "learning_rate": 4.157790555022867e-06, "loss": 0.3458, "step": 5067 }, { "epoch": 0.5584573002754821, "grad_norm": 10.288884162902832, "learning_rate": 4.156067151366504e-06, "loss": 0.4493, "step": 5068 }, { "epoch": 0.5585674931129476, "grad_norm": 4.732473850250244, "learning_rate": 4.154343850907393e-06, "loss": 0.4328, "step": 5069 }, { "epoch": 0.5586776859504132, "grad_norm": 5.083303928375244, "learning_rate": 4.152620653856267e-06, "loss": 0.3709, "step": 5070 }, { "epoch": 0.5587878787878788, "grad_norm": 7.439194202423096, "learning_rate": 4.150897560423839e-06, "loss": 0.4592, "step": 5071 }, { "epoch": 0.5588980716253443, "grad_norm": 6.688822269439697, "learning_rate": 4.149174570820809e-06, "loss": 0.3108, "step": 5072 }, { "epoch": 0.5590082644628099, "grad_norm": 5.189765453338623, "learning_rate": 4.1474516852578695e-06, "loss": 0.3649, "step": 5073 }, { "epoch": 0.5591184573002754, "grad_norm": 6.02034330368042, "learning_rate": 4.145728903945696e-06, "loss": 0.4075, "step": 5074 }, { "epoch": 0.559228650137741, "grad_norm": 6.01516056060791, "learning_rate": 4.14400622709495e-06, "loss": 0.3907, "step": 5075 }, { "epoch": 0.5593388429752066, "grad_norm": 8.235849380493164, "learning_rate": 4.142283654916288e-06, "loss": 0.4458, "step": 5076 }, { "epoch": 0.5594490358126721, "grad_norm": 6.125243186950684, "learning_rate": 4.1405611876203455e-06, "loss": 0.456, "step": 5077 }, { "epoch": 0.5595592286501377, "grad_norm": 5.977057456970215, "learning_rate": 4.138838825417747e-06, "loss": 0.3886, "step": 5078 }, { "epoch": 0.5596694214876033, "grad_norm": 6.936330795288086, "learning_rate": 4.137116568519108e-06, "loss": 0.3994, "step": 5079 }, { "epoch": 0.5597796143250688, "grad_norm": 6.488701820373535, "learning_rate": 4.135394417135027e-06, "loss": 0.4766, "step": 5080 }, { "epoch": 0.5598898071625344, "grad_norm": 6.146881103515625, "learning_rate": 4.133672371476091e-06, "loss": 0.4505, "step": 5081 }, { "epoch": 0.56, "grad_norm": 11.271879196166992, "learning_rate": 4.131950431752873e-06, "loss": 0.3688, "step": 5082 }, { "epoch": 0.5601101928374655, "grad_norm": 5.499881744384766, "learning_rate": 4.130228598175936e-06, "loss": 0.405, "step": 5083 }, { "epoch": 0.5602203856749312, "grad_norm": 5.5053391456604, "learning_rate": 4.1285068709558285e-06, "loss": 0.4246, "step": 5084 }, { "epoch": 0.5603305785123966, "grad_norm": 5.651018142700195, "learning_rate": 4.126785250303084e-06, "loss": 0.4212, "step": 5085 }, { "epoch": 0.5604407713498623, "grad_norm": 5.410347938537598, "learning_rate": 4.1250637364282246e-06, "loss": 0.4234, "step": 5086 }, { "epoch": 0.5605509641873279, "grad_norm": 4.950188159942627, "learning_rate": 4.123342329541761e-06, "loss": 0.4005, "step": 5087 }, { "epoch": 0.5606611570247934, "grad_norm": 4.54240608215332, "learning_rate": 4.121621029854188e-06, "loss": 0.4118, "step": 5088 }, { "epoch": 0.560771349862259, "grad_norm": 4.574641227722168, "learning_rate": 4.119899837575988e-06, "loss": 0.3916, "step": 5089 }, { "epoch": 0.5608815426997246, "grad_norm": 6.007846832275391, "learning_rate": 4.118178752917632e-06, "loss": 0.4032, "step": 5090 }, { "epoch": 0.5609917355371901, "grad_norm": 4.737065315246582, "learning_rate": 4.116457776089576e-06, "loss": 0.3332, "step": 5091 }, { "epoch": 0.5611019283746557, "grad_norm": 14.039338111877441, "learning_rate": 4.114736907302263e-06, "loss": 0.5713, "step": 5092 }, { "epoch": 0.5612121212121212, "grad_norm": 4.842670440673828, "learning_rate": 4.113016146766124e-06, "loss": 0.4207, "step": 5093 }, { "epoch": 0.5613223140495868, "grad_norm": 5.0246806144714355, "learning_rate": 4.111295494691575e-06, "loss": 0.3918, "step": 5094 }, { "epoch": 0.5614325068870524, "grad_norm": 7.352241516113281, "learning_rate": 4.1095749512890185e-06, "loss": 0.3944, "step": 5095 }, { "epoch": 0.5615426997245179, "grad_norm": 6.591299057006836, "learning_rate": 4.107854516768848e-06, "loss": 0.3938, "step": 5096 }, { "epoch": 0.5616528925619835, "grad_norm": 8.282949447631836, "learning_rate": 4.1061341913414386e-06, "loss": 0.383, "step": 5097 }, { "epoch": 0.5617630853994491, "grad_norm": 10.040361404418945, "learning_rate": 4.104413975217155e-06, "loss": 0.4855, "step": 5098 }, { "epoch": 0.5618732782369146, "grad_norm": 5.768685340881348, "learning_rate": 4.102693868606349e-06, "loss": 0.3668, "step": 5099 }, { "epoch": 0.5619834710743802, "grad_norm": 7.725187301635742, "learning_rate": 4.100973871719351e-06, "loss": 0.4449, "step": 5100 }, { "epoch": 0.5620936639118457, "grad_norm": 6.909238815307617, "learning_rate": 4.0992539847664935e-06, "loss": 0.3774, "step": 5101 }, { "epoch": 0.5622038567493113, "grad_norm": 5.824862480163574, "learning_rate": 4.097534207958081e-06, "loss": 0.3557, "step": 5102 }, { "epoch": 0.5623140495867769, "grad_norm": 6.053623199462891, "learning_rate": 4.095814541504409e-06, "loss": 0.4114, "step": 5103 }, { "epoch": 0.5624242424242424, "grad_norm": 9.661008834838867, "learning_rate": 4.094094985615766e-06, "loss": 0.4756, "step": 5104 }, { "epoch": 0.562534435261708, "grad_norm": 7.061156272888184, "learning_rate": 4.092375540502418e-06, "loss": 0.4518, "step": 5105 }, { "epoch": 0.5626446280991736, "grad_norm": 8.996638298034668, "learning_rate": 4.090656206374622e-06, "loss": 0.4032, "step": 5106 }, { "epoch": 0.5627548209366391, "grad_norm": 4.997591495513916, "learning_rate": 4.0889369834426195e-06, "loss": 0.3713, "step": 5107 }, { "epoch": 0.5628650137741047, "grad_norm": 9.249066352844238, "learning_rate": 4.087217871916641e-06, "loss": 0.4829, "step": 5108 }, { "epoch": 0.5629752066115703, "grad_norm": 8.250449180603027, "learning_rate": 4.0854988720069e-06, "loss": 0.4207, "step": 5109 }, { "epoch": 0.5630853994490358, "grad_norm": 5.6974406242370605, "learning_rate": 4.0837799839236e-06, "loss": 0.4235, "step": 5110 }, { "epoch": 0.5631955922865014, "grad_norm": 6.328078746795654, "learning_rate": 4.082061207876927e-06, "loss": 0.3676, "step": 5111 }, { "epoch": 0.5633057851239669, "grad_norm": 9.134344100952148, "learning_rate": 4.080342544077058e-06, "loss": 0.4886, "step": 5112 }, { "epoch": 0.5634159779614325, "grad_norm": 7.8390960693359375, "learning_rate": 4.078623992734151e-06, "loss": 0.4518, "step": 5113 }, { "epoch": 0.5635261707988981, "grad_norm": 8.834000587463379, "learning_rate": 4.076905554058353e-06, "loss": 0.4235, "step": 5114 }, { "epoch": 0.5636363636363636, "grad_norm": 6.361852645874023, "learning_rate": 4.0751872282598e-06, "loss": 0.3848, "step": 5115 }, { "epoch": 0.5637465564738292, "grad_norm": 6.1839094161987305, "learning_rate": 4.073469015548608e-06, "loss": 0.3905, "step": 5116 }, { "epoch": 0.5638567493112948, "grad_norm": 6.1756391525268555, "learning_rate": 4.0717509161348815e-06, "loss": 0.4582, "step": 5117 }, { "epoch": 0.5639669421487603, "grad_norm": 4.953059196472168, "learning_rate": 4.0700329302287165e-06, "loss": 0.3591, "step": 5118 }, { "epoch": 0.5640771349862259, "grad_norm": 5.793675422668457, "learning_rate": 4.068315058040187e-06, "loss": 0.3357, "step": 5119 }, { "epoch": 0.5641873278236914, "grad_norm": 11.507821083068848, "learning_rate": 4.0665972997793565e-06, "loss": 0.5176, "step": 5120 }, { "epoch": 0.564297520661157, "grad_norm": 9.467365264892578, "learning_rate": 4.064879655656278e-06, "loss": 0.4334, "step": 5121 }, { "epoch": 0.5644077134986226, "grad_norm": 7.418822288513184, "learning_rate": 4.063162125880986e-06, "loss": 0.5048, "step": 5122 }, { "epoch": 0.5645179063360881, "grad_norm": 9.262983322143555, "learning_rate": 4.061444710663498e-06, "loss": 0.3923, "step": 5123 }, { "epoch": 0.5646280991735537, "grad_norm": 8.48256778717041, "learning_rate": 4.05972741021383e-06, "loss": 0.456, "step": 5124 }, { "epoch": 0.5647382920110193, "grad_norm": 5.7215094566345215, "learning_rate": 4.0580102247419684e-06, "loss": 0.3397, "step": 5125 }, { "epoch": 0.5648484848484848, "grad_norm": 5.982723712921143, "learning_rate": 4.0562931544578975e-06, "loss": 0.4135, "step": 5126 }, { "epoch": 0.5649586776859504, "grad_norm": 9.416303634643555, "learning_rate": 4.054576199571584e-06, "loss": 0.3751, "step": 5127 }, { "epoch": 0.5650688705234159, "grad_norm": 5.361845970153809, "learning_rate": 4.0528593602929715e-06, "loss": 0.3938, "step": 5128 }, { "epoch": 0.5651790633608815, "grad_norm": 14.400737762451172, "learning_rate": 4.051142636832007e-06, "loss": 0.4752, "step": 5129 }, { "epoch": 0.5652892561983471, "grad_norm": 6.782834053039551, "learning_rate": 4.0494260293986095e-06, "loss": 0.3934, "step": 5130 }, { "epoch": 0.5653994490358126, "grad_norm": 5.719883441925049, "learning_rate": 4.047709538202686e-06, "loss": 0.3518, "step": 5131 }, { "epoch": 0.5655096418732782, "grad_norm": 6.53598165512085, "learning_rate": 4.045993163454137e-06, "loss": 0.3836, "step": 5132 }, { "epoch": 0.5656198347107438, "grad_norm": 5.460474014282227, "learning_rate": 4.044276905362838e-06, "loss": 0.4103, "step": 5133 }, { "epoch": 0.5657300275482093, "grad_norm": 9.956212997436523, "learning_rate": 4.042560764138657e-06, "loss": 0.4957, "step": 5134 }, { "epoch": 0.5658402203856749, "grad_norm": 8.960071563720703, "learning_rate": 4.040844739991447e-06, "loss": 0.4983, "step": 5135 }, { "epoch": 0.5659504132231405, "grad_norm": 8.893988609313965, "learning_rate": 4.039128833131046e-06, "loss": 0.4494, "step": 5136 }, { "epoch": 0.566060606060606, "grad_norm": 4.337855339050293, "learning_rate": 4.037413043767274e-06, "loss": 0.3394, "step": 5137 }, { "epoch": 0.5661707988980716, "grad_norm": 7.404200553894043, "learning_rate": 4.035697372109944e-06, "loss": 0.4619, "step": 5138 }, { "epoch": 0.5662809917355371, "grad_norm": 6.283228397369385, "learning_rate": 4.033981818368849e-06, "loss": 0.3573, "step": 5139 }, { "epoch": 0.5663911845730027, "grad_norm": 4.769010543823242, "learning_rate": 4.03226638275377e-06, "loss": 0.3701, "step": 5140 }, { "epoch": 0.5665013774104684, "grad_norm": 8.625991821289062, "learning_rate": 4.030551065474472e-06, "loss": 0.3806, "step": 5141 }, { "epoch": 0.5666115702479338, "grad_norm": 6.381789207458496, "learning_rate": 4.0288358667407055e-06, "loss": 0.3414, "step": 5142 }, { "epoch": 0.5667217630853995, "grad_norm": 8.321535110473633, "learning_rate": 4.02712078676221e-06, "loss": 0.3744, "step": 5143 }, { "epoch": 0.5668319559228651, "grad_norm": 5.784425258636475, "learning_rate": 4.025405825748706e-06, "loss": 0.3542, "step": 5144 }, { "epoch": 0.5669421487603306, "grad_norm": 5.705422401428223, "learning_rate": 4.023690983909901e-06, "loss": 0.4205, "step": 5145 }, { "epoch": 0.5670523415977962, "grad_norm": 7.45026159286499, "learning_rate": 4.021976261455488e-06, "loss": 0.468, "step": 5146 }, { "epoch": 0.5671625344352617, "grad_norm": 6.045354843139648, "learning_rate": 4.020261658595147e-06, "loss": 0.4172, "step": 5147 }, { "epoch": 0.5672727272727273, "grad_norm": 11.95577621459961, "learning_rate": 4.0185471755385404e-06, "loss": 0.3981, "step": 5148 }, { "epoch": 0.5673829201101929, "grad_norm": 6.558634281158447, "learning_rate": 4.01683281249532e-06, "loss": 0.4548, "step": 5149 }, { "epoch": 0.5674931129476584, "grad_norm": 5.374818801879883, "learning_rate": 4.015118569675118e-06, "loss": 0.4464, "step": 5150 }, { "epoch": 0.567603305785124, "grad_norm": 5.898446083068848, "learning_rate": 4.013404447287554e-06, "loss": 0.434, "step": 5151 }, { "epoch": 0.5677134986225896, "grad_norm": 6.746851444244385, "learning_rate": 4.011690445542237e-06, "loss": 0.405, "step": 5152 }, { "epoch": 0.5678236914600551, "grad_norm": 5.899205207824707, "learning_rate": 4.009976564648752e-06, "loss": 0.3967, "step": 5153 }, { "epoch": 0.5679338842975207, "grad_norm": 5.740797996520996, "learning_rate": 4.008262804816679e-06, "loss": 0.4142, "step": 5154 }, { "epoch": 0.5680440771349863, "grad_norm": 9.104641914367676, "learning_rate": 4.006549166255577e-06, "loss": 0.3933, "step": 5155 }, { "epoch": 0.5681542699724518, "grad_norm": 8.068328857421875, "learning_rate": 4.004835649174992e-06, "loss": 0.3886, "step": 5156 }, { "epoch": 0.5682644628099174, "grad_norm": 7.784996509552002, "learning_rate": 4.003122253784457e-06, "loss": 0.377, "step": 5157 }, { "epoch": 0.5683746556473829, "grad_norm": 5.714048862457275, "learning_rate": 4.001408980293487e-06, "loss": 0.4052, "step": 5158 }, { "epoch": 0.5684848484848485, "grad_norm": 7.091597080230713, "learning_rate": 3.999695828911581e-06, "loss": 0.428, "step": 5159 }, { "epoch": 0.5685950413223141, "grad_norm": 7.845519065856934, "learning_rate": 3.99798279984823e-06, "loss": 0.3705, "step": 5160 }, { "epoch": 0.5687052341597796, "grad_norm": 10.018808364868164, "learning_rate": 3.9962698933129026e-06, "loss": 0.4184, "step": 5161 }, { "epoch": 0.5688154269972452, "grad_norm": 18.566343307495117, "learning_rate": 3.9945571095150545e-06, "loss": 0.4997, "step": 5162 }, { "epoch": 0.5689256198347108, "grad_norm": 4.974708080291748, "learning_rate": 3.992844448664132e-06, "loss": 0.4585, "step": 5163 }, { "epoch": 0.5690358126721763, "grad_norm": 9.041829109191895, "learning_rate": 3.991131910969558e-06, "loss": 0.3548, "step": 5164 }, { "epoch": 0.5691460055096419, "grad_norm": 5.789597034454346, "learning_rate": 3.989419496640742e-06, "loss": 0.4318, "step": 5165 }, { "epoch": 0.5692561983471074, "grad_norm": 8.98617935180664, "learning_rate": 3.987707205887084e-06, "loss": 0.3519, "step": 5166 }, { "epoch": 0.569366391184573, "grad_norm": 4.784662246704102, "learning_rate": 3.985995038917961e-06, "loss": 0.3853, "step": 5167 }, { "epoch": 0.5694765840220386, "grad_norm": 8.933760643005371, "learning_rate": 3.984282995942746e-06, "loss": 0.4976, "step": 5168 }, { "epoch": 0.5695867768595041, "grad_norm": 4.532029151916504, "learning_rate": 3.982571077170786e-06, "loss": 0.4092, "step": 5169 }, { "epoch": 0.5696969696969697, "grad_norm": 4.453444004058838, "learning_rate": 3.980859282811414e-06, "loss": 0.398, "step": 5170 }, { "epoch": 0.5698071625344353, "grad_norm": 4.47035026550293, "learning_rate": 3.979147613073956e-06, "loss": 0.3952, "step": 5171 }, { "epoch": 0.5699173553719008, "grad_norm": 5.4785990715026855, "learning_rate": 3.977436068167714e-06, "loss": 0.3992, "step": 5172 }, { "epoch": 0.5700275482093664, "grad_norm": 7.377060890197754, "learning_rate": 3.975724648301976e-06, "loss": 0.4763, "step": 5173 }, { "epoch": 0.5701377410468319, "grad_norm": 8.86286449432373, "learning_rate": 3.974013353686022e-06, "loss": 0.4504, "step": 5174 }, { "epoch": 0.5702479338842975, "grad_norm": 8.21593952178955, "learning_rate": 3.972302184529108e-06, "loss": 0.4363, "step": 5175 }, { "epoch": 0.5703581267217631, "grad_norm": 4.344210147857666, "learning_rate": 3.9705911410404785e-06, "loss": 0.4251, "step": 5176 }, { "epoch": 0.5704683195592286, "grad_norm": 4.425926685333252, "learning_rate": 3.968880223429364e-06, "loss": 0.4166, "step": 5177 }, { "epoch": 0.5705785123966942, "grad_norm": 6.354044437408447, "learning_rate": 3.967169431904975e-06, "loss": 0.3828, "step": 5178 }, { "epoch": 0.5706887052341598, "grad_norm": 5.4605278968811035, "learning_rate": 3.96545876667651e-06, "loss": 0.4334, "step": 5179 }, { "epoch": 0.5707988980716253, "grad_norm": 8.027145385742188, "learning_rate": 3.963748227953154e-06, "loss": 0.4629, "step": 5180 }, { "epoch": 0.5709090909090909, "grad_norm": 5.73824405670166, "learning_rate": 3.962037815944071e-06, "loss": 0.4121, "step": 5181 }, { "epoch": 0.5710192837465565, "grad_norm": 9.135799407958984, "learning_rate": 3.960327530858415e-06, "loss": 0.4432, "step": 5182 }, { "epoch": 0.571129476584022, "grad_norm": 5.127810478210449, "learning_rate": 3.95861737290532e-06, "loss": 0.3297, "step": 5183 }, { "epoch": 0.5712396694214876, "grad_norm": 5.376129150390625, "learning_rate": 3.956907342293908e-06, "loss": 0.4374, "step": 5184 }, { "epoch": 0.5713498622589531, "grad_norm": 8.083807945251465, "learning_rate": 3.955197439233283e-06, "loss": 0.4545, "step": 5185 }, { "epoch": 0.5714600550964187, "grad_norm": 6.188045501708984, "learning_rate": 3.953487663932535e-06, "loss": 0.403, "step": 5186 }, { "epoch": 0.5715702479338843, "grad_norm": 5.2420125007629395, "learning_rate": 3.951778016600734e-06, "loss": 0.3563, "step": 5187 }, { "epoch": 0.5716804407713498, "grad_norm": 5.644331455230713, "learning_rate": 3.950068497446944e-06, "loss": 0.4251, "step": 5188 }, { "epoch": 0.5717906336088154, "grad_norm": 7.3118157386779785, "learning_rate": 3.948359106680205e-06, "loss": 0.4561, "step": 5189 }, { "epoch": 0.571900826446281, "grad_norm": 6.968026161193848, "learning_rate": 3.946649844509539e-06, "loss": 0.4548, "step": 5190 }, { "epoch": 0.5720110192837465, "grad_norm": 5.8956193923950195, "learning_rate": 3.944940711143964e-06, "loss": 0.4194, "step": 5191 }, { "epoch": 0.5721212121212121, "grad_norm": 5.4817986488342285, "learning_rate": 3.9432317067924716e-06, "loss": 0.4281, "step": 5192 }, { "epoch": 0.5722314049586776, "grad_norm": 7.703038215637207, "learning_rate": 3.941522831664041e-06, "loss": 0.4371, "step": 5193 }, { "epoch": 0.5723415977961432, "grad_norm": 5.166708946228027, "learning_rate": 3.939814085967636e-06, "loss": 0.4106, "step": 5194 }, { "epoch": 0.5724517906336088, "grad_norm": 7.860630035400391, "learning_rate": 3.938105469912204e-06, "loss": 0.4284, "step": 5195 }, { "epoch": 0.5725619834710743, "grad_norm": 8.984552383422852, "learning_rate": 3.93639698370668e-06, "loss": 0.4867, "step": 5196 }, { "epoch": 0.57267217630854, "grad_norm": 6.660092353820801, "learning_rate": 3.934688627559977e-06, "loss": 0.4048, "step": 5197 }, { "epoch": 0.5727823691460056, "grad_norm": 5.779437065124512, "learning_rate": 3.932980401680994e-06, "loss": 0.3702, "step": 5198 }, { "epoch": 0.572892561983471, "grad_norm": 9.79520320892334, "learning_rate": 3.931272306278619e-06, "loss": 0.4939, "step": 5199 }, { "epoch": 0.5730027548209367, "grad_norm": 5.133498191833496, "learning_rate": 3.9295643415617164e-06, "loss": 0.3741, "step": 5200 }, { "epoch": 0.5731129476584021, "grad_norm": 3.9077324867248535, "learning_rate": 3.9278565077391404e-06, "loss": 0.4111, "step": 5201 }, { "epoch": 0.5732231404958678, "grad_norm": 5.43770170211792, "learning_rate": 3.926148805019728e-06, "loss": 0.4027, "step": 5202 }, { "epoch": 0.5733333333333334, "grad_norm": 7.171450138092041, "learning_rate": 3.924441233612298e-06, "loss": 0.3585, "step": 5203 }, { "epoch": 0.5734435261707989, "grad_norm": 4.4244303703308105, "learning_rate": 3.922733793725654e-06, "loss": 0.3767, "step": 5204 }, { "epoch": 0.5735537190082645, "grad_norm": 6.719311714172363, "learning_rate": 3.921026485568587e-06, "loss": 0.3987, "step": 5205 }, { "epoch": 0.5736639118457301, "grad_norm": 5.38864803314209, "learning_rate": 3.919319309349865e-06, "loss": 0.3972, "step": 5206 }, { "epoch": 0.5737741046831956, "grad_norm": 6.519610404968262, "learning_rate": 3.917612265278246e-06, "loss": 0.3909, "step": 5207 }, { "epoch": 0.5738842975206612, "grad_norm": 5.475235462188721, "learning_rate": 3.91590535356247e-06, "loss": 0.3649, "step": 5208 }, { "epoch": 0.5739944903581268, "grad_norm": 9.819671630859375, "learning_rate": 3.91419857441126e-06, "loss": 0.4592, "step": 5209 }, { "epoch": 0.5741046831955923, "grad_norm": 6.266026020050049, "learning_rate": 3.912491928033324e-06, "loss": 0.3988, "step": 5210 }, { "epoch": 0.5742148760330579, "grad_norm": 9.54811954498291, "learning_rate": 3.910785414637351e-06, "loss": 0.4041, "step": 5211 }, { "epoch": 0.5743250688705234, "grad_norm": 8.638733863830566, "learning_rate": 3.909079034432018e-06, "loss": 0.3596, "step": 5212 }, { "epoch": 0.574435261707989, "grad_norm": 7.72868013381958, "learning_rate": 3.907372787625982e-06, "loss": 0.3528, "step": 5213 }, { "epoch": 0.5745454545454546, "grad_norm": 11.883960723876953, "learning_rate": 3.905666674427887e-06, "loss": 0.4519, "step": 5214 }, { "epoch": 0.5746556473829201, "grad_norm": 14.226672172546387, "learning_rate": 3.903960695046354e-06, "loss": 0.4518, "step": 5215 }, { "epoch": 0.5747658402203857, "grad_norm": 13.8538236618042, "learning_rate": 3.902254849689999e-06, "loss": 0.4335, "step": 5216 }, { "epoch": 0.5748760330578513, "grad_norm": 7.674587726593018, "learning_rate": 3.900549138567413e-06, "loss": 0.4194, "step": 5217 }, { "epoch": 0.5749862258953168, "grad_norm": 4.918647289276123, "learning_rate": 3.8988435618871685e-06, "loss": 0.3704, "step": 5218 }, { "epoch": 0.5750964187327824, "grad_norm": 10.45202922821045, "learning_rate": 3.897138119857833e-06, "loss": 0.4117, "step": 5219 }, { "epoch": 0.5752066115702479, "grad_norm": 4.533913612365723, "learning_rate": 3.895432812687944e-06, "loss": 0.3856, "step": 5220 }, { "epoch": 0.5753168044077135, "grad_norm": 4.864469051361084, "learning_rate": 3.89372764058603e-06, "loss": 0.3646, "step": 5221 }, { "epoch": 0.5754269972451791, "grad_norm": 7.552703857421875, "learning_rate": 3.892022603760605e-06, "loss": 0.3838, "step": 5222 }, { "epoch": 0.5755371900826446, "grad_norm": 8.9181489944458, "learning_rate": 3.890317702420158e-06, "loss": 0.386, "step": 5223 }, { "epoch": 0.5756473829201102, "grad_norm": 10.64404010772705, "learning_rate": 3.888612936773173e-06, "loss": 0.3579, "step": 5224 }, { "epoch": 0.5757575757575758, "grad_norm": 5.714846611022949, "learning_rate": 3.886908307028108e-06, "loss": 0.3722, "step": 5225 }, { "epoch": 0.5758677685950413, "grad_norm": 8.370100975036621, "learning_rate": 3.885203813393404e-06, "loss": 0.5137, "step": 5226 }, { "epoch": 0.5759779614325069, "grad_norm": 6.253170490264893, "learning_rate": 3.883499456077495e-06, "loss": 0.3986, "step": 5227 }, { "epoch": 0.5760881542699724, "grad_norm": 7.653555393218994, "learning_rate": 3.881795235288788e-06, "loss": 0.3785, "step": 5228 }, { "epoch": 0.576198347107438, "grad_norm": 4.932460784912109, "learning_rate": 3.880091151235678e-06, "loss": 0.3807, "step": 5229 }, { "epoch": 0.5763085399449036, "grad_norm": 8.360217094421387, "learning_rate": 3.878387204126544e-06, "loss": 0.4467, "step": 5230 }, { "epoch": 0.5764187327823691, "grad_norm": 4.929142951965332, "learning_rate": 3.8766833941697464e-06, "loss": 0.3762, "step": 5231 }, { "epoch": 0.5765289256198347, "grad_norm": 7.1862688064575195, "learning_rate": 3.874979721573628e-06, "loss": 0.3917, "step": 5232 }, { "epoch": 0.5766391184573003, "grad_norm": 11.578612327575684, "learning_rate": 3.873276186546519e-06, "loss": 0.3606, "step": 5233 }, { "epoch": 0.5767493112947658, "grad_norm": 8.164372444152832, "learning_rate": 3.871572789296727e-06, "loss": 0.4254, "step": 5234 }, { "epoch": 0.5768595041322314, "grad_norm": 6.481094837188721, "learning_rate": 3.8698695300325475e-06, "loss": 0.429, "step": 5235 }, { "epoch": 0.576969696969697, "grad_norm": 4.183745861053467, "learning_rate": 3.868166408962258e-06, "loss": 0.3469, "step": 5236 }, { "epoch": 0.5770798898071625, "grad_norm": 8.76431941986084, "learning_rate": 3.8664634262941155e-06, "loss": 0.4076, "step": 5237 }, { "epoch": 0.5771900826446281, "grad_norm": 7.580943584442139, "learning_rate": 3.864760582236367e-06, "loss": 0.4, "step": 5238 }, { "epoch": 0.5773002754820936, "grad_norm": 9.020663261413574, "learning_rate": 3.863057876997236e-06, "loss": 0.3657, "step": 5239 }, { "epoch": 0.5774104683195592, "grad_norm": 10.859776496887207, "learning_rate": 3.861355310784932e-06, "loss": 0.4231, "step": 5240 }, { "epoch": 0.5775206611570248, "grad_norm": 7.436887264251709, "learning_rate": 3.8596528838076476e-06, "loss": 0.4406, "step": 5241 }, { "epoch": 0.5776308539944903, "grad_norm": 10.0736722946167, "learning_rate": 3.857950596273558e-06, "loss": 0.4015, "step": 5242 }, { "epoch": 0.5777410468319559, "grad_norm": 5.721993446350098, "learning_rate": 3.8562484483908185e-06, "loss": 0.3466, "step": 5243 }, { "epoch": 0.5778512396694215, "grad_norm": 6.503122329711914, "learning_rate": 3.854546440367575e-06, "loss": 0.3774, "step": 5244 }, { "epoch": 0.577961432506887, "grad_norm": 6.205786228179932, "learning_rate": 3.852844572411949e-06, "loss": 0.3698, "step": 5245 }, { "epoch": 0.5780716253443526, "grad_norm": 6.713566303253174, "learning_rate": 3.851142844732043e-06, "loss": 0.4425, "step": 5246 }, { "epoch": 0.5781818181818181, "grad_norm": 6.699551105499268, "learning_rate": 3.849441257535955e-06, "loss": 0.4607, "step": 5247 }, { "epoch": 0.5782920110192837, "grad_norm": 3.772110939025879, "learning_rate": 3.847739811031751e-06, "loss": 0.3504, "step": 5248 }, { "epoch": 0.5784022038567493, "grad_norm": 7.316378593444824, "learning_rate": 3.846038505427487e-06, "loss": 0.3804, "step": 5249 }, { "epoch": 0.5785123966942148, "grad_norm": 11.61896800994873, "learning_rate": 3.844337340931204e-06, "loss": 0.658, "step": 5250 }, { "epoch": 0.5786225895316804, "grad_norm": 12.600423812866211, "learning_rate": 3.842636317750918e-06, "loss": 0.4261, "step": 5251 }, { "epoch": 0.578732782369146, "grad_norm": 4.730981826782227, "learning_rate": 3.840935436094639e-06, "loss": 0.4067, "step": 5252 }, { "epoch": 0.5788429752066115, "grad_norm": 5.613015651702881, "learning_rate": 3.839234696170348e-06, "loss": 0.4097, "step": 5253 }, { "epoch": 0.5789531680440771, "grad_norm": 10.097566604614258, "learning_rate": 3.8375340981860134e-06, "loss": 0.4101, "step": 5254 }, { "epoch": 0.5790633608815428, "grad_norm": 6.698999404907227, "learning_rate": 3.8358336423495904e-06, "loss": 0.3602, "step": 5255 }, { "epoch": 0.5791735537190082, "grad_norm": 6.215074062347412, "learning_rate": 3.834133328869011e-06, "loss": 0.4701, "step": 5256 }, { "epoch": 0.5792837465564739, "grad_norm": 10.790955543518066, "learning_rate": 3.832433157952189e-06, "loss": 0.3983, "step": 5257 }, { "epoch": 0.5793939393939394, "grad_norm": 6.542206764221191, "learning_rate": 3.830733129807029e-06, "loss": 0.3291, "step": 5258 }, { "epoch": 0.579504132231405, "grad_norm": 10.732033729553223, "learning_rate": 3.829033244641411e-06, "loss": 0.4505, "step": 5259 }, { "epoch": 0.5796143250688706, "grad_norm": 8.658686637878418, "learning_rate": 3.827333502663195e-06, "loss": 0.4709, "step": 5260 }, { "epoch": 0.5797245179063361, "grad_norm": 4.506829261779785, "learning_rate": 3.825633904080234e-06, "loss": 0.3502, "step": 5261 }, { "epoch": 0.5798347107438017, "grad_norm": 7.29030704498291, "learning_rate": 3.823934449100354e-06, "loss": 0.4117, "step": 5262 }, { "epoch": 0.5799449035812673, "grad_norm": 6.036261558532715, "learning_rate": 3.822235137931366e-06, "loss": 0.4133, "step": 5263 }, { "epoch": 0.5800550964187328, "grad_norm": 7.769625663757324, "learning_rate": 3.820535970781066e-06, "loss": 0.4837, "step": 5264 }, { "epoch": 0.5801652892561984, "grad_norm": 5.557866096496582, "learning_rate": 3.818836947857229e-06, "loss": 0.462, "step": 5265 }, { "epoch": 0.5802754820936639, "grad_norm": 5.770900726318359, "learning_rate": 3.817138069367615e-06, "loss": 0.4381, "step": 5266 }, { "epoch": 0.5803856749311295, "grad_norm": 9.156088829040527, "learning_rate": 3.8154393355199656e-06, "loss": 0.4591, "step": 5267 }, { "epoch": 0.5804958677685951, "grad_norm": 5.883398532867432, "learning_rate": 3.8137407465220012e-06, "loss": 0.4588, "step": 5268 }, { "epoch": 0.5806060606060606, "grad_norm": 6.452761173248291, "learning_rate": 3.8120423025814314e-06, "loss": 0.415, "step": 5269 }, { "epoch": 0.5807162534435262, "grad_norm": 7.139335632324219, "learning_rate": 3.8103440039059418e-06, "loss": 0.3779, "step": 5270 }, { "epoch": 0.5808264462809918, "grad_norm": 6.148501873016357, "learning_rate": 3.8086458507032033e-06, "loss": 0.467, "step": 5271 }, { "epoch": 0.5809366391184573, "grad_norm": 5.470493793487549, "learning_rate": 3.8069478431808686e-06, "loss": 0.435, "step": 5272 }, { "epoch": 0.5810468319559229, "grad_norm": 5.511624336242676, "learning_rate": 3.8052499815465738e-06, "loss": 0.3664, "step": 5273 }, { "epoch": 0.5811570247933884, "grad_norm": 6.3985371589660645, "learning_rate": 3.803552266007931e-06, "loss": 0.4445, "step": 5274 }, { "epoch": 0.581267217630854, "grad_norm": 6.291167736053467, "learning_rate": 3.8018546967725444e-06, "loss": 0.404, "step": 5275 }, { "epoch": 0.5813774104683196, "grad_norm": 10.149051666259766, "learning_rate": 3.800157274047994e-06, "loss": 0.4318, "step": 5276 }, { "epoch": 0.5814876033057851, "grad_norm": 11.656339645385742, "learning_rate": 3.7984599980418393e-06, "loss": 0.4842, "step": 5277 }, { "epoch": 0.5815977961432507, "grad_norm": 7.794707298278809, "learning_rate": 3.7967628689616304e-06, "loss": 0.4812, "step": 5278 }, { "epoch": 0.5817079889807163, "grad_norm": 5.070430278778076, "learning_rate": 3.79506588701489e-06, "loss": 0.3897, "step": 5279 }, { "epoch": 0.5818181818181818, "grad_norm": 4.962933540344238, "learning_rate": 3.793369052409132e-06, "loss": 0.3878, "step": 5280 }, { "epoch": 0.5819283746556474, "grad_norm": 6.511288166046143, "learning_rate": 3.791672365351845e-06, "loss": 0.4635, "step": 5281 }, { "epoch": 0.582038567493113, "grad_norm": 6.625985622406006, "learning_rate": 3.7899758260505e-06, "loss": 0.4141, "step": 5282 }, { "epoch": 0.5821487603305785, "grad_norm": 5.150052547454834, "learning_rate": 3.788279434712558e-06, "loss": 0.3894, "step": 5283 }, { "epoch": 0.5822589531680441, "grad_norm": 8.391870498657227, "learning_rate": 3.7865831915454515e-06, "loss": 0.4382, "step": 5284 }, { "epoch": 0.5823691460055096, "grad_norm": 8.36882495880127, "learning_rate": 3.7848870967565996e-06, "loss": 0.3315, "step": 5285 }, { "epoch": 0.5824793388429752, "grad_norm": 6.879951000213623, "learning_rate": 3.783191150553405e-06, "loss": 0.4199, "step": 5286 }, { "epoch": 0.5825895316804408, "grad_norm": 5.488747596740723, "learning_rate": 3.7814953531432495e-06, "loss": 0.4037, "step": 5287 }, { "epoch": 0.5826997245179063, "grad_norm": 4.865351676940918, "learning_rate": 3.7797997047334966e-06, "loss": 0.3363, "step": 5288 }, { "epoch": 0.5828099173553719, "grad_norm": 4.42156457901001, "learning_rate": 3.7781042055314943e-06, "loss": 0.3398, "step": 5289 }, { "epoch": 0.5829201101928375, "grad_norm": 8.863175392150879, "learning_rate": 3.7764088557445686e-06, "loss": 0.4587, "step": 5290 }, { "epoch": 0.583030303030303, "grad_norm": 9.259750366210938, "learning_rate": 3.77471365558003e-06, "loss": 0.4136, "step": 5291 }, { "epoch": 0.5831404958677686, "grad_norm": 8.834726333618164, "learning_rate": 3.7730186052451713e-06, "loss": 0.4891, "step": 5292 }, { "epoch": 0.5832506887052341, "grad_norm": 6.8883280754089355, "learning_rate": 3.771323704947263e-06, "loss": 0.4081, "step": 5293 }, { "epoch": 0.5833608815426997, "grad_norm": 5.524451732635498, "learning_rate": 3.769628954893562e-06, "loss": 0.3943, "step": 5294 }, { "epoch": 0.5834710743801653, "grad_norm": 6.886898994445801, "learning_rate": 3.767934355291303e-06, "loss": 0.4264, "step": 5295 }, { "epoch": 0.5835812672176308, "grad_norm": 6.704071998596191, "learning_rate": 3.766239906347704e-06, "loss": 0.4009, "step": 5296 }, { "epoch": 0.5836914600550964, "grad_norm": 6.265433311462402, "learning_rate": 3.764545608269966e-06, "loss": 0.3758, "step": 5297 }, { "epoch": 0.583801652892562, "grad_norm": 6.239924907684326, "learning_rate": 3.76285146126527e-06, "loss": 0.3858, "step": 5298 }, { "epoch": 0.5839118457300275, "grad_norm": 8.371216773986816, "learning_rate": 3.761157465540776e-06, "loss": 0.4182, "step": 5299 }, { "epoch": 0.5840220385674931, "grad_norm": 9.449613571166992, "learning_rate": 3.759463621303631e-06, "loss": 0.428, "step": 5300 }, { "epoch": 0.5841322314049586, "grad_norm": 6.359887599945068, "learning_rate": 3.7577699287609613e-06, "loss": 0.376, "step": 5301 }, { "epoch": 0.5842424242424242, "grad_norm": 4.445738792419434, "learning_rate": 3.756076388119868e-06, "loss": 0.3728, "step": 5302 }, { "epoch": 0.5843526170798898, "grad_norm": 6.449606418609619, "learning_rate": 3.7543829995874464e-06, "loss": 0.3856, "step": 5303 }, { "epoch": 0.5844628099173553, "grad_norm": 7.634322643280029, "learning_rate": 3.752689763370765e-06, "loss": 0.3719, "step": 5304 }, { "epoch": 0.5845730027548209, "grad_norm": 5.385597229003906, "learning_rate": 3.750996679676869e-06, "loss": 0.4238, "step": 5305 }, { "epoch": 0.5846831955922865, "grad_norm": 8.28349494934082, "learning_rate": 3.7493037487128005e-06, "loss": 0.4034, "step": 5306 }, { "epoch": 0.584793388429752, "grad_norm": 8.161641120910645, "learning_rate": 3.7476109706855644e-06, "loss": 0.3451, "step": 5307 }, { "epoch": 0.5849035812672176, "grad_norm": 7.038846015930176, "learning_rate": 3.745918345802162e-06, "loss": 0.3661, "step": 5308 }, { "epoch": 0.5850137741046832, "grad_norm": 13.376177787780762, "learning_rate": 3.7442258742695692e-06, "loss": 0.5265, "step": 5309 }, { "epoch": 0.5851239669421487, "grad_norm": 6.555881023406982, "learning_rate": 3.7425335562947394e-06, "loss": 0.4477, "step": 5310 }, { "epoch": 0.5852341597796143, "grad_norm": 6.7689642906188965, "learning_rate": 3.740841392084618e-06, "loss": 0.41, "step": 5311 }, { "epoch": 0.5853443526170798, "grad_norm": 6.448429584503174, "learning_rate": 3.7391493818461188e-06, "loss": 0.4429, "step": 5312 }, { "epoch": 0.5854545454545454, "grad_norm": 5.073526382446289, "learning_rate": 3.7374575257861454e-06, "loss": 0.4066, "step": 5313 }, { "epoch": 0.5855647382920111, "grad_norm": 10.34382438659668, "learning_rate": 3.735765824111583e-06, "loss": 0.4164, "step": 5314 }, { "epoch": 0.5856749311294766, "grad_norm": 10.397635459899902, "learning_rate": 3.7340742770292922e-06, "loss": 0.4666, "step": 5315 }, { "epoch": 0.5857851239669422, "grad_norm": 5.910182952880859, "learning_rate": 3.7323828847461172e-06, "loss": 0.3822, "step": 5316 }, { "epoch": 0.5858953168044078, "grad_norm": 9.688176155090332, "learning_rate": 3.730691647468886e-06, "loss": 0.3534, "step": 5317 }, { "epoch": 0.5860055096418733, "grad_norm": 10.011178970336914, "learning_rate": 3.729000565404405e-06, "loss": 0.4479, "step": 5318 }, { "epoch": 0.5861157024793389, "grad_norm": 6.989460468292236, "learning_rate": 3.7273096387594585e-06, "loss": 0.3823, "step": 5319 }, { "epoch": 0.5862258953168044, "grad_norm": 5.006401538848877, "learning_rate": 3.7256188677408213e-06, "loss": 0.4027, "step": 5320 }, { "epoch": 0.58633608815427, "grad_norm": 10.467499732971191, "learning_rate": 3.7239282525552378e-06, "loss": 0.4665, "step": 5321 }, { "epoch": 0.5864462809917356, "grad_norm": 6.434284687042236, "learning_rate": 3.722237793409442e-06, "loss": 0.428, "step": 5322 }, { "epoch": 0.5865564738292011, "grad_norm": 5.784710884094238, "learning_rate": 3.7205474905101454e-06, "loss": 0.3429, "step": 5323 }, { "epoch": 0.5866666666666667, "grad_norm": 7.143020153045654, "learning_rate": 3.7188573440640373e-06, "loss": 0.3901, "step": 5324 }, { "epoch": 0.5867768595041323, "grad_norm": 6.1825761795043945, "learning_rate": 3.717167354277795e-06, "loss": 0.4052, "step": 5325 }, { "epoch": 0.5868870523415978, "grad_norm": 5.028227806091309, "learning_rate": 3.7154775213580717e-06, "loss": 0.3879, "step": 5326 }, { "epoch": 0.5869972451790634, "grad_norm": 7.450397968292236, "learning_rate": 3.7137878455115005e-06, "loss": 0.4181, "step": 5327 }, { "epoch": 0.5871074380165289, "grad_norm": 6.3655290603637695, "learning_rate": 3.7120983269446997e-06, "loss": 0.4269, "step": 5328 }, { "epoch": 0.5872176308539945, "grad_norm": 6.551460266113281, "learning_rate": 3.710408965864265e-06, "loss": 0.3654, "step": 5329 }, { "epoch": 0.5873278236914601, "grad_norm": 5.877065181732178, "learning_rate": 3.7087197624767725e-06, "loss": 0.4024, "step": 5330 }, { "epoch": 0.5874380165289256, "grad_norm": 6.857378959655762, "learning_rate": 3.707030716988783e-06, "loss": 0.4336, "step": 5331 }, { "epoch": 0.5875482093663912, "grad_norm": 9.565370559692383, "learning_rate": 3.7053418296068342e-06, "loss": 0.4952, "step": 5332 }, { "epoch": 0.5876584022038568, "grad_norm": 8.197833061218262, "learning_rate": 3.703653100537442e-06, "loss": 0.4648, "step": 5333 }, { "epoch": 0.5877685950413223, "grad_norm": 10.442931175231934, "learning_rate": 3.701964529987113e-06, "loss": 0.4563, "step": 5334 }, { "epoch": 0.5878787878787879, "grad_norm": 5.644998550415039, "learning_rate": 3.7002761181623215e-06, "loss": 0.4125, "step": 5335 }, { "epoch": 0.5879889807162535, "grad_norm": 7.055531978607178, "learning_rate": 3.698587865269534e-06, "loss": 0.4379, "step": 5336 }, { "epoch": 0.588099173553719, "grad_norm": 6.240884304046631, "learning_rate": 3.6968997715151907e-06, "loss": 0.427, "step": 5337 }, { "epoch": 0.5882093663911846, "grad_norm": 5.399500846862793, "learning_rate": 3.69521183710571e-06, "loss": 0.3934, "step": 5338 }, { "epoch": 0.5883195592286501, "grad_norm": 7.054924964904785, "learning_rate": 3.6935240622475023e-06, "loss": 0.4656, "step": 5339 }, { "epoch": 0.5884297520661157, "grad_norm": 5.625728607177734, "learning_rate": 3.6918364471469447e-06, "loss": 0.3853, "step": 5340 }, { "epoch": 0.5885399449035813, "grad_norm": 9.67719554901123, "learning_rate": 3.6901489920104023e-06, "loss": 0.465, "step": 5341 }, { "epoch": 0.5886501377410468, "grad_norm": 6.909407138824463, "learning_rate": 3.6884616970442234e-06, "loss": 0.3759, "step": 5342 }, { "epoch": 0.5887603305785124, "grad_norm": 6.184091567993164, "learning_rate": 3.6867745624547278e-06, "loss": 0.463, "step": 5343 }, { "epoch": 0.588870523415978, "grad_norm": 6.955829620361328, "learning_rate": 3.6850875884482223e-06, "loss": 0.4659, "step": 5344 }, { "epoch": 0.5889807162534435, "grad_norm": 7.018570423126221, "learning_rate": 3.6834007752309936e-06, "loss": 0.3883, "step": 5345 }, { "epoch": 0.5890909090909091, "grad_norm": 9.0138578414917, "learning_rate": 3.6817141230093067e-06, "loss": 0.4157, "step": 5346 }, { "epoch": 0.5892011019283746, "grad_norm": 4.636580944061279, "learning_rate": 3.6800276319894055e-06, "loss": 0.4195, "step": 5347 }, { "epoch": 0.5893112947658402, "grad_norm": 7.094888210296631, "learning_rate": 3.6783413023775206e-06, "loss": 0.4772, "step": 5348 }, { "epoch": 0.5894214876033058, "grad_norm": 8.197665214538574, "learning_rate": 3.6766551343798553e-06, "loss": 0.3932, "step": 5349 }, { "epoch": 0.5895316804407713, "grad_norm": 6.349987506866455, "learning_rate": 3.6749691282025986e-06, "loss": 0.4241, "step": 5350 }, { "epoch": 0.5896418732782369, "grad_norm": 5.951522350311279, "learning_rate": 3.6732832840519167e-06, "loss": 0.3908, "step": 5351 }, { "epoch": 0.5897520661157025, "grad_norm": 10.102296829223633, "learning_rate": 3.6715976021339563e-06, "loss": 0.3268, "step": 5352 }, { "epoch": 0.589862258953168, "grad_norm": 8.515700340270996, "learning_rate": 3.669912082654846e-06, "loss": 0.4486, "step": 5353 }, { "epoch": 0.5899724517906336, "grad_norm": 4.492301940917969, "learning_rate": 3.6682267258206938e-06, "loss": 0.4722, "step": 5354 }, { "epoch": 0.5900826446280992, "grad_norm": 4.916325092315674, "learning_rate": 3.666541531837585e-06, "loss": 0.4163, "step": 5355 }, { "epoch": 0.5901928374655647, "grad_norm": 9.863926887512207, "learning_rate": 3.66485650091159e-06, "loss": 0.5008, "step": 5356 }, { "epoch": 0.5903030303030303, "grad_norm": 6.5478291511535645, "learning_rate": 3.6631716332487556e-06, "loss": 0.4371, "step": 5357 }, { "epoch": 0.5904132231404958, "grad_norm": 12.289795875549316, "learning_rate": 3.661486929055109e-06, "loss": 0.4448, "step": 5358 }, { "epoch": 0.5905234159779614, "grad_norm": 4.799616813659668, "learning_rate": 3.65980238853666e-06, "loss": 0.3825, "step": 5359 }, { "epoch": 0.590633608815427, "grad_norm": 8.292051315307617, "learning_rate": 3.6581180118993965e-06, "loss": 0.4481, "step": 5360 }, { "epoch": 0.5907438016528925, "grad_norm": 8.100136756896973, "learning_rate": 3.6564337993492822e-06, "loss": 0.4519, "step": 5361 }, { "epoch": 0.5908539944903581, "grad_norm": 6.405386924743652, "learning_rate": 3.6547497510922703e-06, "loss": 0.3976, "step": 5362 }, { "epoch": 0.5909641873278237, "grad_norm": 21.084047317504883, "learning_rate": 3.6530658673342843e-06, "loss": 0.3932, "step": 5363 }, { "epoch": 0.5910743801652892, "grad_norm": 8.751500129699707, "learning_rate": 3.651382148281235e-06, "loss": 0.4776, "step": 5364 }, { "epoch": 0.5911845730027548, "grad_norm": 5.426620006561279, "learning_rate": 3.64969859413901e-06, "loss": 0.4031, "step": 5365 }, { "epoch": 0.5912947658402203, "grad_norm": 10.462101936340332, "learning_rate": 3.6480152051134715e-06, "loss": 0.4014, "step": 5366 }, { "epoch": 0.5914049586776859, "grad_norm": 6.521287441253662, "learning_rate": 3.6463319814104734e-06, "loss": 0.5193, "step": 5367 }, { "epoch": 0.5915151515151515, "grad_norm": 4.917036056518555, "learning_rate": 3.6446489232358385e-06, "loss": 0.4223, "step": 5368 }, { "epoch": 0.591625344352617, "grad_norm": 8.916620254516602, "learning_rate": 3.6429660307953723e-06, "loss": 0.419, "step": 5369 }, { "epoch": 0.5917355371900826, "grad_norm": 5.543242931365967, "learning_rate": 3.6412833042948663e-06, "loss": 0.4031, "step": 5370 }, { "epoch": 0.5918457300275483, "grad_norm": 7.404268741607666, "learning_rate": 3.6396007439400826e-06, "loss": 0.3445, "step": 5371 }, { "epoch": 0.5919559228650138, "grad_norm": 9.950736999511719, "learning_rate": 3.6379183499367667e-06, "loss": 0.451, "step": 5372 }, { "epoch": 0.5920661157024794, "grad_norm": 5.994956970214844, "learning_rate": 3.6362361224906463e-06, "loss": 0.4803, "step": 5373 }, { "epoch": 0.5921763085399449, "grad_norm": 4.3853230476379395, "learning_rate": 3.634554061807425e-06, "loss": 0.3754, "step": 5374 }, { "epoch": 0.5922865013774105, "grad_norm": 8.13461971282959, "learning_rate": 3.6328721680927868e-06, "loss": 0.5147, "step": 5375 }, { "epoch": 0.5923966942148761, "grad_norm": 4.483111381530762, "learning_rate": 3.631190441552398e-06, "loss": 0.3936, "step": 5376 }, { "epoch": 0.5925068870523416, "grad_norm": 5.281414985656738, "learning_rate": 3.6295088823919005e-06, "loss": 0.4366, "step": 5377 }, { "epoch": 0.5926170798898072, "grad_norm": 10.191956520080566, "learning_rate": 3.62782749081692e-06, "loss": 0.5176, "step": 5378 }, { "epoch": 0.5927272727272728, "grad_norm": 13.563871383666992, "learning_rate": 3.6261462670330573e-06, "loss": 0.4426, "step": 5379 }, { "epoch": 0.5928374655647383, "grad_norm": 9.997957229614258, "learning_rate": 3.624465211245894e-06, "loss": 0.3575, "step": 5380 }, { "epoch": 0.5929476584022039, "grad_norm": 5.238548278808594, "learning_rate": 3.622784323660994e-06, "loss": 0.3669, "step": 5381 }, { "epoch": 0.5930578512396695, "grad_norm": 4.02475118637085, "learning_rate": 3.621103604483899e-06, "loss": 0.3908, "step": 5382 }, { "epoch": 0.593168044077135, "grad_norm": 11.147467613220215, "learning_rate": 3.6194230539201256e-06, "loss": 0.4198, "step": 5383 }, { "epoch": 0.5932782369146006, "grad_norm": 9.837335586547852, "learning_rate": 3.6177426721751786e-06, "loss": 0.4558, "step": 5384 }, { "epoch": 0.5933884297520661, "grad_norm": 5.696954250335693, "learning_rate": 3.6160624594545347e-06, "loss": 0.4837, "step": 5385 }, { "epoch": 0.5934986225895317, "grad_norm": 6.0080084800720215, "learning_rate": 3.614382415963652e-06, "loss": 0.3908, "step": 5386 }, { "epoch": 0.5936088154269973, "grad_norm": 5.469796657562256, "learning_rate": 3.6127025419079714e-06, "loss": 0.3436, "step": 5387 }, { "epoch": 0.5937190082644628, "grad_norm": 5.619692802429199, "learning_rate": 3.611022837492908e-06, "loss": 0.4052, "step": 5388 }, { "epoch": 0.5938292011019284, "grad_norm": 4.303910255432129, "learning_rate": 3.6093433029238576e-06, "loss": 0.3026, "step": 5389 }, { "epoch": 0.593939393939394, "grad_norm": 4.820587158203125, "learning_rate": 3.6076639384061985e-06, "loss": 0.3967, "step": 5390 }, { "epoch": 0.5940495867768595, "grad_norm": 6.786526679992676, "learning_rate": 3.6059847441452835e-06, "loss": 0.3944, "step": 5391 }, { "epoch": 0.5941597796143251, "grad_norm": 6.68752384185791, "learning_rate": 3.6043057203464483e-06, "loss": 0.452, "step": 5392 }, { "epoch": 0.5942699724517906, "grad_norm": 8.34605598449707, "learning_rate": 3.602626867215006e-06, "loss": 0.4646, "step": 5393 }, { "epoch": 0.5943801652892562, "grad_norm": 10.381217002868652, "learning_rate": 3.600948184956246e-06, "loss": 0.3274, "step": 5394 }, { "epoch": 0.5944903581267218, "grad_norm": 7.704305648803711, "learning_rate": 3.599269673775444e-06, "loss": 0.3973, "step": 5395 }, { "epoch": 0.5946005509641873, "grad_norm": 4.4532623291015625, "learning_rate": 3.5975913338778513e-06, "loss": 0.415, "step": 5396 }, { "epoch": 0.5947107438016529, "grad_norm": 3.834787130355835, "learning_rate": 3.595913165468691e-06, "loss": 0.3236, "step": 5397 }, { "epoch": 0.5948209366391185, "grad_norm": 8.35515308380127, "learning_rate": 3.5942351687531795e-06, "loss": 0.4118, "step": 5398 }, { "epoch": 0.594931129476584, "grad_norm": 11.47801399230957, "learning_rate": 3.5925573439364996e-06, "loss": 0.4985, "step": 5399 }, { "epoch": 0.5950413223140496, "grad_norm": 23.297760009765625, "learning_rate": 3.5908796912238174e-06, "loss": 0.4167, "step": 5400 }, { "epoch": 0.5951515151515151, "grad_norm": 5.2379937171936035, "learning_rate": 3.589202210820285e-06, "loss": 0.4222, "step": 5401 }, { "epoch": 0.5952617079889807, "grad_norm": 10.262834548950195, "learning_rate": 3.5875249029310204e-06, "loss": 0.4393, "step": 5402 }, { "epoch": 0.5953719008264463, "grad_norm": 8.890426635742188, "learning_rate": 3.585847767761129e-06, "loss": 0.3535, "step": 5403 }, { "epoch": 0.5954820936639118, "grad_norm": 5.533564567565918, "learning_rate": 3.584170805515694e-06, "loss": 0.3808, "step": 5404 }, { "epoch": 0.5955922865013774, "grad_norm": 8.540761947631836, "learning_rate": 3.5824940163997757e-06, "loss": 0.4659, "step": 5405 }, { "epoch": 0.595702479338843, "grad_norm": 10.449670791625977, "learning_rate": 3.580817400618415e-06, "loss": 0.4835, "step": 5406 }, { "epoch": 0.5958126721763085, "grad_norm": 4.463833808898926, "learning_rate": 3.57914095837663e-06, "loss": 0.4041, "step": 5407 }, { "epoch": 0.5959228650137741, "grad_norm": 5.430807113647461, "learning_rate": 3.5774646898794186e-06, "loss": 0.433, "step": 5408 }, { "epoch": 0.5960330578512397, "grad_norm": 6.412762641906738, "learning_rate": 3.5757885953317578e-06, "loss": 0.3562, "step": 5409 }, { "epoch": 0.5961432506887052, "grad_norm": 4.589330673217773, "learning_rate": 3.5741126749386025e-06, "loss": 0.3971, "step": 5410 }, { "epoch": 0.5962534435261708, "grad_norm": 6.2130045890808105, "learning_rate": 3.5724369289048845e-06, "loss": 0.4398, "step": 5411 }, { "epoch": 0.5963636363636363, "grad_norm": 8.505125999450684, "learning_rate": 3.5707613574355194e-06, "loss": 0.3777, "step": 5412 }, { "epoch": 0.5964738292011019, "grad_norm": 6.111608505249023, "learning_rate": 3.569085960735397e-06, "loss": 0.4284, "step": 5413 }, { "epoch": 0.5965840220385675, "grad_norm": 6.80982780456543, "learning_rate": 3.567410739009386e-06, "loss": 0.4572, "step": 5414 }, { "epoch": 0.596694214876033, "grad_norm": 7.65366268157959, "learning_rate": 3.5657356924623367e-06, "loss": 0.4361, "step": 5415 }, { "epoch": 0.5968044077134986, "grad_norm": 7.172601699829102, "learning_rate": 3.564060821299076e-06, "loss": 0.4988, "step": 5416 }, { "epoch": 0.5969146005509642, "grad_norm": 5.870329856872559, "learning_rate": 3.5623861257244062e-06, "loss": 0.3892, "step": 5417 }, { "epoch": 0.5970247933884297, "grad_norm": 10.348566055297852, "learning_rate": 3.560711605943116e-06, "loss": 0.3357, "step": 5418 }, { "epoch": 0.5971349862258953, "grad_norm": 4.690185546875, "learning_rate": 3.5590372621599634e-06, "loss": 0.4008, "step": 5419 }, { "epoch": 0.5972451790633608, "grad_norm": 4.000380039215088, "learning_rate": 3.5573630945796934e-06, "loss": 0.3698, "step": 5420 }, { "epoch": 0.5973553719008264, "grad_norm": 5.777688026428223, "learning_rate": 3.555689103407024e-06, "loss": 0.3873, "step": 5421 }, { "epoch": 0.597465564738292, "grad_norm": 13.219320297241211, "learning_rate": 3.5540152888466515e-06, "loss": 0.5007, "step": 5422 }, { "epoch": 0.5975757575757575, "grad_norm": 7.085648059844971, "learning_rate": 3.552341651103255e-06, "loss": 0.449, "step": 5423 }, { "epoch": 0.5976859504132231, "grad_norm": 8.671232223510742, "learning_rate": 3.550668190381489e-06, "loss": 0.4129, "step": 5424 }, { "epoch": 0.5977961432506887, "grad_norm": 10.42537784576416, "learning_rate": 3.548994906885982e-06, "loss": 0.4701, "step": 5425 }, { "epoch": 0.5979063360881542, "grad_norm": 4.7576494216918945, "learning_rate": 3.547321800821353e-06, "loss": 0.3749, "step": 5426 }, { "epoch": 0.5980165289256199, "grad_norm": 9.288346290588379, "learning_rate": 3.545648872392185e-06, "loss": 0.3779, "step": 5427 }, { "epoch": 0.5981267217630853, "grad_norm": 7.297476768493652, "learning_rate": 3.5439761218030465e-06, "loss": 0.3754, "step": 5428 }, { "epoch": 0.598236914600551, "grad_norm": 7.915855884552002, "learning_rate": 3.542303549258489e-06, "loss": 0.3809, "step": 5429 }, { "epoch": 0.5983471074380166, "grad_norm": 9.086050033569336, "learning_rate": 3.540631154963033e-06, "loss": 0.3856, "step": 5430 }, { "epoch": 0.598457300275482, "grad_norm": 6.687013149261475, "learning_rate": 3.5389589391211805e-06, "loss": 0.4279, "step": 5431 }, { "epoch": 0.5985674931129477, "grad_norm": 4.62927770614624, "learning_rate": 3.5372869019374146e-06, "loss": 0.3981, "step": 5432 }, { "epoch": 0.5986776859504133, "grad_norm": 6.236980438232422, "learning_rate": 3.5356150436161908e-06, "loss": 0.3737, "step": 5433 }, { "epoch": 0.5987878787878788, "grad_norm": 8.671568870544434, "learning_rate": 3.5339433643619515e-06, "loss": 0.4959, "step": 5434 }, { "epoch": 0.5988980716253444, "grad_norm": 4.198315620422363, "learning_rate": 3.5322718643791087e-06, "loss": 0.3807, "step": 5435 }, { "epoch": 0.59900826446281, "grad_norm": 5.558796405792236, "learning_rate": 3.530600543872055e-06, "loss": 0.426, "step": 5436 }, { "epoch": 0.5991184573002755, "grad_norm": 5.832391262054443, "learning_rate": 3.528929403045163e-06, "loss": 0.42, "step": 5437 }, { "epoch": 0.5992286501377411, "grad_norm": 5.3307929039001465, "learning_rate": 3.5272584421027823e-06, "loss": 0.3636, "step": 5438 }, { "epoch": 0.5993388429752066, "grad_norm": 6.033764362335205, "learning_rate": 3.525587661249239e-06, "loss": 0.3113, "step": 5439 }, { "epoch": 0.5994490358126722, "grad_norm": 4.488221168518066, "learning_rate": 3.5239170606888405e-06, "loss": 0.3893, "step": 5440 }, { "epoch": 0.5995592286501378, "grad_norm": 4.295153617858887, "learning_rate": 3.522246640625868e-06, "loss": 0.375, "step": 5441 }, { "epoch": 0.5996694214876033, "grad_norm": 4.770776748657227, "learning_rate": 3.520576401264584e-06, "loss": 0.4244, "step": 5442 }, { "epoch": 0.5997796143250689, "grad_norm": 7.971227645874023, "learning_rate": 3.5189063428092276e-06, "loss": 0.4109, "step": 5443 }, { "epoch": 0.5998898071625345, "grad_norm": 11.983476638793945, "learning_rate": 3.5172364654640144e-06, "loss": 0.5333, "step": 5444 }, { "epoch": 0.6, "grad_norm": 6.334736347198486, "learning_rate": 3.5155667694331408e-06, "loss": 0.4252, "step": 5445 }, { "epoch": 0.6001101928374656, "grad_norm": 4.475926876068115, "learning_rate": 3.5138972549207794e-06, "loss": 0.3493, "step": 5446 }, { "epoch": 0.6002203856749311, "grad_norm": 4.665364742279053, "learning_rate": 3.5122279221310795e-06, "loss": 0.4304, "step": 5447 }, { "epoch": 0.6003305785123967, "grad_norm": 7.550339221954346, "learning_rate": 3.510558771268171e-06, "loss": 0.4307, "step": 5448 }, { "epoch": 0.6003305785123967, "eval_loss": 0.4078207314014435, "eval_runtime": 41.9468, "eval_samples_per_second": 17.498, "eval_steps_per_second": 2.193, "step": 5448 }, { "epoch": 0.6004407713498623, "grad_norm": 7.081995487213135, "learning_rate": 3.5088898025361596e-06, "loss": 0.3884, "step": 5449 }, { "epoch": 0.6005509641873278, "grad_norm": 7.735726356506348, "learning_rate": 3.5072210161391273e-06, "loss": 0.5048, "step": 5450 }, { "epoch": 0.6006611570247934, "grad_norm": 6.870943069458008, "learning_rate": 3.505552412281138e-06, "loss": 0.4516, "step": 5451 }, { "epoch": 0.600771349862259, "grad_norm": 6.664737701416016, "learning_rate": 3.5038839911662303e-06, "loss": 0.3871, "step": 5452 }, { "epoch": 0.6008815426997245, "grad_norm": 7.977838516235352, "learning_rate": 3.502215752998418e-06, "loss": 0.4709, "step": 5453 }, { "epoch": 0.6009917355371901, "grad_norm": 8.963409423828125, "learning_rate": 3.5005476979816992e-06, "loss": 0.4825, "step": 5454 }, { "epoch": 0.6011019283746556, "grad_norm": 9.17885684967041, "learning_rate": 3.4988798263200463e-06, "loss": 0.4628, "step": 5455 }, { "epoch": 0.6012121212121212, "grad_norm": 7.315991401672363, "learning_rate": 3.497212138217404e-06, "loss": 0.3959, "step": 5456 }, { "epoch": 0.6013223140495868, "grad_norm": 4.886915683746338, "learning_rate": 3.4955446338777064e-06, "loss": 0.3892, "step": 5457 }, { "epoch": 0.6014325068870523, "grad_norm": 5.84981107711792, "learning_rate": 3.493877313504853e-06, "loss": 0.3855, "step": 5458 }, { "epoch": 0.6015426997245179, "grad_norm": 6.625370502471924, "learning_rate": 3.492210177302727e-06, "loss": 0.4021, "step": 5459 }, { "epoch": 0.6016528925619835, "grad_norm": 10.513236999511719, "learning_rate": 3.490543225475189e-06, "loss": 0.5396, "step": 5460 }, { "epoch": 0.601763085399449, "grad_norm": 14.246024131774902, "learning_rate": 3.488876458226075e-06, "loss": 0.4531, "step": 5461 }, { "epoch": 0.6018732782369146, "grad_norm": 6.541625499725342, "learning_rate": 3.487209875759202e-06, "loss": 0.3737, "step": 5462 }, { "epoch": 0.6019834710743802, "grad_norm": 10.214615821838379, "learning_rate": 3.4855434782783603e-06, "loss": 0.441, "step": 5463 }, { "epoch": 0.6020936639118457, "grad_norm": 8.126017570495605, "learning_rate": 3.4838772659873175e-06, "loss": 0.3262, "step": 5464 }, { "epoch": 0.6022038567493113, "grad_norm": 9.761160850524902, "learning_rate": 3.4822112390898233e-06, "loss": 0.3774, "step": 5465 }, { "epoch": 0.6023140495867768, "grad_norm": 5.821922779083252, "learning_rate": 3.480545397789601e-06, "loss": 0.4295, "step": 5466 }, { "epoch": 0.6024242424242424, "grad_norm": 5.46732234954834, "learning_rate": 3.4788797422903496e-06, "loss": 0.3349, "step": 5467 }, { "epoch": 0.602534435261708, "grad_norm": 6.44102144241333, "learning_rate": 3.4772142727957515e-06, "loss": 0.4165, "step": 5468 }, { "epoch": 0.6026446280991735, "grad_norm": 5.899956703186035, "learning_rate": 3.4755489895094596e-06, "loss": 0.4426, "step": 5469 }, { "epoch": 0.6027548209366391, "grad_norm": 4.245342254638672, "learning_rate": 3.4738838926351078e-06, "loss": 0.3504, "step": 5470 }, { "epoch": 0.6028650137741047, "grad_norm": 5.119142055511475, "learning_rate": 3.4722189823763075e-06, "loss": 0.3728, "step": 5471 }, { "epoch": 0.6029752066115702, "grad_norm": 5.943778038024902, "learning_rate": 3.470554258936645e-06, "loss": 0.3109, "step": 5472 }, { "epoch": 0.6030853994490358, "grad_norm": 5.078598976135254, "learning_rate": 3.4688897225196845e-06, "loss": 0.3263, "step": 5473 }, { "epoch": 0.6031955922865013, "grad_norm": 6.279187202453613, "learning_rate": 3.4672253733289694e-06, "loss": 0.4356, "step": 5474 }, { "epoch": 0.6033057851239669, "grad_norm": 5.126062393188477, "learning_rate": 3.4655612115680172e-06, "loss": 0.3613, "step": 5475 }, { "epoch": 0.6034159779614325, "grad_norm": 5.338726043701172, "learning_rate": 3.4638972374403246e-06, "loss": 0.3471, "step": 5476 }, { "epoch": 0.603526170798898, "grad_norm": 10.157291412353516, "learning_rate": 3.462233451149365e-06, "loss": 0.4231, "step": 5477 }, { "epoch": 0.6036363636363636, "grad_norm": 5.979432582855225, "learning_rate": 3.4605698528985866e-06, "loss": 0.3649, "step": 5478 }, { "epoch": 0.6037465564738292, "grad_norm": 6.608692169189453, "learning_rate": 3.4589064428914186e-06, "loss": 0.4855, "step": 5479 }, { "epoch": 0.6038567493112947, "grad_norm": 15.17431640625, "learning_rate": 3.457243221331266e-06, "loss": 0.5176, "step": 5480 }, { "epoch": 0.6039669421487603, "grad_norm": 5.733303070068359, "learning_rate": 3.4555801884215036e-06, "loss": 0.3628, "step": 5481 }, { "epoch": 0.604077134986226, "grad_norm": 9.653572082519531, "learning_rate": 3.453917344365496e-06, "loss": 0.4355, "step": 5482 }, { "epoch": 0.6041873278236914, "grad_norm": 6.833795070648193, "learning_rate": 3.452254689366577e-06, "loss": 0.3556, "step": 5483 }, { "epoch": 0.604297520661157, "grad_norm": 5.048872470855713, "learning_rate": 3.450592223628054e-06, "loss": 0.3802, "step": 5484 }, { "epoch": 0.6044077134986225, "grad_norm": 6.3752546310424805, "learning_rate": 3.4489299473532212e-06, "loss": 0.3629, "step": 5485 }, { "epoch": 0.6045179063360882, "grad_norm": 15.690077781677246, "learning_rate": 3.4472678607453406e-06, "loss": 0.444, "step": 5486 }, { "epoch": 0.6046280991735538, "grad_norm": 5.63066291809082, "learning_rate": 3.4456059640076535e-06, "loss": 0.438, "step": 5487 }, { "epoch": 0.6047382920110193, "grad_norm": 11.27439022064209, "learning_rate": 3.4439442573433834e-06, "loss": 0.425, "step": 5488 }, { "epoch": 0.6048484848484849, "grad_norm": 4.574676990509033, "learning_rate": 3.4422827409557197e-06, "loss": 0.3495, "step": 5489 }, { "epoch": 0.6049586776859505, "grad_norm": 12.163301467895508, "learning_rate": 3.4406214150478417e-06, "loss": 0.4243, "step": 5490 }, { "epoch": 0.605068870523416, "grad_norm": 5.973365306854248, "learning_rate": 3.4389602798228942e-06, "loss": 0.3563, "step": 5491 }, { "epoch": 0.6051790633608816, "grad_norm": 6.112514019012451, "learning_rate": 3.4372993354840034e-06, "loss": 0.4301, "step": 5492 }, { "epoch": 0.6052892561983471, "grad_norm": 6.745209217071533, "learning_rate": 3.4356385822342734e-06, "loss": 0.4964, "step": 5493 }, { "epoch": 0.6053994490358127, "grad_norm": 6.515758991241455, "learning_rate": 3.4339780202767824e-06, "loss": 0.4525, "step": 5494 }, { "epoch": 0.6055096418732783, "grad_norm": 7.227394104003906, "learning_rate": 3.432317649814586e-06, "loss": 0.3632, "step": 5495 }, { "epoch": 0.6056198347107438, "grad_norm": 6.263385772705078, "learning_rate": 3.430657471050717e-06, "loss": 0.3573, "step": 5496 }, { "epoch": 0.6057300275482094, "grad_norm": 7.537135601043701, "learning_rate": 3.4289974841881848e-06, "loss": 0.423, "step": 5497 }, { "epoch": 0.605840220385675, "grad_norm": 7.491336822509766, "learning_rate": 3.4273376894299726e-06, "loss": 0.4863, "step": 5498 }, { "epoch": 0.6059504132231405, "grad_norm": 6.500481128692627, "learning_rate": 3.4256780869790456e-06, "loss": 0.4532, "step": 5499 }, { "epoch": 0.6060606060606061, "grad_norm": 7.756498336791992, "learning_rate": 3.4240186770383406e-06, "loss": 0.3945, "step": 5500 }, { "epoch": 0.6061707988980716, "grad_norm": 7.27528715133667, "learning_rate": 3.422359459810771e-06, "loss": 0.4888, "step": 5501 }, { "epoch": 0.6062809917355372, "grad_norm": 6.987998008728027, "learning_rate": 3.420700435499231e-06, "loss": 0.3375, "step": 5502 }, { "epoch": 0.6063911845730028, "grad_norm": 7.22456169128418, "learning_rate": 3.419041604306586e-06, "loss": 0.4189, "step": 5503 }, { "epoch": 0.6065013774104683, "grad_norm": 18.85123062133789, "learning_rate": 3.4173829664356823e-06, "loss": 0.5285, "step": 5504 }, { "epoch": 0.6066115702479339, "grad_norm": 4.069933891296387, "learning_rate": 3.4157245220893387e-06, "loss": 0.4348, "step": 5505 }, { "epoch": 0.6067217630853995, "grad_norm": 6.977851867675781, "learning_rate": 3.414066271470352e-06, "loss": 0.415, "step": 5506 }, { "epoch": 0.606831955922865, "grad_norm": 5.772762775421143, "learning_rate": 3.4124082147814972e-06, "loss": 0.4229, "step": 5507 }, { "epoch": 0.6069421487603306, "grad_norm": 4.402000904083252, "learning_rate": 3.410750352225522e-06, "loss": 0.4296, "step": 5508 }, { "epoch": 0.6070523415977962, "grad_norm": 5.187463760375977, "learning_rate": 3.409092684005152e-06, "loss": 0.3901, "step": 5509 }, { "epoch": 0.6071625344352617, "grad_norm": 9.41743278503418, "learning_rate": 3.40743521032309e-06, "loss": 0.525, "step": 5510 }, { "epoch": 0.6072727272727273, "grad_norm": 10.870046615600586, "learning_rate": 3.405777931382015e-06, "loss": 0.5082, "step": 5511 }, { "epoch": 0.6073829201101928, "grad_norm": 9.105496406555176, "learning_rate": 3.404120847384577e-06, "loss": 0.4436, "step": 5512 }, { "epoch": 0.6074931129476584, "grad_norm": 4.680967330932617, "learning_rate": 3.402463958533413e-06, "loss": 0.3457, "step": 5513 }, { "epoch": 0.607603305785124, "grad_norm": 7.194051265716553, "learning_rate": 3.4008072650311248e-06, "loss": 0.3418, "step": 5514 }, { "epoch": 0.6077134986225895, "grad_norm": 3.9928011894226074, "learning_rate": 3.3991507670802943e-06, "loss": 0.4321, "step": 5515 }, { "epoch": 0.6078236914600551, "grad_norm": 8.524062156677246, "learning_rate": 3.397494464883486e-06, "loss": 0.4718, "step": 5516 }, { "epoch": 0.6079338842975207, "grad_norm": 5.69117546081543, "learning_rate": 3.395838358643228e-06, "loss": 0.3543, "step": 5517 }, { "epoch": 0.6080440771349862, "grad_norm": 7.059926509857178, "learning_rate": 3.394182448562038e-06, "loss": 0.3468, "step": 5518 }, { "epoch": 0.6081542699724518, "grad_norm": 5.54630708694458, "learning_rate": 3.392526734842398e-06, "loss": 0.3813, "step": 5519 }, { "epoch": 0.6082644628099173, "grad_norm": 5.180455684661865, "learning_rate": 3.39087121768677e-06, "loss": 0.4141, "step": 5520 }, { "epoch": 0.6083746556473829, "grad_norm": 10.610570907592773, "learning_rate": 3.3892158972975996e-06, "loss": 0.4915, "step": 5521 }, { "epoch": 0.6084848484848485, "grad_norm": 7.216747760772705, "learning_rate": 3.387560773877295e-06, "loss": 0.3115, "step": 5522 }, { "epoch": 0.608595041322314, "grad_norm": 4.480962753295898, "learning_rate": 3.385905847628249e-06, "loss": 0.3949, "step": 5523 }, { "epoch": 0.6087052341597796, "grad_norm": 8.370291709899902, "learning_rate": 3.384251118752829e-06, "loss": 0.4791, "step": 5524 }, { "epoch": 0.6088154269972452, "grad_norm": 5.0554656982421875, "learning_rate": 3.382596587453378e-06, "loss": 0.3885, "step": 5525 }, { "epoch": 0.6089256198347107, "grad_norm": 7.361545085906982, "learning_rate": 3.3809422539322114e-06, "loss": 0.392, "step": 5526 }, { "epoch": 0.6090358126721763, "grad_norm": 5.282715320587158, "learning_rate": 3.3792881183916264e-06, "loss": 0.3994, "step": 5527 }, { "epoch": 0.6091460055096418, "grad_norm": 7.640480995178223, "learning_rate": 3.3776341810338918e-06, "loss": 0.4252, "step": 5528 }, { "epoch": 0.6092561983471074, "grad_norm": 11.432671546936035, "learning_rate": 3.3759804420612523e-06, "loss": 0.4544, "step": 5529 }, { "epoch": 0.609366391184573, "grad_norm": 4.013760566711426, "learning_rate": 3.3743269016759315e-06, "loss": 0.3352, "step": 5530 }, { "epoch": 0.6094765840220385, "grad_norm": 6.269103050231934, "learning_rate": 3.372673560080124e-06, "loss": 0.4492, "step": 5531 }, { "epoch": 0.6095867768595041, "grad_norm": 10.109606742858887, "learning_rate": 3.3710204174760057e-06, "loss": 0.4854, "step": 5532 }, { "epoch": 0.6096969696969697, "grad_norm": 7.672402858734131, "learning_rate": 3.3693674740657232e-06, "loss": 0.395, "step": 5533 }, { "epoch": 0.6098071625344352, "grad_norm": 5.026207447052002, "learning_rate": 3.3677147300514003e-06, "loss": 0.3911, "step": 5534 }, { "epoch": 0.6099173553719008, "grad_norm": 6.89813232421875, "learning_rate": 3.366062185635138e-06, "loss": 0.3677, "step": 5535 }, { "epoch": 0.6100275482093664, "grad_norm": 9.764766693115234, "learning_rate": 3.3644098410190116e-06, "loss": 0.4638, "step": 5536 }, { "epoch": 0.6101377410468319, "grad_norm": 12.982881546020508, "learning_rate": 3.3627576964050703e-06, "loss": 0.4426, "step": 5537 }, { "epoch": 0.6102479338842975, "grad_norm": 6.899441719055176, "learning_rate": 3.3611057519953426e-06, "loss": 0.4431, "step": 5538 }, { "epoch": 0.610358126721763, "grad_norm": 7.647733688354492, "learning_rate": 3.3594540079918314e-06, "loss": 0.4726, "step": 5539 }, { "epoch": 0.6104683195592286, "grad_norm": 5.014935493469238, "learning_rate": 3.357802464596509e-06, "loss": 0.4366, "step": 5540 }, { "epoch": 0.6105785123966943, "grad_norm": 13.708841323852539, "learning_rate": 3.3561511220113342e-06, "loss": 0.3356, "step": 5541 }, { "epoch": 0.6106887052341597, "grad_norm": 5.648785591125488, "learning_rate": 3.354499980438234e-06, "loss": 0.3524, "step": 5542 }, { "epoch": 0.6107988980716254, "grad_norm": 7.1190667152404785, "learning_rate": 3.352849040079108e-06, "loss": 0.4991, "step": 5543 }, { "epoch": 0.610909090909091, "grad_norm": 5.480741500854492, "learning_rate": 3.3511983011358423e-06, "loss": 0.4242, "step": 5544 }, { "epoch": 0.6110192837465565, "grad_norm": 5.00535249710083, "learning_rate": 3.349547763810285e-06, "loss": 0.4734, "step": 5545 }, { "epoch": 0.6111294765840221, "grad_norm": 7.967171669006348, "learning_rate": 3.347897428304272e-06, "loss": 0.3367, "step": 5546 }, { "epoch": 0.6112396694214876, "grad_norm": 10.291219711303711, "learning_rate": 3.3462472948196044e-06, "loss": 0.4168, "step": 5547 }, { "epoch": 0.6113498622589532, "grad_norm": 10.36643123626709, "learning_rate": 3.3445973635580626e-06, "loss": 0.4614, "step": 5548 }, { "epoch": 0.6114600550964188, "grad_norm": 8.678853034973145, "learning_rate": 3.342947634721406e-06, "loss": 0.3642, "step": 5549 }, { "epoch": 0.6115702479338843, "grad_norm": 6.127782821655273, "learning_rate": 3.341298108511363e-06, "loss": 0.3883, "step": 5550 }, { "epoch": 0.6116804407713499, "grad_norm": 7.922155380249023, "learning_rate": 3.33964878512964e-06, "loss": 0.4173, "step": 5551 }, { "epoch": 0.6117906336088155, "grad_norm": 7.354754447937012, "learning_rate": 3.337999664777919e-06, "loss": 0.3638, "step": 5552 }, { "epoch": 0.611900826446281, "grad_norm": 5.784652233123779, "learning_rate": 3.336350747657857e-06, "loss": 0.4165, "step": 5553 }, { "epoch": 0.6120110192837466, "grad_norm": 10.026205062866211, "learning_rate": 3.3347020339710844e-06, "loss": 0.4651, "step": 5554 }, { "epoch": 0.6121212121212121, "grad_norm": 9.611244201660156, "learning_rate": 3.33305352391921e-06, "loss": 0.4452, "step": 5555 }, { "epoch": 0.6122314049586777, "grad_norm": 8.440895080566406, "learning_rate": 3.3314052177038147e-06, "loss": 0.413, "step": 5556 }, { "epoch": 0.6123415977961433, "grad_norm": 5.659351348876953, "learning_rate": 3.329757115526456e-06, "loss": 0.4292, "step": 5557 }, { "epoch": 0.6124517906336088, "grad_norm": 14.748259544372559, "learning_rate": 3.3281092175886665e-06, "loss": 0.4598, "step": 5558 }, { "epoch": 0.6125619834710744, "grad_norm": 5.136844635009766, "learning_rate": 3.326461524091952e-06, "loss": 0.3272, "step": 5559 }, { "epoch": 0.61267217630854, "grad_norm": 5.436107158660889, "learning_rate": 3.3248140352377957e-06, "loss": 0.3365, "step": 5560 }, { "epoch": 0.6127823691460055, "grad_norm": 5.402589797973633, "learning_rate": 3.3231667512276553e-06, "loss": 0.3975, "step": 5561 }, { "epoch": 0.6128925619834711, "grad_norm": 5.3792595863342285, "learning_rate": 3.321519672262962e-06, "loss": 0.363, "step": 5562 }, { "epoch": 0.6130027548209367, "grad_norm": 8.311089515686035, "learning_rate": 3.319872798545123e-06, "loss": 0.4721, "step": 5563 }, { "epoch": 0.6131129476584022, "grad_norm": 6.57785701751709, "learning_rate": 3.3182261302755216e-06, "loss": 0.4175, "step": 5564 }, { "epoch": 0.6132231404958678, "grad_norm": 7.138577461242676, "learning_rate": 3.3165796676555118e-06, "loss": 0.4104, "step": 5565 }, { "epoch": 0.6133333333333333, "grad_norm": 5.004648685455322, "learning_rate": 3.3149334108864273e-06, "loss": 0.423, "step": 5566 }, { "epoch": 0.6134435261707989, "grad_norm": 10.52160930633545, "learning_rate": 3.3132873601695764e-06, "loss": 0.5053, "step": 5567 }, { "epoch": 0.6135537190082645, "grad_norm": 7.567537784576416, "learning_rate": 3.311641515706234e-06, "loss": 0.542, "step": 5568 }, { "epoch": 0.61366391184573, "grad_norm": 4.513036727905273, "learning_rate": 3.3099958776976636e-06, "loss": 0.4161, "step": 5569 }, { "epoch": 0.6137741046831956, "grad_norm": 5.943395137786865, "learning_rate": 3.3083504463450943e-06, "loss": 0.3304, "step": 5570 }, { "epoch": 0.6138842975206612, "grad_norm": 6.845271110534668, "learning_rate": 3.3067052218497263e-06, "loss": 0.3635, "step": 5571 }, { "epoch": 0.6139944903581267, "grad_norm": 7.157235622406006, "learning_rate": 3.3050602044127473e-06, "loss": 0.4635, "step": 5572 }, { "epoch": 0.6141046831955923, "grad_norm": 5.039437770843506, "learning_rate": 3.3034153942353055e-06, "loss": 0.3877, "step": 5573 }, { "epoch": 0.6142148760330578, "grad_norm": 5.90138578414917, "learning_rate": 3.301770791518536e-06, "loss": 0.4233, "step": 5574 }, { "epoch": 0.6143250688705234, "grad_norm": 5.284835338592529, "learning_rate": 3.300126396463542e-06, "loss": 0.4305, "step": 5575 }, { "epoch": 0.614435261707989, "grad_norm": 6.650893211364746, "learning_rate": 3.2984822092713987e-06, "loss": 0.4265, "step": 5576 }, { "epoch": 0.6145454545454545, "grad_norm": 8.513042449951172, "learning_rate": 3.2968382301431646e-06, "loss": 0.4179, "step": 5577 }, { "epoch": 0.6146556473829201, "grad_norm": 6.663072109222412, "learning_rate": 3.2951944592798645e-06, "loss": 0.4126, "step": 5578 }, { "epoch": 0.6147658402203857, "grad_norm": 3.8044331073760986, "learning_rate": 3.2935508968825e-06, "loss": 0.3906, "step": 5579 }, { "epoch": 0.6148760330578512, "grad_norm": 5.1406097412109375, "learning_rate": 3.291907543152052e-06, "loss": 0.4087, "step": 5580 }, { "epoch": 0.6149862258953168, "grad_norm": 7.120194435119629, "learning_rate": 3.2902643982894696e-06, "loss": 0.378, "step": 5581 }, { "epoch": 0.6150964187327824, "grad_norm": 7.6343183517456055, "learning_rate": 3.2886214624956776e-06, "loss": 0.4943, "step": 5582 }, { "epoch": 0.6152066115702479, "grad_norm": 5.776088237762451, "learning_rate": 3.2869787359715786e-06, "loss": 0.3461, "step": 5583 }, { "epoch": 0.6153168044077135, "grad_norm": 5.77926778793335, "learning_rate": 3.285336218918047e-06, "loss": 0.3914, "step": 5584 }, { "epoch": 0.615426997245179, "grad_norm": 5.3097243309021, "learning_rate": 3.2836939115359313e-06, "loss": 0.4227, "step": 5585 }, { "epoch": 0.6155371900826446, "grad_norm": 4.755492687225342, "learning_rate": 3.2820518140260554e-06, "loss": 0.3989, "step": 5586 }, { "epoch": 0.6156473829201102, "grad_norm": 5.929659366607666, "learning_rate": 3.280409926589216e-06, "loss": 0.3955, "step": 5587 }, { "epoch": 0.6157575757575757, "grad_norm": 6.221659183502197, "learning_rate": 3.278768249426189e-06, "loss": 0.3877, "step": 5588 }, { "epoch": 0.6158677685950413, "grad_norm": 5.330348968505859, "learning_rate": 3.2771267827377177e-06, "loss": 0.3422, "step": 5589 }, { "epoch": 0.6159779614325069, "grad_norm": 12.225643157958984, "learning_rate": 3.2754855267245232e-06, "loss": 0.4134, "step": 5590 }, { "epoch": 0.6160881542699724, "grad_norm": 4.399528980255127, "learning_rate": 3.2738444815873015e-06, "loss": 0.4353, "step": 5591 }, { "epoch": 0.616198347107438, "grad_norm": 8.618428230285645, "learning_rate": 3.2722036475267215e-06, "loss": 0.3784, "step": 5592 }, { "epoch": 0.6163085399449035, "grad_norm": 9.162566184997559, "learning_rate": 3.2705630247434258e-06, "loss": 0.4122, "step": 5593 }, { "epoch": 0.6164187327823691, "grad_norm": 5.548257827758789, "learning_rate": 3.2689226134380333e-06, "loss": 0.3735, "step": 5594 }, { "epoch": 0.6165289256198347, "grad_norm": 6.5701446533203125, "learning_rate": 3.267282413811135e-06, "loss": 0.4465, "step": 5595 }, { "epoch": 0.6166391184573002, "grad_norm": 6.101266384124756, "learning_rate": 3.265642426063296e-06, "loss": 0.4459, "step": 5596 }, { "epoch": 0.6167493112947658, "grad_norm": 6.442191123962402, "learning_rate": 3.264002650395058e-06, "loss": 0.3938, "step": 5597 }, { "epoch": 0.6168595041322315, "grad_norm": 5.264941692352295, "learning_rate": 3.2623630870069346e-06, "loss": 0.3521, "step": 5598 }, { "epoch": 0.616969696969697, "grad_norm": 8.560416221618652, "learning_rate": 3.260723736099411e-06, "loss": 0.4031, "step": 5599 }, { "epoch": 0.6170798898071626, "grad_norm": 7.50861120223999, "learning_rate": 3.2590845978729528e-06, "loss": 0.3334, "step": 5600 }, { "epoch": 0.617190082644628, "grad_norm": 5.766119003295898, "learning_rate": 3.2574456725279946e-06, "loss": 0.4062, "step": 5601 }, { "epoch": 0.6173002754820937, "grad_norm": 7.3114705085754395, "learning_rate": 3.255806960264947e-06, "loss": 0.4019, "step": 5602 }, { "epoch": 0.6174104683195593, "grad_norm": 6.906582355499268, "learning_rate": 3.2541684612841943e-06, "loss": 0.4805, "step": 5603 }, { "epoch": 0.6175206611570248, "grad_norm": 14.328627586364746, "learning_rate": 3.2525301757860915e-06, "loss": 0.4274, "step": 5604 }, { "epoch": 0.6176308539944904, "grad_norm": 8.022102355957031, "learning_rate": 3.250892103970975e-06, "loss": 0.3747, "step": 5605 }, { "epoch": 0.617741046831956, "grad_norm": 8.918832778930664, "learning_rate": 3.2492542460391467e-06, "loss": 0.4359, "step": 5606 }, { "epoch": 0.6178512396694215, "grad_norm": 6.053462505340576, "learning_rate": 3.2476166021908856e-06, "loss": 0.4104, "step": 5607 }, { "epoch": 0.6179614325068871, "grad_norm": 7.203765392303467, "learning_rate": 3.24597917262645e-06, "loss": 0.4289, "step": 5608 }, { "epoch": 0.6180716253443527, "grad_norm": 5.25016975402832, "learning_rate": 3.2443419575460623e-06, "loss": 0.3791, "step": 5609 }, { "epoch": 0.6181818181818182, "grad_norm": 7.548518657684326, "learning_rate": 3.242704957149925e-06, "loss": 0.4507, "step": 5610 }, { "epoch": 0.6182920110192838, "grad_norm": 11.34952449798584, "learning_rate": 3.241068171638212e-06, "loss": 0.5606, "step": 5611 }, { "epoch": 0.6184022038567493, "grad_norm": 12.98247241973877, "learning_rate": 3.2394316012110726e-06, "loss": 0.5672, "step": 5612 }, { "epoch": 0.6185123966942149, "grad_norm": 5.332764625549316, "learning_rate": 3.237795246068628e-06, "loss": 0.3759, "step": 5613 }, { "epoch": 0.6186225895316805, "grad_norm": 6.132878303527832, "learning_rate": 3.2361591064109754e-06, "loss": 0.4482, "step": 5614 }, { "epoch": 0.618732782369146, "grad_norm": 6.75705623626709, "learning_rate": 3.234523182438182e-06, "loss": 0.3936, "step": 5615 }, { "epoch": 0.6188429752066116, "grad_norm": 5.98726224899292, "learning_rate": 3.2328874743502935e-06, "loss": 0.3458, "step": 5616 }, { "epoch": 0.6189531680440772, "grad_norm": 7.1376166343688965, "learning_rate": 3.231251982347324e-06, "loss": 0.4255, "step": 5617 }, { "epoch": 0.6190633608815427, "grad_norm": 5.139072895050049, "learning_rate": 3.229616706629265e-06, "loss": 0.4557, "step": 5618 }, { "epoch": 0.6191735537190083, "grad_norm": 6.690254211425781, "learning_rate": 3.22798164739608e-06, "loss": 0.4788, "step": 5619 }, { "epoch": 0.6192837465564738, "grad_norm": 7.521590232849121, "learning_rate": 3.2263468048477066e-06, "loss": 0.4368, "step": 5620 }, { "epoch": 0.6193939393939394, "grad_norm": 10.254018783569336, "learning_rate": 3.224712179184054e-06, "loss": 0.3912, "step": 5621 }, { "epoch": 0.619504132231405, "grad_norm": 8.72264289855957, "learning_rate": 3.2230777706050087e-06, "loss": 0.4295, "step": 5622 }, { "epoch": 0.6196143250688705, "grad_norm": 3.8662004470825195, "learning_rate": 3.221443579310428e-06, "loss": 0.3662, "step": 5623 }, { "epoch": 0.6197245179063361, "grad_norm": 4.6445722579956055, "learning_rate": 3.2198096055001404e-06, "loss": 0.4469, "step": 5624 }, { "epoch": 0.6198347107438017, "grad_norm": 4.952694416046143, "learning_rate": 3.2181758493739535e-06, "loss": 0.3441, "step": 5625 }, { "epoch": 0.6199449035812672, "grad_norm": 6.553282260894775, "learning_rate": 3.2165423111316453e-06, "loss": 0.3735, "step": 5626 }, { "epoch": 0.6200550964187328, "grad_norm": 7.485846042633057, "learning_rate": 3.2149089909729623e-06, "loss": 0.3838, "step": 5627 }, { "epoch": 0.6201652892561983, "grad_norm": 4.6056389808654785, "learning_rate": 3.2132758890976357e-06, "loss": 0.3903, "step": 5628 }, { "epoch": 0.6202754820936639, "grad_norm": 6.660373210906982, "learning_rate": 3.2116430057053594e-06, "loss": 0.4082, "step": 5629 }, { "epoch": 0.6203856749311295, "grad_norm": 6.841281890869141, "learning_rate": 3.2100103409958062e-06, "loss": 0.4608, "step": 5630 }, { "epoch": 0.620495867768595, "grad_norm": 12.513483047485352, "learning_rate": 3.2083778951686206e-06, "loss": 0.5196, "step": 5631 }, { "epoch": 0.6206060606060606, "grad_norm": 6.187821388244629, "learning_rate": 3.2067456684234167e-06, "loss": 0.4053, "step": 5632 }, { "epoch": 0.6207162534435262, "grad_norm": 6.212988376617432, "learning_rate": 3.205113660959791e-06, "loss": 0.3763, "step": 5633 }, { "epoch": 0.6208264462809917, "grad_norm": 7.4557623863220215, "learning_rate": 3.2034818729773056e-06, "loss": 0.4272, "step": 5634 }, { "epoch": 0.6209366391184573, "grad_norm": 5.5205206871032715, "learning_rate": 3.201850304675494e-06, "loss": 0.4393, "step": 5635 }, { "epoch": 0.6210468319559229, "grad_norm": 7.638397216796875, "learning_rate": 3.200218956253873e-06, "loss": 0.457, "step": 5636 }, { "epoch": 0.6211570247933884, "grad_norm": 3.999941110610962, "learning_rate": 3.1985878279119213e-06, "loss": 0.4008, "step": 5637 }, { "epoch": 0.621267217630854, "grad_norm": 7.912863731384277, "learning_rate": 3.196956919849097e-06, "loss": 0.3695, "step": 5638 }, { "epoch": 0.6213774104683195, "grad_norm": 4.004800796508789, "learning_rate": 3.195326232264829e-06, "loss": 0.3662, "step": 5639 }, { "epoch": 0.6214876033057851, "grad_norm": 5.542474746704102, "learning_rate": 3.193695765358522e-06, "loss": 0.3078, "step": 5640 }, { "epoch": 0.6215977961432507, "grad_norm": 7.421055316925049, "learning_rate": 3.192065519329549e-06, "loss": 0.3615, "step": 5641 }, { "epoch": 0.6217079889807162, "grad_norm": 6.017780780792236, "learning_rate": 3.190435494377262e-06, "loss": 0.332, "step": 5642 }, { "epoch": 0.6218181818181818, "grad_norm": 8.54306697845459, "learning_rate": 3.188805690700979e-06, "loss": 0.3651, "step": 5643 }, { "epoch": 0.6219283746556474, "grad_norm": 9.726669311523438, "learning_rate": 3.1871761084999975e-06, "loss": 0.4086, "step": 5644 }, { "epoch": 0.6220385674931129, "grad_norm": 5.96169376373291, "learning_rate": 3.1855467479735836e-06, "loss": 0.3899, "step": 5645 }, { "epoch": 0.6221487603305785, "grad_norm": 4.833744049072266, "learning_rate": 3.183917609320978e-06, "loss": 0.3217, "step": 5646 }, { "epoch": 0.622258953168044, "grad_norm": 6.837959289550781, "learning_rate": 3.1822886927413945e-06, "loss": 0.3345, "step": 5647 }, { "epoch": 0.6223691460055096, "grad_norm": 8.42690372467041, "learning_rate": 3.1806599984340182e-06, "loss": 0.4644, "step": 5648 }, { "epoch": 0.6224793388429752, "grad_norm": 10.13464641571045, "learning_rate": 3.179031526598008e-06, "loss": 0.3748, "step": 5649 }, { "epoch": 0.6225895316804407, "grad_norm": 13.409700393676758, "learning_rate": 3.1774032774324973e-06, "loss": 0.523, "step": 5650 }, { "epoch": 0.6226997245179063, "grad_norm": 7.18253755569458, "learning_rate": 3.1757752511365903e-06, "loss": 0.3761, "step": 5651 }, { "epoch": 0.622809917355372, "grad_norm": 6.525432109832764, "learning_rate": 3.1741474479093615e-06, "loss": 0.3809, "step": 5652 }, { "epoch": 0.6229201101928374, "grad_norm": 12.289591789245605, "learning_rate": 3.1725198679498647e-06, "loss": 0.5713, "step": 5653 }, { "epoch": 0.623030303030303, "grad_norm": 6.035792350769043, "learning_rate": 3.17089251145712e-06, "loss": 0.3966, "step": 5654 }, { "epoch": 0.6231404958677685, "grad_norm": 9.94541072845459, "learning_rate": 3.169265378630123e-06, "loss": 0.3731, "step": 5655 }, { "epoch": 0.6232506887052341, "grad_norm": 7.422900199890137, "learning_rate": 3.1676384696678436e-06, "loss": 0.3012, "step": 5656 }, { "epoch": 0.6233608815426998, "grad_norm": 5.478695392608643, "learning_rate": 3.16601178476922e-06, "loss": 0.3706, "step": 5657 }, { "epoch": 0.6234710743801652, "grad_norm": 4.026650905609131, "learning_rate": 3.164385324133168e-06, "loss": 0.3987, "step": 5658 }, { "epoch": 0.6235812672176309, "grad_norm": 5.533215522766113, "learning_rate": 3.1627590879585723e-06, "loss": 0.2778, "step": 5659 }, { "epoch": 0.6236914600550965, "grad_norm": 5.62987756729126, "learning_rate": 3.161133076444288e-06, "loss": 0.4016, "step": 5660 }, { "epoch": 0.623801652892562, "grad_norm": 5.299964427947998, "learning_rate": 3.159507289789151e-06, "loss": 0.4233, "step": 5661 }, { "epoch": 0.6239118457300276, "grad_norm": 5.24252462387085, "learning_rate": 3.1578817281919644e-06, "loss": 0.2786, "step": 5662 }, { "epoch": 0.6240220385674932, "grad_norm": 6.27246618270874, "learning_rate": 3.156256391851499e-06, "loss": 0.3504, "step": 5663 }, { "epoch": 0.6241322314049587, "grad_norm": 7.873795032501221, "learning_rate": 3.1546312809665093e-06, "loss": 0.4294, "step": 5664 }, { "epoch": 0.6242424242424243, "grad_norm": 6.295063495635986, "learning_rate": 3.153006395735712e-06, "loss": 0.4972, "step": 5665 }, { "epoch": 0.6243526170798898, "grad_norm": 6.084031581878662, "learning_rate": 3.1513817363577997e-06, "loss": 0.4003, "step": 5666 }, { "epoch": 0.6244628099173554, "grad_norm": 6.049535751342773, "learning_rate": 3.1497573030314433e-06, "loss": 0.4134, "step": 5667 }, { "epoch": 0.624573002754821, "grad_norm": 4.959476947784424, "learning_rate": 3.148133095955276e-06, "loss": 0.3097, "step": 5668 }, { "epoch": 0.6246831955922865, "grad_norm": 4.211620330810547, "learning_rate": 3.146509115327907e-06, "loss": 0.4245, "step": 5669 }, { "epoch": 0.6247933884297521, "grad_norm": 10.23599624633789, "learning_rate": 3.1448853613479238e-06, "loss": 0.5412, "step": 5670 }, { "epoch": 0.6249035812672177, "grad_norm": 4.663244724273682, "learning_rate": 3.1432618342138765e-06, "loss": 0.3413, "step": 5671 }, { "epoch": 0.6250137741046832, "grad_norm": 5.570018768310547, "learning_rate": 3.1416385341242957e-06, "loss": 0.3754, "step": 5672 }, { "epoch": 0.6251239669421488, "grad_norm": 7.070735931396484, "learning_rate": 3.140015461277679e-06, "loss": 0.4864, "step": 5673 }, { "epoch": 0.6252341597796143, "grad_norm": 11.128875732421875, "learning_rate": 3.1383926158724976e-06, "loss": 0.4269, "step": 5674 }, { "epoch": 0.6253443526170799, "grad_norm": 6.50960111618042, "learning_rate": 3.1367699981071962e-06, "loss": 0.4417, "step": 5675 }, { "epoch": 0.6254545454545455, "grad_norm": 9.919767379760742, "learning_rate": 3.135147608180191e-06, "loss": 0.4222, "step": 5676 }, { "epoch": 0.625564738292011, "grad_norm": 7.701219081878662, "learning_rate": 3.1335254462898686e-06, "loss": 0.435, "step": 5677 }, { "epoch": 0.6256749311294766, "grad_norm": 9.599321365356445, "learning_rate": 3.131903512634591e-06, "loss": 0.4074, "step": 5678 }, { "epoch": 0.6257851239669422, "grad_norm": 18.00543975830078, "learning_rate": 3.1302818074126885e-06, "loss": 0.5061, "step": 5679 }, { "epoch": 0.6258953168044077, "grad_norm": 4.5613603591918945, "learning_rate": 3.128660330822466e-06, "loss": 0.3742, "step": 5680 }, { "epoch": 0.6260055096418733, "grad_norm": 5.788570880889893, "learning_rate": 3.127039083062201e-06, "loss": 0.349, "step": 5681 }, { "epoch": 0.6261157024793389, "grad_norm": 9.021220207214355, "learning_rate": 3.1254180643301413e-06, "loss": 0.3669, "step": 5682 }, { "epoch": 0.6262258953168044, "grad_norm": 8.388569831848145, "learning_rate": 3.123797274824506e-06, "loss": 0.3591, "step": 5683 }, { "epoch": 0.62633608815427, "grad_norm": 5.5817718505859375, "learning_rate": 3.12217671474349e-06, "loss": 0.4128, "step": 5684 }, { "epoch": 0.6264462809917355, "grad_norm": 5.523597717285156, "learning_rate": 3.1205563842852544e-06, "loss": 0.3976, "step": 5685 }, { "epoch": 0.6265564738292011, "grad_norm": 9.741737365722656, "learning_rate": 3.1189362836479386e-06, "loss": 0.3773, "step": 5686 }, { "epoch": 0.6266666666666667, "grad_norm": 3.5196845531463623, "learning_rate": 3.1173164130296486e-06, "loss": 0.3505, "step": 5687 }, { "epoch": 0.6267768595041322, "grad_norm": 6.845584869384766, "learning_rate": 3.1156967726284644e-06, "loss": 0.4427, "step": 5688 }, { "epoch": 0.6268870523415978, "grad_norm": 4.908968448638916, "learning_rate": 3.114077362642439e-06, "loss": 0.3494, "step": 5689 }, { "epoch": 0.6269972451790634, "grad_norm": 8.34274959564209, "learning_rate": 3.1124581832695973e-06, "loss": 0.3322, "step": 5690 }, { "epoch": 0.6271074380165289, "grad_norm": 11.896194458007812, "learning_rate": 3.110839234707929e-06, "loss": 0.5003, "step": 5691 }, { "epoch": 0.6272176308539945, "grad_norm": 4.664172172546387, "learning_rate": 3.109220517155409e-06, "loss": 0.3848, "step": 5692 }, { "epoch": 0.62732782369146, "grad_norm": 6.579066276550293, "learning_rate": 3.1076020308099707e-06, "loss": 0.3774, "step": 5693 }, { "epoch": 0.6274380165289256, "grad_norm": 4.803685188293457, "learning_rate": 3.1059837758695256e-06, "loss": 0.4275, "step": 5694 }, { "epoch": 0.6275482093663912, "grad_norm": 5.253371238708496, "learning_rate": 3.1043657525319597e-06, "loss": 0.4377, "step": 5695 }, { "epoch": 0.6276584022038567, "grad_norm": 9.491144180297852, "learning_rate": 3.102747960995124e-06, "loss": 0.4879, "step": 5696 }, { "epoch": 0.6277685950413223, "grad_norm": 6.025481224060059, "learning_rate": 3.1011304014568433e-06, "loss": 0.3644, "step": 5697 }, { "epoch": 0.6278787878787879, "grad_norm": 5.618700981140137, "learning_rate": 3.099513074114917e-06, "loss": 0.4425, "step": 5698 }, { "epoch": 0.6279889807162534, "grad_norm": 4.917667388916016, "learning_rate": 3.0978959791671128e-06, "loss": 0.3253, "step": 5699 }, { "epoch": 0.628099173553719, "grad_norm": 9.240817070007324, "learning_rate": 3.0962791168111738e-06, "loss": 0.3723, "step": 5700 }, { "epoch": 0.6282093663911845, "grad_norm": 6.664559841156006, "learning_rate": 3.0946624872448096e-06, "loss": 0.3735, "step": 5701 }, { "epoch": 0.6283195592286501, "grad_norm": 5.612759113311768, "learning_rate": 3.0930460906657043e-06, "loss": 0.3224, "step": 5702 }, { "epoch": 0.6284297520661157, "grad_norm": 10.004939079284668, "learning_rate": 3.091429927271514e-06, "loss": 0.4557, "step": 5703 }, { "epoch": 0.6285399449035812, "grad_norm": 7.886843681335449, "learning_rate": 3.0898139972598645e-06, "loss": 0.4325, "step": 5704 }, { "epoch": 0.6286501377410468, "grad_norm": 4.917661190032959, "learning_rate": 3.0881983008283534e-06, "loss": 0.3939, "step": 5705 }, { "epoch": 0.6287603305785124, "grad_norm": 9.051050186157227, "learning_rate": 3.0865828381745515e-06, "loss": 0.4182, "step": 5706 }, { "epoch": 0.6288705234159779, "grad_norm": 7.852141380310059, "learning_rate": 3.0849676094960003e-06, "loss": 0.3589, "step": 5707 }, { "epoch": 0.6289807162534435, "grad_norm": 4.527191638946533, "learning_rate": 3.0833526149902093e-06, "loss": 0.3663, "step": 5708 }, { "epoch": 0.6290909090909091, "grad_norm": 5.753724575042725, "learning_rate": 3.081737854854665e-06, "loss": 0.386, "step": 5709 }, { "epoch": 0.6292011019283746, "grad_norm": 7.792405128479004, "learning_rate": 3.0801233292868216e-06, "loss": 0.4654, "step": 5710 }, { "epoch": 0.6293112947658402, "grad_norm": 11.35915470123291, "learning_rate": 3.078509038484105e-06, "loss": 0.3484, "step": 5711 }, { "epoch": 0.6294214876033057, "grad_norm": 5.1471357345581055, "learning_rate": 3.0768949826439135e-06, "loss": 0.4525, "step": 5712 }, { "epoch": 0.6295316804407713, "grad_norm": 6.306460857391357, "learning_rate": 3.0752811619636175e-06, "loss": 0.362, "step": 5713 }, { "epoch": 0.629641873278237, "grad_norm": 14.662524223327637, "learning_rate": 3.073667576640552e-06, "loss": 0.4656, "step": 5714 }, { "epoch": 0.6297520661157024, "grad_norm": 6.419051647186279, "learning_rate": 3.0720542268720344e-06, "loss": 0.4059, "step": 5715 }, { "epoch": 0.629862258953168, "grad_norm": 4.525341033935547, "learning_rate": 3.070441112855343e-06, "loss": 0.4006, "step": 5716 }, { "epoch": 0.6299724517906337, "grad_norm": 8.255001068115234, "learning_rate": 3.0688282347877346e-06, "loss": 0.4168, "step": 5717 }, { "epoch": 0.6300826446280992, "grad_norm": 4.753320693969727, "learning_rate": 3.0672155928664345e-06, "loss": 0.3782, "step": 5718 }, { "epoch": 0.6301928374655648, "grad_norm": 6.017674922943115, "learning_rate": 3.065603187288634e-06, "loss": 0.4393, "step": 5719 }, { "epoch": 0.6303030303030303, "grad_norm": 6.011033535003662, "learning_rate": 3.0639910182515045e-06, "loss": 0.3328, "step": 5720 }, { "epoch": 0.6304132231404959, "grad_norm": 4.837926864624023, "learning_rate": 3.0623790859521853e-06, "loss": 0.3611, "step": 5721 }, { "epoch": 0.6305234159779615, "grad_norm": 9.484621047973633, "learning_rate": 3.0607673905877787e-06, "loss": 0.4761, "step": 5722 }, { "epoch": 0.630633608815427, "grad_norm": 6.29550838470459, "learning_rate": 3.0591559323553745e-06, "loss": 0.3861, "step": 5723 }, { "epoch": 0.6307438016528926, "grad_norm": 6.089902400970459, "learning_rate": 3.0575447114520175e-06, "loss": 0.3484, "step": 5724 }, { "epoch": 0.6308539944903582, "grad_norm": 5.730831146240234, "learning_rate": 3.0559337280747314e-06, "loss": 0.4115, "step": 5725 }, { "epoch": 0.6309641873278237, "grad_norm": 5.556731224060059, "learning_rate": 3.05432298242051e-06, "loss": 0.3475, "step": 5726 }, { "epoch": 0.6310743801652893, "grad_norm": 11.543536186218262, "learning_rate": 3.052712474686318e-06, "loss": 0.4169, "step": 5727 }, { "epoch": 0.6311845730027548, "grad_norm": 5.412355899810791, "learning_rate": 3.051102205069088e-06, "loss": 0.4294, "step": 5728 }, { "epoch": 0.6312947658402204, "grad_norm": 7.309114933013916, "learning_rate": 3.049492173765729e-06, "loss": 0.327, "step": 5729 }, { "epoch": 0.631404958677686, "grad_norm": 5.162247657775879, "learning_rate": 3.047882380973115e-06, "loss": 0.3876, "step": 5730 }, { "epoch": 0.6315151515151515, "grad_norm": 12.657859802246094, "learning_rate": 3.046272826888097e-06, "loss": 0.4236, "step": 5731 }, { "epoch": 0.6316253443526171, "grad_norm": 7.963898181915283, "learning_rate": 3.044663511707491e-06, "loss": 0.4182, "step": 5732 }, { "epoch": 0.6317355371900827, "grad_norm": 5.893122673034668, "learning_rate": 3.0430544356280865e-06, "loss": 0.4278, "step": 5733 }, { "epoch": 0.6318457300275482, "grad_norm": 5.781898021697998, "learning_rate": 3.041445598846644e-06, "loss": 0.3831, "step": 5734 }, { "epoch": 0.6319559228650138, "grad_norm": 13.247063636779785, "learning_rate": 3.039837001559895e-06, "loss": 0.5004, "step": 5735 }, { "epoch": 0.6320661157024794, "grad_norm": 7.441000938415527, "learning_rate": 3.0382286439645382e-06, "loss": 0.3866, "step": 5736 }, { "epoch": 0.6321763085399449, "grad_norm": 5.362796783447266, "learning_rate": 3.036620526257249e-06, "loss": 0.4517, "step": 5737 }, { "epoch": 0.6322865013774105, "grad_norm": 6.049092769622803, "learning_rate": 3.0350126486346694e-06, "loss": 0.4513, "step": 5738 }, { "epoch": 0.632396694214876, "grad_norm": 8.4575777053833, "learning_rate": 3.0334050112934106e-06, "loss": 0.3578, "step": 5739 }, { "epoch": 0.6325068870523416, "grad_norm": 3.689763069152832, "learning_rate": 3.0317976144300598e-06, "loss": 0.3682, "step": 5740 }, { "epoch": 0.6326170798898072, "grad_norm": 4.175183296203613, "learning_rate": 3.03019045824117e-06, "loss": 0.4013, "step": 5741 }, { "epoch": 0.6327272727272727, "grad_norm": 5.330626487731934, "learning_rate": 3.028583542923266e-06, "loss": 0.4065, "step": 5742 }, { "epoch": 0.6328374655647383, "grad_norm": 7.9115777015686035, "learning_rate": 3.026976868672844e-06, "loss": 0.4795, "step": 5743 }, { "epoch": 0.6329476584022039, "grad_norm": 8.067718505859375, "learning_rate": 3.025370435686371e-06, "loss": 0.3546, "step": 5744 }, { "epoch": 0.6330578512396694, "grad_norm": 8.21186351776123, "learning_rate": 3.0237642441602837e-06, "loss": 0.28, "step": 5745 }, { "epoch": 0.633168044077135, "grad_norm": 8.001065254211426, "learning_rate": 3.0221582942909903e-06, "loss": 0.4416, "step": 5746 }, { "epoch": 0.6332782369146005, "grad_norm": 5.936722278594971, "learning_rate": 3.020552586274865e-06, "loss": 0.4107, "step": 5747 }, { "epoch": 0.6333884297520661, "grad_norm": 17.30539321899414, "learning_rate": 3.0189471203082593e-06, "loss": 0.5181, "step": 5748 }, { "epoch": 0.6334986225895317, "grad_norm": 7.969041347503662, "learning_rate": 3.017341896587492e-06, "loss": 0.3932, "step": 5749 }, { "epoch": 0.6336088154269972, "grad_norm": 5.316683769226074, "learning_rate": 3.0157369153088477e-06, "loss": 0.4158, "step": 5750 }, { "epoch": 0.6337190082644628, "grad_norm": 11.801061630249023, "learning_rate": 3.0141321766685914e-06, "loss": 0.471, "step": 5751 }, { "epoch": 0.6338292011019284, "grad_norm": 8.095755577087402, "learning_rate": 3.01252768086295e-06, "loss": 0.4333, "step": 5752 }, { "epoch": 0.6339393939393939, "grad_norm": 4.716119766235352, "learning_rate": 3.010923428088121e-06, "loss": 0.4192, "step": 5753 }, { "epoch": 0.6340495867768595, "grad_norm": 6.167051315307617, "learning_rate": 3.00931941854028e-06, "loss": 0.3824, "step": 5754 }, { "epoch": 0.634159779614325, "grad_norm": 5.537266254425049, "learning_rate": 3.0077156524155637e-06, "loss": 0.4078, "step": 5755 }, { "epoch": 0.6342699724517906, "grad_norm": 3.6839773654937744, "learning_rate": 3.0061121299100824e-06, "loss": 0.3859, "step": 5756 }, { "epoch": 0.6343801652892562, "grad_norm": 7.0088043212890625, "learning_rate": 3.00450885121992e-06, "loss": 0.3354, "step": 5757 }, { "epoch": 0.6344903581267217, "grad_norm": 5.540916442871094, "learning_rate": 3.0029058165411245e-06, "loss": 0.3837, "step": 5758 }, { "epoch": 0.6346005509641873, "grad_norm": 8.917105674743652, "learning_rate": 3.0013030260697194e-06, "loss": 0.4784, "step": 5759 }, { "epoch": 0.6347107438016529, "grad_norm": 8.26288890838623, "learning_rate": 2.9997004800016956e-06, "loss": 0.4377, "step": 5760 }, { "epoch": 0.6348209366391184, "grad_norm": 6.514774322509766, "learning_rate": 2.9980981785330125e-06, "loss": 0.3802, "step": 5761 }, { "epoch": 0.634931129476584, "grad_norm": 9.283097267150879, "learning_rate": 2.996496121859605e-06, "loss": 0.4863, "step": 5762 }, { "epoch": 0.6350413223140496, "grad_norm": 6.459282875061035, "learning_rate": 2.994894310177373e-06, "loss": 0.3799, "step": 5763 }, { "epoch": 0.6351515151515151, "grad_norm": 8.162826538085938, "learning_rate": 2.993292743682188e-06, "loss": 0.3887, "step": 5764 }, { "epoch": 0.6352617079889807, "grad_norm": 6.5035576820373535, "learning_rate": 2.9916914225698923e-06, "loss": 0.4283, "step": 5765 }, { "epoch": 0.6353719008264462, "grad_norm": 4.4118733406066895, "learning_rate": 2.990090347036298e-06, "loss": 0.3896, "step": 5766 }, { "epoch": 0.6354820936639118, "grad_norm": 6.801360607147217, "learning_rate": 2.9884895172771854e-06, "loss": 0.3955, "step": 5767 }, { "epoch": 0.6355922865013774, "grad_norm": 7.458156108856201, "learning_rate": 2.986888933488308e-06, "loss": 0.4166, "step": 5768 }, { "epoch": 0.6357024793388429, "grad_norm": 7.173938274383545, "learning_rate": 2.9852885958653855e-06, "loss": 0.4162, "step": 5769 }, { "epoch": 0.6358126721763085, "grad_norm": 6.3631486892700195, "learning_rate": 2.9836885046041095e-06, "loss": 0.4605, "step": 5770 }, { "epoch": 0.6359228650137742, "grad_norm": 5.616750240325928, "learning_rate": 2.9820886599001434e-06, "loss": 0.4108, "step": 5771 }, { "epoch": 0.6360330578512396, "grad_norm": 4.267113208770752, "learning_rate": 2.980489061949116e-06, "loss": 0.284, "step": 5772 }, { "epoch": 0.6361432506887053, "grad_norm": 4.92551851272583, "learning_rate": 2.978889710946631e-06, "loss": 0.4409, "step": 5773 }, { "epoch": 0.6362534435261707, "grad_norm": 18.585920333862305, "learning_rate": 2.977290607088257e-06, "loss": 0.5592, "step": 5774 }, { "epoch": 0.6363636363636364, "grad_norm": 7.982060432434082, "learning_rate": 2.9756917505695336e-06, "loss": 0.4083, "step": 5775 }, { "epoch": 0.636473829201102, "grad_norm": 7.351476669311523, "learning_rate": 2.974093141585974e-06, "loss": 0.42, "step": 5776 }, { "epoch": 0.6365840220385675, "grad_norm": 14.6085844039917, "learning_rate": 2.9724947803330585e-06, "loss": 0.4173, "step": 5777 }, { "epoch": 0.6366942148760331, "grad_norm": 8.948366165161133, "learning_rate": 2.9708966670062313e-06, "loss": 0.374, "step": 5778 }, { "epoch": 0.6368044077134987, "grad_norm": 6.008039951324463, "learning_rate": 2.9692988018009195e-06, "loss": 0.4181, "step": 5779 }, { "epoch": 0.6369146005509642, "grad_norm": 5.3842902183532715, "learning_rate": 2.967701184912508e-06, "loss": 0.4204, "step": 5780 }, { "epoch": 0.6370247933884298, "grad_norm": 8.259960174560547, "learning_rate": 2.9661038165363537e-06, "loss": 0.38, "step": 5781 }, { "epoch": 0.6371349862258954, "grad_norm": 7.987982749938965, "learning_rate": 2.9645066968677906e-06, "loss": 0.3577, "step": 5782 }, { "epoch": 0.6372451790633609, "grad_norm": 8.198049545288086, "learning_rate": 2.9629098261021127e-06, "loss": 0.4439, "step": 5783 }, { "epoch": 0.6373553719008265, "grad_norm": 6.748956203460693, "learning_rate": 2.961313204434587e-06, "loss": 0.3691, "step": 5784 }, { "epoch": 0.637465564738292, "grad_norm": 6.4396162033081055, "learning_rate": 2.9597168320604543e-06, "loss": 0.3588, "step": 5785 }, { "epoch": 0.6375757575757576, "grad_norm": 8.94629192352295, "learning_rate": 2.9581207091749154e-06, "loss": 0.431, "step": 5786 }, { "epoch": 0.6376859504132232, "grad_norm": 3.996971845626831, "learning_rate": 2.956524835973153e-06, "loss": 0.3787, "step": 5787 }, { "epoch": 0.6377961432506887, "grad_norm": 14.571632385253906, "learning_rate": 2.9549292126503086e-06, "loss": 0.4889, "step": 5788 }, { "epoch": 0.6379063360881543, "grad_norm": 8.187118530273438, "learning_rate": 2.9533338394014976e-06, "loss": 0.3969, "step": 5789 }, { "epoch": 0.6380165289256199, "grad_norm": 7.629312515258789, "learning_rate": 2.951738716421805e-06, "loss": 0.3848, "step": 5790 }, { "epoch": 0.6381267217630854, "grad_norm": 15.203229904174805, "learning_rate": 2.9501438439062844e-06, "loss": 0.49, "step": 5791 }, { "epoch": 0.638236914600551, "grad_norm": 5.426725387573242, "learning_rate": 2.948549222049959e-06, "loss": 0.3781, "step": 5792 }, { "epoch": 0.6383471074380165, "grad_norm": 8.121671676635742, "learning_rate": 2.946954851047822e-06, "loss": 0.3217, "step": 5793 }, { "epoch": 0.6384573002754821, "grad_norm": 9.843147277832031, "learning_rate": 2.945360731094834e-06, "loss": 0.3904, "step": 5794 }, { "epoch": 0.6385674931129477, "grad_norm": 7.5617523193359375, "learning_rate": 2.943766862385926e-06, "loss": 0.4492, "step": 5795 }, { "epoch": 0.6386776859504132, "grad_norm": 4.36297082901001, "learning_rate": 2.942173245116e-06, "loss": 0.3465, "step": 5796 }, { "epoch": 0.6387878787878788, "grad_norm": 8.202534675598145, "learning_rate": 2.9405798794799257e-06, "loss": 0.314, "step": 5797 }, { "epoch": 0.6388980716253444, "grad_norm": 5.054417133331299, "learning_rate": 2.938986765672539e-06, "loss": 0.4032, "step": 5798 }, { "epoch": 0.6390082644628099, "grad_norm": 4.384051322937012, "learning_rate": 2.9373939038886524e-06, "loss": 0.368, "step": 5799 }, { "epoch": 0.6391184573002755, "grad_norm": 8.412352561950684, "learning_rate": 2.9358012943230395e-06, "loss": 0.4504, "step": 5800 }, { "epoch": 0.639228650137741, "grad_norm": 28.05645179748535, "learning_rate": 2.934208937170449e-06, "loss": 0.4941, "step": 5801 }, { "epoch": 0.6393388429752066, "grad_norm": 8.390089988708496, "learning_rate": 2.9326168326255963e-06, "loss": 0.3966, "step": 5802 }, { "epoch": 0.6394490358126722, "grad_norm": 6.950432777404785, "learning_rate": 2.9310249808831635e-06, "loss": 0.3803, "step": 5803 }, { "epoch": 0.6395592286501377, "grad_norm": 9.502593040466309, "learning_rate": 2.9294333821378085e-06, "loss": 0.422, "step": 5804 }, { "epoch": 0.6396694214876033, "grad_norm": 13.224533081054688, "learning_rate": 2.927842036584153e-06, "loss": 0.5388, "step": 5805 }, { "epoch": 0.6397796143250689, "grad_norm": 7.394783973693848, "learning_rate": 2.9262509444167853e-06, "loss": 0.3838, "step": 5806 }, { "epoch": 0.6398898071625344, "grad_norm": 8.313453674316406, "learning_rate": 2.9246601058302703e-06, "loss": 0.4582, "step": 5807 }, { "epoch": 0.64, "grad_norm": 8.038554191589355, "learning_rate": 2.9230695210191377e-06, "loss": 0.4205, "step": 5808 }, { "epoch": 0.6401101928374656, "grad_norm": 5.241087436676025, "learning_rate": 2.921479190177883e-06, "loss": 0.4286, "step": 5809 }, { "epoch": 0.6402203856749311, "grad_norm": 6.424064636230469, "learning_rate": 2.919889113500979e-06, "loss": 0.4076, "step": 5810 }, { "epoch": 0.6403305785123967, "grad_norm": 5.438494682312012, "learning_rate": 2.9182992911828585e-06, "loss": 0.3951, "step": 5811 }, { "epoch": 0.6404407713498622, "grad_norm": 5.5653886795043945, "learning_rate": 2.9167097234179275e-06, "loss": 0.4086, "step": 5812 }, { "epoch": 0.6405509641873278, "grad_norm": 5.504902362823486, "learning_rate": 2.9151204104005614e-06, "loss": 0.3923, "step": 5813 }, { "epoch": 0.6406611570247934, "grad_norm": 8.266979217529297, "learning_rate": 2.913531352325103e-06, "loss": 0.442, "step": 5814 }, { "epoch": 0.6407713498622589, "grad_norm": 6.890100002288818, "learning_rate": 2.9119425493858677e-06, "loss": 0.3856, "step": 5815 }, { "epoch": 0.6408815426997245, "grad_norm": 7.708473205566406, "learning_rate": 2.9103540017771316e-06, "loss": 0.4208, "step": 5816 }, { "epoch": 0.6409917355371901, "grad_norm": 5.167667388916016, "learning_rate": 2.908765709693147e-06, "loss": 0.3485, "step": 5817 }, { "epoch": 0.6411019283746556, "grad_norm": 5.596955299377441, "learning_rate": 2.907177673328134e-06, "loss": 0.4133, "step": 5818 }, { "epoch": 0.6412121212121212, "grad_norm": 6.119378566741943, "learning_rate": 2.9055898928762775e-06, "loss": 0.4351, "step": 5819 }, { "epoch": 0.6413223140495867, "grad_norm": 8.178197860717773, "learning_rate": 2.9040023685317298e-06, "loss": 0.4298, "step": 5820 }, { "epoch": 0.6414325068870523, "grad_norm": 4.932050704956055, "learning_rate": 2.902415100488624e-06, "loss": 0.3811, "step": 5821 }, { "epoch": 0.6415426997245179, "grad_norm": 5.623518943786621, "learning_rate": 2.900828088941049e-06, "loss": 0.4268, "step": 5822 }, { "epoch": 0.6416528925619834, "grad_norm": 4.286214351654053, "learning_rate": 2.899241334083063e-06, "loss": 0.3584, "step": 5823 }, { "epoch": 0.641763085399449, "grad_norm": 5.3854851722717285, "learning_rate": 2.8976548361087043e-06, "loss": 0.3922, "step": 5824 }, { "epoch": 0.6418732782369146, "grad_norm": 7.0205793380737305, "learning_rate": 2.8960685952119672e-06, "loss": 0.4221, "step": 5825 }, { "epoch": 0.6419834710743801, "grad_norm": 6.966156005859375, "learning_rate": 2.8944826115868165e-06, "loss": 0.4168, "step": 5826 }, { "epoch": 0.6420936639118457, "grad_norm": 6.308687686920166, "learning_rate": 2.8928968854271967e-06, "loss": 0.3444, "step": 5827 }, { "epoch": 0.6422038567493112, "grad_norm": 11.661874771118164, "learning_rate": 2.8913114169270052e-06, "loss": 0.4469, "step": 5828 }, { "epoch": 0.6423140495867768, "grad_norm": 5.717617511749268, "learning_rate": 2.88972620628012e-06, "loss": 0.3863, "step": 5829 }, { "epoch": 0.6424242424242425, "grad_norm": 6.405765056610107, "learning_rate": 2.888141253680379e-06, "loss": 0.3678, "step": 5830 }, { "epoch": 0.642534435261708, "grad_norm": 7.72976541519165, "learning_rate": 2.886556559321595e-06, "loss": 0.4918, "step": 5831 }, { "epoch": 0.6426446280991736, "grad_norm": 12.591797828674316, "learning_rate": 2.884972123397547e-06, "loss": 0.465, "step": 5832 }, { "epoch": 0.6427548209366392, "grad_norm": 7.761502265930176, "learning_rate": 2.883387946101979e-06, "loss": 0.4708, "step": 5833 }, { "epoch": 0.6428650137741047, "grad_norm": 5.839196681976318, "learning_rate": 2.8818040276286073e-06, "loss": 0.4671, "step": 5834 }, { "epoch": 0.6429752066115703, "grad_norm": 4.984602451324463, "learning_rate": 2.8802203681711195e-06, "loss": 0.3963, "step": 5835 }, { "epoch": 0.6430853994490359, "grad_norm": 5.926804065704346, "learning_rate": 2.878636967923162e-06, "loss": 0.3643, "step": 5836 }, { "epoch": 0.6431955922865014, "grad_norm": 8.573546409606934, "learning_rate": 2.8770538270783576e-06, "loss": 0.4617, "step": 5837 }, { "epoch": 0.643305785123967, "grad_norm": 9.431220054626465, "learning_rate": 2.8754709458302966e-06, "loss": 0.4977, "step": 5838 }, { "epoch": 0.6434159779614325, "grad_norm": 5.257115364074707, "learning_rate": 2.8738883243725324e-06, "loss": 0.3939, "step": 5839 }, { "epoch": 0.6435261707988981, "grad_norm": 5.686413764953613, "learning_rate": 2.872305962898593e-06, "loss": 0.4053, "step": 5840 }, { "epoch": 0.6436363636363637, "grad_norm": 6.769815444946289, "learning_rate": 2.870723861601972e-06, "loss": 0.3597, "step": 5841 }, { "epoch": 0.6437465564738292, "grad_norm": 5.612032413482666, "learning_rate": 2.869142020676127e-06, "loss": 0.3669, "step": 5842 }, { "epoch": 0.6438567493112948, "grad_norm": 7.373021125793457, "learning_rate": 2.86756044031449e-06, "loss": 0.3263, "step": 5843 }, { "epoch": 0.6439669421487604, "grad_norm": 5.95004940032959, "learning_rate": 2.865979120710462e-06, "loss": 0.4416, "step": 5844 }, { "epoch": 0.6440771349862259, "grad_norm": 5.859914302825928, "learning_rate": 2.864398062057403e-06, "loss": 0.4089, "step": 5845 }, { "epoch": 0.6441873278236915, "grad_norm": 5.572892665863037, "learning_rate": 2.8628172645486506e-06, "loss": 0.4197, "step": 5846 }, { "epoch": 0.644297520661157, "grad_norm": 6.591333389282227, "learning_rate": 2.861236728377508e-06, "loss": 0.3455, "step": 5847 }, { "epoch": 0.6444077134986226, "grad_norm": 5.3536553382873535, "learning_rate": 2.8596564537372416e-06, "loss": 0.3486, "step": 5848 }, { "epoch": 0.6445179063360882, "grad_norm": 9.382023811340332, "learning_rate": 2.8580764408210916e-06, "loss": 0.4225, "step": 5849 }, { "epoch": 0.6446280991735537, "grad_norm": 8.87354564666748, "learning_rate": 2.856496689822265e-06, "loss": 0.4025, "step": 5850 }, { "epoch": 0.6447382920110193, "grad_norm": 5.720553874969482, "learning_rate": 2.854917200933933e-06, "loss": 0.386, "step": 5851 }, { "epoch": 0.6448484848484849, "grad_norm": 6.788865089416504, "learning_rate": 2.8533379743492424e-06, "loss": 0.431, "step": 5852 }, { "epoch": 0.6449586776859504, "grad_norm": 5.583028316497803, "learning_rate": 2.851759010261298e-06, "loss": 0.4201, "step": 5853 }, { "epoch": 0.645068870523416, "grad_norm": 8.681328773498535, "learning_rate": 2.8501803088631795e-06, "loss": 0.4024, "step": 5854 }, { "epoch": 0.6451790633608815, "grad_norm": 5.74639368057251, "learning_rate": 2.8486018703479344e-06, "loss": 0.362, "step": 5855 }, { "epoch": 0.6452892561983471, "grad_norm": 8.127897262573242, "learning_rate": 2.8470236949085722e-06, "loss": 0.3904, "step": 5856 }, { "epoch": 0.6453994490358127, "grad_norm": 12.584700584411621, "learning_rate": 2.845445782738081e-06, "loss": 0.4496, "step": 5857 }, { "epoch": 0.6455096418732782, "grad_norm": 7.233221530914307, "learning_rate": 2.8438681340294063e-06, "loss": 0.4184, "step": 5858 }, { "epoch": 0.6456198347107438, "grad_norm": 4.437359809875488, "learning_rate": 2.8422907489754603e-06, "loss": 0.4101, "step": 5859 }, { "epoch": 0.6457300275482094, "grad_norm": 11.709005355834961, "learning_rate": 2.840713627769136e-06, "loss": 0.4677, "step": 5860 }, { "epoch": 0.6458402203856749, "grad_norm": 6.178116321563721, "learning_rate": 2.8391367706032834e-06, "loss": 0.3714, "step": 5861 }, { "epoch": 0.6459504132231405, "grad_norm": 8.382662773132324, "learning_rate": 2.8375601776707197e-06, "loss": 0.4539, "step": 5862 }, { "epoch": 0.6460606060606061, "grad_norm": 5.94052791595459, "learning_rate": 2.8359838491642344e-06, "loss": 0.3629, "step": 5863 }, { "epoch": 0.6461707988980716, "grad_norm": 8.428426742553711, "learning_rate": 2.834407785276586e-06, "loss": 0.4307, "step": 5864 }, { "epoch": 0.6462809917355372, "grad_norm": 5.8498759269714355, "learning_rate": 2.8328319862004927e-06, "loss": 0.3718, "step": 5865 }, { "epoch": 0.6463911845730027, "grad_norm": 6.83188533782959, "learning_rate": 2.831256452128649e-06, "loss": 0.3799, "step": 5866 }, { "epoch": 0.6465013774104683, "grad_norm": 6.298277378082275, "learning_rate": 2.829681183253713e-06, "loss": 0.3998, "step": 5867 }, { "epoch": 0.6466115702479339, "grad_norm": 6.411405086517334, "learning_rate": 2.8281061797683086e-06, "loss": 0.4445, "step": 5868 }, { "epoch": 0.6467217630853994, "grad_norm": 7.376020431518555, "learning_rate": 2.8265314418650315e-06, "loss": 0.3643, "step": 5869 }, { "epoch": 0.646831955922865, "grad_norm": 5.0421319007873535, "learning_rate": 2.824956969736441e-06, "loss": 0.4507, "step": 5870 }, { "epoch": 0.6469421487603306, "grad_norm": 5.176947593688965, "learning_rate": 2.8233827635750687e-06, "loss": 0.4157, "step": 5871 }, { "epoch": 0.6470523415977961, "grad_norm": 9.123809814453125, "learning_rate": 2.8218088235734076e-06, "loss": 0.478, "step": 5872 }, { "epoch": 0.6471625344352617, "grad_norm": 7.190445423126221, "learning_rate": 2.820235149923921e-06, "loss": 0.406, "step": 5873 }, { "epoch": 0.6472727272727272, "grad_norm": 8.6001558303833, "learning_rate": 2.8186617428190446e-06, "loss": 0.4365, "step": 5874 }, { "epoch": 0.6473829201101928, "grad_norm": 7.1687445640563965, "learning_rate": 2.8170886024511705e-06, "loss": 0.4479, "step": 5875 }, { "epoch": 0.6474931129476584, "grad_norm": 5.2592854499816895, "learning_rate": 2.815515729012668e-06, "loss": 0.3346, "step": 5876 }, { "epoch": 0.6476033057851239, "grad_norm": 7.434048175811768, "learning_rate": 2.813943122695871e-06, "loss": 0.3995, "step": 5877 }, { "epoch": 0.6477134986225895, "grad_norm": 7.412423610687256, "learning_rate": 2.812370783693078e-06, "loss": 0.5136, "step": 5878 }, { "epoch": 0.6478236914600551, "grad_norm": 5.2689056396484375, "learning_rate": 2.8107987121965542e-06, "loss": 0.3406, "step": 5879 }, { "epoch": 0.6479338842975206, "grad_norm": 4.5380988121032715, "learning_rate": 2.8092269083985404e-06, "loss": 0.396, "step": 5880 }, { "epoch": 0.6480440771349862, "grad_norm": 5.02008581161499, "learning_rate": 2.807655372491237e-06, "loss": 0.4053, "step": 5881 }, { "epoch": 0.6481542699724517, "grad_norm": 4.435138702392578, "learning_rate": 2.8060841046668085e-06, "loss": 0.4039, "step": 5882 }, { "epoch": 0.6482644628099173, "grad_norm": 6.067267894744873, "learning_rate": 2.8045131051173996e-06, "loss": 0.3952, "step": 5883 }, { "epoch": 0.648374655647383, "grad_norm": 6.425743103027344, "learning_rate": 2.8029423740351087e-06, "loss": 0.4121, "step": 5884 }, { "epoch": 0.6484848484848484, "grad_norm": 8.314208030700684, "learning_rate": 2.8013719116120104e-06, "loss": 0.4324, "step": 5885 }, { "epoch": 0.648595041322314, "grad_norm": 7.725162506103516, "learning_rate": 2.799801718040139e-06, "loss": 0.3912, "step": 5886 }, { "epoch": 0.6487052341597797, "grad_norm": 6.990640163421631, "learning_rate": 2.7982317935115035e-06, "loss": 0.3516, "step": 5887 }, { "epoch": 0.6488154269972451, "grad_norm": 7.705892562866211, "learning_rate": 2.7966621382180758e-06, "loss": 0.4825, "step": 5888 }, { "epoch": 0.6489256198347108, "grad_norm": 12.551563262939453, "learning_rate": 2.7950927523517936e-06, "loss": 0.4183, "step": 5889 }, { "epoch": 0.6490358126721764, "grad_norm": 6.314427852630615, "learning_rate": 2.7935236361045643e-06, "loss": 0.3639, "step": 5890 }, { "epoch": 0.6491460055096419, "grad_norm": 9.067972183227539, "learning_rate": 2.791954789668264e-06, "loss": 0.5317, "step": 5891 }, { "epoch": 0.6492561983471075, "grad_norm": 4.194504737854004, "learning_rate": 2.790386213234729e-06, "loss": 0.3472, "step": 5892 }, { "epoch": 0.649366391184573, "grad_norm": 8.523187637329102, "learning_rate": 2.78881790699577e-06, "loss": 0.3514, "step": 5893 }, { "epoch": 0.6494765840220386, "grad_norm": 7.889522552490234, "learning_rate": 2.787249871143163e-06, "loss": 0.463, "step": 5894 }, { "epoch": 0.6495867768595042, "grad_norm": 6.233395099639893, "learning_rate": 2.785682105868645e-06, "loss": 0.4157, "step": 5895 }, { "epoch": 0.6496969696969697, "grad_norm": 5.324563503265381, "learning_rate": 2.784114611363927e-06, "loss": 0.318, "step": 5896 }, { "epoch": 0.6498071625344353, "grad_norm": 6.411282539367676, "learning_rate": 2.7825473878206865e-06, "loss": 0.3624, "step": 5897 }, { "epoch": 0.6499173553719009, "grad_norm": 7.308784484863281, "learning_rate": 2.7809804354305612e-06, "loss": 0.4053, "step": 5898 }, { "epoch": 0.6500275482093664, "grad_norm": 8.107905387878418, "learning_rate": 2.779413754385163e-06, "loss": 0.401, "step": 5899 }, { "epoch": 0.650137741046832, "grad_norm": 8.488053321838379, "learning_rate": 2.777847344876069e-06, "loss": 0.3543, "step": 5900 }, { "epoch": 0.6502479338842975, "grad_norm": 4.050614356994629, "learning_rate": 2.7762812070948183e-06, "loss": 0.43, "step": 5901 }, { "epoch": 0.6503581267217631, "grad_norm": 5.892617702484131, "learning_rate": 2.774715341232922e-06, "loss": 0.3606, "step": 5902 }, { "epoch": 0.6504683195592287, "grad_norm": 10.13648796081543, "learning_rate": 2.7731497474818587e-06, "loss": 0.3664, "step": 5903 }, { "epoch": 0.6505785123966942, "grad_norm": 5.645157337188721, "learning_rate": 2.7715844260330672e-06, "loss": 0.4184, "step": 5904 }, { "epoch": 0.6506887052341598, "grad_norm": 5.31406831741333, "learning_rate": 2.770019377077959e-06, "loss": 0.3485, "step": 5905 }, { "epoch": 0.6507988980716254, "grad_norm": 9.741802215576172, "learning_rate": 2.768454600807912e-06, "loss": 0.4814, "step": 5906 }, { "epoch": 0.6509090909090909, "grad_norm": 7.9445905685424805, "learning_rate": 2.7668900974142666e-06, "loss": 0.4288, "step": 5907 }, { "epoch": 0.6510192837465565, "grad_norm": 7.178967475891113, "learning_rate": 2.765325867088333e-06, "loss": 0.3788, "step": 5908 }, { "epoch": 0.6511294765840221, "grad_norm": 5.582773685455322, "learning_rate": 2.76376191002139e-06, "loss": 0.4043, "step": 5909 }, { "epoch": 0.6512396694214876, "grad_norm": 5.5358476638793945, "learning_rate": 2.762198226404676e-06, "loss": 0.4202, "step": 5910 }, { "epoch": 0.6513498622589532, "grad_norm": 9.965605735778809, "learning_rate": 2.760634816429405e-06, "loss": 0.401, "step": 5911 }, { "epoch": 0.6514600550964187, "grad_norm": 7.121329307556152, "learning_rate": 2.7590716802867462e-06, "loss": 0.3202, "step": 5912 }, { "epoch": 0.6515702479338843, "grad_norm": 6.912757873535156, "learning_rate": 2.75750881816785e-06, "loss": 0.3622, "step": 5913 }, { "epoch": 0.6516804407713499, "grad_norm": 8.120623588562012, "learning_rate": 2.7559462302638223e-06, "loss": 0.3756, "step": 5914 }, { "epoch": 0.6517906336088154, "grad_norm": 4.33920431137085, "learning_rate": 2.754383916765734e-06, "loss": 0.3147, "step": 5915 }, { "epoch": 0.651900826446281, "grad_norm": 5.247227191925049, "learning_rate": 2.7528218778646345e-06, "loss": 0.3487, "step": 5916 }, { "epoch": 0.6520110192837466, "grad_norm": 6.693728446960449, "learning_rate": 2.7512601137515277e-06, "loss": 0.4002, "step": 5917 }, { "epoch": 0.6521212121212121, "grad_norm": 12.273859024047852, "learning_rate": 2.7496986246173873e-06, "loss": 0.5216, "step": 5918 }, { "epoch": 0.6522314049586777, "grad_norm": 10.704306602478027, "learning_rate": 2.7481374106531555e-06, "loss": 0.4146, "step": 5919 }, { "epoch": 0.6523415977961432, "grad_norm": 5.9599456787109375, "learning_rate": 2.7465764720497423e-06, "loss": 0.3649, "step": 5920 }, { "epoch": 0.6524517906336088, "grad_norm": 8.873656272888184, "learning_rate": 2.745015808998017e-06, "loss": 0.4255, "step": 5921 }, { "epoch": 0.6525619834710744, "grad_norm": 5.334264278411865, "learning_rate": 2.743455421688822e-06, "loss": 0.4006, "step": 5922 }, { "epoch": 0.6526721763085399, "grad_norm": 6.818568706512451, "learning_rate": 2.741895310312965e-06, "loss": 0.4139, "step": 5923 }, { "epoch": 0.6527823691460055, "grad_norm": 6.1837358474731445, "learning_rate": 2.7403354750612145e-06, "loss": 0.3791, "step": 5924 }, { "epoch": 0.6528925619834711, "grad_norm": 7.931142330169678, "learning_rate": 2.7387759161243116e-06, "loss": 0.4047, "step": 5925 }, { "epoch": 0.6530027548209366, "grad_norm": 7.647500038146973, "learning_rate": 2.737216633692962e-06, "loss": 0.4004, "step": 5926 }, { "epoch": 0.6531129476584022, "grad_norm": 6.578307151794434, "learning_rate": 2.735657627957837e-06, "loss": 0.2985, "step": 5927 }, { "epoch": 0.6532231404958677, "grad_norm": 4.906883239746094, "learning_rate": 2.734098899109572e-06, "loss": 0.3725, "step": 5928 }, { "epoch": 0.6533333333333333, "grad_norm": 7.8243913650512695, "learning_rate": 2.732540447338771e-06, "loss": 0.4206, "step": 5929 }, { "epoch": 0.6534435261707989, "grad_norm": 8.364114761352539, "learning_rate": 2.7309822728360057e-06, "loss": 0.4373, "step": 5930 }, { "epoch": 0.6535537190082644, "grad_norm": 7.875283718109131, "learning_rate": 2.7294243757918094e-06, "loss": 0.3778, "step": 5931 }, { "epoch": 0.65366391184573, "grad_norm": 12.430623054504395, "learning_rate": 2.7278667563966836e-06, "loss": 0.4091, "step": 5932 }, { "epoch": 0.6537741046831956, "grad_norm": 7.027017593383789, "learning_rate": 2.7263094148410996e-06, "loss": 0.4117, "step": 5933 }, { "epoch": 0.6538842975206611, "grad_norm": 7.087715148925781, "learning_rate": 2.7247523513154874e-06, "loss": 0.3524, "step": 5934 }, { "epoch": 0.6539944903581267, "grad_norm": 7.618420124053955, "learning_rate": 2.723195566010248e-06, "loss": 0.432, "step": 5935 }, { "epoch": 0.6541046831955923, "grad_norm": 4.888277530670166, "learning_rate": 2.7216390591157494e-06, "loss": 0.3826, "step": 5936 }, { "epoch": 0.6542148760330578, "grad_norm": 8.381621360778809, "learning_rate": 2.7200828308223214e-06, "loss": 0.4517, "step": 5937 }, { "epoch": 0.6543250688705234, "grad_norm": 7.997573375701904, "learning_rate": 2.718526881320258e-06, "loss": 0.4573, "step": 5938 }, { "epoch": 0.6544352617079889, "grad_norm": 11.966435432434082, "learning_rate": 2.7169712107998303e-06, "loss": 0.4254, "step": 5939 }, { "epoch": 0.6545454545454545, "grad_norm": 9.864051818847656, "learning_rate": 2.7154158194512625e-06, "loss": 0.4535, "step": 5940 }, { "epoch": 0.6546556473829201, "grad_norm": 10.945318222045898, "learning_rate": 2.7138607074647516e-06, "loss": 0.3582, "step": 5941 }, { "epoch": 0.6547658402203856, "grad_norm": 4.697882175445557, "learning_rate": 2.712305875030461e-06, "loss": 0.4087, "step": 5942 }, { "epoch": 0.6548760330578512, "grad_norm": 8.20838451385498, "learning_rate": 2.710751322338513e-06, "loss": 0.4489, "step": 5943 }, { "epoch": 0.6549862258953169, "grad_norm": 4.127697467803955, "learning_rate": 2.709197049579005e-06, "loss": 0.3989, "step": 5944 }, { "epoch": 0.6550964187327823, "grad_norm": 4.515989780426025, "learning_rate": 2.707643056941992e-06, "loss": 0.3887, "step": 5945 }, { "epoch": 0.655206611570248, "grad_norm": 5.887902736663818, "learning_rate": 2.7060893446174994e-06, "loss": 0.4113, "step": 5946 }, { "epoch": 0.6553168044077134, "grad_norm": 6.794983863830566, "learning_rate": 2.7045359127955197e-06, "loss": 0.3861, "step": 5947 }, { "epoch": 0.655426997245179, "grad_norm": 5.16549015045166, "learning_rate": 2.702982761666005e-06, "loss": 0.3808, "step": 5948 }, { "epoch": 0.6555371900826447, "grad_norm": 6.333678722381592, "learning_rate": 2.701429891418878e-06, "loss": 0.3983, "step": 5949 }, { "epoch": 0.6556473829201102, "grad_norm": 7.293991565704346, "learning_rate": 2.6998773022440283e-06, "loss": 0.3974, "step": 5950 }, { "epoch": 0.6557575757575758, "grad_norm": 8.225202560424805, "learning_rate": 2.698324994331305e-06, "loss": 0.4225, "step": 5951 }, { "epoch": 0.6558677685950414, "grad_norm": 5.183380603790283, "learning_rate": 2.696772967870527e-06, "loss": 0.3583, "step": 5952 }, { "epoch": 0.6559779614325069, "grad_norm": 8.09007453918457, "learning_rate": 2.695221223051482e-06, "loss": 0.4006, "step": 5953 }, { "epoch": 0.6560881542699725, "grad_norm": 5.152414321899414, "learning_rate": 2.693669760063914e-06, "loss": 0.4265, "step": 5954 }, { "epoch": 0.656198347107438, "grad_norm": 15.005812644958496, "learning_rate": 2.692118579097541e-06, "loss": 0.4978, "step": 5955 }, { "epoch": 0.6563085399449036, "grad_norm": 7.070697784423828, "learning_rate": 2.6905676803420444e-06, "loss": 0.3571, "step": 5956 }, { "epoch": 0.6564187327823692, "grad_norm": 13.157657623291016, "learning_rate": 2.6890170639870676e-06, "loss": 0.4747, "step": 5957 }, { "epoch": 0.6565289256198347, "grad_norm": 5.683324813842773, "learning_rate": 2.6874667302222237e-06, "loss": 0.3628, "step": 5958 }, { "epoch": 0.6566391184573003, "grad_norm": 5.332111358642578, "learning_rate": 2.6859166792370905e-06, "loss": 0.3987, "step": 5959 }, { "epoch": 0.6567493112947659, "grad_norm": 6.43701171875, "learning_rate": 2.6843669112212073e-06, "loss": 0.4387, "step": 5960 }, { "epoch": 0.6568595041322314, "grad_norm": 7.0052714347839355, "learning_rate": 2.682817426364084e-06, "loss": 0.3602, "step": 5961 }, { "epoch": 0.656969696969697, "grad_norm": 7.186418533325195, "learning_rate": 2.6812682248551945e-06, "loss": 0.4391, "step": 5962 }, { "epoch": 0.6570798898071626, "grad_norm": 4.473569393157959, "learning_rate": 2.6797193068839753e-06, "loss": 0.3754, "step": 5963 }, { "epoch": 0.6571900826446281, "grad_norm": 4.460716247558594, "learning_rate": 2.6781706726398304e-06, "loss": 0.3265, "step": 5964 }, { "epoch": 0.6573002754820937, "grad_norm": 6.775029182434082, "learning_rate": 2.676622322312132e-06, "loss": 0.3858, "step": 5965 }, { "epoch": 0.6574104683195592, "grad_norm": 4.9734673500061035, "learning_rate": 2.67507425609021e-06, "loss": 0.3462, "step": 5966 }, { "epoch": 0.6575206611570248, "grad_norm": 5.981369495391846, "learning_rate": 2.6735264741633656e-06, "loss": 0.4071, "step": 5967 }, { "epoch": 0.6576308539944904, "grad_norm": 5.739418029785156, "learning_rate": 2.6719789767208635e-06, "loss": 0.3575, "step": 5968 }, { "epoch": 0.6577410468319559, "grad_norm": 5.659451007843018, "learning_rate": 2.670431763951938e-06, "loss": 0.3853, "step": 5969 }, { "epoch": 0.6578512396694215, "grad_norm": 5.818564414978027, "learning_rate": 2.6688848360457796e-06, "loss": 0.3174, "step": 5970 }, { "epoch": 0.6579614325068871, "grad_norm": 5.7039361000061035, "learning_rate": 2.6673381931915466e-06, "loss": 0.3567, "step": 5971 }, { "epoch": 0.6580716253443526, "grad_norm": 6.77376127243042, "learning_rate": 2.665791835578372e-06, "loss": 0.3836, "step": 5972 }, { "epoch": 0.6581818181818182, "grad_norm": 11.987671852111816, "learning_rate": 2.6642457633953424e-06, "loss": 0.4798, "step": 5973 }, { "epoch": 0.6582920110192837, "grad_norm": 4.910957336425781, "learning_rate": 2.6626999768315092e-06, "loss": 0.3534, "step": 5974 }, { "epoch": 0.6584022038567493, "grad_norm": 5.625916481018066, "learning_rate": 2.6611544760759023e-06, "loss": 0.3831, "step": 5975 }, { "epoch": 0.6585123966942149, "grad_norm": 5.2465739250183105, "learning_rate": 2.659609261317503e-06, "loss": 0.4074, "step": 5976 }, { "epoch": 0.6586225895316804, "grad_norm": 5.320181369781494, "learning_rate": 2.65806433274526e-06, "loss": 0.4268, "step": 5977 }, { "epoch": 0.658732782369146, "grad_norm": 4.958621025085449, "learning_rate": 2.6565196905480917e-06, "loss": 0.3769, "step": 5978 }, { "epoch": 0.6588429752066116, "grad_norm": 4.646249771118164, "learning_rate": 2.6549753349148812e-06, "loss": 0.416, "step": 5979 }, { "epoch": 0.6589531680440771, "grad_norm": 7.4992876052856445, "learning_rate": 2.6534312660344696e-06, "loss": 0.4483, "step": 5980 }, { "epoch": 0.6590633608815427, "grad_norm": 5.924458980560303, "learning_rate": 2.651887484095671e-06, "loss": 0.3781, "step": 5981 }, { "epoch": 0.6591735537190082, "grad_norm": 4.546374320983887, "learning_rate": 2.6503439892872594e-06, "loss": 0.4083, "step": 5982 }, { "epoch": 0.6592837465564738, "grad_norm": 9.89686107635498, "learning_rate": 2.6488007817979793e-06, "loss": 0.3721, "step": 5983 }, { "epoch": 0.6593939393939394, "grad_norm": 7.717336177825928, "learning_rate": 2.6472578618165313e-06, "loss": 0.4094, "step": 5984 }, { "epoch": 0.6595041322314049, "grad_norm": 6.707863807678223, "learning_rate": 2.645715229531588e-06, "loss": 0.4327, "step": 5985 }, { "epoch": 0.6596143250688705, "grad_norm": 6.7714314460754395, "learning_rate": 2.644172885131786e-06, "loss": 0.3257, "step": 5986 }, { "epoch": 0.6597245179063361, "grad_norm": 11.701902389526367, "learning_rate": 2.6426308288057222e-06, "loss": 0.5138, "step": 5987 }, { "epoch": 0.6598347107438016, "grad_norm": 12.116415977478027, "learning_rate": 2.6410890607419625e-06, "loss": 0.4811, "step": 5988 }, { "epoch": 0.6599449035812672, "grad_norm": 5.822449207305908, "learning_rate": 2.6395475811290383e-06, "loss": 0.4029, "step": 5989 }, { "epoch": 0.6600550964187328, "grad_norm": 5.782731533050537, "learning_rate": 2.638006390155441e-06, "loss": 0.4049, "step": 5990 }, { "epoch": 0.6601652892561983, "grad_norm": 7.295383930206299, "learning_rate": 2.6364654880096306e-06, "loss": 0.3913, "step": 5991 }, { "epoch": 0.6602754820936639, "grad_norm": 6.990212440490723, "learning_rate": 2.6349248748800327e-06, "loss": 0.4191, "step": 5992 }, { "epoch": 0.6603856749311294, "grad_norm": 7.561840534210205, "learning_rate": 2.6333845509550315e-06, "loss": 0.3883, "step": 5993 }, { "epoch": 0.660495867768595, "grad_norm": 8.130766868591309, "learning_rate": 2.631844516422983e-06, "loss": 0.4768, "step": 5994 }, { "epoch": 0.6606060606060606, "grad_norm": 5.095539569854736, "learning_rate": 2.6303047714722053e-06, "loss": 0.3954, "step": 5995 }, { "epoch": 0.6607162534435261, "grad_norm": 5.467076301574707, "learning_rate": 2.6287653162909767e-06, "loss": 0.3641, "step": 5996 }, { "epoch": 0.6608264462809917, "grad_norm": 8.240248680114746, "learning_rate": 2.6272261510675468e-06, "loss": 0.3504, "step": 5997 }, { "epoch": 0.6609366391184573, "grad_norm": 4.505929946899414, "learning_rate": 2.6256872759901275e-06, "loss": 0.3668, "step": 5998 }, { "epoch": 0.6610468319559228, "grad_norm": 8.998342514038086, "learning_rate": 2.6241486912468916e-06, "loss": 0.3551, "step": 5999 }, { "epoch": 0.6611570247933884, "grad_norm": 9.616366386413574, "learning_rate": 2.622610397025982e-06, "loss": 0.461, "step": 6000 }, { "epoch": 0.6612672176308539, "grad_norm": 6.253893852233887, "learning_rate": 2.621072393515503e-06, "loss": 0.42, "step": 6001 }, { "epoch": 0.6613774104683195, "grad_norm": 6.103658199310303, "learning_rate": 2.6195346809035217e-06, "loss": 0.4098, "step": 6002 }, { "epoch": 0.6614876033057852, "grad_norm": 4.5206499099731445, "learning_rate": 2.6179972593780758e-06, "loss": 0.4353, "step": 6003 }, { "epoch": 0.6615977961432506, "grad_norm": 4.7155022621154785, "learning_rate": 2.6164601291271574e-06, "loss": 0.308, "step": 6004 }, { "epoch": 0.6617079889807163, "grad_norm": 4.543933391571045, "learning_rate": 2.6149232903387333e-06, "loss": 0.4354, "step": 6005 }, { "epoch": 0.6618181818181819, "grad_norm": 8.870038986206055, "learning_rate": 2.6133867432007304e-06, "loss": 0.4134, "step": 6006 }, { "epoch": 0.6619283746556474, "grad_norm": 7.9077301025390625, "learning_rate": 2.6118504879010364e-06, "loss": 0.3229, "step": 6007 }, { "epoch": 0.662038567493113, "grad_norm": 5.109295845031738, "learning_rate": 2.6103145246275095e-06, "loss": 0.3921, "step": 6008 }, { "epoch": 0.6621487603305786, "grad_norm": 7.649944305419922, "learning_rate": 2.6087788535679696e-06, "loss": 0.3764, "step": 6009 }, { "epoch": 0.6622589531680441, "grad_norm": 6.6462812423706055, "learning_rate": 2.607243474910198e-06, "loss": 0.3619, "step": 6010 }, { "epoch": 0.6623691460055097, "grad_norm": 4.974469184875488, "learning_rate": 2.605708388841945e-06, "loss": 0.3555, "step": 6011 }, { "epoch": 0.6624793388429752, "grad_norm": 5.443259239196777, "learning_rate": 2.604173595550924e-06, "loss": 0.397, "step": 6012 }, { "epoch": 0.6625895316804408, "grad_norm": 7.249265193939209, "learning_rate": 2.6026390952248084e-06, "loss": 0.4894, "step": 6013 }, { "epoch": 0.6626997245179064, "grad_norm": 6.921874523162842, "learning_rate": 2.6011048880512407e-06, "loss": 0.3643, "step": 6014 }, { "epoch": 0.6628099173553719, "grad_norm": 7.4875054359436035, "learning_rate": 2.5995709742178277e-06, "loss": 0.3984, "step": 6015 }, { "epoch": 0.6629201101928375, "grad_norm": 4.925077438354492, "learning_rate": 2.598037353912135e-06, "loss": 0.3666, "step": 6016 }, { "epoch": 0.6630303030303031, "grad_norm": 4.4653639793396, "learning_rate": 2.5965040273216967e-06, "loss": 0.3581, "step": 6017 }, { "epoch": 0.6631404958677686, "grad_norm": 10.477150917053223, "learning_rate": 2.5949709946340136e-06, "loss": 0.4583, "step": 6018 }, { "epoch": 0.6632506887052342, "grad_norm": 5.215625286102295, "learning_rate": 2.5934382560365417e-06, "loss": 0.3667, "step": 6019 }, { "epoch": 0.6633608815426997, "grad_norm": 3.861738443374634, "learning_rate": 2.591905811716709e-06, "loss": 0.4071, "step": 6020 }, { "epoch": 0.6634710743801653, "grad_norm": 6.306840896606445, "learning_rate": 2.5903736618619067e-06, "loss": 0.3782, "step": 6021 }, { "epoch": 0.6635812672176309, "grad_norm": 4.3274078369140625, "learning_rate": 2.5888418066594845e-06, "loss": 0.375, "step": 6022 }, { "epoch": 0.6636914600550964, "grad_norm": 6.996295928955078, "learning_rate": 2.5873102462967604e-06, "loss": 0.3847, "step": 6023 }, { "epoch": 0.663801652892562, "grad_norm": 10.424880027770996, "learning_rate": 2.585778980961018e-06, "loss": 0.4013, "step": 6024 }, { "epoch": 0.6639118457300276, "grad_norm": 6.157477855682373, "learning_rate": 2.584248010839502e-06, "loss": 0.4322, "step": 6025 }, { "epoch": 0.6640220385674931, "grad_norm": 9.453383445739746, "learning_rate": 2.582717336119419e-06, "loss": 0.4269, "step": 6026 }, { "epoch": 0.6641322314049587, "grad_norm": 15.271723747253418, "learning_rate": 2.5811869569879446e-06, "loss": 0.3975, "step": 6027 }, { "epoch": 0.6642424242424242, "grad_norm": 7.037370681762695, "learning_rate": 2.579656873632216e-06, "loss": 0.4087, "step": 6028 }, { "epoch": 0.6643526170798898, "grad_norm": 9.355405807495117, "learning_rate": 2.5781270862393327e-06, "loss": 0.4038, "step": 6029 }, { "epoch": 0.6644628099173554, "grad_norm": 8.743352890014648, "learning_rate": 2.576597594996355e-06, "loss": 0.3952, "step": 6030 }, { "epoch": 0.6645730027548209, "grad_norm": 4.763707637786865, "learning_rate": 2.5750684000903194e-06, "loss": 0.4084, "step": 6031 }, { "epoch": 0.6646831955922865, "grad_norm": 12.654218673706055, "learning_rate": 2.5735395017082136e-06, "loss": 0.3919, "step": 6032 }, { "epoch": 0.6647933884297521, "grad_norm": 8.429920196533203, "learning_rate": 2.5720109000369898e-06, "loss": 0.3137, "step": 6033 }, { "epoch": 0.6649035812672176, "grad_norm": 6.514552116394043, "learning_rate": 2.5704825952635753e-06, "loss": 0.3254, "step": 6034 }, { "epoch": 0.6650137741046832, "grad_norm": 5.537646293640137, "learning_rate": 2.568954587574849e-06, "loss": 0.305, "step": 6035 }, { "epoch": 0.6651239669421488, "grad_norm": 8.543807029724121, "learning_rate": 2.567426877157656e-06, "loss": 0.4117, "step": 6036 }, { "epoch": 0.6652341597796143, "grad_norm": 10.421896934509277, "learning_rate": 2.565899464198809e-06, "loss": 0.5015, "step": 6037 }, { "epoch": 0.6653443526170799, "grad_norm": 5.44912052154541, "learning_rate": 2.5643723488850813e-06, "loss": 0.3334, "step": 6038 }, { "epoch": 0.6654545454545454, "grad_norm": 6.181161880493164, "learning_rate": 2.5628455314032143e-06, "loss": 0.4497, "step": 6039 }, { "epoch": 0.665564738292011, "grad_norm": 9.51405143737793, "learning_rate": 2.5613190119399033e-06, "loss": 0.4369, "step": 6040 }, { "epoch": 0.6656749311294766, "grad_norm": 5.872657299041748, "learning_rate": 2.5597927906818166e-06, "loss": 0.358, "step": 6041 }, { "epoch": 0.6657851239669421, "grad_norm": 11.218254089355469, "learning_rate": 2.5582668678155842e-06, "loss": 0.4721, "step": 6042 }, { "epoch": 0.6658953168044077, "grad_norm": 8.075851440429688, "learning_rate": 2.5567412435277937e-06, "loss": 0.2994, "step": 6043 }, { "epoch": 0.6660055096418733, "grad_norm": 5.535632133483887, "learning_rate": 2.555215918005003e-06, "loss": 0.3541, "step": 6044 }, { "epoch": 0.6661157024793388, "grad_norm": 5.90950345993042, "learning_rate": 2.553690891433733e-06, "loss": 0.4333, "step": 6045 }, { "epoch": 0.6662258953168044, "grad_norm": 5.168428421020508, "learning_rate": 2.552166164000461e-06, "loss": 0.481, "step": 6046 }, { "epoch": 0.6663360881542699, "grad_norm": 10.744282722473145, "learning_rate": 2.5506417358916365e-06, "loss": 0.4343, "step": 6047 }, { "epoch": 0.6664462809917355, "grad_norm": 5.804863452911377, "learning_rate": 2.5491176072936683e-06, "loss": 0.4045, "step": 6048 }, { "epoch": 0.6665564738292011, "grad_norm": 7.872255802154541, "learning_rate": 2.5475937783929276e-06, "loss": 0.4052, "step": 6049 }, { "epoch": 0.6666666666666666, "grad_norm": 6.6324687004089355, "learning_rate": 2.5460702493757506e-06, "loss": 0.3889, "step": 6050 }, { "epoch": 0.6667768595041322, "grad_norm": 14.796236038208008, "learning_rate": 2.5445470204284384e-06, "loss": 0.4138, "step": 6051 }, { "epoch": 0.6668870523415978, "grad_norm": 8.73970890045166, "learning_rate": 2.5430240917372506e-06, "loss": 0.4081, "step": 6052 }, { "epoch": 0.6669972451790633, "grad_norm": 8.211316108703613, "learning_rate": 2.541501463488414e-06, "loss": 0.4085, "step": 6053 }, { "epoch": 0.6671074380165289, "grad_norm": 5.342787265777588, "learning_rate": 2.5399791358681203e-06, "loss": 0.3759, "step": 6054 }, { "epoch": 0.6672176308539944, "grad_norm": 7.133078575134277, "learning_rate": 2.5384571090625166e-06, "loss": 0.424, "step": 6055 }, { "epoch": 0.66732782369146, "grad_norm": 6.504353046417236, "learning_rate": 2.5369353832577224e-06, "loss": 0.391, "step": 6056 }, { "epoch": 0.6674380165289256, "grad_norm": 10.371742248535156, "learning_rate": 2.5354139586398164e-06, "loss": 0.4737, "step": 6057 }, { "epoch": 0.6675482093663911, "grad_norm": 6.145484447479248, "learning_rate": 2.5338928353948376e-06, "loss": 0.3838, "step": 6058 }, { "epoch": 0.6676584022038567, "grad_norm": 3.5295302867889404, "learning_rate": 2.532372013708793e-06, "loss": 0.3551, "step": 6059 }, { "epoch": 0.6677685950413224, "grad_norm": 7.3660478591918945, "learning_rate": 2.530851493767652e-06, "loss": 0.415, "step": 6060 }, { "epoch": 0.6678787878787878, "grad_norm": 7.272532939910889, "learning_rate": 2.529331275757343e-06, "loss": 0.2895, "step": 6061 }, { "epoch": 0.6679889807162535, "grad_norm": 4.882059574127197, "learning_rate": 2.527811359863763e-06, "loss": 0.3936, "step": 6062 }, { "epoch": 0.6680991735537191, "grad_norm": 7.911858558654785, "learning_rate": 2.5262917462727664e-06, "loss": 0.424, "step": 6063 }, { "epoch": 0.6682093663911846, "grad_norm": 10.215099334716797, "learning_rate": 2.5247724351701757e-06, "loss": 0.4689, "step": 6064 }, { "epoch": 0.6683195592286502, "grad_norm": 5.460385799407959, "learning_rate": 2.523253426741775e-06, "loss": 0.3514, "step": 6065 }, { "epoch": 0.6684297520661157, "grad_norm": 6.721065044403076, "learning_rate": 2.5217347211733067e-06, "loss": 0.4132, "step": 6066 }, { "epoch": 0.6685399449035813, "grad_norm": 8.900752067565918, "learning_rate": 2.5202163186504867e-06, "loss": 0.4125, "step": 6067 }, { "epoch": 0.6686501377410469, "grad_norm": 5.635085105895996, "learning_rate": 2.5186982193589833e-06, "loss": 0.3659, "step": 6068 }, { "epoch": 0.6687603305785124, "grad_norm": 11.600287437438965, "learning_rate": 2.51718042348443e-06, "loss": 0.3947, "step": 6069 }, { "epoch": 0.668870523415978, "grad_norm": 4.45723819732666, "learning_rate": 2.515662931212428e-06, "loss": 0.3916, "step": 6070 }, { "epoch": 0.6689807162534436, "grad_norm": 5.884436130523682, "learning_rate": 2.514145742728539e-06, "loss": 0.4151, "step": 6071 }, { "epoch": 0.6690909090909091, "grad_norm": 4.856365203857422, "learning_rate": 2.5126288582182827e-06, "loss": 0.3855, "step": 6072 }, { "epoch": 0.6692011019283747, "grad_norm": 8.981498718261719, "learning_rate": 2.5111122778671495e-06, "loss": 0.4045, "step": 6073 }, { "epoch": 0.6693112947658402, "grad_norm": 9.999101638793945, "learning_rate": 2.5095960018605887e-06, "loss": 0.4695, "step": 6074 }, { "epoch": 0.6694214876033058, "grad_norm": 6.24324893951416, "learning_rate": 2.5080800303840104e-06, "loss": 0.3786, "step": 6075 }, { "epoch": 0.6695316804407714, "grad_norm": 5.593008518218994, "learning_rate": 2.5065643636227897e-06, "loss": 0.2895, "step": 6076 }, { "epoch": 0.6696418732782369, "grad_norm": 5.002102375030518, "learning_rate": 2.5050490017622686e-06, "loss": 0.3435, "step": 6077 }, { "epoch": 0.6697520661157025, "grad_norm": 5.277883529663086, "learning_rate": 2.5035339449877426e-06, "loss": 0.3893, "step": 6078 }, { "epoch": 0.6698622589531681, "grad_norm": 4.154916763305664, "learning_rate": 2.5020191934844774e-06, "loss": 0.3005, "step": 6079 }, { "epoch": 0.6699724517906336, "grad_norm": 7.4428486824035645, "learning_rate": 2.5005047474376975e-06, "loss": 0.4769, "step": 6080 }, { "epoch": 0.6700826446280992, "grad_norm": 13.213733673095703, "learning_rate": 2.4989906070325947e-06, "loss": 0.4506, "step": 6081 }, { "epoch": 0.6701928374655647, "grad_norm": 4.562686443328857, "learning_rate": 2.4974767724543157e-06, "loss": 0.418, "step": 6082 }, { "epoch": 0.6703030303030303, "grad_norm": 12.098550796508789, "learning_rate": 2.4959632438879765e-06, "loss": 0.4256, "step": 6083 }, { "epoch": 0.6704132231404959, "grad_norm": 10.366657257080078, "learning_rate": 2.494450021518655e-06, "loss": 0.3768, "step": 6084 }, { "epoch": 0.6705234159779614, "grad_norm": 9.916677474975586, "learning_rate": 2.4929371055313884e-06, "loss": 0.5525, "step": 6085 }, { "epoch": 0.670633608815427, "grad_norm": 6.2902374267578125, "learning_rate": 2.4914244961111742e-06, "loss": 0.4118, "step": 6086 }, { "epoch": 0.6707438016528926, "grad_norm": 4.70620584487915, "learning_rate": 2.4899121934429836e-06, "loss": 0.3298, "step": 6087 }, { "epoch": 0.6708539944903581, "grad_norm": 4.512409687042236, "learning_rate": 2.4884001977117406e-06, "loss": 0.3745, "step": 6088 }, { "epoch": 0.6709641873278237, "grad_norm": 4.995675563812256, "learning_rate": 2.4868885091023284e-06, "loss": 0.3612, "step": 6089 }, { "epoch": 0.6710743801652893, "grad_norm": 7.682406902313232, "learning_rate": 2.485377127799607e-06, "loss": 0.3819, "step": 6090 }, { "epoch": 0.6711845730027548, "grad_norm": 9.726338386535645, "learning_rate": 2.4838660539883863e-06, "loss": 0.4186, "step": 6091 }, { "epoch": 0.6712947658402204, "grad_norm": 7.077490329742432, "learning_rate": 2.4823552878534385e-06, "loss": 0.4742, "step": 6092 }, { "epoch": 0.6714049586776859, "grad_norm": 6.823843002319336, "learning_rate": 2.48084482957951e-06, "loss": 0.4916, "step": 6093 }, { "epoch": 0.6715151515151515, "grad_norm": 6.0614728927612305, "learning_rate": 2.4793346793512957e-06, "loss": 0.3698, "step": 6094 }, { "epoch": 0.6716253443526171, "grad_norm": 8.090015411376953, "learning_rate": 2.4778248373534626e-06, "loss": 0.4433, "step": 6095 }, { "epoch": 0.6717355371900826, "grad_norm": 9.832079887390137, "learning_rate": 2.4763153037706323e-06, "loss": 0.4631, "step": 6096 }, { "epoch": 0.6718457300275482, "grad_norm": 6.708822250366211, "learning_rate": 2.4748060787873953e-06, "loss": 0.3908, "step": 6097 }, { "epoch": 0.6719559228650138, "grad_norm": 7.857901573181152, "learning_rate": 2.4732971625883023e-06, "loss": 0.3987, "step": 6098 }, { "epoch": 0.6720661157024793, "grad_norm": 7.758438587188721, "learning_rate": 2.471788555357863e-06, "loss": 0.3449, "step": 6099 }, { "epoch": 0.6721763085399449, "grad_norm": 7.877081871032715, "learning_rate": 2.4702802572805536e-06, "loss": 0.4052, "step": 6100 }, { "epoch": 0.6722865013774104, "grad_norm": 7.760099411010742, "learning_rate": 2.468772268540812e-06, "loss": 0.3716, "step": 6101 }, { "epoch": 0.672396694214876, "grad_norm": 5.632152557373047, "learning_rate": 2.467264589323034e-06, "loss": 0.3336, "step": 6102 }, { "epoch": 0.6725068870523416, "grad_norm": 5.384594917297363, "learning_rate": 2.4657572198115826e-06, "loss": 0.343, "step": 6103 }, { "epoch": 0.6726170798898071, "grad_norm": 6.8102827072143555, "learning_rate": 2.4642501601907826e-06, "loss": 0.4512, "step": 6104 }, { "epoch": 0.6727272727272727, "grad_norm": 8.789267539978027, "learning_rate": 2.4627434106449155e-06, "loss": 0.3556, "step": 6105 }, { "epoch": 0.6728374655647383, "grad_norm": 4.4546098709106445, "learning_rate": 2.461236971358231e-06, "loss": 0.3245, "step": 6106 }, { "epoch": 0.6729476584022038, "grad_norm": 5.878533363342285, "learning_rate": 2.4597308425149395e-06, "loss": 0.3935, "step": 6107 }, { "epoch": 0.6730578512396694, "grad_norm": 6.608068466186523, "learning_rate": 2.45822502429921e-06, "loss": 0.4419, "step": 6108 }, { "epoch": 0.673168044077135, "grad_norm": 5.274361610412598, "learning_rate": 2.456719516895177e-06, "loss": 0.3739, "step": 6109 }, { "epoch": 0.6732782369146005, "grad_norm": 6.25014591217041, "learning_rate": 2.4552143204869377e-06, "loss": 0.4178, "step": 6110 }, { "epoch": 0.6733884297520661, "grad_norm": 6.15048885345459, "learning_rate": 2.4537094352585466e-06, "loss": 0.3752, "step": 6111 }, { "epoch": 0.6734986225895316, "grad_norm": 5.450191497802734, "learning_rate": 2.4522048613940242e-06, "loss": 0.3514, "step": 6112 }, { "epoch": 0.6736088154269972, "grad_norm": 6.499011993408203, "learning_rate": 2.4507005990773543e-06, "loss": 0.4243, "step": 6113 }, { "epoch": 0.6737190082644628, "grad_norm": 7.088918209075928, "learning_rate": 2.4491966484924763e-06, "loss": 0.413, "step": 6114 }, { "epoch": 0.6738292011019283, "grad_norm": 9.151850700378418, "learning_rate": 2.4476930098232964e-06, "loss": 0.4335, "step": 6115 }, { "epoch": 0.673939393939394, "grad_norm": 8.162264823913574, "learning_rate": 2.4461896832536846e-06, "loss": 0.3641, "step": 6116 }, { "epoch": 0.6740495867768596, "grad_norm": 8.007330894470215, "learning_rate": 2.4446866689674654e-06, "loss": 0.5077, "step": 6117 }, { "epoch": 0.674159779614325, "grad_norm": 5.151622772216797, "learning_rate": 2.443183967148433e-06, "loss": 0.4177, "step": 6118 }, { "epoch": 0.6742699724517907, "grad_norm": 6.72404146194458, "learning_rate": 2.4416815779803367e-06, "loss": 0.3665, "step": 6119 }, { "epoch": 0.6743801652892562, "grad_norm": 5.265436172485352, "learning_rate": 2.440179501646892e-06, "loss": 0.4831, "step": 6120 }, { "epoch": 0.6744903581267218, "grad_norm": 6.513745307922363, "learning_rate": 2.4386777383317773e-06, "loss": 0.3932, "step": 6121 }, { "epoch": 0.6746005509641874, "grad_norm": 6.415255069732666, "learning_rate": 2.4371762882186235e-06, "loss": 0.4229, "step": 6122 }, { "epoch": 0.6747107438016529, "grad_norm": 4.8438286781311035, "learning_rate": 2.4356751514910385e-06, "loss": 0.3149, "step": 6123 }, { "epoch": 0.6748209366391185, "grad_norm": 9.928213119506836, "learning_rate": 2.434174328332579e-06, "loss": 0.4716, "step": 6124 }, { "epoch": 0.6749311294765841, "grad_norm": 6.0422773361206055, "learning_rate": 2.4326738189267647e-06, "loss": 0.3704, "step": 6125 }, { "epoch": 0.6750413223140496, "grad_norm": 8.048456192016602, "learning_rate": 2.431173623457087e-06, "loss": 0.4681, "step": 6126 }, { "epoch": 0.6751515151515152, "grad_norm": 5.316216468811035, "learning_rate": 2.4296737421069875e-06, "loss": 0.4271, "step": 6127 }, { "epoch": 0.6752617079889807, "grad_norm": 10.065850257873535, "learning_rate": 2.428174175059873e-06, "loss": 0.3364, "step": 6128 }, { "epoch": 0.6753719008264463, "grad_norm": 11.229357719421387, "learning_rate": 2.426674922499113e-06, "loss": 0.4465, "step": 6129 }, { "epoch": 0.6754820936639119, "grad_norm": 15.492269515991211, "learning_rate": 2.425175984608042e-06, "loss": 0.5522, "step": 6130 }, { "epoch": 0.6755922865013774, "grad_norm": 7.198604106903076, "learning_rate": 2.4236773615699466e-06, "loss": 0.379, "step": 6131 }, { "epoch": 0.675702479338843, "grad_norm": 8.281965255737305, "learning_rate": 2.422179053568083e-06, "loss": 0.3656, "step": 6132 }, { "epoch": 0.6758126721763086, "grad_norm": 8.31258773803711, "learning_rate": 2.420681060785668e-06, "loss": 0.4397, "step": 6133 }, { "epoch": 0.6759228650137741, "grad_norm": 7.666805744171143, "learning_rate": 2.4191833834058753e-06, "loss": 0.4572, "step": 6134 }, { "epoch": 0.6760330578512397, "grad_norm": 7.042750835418701, "learning_rate": 2.417686021611844e-06, "loss": 0.3219, "step": 6135 }, { "epoch": 0.6761432506887053, "grad_norm": 9.428544998168945, "learning_rate": 2.416188975586673e-06, "loss": 0.3839, "step": 6136 }, { "epoch": 0.6762534435261708, "grad_norm": 9.478087425231934, "learning_rate": 2.4146922455134266e-06, "loss": 0.4228, "step": 6137 }, { "epoch": 0.6763636363636364, "grad_norm": 4.778660297393799, "learning_rate": 2.413195831575122e-06, "loss": 0.3969, "step": 6138 }, { "epoch": 0.6764738292011019, "grad_norm": 6.882425785064697, "learning_rate": 2.411699733954745e-06, "loss": 0.4432, "step": 6139 }, { "epoch": 0.6765840220385675, "grad_norm": 5.772907257080078, "learning_rate": 2.4102039528352424e-06, "loss": 0.4684, "step": 6140 }, { "epoch": 0.6766942148760331, "grad_norm": 6.315256118774414, "learning_rate": 2.408708488399516e-06, "loss": 0.3635, "step": 6141 }, { "epoch": 0.6768044077134986, "grad_norm": 6.439986705780029, "learning_rate": 2.407213340830436e-06, "loss": 0.3949, "step": 6142 }, { "epoch": 0.6769146005509642, "grad_norm": 7.057758808135986, "learning_rate": 2.405718510310832e-06, "loss": 0.475, "step": 6143 }, { "epoch": 0.6770247933884298, "grad_norm": 8.165635108947754, "learning_rate": 2.404223997023493e-06, "loss": 0.4448, "step": 6144 }, { "epoch": 0.6771349862258953, "grad_norm": 6.370461940765381, "learning_rate": 2.4027298011511656e-06, "loss": 0.4336, "step": 6145 }, { "epoch": 0.6772451790633609, "grad_norm": 6.985379219055176, "learning_rate": 2.4012359228765703e-06, "loss": 0.4041, "step": 6146 }, { "epoch": 0.6773553719008264, "grad_norm": 7.5502214431762695, "learning_rate": 2.3997423623823763e-06, "loss": 0.4298, "step": 6147 }, { "epoch": 0.677465564738292, "grad_norm": 7.182582378387451, "learning_rate": 2.398249119851215e-06, "loss": 0.3973, "step": 6148 }, { "epoch": 0.6775757575757576, "grad_norm": 5.377007961273193, "learning_rate": 2.3967561954656882e-06, "loss": 0.3949, "step": 6149 }, { "epoch": 0.6776859504132231, "grad_norm": 4.182321548461914, "learning_rate": 2.3952635894083488e-06, "loss": 0.3896, "step": 6150 }, { "epoch": 0.6777961432506887, "grad_norm": 8.063091278076172, "learning_rate": 2.3937713018617178e-06, "loss": 0.4259, "step": 6151 }, { "epoch": 0.6779063360881543, "grad_norm": 7.529224395751953, "learning_rate": 2.39227933300827e-06, "loss": 0.4392, "step": 6152 }, { "epoch": 0.6780165289256198, "grad_norm": 6.323045253753662, "learning_rate": 2.390787683030448e-06, "loss": 0.4528, "step": 6153 }, { "epoch": 0.6781267217630854, "grad_norm": 10.965755462646484, "learning_rate": 2.389296352110654e-06, "loss": 0.5396, "step": 6154 }, { "epoch": 0.6782369146005509, "grad_norm": 7.637385845184326, "learning_rate": 2.387805340431246e-06, "loss": 0.4123, "step": 6155 }, { "epoch": 0.6783471074380165, "grad_norm": 6.656174659729004, "learning_rate": 2.38631464817455e-06, "loss": 0.492, "step": 6156 }, { "epoch": 0.6784573002754821, "grad_norm": 5.587393283843994, "learning_rate": 2.3848242755228507e-06, "loss": 0.4045, "step": 6157 }, { "epoch": 0.6785674931129476, "grad_norm": 5.630966663360596, "learning_rate": 2.3833342226583893e-06, "loss": 0.3986, "step": 6158 }, { "epoch": 0.6786776859504132, "grad_norm": 5.846369743347168, "learning_rate": 2.381844489763374e-06, "loss": 0.4388, "step": 6159 }, { "epoch": 0.6787878787878788, "grad_norm": 6.487740516662598, "learning_rate": 2.3803550770199723e-06, "loss": 0.3781, "step": 6160 }, { "epoch": 0.6788980716253443, "grad_norm": 6.642772674560547, "learning_rate": 2.378865984610309e-06, "loss": 0.4177, "step": 6161 }, { "epoch": 0.6790082644628099, "grad_norm": 6.779574394226074, "learning_rate": 2.377377212716473e-06, "loss": 0.415, "step": 6162 }, { "epoch": 0.6791184573002755, "grad_norm": 10.08366584777832, "learning_rate": 2.3758887615205163e-06, "loss": 0.4536, "step": 6163 }, { "epoch": 0.679228650137741, "grad_norm": 7.583836555480957, "learning_rate": 2.3744006312044445e-06, "loss": 0.4071, "step": 6164 }, { "epoch": 0.6793388429752066, "grad_norm": 4.750543117523193, "learning_rate": 2.3729128219502295e-06, "loss": 0.3305, "step": 6165 }, { "epoch": 0.6794490358126721, "grad_norm": 6.158310890197754, "learning_rate": 2.3714253339398052e-06, "loss": 0.3796, "step": 6166 }, { "epoch": 0.6795592286501377, "grad_norm": 5.552796840667725, "learning_rate": 2.3699381673550597e-06, "loss": 0.3541, "step": 6167 }, { "epoch": 0.6796694214876033, "grad_norm": 5.88632345199585, "learning_rate": 2.3684513223778475e-06, "loss": 0.3827, "step": 6168 }, { "epoch": 0.6797796143250688, "grad_norm": 8.556108474731445, "learning_rate": 2.3669647991899847e-06, "loss": 0.511, "step": 6169 }, { "epoch": 0.6798898071625344, "grad_norm": 5.67225980758667, "learning_rate": 2.3654785979732407e-06, "loss": 0.4465, "step": 6170 }, { "epoch": 0.68, "grad_norm": 4.995718002319336, "learning_rate": 2.3639927189093528e-06, "loss": 0.3814, "step": 6171 }, { "epoch": 0.6801101928374655, "grad_norm": 7.998754978179932, "learning_rate": 2.362507162180017e-06, "loss": 0.4225, "step": 6172 }, { "epoch": 0.6802203856749311, "grad_norm": 5.867599010467529, "learning_rate": 2.361021927966887e-06, "loss": 0.4606, "step": 6173 }, { "epoch": 0.6803305785123966, "grad_norm": 9.48109245300293, "learning_rate": 2.3595370164515796e-06, "loss": 0.3874, "step": 6174 }, { "epoch": 0.6804407713498623, "grad_norm": 9.793745994567871, "learning_rate": 2.3580524278156748e-06, "loss": 0.4474, "step": 6175 }, { "epoch": 0.6805509641873279, "grad_norm": 4.540402412414551, "learning_rate": 2.356568162240706e-06, "loss": 0.4161, "step": 6176 }, { "epoch": 0.6806611570247934, "grad_norm": 6.409126281738281, "learning_rate": 2.355084219908175e-06, "loss": 0.4146, "step": 6177 }, { "epoch": 0.680771349862259, "grad_norm": 8.314022064208984, "learning_rate": 2.3536006009995343e-06, "loss": 0.401, "step": 6178 }, { "epoch": 0.6808815426997246, "grad_norm": 4.711172103881836, "learning_rate": 2.352117305696211e-06, "loss": 0.3854, "step": 6179 }, { "epoch": 0.6809917355371901, "grad_norm": 9.868963241577148, "learning_rate": 2.35063433417958e-06, "loss": 0.3401, "step": 6180 }, { "epoch": 0.6811019283746557, "grad_norm": 7.214323997497559, "learning_rate": 2.349151686630978e-06, "loss": 0.3826, "step": 6181 }, { "epoch": 0.6812121212121212, "grad_norm": 5.444054126739502, "learning_rate": 2.347669363231712e-06, "loss": 0.3377, "step": 6182 }, { "epoch": 0.6813223140495868, "grad_norm": 7.184004783630371, "learning_rate": 2.3461873641630394e-06, "loss": 0.4225, "step": 6183 }, { "epoch": 0.6814325068870524, "grad_norm": 5.51975154876709, "learning_rate": 2.3447056896061765e-06, "loss": 0.3937, "step": 6184 }, { "epoch": 0.6815426997245179, "grad_norm": 6.245907306671143, "learning_rate": 2.343224339742313e-06, "loss": 0.4114, "step": 6185 }, { "epoch": 0.6816528925619835, "grad_norm": 4.655656814575195, "learning_rate": 2.3417433147525864e-06, "loss": 0.4115, "step": 6186 }, { "epoch": 0.6817630853994491, "grad_norm": 10.695345878601074, "learning_rate": 2.3402626148180957e-06, "loss": 0.5083, "step": 6187 }, { "epoch": 0.6818732782369146, "grad_norm": 4.769364833831787, "learning_rate": 2.3387822401199055e-06, "loss": 0.3928, "step": 6188 }, { "epoch": 0.6819834710743802, "grad_norm": 10.957562446594238, "learning_rate": 2.3373021908390397e-06, "loss": 0.4721, "step": 6189 }, { "epoch": 0.6820936639118458, "grad_norm": 7.979918479919434, "learning_rate": 2.335822467156477e-06, "loss": 0.3685, "step": 6190 }, { "epoch": 0.6822038567493113, "grad_norm": 4.247445106506348, "learning_rate": 2.334343069253162e-06, "loss": 0.4332, "step": 6191 }, { "epoch": 0.6823140495867769, "grad_norm": 8.737055778503418, "learning_rate": 2.3328639973099983e-06, "loss": 0.3704, "step": 6192 }, { "epoch": 0.6824242424242424, "grad_norm": 6.368855953216553, "learning_rate": 2.331385251507849e-06, "loss": 0.3423, "step": 6193 }, { "epoch": 0.682534435261708, "grad_norm": 8.097928047180176, "learning_rate": 2.3299068320275342e-06, "loss": 0.4046, "step": 6194 }, { "epoch": 0.6826446280991736, "grad_norm": 5.629691123962402, "learning_rate": 2.3284287390498388e-06, "loss": 0.432, "step": 6195 }, { "epoch": 0.6827548209366391, "grad_norm": 5.790268898010254, "learning_rate": 2.3269509727555084e-06, "loss": 0.3309, "step": 6196 }, { "epoch": 0.6828650137741047, "grad_norm": 5.701327323913574, "learning_rate": 2.325473533325242e-06, "loss": 0.4051, "step": 6197 }, { "epoch": 0.6829752066115703, "grad_norm": 10.208582878112793, "learning_rate": 2.323996420939705e-06, "loss": 0.4147, "step": 6198 }, { "epoch": 0.6830853994490358, "grad_norm": 6.084228515625, "learning_rate": 2.3225196357795227e-06, "loss": 0.4382, "step": 6199 }, { "epoch": 0.6831955922865014, "grad_norm": 6.657327175140381, "learning_rate": 2.3210431780252742e-06, "loss": 0.4025, "step": 6200 }, { "epoch": 0.6833057851239669, "grad_norm": 5.073029041290283, "learning_rate": 2.3195670478575046e-06, "loss": 0.3901, "step": 6201 }, { "epoch": 0.6834159779614325, "grad_norm": 8.293045997619629, "learning_rate": 2.3180912454567195e-06, "loss": 0.432, "step": 6202 }, { "epoch": 0.6835261707988981, "grad_norm": 8.201218605041504, "learning_rate": 2.3166157710033806e-06, "loss": 0.4196, "step": 6203 }, { "epoch": 0.6836363636363636, "grad_norm": 6.732716083526611, "learning_rate": 2.3151406246779055e-06, "loss": 0.408, "step": 6204 }, { "epoch": 0.6837465564738292, "grad_norm": 6.923007488250732, "learning_rate": 2.313665806660686e-06, "loss": 0.3639, "step": 6205 }, { "epoch": 0.6838567493112948, "grad_norm": 5.650262832641602, "learning_rate": 2.3121913171320586e-06, "loss": 0.386, "step": 6206 }, { "epoch": 0.6839669421487603, "grad_norm": 4.500156402587891, "learning_rate": 2.3107171562723284e-06, "loss": 0.3851, "step": 6207 }, { "epoch": 0.6840771349862259, "grad_norm": 5.889590263366699, "learning_rate": 2.309243324261759e-06, "loss": 0.3609, "step": 6208 }, { "epoch": 0.6841873278236914, "grad_norm": 5.953109264373779, "learning_rate": 2.3077698212805694e-06, "loss": 0.3453, "step": 6209 }, { "epoch": 0.684297520661157, "grad_norm": 6.79673433303833, "learning_rate": 2.3062966475089445e-06, "loss": 0.3652, "step": 6210 }, { "epoch": 0.6844077134986226, "grad_norm": 8.246170997619629, "learning_rate": 2.304823803127023e-06, "loss": 0.4856, "step": 6211 }, { "epoch": 0.6845179063360881, "grad_norm": 6.314090728759766, "learning_rate": 2.303351288314908e-06, "loss": 0.4042, "step": 6212 }, { "epoch": 0.6846280991735537, "grad_norm": 13.75493049621582, "learning_rate": 2.3018791032526615e-06, "loss": 0.3992, "step": 6213 }, { "epoch": 0.6847382920110193, "grad_norm": 8.827356338500977, "learning_rate": 2.300407248120302e-06, "loss": 0.3295, "step": 6214 }, { "epoch": 0.6848484848484848, "grad_norm": 8.514598846435547, "learning_rate": 2.2989357230978114e-06, "loss": 0.4799, "step": 6215 }, { "epoch": 0.6849586776859504, "grad_norm": 6.385679721832275, "learning_rate": 2.2974645283651314e-06, "loss": 0.4168, "step": 6216 }, { "epoch": 0.685068870523416, "grad_norm": 5.92914342880249, "learning_rate": 2.2959936641021585e-06, "loss": 0.4095, "step": 6217 }, { "epoch": 0.6851790633608815, "grad_norm": 6.143651962280273, "learning_rate": 2.294523130488753e-06, "loss": 0.4403, "step": 6218 }, { "epoch": 0.6852892561983471, "grad_norm": 5.817147254943848, "learning_rate": 2.293052927704736e-06, "loss": 0.4051, "step": 6219 }, { "epoch": 0.6853994490358126, "grad_norm": 4.9564714431762695, "learning_rate": 2.291583055929882e-06, "loss": 0.3457, "step": 6220 }, { "epoch": 0.6855096418732782, "grad_norm": 14.586307525634766, "learning_rate": 2.290113515343931e-06, "loss": 0.5367, "step": 6221 }, { "epoch": 0.6856198347107438, "grad_norm": 5.410526752471924, "learning_rate": 2.288644306126582e-06, "loss": 0.3927, "step": 6222 }, { "epoch": 0.6857300275482093, "grad_norm": 4.907934188842773, "learning_rate": 2.2871754284574885e-06, "loss": 0.3944, "step": 6223 }, { "epoch": 0.6858402203856749, "grad_norm": 7.13827657699585, "learning_rate": 2.285706882516269e-06, "loss": 0.3411, "step": 6224 }, { "epoch": 0.6859504132231405, "grad_norm": 7.94610071182251, "learning_rate": 2.2842386684825e-06, "loss": 0.416, "step": 6225 }, { "epoch": 0.686060606060606, "grad_norm": 4.437480926513672, "learning_rate": 2.2827707865357146e-06, "loss": 0.4489, "step": 6226 }, { "epoch": 0.6861707988980716, "grad_norm": 4.621830463409424, "learning_rate": 2.2813032368554084e-06, "loss": 0.388, "step": 6227 }, { "epoch": 0.6862809917355371, "grad_norm": 4.54764986038208, "learning_rate": 2.2798360196210366e-06, "loss": 0.3375, "step": 6228 }, { "epoch": 0.6863911845730027, "grad_norm": 7.10068416595459, "learning_rate": 2.27836913501201e-06, "loss": 0.3517, "step": 6229 }, { "epoch": 0.6865013774104683, "grad_norm": 10.14759635925293, "learning_rate": 2.2769025832077026e-06, "loss": 0.308, "step": 6230 }, { "epoch": 0.6866115702479338, "grad_norm": 4.241217613220215, "learning_rate": 2.2754363643874477e-06, "loss": 0.4252, "step": 6231 }, { "epoch": 0.6867217630853995, "grad_norm": 7.68984842300415, "learning_rate": 2.2739704787305333e-06, "loss": 0.4598, "step": 6232 }, { "epoch": 0.6868319559228651, "grad_norm": 7.011338233947754, "learning_rate": 2.272504926416212e-06, "loss": 0.3465, "step": 6233 }, { "epoch": 0.6869421487603306, "grad_norm": 5.404778003692627, "learning_rate": 2.271039707623693e-06, "loss": 0.4343, "step": 6234 }, { "epoch": 0.6870523415977962, "grad_norm": 5.496755123138428, "learning_rate": 2.2695748225321474e-06, "loss": 0.4764, "step": 6235 }, { "epoch": 0.6871625344352618, "grad_norm": 7.6246867179870605, "learning_rate": 2.2681102713207015e-06, "loss": 0.4154, "step": 6236 }, { "epoch": 0.6872727272727273, "grad_norm": 4.848220348358154, "learning_rate": 2.266646054168439e-06, "loss": 0.3688, "step": 6237 }, { "epoch": 0.6873829201101929, "grad_norm": 5.936487197875977, "learning_rate": 2.2651821712544133e-06, "loss": 0.4215, "step": 6238 }, { "epoch": 0.6874931129476584, "grad_norm": 6.518362045288086, "learning_rate": 2.2637186227576265e-06, "loss": 0.3588, "step": 6239 }, { "epoch": 0.687603305785124, "grad_norm": 9.76972770690918, "learning_rate": 2.2622554088570397e-06, "loss": 0.4332, "step": 6240 }, { "epoch": 0.6877134986225896, "grad_norm": 5.921270847320557, "learning_rate": 2.260792529731584e-06, "loss": 0.335, "step": 6241 }, { "epoch": 0.6878236914600551, "grad_norm": 3.6244571208953857, "learning_rate": 2.259329985560139e-06, "loss": 0.3482, "step": 6242 }, { "epoch": 0.6879338842975207, "grad_norm": 8.61227035522461, "learning_rate": 2.257867776521544e-06, "loss": 0.3787, "step": 6243 }, { "epoch": 0.6880440771349863, "grad_norm": 5.597924709320068, "learning_rate": 2.256405902794602e-06, "loss": 0.3788, "step": 6244 }, { "epoch": 0.6881542699724518, "grad_norm": 7.333438396453857, "learning_rate": 2.2549443645580755e-06, "loss": 0.3755, "step": 6245 }, { "epoch": 0.6882644628099174, "grad_norm": 8.690994262695312, "learning_rate": 2.253483161990679e-06, "loss": 0.4823, "step": 6246 }, { "epoch": 0.6883746556473829, "grad_norm": 7.609672546386719, "learning_rate": 2.252022295271092e-06, "loss": 0.4277, "step": 6247 }, { "epoch": 0.6884848484848485, "grad_norm": 7.242532253265381, "learning_rate": 2.250561764577951e-06, "loss": 0.3676, "step": 6248 }, { "epoch": 0.6885950413223141, "grad_norm": 8.508557319641113, "learning_rate": 2.2491015700898543e-06, "loss": 0.4702, "step": 6249 }, { "epoch": 0.6887052341597796, "grad_norm": 14.057016372680664, "learning_rate": 2.2476417119853527e-06, "loss": 0.5032, "step": 6250 }, { "epoch": 0.6888154269972452, "grad_norm": 5.928760528564453, "learning_rate": 2.2461821904429616e-06, "loss": 0.4208, "step": 6251 }, { "epoch": 0.6889256198347108, "grad_norm": 7.277674198150635, "learning_rate": 2.244723005641154e-06, "loss": 0.4078, "step": 6252 }, { "epoch": 0.6890358126721763, "grad_norm": 7.040360450744629, "learning_rate": 2.2432641577583584e-06, "loss": 0.4071, "step": 6253 }, { "epoch": 0.6891460055096419, "grad_norm": 4.986152648925781, "learning_rate": 2.2418056469729666e-06, "loss": 0.3865, "step": 6254 }, { "epoch": 0.6892561983471074, "grad_norm": 8.636344909667969, "learning_rate": 2.2403474734633284e-06, "loss": 0.4156, "step": 6255 }, { "epoch": 0.689366391184573, "grad_norm": 8.572731018066406, "learning_rate": 2.238889637407748e-06, "loss": 0.459, "step": 6256 }, { "epoch": 0.6894765840220386, "grad_norm": 10.21874713897705, "learning_rate": 2.237432138984493e-06, "loss": 0.4493, "step": 6257 }, { "epoch": 0.6895867768595041, "grad_norm": 4.540192127227783, "learning_rate": 2.235974978371791e-06, "loss": 0.3693, "step": 6258 }, { "epoch": 0.6896969696969697, "grad_norm": 6.4375176429748535, "learning_rate": 2.234518155747821e-06, "loss": 0.379, "step": 6259 }, { "epoch": 0.6898071625344353, "grad_norm": 5.030807971954346, "learning_rate": 2.233061671290728e-06, "loss": 0.3979, "step": 6260 }, { "epoch": 0.6899173553719008, "grad_norm": 7.331912994384766, "learning_rate": 2.231605525178614e-06, "loss": 0.4685, "step": 6261 }, { "epoch": 0.6900275482093664, "grad_norm": 11.071955680847168, "learning_rate": 2.230149717589535e-06, "loss": 0.4517, "step": 6262 }, { "epoch": 0.690137741046832, "grad_norm": 5.575560569763184, "learning_rate": 2.228694248701511e-06, "loss": 0.3085, "step": 6263 }, { "epoch": 0.6902479338842975, "grad_norm": 5.732341766357422, "learning_rate": 2.2272391186925196e-06, "loss": 0.343, "step": 6264 }, { "epoch": 0.6903581267217631, "grad_norm": 12.872480392456055, "learning_rate": 2.2257843277404944e-06, "loss": 0.4617, "step": 6265 }, { "epoch": 0.6904683195592286, "grad_norm": 5.646525859832764, "learning_rate": 2.2243298760233302e-06, "loss": 0.3853, "step": 6266 }, { "epoch": 0.6905785123966942, "grad_norm": 8.56628704071045, "learning_rate": 2.2228757637188805e-06, "loss": 0.3555, "step": 6267 }, { "epoch": 0.6906887052341598, "grad_norm": 6.781405925750732, "learning_rate": 2.221421991004953e-06, "loss": 0.4026, "step": 6268 }, { "epoch": 0.6907988980716253, "grad_norm": 5.899105072021484, "learning_rate": 2.2199685580593207e-06, "loss": 0.3981, "step": 6269 }, { "epoch": 0.6909090909090909, "grad_norm": 5.597877502441406, "learning_rate": 2.218515465059707e-06, "loss": 0.3396, "step": 6270 }, { "epoch": 0.6910192837465565, "grad_norm": 9.017989158630371, "learning_rate": 2.2170627121838012e-06, "loss": 0.3915, "step": 6271 }, { "epoch": 0.691129476584022, "grad_norm": 5.523779392242432, "learning_rate": 2.215610299609249e-06, "loss": 0.4337, "step": 6272 }, { "epoch": 0.6912396694214876, "grad_norm": 10.373221397399902, "learning_rate": 2.2141582275136494e-06, "loss": 0.4425, "step": 6273 }, { "epoch": 0.6913498622589531, "grad_norm": 4.727308750152588, "learning_rate": 2.212706496074566e-06, "loss": 0.4192, "step": 6274 }, { "epoch": 0.6914600550964187, "grad_norm": 6.766533374786377, "learning_rate": 2.21125510546952e-06, "loss": 0.4341, "step": 6275 }, { "epoch": 0.6915702479338843, "grad_norm": 6.907382011413574, "learning_rate": 2.209804055875987e-06, "loss": 0.3731, "step": 6276 }, { "epoch": 0.6916804407713498, "grad_norm": 5.283140182495117, "learning_rate": 2.2083533474714032e-06, "loss": 0.3699, "step": 6277 }, { "epoch": 0.6917906336088154, "grad_norm": 6.515407085418701, "learning_rate": 2.2069029804331665e-06, "loss": 0.4355, "step": 6278 }, { "epoch": 0.691900826446281, "grad_norm": 5.4836649894714355, "learning_rate": 2.2054529549386257e-06, "loss": 0.3901, "step": 6279 }, { "epoch": 0.6920110192837465, "grad_norm": 8.09799575805664, "learning_rate": 2.2040032711650928e-06, "loss": 0.4018, "step": 6280 }, { "epoch": 0.6921212121212121, "grad_norm": 10.193501472473145, "learning_rate": 2.2025539292898402e-06, "loss": 0.4296, "step": 6281 }, { "epoch": 0.6922314049586776, "grad_norm": 5.739263534545898, "learning_rate": 2.2011049294900915e-06, "loss": 0.3733, "step": 6282 }, { "epoch": 0.6923415977961432, "grad_norm": 10.090827941894531, "learning_rate": 2.1996562719430337e-06, "loss": 0.4062, "step": 6283 }, { "epoch": 0.6924517906336088, "grad_norm": 7.952030658721924, "learning_rate": 2.1982079568258123e-06, "loss": 0.3899, "step": 6284 }, { "epoch": 0.6925619834710743, "grad_norm": 6.2705488204956055, "learning_rate": 2.196759984315527e-06, "loss": 0.3744, "step": 6285 }, { "epoch": 0.6926721763085399, "grad_norm": 9.91977596282959, "learning_rate": 2.1953123545892373e-06, "loss": 0.503, "step": 6286 }, { "epoch": 0.6927823691460055, "grad_norm": 9.509714126586914, "learning_rate": 2.193865067823965e-06, "loss": 0.3961, "step": 6287 }, { "epoch": 0.692892561983471, "grad_norm": 8.341251373291016, "learning_rate": 2.192418124196683e-06, "loss": 0.3947, "step": 6288 }, { "epoch": 0.6930027548209367, "grad_norm": 5.873130798339844, "learning_rate": 2.1909715238843253e-06, "loss": 0.4422, "step": 6289 }, { "epoch": 0.6931129476584023, "grad_norm": 8.459653854370117, "learning_rate": 2.189525267063786e-06, "loss": 0.4469, "step": 6290 }, { "epoch": 0.6932231404958678, "grad_norm": 5.0621490478515625, "learning_rate": 2.1880793539119168e-06, "loss": 0.4427, "step": 6291 }, { "epoch": 0.6933333333333334, "grad_norm": 21.41733741760254, "learning_rate": 2.1866337846055215e-06, "loss": 0.4542, "step": 6292 }, { "epoch": 0.6934435261707989, "grad_norm": 11.350135803222656, "learning_rate": 2.185188559321369e-06, "loss": 0.3635, "step": 6293 }, { "epoch": 0.6935537190082645, "grad_norm": 7.1028337478637695, "learning_rate": 2.1837436782361843e-06, "loss": 0.4029, "step": 6294 }, { "epoch": 0.6936639118457301, "grad_norm": 4.9193902015686035, "learning_rate": 2.1822991415266487e-06, "loss": 0.389, "step": 6295 }, { "epoch": 0.6937741046831956, "grad_norm": 6.254681587219238, "learning_rate": 2.1808549493693975e-06, "loss": 0.44, "step": 6296 }, { "epoch": 0.6938842975206612, "grad_norm": 9.382376670837402, "learning_rate": 2.1794111019410364e-06, "loss": 0.4691, "step": 6297 }, { "epoch": 0.6939944903581268, "grad_norm": 5.008315563201904, "learning_rate": 2.1779675994181167e-06, "loss": 0.3961, "step": 6298 }, { "epoch": 0.6941046831955923, "grad_norm": 6.2896552085876465, "learning_rate": 2.1765244419771494e-06, "loss": 0.3416, "step": 6299 }, { "epoch": 0.6942148760330579, "grad_norm": 8.428092956542969, "learning_rate": 2.1750816297946117e-06, "loss": 0.4795, "step": 6300 }, { "epoch": 0.6943250688705234, "grad_norm": 8.456198692321777, "learning_rate": 2.1736391630469296e-06, "loss": 0.4117, "step": 6301 }, { "epoch": 0.694435261707989, "grad_norm": 14.213221549987793, "learning_rate": 2.1721970419104883e-06, "loss": 0.4622, "step": 6302 }, { "epoch": 0.6945454545454546, "grad_norm": 12.13443660736084, "learning_rate": 2.170755266561634e-06, "loss": 0.4652, "step": 6303 }, { "epoch": 0.6946556473829201, "grad_norm": 6.786264419555664, "learning_rate": 2.169313837176668e-06, "loss": 0.3888, "step": 6304 }, { "epoch": 0.6947658402203857, "grad_norm": 6.256235599517822, "learning_rate": 2.1678727539318537e-06, "loss": 0.4624, "step": 6305 }, { "epoch": 0.6948760330578513, "grad_norm": 5.675997257232666, "learning_rate": 2.1664320170034043e-06, "loss": 0.3924, "step": 6306 }, { "epoch": 0.6949862258953168, "grad_norm": 7.280065536499023, "learning_rate": 2.1649916265674968e-06, "loss": 0.3564, "step": 6307 }, { "epoch": 0.6950964187327824, "grad_norm": 5.219188213348389, "learning_rate": 2.1635515828002655e-06, "loss": 0.3962, "step": 6308 }, { "epoch": 0.6952066115702479, "grad_norm": 6.4076409339904785, "learning_rate": 2.1621118858777983e-06, "loss": 0.4154, "step": 6309 }, { "epoch": 0.6953168044077135, "grad_norm": 5.759407997131348, "learning_rate": 2.160672535976145e-06, "loss": 0.4566, "step": 6310 }, { "epoch": 0.6954269972451791, "grad_norm": 8.507913589477539, "learning_rate": 2.1592335332713123e-06, "loss": 0.3917, "step": 6311 }, { "epoch": 0.6955371900826446, "grad_norm": 6.995584964752197, "learning_rate": 2.15779487793926e-06, "loss": 0.3178, "step": 6312 }, { "epoch": 0.6956473829201102, "grad_norm": 6.555243015289307, "learning_rate": 2.156356570155911e-06, "loss": 0.3329, "step": 6313 }, { "epoch": 0.6957575757575758, "grad_norm": 9.83935546875, "learning_rate": 2.154918610097145e-06, "loss": 0.4674, "step": 6314 }, { "epoch": 0.6958677685950413, "grad_norm": 11.09785270690918, "learning_rate": 2.153480997938794e-06, "loss": 0.4431, "step": 6315 }, { "epoch": 0.6959779614325069, "grad_norm": 8.9512300491333, "learning_rate": 2.152043733856653e-06, "loss": 0.4768, "step": 6316 }, { "epoch": 0.6960881542699725, "grad_norm": 6.576215744018555, "learning_rate": 2.150606818026475e-06, "loss": 0.3314, "step": 6317 }, { "epoch": 0.696198347107438, "grad_norm": 4.43838357925415, "learning_rate": 2.149170250623964e-06, "loss": 0.3905, "step": 6318 }, { "epoch": 0.6963085399449036, "grad_norm": 6.3059282302856445, "learning_rate": 2.147734031824787e-06, "loss": 0.4542, "step": 6319 }, { "epoch": 0.6964187327823691, "grad_norm": 11.818520545959473, "learning_rate": 2.146298161804569e-06, "loss": 0.4935, "step": 6320 }, { "epoch": 0.6965289256198347, "grad_norm": 5.9157280921936035, "learning_rate": 2.1448626407388863e-06, "loss": 0.3611, "step": 6321 }, { "epoch": 0.6966391184573003, "grad_norm": 19.772029876708984, "learning_rate": 2.1434274688032784e-06, "loss": 0.474, "step": 6322 }, { "epoch": 0.6967493112947658, "grad_norm": 7.780381202697754, "learning_rate": 2.1419926461732417e-06, "loss": 0.3949, "step": 6323 }, { "epoch": 0.6968595041322314, "grad_norm": 4.548693656921387, "learning_rate": 2.1405581730242244e-06, "loss": 0.4388, "step": 6324 }, { "epoch": 0.696969696969697, "grad_norm": 7.220384120941162, "learning_rate": 2.139124049531638e-06, "loss": 0.3578, "step": 6325 }, { "epoch": 0.6970798898071625, "grad_norm": 5.096642971038818, "learning_rate": 2.1376902758708505e-06, "loss": 0.391, "step": 6326 }, { "epoch": 0.6971900826446281, "grad_norm": 9.401430130004883, "learning_rate": 2.136256852217183e-06, "loss": 0.3521, "step": 6327 }, { "epoch": 0.6973002754820936, "grad_norm": 5.316445827484131, "learning_rate": 2.1348237787459188e-06, "loss": 0.3578, "step": 6328 }, { "epoch": 0.6974104683195592, "grad_norm": 4.958508491516113, "learning_rate": 2.1333910556322928e-06, "loss": 0.4235, "step": 6329 }, { "epoch": 0.6975206611570248, "grad_norm": 8.173996925354004, "learning_rate": 2.1319586830515032e-06, "loss": 0.4319, "step": 6330 }, { "epoch": 0.6976308539944903, "grad_norm": 4.869649887084961, "learning_rate": 2.130526661178703e-06, "loss": 0.3519, "step": 6331 }, { "epoch": 0.6977410468319559, "grad_norm": 6.183595657348633, "learning_rate": 2.1290949901889967e-06, "loss": 0.3677, "step": 6332 }, { "epoch": 0.6978512396694215, "grad_norm": 6.6458635330200195, "learning_rate": 2.1276636702574587e-06, "loss": 0.4512, "step": 6333 }, { "epoch": 0.697961432506887, "grad_norm": 3.9340484142303467, "learning_rate": 2.126232701559109e-06, "loss": 0.4129, "step": 6334 }, { "epoch": 0.6980716253443526, "grad_norm": 4.056954383850098, "learning_rate": 2.124802084268926e-06, "loss": 0.3039, "step": 6335 }, { "epoch": 0.6981818181818182, "grad_norm": 13.909092903137207, "learning_rate": 2.12337181856185e-06, "loss": 0.2881, "step": 6336 }, { "epoch": 0.6982920110192837, "grad_norm": 5.884676933288574, "learning_rate": 2.121941904612777e-06, "loss": 0.4296, "step": 6337 }, { "epoch": 0.6984022038567493, "grad_norm": 4.607361793518066, "learning_rate": 2.1205123425965555e-06, "loss": 0.3976, "step": 6338 }, { "epoch": 0.6985123966942148, "grad_norm": 5.883996486663818, "learning_rate": 2.119083132687997e-06, "loss": 0.371, "step": 6339 }, { "epoch": 0.6986225895316804, "grad_norm": 6.691327095031738, "learning_rate": 2.117654275061867e-06, "loss": 0.4552, "step": 6340 }, { "epoch": 0.698732782369146, "grad_norm": 5.95828914642334, "learning_rate": 2.1162257698928866e-06, "loss": 0.3003, "step": 6341 }, { "epoch": 0.6988429752066115, "grad_norm": 5.617248058319092, "learning_rate": 2.1147976173557363e-06, "loss": 0.4259, "step": 6342 }, { "epoch": 0.6989531680440771, "grad_norm": 5.405389785766602, "learning_rate": 2.113369817625054e-06, "loss": 0.3632, "step": 6343 }, { "epoch": 0.6990633608815428, "grad_norm": 9.319952011108398, "learning_rate": 2.1119423708754295e-06, "loss": 0.4152, "step": 6344 }, { "epoch": 0.6991735537190082, "grad_norm": 7.254077434539795, "learning_rate": 2.110515277281415e-06, "loss": 0.4259, "step": 6345 }, { "epoch": 0.6992837465564739, "grad_norm": 9.147215843200684, "learning_rate": 2.1090885370175176e-06, "loss": 0.4393, "step": 6346 }, { "epoch": 0.6993939393939393, "grad_norm": 6.482443332672119, "learning_rate": 2.107662150258202e-06, "loss": 0.3583, "step": 6347 }, { "epoch": 0.699504132231405, "grad_norm": 4.191505432128906, "learning_rate": 2.1062361171778865e-06, "loss": 0.3227, "step": 6348 }, { "epoch": 0.6996143250688706, "grad_norm": 7.555344104766846, "learning_rate": 2.1048104379509493e-06, "loss": 0.4246, "step": 6349 }, { "epoch": 0.699724517906336, "grad_norm": 10.89263916015625, "learning_rate": 2.1033851127517263e-06, "loss": 0.4577, "step": 6350 }, { "epoch": 0.6998347107438017, "grad_norm": 4.815510272979736, "learning_rate": 2.101960141754506e-06, "loss": 0.3936, "step": 6351 }, { "epoch": 0.6999449035812673, "grad_norm": 8.367965698242188, "learning_rate": 2.100535525133533e-06, "loss": 0.4262, "step": 6352 }, { "epoch": 0.7000550964187328, "grad_norm": 4.15157413482666, "learning_rate": 2.099111263063018e-06, "loss": 0.4028, "step": 6353 }, { "epoch": 0.7001652892561984, "grad_norm": 6.863077163696289, "learning_rate": 2.097687355717118e-06, "loss": 0.372, "step": 6354 }, { "epoch": 0.7002754820936639, "grad_norm": 5.519453048706055, "learning_rate": 2.0962638032699467e-06, "loss": 0.3509, "step": 6355 }, { "epoch": 0.7003856749311295, "grad_norm": 5.474221706390381, "learning_rate": 2.094840605895586e-06, "loss": 0.34, "step": 6356 }, { "epoch": 0.7003856749311295, "eval_loss": 0.40236836671829224, "eval_runtime": 41.9485, "eval_samples_per_second": 17.498, "eval_steps_per_second": 2.193, "step": 6356 }, { "epoch": 0.7004958677685951, "grad_norm": 5.4912519454956055, "learning_rate": 2.093417763768062e-06, "loss": 0.4118, "step": 6357 }, { "epoch": 0.7006060606060606, "grad_norm": 10.394960403442383, "learning_rate": 2.0919952770613584e-06, "loss": 0.5167, "step": 6358 }, { "epoch": 0.7007162534435262, "grad_norm": 4.946646690368652, "learning_rate": 2.090573145949426e-06, "loss": 0.3794, "step": 6359 }, { "epoch": 0.7008264462809918, "grad_norm": 6.778929233551025, "learning_rate": 2.08915137060616e-06, "loss": 0.3765, "step": 6360 }, { "epoch": 0.7009366391184573, "grad_norm": 10.721000671386719, "learning_rate": 2.08772995120542e-06, "loss": 0.4748, "step": 6361 }, { "epoch": 0.7010468319559229, "grad_norm": 5.147397518157959, "learning_rate": 2.0863088879210158e-06, "loss": 0.3863, "step": 6362 }, { "epoch": 0.7011570247933885, "grad_norm": 5.916102409362793, "learning_rate": 2.0848881809267185e-06, "loss": 0.4713, "step": 6363 }, { "epoch": 0.701267217630854, "grad_norm": 5.3361735343933105, "learning_rate": 2.0834678303962556e-06, "loss": 0.3733, "step": 6364 }, { "epoch": 0.7013774104683196, "grad_norm": 6.816646099090576, "learning_rate": 2.082047836503307e-06, "loss": 0.3823, "step": 6365 }, { "epoch": 0.7014876033057851, "grad_norm": 8.116055488586426, "learning_rate": 2.0806281994215128e-06, "loss": 0.4014, "step": 6366 }, { "epoch": 0.7015977961432507, "grad_norm": 5.204188346862793, "learning_rate": 2.0792089193244693e-06, "loss": 0.3623, "step": 6367 }, { "epoch": 0.7017079889807163, "grad_norm": 5.562734127044678, "learning_rate": 2.0777899963857244e-06, "loss": 0.3095, "step": 6368 }, { "epoch": 0.7018181818181818, "grad_norm": 9.468295097351074, "learning_rate": 2.0763714307787893e-06, "loss": 0.432, "step": 6369 }, { "epoch": 0.7019283746556474, "grad_norm": 9.8388671875, "learning_rate": 2.074953222677128e-06, "loss": 0.4079, "step": 6370 }, { "epoch": 0.702038567493113, "grad_norm": 11.453285217285156, "learning_rate": 2.073535372254158e-06, "loss": 0.4765, "step": 6371 }, { "epoch": 0.7021487603305785, "grad_norm": 5.276843547821045, "learning_rate": 2.072117879683258e-06, "loss": 0.421, "step": 6372 }, { "epoch": 0.7022589531680441, "grad_norm": 4.76207160949707, "learning_rate": 2.070700745137763e-06, "loss": 0.4099, "step": 6373 }, { "epoch": 0.7023691460055096, "grad_norm": 6.459706783294678, "learning_rate": 2.0692839687909578e-06, "loss": 0.3702, "step": 6374 }, { "epoch": 0.7024793388429752, "grad_norm": 4.18793249130249, "learning_rate": 2.06786755081609e-06, "loss": 0.465, "step": 6375 }, { "epoch": 0.7025895316804408, "grad_norm": 10.031018257141113, "learning_rate": 2.066451491386363e-06, "loss": 0.3843, "step": 6376 }, { "epoch": 0.7026997245179063, "grad_norm": 5.30232572555542, "learning_rate": 2.0650357906749304e-06, "loss": 0.3887, "step": 6377 }, { "epoch": 0.7028099173553719, "grad_norm": 5.033413887023926, "learning_rate": 2.0636204488549083e-06, "loss": 0.4034, "step": 6378 }, { "epoch": 0.7029201101928375, "grad_norm": 5.639863967895508, "learning_rate": 2.062205466099368e-06, "loss": 0.3672, "step": 6379 }, { "epoch": 0.703030303030303, "grad_norm": 10.517995834350586, "learning_rate": 2.060790842581332e-06, "loss": 0.4371, "step": 6380 }, { "epoch": 0.7031404958677686, "grad_norm": 9.898099899291992, "learning_rate": 2.0593765784737846e-06, "loss": 0.3787, "step": 6381 }, { "epoch": 0.7032506887052341, "grad_norm": 5.050538063049316, "learning_rate": 2.057962673949665e-06, "loss": 0.4103, "step": 6382 }, { "epoch": 0.7033608815426997, "grad_norm": 6.702361583709717, "learning_rate": 2.0565491291818647e-06, "loss": 0.4268, "step": 6383 }, { "epoch": 0.7034710743801653, "grad_norm": 9.471160888671875, "learning_rate": 2.0551359443432347e-06, "loss": 0.3962, "step": 6384 }, { "epoch": 0.7035812672176308, "grad_norm": 8.269988059997559, "learning_rate": 2.0537231196065836e-06, "loss": 0.4447, "step": 6385 }, { "epoch": 0.7036914600550964, "grad_norm": 7.267406940460205, "learning_rate": 2.05231065514467e-06, "loss": 0.4116, "step": 6386 }, { "epoch": 0.703801652892562, "grad_norm": 8.130721092224121, "learning_rate": 2.050898551130215e-06, "loss": 0.446, "step": 6387 }, { "epoch": 0.7039118457300275, "grad_norm": 8.853819847106934, "learning_rate": 2.0494868077358875e-06, "loss": 0.3528, "step": 6388 }, { "epoch": 0.7040220385674931, "grad_norm": 6.168217658996582, "learning_rate": 2.048075425134325e-06, "loss": 0.3955, "step": 6389 }, { "epoch": 0.7041322314049587, "grad_norm": 5.140215873718262, "learning_rate": 2.046664403498109e-06, "loss": 0.4235, "step": 6390 }, { "epoch": 0.7042424242424242, "grad_norm": 7.185565948486328, "learning_rate": 2.0452537429997782e-06, "loss": 0.4495, "step": 6391 }, { "epoch": 0.7043526170798898, "grad_norm": 6.418686866760254, "learning_rate": 2.0438434438118366e-06, "loss": 0.3828, "step": 6392 }, { "epoch": 0.7044628099173553, "grad_norm": 5.245448589324951, "learning_rate": 2.0424335061067345e-06, "loss": 0.4428, "step": 6393 }, { "epoch": 0.7045730027548209, "grad_norm": 5.457551956176758, "learning_rate": 2.0410239300568785e-06, "loss": 0.4255, "step": 6394 }, { "epoch": 0.7046831955922865, "grad_norm": 5.4160943031311035, "learning_rate": 2.0396147158346362e-06, "loss": 0.3421, "step": 6395 }, { "epoch": 0.704793388429752, "grad_norm": 7.206965923309326, "learning_rate": 2.038205863612329e-06, "loss": 0.3551, "step": 6396 }, { "epoch": 0.7049035812672176, "grad_norm": 7.743720531463623, "learning_rate": 2.036797373562231e-06, "loss": 0.4317, "step": 6397 }, { "epoch": 0.7050137741046832, "grad_norm": 5.294939041137695, "learning_rate": 2.0353892458565742e-06, "loss": 0.4029, "step": 6398 }, { "epoch": 0.7051239669421487, "grad_norm": 4.676883697509766, "learning_rate": 2.03398148066755e-06, "loss": 0.4449, "step": 6399 }, { "epoch": 0.7052341597796143, "grad_norm": 6.229726314544678, "learning_rate": 2.0325740781672975e-06, "loss": 0.3878, "step": 6400 }, { "epoch": 0.7053443526170798, "grad_norm": 7.104625701904297, "learning_rate": 2.0311670385279177e-06, "loss": 0.4034, "step": 6401 }, { "epoch": 0.7054545454545454, "grad_norm": 5.908341407775879, "learning_rate": 2.0297603619214644e-06, "loss": 0.3683, "step": 6402 }, { "epoch": 0.705564738292011, "grad_norm": 4.865956783294678, "learning_rate": 2.028354048519951e-06, "loss": 0.4301, "step": 6403 }, { "epoch": 0.7056749311294765, "grad_norm": 9.047842025756836, "learning_rate": 2.026948098495339e-06, "loss": 0.4413, "step": 6404 }, { "epoch": 0.7057851239669422, "grad_norm": 5.223608493804932, "learning_rate": 2.0255425120195533e-06, "loss": 0.4028, "step": 6405 }, { "epoch": 0.7058953168044078, "grad_norm": 5.946341514587402, "learning_rate": 2.0241372892644702e-06, "loss": 0.4659, "step": 6406 }, { "epoch": 0.7060055096418733, "grad_norm": 6.7283501625061035, "learning_rate": 2.0227324304019203e-06, "loss": 0.4225, "step": 6407 }, { "epoch": 0.7061157024793389, "grad_norm": 6.779466152191162, "learning_rate": 2.0213279356036925e-06, "loss": 0.4736, "step": 6408 }, { "epoch": 0.7062258953168044, "grad_norm": 8.905494689941406, "learning_rate": 2.019923805041533e-06, "loss": 0.368, "step": 6409 }, { "epoch": 0.70633608815427, "grad_norm": 9.43920612335205, "learning_rate": 2.018520038887139e-06, "loss": 0.4197, "step": 6410 }, { "epoch": 0.7064462809917356, "grad_norm": 7.749484539031982, "learning_rate": 2.017116637312161e-06, "loss": 0.4171, "step": 6411 }, { "epoch": 0.7065564738292011, "grad_norm": 10.579435348510742, "learning_rate": 2.0157136004882156e-06, "loss": 0.4442, "step": 6412 }, { "epoch": 0.7066666666666667, "grad_norm": 4.212276935577393, "learning_rate": 2.0143109285868653e-06, "loss": 0.3502, "step": 6413 }, { "epoch": 0.7067768595041323, "grad_norm": 6.677924156188965, "learning_rate": 2.012908621779626e-06, "loss": 0.3678, "step": 6414 }, { "epoch": 0.7068870523415978, "grad_norm": 4.494370460510254, "learning_rate": 2.0115066802379818e-06, "loss": 0.3759, "step": 6415 }, { "epoch": 0.7069972451790634, "grad_norm": 6.205049991607666, "learning_rate": 2.0101051041333593e-06, "loss": 0.4057, "step": 6416 }, { "epoch": 0.707107438016529, "grad_norm": 5.39734411239624, "learning_rate": 2.008703893637145e-06, "loss": 0.3744, "step": 6417 }, { "epoch": 0.7072176308539945, "grad_norm": 6.977585315704346, "learning_rate": 2.007303048920684e-06, "loss": 0.3501, "step": 6418 }, { "epoch": 0.7073278236914601, "grad_norm": 7.957043647766113, "learning_rate": 2.00590257015527e-06, "loss": 0.4709, "step": 6419 }, { "epoch": 0.7074380165289256, "grad_norm": 3.9455745220184326, "learning_rate": 2.004502457512158e-06, "loss": 0.3972, "step": 6420 }, { "epoch": 0.7075482093663912, "grad_norm": 5.484201431274414, "learning_rate": 2.003102711162553e-06, "loss": 0.3926, "step": 6421 }, { "epoch": 0.7076584022038568, "grad_norm": 5.1551594734191895, "learning_rate": 2.001703331277619e-06, "loss": 0.4625, "step": 6422 }, { "epoch": 0.7077685950413223, "grad_norm": 6.230329990386963, "learning_rate": 2.0003043180284763e-06, "loss": 0.4086, "step": 6423 }, { "epoch": 0.7078787878787879, "grad_norm": 7.448983192443848, "learning_rate": 1.998905671586195e-06, "loss": 0.4738, "step": 6424 }, { "epoch": 0.7079889807162535, "grad_norm": 9.219151496887207, "learning_rate": 1.9975073921218043e-06, "loss": 0.3801, "step": 6425 }, { "epoch": 0.708099173553719, "grad_norm": 4.740880966186523, "learning_rate": 1.9961094798062903e-06, "loss": 0.305, "step": 6426 }, { "epoch": 0.7082093663911846, "grad_norm": 7.525774955749512, "learning_rate": 1.9947119348105877e-06, "loss": 0.455, "step": 6427 }, { "epoch": 0.7083195592286501, "grad_norm": 7.525498390197754, "learning_rate": 1.993314757305592e-06, "loss": 0.4045, "step": 6428 }, { "epoch": 0.7084297520661157, "grad_norm": 5.27136754989624, "learning_rate": 1.991917947462153e-06, "loss": 0.3178, "step": 6429 }, { "epoch": 0.7085399449035813, "grad_norm": 4.343788146972656, "learning_rate": 1.9905215054510724e-06, "loss": 0.2995, "step": 6430 }, { "epoch": 0.7086501377410468, "grad_norm": 12.280888557434082, "learning_rate": 1.98912543144311e-06, "loss": 0.516, "step": 6431 }, { "epoch": 0.7087603305785124, "grad_norm": 4.431636333465576, "learning_rate": 1.9877297256089813e-06, "loss": 0.3228, "step": 6432 }, { "epoch": 0.708870523415978, "grad_norm": 5.902378559112549, "learning_rate": 1.9863343881193513e-06, "loss": 0.4118, "step": 6433 }, { "epoch": 0.7089807162534435, "grad_norm": 9.444318771362305, "learning_rate": 1.9849394191448467e-06, "loss": 0.4742, "step": 6434 }, { "epoch": 0.7090909090909091, "grad_norm": 4.91517448425293, "learning_rate": 1.9835448188560474e-06, "loss": 0.3875, "step": 6435 }, { "epoch": 0.7092011019283747, "grad_norm": 5.268383026123047, "learning_rate": 1.9821505874234833e-06, "loss": 0.4068, "step": 6436 }, { "epoch": 0.7093112947658402, "grad_norm": 4.532389163970947, "learning_rate": 1.980756725017644e-06, "loss": 0.3579, "step": 6437 }, { "epoch": 0.7094214876033058, "grad_norm": 7.421934127807617, "learning_rate": 1.9793632318089755e-06, "loss": 0.3508, "step": 6438 }, { "epoch": 0.7095316804407713, "grad_norm": 6.017141819000244, "learning_rate": 1.9779701079678732e-06, "loss": 0.4149, "step": 6439 }, { "epoch": 0.7096418732782369, "grad_norm": 4.674948692321777, "learning_rate": 1.9765773536646902e-06, "loss": 0.368, "step": 6440 }, { "epoch": 0.7097520661157025, "grad_norm": 9.248699188232422, "learning_rate": 1.9751849690697377e-06, "loss": 0.2843, "step": 6441 }, { "epoch": 0.709862258953168, "grad_norm": 3.4135935306549072, "learning_rate": 1.9737929543532743e-06, "loss": 0.346, "step": 6442 }, { "epoch": 0.7099724517906336, "grad_norm": 4.658424377441406, "learning_rate": 1.9724013096855206e-06, "loss": 0.4154, "step": 6443 }, { "epoch": 0.7100826446280992, "grad_norm": 5.710280418395996, "learning_rate": 1.9710100352366436e-06, "loss": 0.435, "step": 6444 }, { "epoch": 0.7101928374655647, "grad_norm": 7.507155895233154, "learning_rate": 1.9696191311767777e-06, "loss": 0.4263, "step": 6445 }, { "epoch": 0.7103030303030303, "grad_norm": 5.260646343231201, "learning_rate": 1.968228597676001e-06, "loss": 0.3965, "step": 6446 }, { "epoch": 0.7104132231404958, "grad_norm": 6.585782051086426, "learning_rate": 1.9668384349043456e-06, "loss": 0.3621, "step": 6447 }, { "epoch": 0.7105234159779614, "grad_norm": 7.680531978607178, "learning_rate": 1.965448643031811e-06, "loss": 0.4604, "step": 6448 }, { "epoch": 0.710633608815427, "grad_norm": 7.507516860961914, "learning_rate": 1.9640592222283373e-06, "loss": 0.4089, "step": 6449 }, { "epoch": 0.7107438016528925, "grad_norm": 6.0743021965026855, "learning_rate": 1.962670172663823e-06, "loss": 0.3646, "step": 6450 }, { "epoch": 0.7108539944903581, "grad_norm": 8.247718811035156, "learning_rate": 1.961281494508129e-06, "loss": 0.4097, "step": 6451 }, { "epoch": 0.7109641873278237, "grad_norm": 5.167181015014648, "learning_rate": 1.9598931879310616e-06, "loss": 0.4324, "step": 6452 }, { "epoch": 0.7110743801652892, "grad_norm": 5.881624221801758, "learning_rate": 1.958505253102383e-06, "loss": 0.3745, "step": 6453 }, { "epoch": 0.7111845730027548, "grad_norm": 5.003997325897217, "learning_rate": 1.957117690191814e-06, "loss": 0.4214, "step": 6454 }, { "epoch": 0.7112947658402203, "grad_norm": 8.985822677612305, "learning_rate": 1.955730499369028e-06, "loss": 0.4729, "step": 6455 }, { "epoch": 0.7114049586776859, "grad_norm": 10.160449981689453, "learning_rate": 1.95434368080365e-06, "loss": 0.4224, "step": 6456 }, { "epoch": 0.7115151515151515, "grad_norm": 9.159520149230957, "learning_rate": 1.9529572346652646e-06, "loss": 0.3845, "step": 6457 }, { "epoch": 0.711625344352617, "grad_norm": 5.77703332901001, "learning_rate": 1.951571161123408e-06, "loss": 0.3817, "step": 6458 }, { "epoch": 0.7117355371900826, "grad_norm": 6.839252948760986, "learning_rate": 1.9501854603475713e-06, "loss": 0.4795, "step": 6459 }, { "epoch": 0.7118457300275483, "grad_norm": 5.172279357910156, "learning_rate": 1.948800132507198e-06, "loss": 0.3201, "step": 6460 }, { "epoch": 0.7119559228650137, "grad_norm": 5.667455196380615, "learning_rate": 1.9474151777716895e-06, "loss": 0.3929, "step": 6461 }, { "epoch": 0.7120661157024794, "grad_norm": 6.601551532745361, "learning_rate": 1.9460305963104004e-06, "loss": 0.4091, "step": 6462 }, { "epoch": 0.712176308539945, "grad_norm": 3.548739194869995, "learning_rate": 1.9446463882926377e-06, "loss": 0.3791, "step": 6463 }, { "epoch": 0.7122865013774105, "grad_norm": 8.532393455505371, "learning_rate": 1.9432625538876644e-06, "loss": 0.2892, "step": 6464 }, { "epoch": 0.7123966942148761, "grad_norm": 8.71661376953125, "learning_rate": 1.9418790932646998e-06, "loss": 0.4074, "step": 6465 }, { "epoch": 0.7125068870523416, "grad_norm": 8.869766235351562, "learning_rate": 1.9404960065929116e-06, "loss": 0.3689, "step": 6466 }, { "epoch": 0.7126170798898072, "grad_norm": 6.914660453796387, "learning_rate": 1.9391132940414287e-06, "loss": 0.415, "step": 6467 }, { "epoch": 0.7127272727272728, "grad_norm": 4.588566780090332, "learning_rate": 1.937730955779331e-06, "loss": 0.4176, "step": 6468 }, { "epoch": 0.7128374655647383, "grad_norm": 5.332058429718018, "learning_rate": 1.936348991975652e-06, "loss": 0.4324, "step": 6469 }, { "epoch": 0.7129476584022039, "grad_norm": 5.654483318328857, "learning_rate": 1.9349674027993766e-06, "loss": 0.305, "step": 6470 }, { "epoch": 0.7130578512396695, "grad_norm": 5.828920364379883, "learning_rate": 1.9335861884194536e-06, "loss": 0.3845, "step": 6471 }, { "epoch": 0.713168044077135, "grad_norm": 4.670661449432373, "learning_rate": 1.932205349004775e-06, "loss": 0.26, "step": 6472 }, { "epoch": 0.7132782369146006, "grad_norm": 5.2829909324646, "learning_rate": 1.930824884724194e-06, "loss": 0.417, "step": 6473 }, { "epoch": 0.7133884297520661, "grad_norm": 6.3276686668396, "learning_rate": 1.929444795746517e-06, "loss": 0.4473, "step": 6474 }, { "epoch": 0.7134986225895317, "grad_norm": 11.624544143676758, "learning_rate": 1.928065082240499e-06, "loss": 0.3489, "step": 6475 }, { "epoch": 0.7136088154269973, "grad_norm": 8.43012523651123, "learning_rate": 1.926685744374857e-06, "loss": 0.4578, "step": 6476 }, { "epoch": 0.7137190082644628, "grad_norm": 6.978827476501465, "learning_rate": 1.925306782318256e-06, "loss": 0.3079, "step": 6477 }, { "epoch": 0.7138292011019284, "grad_norm": 10.418410301208496, "learning_rate": 1.923928196239318e-06, "loss": 0.2941, "step": 6478 }, { "epoch": 0.713939393939394, "grad_norm": 7.191355228424072, "learning_rate": 1.92254998630662e-06, "loss": 0.4816, "step": 6479 }, { "epoch": 0.7140495867768595, "grad_norm": 9.33836841583252, "learning_rate": 1.9211721526886883e-06, "loss": 0.4029, "step": 6480 }, { "epoch": 0.7141597796143251, "grad_norm": 5.70156717300415, "learning_rate": 1.919794695554008e-06, "loss": 0.4368, "step": 6481 }, { "epoch": 0.7142699724517906, "grad_norm": 10.257763862609863, "learning_rate": 1.9184176150710184e-06, "loss": 0.4192, "step": 6482 }, { "epoch": 0.7143801652892562, "grad_norm": 4.961391448974609, "learning_rate": 1.9170409114081067e-06, "loss": 0.414, "step": 6483 }, { "epoch": 0.7144903581267218, "grad_norm": 6.4829630851745605, "learning_rate": 1.9156645847336203e-06, "loss": 0.4496, "step": 6484 }, { "epoch": 0.7146005509641873, "grad_norm": 6.972085952758789, "learning_rate": 1.91428863521586e-06, "loss": 0.3441, "step": 6485 }, { "epoch": 0.7147107438016529, "grad_norm": 10.662055969238281, "learning_rate": 1.9129130630230753e-06, "loss": 0.4387, "step": 6486 }, { "epoch": 0.7148209366391185, "grad_norm": 5.661942958831787, "learning_rate": 1.9115378683234742e-06, "loss": 0.2902, "step": 6487 }, { "epoch": 0.714931129476584, "grad_norm": 4.665764331817627, "learning_rate": 1.910163051285219e-06, "loss": 0.3051, "step": 6488 }, { "epoch": 0.7150413223140496, "grad_norm": 5.3506574630737305, "learning_rate": 1.9087886120764227e-06, "loss": 0.3345, "step": 6489 }, { "epoch": 0.7151515151515152, "grad_norm": 6.438321590423584, "learning_rate": 1.9074145508651533e-06, "loss": 0.38, "step": 6490 }, { "epoch": 0.7152617079889807, "grad_norm": 11.505531311035156, "learning_rate": 1.9060408678194347e-06, "loss": 0.4181, "step": 6491 }, { "epoch": 0.7153719008264463, "grad_norm": 5.2197442054748535, "learning_rate": 1.9046675631072404e-06, "loss": 0.4098, "step": 6492 }, { "epoch": 0.7154820936639118, "grad_norm": 7.508244037628174, "learning_rate": 1.903294636896501e-06, "loss": 0.4202, "step": 6493 }, { "epoch": 0.7155922865013774, "grad_norm": 9.025035858154297, "learning_rate": 1.9019220893551016e-06, "loss": 0.4205, "step": 6494 }, { "epoch": 0.715702479338843, "grad_norm": 9.53231430053711, "learning_rate": 1.9005499206508755e-06, "loss": 0.4843, "step": 6495 }, { "epoch": 0.7158126721763085, "grad_norm": 8.423941612243652, "learning_rate": 1.8991781309516155e-06, "loss": 0.3284, "step": 6496 }, { "epoch": 0.7159228650137741, "grad_norm": 5.7038469314575195, "learning_rate": 1.8978067204250673e-06, "loss": 0.3674, "step": 6497 }, { "epoch": 0.7160330578512397, "grad_norm": 7.418968200683594, "learning_rate": 1.8964356892389253e-06, "loss": 0.4702, "step": 6498 }, { "epoch": 0.7161432506887052, "grad_norm": 5.119117259979248, "learning_rate": 1.8950650375608432e-06, "loss": 0.4034, "step": 6499 }, { "epoch": 0.7162534435261708, "grad_norm": 6.179746627807617, "learning_rate": 1.8936947655584259e-06, "loss": 0.3843, "step": 6500 }, { "epoch": 0.7163636363636363, "grad_norm": 4.594128608703613, "learning_rate": 1.8923248733992344e-06, "loss": 0.3719, "step": 6501 }, { "epoch": 0.7164738292011019, "grad_norm": 5.928335189819336, "learning_rate": 1.890955361250778e-06, "loss": 0.4113, "step": 6502 }, { "epoch": 0.7165840220385675, "grad_norm": 7.248212814331055, "learning_rate": 1.88958622928052e-06, "loss": 0.4168, "step": 6503 }, { "epoch": 0.716694214876033, "grad_norm": 6.2532830238342285, "learning_rate": 1.8882174776558866e-06, "loss": 0.3212, "step": 6504 }, { "epoch": 0.7168044077134986, "grad_norm": 4.506024360656738, "learning_rate": 1.8868491065442468e-06, "loss": 0.4239, "step": 6505 }, { "epoch": 0.7169146005509642, "grad_norm": 9.820466041564941, "learning_rate": 1.8854811161129238e-06, "loss": 0.3794, "step": 6506 }, { "epoch": 0.7170247933884297, "grad_norm": 9.815295219421387, "learning_rate": 1.8841135065292043e-06, "loss": 0.4058, "step": 6507 }, { "epoch": 0.7171349862258953, "grad_norm": 8.686479568481445, "learning_rate": 1.8827462779603173e-06, "loss": 0.4168, "step": 6508 }, { "epoch": 0.7172451790633608, "grad_norm": 8.61069107055664, "learning_rate": 1.881379430573448e-06, "loss": 0.3631, "step": 6509 }, { "epoch": 0.7173553719008264, "grad_norm": 5.897222995758057, "learning_rate": 1.8800129645357384e-06, "loss": 0.4198, "step": 6510 }, { "epoch": 0.717465564738292, "grad_norm": 9.235694885253906, "learning_rate": 1.8786468800142832e-06, "loss": 0.3956, "step": 6511 }, { "epoch": 0.7175757575757575, "grad_norm": 4.371232509613037, "learning_rate": 1.8772811771761257e-06, "loss": 0.4216, "step": 6512 }, { "epoch": 0.7176859504132231, "grad_norm": 4.844611167907715, "learning_rate": 1.875915856188268e-06, "loss": 0.3879, "step": 6513 }, { "epoch": 0.7177961432506887, "grad_norm": 7.7547101974487305, "learning_rate": 1.8745509172176624e-06, "loss": 0.4434, "step": 6514 }, { "epoch": 0.7179063360881542, "grad_norm": 9.482111930847168, "learning_rate": 1.8731863604312183e-06, "loss": 0.5215, "step": 6515 }, { "epoch": 0.7180165289256198, "grad_norm": 4.813543796539307, "learning_rate": 1.871822185995792e-06, "loss": 0.3946, "step": 6516 }, { "epoch": 0.7181267217630855, "grad_norm": 8.135725975036621, "learning_rate": 1.8704583940781972e-06, "loss": 0.4224, "step": 6517 }, { "epoch": 0.718236914600551, "grad_norm": 7.12558126449585, "learning_rate": 1.8690949848452034e-06, "loss": 0.3971, "step": 6518 }, { "epoch": 0.7183471074380166, "grad_norm": 4.535747528076172, "learning_rate": 1.8677319584635257e-06, "loss": 0.334, "step": 6519 }, { "epoch": 0.718457300275482, "grad_norm": 5.1335272789001465, "learning_rate": 1.8663693150998391e-06, "loss": 0.3054, "step": 6520 }, { "epoch": 0.7185674931129477, "grad_norm": 4.701351165771484, "learning_rate": 1.865007054920771e-06, "loss": 0.4164, "step": 6521 }, { "epoch": 0.7186776859504133, "grad_norm": 7.359562397003174, "learning_rate": 1.8636451780928967e-06, "loss": 0.4328, "step": 6522 }, { "epoch": 0.7187878787878788, "grad_norm": 5.535491943359375, "learning_rate": 1.8622836847827508e-06, "loss": 0.4366, "step": 6523 }, { "epoch": 0.7188980716253444, "grad_norm": 9.683491706848145, "learning_rate": 1.8609225751568193e-06, "loss": 0.5647, "step": 6524 }, { "epoch": 0.71900826446281, "grad_norm": 9.465622901916504, "learning_rate": 1.8595618493815377e-06, "loss": 0.3914, "step": 6525 }, { "epoch": 0.7191184573002755, "grad_norm": 6.814040184020996, "learning_rate": 1.8582015076232995e-06, "loss": 0.4837, "step": 6526 }, { "epoch": 0.7192286501377411, "grad_norm": 7.456498622894287, "learning_rate": 1.85684155004845e-06, "loss": 0.3526, "step": 6527 }, { "epoch": 0.7193388429752066, "grad_norm": 8.250312805175781, "learning_rate": 1.8554819768232835e-06, "loss": 0.3609, "step": 6528 }, { "epoch": 0.7194490358126722, "grad_norm": 5.8619537353515625, "learning_rate": 1.8541227881140528e-06, "loss": 0.3902, "step": 6529 }, { "epoch": 0.7195592286501378, "grad_norm": 9.79912281036377, "learning_rate": 1.8527639840869622e-06, "loss": 0.4381, "step": 6530 }, { "epoch": 0.7196694214876033, "grad_norm": 5.200512886047363, "learning_rate": 1.8514055649081646e-06, "loss": 0.4007, "step": 6531 }, { "epoch": 0.7197796143250689, "grad_norm": 6.89130973815918, "learning_rate": 1.8500475307437721e-06, "loss": 0.4139, "step": 6532 }, { "epoch": 0.7198898071625345, "grad_norm": 5.5149664878845215, "learning_rate": 1.8486898817598474e-06, "loss": 0.34, "step": 6533 }, { "epoch": 0.72, "grad_norm": 8.045257568359375, "learning_rate": 1.8473326181224033e-06, "loss": 0.4186, "step": 6534 }, { "epoch": 0.7201101928374656, "grad_norm": 7.109325885772705, "learning_rate": 1.845975739997411e-06, "loss": 0.3819, "step": 6535 }, { "epoch": 0.7202203856749311, "grad_norm": 6.589534282684326, "learning_rate": 1.844619247550788e-06, "loss": 0.4972, "step": 6536 }, { "epoch": 0.7203305785123967, "grad_norm": 6.8329243659973145, "learning_rate": 1.8432631409484091e-06, "loss": 0.4131, "step": 6537 }, { "epoch": 0.7204407713498623, "grad_norm": 8.504344940185547, "learning_rate": 1.8419074203561034e-06, "loss": 0.4125, "step": 6538 }, { "epoch": 0.7205509641873278, "grad_norm": 6.963781833648682, "learning_rate": 1.8405520859396458e-06, "loss": 0.4397, "step": 6539 }, { "epoch": 0.7206611570247934, "grad_norm": 9.599796295166016, "learning_rate": 1.8391971378647715e-06, "loss": 0.4355, "step": 6540 }, { "epoch": 0.720771349862259, "grad_norm": 9.471633911132812, "learning_rate": 1.8378425762971657e-06, "loss": 0.4078, "step": 6541 }, { "epoch": 0.7208815426997245, "grad_norm": 5.994978904724121, "learning_rate": 1.8364884014024642e-06, "loss": 0.483, "step": 6542 }, { "epoch": 0.7209917355371901, "grad_norm": 6.151260852813721, "learning_rate": 1.8351346133462579e-06, "loss": 0.3639, "step": 6543 }, { "epoch": 0.7211019283746557, "grad_norm": 8.746085166931152, "learning_rate": 1.8337812122940917e-06, "loss": 0.4144, "step": 6544 }, { "epoch": 0.7212121212121212, "grad_norm": 4.571780681610107, "learning_rate": 1.8324281984114577e-06, "loss": 0.4102, "step": 6545 }, { "epoch": 0.7213223140495868, "grad_norm": 7.68917989730835, "learning_rate": 1.8310755718638069e-06, "loss": 0.4475, "step": 6546 }, { "epoch": 0.7214325068870523, "grad_norm": 5.956671714782715, "learning_rate": 1.8297233328165414e-06, "loss": 0.3835, "step": 6547 }, { "epoch": 0.7215426997245179, "grad_norm": 4.739256858825684, "learning_rate": 1.8283714814350113e-06, "loss": 0.4433, "step": 6548 }, { "epoch": 0.7216528925619835, "grad_norm": 6.6831464767456055, "learning_rate": 1.8270200178845242e-06, "loss": 0.4079, "step": 6549 }, { "epoch": 0.721763085399449, "grad_norm": 6.049987316131592, "learning_rate": 1.825668942330342e-06, "loss": 0.4344, "step": 6550 }, { "epoch": 0.7218732782369146, "grad_norm": 8.413738250732422, "learning_rate": 1.8243182549376714e-06, "loss": 0.4136, "step": 6551 }, { "epoch": 0.7219834710743802, "grad_norm": 6.3152008056640625, "learning_rate": 1.8229679558716779e-06, "loss": 0.3579, "step": 6552 }, { "epoch": 0.7220936639118457, "grad_norm": 5.228076457977295, "learning_rate": 1.8216180452974807e-06, "loss": 0.4032, "step": 6553 }, { "epoch": 0.7222038567493113, "grad_norm": 5.5786848068237305, "learning_rate": 1.8202685233801442e-06, "loss": 0.462, "step": 6554 }, { "epoch": 0.7223140495867768, "grad_norm": 8.232794761657715, "learning_rate": 1.818919390284692e-06, "loss": 0.4015, "step": 6555 }, { "epoch": 0.7224242424242424, "grad_norm": 6.3292365074157715, "learning_rate": 1.8175706461760977e-06, "loss": 0.4662, "step": 6556 }, { "epoch": 0.722534435261708, "grad_norm": 7.013743877410889, "learning_rate": 1.8162222912192896e-06, "loss": 0.3822, "step": 6557 }, { "epoch": 0.7226446280991735, "grad_norm": 7.752472400665283, "learning_rate": 1.8148743255791428e-06, "loss": 0.405, "step": 6558 }, { "epoch": 0.7227548209366391, "grad_norm": 6.220157623291016, "learning_rate": 1.81352674942049e-06, "loss": 0.3573, "step": 6559 }, { "epoch": 0.7228650137741047, "grad_norm": 6.291469097137451, "learning_rate": 1.8121795629081163e-06, "loss": 0.4659, "step": 6560 }, { "epoch": 0.7229752066115702, "grad_norm": 8.126771926879883, "learning_rate": 1.8108327662067554e-06, "loss": 0.4463, "step": 6561 }, { "epoch": 0.7230853994490358, "grad_norm": 9.852230072021484, "learning_rate": 1.809486359481093e-06, "loss": 0.4787, "step": 6562 }, { "epoch": 0.7231955922865014, "grad_norm": 8.47900676727295, "learning_rate": 1.8081403428957762e-06, "loss": 0.384, "step": 6563 }, { "epoch": 0.7233057851239669, "grad_norm": 5.633147239685059, "learning_rate": 1.8067947166153937e-06, "loss": 0.3449, "step": 6564 }, { "epoch": 0.7234159779614325, "grad_norm": 6.953232765197754, "learning_rate": 1.805449480804487e-06, "loss": 0.4464, "step": 6565 }, { "epoch": 0.723526170798898, "grad_norm": 5.364905834197998, "learning_rate": 1.804104635627561e-06, "loss": 0.3573, "step": 6566 }, { "epoch": 0.7236363636363636, "grad_norm": 8.890564918518066, "learning_rate": 1.8027601812490614e-06, "loss": 0.4907, "step": 6567 }, { "epoch": 0.7237465564738292, "grad_norm": 4.464534759521484, "learning_rate": 1.8014161178333878e-06, "loss": 0.4317, "step": 6568 }, { "epoch": 0.7238567493112947, "grad_norm": 6.534988880157471, "learning_rate": 1.8000724455448965e-06, "loss": 0.4224, "step": 6569 }, { "epoch": 0.7239669421487603, "grad_norm": 7.258755207061768, "learning_rate": 1.7987291645478926e-06, "loss": 0.4113, "step": 6570 }, { "epoch": 0.724077134986226, "grad_norm": 7.645383358001709, "learning_rate": 1.7973862750066374e-06, "loss": 0.3435, "step": 6571 }, { "epoch": 0.7241873278236914, "grad_norm": 6.104715347290039, "learning_rate": 1.7960437770853368e-06, "loss": 0.4122, "step": 6572 }, { "epoch": 0.724297520661157, "grad_norm": 6.718868732452393, "learning_rate": 1.7947016709481552e-06, "loss": 0.3377, "step": 6573 }, { "epoch": 0.7244077134986225, "grad_norm": 6.723759174346924, "learning_rate": 1.7933599567592092e-06, "loss": 0.3965, "step": 6574 }, { "epoch": 0.7245179063360881, "grad_norm": 8.80811882019043, "learning_rate": 1.7920186346825618e-06, "loss": 0.4579, "step": 6575 }, { "epoch": 0.7246280991735538, "grad_norm": 5.257187843322754, "learning_rate": 1.7906777048822332e-06, "loss": 0.3238, "step": 6576 }, { "epoch": 0.7247382920110192, "grad_norm": 4.086373805999756, "learning_rate": 1.789337167522196e-06, "loss": 0.3268, "step": 6577 }, { "epoch": 0.7248484848484849, "grad_norm": 8.623800277709961, "learning_rate": 1.7879970227663696e-06, "loss": 0.4486, "step": 6578 }, { "epoch": 0.7249586776859505, "grad_norm": 9.160906791687012, "learning_rate": 1.7866572707786301e-06, "loss": 0.5127, "step": 6579 }, { "epoch": 0.725068870523416, "grad_norm": 6.842070579528809, "learning_rate": 1.7853179117228064e-06, "loss": 0.4273, "step": 6580 }, { "epoch": 0.7251790633608816, "grad_norm": 5.782288551330566, "learning_rate": 1.7839789457626733e-06, "loss": 0.3604, "step": 6581 }, { "epoch": 0.725289256198347, "grad_norm": 6.150672435760498, "learning_rate": 1.7826403730619635e-06, "loss": 0.3961, "step": 6582 }, { "epoch": 0.7253994490358127, "grad_norm": 6.725852012634277, "learning_rate": 1.7813021937843606e-06, "loss": 0.3989, "step": 6583 }, { "epoch": 0.7255096418732783, "grad_norm": 6.893726825714111, "learning_rate": 1.7799644080934959e-06, "loss": 0.4057, "step": 6584 }, { "epoch": 0.7256198347107438, "grad_norm": 4.0173115730285645, "learning_rate": 1.7786270161529578e-06, "loss": 0.4538, "step": 6585 }, { "epoch": 0.7257300275482094, "grad_norm": 5.072638988494873, "learning_rate": 1.7772900181262853e-06, "loss": 0.3996, "step": 6586 }, { "epoch": 0.725840220385675, "grad_norm": 3.620347738265991, "learning_rate": 1.775953414176965e-06, "loss": 0.3186, "step": 6587 }, { "epoch": 0.7259504132231405, "grad_norm": 4.775448799133301, "learning_rate": 1.7746172044684413e-06, "loss": 0.4033, "step": 6588 }, { "epoch": 0.7260606060606061, "grad_norm": 7.6348772048950195, "learning_rate": 1.7732813891641088e-06, "loss": 0.4258, "step": 6589 }, { "epoch": 0.7261707988980717, "grad_norm": 6.827914714813232, "learning_rate": 1.7719459684273089e-06, "loss": 0.4086, "step": 6590 }, { "epoch": 0.7262809917355372, "grad_norm": 5.695342063903809, "learning_rate": 1.7706109424213414e-06, "loss": 0.3584, "step": 6591 }, { "epoch": 0.7263911845730028, "grad_norm": 5.97652530670166, "learning_rate": 1.7692763113094557e-06, "loss": 0.4056, "step": 6592 }, { "epoch": 0.7265013774104683, "grad_norm": 6.561317443847656, "learning_rate": 1.7679420752548499e-06, "loss": 0.4263, "step": 6593 }, { "epoch": 0.7266115702479339, "grad_norm": 4.876357555389404, "learning_rate": 1.7666082344206787e-06, "loss": 0.3326, "step": 6594 }, { "epoch": 0.7267217630853995, "grad_norm": 6.124058723449707, "learning_rate": 1.7652747889700434e-06, "loss": 0.3464, "step": 6595 }, { "epoch": 0.726831955922865, "grad_norm": 5.761227607727051, "learning_rate": 1.7639417390660007e-06, "loss": 0.2909, "step": 6596 }, { "epoch": 0.7269421487603306, "grad_norm": 5.419153690338135, "learning_rate": 1.7626090848715598e-06, "loss": 0.4528, "step": 6597 }, { "epoch": 0.7270523415977962, "grad_norm": 6.617321968078613, "learning_rate": 1.7612768265496738e-06, "loss": 0.412, "step": 6598 }, { "epoch": 0.7271625344352617, "grad_norm": 4.691646099090576, "learning_rate": 1.7599449642632605e-06, "loss": 0.4754, "step": 6599 }, { "epoch": 0.7272727272727273, "grad_norm": 6.649336814880371, "learning_rate": 1.7586134981751785e-06, "loss": 0.4625, "step": 6600 }, { "epoch": 0.7273829201101928, "grad_norm": 5.788405418395996, "learning_rate": 1.7572824284482387e-06, "loss": 0.4402, "step": 6601 }, { "epoch": 0.7274931129476584, "grad_norm": 6.544713973999023, "learning_rate": 1.7559517552452082e-06, "loss": 0.3637, "step": 6602 }, { "epoch": 0.727603305785124, "grad_norm": 5.603353023529053, "learning_rate": 1.7546214787288057e-06, "loss": 0.4017, "step": 6603 }, { "epoch": 0.7277134986225895, "grad_norm": 10.790420532226562, "learning_rate": 1.7532915990616955e-06, "loss": 0.4249, "step": 6604 }, { "epoch": 0.7278236914600551, "grad_norm": 5.181728839874268, "learning_rate": 1.7519621164064987e-06, "loss": 0.4056, "step": 6605 }, { "epoch": 0.7279338842975207, "grad_norm": 4.5847930908203125, "learning_rate": 1.750633030925788e-06, "loss": 0.3647, "step": 6606 }, { "epoch": 0.7280440771349862, "grad_norm": 4.745823383331299, "learning_rate": 1.7493043427820827e-06, "loss": 0.3545, "step": 6607 }, { "epoch": 0.7281542699724518, "grad_norm": 5.319917678833008, "learning_rate": 1.7479760521378576e-06, "loss": 0.4041, "step": 6608 }, { "epoch": 0.7282644628099173, "grad_norm": 8.163823127746582, "learning_rate": 1.74664815915554e-06, "loss": 0.3588, "step": 6609 }, { "epoch": 0.7283746556473829, "grad_norm": 8.125265121459961, "learning_rate": 1.7453206639975034e-06, "loss": 0.4947, "step": 6610 }, { "epoch": 0.7284848484848485, "grad_norm": 9.334182739257812, "learning_rate": 1.743993566826077e-06, "loss": 0.4697, "step": 6611 }, { "epoch": 0.728595041322314, "grad_norm": 5.022836685180664, "learning_rate": 1.7426668678035402e-06, "loss": 0.4174, "step": 6612 }, { "epoch": 0.7287052341597796, "grad_norm": 5.568726062774658, "learning_rate": 1.7413405670921246e-06, "loss": 0.3476, "step": 6613 }, { "epoch": 0.7288154269972452, "grad_norm": 6.706714630126953, "learning_rate": 1.7400146648540094e-06, "loss": 0.3886, "step": 6614 }, { "epoch": 0.7289256198347107, "grad_norm": 6.909881114959717, "learning_rate": 1.7386891612513296e-06, "loss": 0.4205, "step": 6615 }, { "epoch": 0.7290358126721763, "grad_norm": 6.599411487579346, "learning_rate": 1.7373640564461707e-06, "loss": 0.4318, "step": 6616 }, { "epoch": 0.7291460055096419, "grad_norm": 5.470994472503662, "learning_rate": 1.7360393506005652e-06, "loss": 0.41, "step": 6617 }, { "epoch": 0.7292561983471074, "grad_norm": 8.121482849121094, "learning_rate": 1.7347150438765016e-06, "loss": 0.4368, "step": 6618 }, { "epoch": 0.729366391184573, "grad_norm": 7.104863166809082, "learning_rate": 1.733391136435919e-06, "loss": 0.4655, "step": 6619 }, { "epoch": 0.7294765840220385, "grad_norm": 8.662788391113281, "learning_rate": 1.7320676284407062e-06, "loss": 0.5028, "step": 6620 }, { "epoch": 0.7295867768595041, "grad_norm": 11.40149211883545, "learning_rate": 1.730744520052699e-06, "loss": 0.3573, "step": 6621 }, { "epoch": 0.7296969696969697, "grad_norm": 5.609067916870117, "learning_rate": 1.7294218114336963e-06, "loss": 0.3833, "step": 6622 }, { "epoch": 0.7298071625344352, "grad_norm": 6.620344161987305, "learning_rate": 1.7280995027454372e-06, "loss": 0.4063, "step": 6623 }, { "epoch": 0.7299173553719008, "grad_norm": 4.553619861602783, "learning_rate": 1.7267775941496122e-06, "loss": 0.4281, "step": 6624 }, { "epoch": 0.7300275482093664, "grad_norm": 4.575986385345459, "learning_rate": 1.7254560858078724e-06, "loss": 0.3539, "step": 6625 }, { "epoch": 0.7301377410468319, "grad_norm": 5.355452060699463, "learning_rate": 1.7241349778818084e-06, "loss": 0.4154, "step": 6626 }, { "epoch": 0.7302479338842975, "grad_norm": 7.630439281463623, "learning_rate": 1.7228142705329715e-06, "loss": 0.4742, "step": 6627 }, { "epoch": 0.730358126721763, "grad_norm": 4.496499061584473, "learning_rate": 1.721493963922855e-06, "loss": 0.3786, "step": 6628 }, { "epoch": 0.7304683195592286, "grad_norm": 3.9322595596313477, "learning_rate": 1.72017405821291e-06, "loss": 0.339, "step": 6629 }, { "epoch": 0.7305785123966942, "grad_norm": 10.261990547180176, "learning_rate": 1.7188545535645385e-06, "loss": 0.4321, "step": 6630 }, { "epoch": 0.7306887052341597, "grad_norm": 6.136670112609863, "learning_rate": 1.7175354501390874e-06, "loss": 0.3624, "step": 6631 }, { "epoch": 0.7307988980716253, "grad_norm": 6.162241458892822, "learning_rate": 1.7162167480978598e-06, "loss": 0.4412, "step": 6632 }, { "epoch": 0.730909090909091, "grad_norm": 11.787130355834961, "learning_rate": 1.7148984476021107e-06, "loss": 0.4973, "step": 6633 }, { "epoch": 0.7310192837465564, "grad_norm": 4.376728534698486, "learning_rate": 1.7135805488130402e-06, "loss": 0.4117, "step": 6634 }, { "epoch": 0.731129476584022, "grad_norm": 6.36915397644043, "learning_rate": 1.7122630518918044e-06, "loss": 0.3593, "step": 6635 }, { "epoch": 0.7312396694214875, "grad_norm": 6.285597801208496, "learning_rate": 1.7109459569995102e-06, "loss": 0.4451, "step": 6636 }, { "epoch": 0.7313498622589532, "grad_norm": 6.95416259765625, "learning_rate": 1.7096292642972107e-06, "loss": 0.4067, "step": 6637 }, { "epoch": 0.7314600550964188, "grad_norm": 4.071789264678955, "learning_rate": 1.7083129739459136e-06, "loss": 0.3959, "step": 6638 }, { "epoch": 0.7315702479338843, "grad_norm": 6.4609456062316895, "learning_rate": 1.70699708610658e-06, "loss": 0.3832, "step": 6639 }, { "epoch": 0.7316804407713499, "grad_norm": 8.448399543762207, "learning_rate": 1.7056816009401134e-06, "loss": 0.4338, "step": 6640 }, { "epoch": 0.7317906336088155, "grad_norm": 8.745491981506348, "learning_rate": 1.7043665186073754e-06, "loss": 0.3957, "step": 6641 }, { "epoch": 0.731900826446281, "grad_norm": 7.603783130645752, "learning_rate": 1.7030518392691785e-06, "loss": 0.4717, "step": 6642 }, { "epoch": 0.7320110192837466, "grad_norm": 5.770527362823486, "learning_rate": 1.7017375630862791e-06, "loss": 0.4157, "step": 6643 }, { "epoch": 0.7321212121212122, "grad_norm": 4.718587875366211, "learning_rate": 1.700423690219391e-06, "loss": 0.4002, "step": 6644 }, { "epoch": 0.7322314049586777, "grad_norm": 5.69619607925415, "learning_rate": 1.6991102208291777e-06, "loss": 0.4224, "step": 6645 }, { "epoch": 0.7323415977961433, "grad_norm": 4.270539283752441, "learning_rate": 1.6977971550762484e-06, "loss": 0.3277, "step": 6646 }, { "epoch": 0.7324517906336088, "grad_norm": 8.055335998535156, "learning_rate": 1.6964844931211689e-06, "loss": 0.3926, "step": 6647 }, { "epoch": 0.7325619834710744, "grad_norm": 4.8165812492370605, "learning_rate": 1.6951722351244542e-06, "loss": 0.3115, "step": 6648 }, { "epoch": 0.73267217630854, "grad_norm": 7.0699310302734375, "learning_rate": 1.6938603812465666e-06, "loss": 0.4445, "step": 6649 }, { "epoch": 0.7327823691460055, "grad_norm": 8.66176986694336, "learning_rate": 1.6925489316479226e-06, "loss": 0.457, "step": 6650 }, { "epoch": 0.7328925619834711, "grad_norm": 4.818056106567383, "learning_rate": 1.691237886488889e-06, "loss": 0.4217, "step": 6651 }, { "epoch": 0.7330027548209367, "grad_norm": 7.170517444610596, "learning_rate": 1.68992724592978e-06, "loss": 0.4272, "step": 6652 }, { "epoch": 0.7331129476584022, "grad_norm": 5.2569169998168945, "learning_rate": 1.6886170101308652e-06, "loss": 0.3411, "step": 6653 }, { "epoch": 0.7332231404958678, "grad_norm": 4.8793110847473145, "learning_rate": 1.6873071792523572e-06, "loss": 0.4019, "step": 6654 }, { "epoch": 0.7333333333333333, "grad_norm": 4.878707408905029, "learning_rate": 1.6859977534544302e-06, "loss": 0.3145, "step": 6655 }, { "epoch": 0.7334435261707989, "grad_norm": 9.079961776733398, "learning_rate": 1.6846887328972e-06, "loss": 0.4538, "step": 6656 }, { "epoch": 0.7335537190082645, "grad_norm": 6.583092212677002, "learning_rate": 1.6833801177407316e-06, "loss": 0.4319, "step": 6657 }, { "epoch": 0.73366391184573, "grad_norm": 9.224050521850586, "learning_rate": 1.6820719081450505e-06, "loss": 0.4834, "step": 6658 }, { "epoch": 0.7337741046831956, "grad_norm": 5.355169773101807, "learning_rate": 1.680764104270124e-06, "loss": 0.3627, "step": 6659 }, { "epoch": 0.7338842975206612, "grad_norm": 10.7400484085083, "learning_rate": 1.6794567062758694e-06, "loss": 0.4701, "step": 6660 }, { "epoch": 0.7339944903581267, "grad_norm": 6.861835956573486, "learning_rate": 1.6781497143221592e-06, "loss": 0.297, "step": 6661 }, { "epoch": 0.7341046831955923, "grad_norm": 7.219264507293701, "learning_rate": 1.6768431285688164e-06, "loss": 0.381, "step": 6662 }, { "epoch": 0.7342148760330579, "grad_norm": 4.299060821533203, "learning_rate": 1.675536949175608e-06, "loss": 0.3768, "step": 6663 }, { "epoch": 0.7343250688705234, "grad_norm": 6.239497184753418, "learning_rate": 1.6742311763022574e-06, "loss": 0.3646, "step": 6664 }, { "epoch": 0.734435261707989, "grad_norm": 14.268431663513184, "learning_rate": 1.6729258101084377e-06, "loss": 0.4536, "step": 6665 }, { "epoch": 0.7345454545454545, "grad_norm": 5.3448991775512695, "learning_rate": 1.6716208507537673e-06, "loss": 0.3741, "step": 6666 }, { "epoch": 0.7346556473829201, "grad_norm": 5.006895065307617, "learning_rate": 1.670316298397821e-06, "loss": 0.4255, "step": 6667 }, { "epoch": 0.7347658402203857, "grad_norm": 12.607826232910156, "learning_rate": 1.6690121532001202e-06, "loss": 0.4501, "step": 6668 }, { "epoch": 0.7348760330578512, "grad_norm": 6.037322998046875, "learning_rate": 1.66770841532014e-06, "loss": 0.3252, "step": 6669 }, { "epoch": 0.7349862258953168, "grad_norm": 7.4917097091674805, "learning_rate": 1.6664050849172997e-06, "loss": 0.4251, "step": 6670 }, { "epoch": 0.7350964187327824, "grad_norm": 6.970833778381348, "learning_rate": 1.6651021621509738e-06, "loss": 0.3062, "step": 6671 }, { "epoch": 0.7352066115702479, "grad_norm": 13.247237205505371, "learning_rate": 1.6637996471804868e-06, "loss": 0.4118, "step": 6672 }, { "epoch": 0.7353168044077135, "grad_norm": 7.240694522857666, "learning_rate": 1.6624975401651095e-06, "loss": 0.4215, "step": 6673 }, { "epoch": 0.735426997245179, "grad_norm": 5.27970027923584, "learning_rate": 1.6611958412640667e-06, "loss": 0.3615, "step": 6674 }, { "epoch": 0.7355371900826446, "grad_norm": 13.812773704528809, "learning_rate": 1.6598945506365327e-06, "loss": 0.4767, "step": 6675 }, { "epoch": 0.7356473829201102, "grad_norm": 12.22442626953125, "learning_rate": 1.6585936684416305e-06, "loss": 0.3072, "step": 6676 }, { "epoch": 0.7357575757575757, "grad_norm": 4.643481731414795, "learning_rate": 1.6572931948384301e-06, "loss": 0.3702, "step": 6677 }, { "epoch": 0.7358677685950413, "grad_norm": 4.2442779541015625, "learning_rate": 1.6559931299859617e-06, "loss": 0.3646, "step": 6678 }, { "epoch": 0.7359779614325069, "grad_norm": 4.7474799156188965, "learning_rate": 1.6546934740431958e-06, "loss": 0.4153, "step": 6679 }, { "epoch": 0.7360881542699724, "grad_norm": 4.34524393081665, "learning_rate": 1.6533942271690528e-06, "loss": 0.3845, "step": 6680 }, { "epoch": 0.736198347107438, "grad_norm": 8.834986686706543, "learning_rate": 1.6520953895224128e-06, "loss": 0.4496, "step": 6681 }, { "epoch": 0.7363085399449035, "grad_norm": 5.882931232452393, "learning_rate": 1.6507969612620949e-06, "loss": 0.3919, "step": 6682 }, { "epoch": 0.7364187327823691, "grad_norm": 5.51677942276001, "learning_rate": 1.6494989425468737e-06, "loss": 0.3354, "step": 6683 }, { "epoch": 0.7365289256198347, "grad_norm": 5.615901470184326, "learning_rate": 1.6482013335354746e-06, "loss": 0.3891, "step": 6684 }, { "epoch": 0.7366391184573002, "grad_norm": 12.293294906616211, "learning_rate": 1.6469041343865683e-06, "loss": 0.5426, "step": 6685 }, { "epoch": 0.7367493112947658, "grad_norm": 11.403398513793945, "learning_rate": 1.64560734525878e-06, "loss": 0.3571, "step": 6686 }, { "epoch": 0.7368595041322314, "grad_norm": 4.734846115112305, "learning_rate": 1.644310966310681e-06, "loss": 0.3627, "step": 6687 }, { "epoch": 0.7369696969696969, "grad_norm": 8.784213066101074, "learning_rate": 1.6430149977007953e-06, "loss": 0.4197, "step": 6688 }, { "epoch": 0.7370798898071625, "grad_norm": 9.416754722595215, "learning_rate": 1.641719439587597e-06, "loss": 0.4473, "step": 6689 }, { "epoch": 0.7371900826446282, "grad_norm": 12.03082275390625, "learning_rate": 1.640424292129506e-06, "loss": 0.4932, "step": 6690 }, { "epoch": 0.7373002754820936, "grad_norm": 7.539887428283691, "learning_rate": 1.6391295554848957e-06, "loss": 0.4544, "step": 6691 }, { "epoch": 0.7374104683195593, "grad_norm": 8.073715209960938, "learning_rate": 1.637835229812091e-06, "loss": 0.3384, "step": 6692 }, { "epoch": 0.7375206611570247, "grad_norm": 5.894067764282227, "learning_rate": 1.6365413152693594e-06, "loss": 0.4213, "step": 6693 }, { "epoch": 0.7376308539944904, "grad_norm": 5.787919521331787, "learning_rate": 1.6352478120149245e-06, "loss": 0.3866, "step": 6694 }, { "epoch": 0.737741046831956, "grad_norm": 5.080339431762695, "learning_rate": 1.6339547202069594e-06, "loss": 0.4116, "step": 6695 }, { "epoch": 0.7378512396694215, "grad_norm": 4.942465782165527, "learning_rate": 1.6326620400035819e-06, "loss": 0.3744, "step": 6696 }, { "epoch": 0.7379614325068871, "grad_norm": 4.5060834884643555, "learning_rate": 1.631369771562864e-06, "loss": 0.3505, "step": 6697 }, { "epoch": 0.7380716253443527, "grad_norm": 8.70274829864502, "learning_rate": 1.630077915042828e-06, "loss": 0.406, "step": 6698 }, { "epoch": 0.7381818181818182, "grad_norm": 6.8086934089660645, "learning_rate": 1.6287864706014406e-06, "loss": 0.4207, "step": 6699 }, { "epoch": 0.7382920110192838, "grad_norm": 5.991214752197266, "learning_rate": 1.627495438396623e-06, "loss": 0.4243, "step": 6700 }, { "epoch": 0.7384022038567493, "grad_norm": 5.16525936126709, "learning_rate": 1.6262048185862456e-06, "loss": 0.4154, "step": 6701 }, { "epoch": 0.7385123966942149, "grad_norm": 4.822534084320068, "learning_rate": 1.6249146113281245e-06, "loss": 0.3613, "step": 6702 }, { "epoch": 0.7386225895316805, "grad_norm": 20.175262451171875, "learning_rate": 1.6236248167800295e-06, "loss": 0.5235, "step": 6703 }, { "epoch": 0.738732782369146, "grad_norm": 15.914359092712402, "learning_rate": 1.6223354350996795e-06, "loss": 0.4973, "step": 6704 }, { "epoch": 0.7388429752066116, "grad_norm": 6.135737895965576, "learning_rate": 1.621046466444739e-06, "loss": 0.4133, "step": 6705 }, { "epoch": 0.7389531680440772, "grad_norm": 12.328907012939453, "learning_rate": 1.6197579109728268e-06, "loss": 0.5264, "step": 6706 }, { "epoch": 0.7390633608815427, "grad_norm": 8.680732727050781, "learning_rate": 1.6184697688415102e-06, "loss": 0.4007, "step": 6707 }, { "epoch": 0.7391735537190083, "grad_norm": 9.753934860229492, "learning_rate": 1.6171820402083022e-06, "loss": 0.3916, "step": 6708 }, { "epoch": 0.7392837465564738, "grad_norm": 12.979923248291016, "learning_rate": 1.6158947252306707e-06, "loss": 0.4962, "step": 6709 }, { "epoch": 0.7393939393939394, "grad_norm": 10.228775978088379, "learning_rate": 1.6146078240660258e-06, "loss": 0.4981, "step": 6710 }, { "epoch": 0.739504132231405, "grad_norm": 8.834919929504395, "learning_rate": 1.6133213368717381e-06, "loss": 0.4441, "step": 6711 }, { "epoch": 0.7396143250688705, "grad_norm": 6.35507345199585, "learning_rate": 1.6120352638051178e-06, "loss": 0.3639, "step": 6712 }, { "epoch": 0.7397245179063361, "grad_norm": 8.341972351074219, "learning_rate": 1.6107496050234244e-06, "loss": 0.4281, "step": 6713 }, { "epoch": 0.7398347107438017, "grad_norm": 5.324977874755859, "learning_rate": 1.609464360683876e-06, "loss": 0.385, "step": 6714 }, { "epoch": 0.7399449035812672, "grad_norm": 6.933832168579102, "learning_rate": 1.6081795309436315e-06, "loss": 0.3917, "step": 6715 }, { "epoch": 0.7400550964187328, "grad_norm": 11.22779655456543, "learning_rate": 1.6068951159597984e-06, "loss": 0.5071, "step": 6716 }, { "epoch": 0.7401652892561984, "grad_norm": 11.70495891571045, "learning_rate": 1.605611115889442e-06, "loss": 0.429, "step": 6717 }, { "epoch": 0.7402754820936639, "grad_norm": 5.684927463531494, "learning_rate": 1.60432753088957e-06, "loss": 0.4069, "step": 6718 }, { "epoch": 0.7403856749311295, "grad_norm": 4.83610725402832, "learning_rate": 1.6030443611171381e-06, "loss": 0.3182, "step": 6719 }, { "epoch": 0.740495867768595, "grad_norm": 6.312492847442627, "learning_rate": 1.601761606729056e-06, "loss": 0.3978, "step": 6720 }, { "epoch": 0.7406060606060606, "grad_norm": 7.136549949645996, "learning_rate": 1.6004792678821823e-06, "loss": 0.4525, "step": 6721 }, { "epoch": 0.7407162534435262, "grad_norm": 7.6813859939575195, "learning_rate": 1.5991973447333198e-06, "loss": 0.3168, "step": 6722 }, { "epoch": 0.7408264462809917, "grad_norm": 8.03650188446045, "learning_rate": 1.5979158374392257e-06, "loss": 0.4422, "step": 6723 }, { "epoch": 0.7409366391184573, "grad_norm": 8.230238914489746, "learning_rate": 1.596634746156604e-06, "loss": 0.4155, "step": 6724 }, { "epoch": 0.7410468319559229, "grad_norm": 4.2329630851745605, "learning_rate": 1.5953540710421106e-06, "loss": 0.3601, "step": 6725 }, { "epoch": 0.7411570247933884, "grad_norm": 5.544435024261475, "learning_rate": 1.5940738122523442e-06, "loss": 0.3858, "step": 6726 }, { "epoch": 0.741267217630854, "grad_norm": 9.27285099029541, "learning_rate": 1.5927939699438588e-06, "loss": 0.3882, "step": 6727 }, { "epoch": 0.7413774104683195, "grad_norm": 7.3773956298828125, "learning_rate": 1.5915145442731566e-06, "loss": 0.432, "step": 6728 }, { "epoch": 0.7414876033057851, "grad_norm": 15.690343856811523, "learning_rate": 1.5902355353966843e-06, "loss": 0.5766, "step": 6729 }, { "epoch": 0.7415977961432507, "grad_norm": 5.725202560424805, "learning_rate": 1.5889569434708418e-06, "loss": 0.3884, "step": 6730 }, { "epoch": 0.7417079889807162, "grad_norm": 8.445252418518066, "learning_rate": 1.58767876865198e-06, "loss": 0.4343, "step": 6731 }, { "epoch": 0.7418181818181818, "grad_norm": 9.029854774475098, "learning_rate": 1.5864010110963919e-06, "loss": 0.4192, "step": 6732 }, { "epoch": 0.7419283746556474, "grad_norm": 5.040360927581787, "learning_rate": 1.5851236709603246e-06, "loss": 0.3588, "step": 6733 }, { "epoch": 0.7420385674931129, "grad_norm": 6.786897659301758, "learning_rate": 1.5838467483999753e-06, "loss": 0.3826, "step": 6734 }, { "epoch": 0.7421487603305785, "grad_norm": 5.710162162780762, "learning_rate": 1.5825702435714862e-06, "loss": 0.3959, "step": 6735 }, { "epoch": 0.742258953168044, "grad_norm": 4.672428607940674, "learning_rate": 1.5812941566309464e-06, "loss": 0.3819, "step": 6736 }, { "epoch": 0.7423691460055096, "grad_norm": 9.552863121032715, "learning_rate": 1.5800184877344044e-06, "loss": 0.4042, "step": 6737 }, { "epoch": 0.7424793388429752, "grad_norm": 7.347883224487305, "learning_rate": 1.578743237037846e-06, "loss": 0.3994, "step": 6738 }, { "epoch": 0.7425895316804407, "grad_norm": 5.481398105621338, "learning_rate": 1.5774684046972111e-06, "loss": 0.375, "step": 6739 }, { "epoch": 0.7426997245179063, "grad_norm": 5.58358907699585, "learning_rate": 1.576193990868391e-06, "loss": 0.4026, "step": 6740 }, { "epoch": 0.7428099173553719, "grad_norm": 4.587759971618652, "learning_rate": 1.5749199957072187e-06, "loss": 0.3508, "step": 6741 }, { "epoch": 0.7429201101928374, "grad_norm": 6.659691333770752, "learning_rate": 1.5736464193694834e-06, "loss": 0.4087, "step": 6742 }, { "epoch": 0.743030303030303, "grad_norm": 6.995134353637695, "learning_rate": 1.5723732620109167e-06, "loss": 0.3273, "step": 6743 }, { "epoch": 0.7431404958677686, "grad_norm": 5.064960479736328, "learning_rate": 1.571100523787203e-06, "loss": 0.4108, "step": 6744 }, { "epoch": 0.7432506887052341, "grad_norm": 6.673768520355225, "learning_rate": 1.569828204853977e-06, "loss": 0.3932, "step": 6745 }, { "epoch": 0.7433608815426997, "grad_norm": 5.711964130401611, "learning_rate": 1.5685563053668158e-06, "loss": 0.3903, "step": 6746 }, { "epoch": 0.7434710743801652, "grad_norm": 5.0760297775268555, "learning_rate": 1.5672848254812506e-06, "loss": 0.3683, "step": 6747 }, { "epoch": 0.7435812672176308, "grad_norm": 5.213315010070801, "learning_rate": 1.5660137653527619e-06, "loss": 0.3581, "step": 6748 }, { "epoch": 0.7436914600550965, "grad_norm": 4.705984115600586, "learning_rate": 1.5647431251367728e-06, "loss": 0.371, "step": 6749 }, { "epoch": 0.743801652892562, "grad_norm": 6.9832587242126465, "learning_rate": 1.5634729049886604e-06, "loss": 0.4204, "step": 6750 }, { "epoch": 0.7439118457300276, "grad_norm": 5.852503299713135, "learning_rate": 1.5622031050637509e-06, "loss": 0.4339, "step": 6751 }, { "epoch": 0.7440220385674932, "grad_norm": 9.679864883422852, "learning_rate": 1.560933725517314e-06, "loss": 0.3648, "step": 6752 }, { "epoch": 0.7441322314049587, "grad_norm": 9.233565330505371, "learning_rate": 1.5596647665045728e-06, "loss": 0.4009, "step": 6753 }, { "epoch": 0.7442424242424243, "grad_norm": 9.235274314880371, "learning_rate": 1.5583962281806987e-06, "loss": 0.4592, "step": 6754 }, { "epoch": 0.7443526170798898, "grad_norm": 7.1648850440979, "learning_rate": 1.5571281107008073e-06, "loss": 0.459, "step": 6755 }, { "epoch": 0.7444628099173554, "grad_norm": 8.624975204467773, "learning_rate": 1.5558604142199668e-06, "loss": 0.4312, "step": 6756 }, { "epoch": 0.744573002754821, "grad_norm": 13.39470386505127, "learning_rate": 1.554593138893195e-06, "loss": 0.3769, "step": 6757 }, { "epoch": 0.7446831955922865, "grad_norm": 5.642722129821777, "learning_rate": 1.5533262848754533e-06, "loss": 0.3878, "step": 6758 }, { "epoch": 0.7447933884297521, "grad_norm": 6.0908203125, "learning_rate": 1.5520598523216546e-06, "loss": 0.4078, "step": 6759 }, { "epoch": 0.7449035812672177, "grad_norm": 9.905556678771973, "learning_rate": 1.5507938413866625e-06, "loss": 0.438, "step": 6760 }, { "epoch": 0.7450137741046832, "grad_norm": 7.522475719451904, "learning_rate": 1.549528252225283e-06, "loss": 0.387, "step": 6761 }, { "epoch": 0.7451239669421488, "grad_norm": 7.384159088134766, "learning_rate": 1.5482630849922764e-06, "loss": 0.4033, "step": 6762 }, { "epoch": 0.7452341597796144, "grad_norm": 7.429068088531494, "learning_rate": 1.5469983398423499e-06, "loss": 0.3876, "step": 6763 }, { "epoch": 0.7453443526170799, "grad_norm": 7.473761081695557, "learning_rate": 1.5457340169301549e-06, "loss": 0.4082, "step": 6764 }, { "epoch": 0.7454545454545455, "grad_norm": 6.6018967628479, "learning_rate": 1.5444701164102966e-06, "loss": 0.3422, "step": 6765 }, { "epoch": 0.745564738292011, "grad_norm": 6.712680816650391, "learning_rate": 1.5432066384373261e-06, "loss": 0.3877, "step": 6766 }, { "epoch": 0.7456749311294766, "grad_norm": 13.06218433380127, "learning_rate": 1.541943583165746e-06, "loss": 0.5377, "step": 6767 }, { "epoch": 0.7457851239669422, "grad_norm": 7.596041679382324, "learning_rate": 1.540680950750001e-06, "loss": 0.4444, "step": 6768 }, { "epoch": 0.7458953168044077, "grad_norm": 6.0458455085754395, "learning_rate": 1.539418741344485e-06, "loss": 0.3408, "step": 6769 }, { "epoch": 0.7460055096418733, "grad_norm": 6.200343608856201, "learning_rate": 1.5381569551035497e-06, "loss": 0.3376, "step": 6770 }, { "epoch": 0.7461157024793389, "grad_norm": 6.153450012207031, "learning_rate": 1.5368955921814844e-06, "loss": 0.2715, "step": 6771 }, { "epoch": 0.7462258953168044, "grad_norm": 4.4604878425598145, "learning_rate": 1.5356346527325273e-06, "loss": 0.4098, "step": 6772 }, { "epoch": 0.74633608815427, "grad_norm": 10.651001930236816, "learning_rate": 1.5343741369108733e-06, "loss": 0.5051, "step": 6773 }, { "epoch": 0.7464462809917355, "grad_norm": 5.188691139221191, "learning_rate": 1.5331140448706576e-06, "loss": 0.3197, "step": 6774 }, { "epoch": 0.7465564738292011, "grad_norm": 5.437318325042725, "learning_rate": 1.5318543767659645e-06, "loss": 0.3382, "step": 6775 }, { "epoch": 0.7466666666666667, "grad_norm": 7.409314155578613, "learning_rate": 1.530595132750829e-06, "loss": 0.3382, "step": 6776 }, { "epoch": 0.7467768595041322, "grad_norm": 10.222969055175781, "learning_rate": 1.5293363129792348e-06, "loss": 0.3694, "step": 6777 }, { "epoch": 0.7468870523415978, "grad_norm": 17.013992309570312, "learning_rate": 1.5280779176051096e-06, "loss": 0.4434, "step": 6778 }, { "epoch": 0.7469972451790634, "grad_norm": 11.93150520324707, "learning_rate": 1.5268199467823324e-06, "loss": 0.4627, "step": 6779 }, { "epoch": 0.7471074380165289, "grad_norm": 6.485506534576416, "learning_rate": 1.52556240066473e-06, "loss": 0.3605, "step": 6780 }, { "epoch": 0.7472176308539945, "grad_norm": 8.435442924499512, "learning_rate": 1.5243052794060785e-06, "loss": 0.481, "step": 6781 }, { "epoch": 0.74732782369146, "grad_norm": 5.366856098175049, "learning_rate": 1.523048583160097e-06, "loss": 0.2738, "step": 6782 }, { "epoch": 0.7474380165289256, "grad_norm": 8.27634334564209, "learning_rate": 1.5217923120804578e-06, "loss": 0.4658, "step": 6783 }, { "epoch": 0.7475482093663912, "grad_norm": 7.089442253112793, "learning_rate": 1.5205364663207811e-06, "loss": 0.414, "step": 6784 }, { "epoch": 0.7476584022038567, "grad_norm": 4.029046535491943, "learning_rate": 1.5192810460346302e-06, "loss": 0.2938, "step": 6785 }, { "epoch": 0.7477685950413223, "grad_norm": 3.518596887588501, "learning_rate": 1.5180260513755207e-06, "loss": 0.3626, "step": 6786 }, { "epoch": 0.7478787878787879, "grad_norm": 5.997016906738281, "learning_rate": 1.516771482496917e-06, "loss": 0.3698, "step": 6787 }, { "epoch": 0.7479889807162534, "grad_norm": 5.003194332122803, "learning_rate": 1.5155173395522266e-06, "loss": 0.4407, "step": 6788 }, { "epoch": 0.748099173553719, "grad_norm": 6.588343143463135, "learning_rate": 1.5142636226948087e-06, "loss": 0.3232, "step": 6789 }, { "epoch": 0.7482093663911846, "grad_norm": 4.953454971313477, "learning_rate": 1.513010332077972e-06, "loss": 0.395, "step": 6790 }, { "epoch": 0.7483195592286501, "grad_norm": 5.947750091552734, "learning_rate": 1.5117574678549667e-06, "loss": 0.3361, "step": 6791 }, { "epoch": 0.7484297520661157, "grad_norm": 6.818279266357422, "learning_rate": 1.5105050301789965e-06, "loss": 0.4242, "step": 6792 }, { "epoch": 0.7485399449035812, "grad_norm": 5.5788493156433105, "learning_rate": 1.509253019203213e-06, "loss": 0.3882, "step": 6793 }, { "epoch": 0.7486501377410468, "grad_norm": 6.290981769561768, "learning_rate": 1.5080014350807104e-06, "loss": 0.378, "step": 6794 }, { "epoch": 0.7487603305785124, "grad_norm": 7.344674110412598, "learning_rate": 1.5067502779645353e-06, "loss": 0.3328, "step": 6795 }, { "epoch": 0.7488705234159779, "grad_norm": 7.661288261413574, "learning_rate": 1.5054995480076833e-06, "loss": 0.4539, "step": 6796 }, { "epoch": 0.7489807162534435, "grad_norm": 8.083108901977539, "learning_rate": 1.5042492453630918e-06, "loss": 0.4032, "step": 6797 }, { "epoch": 0.7490909090909091, "grad_norm": 6.651820659637451, "learning_rate": 1.5029993701836514e-06, "loss": 0.3634, "step": 6798 }, { "epoch": 0.7492011019283746, "grad_norm": 8.187838554382324, "learning_rate": 1.5017499226221993e-06, "loss": 0.4247, "step": 6799 }, { "epoch": 0.7493112947658402, "grad_norm": 3.927823781967163, "learning_rate": 1.500500902831517e-06, "loss": 0.354, "step": 6800 }, { "epoch": 0.7494214876033057, "grad_norm": 4.890981674194336, "learning_rate": 1.4992523109643398e-06, "loss": 0.3147, "step": 6801 }, { "epoch": 0.7495316804407713, "grad_norm": 6.409554958343506, "learning_rate": 1.4980041471733436e-06, "loss": 0.2929, "step": 6802 }, { "epoch": 0.749641873278237, "grad_norm": 9.22906494140625, "learning_rate": 1.4967564116111571e-06, "loss": 0.4626, "step": 6803 }, { "epoch": 0.7497520661157024, "grad_norm": 5.69516134262085, "learning_rate": 1.4955091044303572e-06, "loss": 0.4332, "step": 6804 }, { "epoch": 0.749862258953168, "grad_norm": 8.6701078414917, "learning_rate": 1.4942622257834626e-06, "loss": 0.3755, "step": 6805 }, { "epoch": 0.7499724517906337, "grad_norm": 6.685476303100586, "learning_rate": 1.4930157758229451e-06, "loss": 0.3849, "step": 6806 }, { "epoch": 0.7500826446280991, "grad_norm": 5.954440593719482, "learning_rate": 1.4917697547012239e-06, "loss": 0.4499, "step": 6807 }, { "epoch": 0.7501928374655648, "grad_norm": 6.628360748291016, "learning_rate": 1.4905241625706613e-06, "loss": 0.4078, "step": 6808 }, { "epoch": 0.7503030303030302, "grad_norm": 5.020409107208252, "learning_rate": 1.4892789995835706e-06, "loss": 0.4012, "step": 6809 }, { "epoch": 0.7504132231404959, "grad_norm": 6.6440253257751465, "learning_rate": 1.4880342658922148e-06, "loss": 0.4026, "step": 6810 }, { "epoch": 0.7505234159779615, "grad_norm": 4.822587966918945, "learning_rate": 1.4867899616487974e-06, "loss": 0.4146, "step": 6811 }, { "epoch": 0.750633608815427, "grad_norm": 6.03671407699585, "learning_rate": 1.485546087005476e-06, "loss": 0.3777, "step": 6812 }, { "epoch": 0.7507438016528926, "grad_norm": 13.897258758544922, "learning_rate": 1.4843026421143547e-06, "loss": 0.557, "step": 6813 }, { "epoch": 0.7508539944903582, "grad_norm": 8.15986156463623, "learning_rate": 1.4830596271274806e-06, "loss": 0.3939, "step": 6814 }, { "epoch": 0.7509641873278237, "grad_norm": 5.397086143493652, "learning_rate": 1.4818170421968519e-06, "loss": 0.4204, "step": 6815 }, { "epoch": 0.7510743801652893, "grad_norm": 8.237540245056152, "learning_rate": 1.4805748874744163e-06, "loss": 0.3773, "step": 6816 }, { "epoch": 0.7511845730027549, "grad_norm": 4.589722633361816, "learning_rate": 1.4793331631120628e-06, "loss": 0.3714, "step": 6817 }, { "epoch": 0.7512947658402204, "grad_norm": 4.802692890167236, "learning_rate": 1.4780918692616319e-06, "loss": 0.3222, "step": 6818 }, { "epoch": 0.751404958677686, "grad_norm": 16.29314422607422, "learning_rate": 1.476851006074913e-06, "loss": 0.5529, "step": 6819 }, { "epoch": 0.7515151515151515, "grad_norm": 10.048425674438477, "learning_rate": 1.4756105737036375e-06, "loss": 0.415, "step": 6820 }, { "epoch": 0.7516253443526171, "grad_norm": 5.574573516845703, "learning_rate": 1.4743705722994884e-06, "loss": 0.3419, "step": 6821 }, { "epoch": 0.7517355371900827, "grad_norm": 7.75213623046875, "learning_rate": 1.4731310020140944e-06, "loss": 0.3342, "step": 6822 }, { "epoch": 0.7518457300275482, "grad_norm": 8.48292350769043, "learning_rate": 1.4718918629990342e-06, "loss": 0.4073, "step": 6823 }, { "epoch": 0.7519559228650138, "grad_norm": 9.394196510314941, "learning_rate": 1.4706531554058278e-06, "loss": 0.4546, "step": 6824 }, { "epoch": 0.7520661157024794, "grad_norm": 9.676992416381836, "learning_rate": 1.4694148793859475e-06, "loss": 0.3651, "step": 6825 }, { "epoch": 0.7521763085399449, "grad_norm": 5.157181262969971, "learning_rate": 1.4681770350908136e-06, "loss": 0.3479, "step": 6826 }, { "epoch": 0.7522865013774105, "grad_norm": 5.338240623474121, "learning_rate": 1.466939622671789e-06, "loss": 0.3968, "step": 6827 }, { "epoch": 0.752396694214876, "grad_norm": 6.495171070098877, "learning_rate": 1.4657026422801835e-06, "loss": 0.4372, "step": 6828 }, { "epoch": 0.7525068870523416, "grad_norm": 5.0497636795043945, "learning_rate": 1.4644660940672628e-06, "loss": 0.3542, "step": 6829 }, { "epoch": 0.7526170798898072, "grad_norm": 11.376801490783691, "learning_rate": 1.4632299781842307e-06, "loss": 0.4453, "step": 6830 }, { "epoch": 0.7527272727272727, "grad_norm": 4.18069314956665, "learning_rate": 1.4619942947822379e-06, "loss": 0.3305, "step": 6831 }, { "epoch": 0.7528374655647383, "grad_norm": 9.020021438598633, "learning_rate": 1.460759044012392e-06, "loss": 0.5502, "step": 6832 }, { "epoch": 0.7529476584022039, "grad_norm": 6.928681373596191, "learning_rate": 1.4595242260257381e-06, "loss": 0.4342, "step": 6833 }, { "epoch": 0.7530578512396694, "grad_norm": 5.69641637802124, "learning_rate": 1.4582898409732687e-06, "loss": 0.4027, "step": 6834 }, { "epoch": 0.753168044077135, "grad_norm": 5.6361985206604, "learning_rate": 1.4570558890059288e-06, "loss": 0.3301, "step": 6835 }, { "epoch": 0.7532782369146005, "grad_norm": 5.400755405426025, "learning_rate": 1.4558223702746093e-06, "loss": 0.2976, "step": 6836 }, { "epoch": 0.7533884297520661, "grad_norm": 15.730646133422852, "learning_rate": 1.4545892849301429e-06, "loss": 0.4039, "step": 6837 }, { "epoch": 0.7534986225895317, "grad_norm": 4.789764881134033, "learning_rate": 1.4533566331233145e-06, "loss": 0.3851, "step": 6838 }, { "epoch": 0.7536088154269972, "grad_norm": 5.8264336585998535, "learning_rate": 1.4521244150048552e-06, "loss": 0.4435, "step": 6839 }, { "epoch": 0.7537190082644628, "grad_norm": 6.7339043617248535, "learning_rate": 1.4508926307254427e-06, "loss": 0.3985, "step": 6840 }, { "epoch": 0.7538292011019284, "grad_norm": 7.948024272918701, "learning_rate": 1.4496612804356991e-06, "loss": 0.3228, "step": 6841 }, { "epoch": 0.7539393939393939, "grad_norm": 13.498283386230469, "learning_rate": 1.448430364286197e-06, "loss": 0.4703, "step": 6842 }, { "epoch": 0.7540495867768595, "grad_norm": 5.53570556640625, "learning_rate": 1.4471998824274553e-06, "loss": 0.4111, "step": 6843 }, { "epoch": 0.7541597796143251, "grad_norm": 4.883918762207031, "learning_rate": 1.4459698350099365e-06, "loss": 0.3549, "step": 6844 }, { "epoch": 0.7542699724517906, "grad_norm": 5.303241729736328, "learning_rate": 1.444740222184054e-06, "loss": 0.4668, "step": 6845 }, { "epoch": 0.7543801652892562, "grad_norm": 7.4251532554626465, "learning_rate": 1.4435110441001683e-06, "loss": 0.401, "step": 6846 }, { "epoch": 0.7544903581267217, "grad_norm": 14.293661117553711, "learning_rate": 1.4422823009085812e-06, "loss": 0.5331, "step": 6847 }, { "epoch": 0.7546005509641873, "grad_norm": 6.829413414001465, "learning_rate": 1.4410539927595474e-06, "loss": 0.3936, "step": 6848 }, { "epoch": 0.7547107438016529, "grad_norm": 5.234549522399902, "learning_rate": 1.4398261198032671e-06, "loss": 0.3203, "step": 6849 }, { "epoch": 0.7548209366391184, "grad_norm": 15.444355010986328, "learning_rate": 1.4385986821898834e-06, "loss": 0.4065, "step": 6850 }, { "epoch": 0.754931129476584, "grad_norm": 5.0838303565979, "learning_rate": 1.437371680069491e-06, "loss": 0.4106, "step": 6851 }, { "epoch": 0.7550413223140496, "grad_norm": 5.2294816970825195, "learning_rate": 1.4361451135921296e-06, "loss": 0.305, "step": 6852 }, { "epoch": 0.7551515151515151, "grad_norm": 6.445193290710449, "learning_rate": 1.4349189829077837e-06, "loss": 0.318, "step": 6853 }, { "epoch": 0.7552617079889807, "grad_norm": 8.633602142333984, "learning_rate": 1.4336932881663868e-06, "loss": 0.3874, "step": 6854 }, { "epoch": 0.7553719008264462, "grad_norm": 7.066654682159424, "learning_rate": 1.4324680295178211e-06, "loss": 0.4553, "step": 6855 }, { "epoch": 0.7554820936639118, "grad_norm": 3.980142831802368, "learning_rate": 1.4312432071119086e-06, "loss": 0.3781, "step": 6856 }, { "epoch": 0.7555922865013774, "grad_norm": 6.00819206237793, "learning_rate": 1.430018821098425e-06, "loss": 0.3591, "step": 6857 }, { "epoch": 0.7557024793388429, "grad_norm": 18.058963775634766, "learning_rate": 1.4287948716270906e-06, "loss": 0.4094, "step": 6858 }, { "epoch": 0.7558126721763085, "grad_norm": 5.2950825691223145, "learning_rate": 1.4275713588475687e-06, "loss": 0.3766, "step": 6859 }, { "epoch": 0.7559228650137741, "grad_norm": 5.6798810958862305, "learning_rate": 1.4263482829094754e-06, "loss": 0.4094, "step": 6860 }, { "epoch": 0.7560330578512396, "grad_norm": 3.7791309356689453, "learning_rate": 1.4251256439623667e-06, "loss": 0.3591, "step": 6861 }, { "epoch": 0.7561432506887052, "grad_norm": 10.203629493713379, "learning_rate": 1.4239034421557501e-06, "loss": 0.4784, "step": 6862 }, { "epoch": 0.7562534435261709, "grad_norm": 4.767711639404297, "learning_rate": 1.42268167763908e-06, "loss": 0.351, "step": 6863 }, { "epoch": 0.7563636363636363, "grad_norm": 6.482807159423828, "learning_rate": 1.4214603505617525e-06, "loss": 0.3775, "step": 6864 }, { "epoch": 0.756473829201102, "grad_norm": 5.963954925537109, "learning_rate": 1.420239461073114e-06, "loss": 0.3416, "step": 6865 }, { "epoch": 0.7565840220385674, "grad_norm": 5.087156295776367, "learning_rate": 1.4190190093224582e-06, "loss": 0.4816, "step": 6866 }, { "epoch": 0.7566942148760331, "grad_norm": 15.355376243591309, "learning_rate": 1.417798995459021e-06, "loss": 0.436, "step": 6867 }, { "epoch": 0.7568044077134987, "grad_norm": 4.720404148101807, "learning_rate": 1.4165794196319881e-06, "loss": 0.3455, "step": 6868 }, { "epoch": 0.7569146005509642, "grad_norm": 5.853130340576172, "learning_rate": 1.4153602819904933e-06, "loss": 0.3464, "step": 6869 }, { "epoch": 0.7570247933884298, "grad_norm": 7.015380382537842, "learning_rate": 1.4141415826836109e-06, "loss": 0.3958, "step": 6870 }, { "epoch": 0.7571349862258954, "grad_norm": 5.533911228179932, "learning_rate": 1.4129233218603666e-06, "loss": 0.3602, "step": 6871 }, { "epoch": 0.7572451790633609, "grad_norm": 7.721105575561523, "learning_rate": 1.4117054996697321e-06, "loss": 0.415, "step": 6872 }, { "epoch": 0.7573553719008265, "grad_norm": 5.225645542144775, "learning_rate": 1.4104881162606227e-06, "loss": 0.3553, "step": 6873 }, { "epoch": 0.757465564738292, "grad_norm": 7.83046293258667, "learning_rate": 1.409271171781902e-06, "loss": 0.337, "step": 6874 }, { "epoch": 0.7575757575757576, "grad_norm": 6.9218902587890625, "learning_rate": 1.4080546663823814e-06, "loss": 0.3887, "step": 6875 }, { "epoch": 0.7576859504132232, "grad_norm": 4.234103679656982, "learning_rate": 1.4068386002108137e-06, "loss": 0.3772, "step": 6876 }, { "epoch": 0.7577961432506887, "grad_norm": 5.8084001541137695, "learning_rate": 1.4056229734159026e-06, "loss": 0.406, "step": 6877 }, { "epoch": 0.7579063360881543, "grad_norm": 6.415094375610352, "learning_rate": 1.4044077861462984e-06, "loss": 0.406, "step": 6878 }, { "epoch": 0.7580165289256199, "grad_norm": 11.293000221252441, "learning_rate": 1.4031930385505931e-06, "loss": 0.4479, "step": 6879 }, { "epoch": 0.7581267217630854, "grad_norm": 4.440645217895508, "learning_rate": 1.4019787307773285e-06, "loss": 0.3946, "step": 6880 }, { "epoch": 0.758236914600551, "grad_norm": 7.0031819343566895, "learning_rate": 1.4007648629749925e-06, "loss": 0.4065, "step": 6881 }, { "epoch": 0.7583471074380165, "grad_norm": 5.336536884307861, "learning_rate": 1.3995514352920197e-06, "loss": 0.3794, "step": 6882 }, { "epoch": 0.7584573002754821, "grad_norm": 8.416298866271973, "learning_rate": 1.3983384478767865e-06, "loss": 0.4651, "step": 6883 }, { "epoch": 0.7585674931129477, "grad_norm": 6.324009895324707, "learning_rate": 1.39712590087762e-06, "loss": 0.3646, "step": 6884 }, { "epoch": 0.7586776859504132, "grad_norm": 9.452508926391602, "learning_rate": 1.3959137944427942e-06, "loss": 0.4228, "step": 6885 }, { "epoch": 0.7587878787878788, "grad_norm": 9.684531211853027, "learning_rate": 1.3947021287205248e-06, "loss": 0.4493, "step": 6886 }, { "epoch": 0.7588980716253444, "grad_norm": 6.790139675140381, "learning_rate": 1.3934909038589738e-06, "loss": 0.4968, "step": 6887 }, { "epoch": 0.7590082644628099, "grad_norm": 5.268446445465088, "learning_rate": 1.392280120006257e-06, "loss": 0.3925, "step": 6888 }, { "epoch": 0.7591184573002755, "grad_norm": 4.70731258392334, "learning_rate": 1.3910697773104275e-06, "loss": 0.3949, "step": 6889 }, { "epoch": 0.7592286501377411, "grad_norm": 11.086029052734375, "learning_rate": 1.3898598759194848e-06, "loss": 0.486, "step": 6890 }, { "epoch": 0.7593388429752066, "grad_norm": 6.695804595947266, "learning_rate": 1.388650415981384e-06, "loss": 0.3652, "step": 6891 }, { "epoch": 0.7594490358126722, "grad_norm": 7.074174404144287, "learning_rate": 1.3874413976440154e-06, "loss": 0.4741, "step": 6892 }, { "epoch": 0.7595592286501377, "grad_norm": 7.051998138427734, "learning_rate": 1.3862328210552184e-06, "loss": 0.4229, "step": 6893 }, { "epoch": 0.7596694214876033, "grad_norm": 6.784617900848389, "learning_rate": 1.3850246863627809e-06, "loss": 0.4003, "step": 6894 }, { "epoch": 0.7597796143250689, "grad_norm": 13.098021507263184, "learning_rate": 1.3838169937144351e-06, "loss": 0.3385, "step": 6895 }, { "epoch": 0.7598898071625344, "grad_norm": 6.4965033531188965, "learning_rate": 1.3826097432578612e-06, "loss": 0.4186, "step": 6896 }, { "epoch": 0.76, "grad_norm": 4.233058452606201, "learning_rate": 1.3814029351406799e-06, "loss": 0.3435, "step": 6897 }, { "epoch": 0.7601101928374656, "grad_norm": 4.8349609375, "learning_rate": 1.3801965695104636e-06, "loss": 0.3597, "step": 6898 }, { "epoch": 0.7602203856749311, "grad_norm": 7.645735740661621, "learning_rate": 1.3789906465147284e-06, "loss": 0.4443, "step": 6899 }, { "epoch": 0.7603305785123967, "grad_norm": 6.398595809936523, "learning_rate": 1.3777851663009344e-06, "loss": 0.3722, "step": 6900 }, { "epoch": 0.7604407713498622, "grad_norm": 8.116183280944824, "learning_rate": 1.376580129016491e-06, "loss": 0.4389, "step": 6901 }, { "epoch": 0.7605509641873278, "grad_norm": 4.306082248687744, "learning_rate": 1.3753755348087527e-06, "loss": 0.4354, "step": 6902 }, { "epoch": 0.7606611570247934, "grad_norm": 5.3213300704956055, "learning_rate": 1.374171383825016e-06, "loss": 0.3615, "step": 6903 }, { "epoch": 0.7607713498622589, "grad_norm": 8.165018081665039, "learning_rate": 1.3729676762125276e-06, "loss": 0.3677, "step": 6904 }, { "epoch": 0.7608815426997245, "grad_norm": 9.73409366607666, "learning_rate": 1.3717644121184802e-06, "loss": 0.4883, "step": 6905 }, { "epoch": 0.7609917355371901, "grad_norm": 10.60772705078125, "learning_rate": 1.3705615916900072e-06, "loss": 0.3772, "step": 6906 }, { "epoch": 0.7611019283746556, "grad_norm": 6.223022937774658, "learning_rate": 1.369359215074193e-06, "loss": 0.3599, "step": 6907 }, { "epoch": 0.7612121212121212, "grad_norm": 5.9954986572265625, "learning_rate": 1.3681572824180679e-06, "loss": 0.3362, "step": 6908 }, { "epoch": 0.7613223140495867, "grad_norm": 4.7047882080078125, "learning_rate": 1.3669557938686012e-06, "loss": 0.3118, "step": 6909 }, { "epoch": 0.7614325068870523, "grad_norm": 6.071364879608154, "learning_rate": 1.3657547495727152e-06, "loss": 0.3369, "step": 6910 }, { "epoch": 0.7615426997245179, "grad_norm": 6.846749305725098, "learning_rate": 1.3645541496772768e-06, "loss": 0.3902, "step": 6911 }, { "epoch": 0.7616528925619834, "grad_norm": 13.499283790588379, "learning_rate": 1.363353994329094e-06, "loss": 0.4028, "step": 6912 }, { "epoch": 0.761763085399449, "grad_norm": 6.2844038009643555, "learning_rate": 1.362154283674924e-06, "loss": 0.3668, "step": 6913 }, { "epoch": 0.7618732782369146, "grad_norm": 28.452960968017578, "learning_rate": 1.3609550178614716e-06, "loss": 0.485, "step": 6914 }, { "epoch": 0.7619834710743801, "grad_norm": 5.711501598358154, "learning_rate": 1.3597561970353817e-06, "loss": 0.3805, "step": 6915 }, { "epoch": 0.7620936639118457, "grad_norm": 8.279010772705078, "learning_rate": 1.3585578213432482e-06, "loss": 0.4057, "step": 6916 }, { "epoch": 0.7622038567493113, "grad_norm": 5.910313129425049, "learning_rate": 1.3573598909316127e-06, "loss": 0.4284, "step": 6917 }, { "epoch": 0.7623140495867768, "grad_norm": 9.110838890075684, "learning_rate": 1.3561624059469559e-06, "loss": 0.4227, "step": 6918 }, { "epoch": 0.7624242424242424, "grad_norm": 5.608441352844238, "learning_rate": 1.354965366535711e-06, "loss": 0.4149, "step": 6919 }, { "epoch": 0.7625344352617079, "grad_norm": 5.326509952545166, "learning_rate": 1.3537687728442516e-06, "loss": 0.314, "step": 6920 }, { "epoch": 0.7626446280991735, "grad_norm": 12.109729766845703, "learning_rate": 1.352572625018899e-06, "loss": 0.4072, "step": 6921 }, { "epoch": 0.7627548209366392, "grad_norm": 7.114477634429932, "learning_rate": 1.351376923205922e-06, "loss": 0.4047, "step": 6922 }, { "epoch": 0.7628650137741046, "grad_norm": 11.916533470153809, "learning_rate": 1.3501816675515285e-06, "loss": 0.4535, "step": 6923 }, { "epoch": 0.7629752066115703, "grad_norm": 7.836859703063965, "learning_rate": 1.3489868582018807e-06, "loss": 0.4061, "step": 6924 }, { "epoch": 0.7630853994490359, "grad_norm": 3.2886035442352295, "learning_rate": 1.3477924953030796e-06, "loss": 0.3259, "step": 6925 }, { "epoch": 0.7631955922865014, "grad_norm": 4.548503398895264, "learning_rate": 1.346598579001172e-06, "loss": 0.347, "step": 6926 }, { "epoch": 0.763305785123967, "grad_norm": 5.381287574768066, "learning_rate": 1.3454051094421521e-06, "loss": 0.4272, "step": 6927 }, { "epoch": 0.7634159779614325, "grad_norm": 6.1449995040893555, "learning_rate": 1.344212086771962e-06, "loss": 0.4816, "step": 6928 }, { "epoch": 0.7635261707988981, "grad_norm": 5.375181198120117, "learning_rate": 1.3430195111364818e-06, "loss": 0.4253, "step": 6929 }, { "epoch": 0.7636363636363637, "grad_norm": 4.251392841339111, "learning_rate": 1.3418273826815437e-06, "loss": 0.3694, "step": 6930 }, { "epoch": 0.7637465564738292, "grad_norm": 5.311394691467285, "learning_rate": 1.3406357015529236e-06, "loss": 0.3869, "step": 6931 }, { "epoch": 0.7638567493112948, "grad_norm": 6.5930914878845215, "learning_rate": 1.3394444678963393e-06, "loss": 0.3366, "step": 6932 }, { "epoch": 0.7639669421487604, "grad_norm": 9.378667831420898, "learning_rate": 1.3382536818574576e-06, "loss": 0.4152, "step": 6933 }, { "epoch": 0.7640771349862259, "grad_norm": 7.3711771965026855, "learning_rate": 1.3370633435818913e-06, "loss": 0.445, "step": 6934 }, { "epoch": 0.7641873278236915, "grad_norm": 5.618210315704346, "learning_rate": 1.335873453215194e-06, "loss": 0.3235, "step": 6935 }, { "epoch": 0.764297520661157, "grad_norm": 5.0322184562683105, "learning_rate": 1.3346840109028674e-06, "loss": 0.4211, "step": 6936 }, { "epoch": 0.7644077134986226, "grad_norm": 6.689839839935303, "learning_rate": 1.3334950167903588e-06, "loss": 0.3492, "step": 6937 }, { "epoch": 0.7645179063360882, "grad_norm": 9.82962703704834, "learning_rate": 1.3323064710230622e-06, "loss": 0.4579, "step": 6938 }, { "epoch": 0.7646280991735537, "grad_norm": 6.276013374328613, "learning_rate": 1.3311183737463102e-06, "loss": 0.4161, "step": 6939 }, { "epoch": 0.7647382920110193, "grad_norm": 6.1520304679870605, "learning_rate": 1.3299307251053871e-06, "loss": 0.4232, "step": 6940 }, { "epoch": 0.7648484848484849, "grad_norm": 5.002037048339844, "learning_rate": 1.3287435252455221e-06, "loss": 0.3991, "step": 6941 }, { "epoch": 0.7649586776859504, "grad_norm": 7.803005695343018, "learning_rate": 1.3275567743118855e-06, "loss": 0.4935, "step": 6942 }, { "epoch": 0.765068870523416, "grad_norm": 5.232184410095215, "learning_rate": 1.3263704724495923e-06, "loss": 0.3817, "step": 6943 }, { "epoch": 0.7651790633608816, "grad_norm": 5.051543235778809, "learning_rate": 1.3251846198037104e-06, "loss": 0.3642, "step": 6944 }, { "epoch": 0.7652892561983471, "grad_norm": 9.887868881225586, "learning_rate": 1.3239992165192457e-06, "loss": 0.4108, "step": 6945 }, { "epoch": 0.7653994490358127, "grad_norm": 11.279367446899414, "learning_rate": 1.3228142627411468e-06, "loss": 0.4468, "step": 6946 }, { "epoch": 0.7655096418732782, "grad_norm": 6.569183826446533, "learning_rate": 1.3216297586143173e-06, "loss": 0.3612, "step": 6947 }, { "epoch": 0.7656198347107438, "grad_norm": 9.783895492553711, "learning_rate": 1.3204457042835984e-06, "loss": 0.4023, "step": 6948 }, { "epoch": 0.7657300275482094, "grad_norm": 10.846197128295898, "learning_rate": 1.3192620998937734e-06, "loss": 0.3935, "step": 6949 }, { "epoch": 0.7658402203856749, "grad_norm": 5.138780117034912, "learning_rate": 1.3180789455895814e-06, "loss": 0.3837, "step": 6950 }, { "epoch": 0.7659504132231405, "grad_norm": 10.842103958129883, "learning_rate": 1.3168962415156966e-06, "loss": 0.5067, "step": 6951 }, { "epoch": 0.7660606060606061, "grad_norm": 7.0031819343566895, "learning_rate": 1.3157139878167435e-06, "loss": 0.3931, "step": 6952 }, { "epoch": 0.7661707988980716, "grad_norm": 6.632589817047119, "learning_rate": 1.3145321846372866e-06, "loss": 0.4373, "step": 6953 }, { "epoch": 0.7662809917355372, "grad_norm": 4.488170146942139, "learning_rate": 1.3133508321218408e-06, "loss": 0.3398, "step": 6954 }, { "epoch": 0.7663911845730027, "grad_norm": 5.35597038269043, "learning_rate": 1.312169930414865e-06, "loss": 0.3409, "step": 6955 }, { "epoch": 0.7665013774104683, "grad_norm": 5.265271186828613, "learning_rate": 1.3109894796607576e-06, "loss": 0.377, "step": 6956 }, { "epoch": 0.7666115702479339, "grad_norm": 8.279088973999023, "learning_rate": 1.3098094800038674e-06, "loss": 0.4185, "step": 6957 }, { "epoch": 0.7667217630853994, "grad_norm": 4.7576904296875, "learning_rate": 1.3086299315884887e-06, "loss": 0.3353, "step": 6958 }, { "epoch": 0.766831955922865, "grad_norm": 6.927733421325684, "learning_rate": 1.3074508345588543e-06, "loss": 0.3909, "step": 6959 }, { "epoch": 0.7669421487603306, "grad_norm": 4.825605869293213, "learning_rate": 1.3062721890591478e-06, "loss": 0.3701, "step": 6960 }, { "epoch": 0.7670523415977961, "grad_norm": 7.008309364318848, "learning_rate": 1.3050939952334968e-06, "loss": 0.2961, "step": 6961 }, { "epoch": 0.7671625344352617, "grad_norm": 4.5859761238098145, "learning_rate": 1.3039162532259697e-06, "loss": 0.4035, "step": 6962 }, { "epoch": 0.7672727272727272, "grad_norm": 4.477420330047607, "learning_rate": 1.3027389631805836e-06, "loss": 0.3313, "step": 6963 }, { "epoch": 0.7673829201101928, "grad_norm": 11.143181800842285, "learning_rate": 1.3015621252413014e-06, "loss": 0.3517, "step": 6964 }, { "epoch": 0.7674931129476584, "grad_norm": 8.078117370605469, "learning_rate": 1.3003857395520242e-06, "loss": 0.3019, "step": 6965 }, { "epoch": 0.7676033057851239, "grad_norm": 6.918178081512451, "learning_rate": 1.2992098062566044e-06, "loss": 0.4217, "step": 6966 }, { "epoch": 0.7677134986225895, "grad_norm": 8.825976371765137, "learning_rate": 1.2980343254988387e-06, "loss": 0.4189, "step": 6967 }, { "epoch": 0.7678236914600551, "grad_norm": 6.583045482635498, "learning_rate": 1.296859297422462e-06, "loss": 0.4003, "step": 6968 }, { "epoch": 0.7679338842975206, "grad_norm": 8.6136474609375, "learning_rate": 1.295684722171161e-06, "loss": 0.4759, "step": 6969 }, { "epoch": 0.7680440771349862, "grad_norm": 6.4836344718933105, "learning_rate": 1.2945105998885654e-06, "loss": 0.4326, "step": 6970 }, { "epoch": 0.7681542699724518, "grad_norm": 6.296912670135498, "learning_rate": 1.2933369307182453e-06, "loss": 0.3223, "step": 6971 }, { "epoch": 0.7682644628099173, "grad_norm": 12.304940223693848, "learning_rate": 1.2921637148037203e-06, "loss": 0.4472, "step": 6972 }, { "epoch": 0.7683746556473829, "grad_norm": 6.3861799240112305, "learning_rate": 1.290990952288455e-06, "loss": 0.3862, "step": 6973 }, { "epoch": 0.7684848484848484, "grad_norm": 6.812122821807861, "learning_rate": 1.2898186433158521e-06, "loss": 0.425, "step": 6974 }, { "epoch": 0.768595041322314, "grad_norm": 3.9157824516296387, "learning_rate": 1.2886467880292668e-06, "loss": 0.3687, "step": 6975 }, { "epoch": 0.7687052341597796, "grad_norm": 4.501402378082275, "learning_rate": 1.2874753865719925e-06, "loss": 0.3208, "step": 6976 }, { "epoch": 0.7688154269972451, "grad_norm": 8.66434383392334, "learning_rate": 1.2863044390872708e-06, "loss": 0.4822, "step": 6977 }, { "epoch": 0.7689256198347107, "grad_norm": 7.586112976074219, "learning_rate": 1.2851339457182882e-06, "loss": 0.4505, "step": 6978 }, { "epoch": 0.7690358126721764, "grad_norm": 10.623414039611816, "learning_rate": 1.28396390660817e-06, "loss": 0.4421, "step": 6979 }, { "epoch": 0.7691460055096419, "grad_norm": 8.026878356933594, "learning_rate": 1.282794321899996e-06, "loss": 0.4319, "step": 6980 }, { "epoch": 0.7692561983471075, "grad_norm": 9.683146476745605, "learning_rate": 1.2816251917367816e-06, "loss": 0.506, "step": 6981 }, { "epoch": 0.769366391184573, "grad_norm": 7.421623706817627, "learning_rate": 1.2804565162614868e-06, "loss": 0.4425, "step": 6982 }, { "epoch": 0.7694765840220386, "grad_norm": 4.457887172698975, "learning_rate": 1.279288295617025e-06, "loss": 0.3624, "step": 6983 }, { "epoch": 0.7695867768595042, "grad_norm": 4.602921009063721, "learning_rate": 1.278120529946244e-06, "loss": 0.3022, "step": 6984 }, { "epoch": 0.7696969696969697, "grad_norm": 9.712858200073242, "learning_rate": 1.2769532193919387e-06, "loss": 0.3575, "step": 6985 }, { "epoch": 0.7698071625344353, "grad_norm": 6.121043682098389, "learning_rate": 1.2757863640968515e-06, "loss": 0.3384, "step": 6986 }, { "epoch": 0.7699173553719009, "grad_norm": 5.888635158538818, "learning_rate": 1.2746199642036676e-06, "loss": 0.4432, "step": 6987 }, { "epoch": 0.7700275482093664, "grad_norm": 6.5923309326171875, "learning_rate": 1.2734540198550132e-06, "loss": 0.3614, "step": 6988 }, { "epoch": 0.770137741046832, "grad_norm": 7.518980026245117, "learning_rate": 1.2722885311934641e-06, "loss": 0.297, "step": 6989 }, { "epoch": 0.7702479338842976, "grad_norm": 5.579894065856934, "learning_rate": 1.271123498361538e-06, "loss": 0.3988, "step": 6990 }, { "epoch": 0.7703581267217631, "grad_norm": 5.087518215179443, "learning_rate": 1.2699589215016939e-06, "loss": 0.401, "step": 6991 }, { "epoch": 0.7704683195592287, "grad_norm": 5.216823577880859, "learning_rate": 1.26879480075634e-06, "loss": 0.3954, "step": 6992 }, { "epoch": 0.7705785123966942, "grad_norm": 5.857227802276611, "learning_rate": 1.2676311362678261e-06, "loss": 0.3368, "step": 6993 }, { "epoch": 0.7706887052341598, "grad_norm": 5.294259548187256, "learning_rate": 1.2664679281784487e-06, "loss": 0.3901, "step": 6994 }, { "epoch": 0.7707988980716254, "grad_norm": 7.392274856567383, "learning_rate": 1.2653051766304425e-06, "loss": 0.4224, "step": 6995 }, { "epoch": 0.7709090909090909, "grad_norm": 9.018160820007324, "learning_rate": 1.2641428817659928e-06, "loss": 0.4405, "step": 6996 }, { "epoch": 0.7710192837465565, "grad_norm": 10.305521965026855, "learning_rate": 1.262981043727227e-06, "loss": 0.3653, "step": 6997 }, { "epoch": 0.7711294765840221, "grad_norm": 10.165287017822266, "learning_rate": 1.2618196626562145e-06, "loss": 0.4528, "step": 6998 }, { "epoch": 0.7712396694214876, "grad_norm": 6.014214038848877, "learning_rate": 1.2606587386949714e-06, "loss": 0.4665, "step": 6999 }, { "epoch": 0.7713498622589532, "grad_norm": 6.473193645477295, "learning_rate": 1.2594982719854586e-06, "loss": 0.4118, "step": 7000 }, { "epoch": 0.7714600550964187, "grad_norm": 6.1697821617126465, "learning_rate": 1.2583382626695785e-06, "loss": 0.366, "step": 7001 }, { "epoch": 0.7715702479338843, "grad_norm": 7.235289573669434, "learning_rate": 1.2571787108891748e-06, "loss": 0.3543, "step": 7002 }, { "epoch": 0.7716804407713499, "grad_norm": 6.012856960296631, "learning_rate": 1.256019616786045e-06, "loss": 0.4914, "step": 7003 }, { "epoch": 0.7717906336088154, "grad_norm": 7.119135856628418, "learning_rate": 1.2548609805019229e-06, "loss": 0.4519, "step": 7004 }, { "epoch": 0.771900826446281, "grad_norm": 6.916644096374512, "learning_rate": 1.2537028021784836e-06, "loss": 0.4306, "step": 7005 }, { "epoch": 0.7720110192837466, "grad_norm": 5.378720283508301, "learning_rate": 1.2525450819573582e-06, "loss": 0.4267, "step": 7006 }, { "epoch": 0.7721212121212121, "grad_norm": 6.772036075592041, "learning_rate": 1.2513878199801088e-06, "loss": 0.3534, "step": 7007 }, { "epoch": 0.7722314049586777, "grad_norm": 5.722408771514893, "learning_rate": 1.2502310163882502e-06, "loss": 0.4124, "step": 7008 }, { "epoch": 0.7723415977961432, "grad_norm": 5.215071678161621, "learning_rate": 1.2490746713232356e-06, "loss": 0.4087, "step": 7009 }, { "epoch": 0.7724517906336088, "grad_norm": 12.780637741088867, "learning_rate": 1.2479187849264646e-06, "loss": 0.4359, "step": 7010 }, { "epoch": 0.7725619834710744, "grad_norm": 9.537915229797363, "learning_rate": 1.2467633573392829e-06, "loss": 0.4041, "step": 7011 }, { "epoch": 0.7726721763085399, "grad_norm": 6.791706085205078, "learning_rate": 1.2456083887029746e-06, "loss": 0.3762, "step": 7012 }, { "epoch": 0.7727823691460055, "grad_norm": 9.5051908493042, "learning_rate": 1.2444538791587723e-06, "loss": 0.4262, "step": 7013 }, { "epoch": 0.7728925619834711, "grad_norm": 7.05399751663208, "learning_rate": 1.2432998288478531e-06, "loss": 0.4192, "step": 7014 }, { "epoch": 0.7730027548209366, "grad_norm": 5.819000720977783, "learning_rate": 1.242146237911332e-06, "loss": 0.4417, "step": 7015 }, { "epoch": 0.7731129476584022, "grad_norm": 6.9604997634887695, "learning_rate": 1.240993106490273e-06, "loss": 0.5471, "step": 7016 }, { "epoch": 0.7732231404958678, "grad_norm": 3.621635675430298, "learning_rate": 1.2398404347256854e-06, "loss": 0.3538, "step": 7017 }, { "epoch": 0.7733333333333333, "grad_norm": 16.079496383666992, "learning_rate": 1.238688222758515e-06, "loss": 0.3974, "step": 7018 }, { "epoch": 0.7734435261707989, "grad_norm": 6.103856086730957, "learning_rate": 1.2375364707296583e-06, "loss": 0.433, "step": 7019 }, { "epoch": 0.7735537190082644, "grad_norm": 6.0294647216796875, "learning_rate": 1.236385178779954e-06, "loss": 0.3412, "step": 7020 }, { "epoch": 0.77366391184573, "grad_norm": 5.790236949920654, "learning_rate": 1.235234347050181e-06, "loss": 0.3971, "step": 7021 }, { "epoch": 0.7737741046831956, "grad_norm": 4.867669105529785, "learning_rate": 1.2340839756810657e-06, "loss": 0.4326, "step": 7022 }, { "epoch": 0.7738842975206611, "grad_norm": 6.277116298675537, "learning_rate": 1.2329340648132793e-06, "loss": 0.4039, "step": 7023 }, { "epoch": 0.7739944903581267, "grad_norm": 9.411334037780762, "learning_rate": 1.2317846145874308e-06, "loss": 0.4242, "step": 7024 }, { "epoch": 0.7741046831955923, "grad_norm": 5.751246929168701, "learning_rate": 1.2306356251440787e-06, "loss": 0.3294, "step": 7025 }, { "epoch": 0.7742148760330578, "grad_norm": 5.511191368103027, "learning_rate": 1.2294870966237233e-06, "loss": 0.3936, "step": 7026 }, { "epoch": 0.7743250688705234, "grad_norm": 9.842085838317871, "learning_rate": 1.2283390291668062e-06, "loss": 0.4709, "step": 7027 }, { "epoch": 0.7744352617079889, "grad_norm": 7.187948703765869, "learning_rate": 1.2271914229137161e-06, "loss": 0.3535, "step": 7028 }, { "epoch": 0.7745454545454545, "grad_norm": 7.193229675292969, "learning_rate": 1.2260442780047854e-06, "loss": 0.4727, "step": 7029 }, { "epoch": 0.7746556473829201, "grad_norm": 5.977843284606934, "learning_rate": 1.224897594580285e-06, "loss": 0.3024, "step": 7030 }, { "epoch": 0.7747658402203856, "grad_norm": 7.264575958251953, "learning_rate": 1.2237513727804346e-06, "loss": 0.4059, "step": 7031 }, { "epoch": 0.7748760330578512, "grad_norm": 6.463776111602783, "learning_rate": 1.2226056127453972e-06, "loss": 0.4138, "step": 7032 }, { "epoch": 0.7749862258953168, "grad_norm": 6.193376541137695, "learning_rate": 1.2214603146152753e-06, "loss": 0.386, "step": 7033 }, { "epoch": 0.7750964187327823, "grad_norm": 6.836053371429443, "learning_rate": 1.2203154785301202e-06, "loss": 0.3675, "step": 7034 }, { "epoch": 0.775206611570248, "grad_norm": 7.610666751861572, "learning_rate": 1.2191711046299199e-06, "loss": 0.3901, "step": 7035 }, { "epoch": 0.7753168044077134, "grad_norm": 4.446081638336182, "learning_rate": 1.2180271930546155e-06, "loss": 0.4249, "step": 7036 }, { "epoch": 0.775426997245179, "grad_norm": 7.6869425773620605, "learning_rate": 1.2168837439440834e-06, "loss": 0.3503, "step": 7037 }, { "epoch": 0.7755371900826447, "grad_norm": 5.408324241638184, "learning_rate": 1.2157407574381424e-06, "loss": 0.371, "step": 7038 }, { "epoch": 0.7756473829201102, "grad_norm": 6.367844581604004, "learning_rate": 1.2145982336765655e-06, "loss": 0.3458, "step": 7039 }, { "epoch": 0.7757575757575758, "grad_norm": 10.609964370727539, "learning_rate": 1.2134561727990584e-06, "loss": 0.3174, "step": 7040 }, { "epoch": 0.7758677685950414, "grad_norm": 6.0939507484436035, "learning_rate": 1.2123145749452724e-06, "loss": 0.3861, "step": 7041 }, { "epoch": 0.7759779614325069, "grad_norm": 4.9223198890686035, "learning_rate": 1.2111734402548052e-06, "loss": 0.4554, "step": 7042 }, { "epoch": 0.7760881542699725, "grad_norm": 7.670893669128418, "learning_rate": 1.2100327688671982e-06, "loss": 0.4497, "step": 7043 }, { "epoch": 0.7761983471074381, "grad_norm": 6.00565767288208, "learning_rate": 1.2088925609219304e-06, "loss": 0.3969, "step": 7044 }, { "epoch": 0.7763085399449036, "grad_norm": 8.241581916809082, "learning_rate": 1.20775281655843e-06, "loss": 0.4465, "step": 7045 }, { "epoch": 0.7764187327823692, "grad_norm": 6.670886993408203, "learning_rate": 1.2066135359160686e-06, "loss": 0.3624, "step": 7046 }, { "epoch": 0.7765289256198347, "grad_norm": 7.617499828338623, "learning_rate": 1.2054747191341548e-06, "loss": 0.3823, "step": 7047 }, { "epoch": 0.7766391184573003, "grad_norm": 6.233975887298584, "learning_rate": 1.204336366351947e-06, "loss": 0.4049, "step": 7048 }, { "epoch": 0.7767493112947659, "grad_norm": 9.559657096862793, "learning_rate": 1.2031984777086437e-06, "loss": 0.4493, "step": 7049 }, { "epoch": 0.7768595041322314, "grad_norm": 5.948649883270264, "learning_rate": 1.20206105334339e-06, "loss": 0.4542, "step": 7050 }, { "epoch": 0.776969696969697, "grad_norm": 7.301677227020264, "learning_rate": 1.2009240933952682e-06, "loss": 0.4378, "step": 7051 }, { "epoch": 0.7770798898071626, "grad_norm": 3.9794509410858154, "learning_rate": 1.1997875980033086e-06, "loss": 0.3574, "step": 7052 }, { "epoch": 0.7771900826446281, "grad_norm": 6.005545139312744, "learning_rate": 1.1986515673064847e-06, "loss": 0.3983, "step": 7053 }, { "epoch": 0.7773002754820937, "grad_norm": 4.148007869720459, "learning_rate": 1.1975160014437098e-06, "loss": 0.3678, "step": 7054 }, { "epoch": 0.7774104683195592, "grad_norm": 7.430401802062988, "learning_rate": 1.1963809005538436e-06, "loss": 0.4654, "step": 7055 }, { "epoch": 0.7775206611570248, "grad_norm": 7.025285720825195, "learning_rate": 1.1952462647756885e-06, "loss": 0.3726, "step": 7056 }, { "epoch": 0.7776308539944904, "grad_norm": 4.826577663421631, "learning_rate": 1.1941120942479873e-06, "loss": 0.4262, "step": 7057 }, { "epoch": 0.7777410468319559, "grad_norm": 5.657707691192627, "learning_rate": 1.1929783891094287e-06, "loss": 0.4076, "step": 7058 }, { "epoch": 0.7778512396694215, "grad_norm": 6.577704906463623, "learning_rate": 1.1918451494986461e-06, "loss": 0.3889, "step": 7059 }, { "epoch": 0.7779614325068871, "grad_norm": 8.449810028076172, "learning_rate": 1.190712375554211e-06, "loss": 0.3033, "step": 7060 }, { "epoch": 0.7780716253443526, "grad_norm": 4.806299209594727, "learning_rate": 1.189580067414638e-06, "loss": 0.4073, "step": 7061 }, { "epoch": 0.7781818181818182, "grad_norm": 4.187784671783447, "learning_rate": 1.1884482252183933e-06, "loss": 0.3368, "step": 7062 }, { "epoch": 0.7782920110192837, "grad_norm": 8.73049259185791, "learning_rate": 1.1873168491038762e-06, "loss": 0.4421, "step": 7063 }, { "epoch": 0.7784022038567493, "grad_norm": 5.850810527801514, "learning_rate": 1.1861859392094332e-06, "loss": 0.3799, "step": 7064 }, { "epoch": 0.7785123966942149, "grad_norm": 13.17557430267334, "learning_rate": 1.1850554956733557e-06, "loss": 0.451, "step": 7065 }, { "epoch": 0.7786225895316804, "grad_norm": 7.426393508911133, "learning_rate": 1.1839255186338727e-06, "loss": 0.4401, "step": 7066 }, { "epoch": 0.778732782369146, "grad_norm": 5.190055847167969, "learning_rate": 1.1827960082291623e-06, "loss": 0.4143, "step": 7067 }, { "epoch": 0.7788429752066116, "grad_norm": 8.469324111938477, "learning_rate": 1.18166696459734e-06, "loss": 0.4056, "step": 7068 }, { "epoch": 0.7789531680440771, "grad_norm": 9.043612480163574, "learning_rate": 1.1805383878764682e-06, "loss": 0.4203, "step": 7069 }, { "epoch": 0.7790633608815427, "grad_norm": 8.03549575805664, "learning_rate": 1.1794102782045514e-06, "loss": 0.3586, "step": 7070 }, { "epoch": 0.7791735537190083, "grad_norm": 7.550805568695068, "learning_rate": 1.1782826357195348e-06, "loss": 0.4397, "step": 7071 }, { "epoch": 0.7792837465564738, "grad_norm": 11.737250328063965, "learning_rate": 1.1771554605593083e-06, "loss": 0.4463, "step": 7072 }, { "epoch": 0.7793939393939394, "grad_norm": 6.674030303955078, "learning_rate": 1.1760287528617065e-06, "loss": 0.4025, "step": 7073 }, { "epoch": 0.7795041322314049, "grad_norm": 8.523613929748535, "learning_rate": 1.1749025127645014e-06, "loss": 0.34, "step": 7074 }, { "epoch": 0.7796143250688705, "grad_norm": 8.995020866394043, "learning_rate": 1.1737767404054135e-06, "loss": 0.4579, "step": 7075 }, { "epoch": 0.7797245179063361, "grad_norm": 5.88915491104126, "learning_rate": 1.1726514359221041e-06, "loss": 0.4213, "step": 7076 }, { "epoch": 0.7798347107438016, "grad_norm": 4.900761127471924, "learning_rate": 1.1715265994521745e-06, "loss": 0.4118, "step": 7077 }, { "epoch": 0.7799449035812672, "grad_norm": 7.442956924438477, "learning_rate": 1.1704022311331737e-06, "loss": 0.3528, "step": 7078 }, { "epoch": 0.7800550964187328, "grad_norm": 7.432154178619385, "learning_rate": 1.1692783311025908e-06, "loss": 0.4494, "step": 7079 }, { "epoch": 0.7801652892561983, "grad_norm": 7.1108717918396, "learning_rate": 1.168154899497856e-06, "loss": 0.3928, "step": 7080 }, { "epoch": 0.7802754820936639, "grad_norm": 6.365954875946045, "learning_rate": 1.1670319364563447e-06, "loss": 0.3425, "step": 7081 }, { "epoch": 0.7803856749311294, "grad_norm": 13.459142684936523, "learning_rate": 1.1659094421153766e-06, "loss": 0.471, "step": 7082 }, { "epoch": 0.780495867768595, "grad_norm": 8.00849437713623, "learning_rate": 1.1647874166122087e-06, "loss": 0.3829, "step": 7083 }, { "epoch": 0.7806060606060606, "grad_norm": 10.074445724487305, "learning_rate": 1.1636658600840445e-06, "loss": 0.3996, "step": 7084 }, { "epoch": 0.7807162534435261, "grad_norm": 7.83542537689209, "learning_rate": 1.1625447726680317e-06, "loss": 0.3902, "step": 7085 }, { "epoch": 0.7808264462809917, "grad_norm": 11.84658432006836, "learning_rate": 1.1614241545012556e-06, "loss": 0.4302, "step": 7086 }, { "epoch": 0.7809366391184573, "grad_norm": 5.272179126739502, "learning_rate": 1.1603040057207481e-06, "loss": 0.4253, "step": 7087 }, { "epoch": 0.7810468319559228, "grad_norm": 4.778642654418945, "learning_rate": 1.1591843264634839e-06, "loss": 0.4388, "step": 7088 }, { "epoch": 0.7811570247933884, "grad_norm": 8.891243934631348, "learning_rate": 1.1580651168663759e-06, "loss": 0.3542, "step": 7089 }, { "epoch": 0.781267217630854, "grad_norm": 4.417768478393555, "learning_rate": 1.1569463770662842e-06, "loss": 0.3298, "step": 7090 }, { "epoch": 0.7813774104683195, "grad_norm": 7.110738277435303, "learning_rate": 1.15582810720001e-06, "loss": 0.3986, "step": 7091 }, { "epoch": 0.7814876033057852, "grad_norm": 6.492128372192383, "learning_rate": 1.154710307404298e-06, "loss": 0.3744, "step": 7092 }, { "epoch": 0.7815977961432506, "grad_norm": 7.248207092285156, "learning_rate": 1.1535929778158328e-06, "loss": 0.3587, "step": 7093 }, { "epoch": 0.7817079889807163, "grad_norm": 5.013510704040527, "learning_rate": 1.1524761185712402e-06, "loss": 0.3996, "step": 7094 }, { "epoch": 0.7818181818181819, "grad_norm": 5.109880447387695, "learning_rate": 1.1513597298070973e-06, "loss": 0.3952, "step": 7095 }, { "epoch": 0.7819283746556474, "grad_norm": 6.290883541107178, "learning_rate": 1.150243811659914e-06, "loss": 0.291, "step": 7096 }, { "epoch": 0.782038567493113, "grad_norm": 6.792827129364014, "learning_rate": 1.1491283642661444e-06, "loss": 0.3487, "step": 7097 }, { "epoch": 0.7821487603305786, "grad_norm": 8.456485748291016, "learning_rate": 1.1480133877621925e-06, "loss": 0.4565, "step": 7098 }, { "epoch": 0.7822589531680441, "grad_norm": 7.435070037841797, "learning_rate": 1.146898882284395e-06, "loss": 0.3485, "step": 7099 }, { "epoch": 0.7823691460055097, "grad_norm": 5.633684158325195, "learning_rate": 1.1457848479690354e-06, "loss": 0.3553, "step": 7100 }, { "epoch": 0.7824793388429752, "grad_norm": 9.572943687438965, "learning_rate": 1.1446712849523395e-06, "loss": 0.4794, "step": 7101 }, { "epoch": 0.7825895316804408, "grad_norm": 7.53718900680542, "learning_rate": 1.1435581933704776e-06, "loss": 0.3371, "step": 7102 }, { "epoch": 0.7826997245179064, "grad_norm": 7.918816566467285, "learning_rate": 1.142445573359557e-06, "loss": 0.3127, "step": 7103 }, { "epoch": 0.7828099173553719, "grad_norm": 6.695704460144043, "learning_rate": 1.1413334250556312e-06, "loss": 0.3769, "step": 7104 }, { "epoch": 0.7829201101928375, "grad_norm": 4.887105464935303, "learning_rate": 1.140221748594696e-06, "loss": 0.3407, "step": 7105 }, { "epoch": 0.7830303030303031, "grad_norm": 6.283191204071045, "learning_rate": 1.1391105441126898e-06, "loss": 0.3842, "step": 7106 }, { "epoch": 0.7831404958677686, "grad_norm": 7.616218090057373, "learning_rate": 1.1379998117454894e-06, "loss": 0.472, "step": 7107 }, { "epoch": 0.7832506887052342, "grad_norm": 10.6012601852417, "learning_rate": 1.136889551628918e-06, "loss": 0.4453, "step": 7108 }, { "epoch": 0.7833608815426997, "grad_norm": 6.9253010749816895, "learning_rate": 1.1357797638987407e-06, "loss": 0.3683, "step": 7109 }, { "epoch": 0.7834710743801653, "grad_norm": 10.118677139282227, "learning_rate": 1.1346704486906618e-06, "loss": 0.3754, "step": 7110 }, { "epoch": 0.7835812672176309, "grad_norm": 7.475785732269287, "learning_rate": 1.133561606140331e-06, "loss": 0.4331, "step": 7111 }, { "epoch": 0.7836914600550964, "grad_norm": 7.108438014984131, "learning_rate": 1.1324532363833408e-06, "loss": 0.3855, "step": 7112 }, { "epoch": 0.783801652892562, "grad_norm": 6.726047039031982, "learning_rate": 1.1313453395552205e-06, "loss": 0.4521, "step": 7113 }, { "epoch": 0.7839118457300276, "grad_norm": 5.010704517364502, "learning_rate": 1.1302379157914473e-06, "loss": 0.386, "step": 7114 }, { "epoch": 0.7840220385674931, "grad_norm": 7.332189083099365, "learning_rate": 1.1291309652274397e-06, "loss": 0.5025, "step": 7115 }, { "epoch": 0.7841322314049587, "grad_norm": 7.428362846374512, "learning_rate": 1.128024487998554e-06, "loss": 0.3753, "step": 7116 }, { "epoch": 0.7842424242424243, "grad_norm": 6.251130104064941, "learning_rate": 1.1269184842400943e-06, "loss": 0.3963, "step": 7117 }, { "epoch": 0.7843526170798898, "grad_norm": 5.5299577713012695, "learning_rate": 1.1258129540873042e-06, "loss": 0.4352, "step": 7118 }, { "epoch": 0.7844628099173554, "grad_norm": 8.869359970092773, "learning_rate": 1.1247078976753673e-06, "loss": 0.3957, "step": 7119 }, { "epoch": 0.7845730027548209, "grad_norm": 6.53255558013916, "learning_rate": 1.1236033151394127e-06, "loss": 0.3271, "step": 7120 }, { "epoch": 0.7846831955922865, "grad_norm": 7.609908103942871, "learning_rate": 1.1224992066145117e-06, "loss": 0.4235, "step": 7121 }, { "epoch": 0.7847933884297521, "grad_norm": 9.996800422668457, "learning_rate": 1.121395572235674e-06, "loss": 0.3917, "step": 7122 }, { "epoch": 0.7849035812672176, "grad_norm": 7.006301403045654, "learning_rate": 1.1202924121378532e-06, "loss": 0.4124, "step": 7123 }, { "epoch": 0.7850137741046832, "grad_norm": 6.177187919616699, "learning_rate": 1.1191897264559487e-06, "loss": 0.382, "step": 7124 }, { "epoch": 0.7851239669421488, "grad_norm": 8.534271240234375, "learning_rate": 1.1180875153247938e-06, "loss": 0.3238, "step": 7125 }, { "epoch": 0.7852341597796143, "grad_norm": 9.306317329406738, "learning_rate": 1.1169857788791727e-06, "loss": 0.3449, "step": 7126 }, { "epoch": 0.7853443526170799, "grad_norm": 6.282261848449707, "learning_rate": 1.1158845172538035e-06, "loss": 0.3807, "step": 7127 }, { "epoch": 0.7854545454545454, "grad_norm": 4.840741157531738, "learning_rate": 1.1147837305833513e-06, "loss": 0.376, "step": 7128 }, { "epoch": 0.785564738292011, "grad_norm": 9.918661117553711, "learning_rate": 1.1136834190024237e-06, "loss": 0.3863, "step": 7129 }, { "epoch": 0.7856749311294766, "grad_norm": 6.0856828689575195, "learning_rate": 1.112583582645565e-06, "loss": 0.4311, "step": 7130 }, { "epoch": 0.7857851239669421, "grad_norm": 5.408751487731934, "learning_rate": 1.1114842216472665e-06, "loss": 0.3258, "step": 7131 }, { "epoch": 0.7858953168044077, "grad_norm": 6.2737274169921875, "learning_rate": 1.1103853361419608e-06, "loss": 0.385, "step": 7132 }, { "epoch": 0.7860055096418733, "grad_norm": 9.425874710083008, "learning_rate": 1.1092869262640188e-06, "loss": 0.3745, "step": 7133 }, { "epoch": 0.7861157024793388, "grad_norm": 5.71481466293335, "learning_rate": 1.1081889921477561e-06, "loss": 0.4265, "step": 7134 }, { "epoch": 0.7862258953168044, "grad_norm": 8.225992202758789, "learning_rate": 1.1070915339274312e-06, "loss": 0.4703, "step": 7135 }, { "epoch": 0.7863360881542699, "grad_norm": 8.603610038757324, "learning_rate": 1.10599455173724e-06, "loss": 0.4142, "step": 7136 }, { "epoch": 0.7864462809917355, "grad_norm": 12.46169376373291, "learning_rate": 1.1048980457113251e-06, "loss": 0.3765, "step": 7137 }, { "epoch": 0.7865564738292011, "grad_norm": 7.737566947937012, "learning_rate": 1.1038020159837692e-06, "loss": 0.3685, "step": 7138 }, { "epoch": 0.7866666666666666, "grad_norm": 5.6223931312561035, "learning_rate": 1.1027064626885935e-06, "loss": 0.3964, "step": 7139 }, { "epoch": 0.7867768595041322, "grad_norm": 5.277441501617432, "learning_rate": 1.1016113859597661e-06, "loss": 0.2842, "step": 7140 }, { "epoch": 0.7868870523415978, "grad_norm": 6.701341152191162, "learning_rate": 1.1005167859311949e-06, "loss": 0.4404, "step": 7141 }, { "epoch": 0.7869972451790633, "grad_norm": 6.160130500793457, "learning_rate": 1.0994226627367267e-06, "loss": 0.4521, "step": 7142 }, { "epoch": 0.7871074380165289, "grad_norm": 6.142246246337891, "learning_rate": 1.098329016510154e-06, "loss": 0.3734, "step": 7143 }, { "epoch": 0.7872176308539945, "grad_norm": 6.93201208114624, "learning_rate": 1.0972358473852102e-06, "loss": 0.32, "step": 7144 }, { "epoch": 0.78732782369146, "grad_norm": 10.67005729675293, "learning_rate": 1.0961431554955671e-06, "loss": 0.4239, "step": 7145 }, { "epoch": 0.7874380165289256, "grad_norm": 6.718689441680908, "learning_rate": 1.0950509409748416e-06, "loss": 0.4462, "step": 7146 }, { "epoch": 0.7875482093663911, "grad_norm": 5.778476715087891, "learning_rate": 1.0939592039565915e-06, "loss": 0.4016, "step": 7147 }, { "epoch": 0.7876584022038567, "grad_norm": 7.173189163208008, "learning_rate": 1.0928679445743168e-06, "loss": 0.2673, "step": 7148 }, { "epoch": 0.7877685950413224, "grad_norm": 5.864417552947998, "learning_rate": 1.0917771629614565e-06, "loss": 0.364, "step": 7149 }, { "epoch": 0.7878787878787878, "grad_norm": 5.162528038024902, "learning_rate": 1.090686859251393e-06, "loss": 0.3032, "step": 7150 }, { "epoch": 0.7879889807162535, "grad_norm": 6.525598049163818, "learning_rate": 1.0895970335774518e-06, "loss": 0.4942, "step": 7151 }, { "epoch": 0.7880991735537191, "grad_norm": 7.823827743530273, "learning_rate": 1.0885076860728977e-06, "loss": 0.4232, "step": 7152 }, { "epoch": 0.7882093663911846, "grad_norm": 6.679292678833008, "learning_rate": 1.0874188168709343e-06, "loss": 0.4352, "step": 7153 }, { "epoch": 0.7883195592286502, "grad_norm": 6.845866680145264, "learning_rate": 1.0863304261047148e-06, "loss": 0.3552, "step": 7154 }, { "epoch": 0.7884297520661157, "grad_norm": 8.466389656066895, "learning_rate": 1.0852425139073275e-06, "loss": 0.3649, "step": 7155 }, { "epoch": 0.7885399449035813, "grad_norm": 6.212552070617676, "learning_rate": 1.0841550804118001e-06, "loss": 0.3685, "step": 7156 }, { "epoch": 0.7886501377410469, "grad_norm": 6.205574035644531, "learning_rate": 1.0830681257511117e-06, "loss": 0.3829, "step": 7157 }, { "epoch": 0.7887603305785124, "grad_norm": 7.429014205932617, "learning_rate": 1.0819816500581737e-06, "loss": 0.3488, "step": 7158 }, { "epoch": 0.788870523415978, "grad_norm": 4.995081424713135, "learning_rate": 1.0808956534658399e-06, "loss": 0.4144, "step": 7159 }, { "epoch": 0.7889807162534436, "grad_norm": 6.158295631408691, "learning_rate": 1.079810136106909e-06, "loss": 0.3498, "step": 7160 }, { "epoch": 0.7890909090909091, "grad_norm": 5.911240100860596, "learning_rate": 1.07872509811412e-06, "loss": 0.3704, "step": 7161 }, { "epoch": 0.7892011019283747, "grad_norm": 7.6627326011657715, "learning_rate": 1.077640539620154e-06, "loss": 0.4462, "step": 7162 }, { "epoch": 0.7893112947658402, "grad_norm": 7.624194622039795, "learning_rate": 1.0765564607576295e-06, "loss": 0.3291, "step": 7163 }, { "epoch": 0.7894214876033058, "grad_norm": 9.946186065673828, "learning_rate": 1.0754728616591103e-06, "loss": 0.3983, "step": 7164 }, { "epoch": 0.7895316804407714, "grad_norm": 9.451842308044434, "learning_rate": 1.074389742457102e-06, "loss": 0.3839, "step": 7165 }, { "epoch": 0.7896418732782369, "grad_norm": 8.298590660095215, "learning_rate": 1.0733071032840464e-06, "loss": 0.3419, "step": 7166 }, { "epoch": 0.7897520661157025, "grad_norm": 5.167027473449707, "learning_rate": 1.072224944272333e-06, "loss": 0.4013, "step": 7167 }, { "epoch": 0.7898622589531681, "grad_norm": 7.933115005493164, "learning_rate": 1.0711432655542898e-06, "loss": 0.4792, "step": 7168 }, { "epoch": 0.7899724517906336, "grad_norm": 6.259268760681152, "learning_rate": 1.070062067262183e-06, "loss": 0.3645, "step": 7169 }, { "epoch": 0.7900826446280992, "grad_norm": 4.871016025543213, "learning_rate": 1.068981349528226e-06, "loss": 0.3459, "step": 7170 }, { "epoch": 0.7901928374655648, "grad_norm": 9.617240905761719, "learning_rate": 1.0679011124845702e-06, "loss": 0.4071, "step": 7171 }, { "epoch": 0.7903030303030303, "grad_norm": 5.08292293548584, "learning_rate": 1.0668213562633056e-06, "loss": 0.3782, "step": 7172 }, { "epoch": 0.7904132231404959, "grad_norm": 8.37671184539795, "learning_rate": 1.0657420809964692e-06, "loss": 0.4441, "step": 7173 }, { "epoch": 0.7905234159779614, "grad_norm": 4.549337387084961, "learning_rate": 1.0646632868160362e-06, "loss": 0.2986, "step": 7174 }, { "epoch": 0.790633608815427, "grad_norm": 7.388365745544434, "learning_rate": 1.0635849738539205e-06, "loss": 0.3397, "step": 7175 }, { "epoch": 0.7907438016528926, "grad_norm": 6.887531757354736, "learning_rate": 1.062507142241982e-06, "loss": 0.4043, "step": 7176 }, { "epoch": 0.7908539944903581, "grad_norm": 7.719393730163574, "learning_rate": 1.0614297921120199e-06, "loss": 0.3595, "step": 7177 }, { "epoch": 0.7909641873278237, "grad_norm": 10.925946235656738, "learning_rate": 1.060352923595771e-06, "loss": 0.4671, "step": 7178 }, { "epoch": 0.7910743801652893, "grad_norm": 6.271305561065674, "learning_rate": 1.0592765368249186e-06, "loss": 0.3992, "step": 7179 }, { "epoch": 0.7911845730027548, "grad_norm": 9.810705184936523, "learning_rate": 1.058200631931085e-06, "loss": 0.4578, "step": 7180 }, { "epoch": 0.7912947658402204, "grad_norm": 6.939256191253662, "learning_rate": 1.0571252090458318e-06, "loss": 0.4364, "step": 7181 }, { "epoch": 0.7914049586776859, "grad_norm": 4.0625319480896, "learning_rate": 1.0560502683006634e-06, "loss": 0.4313, "step": 7182 }, { "epoch": 0.7915151515151515, "grad_norm": 5.395636081695557, "learning_rate": 1.0549758098270274e-06, "loss": 0.3478, "step": 7183 }, { "epoch": 0.7916253443526171, "grad_norm": 12.50815486907959, "learning_rate": 1.0539018337563061e-06, "loss": 0.5145, "step": 7184 }, { "epoch": 0.7917355371900826, "grad_norm": 6.146428108215332, "learning_rate": 1.0528283402198309e-06, "loss": 0.3582, "step": 7185 }, { "epoch": 0.7918457300275482, "grad_norm": 9.114592552185059, "learning_rate": 1.0517553293488663e-06, "loss": 0.4028, "step": 7186 }, { "epoch": 0.7919559228650138, "grad_norm": 7.628662586212158, "learning_rate": 1.0506828012746228e-06, "loss": 0.426, "step": 7187 }, { "epoch": 0.7920661157024793, "grad_norm": 5.638787746429443, "learning_rate": 1.0496107561282532e-06, "loss": 0.4132, "step": 7188 }, { "epoch": 0.7921763085399449, "grad_norm": 5.9043779373168945, "learning_rate": 1.048539194040843e-06, "loss": 0.4129, "step": 7189 }, { "epoch": 0.7922865013774105, "grad_norm": 4.690010070800781, "learning_rate": 1.0474681151434306e-06, "loss": 0.4328, "step": 7190 }, { "epoch": 0.792396694214876, "grad_norm": 10.724125862121582, "learning_rate": 1.0463975195669861e-06, "loss": 0.3054, "step": 7191 }, { "epoch": 0.7925068870523416, "grad_norm": 4.3964338302612305, "learning_rate": 1.0453274074424218e-06, "loss": 0.3724, "step": 7192 }, { "epoch": 0.7926170798898071, "grad_norm": 6.200281620025635, "learning_rate": 1.0442577789005943e-06, "loss": 0.3352, "step": 7193 }, { "epoch": 0.7927272727272727, "grad_norm": 7.095250606536865, "learning_rate": 1.0431886340723003e-06, "loss": 0.4533, "step": 7194 }, { "epoch": 0.7928374655647383, "grad_norm": 6.526980400085449, "learning_rate": 1.0421199730882736e-06, "loss": 0.4564, "step": 7195 }, { "epoch": 0.7929476584022038, "grad_norm": 10.025382041931152, "learning_rate": 1.0410517960791926e-06, "loss": 0.4522, "step": 7196 }, { "epoch": 0.7930578512396694, "grad_norm": 5.875022888183594, "learning_rate": 1.0399841031756774e-06, "loss": 0.3377, "step": 7197 }, { "epoch": 0.793168044077135, "grad_norm": 4.792463302612305, "learning_rate": 1.0389168945082833e-06, "loss": 0.3394, "step": 7198 }, { "epoch": 0.7932782369146005, "grad_norm": 6.474595069885254, "learning_rate": 1.0378501702075122e-06, "loss": 0.3626, "step": 7199 }, { "epoch": 0.7933884297520661, "grad_norm": 4.841497898101807, "learning_rate": 1.0367839304038057e-06, "loss": 0.3657, "step": 7200 }, { "epoch": 0.7934986225895316, "grad_norm": 5.467446804046631, "learning_rate": 1.0357181752275425e-06, "loss": 0.3866, "step": 7201 }, { "epoch": 0.7936088154269972, "grad_norm": 7.529897689819336, "learning_rate": 1.034652904809046e-06, "loss": 0.3415, "step": 7202 }, { "epoch": 0.7937190082644628, "grad_norm": 8.403196334838867, "learning_rate": 1.0335881192785778e-06, "loss": 0.3787, "step": 7203 }, { "epoch": 0.7938292011019283, "grad_norm": 11.709247589111328, "learning_rate": 1.0325238187663444e-06, "loss": 0.2872, "step": 7204 }, { "epoch": 0.793939393939394, "grad_norm": 4.930643081665039, "learning_rate": 1.0314600034024864e-06, "loss": 0.322, "step": 7205 }, { "epoch": 0.7940495867768596, "grad_norm": 7.090351581573486, "learning_rate": 1.0303966733170896e-06, "loss": 0.42, "step": 7206 }, { "epoch": 0.794159779614325, "grad_norm": 6.290981769561768, "learning_rate": 1.029333828640181e-06, "loss": 0.4351, "step": 7207 }, { "epoch": 0.7942699724517907, "grad_norm": 6.4861369132995605, "learning_rate": 1.0282714695017255e-06, "loss": 0.3989, "step": 7208 }, { "epoch": 0.7943801652892561, "grad_norm": 5.802674770355225, "learning_rate": 1.027209596031627e-06, "loss": 0.4497, "step": 7209 }, { "epoch": 0.7944903581267218, "grad_norm": 5.004255771636963, "learning_rate": 1.0261482083597385e-06, "loss": 0.3229, "step": 7210 }, { "epoch": 0.7946005509641874, "grad_norm": 10.453167915344238, "learning_rate": 1.025087306615845e-06, "loss": 0.3638, "step": 7211 }, { "epoch": 0.7947107438016529, "grad_norm": 8.389795303344727, "learning_rate": 1.0240268909296724e-06, "loss": 0.4663, "step": 7212 }, { "epoch": 0.7948209366391185, "grad_norm": 7.390760898590088, "learning_rate": 1.022966961430895e-06, "loss": 0.3642, "step": 7213 }, { "epoch": 0.7949311294765841, "grad_norm": 4.972836971282959, "learning_rate": 1.02190751824912e-06, "loss": 0.4278, "step": 7214 }, { "epoch": 0.7950413223140496, "grad_norm": 20.41846466064453, "learning_rate": 1.0208485615138946e-06, "loss": 0.463, "step": 7215 }, { "epoch": 0.7951515151515152, "grad_norm": 7.295198440551758, "learning_rate": 1.0197900913547149e-06, "loss": 0.3614, "step": 7216 }, { "epoch": 0.7952617079889808, "grad_norm": 9.109318733215332, "learning_rate": 1.0187321079010082e-06, "loss": 0.359, "step": 7217 }, { "epoch": 0.7953719008264463, "grad_norm": 5.5986857414245605, "learning_rate": 1.0176746112821483e-06, "loss": 0.3293, "step": 7218 }, { "epoch": 0.7954820936639119, "grad_norm": 7.371835708618164, "learning_rate": 1.0166176016274453e-06, "loss": 0.333, "step": 7219 }, { "epoch": 0.7955922865013774, "grad_norm": 7.072951793670654, "learning_rate": 1.015561079066153e-06, "loss": 0.4177, "step": 7220 }, { "epoch": 0.795702479338843, "grad_norm": 7.765138626098633, "learning_rate": 1.0145050437274655e-06, "loss": 0.4301, "step": 7221 }, { "epoch": 0.7958126721763086, "grad_norm": 8.221468925476074, "learning_rate": 1.0134494957405139e-06, "loss": 0.3963, "step": 7222 }, { "epoch": 0.7959228650137741, "grad_norm": 6.8744072914123535, "learning_rate": 1.0123944352343728e-06, "loss": 0.3925, "step": 7223 }, { "epoch": 0.7960330578512397, "grad_norm": 5.482988357543945, "learning_rate": 1.0113398623380582e-06, "loss": 0.3516, "step": 7224 }, { "epoch": 0.7961432506887053, "grad_norm": 8.589863777160645, "learning_rate": 1.0102857771805218e-06, "loss": 0.415, "step": 7225 }, { "epoch": 0.7962534435261708, "grad_norm": 7.051527500152588, "learning_rate": 1.0092321798906606e-06, "loss": 0.3998, "step": 7226 }, { "epoch": 0.7963636363636364, "grad_norm": 4.680861473083496, "learning_rate": 1.0081790705973105e-06, "loss": 0.4115, "step": 7227 }, { "epoch": 0.7964738292011019, "grad_norm": 4.199980735778809, "learning_rate": 1.0071264494292443e-06, "loss": 0.3424, "step": 7228 }, { "epoch": 0.7965840220385675, "grad_norm": 4.739099502563477, "learning_rate": 1.0060743165151798e-06, "loss": 0.3158, "step": 7229 }, { "epoch": 0.7966942148760331, "grad_norm": 5.168888092041016, "learning_rate": 1.0050226719837746e-06, "loss": 0.4125, "step": 7230 }, { "epoch": 0.7968044077134986, "grad_norm": 6.180441379547119, "learning_rate": 1.0039715159636225e-06, "loss": 0.409, "step": 7231 }, { "epoch": 0.7969146005509642, "grad_norm": 8.566987037658691, "learning_rate": 1.0029208485832614e-06, "loss": 0.4531, "step": 7232 }, { "epoch": 0.7970247933884298, "grad_norm": 7.466010570526123, "learning_rate": 1.0018706699711695e-06, "loss": 0.3514, "step": 7233 }, { "epoch": 0.7971349862258953, "grad_norm": 6.98629903793335, "learning_rate": 1.0008209802557617e-06, "loss": 0.3916, "step": 7234 }, { "epoch": 0.7972451790633609, "grad_norm": 6.633272171020508, "learning_rate": 9.997717795653972e-07, "loss": 0.4642, "step": 7235 }, { "epoch": 0.7973553719008264, "grad_norm": 19.752498626708984, "learning_rate": 9.987230680283744e-07, "loss": 0.5274, "step": 7236 }, { "epoch": 0.797465564738292, "grad_norm": 8.323124885559082, "learning_rate": 9.976748457729285e-07, "loss": 0.4353, "step": 7237 }, { "epoch": 0.7975757575757576, "grad_norm": 8.583427429199219, "learning_rate": 9.966271129272391e-07, "loss": 0.513, "step": 7238 }, { "epoch": 0.7976859504132231, "grad_norm": 6.132208347320557, "learning_rate": 9.955798696194259e-07, "loss": 0.3029, "step": 7239 }, { "epoch": 0.7977961432506887, "grad_norm": 5.390061855316162, "learning_rate": 9.945331159775445e-07, "loss": 0.3564, "step": 7240 }, { "epoch": 0.7979063360881543, "grad_norm": 7.2979607582092285, "learning_rate": 9.934868521295955e-07, "loss": 0.4015, "step": 7241 }, { "epoch": 0.7980165289256198, "grad_norm": 9.862425804138184, "learning_rate": 9.924410782035155e-07, "loss": 0.368, "step": 7242 }, { "epoch": 0.7981267217630854, "grad_norm": 9.85254955291748, "learning_rate": 9.91395794327184e-07, "loss": 0.3668, "step": 7243 }, { "epoch": 0.798236914600551, "grad_norm": 7.688636779785156, "learning_rate": 9.903510006284218e-07, "loss": 0.3777, "step": 7244 }, { "epoch": 0.7983471074380165, "grad_norm": 7.272444725036621, "learning_rate": 9.893066972349824e-07, "loss": 0.3122, "step": 7245 }, { "epoch": 0.7984573002754821, "grad_norm": 11.556692123413086, "learning_rate": 9.882628842745712e-07, "loss": 0.3954, "step": 7246 }, { "epoch": 0.7985674931129476, "grad_norm": 7.959048748016357, "learning_rate": 9.872195618748236e-07, "loss": 0.5244, "step": 7247 }, { "epoch": 0.7986776859504132, "grad_norm": 6.023469924926758, "learning_rate": 9.861767301633163e-07, "loss": 0.4132, "step": 7248 }, { "epoch": 0.7987878787878788, "grad_norm": 6.7346510887146, "learning_rate": 9.851343892675735e-07, "loss": 0.3649, "step": 7249 }, { "epoch": 0.7988980716253443, "grad_norm": 7.217971324920654, "learning_rate": 9.840925393150507e-07, "loss": 0.4012, "step": 7250 }, { "epoch": 0.7990082644628099, "grad_norm": 6.179969787597656, "learning_rate": 9.830511804331467e-07, "loss": 0.3586, "step": 7251 }, { "epoch": 0.7991184573002755, "grad_norm": 5.323660373687744, "learning_rate": 9.820103127492002e-07, "loss": 0.3714, "step": 7252 }, { "epoch": 0.799228650137741, "grad_norm": 6.620593070983887, "learning_rate": 9.809699363904924e-07, "loss": 0.3783, "step": 7253 }, { "epoch": 0.7993388429752066, "grad_norm": 7.338545799255371, "learning_rate": 9.799300514842386e-07, "loss": 0.3991, "step": 7254 }, { "epoch": 0.7994490358126721, "grad_norm": 5.968478202819824, "learning_rate": 9.788906581575986e-07, "loss": 0.4115, "step": 7255 }, { "epoch": 0.7995592286501377, "grad_norm": 16.298006057739258, "learning_rate": 9.778517565376727e-07, "loss": 0.4783, "step": 7256 }, { "epoch": 0.7996694214876033, "grad_norm": 12.101306915283203, "learning_rate": 9.768133467514961e-07, "loss": 0.4822, "step": 7257 }, { "epoch": 0.7997796143250688, "grad_norm": 11.499910354614258, "learning_rate": 9.757754289260485e-07, "loss": 0.4332, "step": 7258 }, { "epoch": 0.7998898071625344, "grad_norm": 7.933718681335449, "learning_rate": 9.747380031882474e-07, "loss": 0.4092, "step": 7259 }, { "epoch": 0.8, "grad_norm": 7.823580741882324, "learning_rate": 9.73701069664953e-07, "loss": 0.3825, "step": 7260 }, { "epoch": 0.8001101928374655, "grad_norm": 9.790545463562012, "learning_rate": 9.726646284829594e-07, "loss": 0.4438, "step": 7261 }, { "epoch": 0.8002203856749311, "grad_norm": 8.749502182006836, "learning_rate": 9.716286797690056e-07, "loss": 0.3701, "step": 7262 }, { "epoch": 0.8003305785123966, "grad_norm": 6.369805812835693, "learning_rate": 9.705932236497701e-07, "loss": 0.4131, "step": 7263 }, { "epoch": 0.8004407713498622, "grad_norm": 5.037927150726318, "learning_rate": 9.695582602518671e-07, "loss": 0.4315, "step": 7264 }, { "epoch": 0.8004407713498622, "eval_loss": 0.39739754796028137, "eval_runtime": 41.9639, "eval_samples_per_second": 17.491, "eval_steps_per_second": 2.192, "step": 7264 }, { "epoch": 0.8005509641873279, "grad_norm": 6.549569129943848, "learning_rate": 9.68523789701855e-07, "loss": 0.4189, "step": 7265 }, { "epoch": 0.8006611570247933, "grad_norm": 8.137191772460938, "learning_rate": 9.674898121262322e-07, "loss": 0.4373, "step": 7266 }, { "epoch": 0.800771349862259, "grad_norm": 4.936229228973389, "learning_rate": 9.664563276514321e-07, "loss": 0.3285, "step": 7267 }, { "epoch": 0.8008815426997246, "grad_norm": 5.178045749664307, "learning_rate": 9.654233364038285e-07, "loss": 0.417, "step": 7268 }, { "epoch": 0.80099173553719, "grad_norm": 13.220976829528809, "learning_rate": 9.643908385097428e-07, "loss": 0.5043, "step": 7269 }, { "epoch": 0.8011019283746557, "grad_norm": 4.701597213745117, "learning_rate": 9.633588340954269e-07, "loss": 0.3783, "step": 7270 }, { "epoch": 0.8012121212121213, "grad_norm": 7.403231143951416, "learning_rate": 9.623273232870734e-07, "loss": 0.3556, "step": 7271 }, { "epoch": 0.8013223140495868, "grad_norm": 7.546553134918213, "learning_rate": 9.612963062108222e-07, "loss": 0.4229, "step": 7272 }, { "epoch": 0.8014325068870524, "grad_norm": 4.353821754455566, "learning_rate": 9.60265782992743e-07, "loss": 0.3123, "step": 7273 }, { "epoch": 0.8015426997245179, "grad_norm": 5.343846321105957, "learning_rate": 9.59235753758853e-07, "loss": 0.4493, "step": 7274 }, { "epoch": 0.8016528925619835, "grad_norm": 10.174478530883789, "learning_rate": 9.582062186351027e-07, "loss": 0.4762, "step": 7275 }, { "epoch": 0.8017630853994491, "grad_norm": 5.632132053375244, "learning_rate": 9.57177177747386e-07, "loss": 0.2683, "step": 7276 }, { "epoch": 0.8018732782369146, "grad_norm": 5.7510504722595215, "learning_rate": 9.561486312215374e-07, "loss": 0.4303, "step": 7277 }, { "epoch": 0.8019834710743802, "grad_norm": 4.070655345916748, "learning_rate": 9.551205791833252e-07, "loss": 0.4219, "step": 7278 }, { "epoch": 0.8020936639118458, "grad_norm": 6.352983474731445, "learning_rate": 9.540930217584633e-07, "loss": 0.4086, "step": 7279 }, { "epoch": 0.8022038567493113, "grad_norm": 12.105496406555176, "learning_rate": 9.530659590726037e-07, "loss": 0.4251, "step": 7280 }, { "epoch": 0.8023140495867769, "grad_norm": 5.1941680908203125, "learning_rate": 9.520393912513348e-07, "loss": 0.3371, "step": 7281 }, { "epoch": 0.8024242424242424, "grad_norm": 6.544575214385986, "learning_rate": 9.510133184201881e-07, "loss": 0.3963, "step": 7282 }, { "epoch": 0.802534435261708, "grad_norm": 5.792811870574951, "learning_rate": 9.499877407046332e-07, "loss": 0.4255, "step": 7283 }, { "epoch": 0.8026446280991736, "grad_norm": 7.323215484619141, "learning_rate": 9.489626582300782e-07, "loss": 0.3708, "step": 7284 }, { "epoch": 0.8027548209366391, "grad_norm": 9.369105339050293, "learning_rate": 9.479380711218716e-07, "loss": 0.4317, "step": 7285 }, { "epoch": 0.8028650137741047, "grad_norm": 8.165177345275879, "learning_rate": 9.469139795053034e-07, "loss": 0.2629, "step": 7286 }, { "epoch": 0.8029752066115703, "grad_norm": 8.255684852600098, "learning_rate": 9.458903835055983e-07, "loss": 0.3928, "step": 7287 }, { "epoch": 0.8030853994490358, "grad_norm": 5.622321605682373, "learning_rate": 9.448672832479239e-07, "loss": 0.371, "step": 7288 }, { "epoch": 0.8031955922865014, "grad_norm": 4.412442684173584, "learning_rate": 9.43844678857388e-07, "loss": 0.3221, "step": 7289 }, { "epoch": 0.8033057851239669, "grad_norm": 4.750947952270508, "learning_rate": 9.428225704590327e-07, "loss": 0.3824, "step": 7290 }, { "epoch": 0.8034159779614325, "grad_norm": 6.953831195831299, "learning_rate": 9.418009581778447e-07, "loss": 0.3953, "step": 7291 }, { "epoch": 0.8035261707988981, "grad_norm": 6.322239398956299, "learning_rate": 9.407798421387498e-07, "loss": 0.3995, "step": 7292 }, { "epoch": 0.8036363636363636, "grad_norm": 5.172215938568115, "learning_rate": 9.39759222466608e-07, "loss": 0.4628, "step": 7293 }, { "epoch": 0.8037465564738292, "grad_norm": 7.184489727020264, "learning_rate": 9.387390992862238e-07, "loss": 0.373, "step": 7294 }, { "epoch": 0.8038567493112948, "grad_norm": 5.020264148712158, "learning_rate": 9.37719472722341e-07, "loss": 0.3731, "step": 7295 }, { "epoch": 0.8039669421487603, "grad_norm": 7.934032440185547, "learning_rate": 9.367003428996374e-07, "loss": 0.4368, "step": 7296 }, { "epoch": 0.8040771349862259, "grad_norm": 7.1293134689331055, "learning_rate": 9.356817099427351e-07, "loss": 0.473, "step": 7297 }, { "epoch": 0.8041873278236915, "grad_norm": 7.446048259735107, "learning_rate": 9.346635739761955e-07, "loss": 0.4487, "step": 7298 }, { "epoch": 0.804297520661157, "grad_norm": 5.76701545715332, "learning_rate": 9.336459351245152e-07, "loss": 0.3696, "step": 7299 }, { "epoch": 0.8044077134986226, "grad_norm": 5.016604900360107, "learning_rate": 9.326287935121353e-07, "loss": 0.3991, "step": 7300 }, { "epoch": 0.8045179063360881, "grad_norm": 4.685810089111328, "learning_rate": 9.316121492634283e-07, "loss": 0.3693, "step": 7301 }, { "epoch": 0.8046280991735537, "grad_norm": 7.337115287780762, "learning_rate": 9.305960025027172e-07, "loss": 0.3524, "step": 7302 }, { "epoch": 0.8047382920110193, "grad_norm": 7.880723476409912, "learning_rate": 9.295803533542541e-07, "loss": 0.4536, "step": 7303 }, { "epoch": 0.8048484848484848, "grad_norm": 10.412185668945312, "learning_rate": 9.28565201942232e-07, "loss": 0.4296, "step": 7304 }, { "epoch": 0.8049586776859504, "grad_norm": 8.625537872314453, "learning_rate": 9.275505483907904e-07, "loss": 0.3921, "step": 7305 }, { "epoch": 0.805068870523416, "grad_norm": 10.102866172790527, "learning_rate": 9.265363928239995e-07, "loss": 0.3705, "step": 7306 }, { "epoch": 0.8051790633608815, "grad_norm": 7.82152795791626, "learning_rate": 9.255227353658691e-07, "loss": 0.4334, "step": 7307 }, { "epoch": 0.8052892561983471, "grad_norm": 4.74699592590332, "learning_rate": 9.245095761403555e-07, "loss": 0.4471, "step": 7308 }, { "epoch": 0.8053994490358126, "grad_norm": 9.94813346862793, "learning_rate": 9.234969152713475e-07, "loss": 0.3688, "step": 7309 }, { "epoch": 0.8055096418732782, "grad_norm": 12.886780738830566, "learning_rate": 9.224847528826725e-07, "loss": 0.4914, "step": 7310 }, { "epoch": 0.8056198347107438, "grad_norm": 7.919636249542236, "learning_rate": 9.214730890981005e-07, "loss": 0.3961, "step": 7311 }, { "epoch": 0.8057300275482093, "grad_norm": 11.858296394348145, "learning_rate": 9.204619240413409e-07, "loss": 0.435, "step": 7312 }, { "epoch": 0.8058402203856749, "grad_norm": 6.879891395568848, "learning_rate": 9.194512578360377e-07, "loss": 0.443, "step": 7313 }, { "epoch": 0.8059504132231405, "grad_norm": 10.284538269042969, "learning_rate": 9.184410906057773e-07, "loss": 0.421, "step": 7314 }, { "epoch": 0.806060606060606, "grad_norm": 10.9752836227417, "learning_rate": 9.174314224740844e-07, "loss": 0.5039, "step": 7315 }, { "epoch": 0.8061707988980716, "grad_norm": 11.572310447692871, "learning_rate": 9.164222535644241e-07, "loss": 0.417, "step": 7316 }, { "epoch": 0.8062809917355372, "grad_norm": 6.825654029846191, "learning_rate": 9.154135840001965e-07, "loss": 0.3913, "step": 7317 }, { "epoch": 0.8063911845730027, "grad_norm": 9.148991584777832, "learning_rate": 9.144054139047442e-07, "loss": 0.4296, "step": 7318 }, { "epoch": 0.8065013774104683, "grad_norm": 4.8619890213012695, "learning_rate": 9.133977434013485e-07, "loss": 0.3764, "step": 7319 }, { "epoch": 0.8066115702479338, "grad_norm": 5.3864030838012695, "learning_rate": 9.12390572613227e-07, "loss": 0.3388, "step": 7320 }, { "epoch": 0.8067217630853994, "grad_norm": 4.868517875671387, "learning_rate": 9.113839016635389e-07, "loss": 0.345, "step": 7321 }, { "epoch": 0.806831955922865, "grad_norm": 5.957769393920898, "learning_rate": 9.103777306753825e-07, "loss": 0.4002, "step": 7322 }, { "epoch": 0.8069421487603305, "grad_norm": 6.74638032913208, "learning_rate": 9.093720597717909e-07, "loss": 0.4176, "step": 7323 }, { "epoch": 0.8070523415977962, "grad_norm": 4.691101551055908, "learning_rate": 9.083668890757402e-07, "loss": 0.3389, "step": 7324 }, { "epoch": 0.8071625344352618, "grad_norm": 15.070783615112305, "learning_rate": 9.073622187101455e-07, "loss": 0.4583, "step": 7325 }, { "epoch": 0.8072727272727273, "grad_norm": 12.757047653198242, "learning_rate": 9.063580487978579e-07, "loss": 0.3861, "step": 7326 }, { "epoch": 0.8073829201101929, "grad_norm": 6.557346343994141, "learning_rate": 9.053543794616665e-07, "loss": 0.4441, "step": 7327 }, { "epoch": 0.8074931129476584, "grad_norm": 6.969200134277344, "learning_rate": 9.043512108243063e-07, "loss": 0.4289, "step": 7328 }, { "epoch": 0.807603305785124, "grad_norm": 7.515108108520508, "learning_rate": 9.033485430084421e-07, "loss": 0.3323, "step": 7329 }, { "epoch": 0.8077134986225896, "grad_norm": 4.961164951324463, "learning_rate": 9.023463761366824e-07, "loss": 0.3961, "step": 7330 }, { "epoch": 0.8078236914600551, "grad_norm": 7.038287162780762, "learning_rate": 9.013447103315758e-07, "loss": 0.4241, "step": 7331 }, { "epoch": 0.8079338842975207, "grad_norm": 5.465232849121094, "learning_rate": 9.00343545715604e-07, "loss": 0.3273, "step": 7332 }, { "epoch": 0.8080440771349863, "grad_norm": 8.099470138549805, "learning_rate": 8.993428824111932e-07, "loss": 0.3854, "step": 7333 }, { "epoch": 0.8081542699724518, "grad_norm": 3.8248982429504395, "learning_rate": 8.983427205407041e-07, "loss": 0.4089, "step": 7334 }, { "epoch": 0.8082644628099174, "grad_norm": 7.873836517333984, "learning_rate": 8.973430602264388e-07, "loss": 0.3808, "step": 7335 }, { "epoch": 0.8083746556473829, "grad_norm": 6.351470470428467, "learning_rate": 8.963439015906378e-07, "loss": 0.4302, "step": 7336 }, { "epoch": 0.8084848484848485, "grad_norm": 9.827178001403809, "learning_rate": 8.953452447554778e-07, "loss": 0.335, "step": 7337 }, { "epoch": 0.8085950413223141, "grad_norm": 5.265390396118164, "learning_rate": 8.943470898430768e-07, "loss": 0.3765, "step": 7338 }, { "epoch": 0.8087052341597796, "grad_norm": 5.931137561798096, "learning_rate": 8.933494369754919e-07, "loss": 0.3746, "step": 7339 }, { "epoch": 0.8088154269972452, "grad_norm": 5.3239359855651855, "learning_rate": 8.923522862747148e-07, "loss": 0.39, "step": 7340 }, { "epoch": 0.8089256198347108, "grad_norm": 7.077722072601318, "learning_rate": 8.913556378626804e-07, "loss": 0.4648, "step": 7341 }, { "epoch": 0.8090358126721763, "grad_norm": 4.245509147644043, "learning_rate": 8.903594918612601e-07, "loss": 0.3343, "step": 7342 }, { "epoch": 0.8091460055096419, "grad_norm": 6.912594795227051, "learning_rate": 8.893638483922628e-07, "loss": 0.4431, "step": 7343 }, { "epoch": 0.8092561983471075, "grad_norm": 6.49102258682251, "learning_rate": 8.883687075774377e-07, "loss": 0.3481, "step": 7344 }, { "epoch": 0.809366391184573, "grad_norm": 4.468546390533447, "learning_rate": 8.873740695384736e-07, "loss": 0.3958, "step": 7345 }, { "epoch": 0.8094765840220386, "grad_norm": 6.101900577545166, "learning_rate": 8.863799343969931e-07, "loss": 0.4225, "step": 7346 }, { "epoch": 0.8095867768595041, "grad_norm": 8.90814208984375, "learning_rate": 8.853863022745623e-07, "loss": 0.3228, "step": 7347 }, { "epoch": 0.8096969696969697, "grad_norm": 5.18488883972168, "learning_rate": 8.843931732926847e-07, "loss": 0.3955, "step": 7348 }, { "epoch": 0.8098071625344353, "grad_norm": 8.424071311950684, "learning_rate": 8.834005475727991e-07, "loss": 0.3924, "step": 7349 }, { "epoch": 0.8099173553719008, "grad_norm": 14.261128425598145, "learning_rate": 8.824084252362864e-07, "loss": 0.3871, "step": 7350 }, { "epoch": 0.8100275482093664, "grad_norm": 5.683298110961914, "learning_rate": 8.814168064044659e-07, "loss": 0.4796, "step": 7351 }, { "epoch": 0.810137741046832, "grad_norm": 8.727130889892578, "learning_rate": 8.80425691198592e-07, "loss": 0.3456, "step": 7352 }, { "epoch": 0.8102479338842975, "grad_norm": 7.0580339431762695, "learning_rate": 8.794350797398604e-07, "loss": 0.3149, "step": 7353 }, { "epoch": 0.8103581267217631, "grad_norm": 5.936559677124023, "learning_rate": 8.784449721494054e-07, "loss": 0.4401, "step": 7354 }, { "epoch": 0.8104683195592286, "grad_norm": 6.004831790924072, "learning_rate": 8.774553685482968e-07, "loss": 0.423, "step": 7355 }, { "epoch": 0.8105785123966942, "grad_norm": 7.186921119689941, "learning_rate": 8.764662690575454e-07, "loss": 0.3428, "step": 7356 }, { "epoch": 0.8106887052341598, "grad_norm": 8.847982406616211, "learning_rate": 8.754776737981002e-07, "loss": 0.4349, "step": 7357 }, { "epoch": 0.8107988980716253, "grad_norm": 7.745877265930176, "learning_rate": 8.744895828908484e-07, "loss": 0.4443, "step": 7358 }, { "epoch": 0.8109090909090909, "grad_norm": 9.938824653625488, "learning_rate": 8.735019964566149e-07, "loss": 0.4282, "step": 7359 }, { "epoch": 0.8110192837465565, "grad_norm": 7.234878063201904, "learning_rate": 8.725149146161599e-07, "loss": 0.4152, "step": 7360 }, { "epoch": 0.811129476584022, "grad_norm": 11.086160659790039, "learning_rate": 8.715283374901901e-07, "loss": 0.3772, "step": 7361 }, { "epoch": 0.8112396694214876, "grad_norm": 11.494916915893555, "learning_rate": 8.705422651993434e-07, "loss": 0.4294, "step": 7362 }, { "epoch": 0.8113498622589531, "grad_norm": 13.305935859680176, "learning_rate": 8.69556697864195e-07, "loss": 0.4527, "step": 7363 }, { "epoch": 0.8114600550964187, "grad_norm": 5.638788223266602, "learning_rate": 8.68571635605267e-07, "loss": 0.3419, "step": 7364 }, { "epoch": 0.8115702479338843, "grad_norm": 7.193055629730225, "learning_rate": 8.675870785430113e-07, "loss": 0.3688, "step": 7365 }, { "epoch": 0.8116804407713498, "grad_norm": 4.8958048820495605, "learning_rate": 8.666030267978199e-07, "loss": 0.4236, "step": 7366 }, { "epoch": 0.8117906336088154, "grad_norm": 7.572325229644775, "learning_rate": 8.656194804900254e-07, "loss": 0.3654, "step": 7367 }, { "epoch": 0.811900826446281, "grad_norm": 8.064297676086426, "learning_rate": 8.64636439739897e-07, "loss": 0.4437, "step": 7368 }, { "epoch": 0.8120110192837465, "grad_norm": 9.213482856750488, "learning_rate": 8.636539046676418e-07, "loss": 0.3727, "step": 7369 }, { "epoch": 0.8121212121212121, "grad_norm": 7.8315653800964355, "learning_rate": 8.626718753934055e-07, "loss": 0.3821, "step": 7370 }, { "epoch": 0.8122314049586777, "grad_norm": 7.829998970031738, "learning_rate": 8.616903520372721e-07, "loss": 0.3698, "step": 7371 }, { "epoch": 0.8123415977961432, "grad_norm": 8.769179344177246, "learning_rate": 8.607093347192652e-07, "loss": 0.3621, "step": 7372 }, { "epoch": 0.8124517906336088, "grad_norm": 8.211814880371094, "learning_rate": 8.597288235593426e-07, "loss": 0.5154, "step": 7373 }, { "epoch": 0.8125619834710743, "grad_norm": 7.744908332824707, "learning_rate": 8.587488186774029e-07, "loss": 0.4354, "step": 7374 }, { "epoch": 0.8126721763085399, "grad_norm": 5.910334587097168, "learning_rate": 8.577693201932846e-07, "loss": 0.4104, "step": 7375 }, { "epoch": 0.8127823691460055, "grad_norm": 6.860529899597168, "learning_rate": 8.567903282267593e-07, "loss": 0.3656, "step": 7376 }, { "epoch": 0.812892561983471, "grad_norm": 5.221512317657471, "learning_rate": 8.558118428975404e-07, "loss": 0.3476, "step": 7377 }, { "epoch": 0.8130027548209366, "grad_norm": 6.213238716125488, "learning_rate": 8.548338643252796e-07, "loss": 0.3832, "step": 7378 }, { "epoch": 0.8131129476584023, "grad_norm": 4.635070323944092, "learning_rate": 8.53856392629564e-07, "loss": 0.3872, "step": 7379 }, { "epoch": 0.8132231404958677, "grad_norm": 6.566573143005371, "learning_rate": 8.528794279299201e-07, "loss": 0.3391, "step": 7380 }, { "epoch": 0.8133333333333334, "grad_norm": 5.124295234680176, "learning_rate": 8.519029703458148e-07, "loss": 0.3496, "step": 7381 }, { "epoch": 0.8134435261707988, "grad_norm": 6.284134864807129, "learning_rate": 8.509270199966474e-07, "loss": 0.3601, "step": 7382 }, { "epoch": 0.8135537190082645, "grad_norm": 4.887261390686035, "learning_rate": 8.499515770017603e-07, "loss": 0.3419, "step": 7383 }, { "epoch": 0.8136639118457301, "grad_norm": 9.922983169555664, "learning_rate": 8.489766414804323e-07, "loss": 0.4235, "step": 7384 }, { "epoch": 0.8137741046831956, "grad_norm": 5.538818359375, "learning_rate": 8.480022135518784e-07, "loss": 0.3737, "step": 7385 }, { "epoch": 0.8138842975206612, "grad_norm": 4.689999580383301, "learning_rate": 8.470282933352536e-07, "loss": 0.3563, "step": 7386 }, { "epoch": 0.8139944903581268, "grad_norm": 6.984789848327637, "learning_rate": 8.460548809496516e-07, "loss": 0.3945, "step": 7387 }, { "epoch": 0.8141046831955923, "grad_norm": 6.579164505004883, "learning_rate": 8.450819765141e-07, "loss": 0.3862, "step": 7388 }, { "epoch": 0.8142148760330579, "grad_norm": 7.291015625, "learning_rate": 8.441095801475685e-07, "loss": 0.4248, "step": 7389 }, { "epoch": 0.8143250688705234, "grad_norm": 7.309769630432129, "learning_rate": 8.431376919689638e-07, "loss": 0.3647, "step": 7390 }, { "epoch": 0.814435261707989, "grad_norm": 7.58947229385376, "learning_rate": 8.421663120971274e-07, "loss": 0.435, "step": 7391 }, { "epoch": 0.8145454545454546, "grad_norm": 5.637911796569824, "learning_rate": 8.411954406508438e-07, "loss": 0.2922, "step": 7392 }, { "epoch": 0.8146556473829201, "grad_norm": 7.525227069854736, "learning_rate": 8.402250777488291e-07, "loss": 0.3679, "step": 7393 }, { "epoch": 0.8147658402203857, "grad_norm": 17.968149185180664, "learning_rate": 8.392552235097429e-07, "loss": 0.3763, "step": 7394 }, { "epoch": 0.8148760330578513, "grad_norm": 3.8819832801818848, "learning_rate": 8.382858780521807e-07, "loss": 0.4402, "step": 7395 }, { "epoch": 0.8149862258953168, "grad_norm": 5.026881694793701, "learning_rate": 8.373170414946735e-07, "loss": 0.4128, "step": 7396 }, { "epoch": 0.8150964187327824, "grad_norm": 5.453049659729004, "learning_rate": 8.363487139556925e-07, "loss": 0.4167, "step": 7397 }, { "epoch": 0.815206611570248, "grad_norm": 11.708805084228516, "learning_rate": 8.35380895553648e-07, "loss": 0.4392, "step": 7398 }, { "epoch": 0.8153168044077135, "grad_norm": 7.500401496887207, "learning_rate": 8.344135864068837e-07, "loss": 0.3301, "step": 7399 }, { "epoch": 0.8154269972451791, "grad_norm": 6.323413372039795, "learning_rate": 8.334467866336843e-07, "loss": 0.4216, "step": 7400 }, { "epoch": 0.8155371900826446, "grad_norm": 8.722508430480957, "learning_rate": 8.324804963522726e-07, "loss": 0.4156, "step": 7401 }, { "epoch": 0.8156473829201102, "grad_norm": 10.26689624786377, "learning_rate": 8.315147156808057e-07, "loss": 0.3739, "step": 7402 }, { "epoch": 0.8157575757575758, "grad_norm": 4.217180252075195, "learning_rate": 8.305494447373813e-07, "loss": 0.3765, "step": 7403 }, { "epoch": 0.8158677685950413, "grad_norm": 4.736955165863037, "learning_rate": 8.295846836400362e-07, "loss": 0.4242, "step": 7404 }, { "epoch": 0.8159779614325069, "grad_norm": 7.2362799644470215, "learning_rate": 8.286204325067393e-07, "loss": 0.5049, "step": 7405 }, { "epoch": 0.8160881542699725, "grad_norm": 8.120038032531738, "learning_rate": 8.276566914554023e-07, "loss": 0.3996, "step": 7406 }, { "epoch": 0.816198347107438, "grad_norm": 7.081818103790283, "learning_rate": 8.26693460603874e-07, "loss": 0.3563, "step": 7407 }, { "epoch": 0.8163085399449036, "grad_norm": 5.156095504760742, "learning_rate": 8.257307400699372e-07, "loss": 0.3981, "step": 7408 }, { "epoch": 0.8164187327823691, "grad_norm": 13.11861515045166, "learning_rate": 8.247685299713154e-07, "loss": 0.3828, "step": 7409 }, { "epoch": 0.8165289256198347, "grad_norm": 13.410127639770508, "learning_rate": 8.238068304256707e-07, "loss": 0.4772, "step": 7410 }, { "epoch": 0.8166391184573003, "grad_norm": 5.385002613067627, "learning_rate": 8.22845641550598e-07, "loss": 0.4295, "step": 7411 }, { "epoch": 0.8167493112947658, "grad_norm": 11.29184627532959, "learning_rate": 8.218849634636345e-07, "loss": 0.4429, "step": 7412 }, { "epoch": 0.8168595041322314, "grad_norm": 11.415976524353027, "learning_rate": 8.209247962822531e-07, "loss": 0.4586, "step": 7413 }, { "epoch": 0.816969696969697, "grad_norm": 5.033871650695801, "learning_rate": 8.199651401238656e-07, "loss": 0.4098, "step": 7414 }, { "epoch": 0.8170798898071625, "grad_norm": 7.05573034286499, "learning_rate": 8.190059951058177e-07, "loss": 0.4453, "step": 7415 }, { "epoch": 0.8171900826446281, "grad_norm": 7.5459160804748535, "learning_rate": 8.18047361345396e-07, "loss": 0.4227, "step": 7416 }, { "epoch": 0.8173002754820937, "grad_norm": 11.135947227478027, "learning_rate": 8.170892389598245e-07, "loss": 0.4486, "step": 7417 }, { "epoch": 0.8174104683195592, "grad_norm": 9.077187538146973, "learning_rate": 8.161316280662629e-07, "loss": 0.4363, "step": 7418 }, { "epoch": 0.8175206611570248, "grad_norm": 8.63121223449707, "learning_rate": 8.151745287818069e-07, "loss": 0.4338, "step": 7419 }, { "epoch": 0.8176308539944903, "grad_norm": 5.781494140625, "learning_rate": 8.142179412234963e-07, "loss": 0.3854, "step": 7420 }, { "epoch": 0.8177410468319559, "grad_norm": 6.21343994140625, "learning_rate": 8.132618655083014e-07, "loss": 0.4058, "step": 7421 }, { "epoch": 0.8178512396694215, "grad_norm": 6.685483455657959, "learning_rate": 8.123063017531308e-07, "loss": 0.3797, "step": 7422 }, { "epoch": 0.817961432506887, "grad_norm": 6.716452598571777, "learning_rate": 8.113512500748361e-07, "loss": 0.4155, "step": 7423 }, { "epoch": 0.8180716253443526, "grad_norm": 10.566768646240234, "learning_rate": 8.103967105902e-07, "loss": 0.4461, "step": 7424 }, { "epoch": 0.8181818181818182, "grad_norm": 6.36700963973999, "learning_rate": 8.094426834159447e-07, "loss": 0.4079, "step": 7425 }, { "epoch": 0.8182920110192837, "grad_norm": 6.516602516174316, "learning_rate": 8.084891686687296e-07, "loss": 0.4678, "step": 7426 }, { "epoch": 0.8184022038567493, "grad_norm": 7.448019981384277, "learning_rate": 8.075361664651532e-07, "loss": 0.3733, "step": 7427 }, { "epoch": 0.8185123966942148, "grad_norm": 6.607315540313721, "learning_rate": 8.065836769217499e-07, "loss": 0.4473, "step": 7428 }, { "epoch": 0.8186225895316804, "grad_norm": 8.278419494628906, "learning_rate": 8.056317001549902e-07, "loss": 0.3997, "step": 7429 }, { "epoch": 0.818732782369146, "grad_norm": 4.643522262573242, "learning_rate": 8.046802362812833e-07, "loss": 0.4545, "step": 7430 }, { "epoch": 0.8188429752066115, "grad_norm": 4.523182392120361, "learning_rate": 8.03729285416977e-07, "loss": 0.3383, "step": 7431 }, { "epoch": 0.8189531680440771, "grad_norm": 9.416790962219238, "learning_rate": 8.027788476783527e-07, "loss": 0.4083, "step": 7432 }, { "epoch": 0.8190633608815427, "grad_norm": 4.700539588928223, "learning_rate": 8.018289231816323e-07, "loss": 0.3711, "step": 7433 }, { "epoch": 0.8191735537190082, "grad_norm": 5.3888750076293945, "learning_rate": 8.008795120429752e-07, "loss": 0.3789, "step": 7434 }, { "epoch": 0.8192837465564738, "grad_norm": 4.854864120483398, "learning_rate": 7.999306143784741e-07, "loss": 0.4277, "step": 7435 }, { "epoch": 0.8193939393939393, "grad_norm": 5.642134666442871, "learning_rate": 7.989822303041622e-07, "loss": 0.4036, "step": 7436 }, { "epoch": 0.819504132231405, "grad_norm": 6.250295639038086, "learning_rate": 7.980343599360113e-07, "loss": 0.3542, "step": 7437 }, { "epoch": 0.8196143250688706, "grad_norm": 9.971394538879395, "learning_rate": 7.970870033899253e-07, "loss": 0.4408, "step": 7438 }, { "epoch": 0.819724517906336, "grad_norm": 5.384455680847168, "learning_rate": 7.961401607817499e-07, "loss": 0.4375, "step": 7439 }, { "epoch": 0.8198347107438017, "grad_norm": 5.423167705535889, "learning_rate": 7.951938322272673e-07, "loss": 0.3207, "step": 7440 }, { "epoch": 0.8199449035812673, "grad_norm": 8.072731018066406, "learning_rate": 7.94248017842193e-07, "loss": 0.4668, "step": 7441 }, { "epoch": 0.8200550964187328, "grad_norm": 4.293138027191162, "learning_rate": 7.933027177421842e-07, "loss": 0.4011, "step": 7442 }, { "epoch": 0.8201652892561984, "grad_norm": 10.118760108947754, "learning_rate": 7.923579320428342e-07, "loss": 0.4398, "step": 7443 }, { "epoch": 0.820275482093664, "grad_norm": 6.136846542358398, "learning_rate": 7.914136608596712e-07, "loss": 0.4212, "step": 7444 }, { "epoch": 0.8203856749311295, "grad_norm": 5.816466808319092, "learning_rate": 7.904699043081621e-07, "loss": 0.3057, "step": 7445 }, { "epoch": 0.8204958677685951, "grad_norm": 4.639345169067383, "learning_rate": 7.895266625037124e-07, "loss": 0.3567, "step": 7446 }, { "epoch": 0.8206060606060606, "grad_norm": 7.182546138763428, "learning_rate": 7.885839355616609e-07, "loss": 0.3723, "step": 7447 }, { "epoch": 0.8207162534435262, "grad_norm": 4.012908935546875, "learning_rate": 7.876417235972861e-07, "loss": 0.3462, "step": 7448 }, { "epoch": 0.8208264462809918, "grad_norm": 5.623358249664307, "learning_rate": 7.867000267258045e-07, "loss": 0.4108, "step": 7449 }, { "epoch": 0.8209366391184573, "grad_norm": 4.929952144622803, "learning_rate": 7.857588450623654e-07, "loss": 0.3243, "step": 7450 }, { "epoch": 0.8210468319559229, "grad_norm": 8.155301094055176, "learning_rate": 7.84818178722061e-07, "loss": 0.3344, "step": 7451 }, { "epoch": 0.8211570247933885, "grad_norm": 6.135559558868408, "learning_rate": 7.838780278199137e-07, "loss": 0.3263, "step": 7452 }, { "epoch": 0.821267217630854, "grad_norm": 7.614686965942383, "learning_rate": 7.829383924708889e-07, "loss": 0.3218, "step": 7453 }, { "epoch": 0.8213774104683196, "grad_norm": 7.434074401855469, "learning_rate": 7.819992727898862e-07, "loss": 0.403, "step": 7454 }, { "epoch": 0.8214876033057851, "grad_norm": 9.409198760986328, "learning_rate": 7.8106066889174e-07, "loss": 0.3881, "step": 7455 }, { "epoch": 0.8215977961432507, "grad_norm": 4.896434307098389, "learning_rate": 7.801225808912288e-07, "loss": 0.402, "step": 7456 }, { "epoch": 0.8217079889807163, "grad_norm": 6.69950008392334, "learning_rate": 7.791850089030601e-07, "loss": 0.3975, "step": 7457 }, { "epoch": 0.8218181818181818, "grad_norm": 6.383553981781006, "learning_rate": 7.782479530418807e-07, "loss": 0.3964, "step": 7458 }, { "epoch": 0.8219283746556474, "grad_norm": 6.325447082519531, "learning_rate": 7.773114134222765e-07, "loss": 0.3689, "step": 7459 }, { "epoch": 0.822038567493113, "grad_norm": 7.699170112609863, "learning_rate": 7.763753901587695e-07, "loss": 0.3063, "step": 7460 }, { "epoch": 0.8221487603305785, "grad_norm": 6.865559101104736, "learning_rate": 7.754398833658161e-07, "loss": 0.3655, "step": 7461 }, { "epoch": 0.8222589531680441, "grad_norm": 5.2361369132995605, "learning_rate": 7.745048931578125e-07, "loss": 0.3859, "step": 7462 }, { "epoch": 0.8223691460055096, "grad_norm": 9.958680152893066, "learning_rate": 7.735704196490911e-07, "loss": 0.4336, "step": 7463 }, { "epoch": 0.8224793388429752, "grad_norm": 5.9230570793151855, "learning_rate": 7.72636462953919e-07, "loss": 0.3634, "step": 7464 }, { "epoch": 0.8225895316804408, "grad_norm": 9.550772666931152, "learning_rate": 7.71703023186502e-07, "loss": 0.4396, "step": 7465 }, { "epoch": 0.8226997245179063, "grad_norm": 5.828458309173584, "learning_rate": 7.707701004609846e-07, "loss": 0.3809, "step": 7466 }, { "epoch": 0.8228099173553719, "grad_norm": 6.5030059814453125, "learning_rate": 7.698376948914426e-07, "loss": 0.37, "step": 7467 }, { "epoch": 0.8229201101928375, "grad_norm": 5.650920867919922, "learning_rate": 7.689058065918937e-07, "loss": 0.3444, "step": 7468 }, { "epoch": 0.823030303030303, "grad_norm": 4.907332897186279, "learning_rate": 7.679744356762897e-07, "loss": 0.3816, "step": 7469 }, { "epoch": 0.8231404958677686, "grad_norm": 11.659539222717285, "learning_rate": 7.67043582258522e-07, "loss": 0.4082, "step": 7470 }, { "epoch": 0.8232506887052342, "grad_norm": 5.233342170715332, "learning_rate": 7.661132464524135e-07, "loss": 0.4045, "step": 7471 }, { "epoch": 0.8233608815426997, "grad_norm": 5.3335161209106445, "learning_rate": 7.651834283717286e-07, "loss": 0.3777, "step": 7472 }, { "epoch": 0.8234710743801653, "grad_norm": 6.997351169586182, "learning_rate": 7.642541281301674e-07, "loss": 0.3797, "step": 7473 }, { "epoch": 0.8235812672176308, "grad_norm": 6.346808910369873, "learning_rate": 7.633253458413653e-07, "loss": 0.39, "step": 7474 }, { "epoch": 0.8236914600550964, "grad_norm": 6.893392562866211, "learning_rate": 7.623970816188925e-07, "loss": 0.3167, "step": 7475 }, { "epoch": 0.823801652892562, "grad_norm": 6.692793369293213, "learning_rate": 7.614693355762632e-07, "loss": 0.404, "step": 7476 }, { "epoch": 0.8239118457300275, "grad_norm": 8.373234748840332, "learning_rate": 7.605421078269209e-07, "loss": 0.4487, "step": 7477 }, { "epoch": 0.8240220385674931, "grad_norm": 16.04227066040039, "learning_rate": 7.596153984842464e-07, "loss": 0.5009, "step": 7478 }, { "epoch": 0.8241322314049587, "grad_norm": 8.250227928161621, "learning_rate": 7.586892076615632e-07, "loss": 0.4597, "step": 7479 }, { "epoch": 0.8242424242424242, "grad_norm": 6.846471309661865, "learning_rate": 7.577635354721247e-07, "loss": 0.386, "step": 7480 }, { "epoch": 0.8243526170798898, "grad_norm": 5.904123783111572, "learning_rate": 7.568383820291214e-07, "loss": 0.4399, "step": 7481 }, { "epoch": 0.8244628099173553, "grad_norm": 7.321972846984863, "learning_rate": 7.559137474456868e-07, "loss": 0.4062, "step": 7482 }, { "epoch": 0.8245730027548209, "grad_norm": 4.483993053436279, "learning_rate": 7.549896318348826e-07, "loss": 0.3509, "step": 7483 }, { "epoch": 0.8246831955922865, "grad_norm": 8.56092643737793, "learning_rate": 7.540660353097146e-07, "loss": 0.3823, "step": 7484 }, { "epoch": 0.824793388429752, "grad_norm": 5.048705577850342, "learning_rate": 7.531429579831173e-07, "loss": 0.4145, "step": 7485 }, { "epoch": 0.8249035812672176, "grad_norm": 7.269402503967285, "learning_rate": 7.522203999679684e-07, "loss": 0.4043, "step": 7486 }, { "epoch": 0.8250137741046832, "grad_norm": 6.417334079742432, "learning_rate": 7.512983613770797e-07, "loss": 0.4195, "step": 7487 }, { "epoch": 0.8251239669421487, "grad_norm": 5.412443161010742, "learning_rate": 7.503768423231983e-07, "loss": 0.3367, "step": 7488 }, { "epoch": 0.8252341597796143, "grad_norm": 6.628684997558594, "learning_rate": 7.494558429190085e-07, "loss": 0.3736, "step": 7489 }, { "epoch": 0.8253443526170798, "grad_norm": 7.438686370849609, "learning_rate": 7.485353632771336e-07, "loss": 0.3753, "step": 7490 }, { "epoch": 0.8254545454545454, "grad_norm": 5.179122447967529, "learning_rate": 7.476154035101279e-07, "loss": 0.3946, "step": 7491 }, { "epoch": 0.825564738292011, "grad_norm": 6.319139003753662, "learning_rate": 7.466959637304871e-07, "loss": 0.4079, "step": 7492 }, { "epoch": 0.8256749311294765, "grad_norm": 6.583497047424316, "learning_rate": 7.457770440506429e-07, "loss": 0.3306, "step": 7493 }, { "epoch": 0.8257851239669421, "grad_norm": 4.10272741317749, "learning_rate": 7.448586445829592e-07, "loss": 0.3673, "step": 7494 }, { "epoch": 0.8258953168044078, "grad_norm": 8.826009750366211, "learning_rate": 7.439407654397402e-07, "loss": 0.3622, "step": 7495 }, { "epoch": 0.8260055096418732, "grad_norm": 6.662688255310059, "learning_rate": 7.43023406733227e-07, "loss": 0.4372, "step": 7496 }, { "epoch": 0.8261157024793389, "grad_norm": 16.362524032592773, "learning_rate": 7.421065685755935e-07, "loss": 0.3762, "step": 7497 }, { "epoch": 0.8262258953168045, "grad_norm": 7.434336185455322, "learning_rate": 7.41190251078952e-07, "loss": 0.4018, "step": 7498 }, { "epoch": 0.82633608815427, "grad_norm": 6.320937156677246, "learning_rate": 7.402744543553531e-07, "loss": 0.3997, "step": 7499 }, { "epoch": 0.8264462809917356, "grad_norm": 5.576345443725586, "learning_rate": 7.393591785167786e-07, "loss": 0.3733, "step": 7500 }, { "epoch": 0.826556473829201, "grad_norm": 15.954998970031738, "learning_rate": 7.384444236751514e-07, "loss": 0.5084, "step": 7501 }, { "epoch": 0.8266666666666667, "grad_norm": 7.5200886726379395, "learning_rate": 7.375301899423304e-07, "loss": 0.4141, "step": 7502 }, { "epoch": 0.8267768595041323, "grad_norm": 7.538036346435547, "learning_rate": 7.36616477430106e-07, "loss": 0.3081, "step": 7503 }, { "epoch": 0.8268870523415978, "grad_norm": 9.011994361877441, "learning_rate": 7.357032862502106e-07, "loss": 0.4073, "step": 7504 }, { "epoch": 0.8269972451790634, "grad_norm": 6.264695167541504, "learning_rate": 7.34790616514311e-07, "loss": 0.4056, "step": 7505 }, { "epoch": 0.827107438016529, "grad_norm": 6.546352863311768, "learning_rate": 7.338784683340067e-07, "loss": 0.3476, "step": 7506 }, { "epoch": 0.8272176308539945, "grad_norm": 7.9584197998046875, "learning_rate": 7.329668418208386e-07, "loss": 0.383, "step": 7507 }, { "epoch": 0.8273278236914601, "grad_norm": 6.149013996124268, "learning_rate": 7.320557370862824e-07, "loss": 0.4527, "step": 7508 }, { "epoch": 0.8274380165289256, "grad_norm": 7.031678199768066, "learning_rate": 7.311451542417469e-07, "loss": 0.3523, "step": 7509 }, { "epoch": 0.8275482093663912, "grad_norm": 5.1706624031066895, "learning_rate": 7.302350933985819e-07, "loss": 0.4124, "step": 7510 }, { "epoch": 0.8276584022038568, "grad_norm": 5.587656497955322, "learning_rate": 7.293255546680678e-07, "loss": 0.4128, "step": 7511 }, { "epoch": 0.8277685950413223, "grad_norm": 6.654507637023926, "learning_rate": 7.284165381614278e-07, "loss": 0.4302, "step": 7512 }, { "epoch": 0.8278787878787879, "grad_norm": 11.286688804626465, "learning_rate": 7.275080439898158e-07, "loss": 0.402, "step": 7513 }, { "epoch": 0.8279889807162535, "grad_norm": 7.159271717071533, "learning_rate": 7.266000722643213e-07, "loss": 0.4005, "step": 7514 }, { "epoch": 0.828099173553719, "grad_norm": 6.855350017547607, "learning_rate": 7.256926230959776e-07, "loss": 0.3725, "step": 7515 }, { "epoch": 0.8282093663911846, "grad_norm": 7.471316814422607, "learning_rate": 7.247856965957456e-07, "loss": 0.4115, "step": 7516 }, { "epoch": 0.8283195592286502, "grad_norm": 9.73602294921875, "learning_rate": 7.238792928745247e-07, "loss": 0.4839, "step": 7517 }, { "epoch": 0.8284297520661157, "grad_norm": 5.6931471824646, "learning_rate": 7.229734120431531e-07, "loss": 0.3588, "step": 7518 }, { "epoch": 0.8285399449035813, "grad_norm": 5.699717998504639, "learning_rate": 7.220680542124031e-07, "loss": 0.3021, "step": 7519 }, { "epoch": 0.8286501377410468, "grad_norm": 8.98326587677002, "learning_rate": 7.211632194929813e-07, "loss": 0.3568, "step": 7520 }, { "epoch": 0.8287603305785124, "grad_norm": 5.933770179748535, "learning_rate": 7.20258907995533e-07, "loss": 0.4108, "step": 7521 }, { "epoch": 0.828870523415978, "grad_norm": 5.496316432952881, "learning_rate": 7.193551198306408e-07, "loss": 0.3757, "step": 7522 }, { "epoch": 0.8289807162534435, "grad_norm": 9.473560333251953, "learning_rate": 7.184518551088176e-07, "loss": 0.4775, "step": 7523 }, { "epoch": 0.8290909090909091, "grad_norm": 6.112762451171875, "learning_rate": 7.175491139405172e-07, "loss": 0.3568, "step": 7524 }, { "epoch": 0.8292011019283747, "grad_norm": 9.144979476928711, "learning_rate": 7.166468964361289e-07, "loss": 0.3841, "step": 7525 }, { "epoch": 0.8293112947658402, "grad_norm": 6.507180690765381, "learning_rate": 7.157452027059769e-07, "loss": 0.3955, "step": 7526 }, { "epoch": 0.8294214876033058, "grad_norm": 9.648252487182617, "learning_rate": 7.148440328603206e-07, "loss": 0.3757, "step": 7527 }, { "epoch": 0.8295316804407713, "grad_norm": 5.539919376373291, "learning_rate": 7.139433870093565e-07, "loss": 0.414, "step": 7528 }, { "epoch": 0.8296418732782369, "grad_norm": 6.198495864868164, "learning_rate": 7.130432652632179e-07, "loss": 0.3635, "step": 7529 }, { "epoch": 0.8297520661157025, "grad_norm": 10.395768165588379, "learning_rate": 7.121436677319715e-07, "loss": 0.4508, "step": 7530 }, { "epoch": 0.829862258953168, "grad_norm": 6.952027797698975, "learning_rate": 7.112445945256219e-07, "loss": 0.4318, "step": 7531 }, { "epoch": 0.8299724517906336, "grad_norm": 6.353927135467529, "learning_rate": 7.1034604575411e-07, "loss": 0.4063, "step": 7532 }, { "epoch": 0.8300826446280992, "grad_norm": 7.425856590270996, "learning_rate": 7.094480215273103e-07, "loss": 0.4616, "step": 7533 }, { "epoch": 0.8301928374655647, "grad_norm": 6.667521953582764, "learning_rate": 7.085505219550326e-07, "loss": 0.323, "step": 7534 }, { "epoch": 0.8303030303030303, "grad_norm": 9.246989250183105, "learning_rate": 7.076535471470286e-07, "loss": 0.3977, "step": 7535 }, { "epoch": 0.8304132231404958, "grad_norm": 7.972851753234863, "learning_rate": 7.067570972129795e-07, "loss": 0.4488, "step": 7536 }, { "epoch": 0.8305234159779614, "grad_norm": 6.29607629776001, "learning_rate": 7.058611722625019e-07, "loss": 0.3144, "step": 7537 }, { "epoch": 0.830633608815427, "grad_norm": 8.219892501831055, "learning_rate": 7.049657724051556e-07, "loss": 0.4169, "step": 7538 }, { "epoch": 0.8307438016528925, "grad_norm": 5.537420749664307, "learning_rate": 7.040708977504279e-07, "loss": 0.3664, "step": 7539 }, { "epoch": 0.8308539944903581, "grad_norm": 6.648195743560791, "learning_rate": 7.031765484077463e-07, "loss": 0.4035, "step": 7540 }, { "epoch": 0.8309641873278237, "grad_norm": 9.343284606933594, "learning_rate": 7.022827244864738e-07, "loss": 0.443, "step": 7541 }, { "epoch": 0.8310743801652892, "grad_norm": 4.213461875915527, "learning_rate": 7.013894260959064e-07, "loss": 0.3329, "step": 7542 }, { "epoch": 0.8311845730027548, "grad_norm": 8.926543235778809, "learning_rate": 7.004966533452806e-07, "loss": 0.4323, "step": 7543 }, { "epoch": 0.8312947658402204, "grad_norm": 7.958025932312012, "learning_rate": 6.99604406343763e-07, "loss": 0.4226, "step": 7544 }, { "epoch": 0.8314049586776859, "grad_norm": 8.788220405578613, "learning_rate": 6.987126852004606e-07, "loss": 0.3323, "step": 7545 }, { "epoch": 0.8315151515151515, "grad_norm": 5.215331554412842, "learning_rate": 6.97821490024414e-07, "loss": 0.3899, "step": 7546 }, { "epoch": 0.831625344352617, "grad_norm": 10.267518043518066, "learning_rate": 6.96930820924599e-07, "loss": 0.4865, "step": 7547 }, { "epoch": 0.8317355371900826, "grad_norm": 5.965144157409668, "learning_rate": 6.960406780099282e-07, "loss": 0.3882, "step": 7548 }, { "epoch": 0.8318457300275482, "grad_norm": 12.689521789550781, "learning_rate": 6.951510613892509e-07, "loss": 0.3665, "step": 7549 }, { "epoch": 0.8319559228650137, "grad_norm": 7.58388090133667, "learning_rate": 6.942619711713483e-07, "loss": 0.4159, "step": 7550 }, { "epoch": 0.8320661157024793, "grad_norm": 9.022119522094727, "learning_rate": 6.933734074649406e-07, "loss": 0.481, "step": 7551 }, { "epoch": 0.832176308539945, "grad_norm": 7.578518390655518, "learning_rate": 6.924853703786838e-07, "loss": 0.3811, "step": 7552 }, { "epoch": 0.8322865013774104, "grad_norm": 4.457272052764893, "learning_rate": 6.915978600211654e-07, "loss": 0.3322, "step": 7553 }, { "epoch": 0.832396694214876, "grad_norm": 7.8011064529418945, "learning_rate": 6.907108765009136e-07, "loss": 0.4161, "step": 7554 }, { "epoch": 0.8325068870523415, "grad_norm": 6.120491981506348, "learning_rate": 6.898244199263904e-07, "loss": 0.3595, "step": 7555 }, { "epoch": 0.8326170798898072, "grad_norm": 9.96679401397705, "learning_rate": 6.889384904059909e-07, "loss": 0.4497, "step": 7556 }, { "epoch": 0.8327272727272728, "grad_norm": 6.320555686950684, "learning_rate": 6.88053088048049e-07, "loss": 0.3224, "step": 7557 }, { "epoch": 0.8328374655647383, "grad_norm": 6.006103515625, "learning_rate": 6.871682129608332e-07, "loss": 0.3396, "step": 7558 }, { "epoch": 0.8329476584022039, "grad_norm": 6.8017683029174805, "learning_rate": 6.862838652525461e-07, "loss": 0.357, "step": 7559 }, { "epoch": 0.8330578512396695, "grad_norm": 19.439645767211914, "learning_rate": 6.854000450313275e-07, "loss": 0.4138, "step": 7560 }, { "epoch": 0.833168044077135, "grad_norm": 10.255168914794922, "learning_rate": 6.845167524052531e-07, "loss": 0.4535, "step": 7561 }, { "epoch": 0.8332782369146006, "grad_norm": 5.926719665527344, "learning_rate": 6.836339874823311e-07, "loss": 0.342, "step": 7562 }, { "epoch": 0.8333884297520661, "grad_norm": 9.165303230285645, "learning_rate": 6.827517503705089e-07, "loss": 0.404, "step": 7563 }, { "epoch": 0.8334986225895317, "grad_norm": 4.787303924560547, "learning_rate": 6.818700411776674e-07, "loss": 0.3747, "step": 7564 }, { "epoch": 0.8336088154269973, "grad_norm": 9.279829025268555, "learning_rate": 6.809888600116221e-07, "loss": 0.3478, "step": 7565 }, { "epoch": 0.8337190082644628, "grad_norm": 9.262317657470703, "learning_rate": 6.801082069801268e-07, "loss": 0.4606, "step": 7566 }, { "epoch": 0.8338292011019284, "grad_norm": 6.749006271362305, "learning_rate": 6.792280821908659e-07, "loss": 0.3707, "step": 7567 }, { "epoch": 0.833939393939394, "grad_norm": 7.549883842468262, "learning_rate": 6.783484857514666e-07, "loss": 0.4643, "step": 7568 }, { "epoch": 0.8340495867768595, "grad_norm": 5.3465399742126465, "learning_rate": 6.774694177694846e-07, "loss": 0.3999, "step": 7569 }, { "epoch": 0.8341597796143251, "grad_norm": 3.93556547164917, "learning_rate": 6.765908783524116e-07, "loss": 0.4035, "step": 7570 }, { "epoch": 0.8342699724517907, "grad_norm": 5.638111114501953, "learning_rate": 6.757128676076813e-07, "loss": 0.3956, "step": 7571 }, { "epoch": 0.8343801652892562, "grad_norm": 12.791842460632324, "learning_rate": 6.748353856426553e-07, "loss": 0.4195, "step": 7572 }, { "epoch": 0.8344903581267218, "grad_norm": 7.686941623687744, "learning_rate": 6.739584325646314e-07, "loss": 0.4666, "step": 7573 }, { "epoch": 0.8346005509641873, "grad_norm": 4.8445611000061035, "learning_rate": 6.730820084808487e-07, "loss": 0.408, "step": 7574 }, { "epoch": 0.8347107438016529, "grad_norm": 12.405566215515137, "learning_rate": 6.722061134984759e-07, "loss": 0.4039, "step": 7575 }, { "epoch": 0.8348209366391185, "grad_norm": 9.464533805847168, "learning_rate": 6.713307477246168e-07, "loss": 0.2975, "step": 7576 }, { "epoch": 0.834931129476584, "grad_norm": 6.935895919799805, "learning_rate": 6.70455911266314e-07, "loss": 0.3776, "step": 7577 }, { "epoch": 0.8350413223140496, "grad_norm": 6.494793891906738, "learning_rate": 6.695816042305441e-07, "loss": 0.3186, "step": 7578 }, { "epoch": 0.8351515151515152, "grad_norm": 7.261117935180664, "learning_rate": 6.687078267242176e-07, "loss": 0.3739, "step": 7579 }, { "epoch": 0.8352617079889807, "grad_norm": 5.465384006500244, "learning_rate": 6.678345788541807e-07, "loss": 0.3734, "step": 7580 }, { "epoch": 0.8353719008264463, "grad_norm": 8.419529914855957, "learning_rate": 6.669618607272166e-07, "loss": 0.4178, "step": 7581 }, { "epoch": 0.8354820936639118, "grad_norm": 6.401289463043213, "learning_rate": 6.660896724500432e-07, "loss": 0.3982, "step": 7582 }, { "epoch": 0.8355922865013774, "grad_norm": 6.065435886383057, "learning_rate": 6.652180141293107e-07, "loss": 0.4255, "step": 7583 }, { "epoch": 0.835702479338843, "grad_norm": 5.099431037902832, "learning_rate": 6.643468858716074e-07, "loss": 0.4164, "step": 7584 }, { "epoch": 0.8358126721763085, "grad_norm": 7.421755790710449, "learning_rate": 6.634762877834578e-07, "loss": 0.4291, "step": 7585 }, { "epoch": 0.8359228650137741, "grad_norm": 4.8282318115234375, "learning_rate": 6.626062199713168e-07, "loss": 0.3984, "step": 7586 }, { "epoch": 0.8360330578512397, "grad_norm": 7.492334365844727, "learning_rate": 6.617366825415788e-07, "loss": 0.4239, "step": 7587 }, { "epoch": 0.8361432506887052, "grad_norm": 6.620091438293457, "learning_rate": 6.608676756005738e-07, "loss": 0.2626, "step": 7588 }, { "epoch": 0.8362534435261708, "grad_norm": 13.550325393676758, "learning_rate": 6.599991992545624e-07, "loss": 0.5648, "step": 7589 }, { "epoch": 0.8363636363636363, "grad_norm": 6.486863136291504, "learning_rate": 6.591312536097438e-07, "loss": 0.403, "step": 7590 }, { "epoch": 0.8364738292011019, "grad_norm": 7.239338397979736, "learning_rate": 6.582638387722534e-07, "loss": 0.4628, "step": 7591 }, { "epoch": 0.8365840220385675, "grad_norm": 6.446137428283691, "learning_rate": 6.573969548481585e-07, "loss": 0.3246, "step": 7592 }, { "epoch": 0.836694214876033, "grad_norm": 8.6100435256958, "learning_rate": 6.565306019434603e-07, "loss": 0.419, "step": 7593 }, { "epoch": 0.8368044077134986, "grad_norm": 18.886690139770508, "learning_rate": 6.556647801641031e-07, "loss": 0.4786, "step": 7594 }, { "epoch": 0.8369146005509642, "grad_norm": 3.9173879623413086, "learning_rate": 6.547994896159559e-07, "loss": 0.3588, "step": 7595 }, { "epoch": 0.8370247933884297, "grad_norm": 6.038689613342285, "learning_rate": 6.5393473040483e-07, "loss": 0.3917, "step": 7596 }, { "epoch": 0.8371349862258953, "grad_norm": 8.612103462219238, "learning_rate": 6.530705026364692e-07, "loss": 0.3718, "step": 7597 }, { "epoch": 0.8372451790633609, "grad_norm": 5.947027683258057, "learning_rate": 6.522068064165515e-07, "loss": 0.4106, "step": 7598 }, { "epoch": 0.8373553719008264, "grad_norm": 8.922974586486816, "learning_rate": 6.513436418506925e-07, "loss": 0.3402, "step": 7599 }, { "epoch": 0.837465564738292, "grad_norm": 13.584858894348145, "learning_rate": 6.504810090444392e-07, "loss": 0.4067, "step": 7600 }, { "epoch": 0.8375757575757575, "grad_norm": 8.478438377380371, "learning_rate": 6.496189081032755e-07, "loss": 0.4709, "step": 7601 }, { "epoch": 0.8376859504132231, "grad_norm": 6.562189102172852, "learning_rate": 6.48757339132623e-07, "loss": 0.3826, "step": 7602 }, { "epoch": 0.8377961432506887, "grad_norm": 5.741882801055908, "learning_rate": 6.478963022378327e-07, "loss": 0.4329, "step": 7603 }, { "epoch": 0.8379063360881542, "grad_norm": 6.273601055145264, "learning_rate": 6.470357975241937e-07, "loss": 0.3931, "step": 7604 }, { "epoch": 0.8380165289256198, "grad_norm": 5.691573143005371, "learning_rate": 6.461758250969313e-07, "loss": 0.3401, "step": 7605 }, { "epoch": 0.8381267217630854, "grad_norm": 5.817183971405029, "learning_rate": 6.453163850612026e-07, "loss": 0.3951, "step": 7606 }, { "epoch": 0.8382369146005509, "grad_norm": 6.3230156898498535, "learning_rate": 6.444574775221013e-07, "loss": 0.3436, "step": 7607 }, { "epoch": 0.8383471074380165, "grad_norm": 4.935028076171875, "learning_rate": 6.435991025846572e-07, "loss": 0.4212, "step": 7608 }, { "epoch": 0.838457300275482, "grad_norm": 10.081740379333496, "learning_rate": 6.427412603538314e-07, "loss": 0.4168, "step": 7609 }, { "epoch": 0.8385674931129476, "grad_norm": 5.62360954284668, "learning_rate": 6.418839509345231e-07, "loss": 0.4195, "step": 7610 }, { "epoch": 0.8386776859504133, "grad_norm": 6.882262229919434, "learning_rate": 6.41027174431566e-07, "loss": 0.386, "step": 7611 }, { "epoch": 0.8387878787878787, "grad_norm": 6.527801036834717, "learning_rate": 6.401709309497262e-07, "loss": 0.4452, "step": 7612 }, { "epoch": 0.8388980716253444, "grad_norm": 15.232413291931152, "learning_rate": 6.39315220593707e-07, "loss": 0.5078, "step": 7613 }, { "epoch": 0.83900826446281, "grad_norm": 7.460157871246338, "learning_rate": 6.384600434681476e-07, "loss": 0.4059, "step": 7614 }, { "epoch": 0.8391184573002755, "grad_norm": 11.996729850769043, "learning_rate": 6.376053996776172e-07, "loss": 0.4352, "step": 7615 }, { "epoch": 0.8392286501377411, "grad_norm": 5.507315635681152, "learning_rate": 6.367512893266243e-07, "loss": 0.391, "step": 7616 }, { "epoch": 0.8393388429752067, "grad_norm": 7.622223377227783, "learning_rate": 6.358977125196114e-07, "loss": 0.4086, "step": 7617 }, { "epoch": 0.8394490358126722, "grad_norm": 5.924062252044678, "learning_rate": 6.350446693609536e-07, "loss": 0.4142, "step": 7618 }, { "epoch": 0.8395592286501378, "grad_norm": 4.628000259399414, "learning_rate": 6.341921599549628e-07, "loss": 0.3287, "step": 7619 }, { "epoch": 0.8396694214876033, "grad_norm": 6.6283369064331055, "learning_rate": 6.333401844058862e-07, "loss": 0.3357, "step": 7620 }, { "epoch": 0.8397796143250689, "grad_norm": 6.074338912963867, "learning_rate": 6.324887428179022e-07, "loss": 0.3197, "step": 7621 }, { "epoch": 0.8398898071625345, "grad_norm": 5.312293529510498, "learning_rate": 6.316378352951275e-07, "loss": 0.3066, "step": 7622 }, { "epoch": 0.84, "grad_norm": 4.839400291442871, "learning_rate": 6.307874619416116e-07, "loss": 0.4367, "step": 7623 }, { "epoch": 0.8401101928374656, "grad_norm": 6.882898330688477, "learning_rate": 6.299376228613413e-07, "loss": 0.3652, "step": 7624 }, { "epoch": 0.8402203856749312, "grad_norm": 5.810752868652344, "learning_rate": 6.290883181582347e-07, "loss": 0.4103, "step": 7625 }, { "epoch": 0.8403305785123967, "grad_norm": 8.484546661376953, "learning_rate": 6.282395479361442e-07, "loss": 0.427, "step": 7626 }, { "epoch": 0.8404407713498623, "grad_norm": 7.992834091186523, "learning_rate": 6.273913122988618e-07, "loss": 0.3979, "step": 7627 }, { "epoch": 0.8405509641873278, "grad_norm": 4.217305660247803, "learning_rate": 6.265436113501094e-07, "loss": 0.3736, "step": 7628 }, { "epoch": 0.8406611570247934, "grad_norm": 5.164590358734131, "learning_rate": 6.256964451935427e-07, "loss": 0.3923, "step": 7629 }, { "epoch": 0.840771349862259, "grad_norm": 9.06873607635498, "learning_rate": 6.248498139327586e-07, "loss": 0.4647, "step": 7630 }, { "epoch": 0.8408815426997245, "grad_norm": 6.276998996734619, "learning_rate": 6.240037176712826e-07, "loss": 0.4112, "step": 7631 }, { "epoch": 0.8409917355371901, "grad_norm": 6.069005012512207, "learning_rate": 6.23158156512575e-07, "loss": 0.3844, "step": 7632 }, { "epoch": 0.8411019283746557, "grad_norm": 9.613306999206543, "learning_rate": 6.223131305600339e-07, "loss": 0.369, "step": 7633 }, { "epoch": 0.8412121212121212, "grad_norm": 5.842223167419434, "learning_rate": 6.2146863991699e-07, "loss": 0.4157, "step": 7634 }, { "epoch": 0.8413223140495868, "grad_norm": 4.464301586151123, "learning_rate": 6.206246846867081e-07, "loss": 0.4021, "step": 7635 }, { "epoch": 0.8414325068870523, "grad_norm": 11.78282356262207, "learning_rate": 6.197812649723878e-07, "loss": 0.3999, "step": 7636 }, { "epoch": 0.8415426997245179, "grad_norm": 6.033527374267578, "learning_rate": 6.189383808771649e-07, "loss": 0.3789, "step": 7637 }, { "epoch": 0.8416528925619835, "grad_norm": 5.16497278213501, "learning_rate": 6.180960325041085e-07, "loss": 0.4042, "step": 7638 }, { "epoch": 0.841763085399449, "grad_norm": 11.83187484741211, "learning_rate": 6.17254219956221e-07, "loss": 0.4211, "step": 7639 }, { "epoch": 0.8418732782369146, "grad_norm": 6.822471618652344, "learning_rate": 6.164129433364407e-07, "loss": 0.3986, "step": 7640 }, { "epoch": 0.8419834710743802, "grad_norm": 8.842621803283691, "learning_rate": 6.155722027476408e-07, "loss": 0.4551, "step": 7641 }, { "epoch": 0.8420936639118457, "grad_norm": 8.396306991577148, "learning_rate": 6.14731998292627e-07, "loss": 0.5187, "step": 7642 }, { "epoch": 0.8422038567493113, "grad_norm": 6.149595260620117, "learning_rate": 6.138923300741412e-07, "loss": 0.3448, "step": 7643 }, { "epoch": 0.8423140495867769, "grad_norm": 9.125263214111328, "learning_rate": 6.130531981948601e-07, "loss": 0.4397, "step": 7644 }, { "epoch": 0.8424242424242424, "grad_norm": 5.493200778961182, "learning_rate": 6.122146027573922e-07, "loss": 0.364, "step": 7645 }, { "epoch": 0.842534435261708, "grad_norm": 5.831855773925781, "learning_rate": 6.113765438642827e-07, "loss": 0.3772, "step": 7646 }, { "epoch": 0.8426446280991735, "grad_norm": 6.851219177246094, "learning_rate": 6.105390216180119e-07, "loss": 0.2942, "step": 7647 }, { "epoch": 0.8427548209366391, "grad_norm": 4.491634845733643, "learning_rate": 6.09702036120991e-07, "loss": 0.3646, "step": 7648 }, { "epoch": 0.8428650137741047, "grad_norm": 6.463755130767822, "learning_rate": 6.088655874755689e-07, "loss": 0.4057, "step": 7649 }, { "epoch": 0.8429752066115702, "grad_norm": 7.555401802062988, "learning_rate": 6.080296757840282e-07, "loss": 0.393, "step": 7650 }, { "epoch": 0.8430853994490358, "grad_norm": 5.10014533996582, "learning_rate": 6.071943011485837e-07, "loss": 0.3135, "step": 7651 }, { "epoch": 0.8431955922865014, "grad_norm": 6.354140758514404, "learning_rate": 6.063594636713877e-07, "loss": 0.4231, "step": 7652 }, { "epoch": 0.8433057851239669, "grad_norm": 5.660528659820557, "learning_rate": 6.05525163454525e-07, "loss": 0.3039, "step": 7653 }, { "epoch": 0.8434159779614325, "grad_norm": 6.830688953399658, "learning_rate": 6.046914006000137e-07, "loss": 0.4382, "step": 7654 }, { "epoch": 0.843526170798898, "grad_norm": 6.061576843261719, "learning_rate": 6.038581752098083e-07, "loss": 0.3374, "step": 7655 }, { "epoch": 0.8436363636363636, "grad_norm": 5.584253311157227, "learning_rate": 6.030254873857982e-07, "loss": 0.3632, "step": 7656 }, { "epoch": 0.8437465564738292, "grad_norm": 9.06962776184082, "learning_rate": 6.021933372298028e-07, "loss": 0.492, "step": 7657 }, { "epoch": 0.8438567493112947, "grad_norm": 8.123315811157227, "learning_rate": 6.013617248435815e-07, "loss": 0.4305, "step": 7658 }, { "epoch": 0.8439669421487603, "grad_norm": 7.666255950927734, "learning_rate": 6.005306503288222e-07, "loss": 0.4583, "step": 7659 }, { "epoch": 0.8440771349862259, "grad_norm": 11.121870994567871, "learning_rate": 5.997001137871505e-07, "loss": 0.5033, "step": 7660 }, { "epoch": 0.8441873278236914, "grad_norm": 5.500019550323486, "learning_rate": 5.988701153201276e-07, "loss": 0.3998, "step": 7661 }, { "epoch": 0.844297520661157, "grad_norm": 6.658538341522217, "learning_rate": 5.980406550292445e-07, "loss": 0.3212, "step": 7662 }, { "epoch": 0.8444077134986225, "grad_norm": 4.301170349121094, "learning_rate": 5.972117330159294e-07, "loss": 0.3336, "step": 7663 }, { "epoch": 0.8445179063360881, "grad_norm": 6.08860445022583, "learning_rate": 5.963833493815452e-07, "loss": 0.441, "step": 7664 }, { "epoch": 0.8446280991735537, "grad_norm": 5.657111644744873, "learning_rate": 5.955555042273858e-07, "loss": 0.3926, "step": 7665 }, { "epoch": 0.8447382920110192, "grad_norm": 10.227325439453125, "learning_rate": 5.94728197654682e-07, "loss": 0.4637, "step": 7666 }, { "epoch": 0.8448484848484848, "grad_norm": 6.199921131134033, "learning_rate": 5.939014297645995e-07, "loss": 0.3628, "step": 7667 }, { "epoch": 0.8449586776859505, "grad_norm": 5.123288631439209, "learning_rate": 5.930752006582341e-07, "loss": 0.408, "step": 7668 }, { "epoch": 0.845068870523416, "grad_norm": 8.856583595275879, "learning_rate": 5.922495104366194e-07, "loss": 0.4075, "step": 7669 }, { "epoch": 0.8451790633608816, "grad_norm": 5.787606716156006, "learning_rate": 5.914243592007229e-07, "loss": 0.4321, "step": 7670 }, { "epoch": 0.8452892561983472, "grad_norm": 10.660080909729004, "learning_rate": 5.90599747051443e-07, "loss": 0.4141, "step": 7671 }, { "epoch": 0.8453994490358127, "grad_norm": 7.40358304977417, "learning_rate": 5.89775674089616e-07, "loss": 0.3404, "step": 7672 }, { "epoch": 0.8455096418732783, "grad_norm": 6.4439005851745605, "learning_rate": 5.889521404160109e-07, "loss": 0.3808, "step": 7673 }, { "epoch": 0.8456198347107438, "grad_norm": 7.696054458618164, "learning_rate": 5.881291461313293e-07, "loss": 0.3735, "step": 7674 }, { "epoch": 0.8457300275482094, "grad_norm": 4.064366340637207, "learning_rate": 5.87306691336208e-07, "loss": 0.3548, "step": 7675 }, { "epoch": 0.845840220385675, "grad_norm": 5.888460159301758, "learning_rate": 5.864847761312204e-07, "loss": 0.3777, "step": 7676 }, { "epoch": 0.8459504132231405, "grad_norm": 7.175257682800293, "learning_rate": 5.856634006168677e-07, "loss": 0.4329, "step": 7677 }, { "epoch": 0.8460606060606061, "grad_norm": 4.705997943878174, "learning_rate": 5.848425648935913e-07, "loss": 0.3432, "step": 7678 }, { "epoch": 0.8461707988980717, "grad_norm": 5.8356242179870605, "learning_rate": 5.84022269061763e-07, "loss": 0.3829, "step": 7679 }, { "epoch": 0.8462809917355372, "grad_norm": 7.494500160217285, "learning_rate": 5.832025132216917e-07, "loss": 0.389, "step": 7680 }, { "epoch": 0.8463911845730028, "grad_norm": 7.917182922363281, "learning_rate": 5.823832974736154e-07, "loss": 0.3599, "step": 7681 }, { "epoch": 0.8465013774104683, "grad_norm": 4.986794471740723, "learning_rate": 5.815646219177102e-07, "loss": 0.3944, "step": 7682 }, { "epoch": 0.8466115702479339, "grad_norm": 6.412942409515381, "learning_rate": 5.807464866540857e-07, "loss": 0.3049, "step": 7683 }, { "epoch": 0.8467217630853995, "grad_norm": 7.922013282775879, "learning_rate": 5.799288917827838e-07, "loss": 0.4079, "step": 7684 }, { "epoch": 0.846831955922865, "grad_norm": 8.984867095947266, "learning_rate": 5.791118374037796e-07, "loss": 0.4134, "step": 7685 }, { "epoch": 0.8469421487603306, "grad_norm": 4.974648952484131, "learning_rate": 5.78295323616987e-07, "loss": 0.4181, "step": 7686 }, { "epoch": 0.8470523415977962, "grad_norm": 9.432533264160156, "learning_rate": 5.774793505222481e-07, "loss": 0.4887, "step": 7687 }, { "epoch": 0.8471625344352617, "grad_norm": 8.944128036499023, "learning_rate": 5.766639182193395e-07, "loss": 0.4455, "step": 7688 }, { "epoch": 0.8472727272727273, "grad_norm": 6.501738548278809, "learning_rate": 5.758490268079781e-07, "loss": 0.3665, "step": 7689 }, { "epoch": 0.8473829201101928, "grad_norm": 4.6713385581970215, "learning_rate": 5.750346763878073e-07, "loss": 0.3188, "step": 7690 }, { "epoch": 0.8474931129476584, "grad_norm": 3.850214958190918, "learning_rate": 5.742208670584054e-07, "loss": 0.3162, "step": 7691 }, { "epoch": 0.847603305785124, "grad_norm": 10.417648315429688, "learning_rate": 5.734075989192884e-07, "loss": 0.4206, "step": 7692 }, { "epoch": 0.8477134986225895, "grad_norm": 6.745625972747803, "learning_rate": 5.725948720699026e-07, "loss": 0.4001, "step": 7693 }, { "epoch": 0.8478236914600551, "grad_norm": 6.785290241241455, "learning_rate": 5.71782686609631e-07, "loss": 0.3319, "step": 7694 }, { "epoch": 0.8479338842975207, "grad_norm": 7.850511074066162, "learning_rate": 5.709710426377868e-07, "loss": 0.4087, "step": 7695 }, { "epoch": 0.8480440771349862, "grad_norm": 8.589800834655762, "learning_rate": 5.701599402536196e-07, "loss": 0.3473, "step": 7696 }, { "epoch": 0.8481542699724518, "grad_norm": 8.957038879394531, "learning_rate": 5.693493795563132e-07, "loss": 0.4608, "step": 7697 }, { "epoch": 0.8482644628099174, "grad_norm": 12.42385482788086, "learning_rate": 5.685393606449824e-07, "loss": 0.5595, "step": 7698 }, { "epoch": 0.8483746556473829, "grad_norm": 6.1786298751831055, "learning_rate": 5.677298836186779e-07, "loss": 0.3342, "step": 7699 }, { "epoch": 0.8484848484848485, "grad_norm": 7.499631881713867, "learning_rate": 5.66920948576385e-07, "loss": 0.3894, "step": 7700 }, { "epoch": 0.848595041322314, "grad_norm": 5.315148830413818, "learning_rate": 5.661125556170188e-07, "loss": 0.37, "step": 7701 }, { "epoch": 0.8487052341597796, "grad_norm": 7.056180477142334, "learning_rate": 5.653047048394328e-07, "loss": 0.3748, "step": 7702 }, { "epoch": 0.8488154269972452, "grad_norm": 6.130964279174805, "learning_rate": 5.644973963424122e-07, "loss": 0.3858, "step": 7703 }, { "epoch": 0.8489256198347107, "grad_norm": 9.253560066223145, "learning_rate": 5.636906302246736e-07, "loss": 0.3755, "step": 7704 }, { "epoch": 0.8490358126721763, "grad_norm": 6.5167927742004395, "learning_rate": 5.628844065848715e-07, "loss": 0.4288, "step": 7705 }, { "epoch": 0.8491460055096419, "grad_norm": 5.824051856994629, "learning_rate": 5.620787255215921e-07, "loss": 0.3397, "step": 7706 }, { "epoch": 0.8492561983471074, "grad_norm": 4.285006523132324, "learning_rate": 5.612735871333535e-07, "loss": 0.3016, "step": 7707 }, { "epoch": 0.849366391184573, "grad_norm": 9.04549789428711, "learning_rate": 5.604689915186101e-07, "loss": 0.4501, "step": 7708 }, { "epoch": 0.8494765840220385, "grad_norm": 6.156450271606445, "learning_rate": 5.596649387757502e-07, "loss": 0.4291, "step": 7709 }, { "epoch": 0.8495867768595041, "grad_norm": 5.915395259857178, "learning_rate": 5.588614290030919e-07, "loss": 0.3578, "step": 7710 }, { "epoch": 0.8496969696969697, "grad_norm": 8.087713241577148, "learning_rate": 5.580584622988905e-07, "loss": 0.4435, "step": 7711 }, { "epoch": 0.8498071625344352, "grad_norm": 8.039299011230469, "learning_rate": 5.572560387613352e-07, "loss": 0.3567, "step": 7712 }, { "epoch": 0.8499173553719008, "grad_norm": 7.934005260467529, "learning_rate": 5.564541584885458e-07, "loss": 0.3357, "step": 7713 }, { "epoch": 0.8500275482093664, "grad_norm": 4.658009052276611, "learning_rate": 5.556528215785778e-07, "loss": 0.3597, "step": 7714 }, { "epoch": 0.8501377410468319, "grad_norm": 4.118621826171875, "learning_rate": 5.548520281294206e-07, "loss": 0.3772, "step": 7715 }, { "epoch": 0.8502479338842975, "grad_norm": 7.155452251434326, "learning_rate": 5.540517782389943e-07, "loss": 0.4349, "step": 7716 }, { "epoch": 0.850358126721763, "grad_norm": 9.435413360595703, "learning_rate": 5.532520720051571e-07, "loss": 0.4621, "step": 7717 }, { "epoch": 0.8504683195592286, "grad_norm": 5.139011859893799, "learning_rate": 5.524529095256958e-07, "loss": 0.4654, "step": 7718 }, { "epoch": 0.8505785123966942, "grad_norm": 3.8253042697906494, "learning_rate": 5.516542908983341e-07, "loss": 0.3474, "step": 7719 }, { "epoch": 0.8506887052341597, "grad_norm": 4.634027004241943, "learning_rate": 5.508562162207293e-07, "loss": 0.416, "step": 7720 }, { "epoch": 0.8507988980716253, "grad_norm": 9.43326473236084, "learning_rate": 5.500586855904677e-07, "loss": 0.3912, "step": 7721 }, { "epoch": 0.850909090909091, "grad_norm": 9.88778018951416, "learning_rate": 5.49261699105077e-07, "loss": 0.4034, "step": 7722 }, { "epoch": 0.8510192837465564, "grad_norm": 7.362020969390869, "learning_rate": 5.484652568620113e-07, "loss": 0.3774, "step": 7723 }, { "epoch": 0.851129476584022, "grad_norm": 6.79647159576416, "learning_rate": 5.476693589586596e-07, "loss": 0.392, "step": 7724 }, { "epoch": 0.8512396694214877, "grad_norm": 10.27442455291748, "learning_rate": 5.468740054923472e-07, "loss": 0.4702, "step": 7725 }, { "epoch": 0.8513498622589531, "grad_norm": 4.888890266418457, "learning_rate": 5.460791965603307e-07, "loss": 0.3662, "step": 7726 }, { "epoch": 0.8514600550964188, "grad_norm": 5.989506721496582, "learning_rate": 5.452849322597997e-07, "loss": 0.3734, "step": 7727 }, { "epoch": 0.8515702479338843, "grad_norm": 10.52830696105957, "learning_rate": 5.444912126878776e-07, "loss": 0.429, "step": 7728 }, { "epoch": 0.8516804407713499, "grad_norm": 5.60654354095459, "learning_rate": 5.43698037941624e-07, "loss": 0.4441, "step": 7729 }, { "epoch": 0.8517906336088155, "grad_norm": 5.992086410522461, "learning_rate": 5.429054081180263e-07, "loss": 0.4182, "step": 7730 }, { "epoch": 0.851900826446281, "grad_norm": 12.801902770996094, "learning_rate": 5.421133233140096e-07, "loss": 0.5226, "step": 7731 }, { "epoch": 0.8520110192837466, "grad_norm": 9.182729721069336, "learning_rate": 5.413217836264317e-07, "loss": 0.4883, "step": 7732 }, { "epoch": 0.8521212121212122, "grad_norm": 14.285962104797363, "learning_rate": 5.405307891520823e-07, "loss": 0.3842, "step": 7733 }, { "epoch": 0.8522314049586777, "grad_norm": 12.276472091674805, "learning_rate": 5.397403399876855e-07, "loss": 0.3837, "step": 7734 }, { "epoch": 0.8523415977961433, "grad_norm": 5.377422332763672, "learning_rate": 5.389504362298987e-07, "loss": 0.3916, "step": 7735 }, { "epoch": 0.8524517906336088, "grad_norm": 6.361353397369385, "learning_rate": 5.381610779753127e-07, "loss": 0.4305, "step": 7736 }, { "epoch": 0.8525619834710744, "grad_norm": 9.844764709472656, "learning_rate": 5.373722653204505e-07, "loss": 0.3797, "step": 7737 }, { "epoch": 0.85267217630854, "grad_norm": 6.1600117683410645, "learning_rate": 5.365839983617693e-07, "loss": 0.3293, "step": 7738 }, { "epoch": 0.8527823691460055, "grad_norm": 6.187741279602051, "learning_rate": 5.35796277195661e-07, "loss": 0.4407, "step": 7739 }, { "epoch": 0.8528925619834711, "grad_norm": 16.504772186279297, "learning_rate": 5.350091019184467e-07, "loss": 0.6067, "step": 7740 }, { "epoch": 0.8530027548209367, "grad_norm": 9.01555061340332, "learning_rate": 5.342224726263845e-07, "loss": 0.4012, "step": 7741 }, { "epoch": 0.8531129476584022, "grad_norm": 6.932723522186279, "learning_rate": 5.334363894156658e-07, "loss": 0.3988, "step": 7742 }, { "epoch": 0.8532231404958678, "grad_norm": 8.647543907165527, "learning_rate": 5.32650852382412e-07, "loss": 0.4217, "step": 7743 }, { "epoch": 0.8533333333333334, "grad_norm": 7.07531213760376, "learning_rate": 5.318658616226791e-07, "loss": 0.4471, "step": 7744 }, { "epoch": 0.8534435261707989, "grad_norm": 6.896085739135742, "learning_rate": 5.310814172324596e-07, "loss": 0.3559, "step": 7745 }, { "epoch": 0.8535537190082645, "grad_norm": 11.441871643066406, "learning_rate": 5.302975193076748e-07, "loss": 0.5203, "step": 7746 }, { "epoch": 0.85366391184573, "grad_norm": 4.805006980895996, "learning_rate": 5.295141679441784e-07, "loss": 0.3604, "step": 7747 }, { "epoch": 0.8537741046831956, "grad_norm": 4.587762832641602, "learning_rate": 5.28731363237765e-07, "loss": 0.3518, "step": 7748 }, { "epoch": 0.8538842975206612, "grad_norm": 8.714046478271484, "learning_rate": 5.279491052841523e-07, "loss": 0.3746, "step": 7749 }, { "epoch": 0.8539944903581267, "grad_norm": 8.611539840698242, "learning_rate": 5.271673941789996e-07, "loss": 0.3367, "step": 7750 }, { "epoch": 0.8541046831955923, "grad_norm": 5.320642948150635, "learning_rate": 5.263862300178917e-07, "loss": 0.4176, "step": 7751 }, { "epoch": 0.8542148760330579, "grad_norm": 7.6433539390563965, "learning_rate": 5.256056128963533e-07, "loss": 0.3874, "step": 7752 }, { "epoch": 0.8543250688705234, "grad_norm": 5.494566440582275, "learning_rate": 5.248255429098387e-07, "loss": 0.3558, "step": 7753 }, { "epoch": 0.854435261707989, "grad_norm": 8.333560943603516, "learning_rate": 5.24046020153735e-07, "loss": 0.4335, "step": 7754 }, { "epoch": 0.8545454545454545, "grad_norm": 5.336777210235596, "learning_rate": 5.232670447233639e-07, "loss": 0.3522, "step": 7755 }, { "epoch": 0.8546556473829201, "grad_norm": 4.345096111297607, "learning_rate": 5.224886167139803e-07, "loss": 0.3992, "step": 7756 }, { "epoch": 0.8547658402203857, "grad_norm": 6.853146553039551, "learning_rate": 5.217107362207701e-07, "loss": 0.4191, "step": 7757 }, { "epoch": 0.8548760330578512, "grad_norm": 5.39180850982666, "learning_rate": 5.209334033388542e-07, "loss": 0.4003, "step": 7758 }, { "epoch": 0.8549862258953168, "grad_norm": 7.58671760559082, "learning_rate": 5.201566181632872e-07, "loss": 0.3832, "step": 7759 }, { "epoch": 0.8550964187327824, "grad_norm": 5.2194976806640625, "learning_rate": 5.193803807890529e-07, "loss": 0.3714, "step": 7760 }, { "epoch": 0.8552066115702479, "grad_norm": 9.42974853515625, "learning_rate": 5.186046913110721e-07, "loss": 0.3761, "step": 7761 }, { "epoch": 0.8553168044077135, "grad_norm": 5.1489081382751465, "learning_rate": 5.178295498241976e-07, "loss": 0.3207, "step": 7762 }, { "epoch": 0.855426997245179, "grad_norm": 6.776419639587402, "learning_rate": 5.170549564232135e-07, "loss": 0.3667, "step": 7763 }, { "epoch": 0.8555371900826446, "grad_norm": 13.21877384185791, "learning_rate": 5.162809112028388e-07, "loss": 0.4848, "step": 7764 }, { "epoch": 0.8556473829201102, "grad_norm": 4.820184230804443, "learning_rate": 5.15507414257726e-07, "loss": 0.358, "step": 7765 }, { "epoch": 0.8557575757575757, "grad_norm": 5.9227705001831055, "learning_rate": 5.147344656824566e-07, "loss": 0.4157, "step": 7766 }, { "epoch": 0.8558677685950413, "grad_norm": 7.291881084442139, "learning_rate": 5.139620655715499e-07, "loss": 0.364, "step": 7767 }, { "epoch": 0.8559779614325069, "grad_norm": 5.570192337036133, "learning_rate": 5.131902140194561e-07, "loss": 0.3772, "step": 7768 }, { "epoch": 0.8560881542699724, "grad_norm": 6.814751148223877, "learning_rate": 5.124189111205574e-07, "loss": 0.4392, "step": 7769 }, { "epoch": 0.856198347107438, "grad_norm": 17.650371551513672, "learning_rate": 5.116481569691695e-07, "loss": 0.54, "step": 7770 }, { "epoch": 0.8563085399449036, "grad_norm": 4.790437698364258, "learning_rate": 5.108779516595424e-07, "loss": 0.375, "step": 7771 }, { "epoch": 0.8564187327823691, "grad_norm": 7.293126106262207, "learning_rate": 5.101082952858571e-07, "loss": 0.4796, "step": 7772 }, { "epoch": 0.8565289256198347, "grad_norm": 7.757967472076416, "learning_rate": 5.093391879422277e-07, "loss": 0.403, "step": 7773 }, { "epoch": 0.8566391184573002, "grad_norm": 4.683879852294922, "learning_rate": 5.085706297227039e-07, "loss": 0.363, "step": 7774 }, { "epoch": 0.8567493112947658, "grad_norm": 5.255395889282227, "learning_rate": 5.078026207212633e-07, "loss": 0.3157, "step": 7775 }, { "epoch": 0.8568595041322314, "grad_norm": 6.1586384773254395, "learning_rate": 5.070351610318208e-07, "loss": 0.4242, "step": 7776 }, { "epoch": 0.8569696969696969, "grad_norm": 8.594871520996094, "learning_rate": 5.062682507482203e-07, "loss": 0.4221, "step": 7777 }, { "epoch": 0.8570798898071625, "grad_norm": 5.6082587242126465, "learning_rate": 5.055018899642445e-07, "loss": 0.3667, "step": 7778 }, { "epoch": 0.8571900826446281, "grad_norm": 7.998793125152588, "learning_rate": 5.047360787736027e-07, "loss": 0.3374, "step": 7779 }, { "epoch": 0.8573002754820936, "grad_norm": 6.855576038360596, "learning_rate": 5.03970817269937e-07, "loss": 0.4028, "step": 7780 }, { "epoch": 0.8574104683195592, "grad_norm": 7.873417854309082, "learning_rate": 5.03206105546829e-07, "loss": 0.4002, "step": 7781 }, { "epoch": 0.8575206611570247, "grad_norm": 6.745034217834473, "learning_rate": 5.024419436977873e-07, "loss": 0.3676, "step": 7782 }, { "epoch": 0.8576308539944903, "grad_norm": 7.215935707092285, "learning_rate": 5.016783318162527e-07, "loss": 0.4505, "step": 7783 }, { "epoch": 0.857741046831956, "grad_norm": 13.45441722869873, "learning_rate": 5.009152699956022e-07, "loss": 0.4353, "step": 7784 }, { "epoch": 0.8578512396694215, "grad_norm": 11.69281005859375, "learning_rate": 5.001527583291443e-07, "loss": 0.4109, "step": 7785 }, { "epoch": 0.8579614325068871, "grad_norm": 6.876806735992432, "learning_rate": 4.993907969101191e-07, "loss": 0.334, "step": 7786 }, { "epoch": 0.8580716253443527, "grad_norm": 6.825343132019043, "learning_rate": 4.98629385831701e-07, "loss": 0.3764, "step": 7787 }, { "epoch": 0.8581818181818182, "grad_norm": 6.715329647064209, "learning_rate": 4.978685251869964e-07, "loss": 0.362, "step": 7788 }, { "epoch": 0.8582920110192838, "grad_norm": 5.902563095092773, "learning_rate": 4.971082150690437e-07, "loss": 0.4258, "step": 7789 }, { "epoch": 0.8584022038567493, "grad_norm": 5.445431232452393, "learning_rate": 4.963484555708148e-07, "loss": 0.3971, "step": 7790 }, { "epoch": 0.8585123966942149, "grad_norm": 9.014820098876953, "learning_rate": 4.955892467852142e-07, "loss": 0.4358, "step": 7791 }, { "epoch": 0.8586225895316805, "grad_norm": 10.790130615234375, "learning_rate": 4.948305888050803e-07, "loss": 0.3901, "step": 7792 }, { "epoch": 0.858732782369146, "grad_norm": 6.64311408996582, "learning_rate": 4.940724817231807e-07, "loss": 0.3852, "step": 7793 }, { "epoch": 0.8588429752066116, "grad_norm": 6.426881313323975, "learning_rate": 4.93314925632219e-07, "loss": 0.4101, "step": 7794 }, { "epoch": 0.8589531680440772, "grad_norm": 6.748328685760498, "learning_rate": 4.925579206248305e-07, "loss": 0.3288, "step": 7795 }, { "epoch": 0.8590633608815427, "grad_norm": 5.245304584503174, "learning_rate": 4.918014667935811e-07, "loss": 0.3531, "step": 7796 }, { "epoch": 0.8591735537190083, "grad_norm": 5.381857872009277, "learning_rate": 4.910455642309725e-07, "loss": 0.4286, "step": 7797 }, { "epoch": 0.8592837465564739, "grad_norm": 9.686392784118652, "learning_rate": 4.90290213029438e-07, "loss": 0.4856, "step": 7798 }, { "epoch": 0.8593939393939394, "grad_norm": 4.616172790527344, "learning_rate": 4.895354132813418e-07, "loss": 0.4519, "step": 7799 }, { "epoch": 0.859504132231405, "grad_norm": 6.30689001083374, "learning_rate": 4.887811650789809e-07, "loss": 0.3234, "step": 7800 }, { "epoch": 0.8596143250688705, "grad_norm": 7.747769832611084, "learning_rate": 4.880274685145886e-07, "loss": 0.39, "step": 7801 }, { "epoch": 0.8597245179063361, "grad_norm": 5.953192710876465, "learning_rate": 4.872743236803263e-07, "loss": 0.3754, "step": 7802 }, { "epoch": 0.8598347107438017, "grad_norm": 6.571876525878906, "learning_rate": 4.865217306682874e-07, "loss": 0.3665, "step": 7803 }, { "epoch": 0.8599449035812672, "grad_norm": 6.302215099334717, "learning_rate": 4.857696895705045e-07, "loss": 0.3479, "step": 7804 }, { "epoch": 0.8600550964187328, "grad_norm": 5.455636501312256, "learning_rate": 4.85018200478935e-07, "loss": 0.4147, "step": 7805 }, { "epoch": 0.8601652892561984, "grad_norm": 7.550479888916016, "learning_rate": 4.842672634854728e-07, "loss": 0.4079, "step": 7806 }, { "epoch": 0.8602754820936639, "grad_norm": 4.89521598815918, "learning_rate": 4.835168786819445e-07, "loss": 0.3678, "step": 7807 }, { "epoch": 0.8603856749311295, "grad_norm": 8.000993728637695, "learning_rate": 4.827670461601064e-07, "loss": 0.4827, "step": 7808 }, { "epoch": 0.860495867768595, "grad_norm": 4.615668296813965, "learning_rate": 4.820177660116515e-07, "loss": 0.386, "step": 7809 }, { "epoch": 0.8606060606060606, "grad_norm": 8.023161888122559, "learning_rate": 4.812690383281998e-07, "loss": 0.4274, "step": 7810 }, { "epoch": 0.8607162534435262, "grad_norm": 6.460952281951904, "learning_rate": 4.80520863201308e-07, "loss": 0.362, "step": 7811 }, { "epoch": 0.8608264462809917, "grad_norm": 4.9097676277160645, "learning_rate": 4.797732407224654e-07, "loss": 0.3567, "step": 7812 }, { "epoch": 0.8609366391184573, "grad_norm": 9.771832466125488, "learning_rate": 4.790261709830901e-07, "loss": 0.3936, "step": 7813 }, { "epoch": 0.8610468319559229, "grad_norm": 10.23644733428955, "learning_rate": 4.782796540745354e-07, "loss": 0.4066, "step": 7814 }, { "epoch": 0.8611570247933884, "grad_norm": 7.400659561157227, "learning_rate": 4.775336900880884e-07, "loss": 0.4473, "step": 7815 }, { "epoch": 0.861267217630854, "grad_norm": 5.771268367767334, "learning_rate": 4.7678827911496304e-07, "loss": 0.4521, "step": 7816 }, { "epoch": 0.8613774104683195, "grad_norm": 4.279479026794434, "learning_rate": 4.7604342124631166e-07, "loss": 0.342, "step": 7817 }, { "epoch": 0.8614876033057851, "grad_norm": 7.7825541496276855, "learning_rate": 4.752991165732168e-07, "loss": 0.4103, "step": 7818 }, { "epoch": 0.8615977961432507, "grad_norm": 8.169404983520508, "learning_rate": 4.745553651866913e-07, "loss": 0.323, "step": 7819 }, { "epoch": 0.8617079889807162, "grad_norm": 5.612525939941406, "learning_rate": 4.7381216717768295e-07, "loss": 0.331, "step": 7820 }, { "epoch": 0.8618181818181818, "grad_norm": 4.310781955718994, "learning_rate": 4.730695226370724e-07, "loss": 0.2813, "step": 7821 }, { "epoch": 0.8619283746556474, "grad_norm": 10.757055282592773, "learning_rate": 4.723274316556681e-07, "loss": 0.4415, "step": 7822 }, { "epoch": 0.8620385674931129, "grad_norm": 5.158672332763672, "learning_rate": 4.715858943242163e-07, "loss": 0.3846, "step": 7823 }, { "epoch": 0.8621487603305785, "grad_norm": 5.668391227722168, "learning_rate": 4.708449107333929e-07, "loss": 0.322, "step": 7824 }, { "epoch": 0.8622589531680441, "grad_norm": 8.147780418395996, "learning_rate": 4.701044809738059e-07, "loss": 0.3339, "step": 7825 }, { "epoch": 0.8623691460055096, "grad_norm": 5.757749080657959, "learning_rate": 4.693646051359957e-07, "loss": 0.3206, "step": 7826 }, { "epoch": 0.8624793388429752, "grad_norm": 7.330891132354736, "learning_rate": 4.6862528331043654e-07, "loss": 0.4199, "step": 7827 }, { "epoch": 0.8625895316804407, "grad_norm": 5.04267692565918, "learning_rate": 4.6788651558753286e-07, "loss": 0.3396, "step": 7828 }, { "epoch": 0.8626997245179063, "grad_norm": 6.792194366455078, "learning_rate": 4.671483020576217e-07, "loss": 0.3301, "step": 7829 }, { "epoch": 0.8628099173553719, "grad_norm": 8.746522903442383, "learning_rate": 4.664106428109744e-07, "loss": 0.3467, "step": 7830 }, { "epoch": 0.8629201101928374, "grad_norm": 6.639345645904541, "learning_rate": 4.6567353793779134e-07, "loss": 0.3339, "step": 7831 }, { "epoch": 0.863030303030303, "grad_norm": 5.252315521240234, "learning_rate": 4.649369875282084e-07, "loss": 0.4206, "step": 7832 }, { "epoch": 0.8631404958677686, "grad_norm": 4.119190692901611, "learning_rate": 4.642009916722884e-07, "loss": 0.3193, "step": 7833 }, { "epoch": 0.8632506887052341, "grad_norm": 6.538342475891113, "learning_rate": 4.6346555046003493e-07, "loss": 0.411, "step": 7834 }, { "epoch": 0.8633608815426997, "grad_norm": 7.538941860198975, "learning_rate": 4.627306639813761e-07, "loss": 0.3699, "step": 7835 }, { "epoch": 0.8634710743801652, "grad_norm": 5.973537445068359, "learning_rate": 4.619963323261728e-07, "loss": 0.4105, "step": 7836 }, { "epoch": 0.8635812672176308, "grad_norm": 6.2615437507629395, "learning_rate": 4.612625555842243e-07, "loss": 0.3151, "step": 7837 }, { "epoch": 0.8636914600550964, "grad_norm": 8.059074401855469, "learning_rate": 4.605293338452554e-07, "loss": 0.458, "step": 7838 }, { "epoch": 0.8638016528925619, "grad_norm": 5.59768533706665, "learning_rate": 4.597966671989246e-07, "loss": 0.3664, "step": 7839 }, { "epoch": 0.8639118457300275, "grad_norm": 7.597044944763184, "learning_rate": 4.590645557348261e-07, "loss": 0.3609, "step": 7840 }, { "epoch": 0.8640220385674932, "grad_norm": 6.077411651611328, "learning_rate": 4.5833299954248233e-07, "loss": 0.3579, "step": 7841 }, { "epoch": 0.8641322314049587, "grad_norm": 5.302631855010986, "learning_rate": 4.5760199871134723e-07, "loss": 0.3454, "step": 7842 }, { "epoch": 0.8642424242424243, "grad_norm": 10.610418319702148, "learning_rate": 4.568715533308099e-07, "loss": 0.4745, "step": 7843 }, { "epoch": 0.8643526170798899, "grad_norm": 6.243701457977295, "learning_rate": 4.5614166349019163e-07, "loss": 0.422, "step": 7844 }, { "epoch": 0.8644628099173554, "grad_norm": 13.345518112182617, "learning_rate": 4.5541232927874155e-07, "loss": 0.3868, "step": 7845 }, { "epoch": 0.864573002754821, "grad_norm": 4.608929634094238, "learning_rate": 4.546835507856456e-07, "loss": 0.3389, "step": 7846 }, { "epoch": 0.8646831955922865, "grad_norm": 6.194828033447266, "learning_rate": 4.539553281000192e-07, "loss": 0.4143, "step": 7847 }, { "epoch": 0.8647933884297521, "grad_norm": 8.774277687072754, "learning_rate": 4.53227661310911e-07, "loss": 0.4575, "step": 7848 }, { "epoch": 0.8649035812672177, "grad_norm": 6.736591815948486, "learning_rate": 4.525005505072999e-07, "loss": 0.351, "step": 7849 }, { "epoch": 0.8650137741046832, "grad_norm": 7.460677623748779, "learning_rate": 4.5177399577809867e-07, "loss": 0.3125, "step": 7850 }, { "epoch": 0.8651239669421488, "grad_norm": 5.065041542053223, "learning_rate": 4.510479972121523e-07, "loss": 0.39, "step": 7851 }, { "epoch": 0.8652341597796144, "grad_norm": 9.561484336853027, "learning_rate": 4.5032255489823484e-07, "loss": 0.4045, "step": 7852 }, { "epoch": 0.8653443526170799, "grad_norm": 6.36940336227417, "learning_rate": 4.4959766892505587e-07, "loss": 0.3412, "step": 7853 }, { "epoch": 0.8654545454545455, "grad_norm": 4.763881206512451, "learning_rate": 4.488733393812555e-07, "loss": 0.4107, "step": 7854 }, { "epoch": 0.865564738292011, "grad_norm": 7.719789028167725, "learning_rate": 4.4814956635540477e-07, "loss": 0.3691, "step": 7855 }, { "epoch": 0.8656749311294766, "grad_norm": 7.469186305999756, "learning_rate": 4.474263499360082e-07, "loss": 0.4382, "step": 7856 }, { "epoch": 0.8657851239669422, "grad_norm": 4.7013840675354, "learning_rate": 4.4670369021150237e-07, "loss": 0.4005, "step": 7857 }, { "epoch": 0.8658953168044077, "grad_norm": 7.680978775024414, "learning_rate": 4.459815872702544e-07, "loss": 0.3761, "step": 7858 }, { "epoch": 0.8660055096418733, "grad_norm": 7.623222351074219, "learning_rate": 4.45260041200562e-07, "loss": 0.4246, "step": 7859 }, { "epoch": 0.8661157024793389, "grad_norm": 7.7285895347595215, "learning_rate": 4.445390520906606e-07, "loss": 0.4048, "step": 7860 }, { "epoch": 0.8662258953168044, "grad_norm": 4.648258686065674, "learning_rate": 4.4381862002871144e-07, "loss": 0.3657, "step": 7861 }, { "epoch": 0.86633608815427, "grad_norm": 6.883289813995361, "learning_rate": 4.4309874510280957e-07, "loss": 0.4015, "step": 7862 }, { "epoch": 0.8664462809917355, "grad_norm": 9.443902969360352, "learning_rate": 4.423794274009846e-07, "loss": 0.3514, "step": 7863 }, { "epoch": 0.8665564738292011, "grad_norm": 7.527348518371582, "learning_rate": 4.4166066701119336e-07, "loss": 0.3888, "step": 7864 }, { "epoch": 0.8666666666666667, "grad_norm": 7.373073101043701, "learning_rate": 4.4094246402132836e-07, "loss": 0.4161, "step": 7865 }, { "epoch": 0.8667768595041322, "grad_norm": 4.945640563964844, "learning_rate": 4.402248185192104e-07, "loss": 0.399, "step": 7866 }, { "epoch": 0.8668870523415978, "grad_norm": 6.406610012054443, "learning_rate": 4.3950773059259597e-07, "loss": 0.4355, "step": 7867 }, { "epoch": 0.8669972451790634, "grad_norm": 5.226853370666504, "learning_rate": 4.3879120032917224e-07, "loss": 0.3961, "step": 7868 }, { "epoch": 0.8671074380165289, "grad_norm": 8.117792129516602, "learning_rate": 4.3807522781655454e-07, "loss": 0.4156, "step": 7869 }, { "epoch": 0.8672176308539945, "grad_norm": 16.762847900390625, "learning_rate": 4.373598131422957e-07, "loss": 0.511, "step": 7870 }, { "epoch": 0.8673278236914601, "grad_norm": 16.404571533203125, "learning_rate": 4.3664495639387683e-07, "loss": 0.5058, "step": 7871 }, { "epoch": 0.8674380165289256, "grad_norm": 10.145781517028809, "learning_rate": 4.359306576587108e-07, "loss": 0.5076, "step": 7872 }, { "epoch": 0.8675482093663912, "grad_norm": 9.53577709197998, "learning_rate": 4.352169170241438e-07, "loss": 0.3837, "step": 7873 }, { "epoch": 0.8676584022038567, "grad_norm": 5.142624378204346, "learning_rate": 4.345037345774533e-07, "loss": 0.3095, "step": 7874 }, { "epoch": 0.8677685950413223, "grad_norm": 5.64030122756958, "learning_rate": 4.3379111040584734e-07, "loss": 0.433, "step": 7875 }, { "epoch": 0.8678787878787879, "grad_norm": 3.860023260116577, "learning_rate": 4.3307904459646666e-07, "loss": 0.3126, "step": 7876 }, { "epoch": 0.8679889807162534, "grad_norm": 5.460316181182861, "learning_rate": 4.3236753723638446e-07, "loss": 0.3417, "step": 7877 }, { "epoch": 0.868099173553719, "grad_norm": 5.342752456665039, "learning_rate": 4.3165658841260394e-07, "loss": 0.3704, "step": 7878 }, { "epoch": 0.8682093663911846, "grad_norm": 12.695240020751953, "learning_rate": 4.3094619821206164e-07, "loss": 0.4667, "step": 7879 }, { "epoch": 0.8683195592286501, "grad_norm": 6.243031978607178, "learning_rate": 4.302363667216253e-07, "loss": 0.4304, "step": 7880 }, { "epoch": 0.8684297520661157, "grad_norm": 8.409862518310547, "learning_rate": 4.295270940280921e-07, "loss": 0.4571, "step": 7881 }, { "epoch": 0.8685399449035812, "grad_norm": 8.675850868225098, "learning_rate": 4.2881838021819447e-07, "loss": 0.4461, "step": 7882 }, { "epoch": 0.8686501377410468, "grad_norm": 7.134649276733398, "learning_rate": 4.281102253785957e-07, "loss": 0.3419, "step": 7883 }, { "epoch": 0.8687603305785124, "grad_norm": 8.466950416564941, "learning_rate": 4.2740262959588777e-07, "loss": 0.4195, "step": 7884 }, { "epoch": 0.8688705234159779, "grad_norm": 6.0013227462768555, "learning_rate": 4.26695592956598e-07, "loss": 0.3899, "step": 7885 }, { "epoch": 0.8689807162534435, "grad_norm": 3.945721387863159, "learning_rate": 4.259891155471835e-07, "loss": 0.392, "step": 7886 }, { "epoch": 0.8690909090909091, "grad_norm": 6.7099175453186035, "learning_rate": 4.252831974540328e-07, "loss": 0.3565, "step": 7887 }, { "epoch": 0.8692011019283746, "grad_norm": 9.24994945526123, "learning_rate": 4.245778387634669e-07, "loss": 0.4214, "step": 7888 }, { "epoch": 0.8693112947658402, "grad_norm": 8.128684997558594, "learning_rate": 4.2387303956173744e-07, "loss": 0.3915, "step": 7889 }, { "epoch": 0.8694214876033057, "grad_norm": 6.961113452911377, "learning_rate": 4.2316879993503033e-07, "loss": 0.4557, "step": 7890 }, { "epoch": 0.8695316804407713, "grad_norm": 7.837036609649658, "learning_rate": 4.2246511996945904e-07, "loss": 0.4634, "step": 7891 }, { "epoch": 0.8696418732782369, "grad_norm": 5.132203102111816, "learning_rate": 4.2176199975106913e-07, "loss": 0.3386, "step": 7892 }, { "epoch": 0.8697520661157024, "grad_norm": 7.128950595855713, "learning_rate": 4.210594393658424e-07, "loss": 0.4544, "step": 7893 }, { "epoch": 0.869862258953168, "grad_norm": 8.016288757324219, "learning_rate": 4.203574388996873e-07, "loss": 0.3735, "step": 7894 }, { "epoch": 0.8699724517906336, "grad_norm": 4.295894622802734, "learning_rate": 4.196559984384441e-07, "loss": 0.4244, "step": 7895 }, { "epoch": 0.8700826446280991, "grad_norm": 6.6320719718933105, "learning_rate": 4.189551180678886e-07, "loss": 0.3294, "step": 7896 }, { "epoch": 0.8701928374655648, "grad_norm": 7.069095134735107, "learning_rate": 4.182547978737239e-07, "loss": 0.4646, "step": 7897 }, { "epoch": 0.8703030303030304, "grad_norm": 6.976655960083008, "learning_rate": 4.1755503794158547e-07, "loss": 0.4672, "step": 7898 }, { "epoch": 0.8704132231404959, "grad_norm": 6.343747138977051, "learning_rate": 4.16855838357042e-07, "loss": 0.3327, "step": 7899 }, { "epoch": 0.8705234159779615, "grad_norm": 8.259794235229492, "learning_rate": 4.161571992055924e-07, "loss": 0.3866, "step": 7900 }, { "epoch": 0.870633608815427, "grad_norm": 7.544325351715088, "learning_rate": 4.1545912057266656e-07, "loss": 0.4435, "step": 7901 }, { "epoch": 0.8707438016528926, "grad_norm": 5.710932731628418, "learning_rate": 4.1476160254362683e-07, "loss": 0.3952, "step": 7902 }, { "epoch": 0.8708539944903582, "grad_norm": 6.750667572021484, "learning_rate": 4.1406464520376664e-07, "loss": 0.3688, "step": 7903 }, { "epoch": 0.8709641873278237, "grad_norm": 5.663279056549072, "learning_rate": 4.133682486383123e-07, "loss": 0.3955, "step": 7904 }, { "epoch": 0.8710743801652893, "grad_norm": 4.945156574249268, "learning_rate": 4.126724129324178e-07, "loss": 0.4232, "step": 7905 }, { "epoch": 0.8711845730027549, "grad_norm": 11.367006301879883, "learning_rate": 4.119771381711718e-07, "loss": 0.3749, "step": 7906 }, { "epoch": 0.8712947658402204, "grad_norm": 8.525832176208496, "learning_rate": 4.1128242443959466e-07, "loss": 0.382, "step": 7907 }, { "epoch": 0.871404958677686, "grad_norm": 5.626183032989502, "learning_rate": 4.105882718226345e-07, "loss": 0.392, "step": 7908 }, { "epoch": 0.8715151515151515, "grad_norm": 7.996572971343994, "learning_rate": 4.098946804051751e-07, "loss": 0.429, "step": 7909 }, { "epoch": 0.8716253443526171, "grad_norm": 6.027618885040283, "learning_rate": 4.0920165027202975e-07, "loss": 0.3688, "step": 7910 }, { "epoch": 0.8717355371900827, "grad_norm": 4.738654136657715, "learning_rate": 4.085091815079417e-07, "loss": 0.3237, "step": 7911 }, { "epoch": 0.8718457300275482, "grad_norm": 5.284733772277832, "learning_rate": 4.0781727419758777e-07, "loss": 0.379, "step": 7912 }, { "epoch": 0.8719559228650138, "grad_norm": 7.230706214904785, "learning_rate": 4.0712592842557685e-07, "loss": 0.3235, "step": 7913 }, { "epoch": 0.8720661157024794, "grad_norm": 8.302559852600098, "learning_rate": 4.064351442764447e-07, "loss": 0.393, "step": 7914 }, { "epoch": 0.8721763085399449, "grad_norm": 10.283626556396484, "learning_rate": 4.057449218346632e-07, "loss": 0.4482, "step": 7915 }, { "epoch": 0.8722865013774105, "grad_norm": 4.496822357177734, "learning_rate": 4.0505526118463425e-07, "loss": 0.375, "step": 7916 }, { "epoch": 0.872396694214876, "grad_norm": 6.420034408569336, "learning_rate": 4.043661624106887e-07, "loss": 0.3895, "step": 7917 }, { "epoch": 0.8725068870523416, "grad_norm": 11.06975269317627, "learning_rate": 4.036776255970909e-07, "loss": 0.4565, "step": 7918 }, { "epoch": 0.8726170798898072, "grad_norm": 6.550425052642822, "learning_rate": 4.0298965082803785e-07, "loss": 0.4343, "step": 7919 }, { "epoch": 0.8727272727272727, "grad_norm": 6.285310745239258, "learning_rate": 4.02302238187654e-07, "loss": 0.3874, "step": 7920 }, { "epoch": 0.8728374655647383, "grad_norm": 4.321930408477783, "learning_rate": 4.016153877599976e-07, "loss": 0.394, "step": 7921 }, { "epoch": 0.8729476584022039, "grad_norm": 5.822112083435059, "learning_rate": 4.009290996290588e-07, "loss": 0.3841, "step": 7922 }, { "epoch": 0.8730578512396694, "grad_norm": 11.433794975280762, "learning_rate": 4.002433738787559e-07, "loss": 0.4247, "step": 7923 }, { "epoch": 0.873168044077135, "grad_norm": 9.85711669921875, "learning_rate": 3.995582105929424e-07, "loss": 0.3715, "step": 7924 }, { "epoch": 0.8732782369146006, "grad_norm": 4.688582897186279, "learning_rate": 3.9887360985539913e-07, "loss": 0.3918, "step": 7925 }, { "epoch": 0.8733884297520661, "grad_norm": 5.9062957763671875, "learning_rate": 3.9818957174984076e-07, "loss": 0.439, "step": 7926 }, { "epoch": 0.8734986225895317, "grad_norm": 7.6765570640563965, "learning_rate": 3.9750609635991313e-07, "loss": 0.3475, "step": 7927 }, { "epoch": 0.8736088154269972, "grad_norm": 6.803735733032227, "learning_rate": 3.968231837691916e-07, "loss": 0.3725, "step": 7928 }, { "epoch": 0.8737190082644628, "grad_norm": 5.69181489944458, "learning_rate": 3.9614083406118384e-07, "loss": 0.3447, "step": 7929 }, { "epoch": 0.8738292011019284, "grad_norm": 7.410181522369385, "learning_rate": 3.9545904731932926e-07, "loss": 0.4619, "step": 7930 }, { "epoch": 0.8739393939393939, "grad_norm": 6.334578990936279, "learning_rate": 3.947778236269961e-07, "loss": 0.4537, "step": 7931 }, { "epoch": 0.8740495867768595, "grad_norm": 11.030654907226562, "learning_rate": 3.940971630674867e-07, "loss": 0.3301, "step": 7932 }, { "epoch": 0.8741597796143251, "grad_norm": 8.924234390258789, "learning_rate": 3.9341706572403326e-07, "loss": 0.4461, "step": 7933 }, { "epoch": 0.8742699724517906, "grad_norm": 5.870466709136963, "learning_rate": 3.927375316797971e-07, "loss": 0.2781, "step": 7934 }, { "epoch": 0.8743801652892562, "grad_norm": 8.122753143310547, "learning_rate": 3.9205856101787457e-07, "loss": 0.4622, "step": 7935 }, { "epoch": 0.8744903581267217, "grad_norm": 9.385433197021484, "learning_rate": 3.913801538212914e-07, "loss": 0.4746, "step": 7936 }, { "epoch": 0.8746005509641873, "grad_norm": 8.047636032104492, "learning_rate": 3.907023101730023e-07, "loss": 0.3346, "step": 7937 }, { "epoch": 0.8747107438016529, "grad_norm": 5.601955890655518, "learning_rate": 3.9002503015589554e-07, "loss": 0.4383, "step": 7938 }, { "epoch": 0.8748209366391184, "grad_norm": 6.140141010284424, "learning_rate": 3.893483138527909e-07, "loss": 0.4151, "step": 7939 }, { "epoch": 0.874931129476584, "grad_norm": 5.186720848083496, "learning_rate": 3.886721613464367e-07, "loss": 0.3644, "step": 7940 }, { "epoch": 0.8750413223140496, "grad_norm": 8.273353576660156, "learning_rate": 3.879965727195145e-07, "loss": 0.4391, "step": 7941 }, { "epoch": 0.8751515151515151, "grad_norm": 6.724390029907227, "learning_rate": 3.873215480546372e-07, "loss": 0.3852, "step": 7942 }, { "epoch": 0.8752617079889807, "grad_norm": 6.30220365524292, "learning_rate": 3.8664708743434585e-07, "loss": 0.3941, "step": 7943 }, { "epoch": 0.8753719008264463, "grad_norm": 8.863051414489746, "learning_rate": 3.8597319094111516e-07, "loss": 0.384, "step": 7944 }, { "epoch": 0.8754820936639118, "grad_norm": 7.784864902496338, "learning_rate": 3.852998586573503e-07, "loss": 0.3233, "step": 7945 }, { "epoch": 0.8755922865013774, "grad_norm": 7.160646438598633, "learning_rate": 3.8462709066538763e-07, "loss": 0.3986, "step": 7946 }, { "epoch": 0.8757024793388429, "grad_norm": 9.844958305358887, "learning_rate": 3.839548870474935e-07, "loss": 0.4987, "step": 7947 }, { "epoch": 0.8758126721763085, "grad_norm": 4.915763854980469, "learning_rate": 3.832832478858656e-07, "loss": 0.4543, "step": 7948 }, { "epoch": 0.8759228650137741, "grad_norm": 9.974266052246094, "learning_rate": 3.826121732626342e-07, "loss": 0.4292, "step": 7949 }, { "epoch": 0.8760330578512396, "grad_norm": 5.884228229522705, "learning_rate": 3.8194166325985826e-07, "loss": 0.4142, "step": 7950 }, { "epoch": 0.8761432506887052, "grad_norm": 9.149958610534668, "learning_rate": 3.8127171795952766e-07, "loss": 0.4498, "step": 7951 }, { "epoch": 0.8762534435261708, "grad_norm": 7.31388521194458, "learning_rate": 3.8060233744356634e-07, "loss": 0.4345, "step": 7952 }, { "epoch": 0.8763636363636363, "grad_norm": 6.551784038543701, "learning_rate": 3.799335217938266e-07, "loss": 0.4088, "step": 7953 }, { "epoch": 0.876473829201102, "grad_norm": 4.9210662841796875, "learning_rate": 3.7926527109208967e-07, "loss": 0.3613, "step": 7954 }, { "epoch": 0.8765840220385674, "grad_norm": 9.949579238891602, "learning_rate": 3.7859758542007354e-07, "loss": 0.3757, "step": 7955 }, { "epoch": 0.876694214876033, "grad_norm": 6.613851070404053, "learning_rate": 3.779304648594223e-07, "loss": 0.4321, "step": 7956 }, { "epoch": 0.8768044077134987, "grad_norm": 7.689001560211182, "learning_rate": 3.7726390949171133e-07, "loss": 0.3359, "step": 7957 }, { "epoch": 0.8769146005509642, "grad_norm": 7.808323383331299, "learning_rate": 3.765979193984487e-07, "loss": 0.3436, "step": 7958 }, { "epoch": 0.8770247933884298, "grad_norm": 8.902508735656738, "learning_rate": 3.759324946610743e-07, "loss": 0.4354, "step": 7959 }, { "epoch": 0.8771349862258954, "grad_norm": 5.330629825592041, "learning_rate": 3.7526763536095414e-07, "loss": 0.4416, "step": 7960 }, { "epoch": 0.8772451790633609, "grad_norm": 9.913277626037598, "learning_rate": 3.7460334157938983e-07, "loss": 0.4213, "step": 7961 }, { "epoch": 0.8773553719008265, "grad_norm": 5.145281791687012, "learning_rate": 3.73939613397612e-07, "loss": 0.3298, "step": 7962 }, { "epoch": 0.877465564738292, "grad_norm": 7.522690296173096, "learning_rate": 3.732764508967829e-07, "loss": 0.4063, "step": 7963 }, { "epoch": 0.8775757575757576, "grad_norm": 5.090415954589844, "learning_rate": 3.7261385415799325e-07, "loss": 0.3964, "step": 7964 }, { "epoch": 0.8776859504132232, "grad_norm": 14.363067626953125, "learning_rate": 3.7195182326226765e-07, "loss": 0.431, "step": 7965 }, { "epoch": 0.8777961432506887, "grad_norm": 6.970553398132324, "learning_rate": 3.7129035829056027e-07, "loss": 0.3938, "step": 7966 }, { "epoch": 0.8779063360881543, "grad_norm": 8.049395561218262, "learning_rate": 3.706294593237542e-07, "loss": 0.4911, "step": 7967 }, { "epoch": 0.8780165289256199, "grad_norm": 6.71792459487915, "learning_rate": 3.699691264426664e-07, "loss": 0.4038, "step": 7968 }, { "epoch": 0.8781267217630854, "grad_norm": 4.56690788269043, "learning_rate": 3.6930935972804395e-07, "loss": 0.3587, "step": 7969 }, { "epoch": 0.878236914600551, "grad_norm": 3.843503475189209, "learning_rate": 3.6865015926056237e-07, "loss": 0.4201, "step": 7970 }, { "epoch": 0.8783471074380166, "grad_norm": 6.373324394226074, "learning_rate": 3.679915251208305e-07, "loss": 0.3347, "step": 7971 }, { "epoch": 0.8784573002754821, "grad_norm": 5.485124111175537, "learning_rate": 3.6733345738938776e-07, "loss": 0.3936, "step": 7972 }, { "epoch": 0.8785674931129477, "grad_norm": 5.12364387512207, "learning_rate": 3.666759561467015e-07, "loss": 0.3765, "step": 7973 }, { "epoch": 0.8786776859504132, "grad_norm": 7.823952674865723, "learning_rate": 3.6601902147317345e-07, "loss": 0.4422, "step": 7974 }, { "epoch": 0.8787878787878788, "grad_norm": 5.81721305847168, "learning_rate": 3.653626534491345e-07, "loss": 0.4155, "step": 7975 }, { "epoch": 0.8788980716253444, "grad_norm": 8.598370552062988, "learning_rate": 3.6470685215484525e-07, "loss": 0.4148, "step": 7976 }, { "epoch": 0.8790082644628099, "grad_norm": 5.4404826164245605, "learning_rate": 3.640516176704989e-07, "loss": 0.4311, "step": 7977 }, { "epoch": 0.8791184573002755, "grad_norm": 5.446481227874756, "learning_rate": 3.6339695007621855e-07, "loss": 0.4092, "step": 7978 }, { "epoch": 0.8792286501377411, "grad_norm": 6.2118401527404785, "learning_rate": 3.627428494520563e-07, "loss": 0.3892, "step": 7979 }, { "epoch": 0.8793388429752066, "grad_norm": 5.770524024963379, "learning_rate": 3.6208931587799813e-07, "loss": 0.2954, "step": 7980 }, { "epoch": 0.8794490358126722, "grad_norm": 4.975718021392822, "learning_rate": 3.6143634943395846e-07, "loss": 0.3841, "step": 7981 }, { "epoch": 0.8795592286501377, "grad_norm": 12.857072830200195, "learning_rate": 3.607839501997823e-07, "loss": 0.5214, "step": 7982 }, { "epoch": 0.8796694214876033, "grad_norm": 10.470033645629883, "learning_rate": 3.6013211825524754e-07, "loss": 0.4818, "step": 7983 }, { "epoch": 0.8797796143250689, "grad_norm": 7.853455543518066, "learning_rate": 3.5948085368005926e-07, "loss": 0.4785, "step": 7984 }, { "epoch": 0.8798898071625344, "grad_norm": 16.928417205810547, "learning_rate": 3.5883015655385544e-07, "loss": 0.4825, "step": 7985 }, { "epoch": 0.88, "grad_norm": 5.691904544830322, "learning_rate": 3.5818002695620526e-07, "loss": 0.3282, "step": 7986 }, { "epoch": 0.8801101928374656, "grad_norm": 6.92352294921875, "learning_rate": 3.5753046496660614e-07, "loss": 0.3857, "step": 7987 }, { "epoch": 0.8802203856749311, "grad_norm": 9.776923179626465, "learning_rate": 3.5688147066448744e-07, "loss": 0.4823, "step": 7988 }, { "epoch": 0.8803305785123967, "grad_norm": 6.693477153778076, "learning_rate": 3.562330441292111e-07, "loss": 0.3555, "step": 7989 }, { "epoch": 0.8804407713498622, "grad_norm": 12.216975212097168, "learning_rate": 3.5558518544006493e-07, "loss": 0.5197, "step": 7990 }, { "epoch": 0.8805509641873278, "grad_norm": 8.715909004211426, "learning_rate": 3.549378946762705e-07, "loss": 0.3948, "step": 7991 }, { "epoch": 0.8806611570247934, "grad_norm": 8.513862609863281, "learning_rate": 3.542911719169817e-07, "loss": 0.4894, "step": 7992 }, { "epoch": 0.8807713498622589, "grad_norm": 8.399560928344727, "learning_rate": 3.536450172412775e-07, "loss": 0.5353, "step": 7993 }, { "epoch": 0.8808815426997245, "grad_norm": 6.549276351928711, "learning_rate": 3.5299943072817257e-07, "loss": 0.4302, "step": 7994 }, { "epoch": 0.8809917355371901, "grad_norm": 6.469832420349121, "learning_rate": 3.523544124566103e-07, "loss": 0.3995, "step": 7995 }, { "epoch": 0.8811019283746556, "grad_norm": 5.7896246910095215, "learning_rate": 3.517099625054626e-07, "loss": 0.3088, "step": 7996 }, { "epoch": 0.8812121212121212, "grad_norm": 6.694138526916504, "learning_rate": 3.510660809535349e-07, "loss": 0.364, "step": 7997 }, { "epoch": 0.8813223140495868, "grad_norm": 6.127065181732178, "learning_rate": 3.504227678795624e-07, "loss": 0.3851, "step": 7998 }, { "epoch": 0.8814325068870523, "grad_norm": 6.639011859893799, "learning_rate": 3.4978002336220953e-07, "loss": 0.4902, "step": 7999 }, { "epoch": 0.8815426997245179, "grad_norm": 7.459198951721191, "learning_rate": 3.4913784748007163e-07, "loss": 0.3475, "step": 8000 }, { "epoch": 0.8816528925619834, "grad_norm": 5.273727893829346, "learning_rate": 3.4849624031167593e-07, "loss": 0.4321, "step": 8001 }, { "epoch": 0.881763085399449, "grad_norm": 8.887303352355957, "learning_rate": 3.4785520193547806e-07, "loss": 0.3721, "step": 8002 }, { "epoch": 0.8818732782369146, "grad_norm": 8.102290153503418, "learning_rate": 3.472147324298647e-07, "loss": 0.3801, "step": 8003 }, { "epoch": 0.8819834710743801, "grad_norm": 6.330785274505615, "learning_rate": 3.465748318731549e-07, "loss": 0.3685, "step": 8004 }, { "epoch": 0.8820936639118457, "grad_norm": 5.924086570739746, "learning_rate": 3.45935500343596e-07, "loss": 0.4131, "step": 8005 }, { "epoch": 0.8822038567493113, "grad_norm": 6.860537052154541, "learning_rate": 3.4529673791936556e-07, "loss": 0.4818, "step": 8006 }, { "epoch": 0.8823140495867768, "grad_norm": 7.429711818695068, "learning_rate": 3.446585446785722e-07, "loss": 0.315, "step": 8007 }, { "epoch": 0.8824242424242424, "grad_norm": 4.881139755249023, "learning_rate": 3.440209206992573e-07, "loss": 0.3868, "step": 8008 }, { "epoch": 0.8825344352617079, "grad_norm": 8.522668838500977, "learning_rate": 3.43383866059388e-07, "loss": 0.4128, "step": 8009 }, { "epoch": 0.8826446280991735, "grad_norm": 9.63332462310791, "learning_rate": 3.427473808368637e-07, "loss": 0.4244, "step": 8010 }, { "epoch": 0.8827548209366392, "grad_norm": 9.569814682006836, "learning_rate": 3.4211146510951755e-07, "loss": 0.3674, "step": 8011 }, { "epoch": 0.8828650137741046, "grad_norm": 5.571114540100098, "learning_rate": 3.414761189551086e-07, "loss": 0.3177, "step": 8012 }, { "epoch": 0.8829752066115703, "grad_norm": 11.988824844360352, "learning_rate": 3.408413424513263e-07, "loss": 0.5143, "step": 8013 }, { "epoch": 0.8830853994490359, "grad_norm": 7.931176662445068, "learning_rate": 3.402071356757947e-07, "loss": 0.4564, "step": 8014 }, { "epoch": 0.8831955922865014, "grad_norm": 8.800257682800293, "learning_rate": 3.395734987060645e-07, "loss": 0.3842, "step": 8015 }, { "epoch": 0.883305785123967, "grad_norm": 4.346943378448486, "learning_rate": 3.3894043161961653e-07, "loss": 0.2961, "step": 8016 }, { "epoch": 0.8834159779614325, "grad_norm": 4.912277698516846, "learning_rate": 3.383079344938639e-07, "loss": 0.3357, "step": 8017 }, { "epoch": 0.8835261707988981, "grad_norm": 5.641799449920654, "learning_rate": 3.376760074061497e-07, "loss": 0.4385, "step": 8018 }, { "epoch": 0.8836363636363637, "grad_norm": 7.622751712799072, "learning_rate": 3.3704465043374714e-07, "loss": 0.4138, "step": 8019 }, { "epoch": 0.8837465564738292, "grad_norm": 6.354327201843262, "learning_rate": 3.3641386365385773e-07, "loss": 0.4326, "step": 8020 }, { "epoch": 0.8838567493112948, "grad_norm": 6.589369297027588, "learning_rate": 3.3578364714361597e-07, "loss": 0.481, "step": 8021 }, { "epoch": 0.8839669421487604, "grad_norm": 5.158421039581299, "learning_rate": 3.351540009800869e-07, "loss": 0.434, "step": 8022 }, { "epoch": 0.8840771349862259, "grad_norm": 7.01084041595459, "learning_rate": 3.3452492524026156e-07, "loss": 0.3777, "step": 8023 }, { "epoch": 0.8841873278236915, "grad_norm": 5.994622230529785, "learning_rate": 3.338964200010664e-07, "loss": 0.4062, "step": 8024 }, { "epoch": 0.8842975206611571, "grad_norm": 7.221744537353516, "learning_rate": 3.3326848533935584e-07, "loss": 0.3739, "step": 8025 }, { "epoch": 0.8844077134986226, "grad_norm": 5.481081962585449, "learning_rate": 3.3264112133191307e-07, "loss": 0.3611, "step": 8026 }, { "epoch": 0.8845179063360882, "grad_norm": 8.957030296325684, "learning_rate": 3.3201432805545387e-07, "loss": 0.3442, "step": 8027 }, { "epoch": 0.8846280991735537, "grad_norm": 10.126974105834961, "learning_rate": 3.313881055866247e-07, "loss": 0.4113, "step": 8028 }, { "epoch": 0.8847382920110193, "grad_norm": 6.311191082000732, "learning_rate": 3.307624540019988e-07, "loss": 0.4409, "step": 8029 }, { "epoch": 0.8848484848484849, "grad_norm": 5.306412220001221, "learning_rate": 3.3013737337808217e-07, "loss": 0.4359, "step": 8030 }, { "epoch": 0.8849586776859504, "grad_norm": 6.894516468048096, "learning_rate": 3.29512863791312e-07, "loss": 0.4272, "step": 8031 }, { "epoch": 0.885068870523416, "grad_norm": 5.035217761993408, "learning_rate": 3.288889253180522e-07, "loss": 0.4076, "step": 8032 }, { "epoch": 0.8851790633608816, "grad_norm": 8.280344009399414, "learning_rate": 3.282655580346e-07, "loss": 0.4444, "step": 8033 }, { "epoch": 0.8852892561983471, "grad_norm": 8.162149429321289, "learning_rate": 3.276427620171818e-07, "loss": 0.3521, "step": 8034 }, { "epoch": 0.8853994490358127, "grad_norm": 12.015260696411133, "learning_rate": 3.270205373419527e-07, "loss": 0.5101, "step": 8035 }, { "epoch": 0.8855096418732782, "grad_norm": 10.396943092346191, "learning_rate": 3.2639888408499964e-07, "loss": 0.4122, "step": 8036 }, { "epoch": 0.8856198347107438, "grad_norm": 6.952391624450684, "learning_rate": 3.257778023223407e-07, "loss": 0.3836, "step": 8037 }, { "epoch": 0.8857300275482094, "grad_norm": 6.282987117767334, "learning_rate": 3.251572921299206e-07, "loss": 0.4001, "step": 8038 }, { "epoch": 0.8858402203856749, "grad_norm": 8.051639556884766, "learning_rate": 3.245373535836166e-07, "loss": 0.4535, "step": 8039 }, { "epoch": 0.8859504132231405, "grad_norm": 7.128776550292969, "learning_rate": 3.2391798675923735e-07, "loss": 0.3403, "step": 8040 }, { "epoch": 0.8860606060606061, "grad_norm": 6.599671363830566, "learning_rate": 3.2329919173251734e-07, "loss": 0.3496, "step": 8041 }, { "epoch": 0.8861707988980716, "grad_norm": 5.977092742919922, "learning_rate": 3.2268096857912547e-07, "loss": 0.356, "step": 8042 }, { "epoch": 0.8862809917355372, "grad_norm": 7.180810451507568, "learning_rate": 3.220633173746579e-07, "loss": 0.4067, "step": 8043 }, { "epoch": 0.8863911845730027, "grad_norm": 8.043085098266602, "learning_rate": 3.2144623819464205e-07, "loss": 0.3187, "step": 8044 }, { "epoch": 0.8865013774104683, "grad_norm": 7.067012310028076, "learning_rate": 3.2082973111453587e-07, "loss": 0.4156, "step": 8045 }, { "epoch": 0.8866115702479339, "grad_norm": 11.196881294250488, "learning_rate": 3.2021379620972513e-07, "loss": 0.4194, "step": 8046 }, { "epoch": 0.8867217630853994, "grad_norm": 8.067953109741211, "learning_rate": 3.1959843355552964e-07, "loss": 0.4077, "step": 8047 }, { "epoch": 0.886831955922865, "grad_norm": 4.846611499786377, "learning_rate": 3.189836432271953e-07, "loss": 0.3907, "step": 8048 }, { "epoch": 0.8869421487603306, "grad_norm": 13.707871437072754, "learning_rate": 3.183694252998987e-07, "loss": 0.4561, "step": 8049 }, { "epoch": 0.8870523415977961, "grad_norm": 4.391310691833496, "learning_rate": 3.177557798487485e-07, "loss": 0.4253, "step": 8050 }, { "epoch": 0.8871625344352617, "grad_norm": 7.767878532409668, "learning_rate": 3.17142706948782e-07, "loss": 0.3253, "step": 8051 }, { "epoch": 0.8872727272727273, "grad_norm": 5.446445941925049, "learning_rate": 3.1653020667496593e-07, "loss": 0.42, "step": 8052 }, { "epoch": 0.8873829201101928, "grad_norm": 6.431450366973877, "learning_rate": 3.1591827910219806e-07, "loss": 0.4241, "step": 8053 }, { "epoch": 0.8874931129476584, "grad_norm": 9.028473854064941, "learning_rate": 3.153069243053064e-07, "loss": 0.4122, "step": 8054 }, { "epoch": 0.8876033057851239, "grad_norm": 8.524662017822266, "learning_rate": 3.146961423590472e-07, "loss": 0.4029, "step": 8055 }, { "epoch": 0.8877134986225895, "grad_norm": 5.424405097961426, "learning_rate": 3.1408593333810743e-07, "loss": 0.3575, "step": 8056 }, { "epoch": 0.8878236914600551, "grad_norm": 12.417449951171875, "learning_rate": 3.1347629731710625e-07, "loss": 0.4773, "step": 8057 }, { "epoch": 0.8879338842975206, "grad_norm": 12.965645790100098, "learning_rate": 3.128672343705885e-07, "loss": 0.5651, "step": 8058 }, { "epoch": 0.8880440771349862, "grad_norm": 4.561319351196289, "learning_rate": 3.1225874457303183e-07, "loss": 0.3918, "step": 8059 }, { "epoch": 0.8881542699724518, "grad_norm": 6.537079811096191, "learning_rate": 3.1165082799884396e-07, "loss": 0.4639, "step": 8060 }, { "epoch": 0.8882644628099173, "grad_norm": 4.994674205780029, "learning_rate": 3.1104348472236203e-07, "loss": 0.3408, "step": 8061 }, { "epoch": 0.8883746556473829, "grad_norm": 6.84326696395874, "learning_rate": 3.104367148178511e-07, "loss": 0.3811, "step": 8062 }, { "epoch": 0.8884848484848484, "grad_norm": 14.176671981811523, "learning_rate": 3.0983051835950904e-07, "loss": 0.5579, "step": 8063 }, { "epoch": 0.888595041322314, "grad_norm": 7.461105823516846, "learning_rate": 3.092248954214627e-07, "loss": 0.4269, "step": 8064 }, { "epoch": 0.8887052341597796, "grad_norm": 8.84874153137207, "learning_rate": 3.086198460777684e-07, "loss": 0.3993, "step": 8065 }, { "epoch": 0.8888154269972451, "grad_norm": 6.398798942565918, "learning_rate": 3.080153704024097e-07, "loss": 0.4355, "step": 8066 }, { "epoch": 0.8889256198347107, "grad_norm": 19.387798309326172, "learning_rate": 3.074114684693069e-07, "loss": 0.4238, "step": 8067 }, { "epoch": 0.8890358126721764, "grad_norm": 11.25837230682373, "learning_rate": 3.068081403523043e-07, "loss": 0.4366, "step": 8068 }, { "epoch": 0.8891460055096418, "grad_norm": 6.566999912261963, "learning_rate": 3.0620538612517567e-07, "loss": 0.3861, "step": 8069 }, { "epoch": 0.8892561983471075, "grad_norm": 5.205772876739502, "learning_rate": 3.056032058616293e-07, "loss": 0.3086, "step": 8070 }, { "epoch": 0.8893663911845731, "grad_norm": 16.4765567779541, "learning_rate": 3.050015996353001e-07, "loss": 0.4317, "step": 8071 }, { "epoch": 0.8894765840220386, "grad_norm": 7.251389503479004, "learning_rate": 3.044005675197514e-07, "loss": 0.41, "step": 8072 }, { "epoch": 0.8895867768595042, "grad_norm": 5.62153959274292, "learning_rate": 3.0380010958848125e-07, "loss": 0.3659, "step": 8073 }, { "epoch": 0.8896969696969697, "grad_norm": 4.930037498474121, "learning_rate": 3.0320022591491193e-07, "loss": 0.4092, "step": 8074 }, { "epoch": 0.8898071625344353, "grad_norm": 7.107131004333496, "learning_rate": 3.026009165723992e-07, "loss": 0.4444, "step": 8075 }, { "epoch": 0.8899173553719009, "grad_norm": 5.48328161239624, "learning_rate": 3.0200218163422725e-07, "loss": 0.4075, "step": 8076 }, { "epoch": 0.8900275482093664, "grad_norm": 10.584638595581055, "learning_rate": 3.014040211736097e-07, "loss": 0.3669, "step": 8077 }, { "epoch": 0.890137741046832, "grad_norm": 8.460695266723633, "learning_rate": 3.0080643526369135e-07, "loss": 0.4176, "step": 8078 }, { "epoch": 0.8902479338842976, "grad_norm": 7.722398281097412, "learning_rate": 3.0020942397754493e-07, "loss": 0.3903, "step": 8079 }, { "epoch": 0.8903581267217631, "grad_norm": 8.00770092010498, "learning_rate": 2.9961298738817424e-07, "loss": 0.3857, "step": 8080 }, { "epoch": 0.8904683195592287, "grad_norm": 5.820324420928955, "learning_rate": 2.9901712556851315e-07, "loss": 0.4006, "step": 8081 }, { "epoch": 0.8905785123966942, "grad_norm": 5.2179975509643555, "learning_rate": 2.984218385914228e-07, "loss": 0.3543, "step": 8082 }, { "epoch": 0.8906887052341598, "grad_norm": 5.526549339294434, "learning_rate": 2.9782712652969615e-07, "loss": 0.4097, "step": 8083 }, { "epoch": 0.8907988980716254, "grad_norm": 5.95803165435791, "learning_rate": 2.9723298945605663e-07, "loss": 0.3751, "step": 8084 }, { "epoch": 0.8909090909090909, "grad_norm": 8.43634033203125, "learning_rate": 2.9663942744315443e-07, "loss": 0.3998, "step": 8085 }, { "epoch": 0.8910192837465565, "grad_norm": 9.699858665466309, "learning_rate": 2.960464405635721e-07, "loss": 0.3533, "step": 8086 }, { "epoch": 0.8911294765840221, "grad_norm": 7.270328998565674, "learning_rate": 2.954540288898217e-07, "loss": 0.3432, "step": 8087 }, { "epoch": 0.8912396694214876, "grad_norm": 8.234623908996582, "learning_rate": 2.9486219249434234e-07, "loss": 0.3458, "step": 8088 }, { "epoch": 0.8913498622589532, "grad_norm": 8.299948692321777, "learning_rate": 2.9427093144950513e-07, "loss": 0.3965, "step": 8089 }, { "epoch": 0.8914600550964187, "grad_norm": 4.60298490524292, "learning_rate": 2.936802458276111e-07, "loss": 0.3638, "step": 8090 }, { "epoch": 0.8915702479338843, "grad_norm": 7.8577165603637695, "learning_rate": 2.9309013570088916e-07, "loss": 0.4339, "step": 8091 }, { "epoch": 0.8916804407713499, "grad_norm": 6.670377254486084, "learning_rate": 2.9250060114149883e-07, "loss": 0.4261, "step": 8092 }, { "epoch": 0.8917906336088154, "grad_norm": 4.505085468292236, "learning_rate": 2.9191164222153014e-07, "loss": 0.3706, "step": 8093 }, { "epoch": 0.891900826446281, "grad_norm": 4.422532558441162, "learning_rate": 2.9132325901300106e-07, "loss": 0.3651, "step": 8094 }, { "epoch": 0.8920110192837466, "grad_norm": 8.972837448120117, "learning_rate": 2.907354515878591e-07, "loss": 0.41, "step": 8095 }, { "epoch": 0.8921212121212121, "grad_norm": 7.192322254180908, "learning_rate": 2.9014822001798446e-07, "loss": 0.3484, "step": 8096 }, { "epoch": 0.8922314049586777, "grad_norm": 8.343283653259277, "learning_rate": 2.8956156437518204e-07, "loss": 0.4628, "step": 8097 }, { "epoch": 0.8923415977961433, "grad_norm": 8.42356014251709, "learning_rate": 2.88975484731191e-07, "loss": 0.495, "step": 8098 }, { "epoch": 0.8924517906336088, "grad_norm": 4.533227443695068, "learning_rate": 2.8838998115767623e-07, "loss": 0.2961, "step": 8099 }, { "epoch": 0.8925619834710744, "grad_norm": 9.942359924316406, "learning_rate": 2.8780505372623444e-07, "loss": 0.4207, "step": 8100 }, { "epoch": 0.8926721763085399, "grad_norm": 7.865906715393066, "learning_rate": 2.8722070250839283e-07, "loss": 0.393, "step": 8101 }, { "epoch": 0.8927823691460055, "grad_norm": 8.467253684997559, "learning_rate": 2.866369275756037e-07, "loss": 0.4613, "step": 8102 }, { "epoch": 0.8928925619834711, "grad_norm": 14.772082328796387, "learning_rate": 2.860537289992549e-07, "loss": 0.5226, "step": 8103 }, { "epoch": 0.8930027548209366, "grad_norm": 4.3217997550964355, "learning_rate": 2.854711068506594e-07, "loss": 0.3545, "step": 8104 }, { "epoch": 0.8931129476584022, "grad_norm": 4.720042705535889, "learning_rate": 2.848890612010591e-07, "loss": 0.4085, "step": 8105 }, { "epoch": 0.8932231404958678, "grad_norm": 8.005223274230957, "learning_rate": 2.843075921216315e-07, "loss": 0.3778, "step": 8106 }, { "epoch": 0.8933333333333333, "grad_norm": 6.306997299194336, "learning_rate": 2.8372669968347645e-07, "loss": 0.3506, "step": 8107 }, { "epoch": 0.8934435261707989, "grad_norm": 6.133072376251221, "learning_rate": 2.831463839576271e-07, "loss": 0.3684, "step": 8108 }, { "epoch": 0.8935537190082644, "grad_norm": 4.7656989097595215, "learning_rate": 2.825666450150444e-07, "loss": 0.3681, "step": 8109 }, { "epoch": 0.89366391184573, "grad_norm": 5.7682013511657715, "learning_rate": 2.8198748292662114e-07, "loss": 0.3956, "step": 8110 }, { "epoch": 0.8937741046831956, "grad_norm": 10.335726737976074, "learning_rate": 2.814088977631768e-07, "loss": 0.4726, "step": 8111 }, { "epoch": 0.8938842975206611, "grad_norm": 5.817956924438477, "learning_rate": 2.80830889595462e-07, "loss": 0.4033, "step": 8112 }, { "epoch": 0.8939944903581267, "grad_norm": 5.217440128326416, "learning_rate": 2.802534584941569e-07, "loss": 0.3323, "step": 8113 }, { "epoch": 0.8941046831955923, "grad_norm": 6.813904285430908, "learning_rate": 2.7967660452986987e-07, "loss": 0.4036, "step": 8114 }, { "epoch": 0.8942148760330578, "grad_norm": 8.03448486328125, "learning_rate": 2.791003277731391e-07, "loss": 0.3333, "step": 8115 }, { "epoch": 0.8943250688705234, "grad_norm": 6.301514148712158, "learning_rate": 2.7852462829443314e-07, "loss": 0.3822, "step": 8116 }, { "epoch": 0.8944352617079889, "grad_norm": 6.421783447265625, "learning_rate": 2.779495061641496e-07, "loss": 0.454, "step": 8117 }, { "epoch": 0.8945454545454545, "grad_norm": 5.428205966949463, "learning_rate": 2.773749614526139e-07, "loss": 0.3974, "step": 8118 }, { "epoch": 0.8946556473829201, "grad_norm": 8.008651733398438, "learning_rate": 2.768009942300831e-07, "loss": 0.4237, "step": 8119 }, { "epoch": 0.8947658402203856, "grad_norm": 5.937148094177246, "learning_rate": 2.762276045667439e-07, "loss": 0.3997, "step": 8120 }, { "epoch": 0.8948760330578512, "grad_norm": 6.53151798248291, "learning_rate": 2.756547925327085e-07, "loss": 0.3953, "step": 8121 }, { "epoch": 0.8949862258953168, "grad_norm": 6.310358047485352, "learning_rate": 2.7508255819802266e-07, "loss": 0.3669, "step": 8122 }, { "epoch": 0.8950964187327823, "grad_norm": 6.95485782623291, "learning_rate": 2.7451090163266026e-07, "loss": 0.3781, "step": 8123 }, { "epoch": 0.895206611570248, "grad_norm": 5.507017612457275, "learning_rate": 2.7393982290652374e-07, "loss": 0.417, "step": 8124 }, { "epoch": 0.8953168044077136, "grad_norm": 4.009406566619873, "learning_rate": 2.7336932208944403e-07, "loss": 0.3, "step": 8125 }, { "epoch": 0.895426997245179, "grad_norm": 8.58454418182373, "learning_rate": 2.727993992511857e-07, "loss": 0.4231, "step": 8126 }, { "epoch": 0.8955371900826447, "grad_norm": 7.0145769119262695, "learning_rate": 2.722300544614381e-07, "loss": 0.4203, "step": 8127 }, { "epoch": 0.8956473829201101, "grad_norm": 9.251009941101074, "learning_rate": 2.7166128778982005e-07, "loss": 0.4215, "step": 8128 }, { "epoch": 0.8957575757575758, "grad_norm": 5.557025909423828, "learning_rate": 2.7109309930588413e-07, "loss": 0.3281, "step": 8129 }, { "epoch": 0.8958677685950414, "grad_norm": 8.478955268859863, "learning_rate": 2.705254890791065e-07, "loss": 0.5145, "step": 8130 }, { "epoch": 0.8959779614325069, "grad_norm": 7.705807209014893, "learning_rate": 2.6995845717889715e-07, "loss": 0.4013, "step": 8131 }, { "epoch": 0.8960881542699725, "grad_norm": 8.202071189880371, "learning_rate": 2.693920036745923e-07, "loss": 0.4038, "step": 8132 }, { "epoch": 0.8961983471074381, "grad_norm": 3.9696667194366455, "learning_rate": 2.688261286354593e-07, "loss": 0.3579, "step": 8133 }, { "epoch": 0.8963085399449036, "grad_norm": 10.224541664123535, "learning_rate": 2.6826083213069453e-07, "loss": 0.4634, "step": 8134 }, { "epoch": 0.8964187327823692, "grad_norm": 10.307765007019043, "learning_rate": 2.6769611422942155e-07, "loss": 0.4637, "step": 8135 }, { "epoch": 0.8965289256198347, "grad_norm": 3.828857898712158, "learning_rate": 2.671319750006962e-07, "loss": 0.3909, "step": 8136 }, { "epoch": 0.8966391184573003, "grad_norm": 7.539666175842285, "learning_rate": 2.6656841451350277e-07, "loss": 0.4047, "step": 8137 }, { "epoch": 0.8967493112947659, "grad_norm": 7.9426398277282715, "learning_rate": 2.660054328367523e-07, "loss": 0.4304, "step": 8138 }, { "epoch": 0.8968595041322314, "grad_norm": 5.386150360107422, "learning_rate": 2.654430300392885e-07, "loss": 0.3594, "step": 8139 }, { "epoch": 0.896969696969697, "grad_norm": 4.974326133728027, "learning_rate": 2.6488120618988256e-07, "loss": 0.322, "step": 8140 }, { "epoch": 0.8970798898071626, "grad_norm": 7.355337619781494, "learning_rate": 2.643199613572345e-07, "loss": 0.4132, "step": 8141 }, { "epoch": 0.8971900826446281, "grad_norm": 7.0208916664123535, "learning_rate": 2.637592956099738e-07, "loss": 0.3688, "step": 8142 }, { "epoch": 0.8973002754820937, "grad_norm": 5.85609245300293, "learning_rate": 2.6319920901666073e-07, "loss": 0.3767, "step": 8143 }, { "epoch": 0.8974104683195592, "grad_norm": 9.025439262390137, "learning_rate": 2.626397016457827e-07, "loss": 0.4982, "step": 8144 }, { "epoch": 0.8975206611570248, "grad_norm": 5.851327896118164, "learning_rate": 2.62080773565756e-07, "loss": 0.4023, "step": 8145 }, { "epoch": 0.8976308539944904, "grad_norm": 6.238762855529785, "learning_rate": 2.6152242484492943e-07, "loss": 0.416, "step": 8146 }, { "epoch": 0.8977410468319559, "grad_norm": 5.96334981918335, "learning_rate": 2.6096465555157655e-07, "loss": 0.4034, "step": 8147 }, { "epoch": 0.8978512396694215, "grad_norm": 8.227760314941406, "learning_rate": 2.6040746575390295e-07, "loss": 0.3429, "step": 8148 }, { "epoch": 0.8979614325068871, "grad_norm": 9.91084098815918, "learning_rate": 2.598508555200435e-07, "loss": 0.4176, "step": 8149 }, { "epoch": 0.8980716253443526, "grad_norm": 5.673585891723633, "learning_rate": 2.592948249180594e-07, "loss": 0.3466, "step": 8150 }, { "epoch": 0.8981818181818182, "grad_norm": 5.9524641036987305, "learning_rate": 2.587393740159433e-07, "loss": 0.3574, "step": 8151 }, { "epoch": 0.8982920110192838, "grad_norm": 8.94787883758545, "learning_rate": 2.5818450288161823e-07, "loss": 0.5022, "step": 8152 }, { "epoch": 0.8984022038567493, "grad_norm": 7.699998378753662, "learning_rate": 2.5763021158293213e-07, "loss": 0.4089, "step": 8153 }, { "epoch": 0.8985123966942149, "grad_norm": 3.6122827529907227, "learning_rate": 2.5707650018766627e-07, "loss": 0.3893, "step": 8154 }, { "epoch": 0.8986225895316804, "grad_norm": 4.201280117034912, "learning_rate": 2.565233687635288e-07, "loss": 0.3328, "step": 8155 }, { "epoch": 0.898732782369146, "grad_norm": 10.086000442504883, "learning_rate": 2.559708173781561e-07, "loss": 0.4042, "step": 8156 }, { "epoch": 0.8988429752066116, "grad_norm": 10.268120765686035, "learning_rate": 2.554188460991175e-07, "loss": 0.4516, "step": 8157 }, { "epoch": 0.8989531680440771, "grad_norm": 5.0997700691223145, "learning_rate": 2.5486745499390564e-07, "loss": 0.3556, "step": 8158 }, { "epoch": 0.8990633608815427, "grad_norm": 8.162371635437012, "learning_rate": 2.5431664412994774e-07, "loss": 0.3871, "step": 8159 }, { "epoch": 0.8991735537190083, "grad_norm": 3.4119391441345215, "learning_rate": 2.5376641357459765e-07, "loss": 0.3638, "step": 8160 }, { "epoch": 0.8992837465564738, "grad_norm": 6.416200160980225, "learning_rate": 2.53216763395136e-07, "loss": 0.3328, "step": 8161 }, { "epoch": 0.8993939393939394, "grad_norm": 5.656872272491455, "learning_rate": 2.5266769365877796e-07, "loss": 0.3552, "step": 8162 }, { "epoch": 0.8995041322314049, "grad_norm": 5.099634647369385, "learning_rate": 2.5211920443266314e-07, "loss": 0.3377, "step": 8163 }, { "epoch": 0.8996143250688705, "grad_norm": 5.004369735717773, "learning_rate": 2.5157129578386007e-07, "loss": 0.301, "step": 8164 }, { "epoch": 0.8997245179063361, "grad_norm": 7.1246724128723145, "learning_rate": 2.5102396777936965e-07, "loss": 0.3258, "step": 8165 }, { "epoch": 0.8998347107438016, "grad_norm": 5.042446136474609, "learning_rate": 2.5047722048611944e-07, "loss": 0.424, "step": 8166 }, { "epoch": 0.8999449035812672, "grad_norm": 7.238094806671143, "learning_rate": 2.4993105397096596e-07, "loss": 0.4319, "step": 8167 }, { "epoch": 0.9000550964187328, "grad_norm": 6.154951095581055, "learning_rate": 2.4938546830069575e-07, "loss": 0.4727, "step": 8168 }, { "epoch": 0.9001652892561983, "grad_norm": 5.778834342956543, "learning_rate": 2.4884046354202383e-07, "loss": 0.3655, "step": 8169 }, { "epoch": 0.9002754820936639, "grad_norm": 5.150241851806641, "learning_rate": 2.482960397615936e-07, "loss": 0.4015, "step": 8170 }, { "epoch": 0.9003856749311295, "grad_norm": 8.055282592773438, "learning_rate": 2.4775219702597777e-07, "loss": 0.4048, "step": 8171 }, { "epoch": 0.900495867768595, "grad_norm": 8.553960800170898, "learning_rate": 2.472089354016788e-07, "loss": 0.4267, "step": 8172 }, { "epoch": 0.900495867768595, "eval_loss": 0.39516323804855347, "eval_runtime": 41.972, "eval_samples_per_second": 17.488, "eval_steps_per_second": 2.192, "step": 8172 }, { "epoch": 0.9006060606060606, "grad_norm": 5.766805648803711, "learning_rate": 2.466662549551274e-07, "loss": 0.3505, "step": 8173 }, { "epoch": 0.9007162534435261, "grad_norm": 7.682420253753662, "learning_rate": 2.4612415575268276e-07, "loss": 0.396, "step": 8174 }, { "epoch": 0.9008264462809917, "grad_norm": 4.986112594604492, "learning_rate": 2.4558263786063406e-07, "loss": 0.3932, "step": 8175 }, { "epoch": 0.9009366391184573, "grad_norm": 7.434338092803955, "learning_rate": 2.4504170134519944e-07, "loss": 0.4025, "step": 8176 }, { "epoch": 0.9010468319559228, "grad_norm": 8.345190048217773, "learning_rate": 2.4450134627252376e-07, "loss": 0.395, "step": 8177 }, { "epoch": 0.9011570247933884, "grad_norm": 6.5656633377075195, "learning_rate": 2.4396157270868304e-07, "loss": 0.421, "step": 8178 }, { "epoch": 0.901267217630854, "grad_norm": 10.848407745361328, "learning_rate": 2.434223807196823e-07, "loss": 0.4032, "step": 8179 }, { "epoch": 0.9013774104683195, "grad_norm": 5.726838111877441, "learning_rate": 2.4288377037145315e-07, "loss": 0.4159, "step": 8180 }, { "epoch": 0.9014876033057851, "grad_norm": 9.377209663391113, "learning_rate": 2.423457417298591e-07, "loss": 0.4314, "step": 8181 }, { "epoch": 0.9015977961432506, "grad_norm": 6.047981262207031, "learning_rate": 2.4180829486069037e-07, "loss": 0.4007, "step": 8182 }, { "epoch": 0.9017079889807162, "grad_norm": 5.022491455078125, "learning_rate": 2.41271429829667e-07, "loss": 0.3669, "step": 8183 }, { "epoch": 0.9018181818181819, "grad_norm": 7.403291702270508, "learning_rate": 2.4073514670243605e-07, "loss": 0.4, "step": 8184 }, { "epoch": 0.9019283746556473, "grad_norm": 4.81556510925293, "learning_rate": 2.4019944554457775e-07, "loss": 0.3806, "step": 8185 }, { "epoch": 0.902038567493113, "grad_norm": 6.117400646209717, "learning_rate": 2.3966432642159587e-07, "loss": 0.4074, "step": 8186 }, { "epoch": 0.9021487603305786, "grad_norm": 12.024847984313965, "learning_rate": 2.391297893989264e-07, "loss": 0.4172, "step": 8187 }, { "epoch": 0.902258953168044, "grad_norm": 6.242091655731201, "learning_rate": 2.385958345419337e-07, "loss": 0.3194, "step": 8188 }, { "epoch": 0.9023691460055097, "grad_norm": 6.490644454956055, "learning_rate": 2.3806246191590941e-07, "loss": 0.3924, "step": 8189 }, { "epoch": 0.9024793388429752, "grad_norm": 12.245840072631836, "learning_rate": 2.3752967158607698e-07, "loss": 0.5208, "step": 8190 }, { "epoch": 0.9025895316804408, "grad_norm": 5.594180583953857, "learning_rate": 2.3699746361758424e-07, "loss": 0.4115, "step": 8191 }, { "epoch": 0.9026997245179064, "grad_norm": 4.443387508392334, "learning_rate": 2.3646583807551194e-07, "loss": 0.3965, "step": 8192 }, { "epoch": 0.9028099173553719, "grad_norm": 6.190323829650879, "learning_rate": 2.3593479502486804e-07, "loss": 0.3864, "step": 8193 }, { "epoch": 0.9029201101928375, "grad_norm": 7.716813564300537, "learning_rate": 2.3540433453058843e-07, "loss": 0.4035, "step": 8194 }, { "epoch": 0.9030303030303031, "grad_norm": 6.573265075683594, "learning_rate": 2.348744566575384e-07, "loss": 0.3777, "step": 8195 }, { "epoch": 0.9031404958677686, "grad_norm": 7.373490810394287, "learning_rate": 2.3434516147051389e-07, "loss": 0.3437, "step": 8196 }, { "epoch": 0.9032506887052342, "grad_norm": 5.659514904022217, "learning_rate": 2.338164490342354e-07, "loss": 0.3805, "step": 8197 }, { "epoch": 0.9033608815426998, "grad_norm": 4.987745761871338, "learning_rate": 2.332883194133556e-07, "loss": 0.3703, "step": 8198 }, { "epoch": 0.9034710743801653, "grad_norm": 6.0765838623046875, "learning_rate": 2.3276077267245567e-07, "loss": 0.3111, "step": 8199 }, { "epoch": 0.9035812672176309, "grad_norm": 5.776496410369873, "learning_rate": 2.3223380887604396e-07, "loss": 0.4141, "step": 8200 }, { "epoch": 0.9036914600550964, "grad_norm": 4.796311378479004, "learning_rate": 2.317074280885584e-07, "loss": 0.3587, "step": 8201 }, { "epoch": 0.903801652892562, "grad_norm": 10.538880348205566, "learning_rate": 2.3118163037436582e-07, "loss": 0.4016, "step": 8202 }, { "epoch": 0.9039118457300276, "grad_norm": 7.131097316741943, "learning_rate": 2.3065641579776088e-07, "loss": 0.4957, "step": 8203 }, { "epoch": 0.9040220385674931, "grad_norm": 6.796650409698486, "learning_rate": 2.301317844229678e-07, "loss": 0.3976, "step": 8204 }, { "epoch": 0.9041322314049587, "grad_norm": 4.019796848297119, "learning_rate": 2.2960773631414024e-07, "loss": 0.3696, "step": 8205 }, { "epoch": 0.9042424242424243, "grad_norm": 16.750940322875977, "learning_rate": 2.2908427153535806e-07, "loss": 0.4196, "step": 8206 }, { "epoch": 0.9043526170798898, "grad_norm": 6.668462753295898, "learning_rate": 2.2856139015063172e-07, "loss": 0.4145, "step": 8207 }, { "epoch": 0.9044628099173554, "grad_norm": 6.95205545425415, "learning_rate": 2.2803909222390065e-07, "loss": 0.3372, "step": 8208 }, { "epoch": 0.9045730027548209, "grad_norm": 5.908565998077393, "learning_rate": 2.2751737781903038e-07, "loss": 0.4251, "step": 8209 }, { "epoch": 0.9046831955922865, "grad_norm": 7.492595672607422, "learning_rate": 2.2699624699981826e-07, "loss": 0.3483, "step": 8210 }, { "epoch": 0.9047933884297521, "grad_norm": 9.232844352722168, "learning_rate": 2.2647569982998942e-07, "loss": 0.4344, "step": 8211 }, { "epoch": 0.9049035812672176, "grad_norm": 13.05223560333252, "learning_rate": 2.2595573637319513e-07, "loss": 0.5699, "step": 8212 }, { "epoch": 0.9050137741046832, "grad_norm": 9.177809715270996, "learning_rate": 2.2543635669301843e-07, "loss": 0.4088, "step": 8213 }, { "epoch": 0.9051239669421488, "grad_norm": 7.9022955894470215, "learning_rate": 2.2491756085296966e-07, "loss": 0.4845, "step": 8214 }, { "epoch": 0.9052341597796143, "grad_norm": 5.8695831298828125, "learning_rate": 2.2439934891648863e-07, "loss": 0.2941, "step": 8215 }, { "epoch": 0.9053443526170799, "grad_norm": 8.41268253326416, "learning_rate": 2.2388172094694237e-07, "loss": 0.3745, "step": 8216 }, { "epoch": 0.9054545454545454, "grad_norm": 8.964435577392578, "learning_rate": 2.2336467700762532e-07, "loss": 0.4527, "step": 8217 }, { "epoch": 0.905564738292011, "grad_norm": 8.264138221740723, "learning_rate": 2.228482171617652e-07, "loss": 0.3942, "step": 8218 }, { "epoch": 0.9056749311294766, "grad_norm": 5.133756637573242, "learning_rate": 2.2233234147251482e-07, "loss": 0.3861, "step": 8219 }, { "epoch": 0.9057851239669421, "grad_norm": 6.964199542999268, "learning_rate": 2.2181705000295374e-07, "loss": 0.4232, "step": 8220 }, { "epoch": 0.9058953168044077, "grad_norm": 6.645883560180664, "learning_rate": 2.2130234281609541e-07, "loss": 0.3692, "step": 8221 }, { "epoch": 0.9060055096418733, "grad_norm": 8.911177635192871, "learning_rate": 2.2078821997487841e-07, "loss": 0.4156, "step": 8222 }, { "epoch": 0.9061157024793388, "grad_norm": 7.476542949676514, "learning_rate": 2.2027468154216857e-07, "loss": 0.4474, "step": 8223 }, { "epoch": 0.9062258953168044, "grad_norm": 7.22313117980957, "learning_rate": 2.1976172758076398e-07, "loss": 0.4365, "step": 8224 }, { "epoch": 0.90633608815427, "grad_norm": 11.649321556091309, "learning_rate": 2.192493581533889e-07, "loss": 0.4786, "step": 8225 }, { "epoch": 0.9064462809917355, "grad_norm": 6.162008285522461, "learning_rate": 2.187375733226954e-07, "loss": 0.3448, "step": 8226 }, { "epoch": 0.9065564738292011, "grad_norm": 10.098167419433594, "learning_rate": 2.182263731512668e-07, "loss": 0.4494, "step": 8227 }, { "epoch": 0.9066666666666666, "grad_norm": 5.976385116577148, "learning_rate": 2.17715757701612e-07, "loss": 0.4257, "step": 8228 }, { "epoch": 0.9067768595041322, "grad_norm": 10.715227127075195, "learning_rate": 2.172057270361716e-07, "loss": 0.4498, "step": 8229 }, { "epoch": 0.9068870523415978, "grad_norm": 10.081153869628906, "learning_rate": 2.1669628121731068e-07, "loss": 0.3486, "step": 8230 }, { "epoch": 0.9069972451790633, "grad_norm": 7.819251537322998, "learning_rate": 2.161874203073261e-07, "loss": 0.4109, "step": 8231 }, { "epoch": 0.9071074380165289, "grad_norm": 6.477564334869385, "learning_rate": 2.1567914436844305e-07, "loss": 0.3086, "step": 8232 }, { "epoch": 0.9072176308539945, "grad_norm": 5.197267055511475, "learning_rate": 2.1517145346281188e-07, "loss": 0.4296, "step": 8233 }, { "epoch": 0.90732782369146, "grad_norm": 10.985544204711914, "learning_rate": 2.146643476525151e-07, "loss": 0.4504, "step": 8234 }, { "epoch": 0.9074380165289256, "grad_norm": 5.30659294128418, "learning_rate": 2.1415782699956255e-07, "loss": 0.3781, "step": 8235 }, { "epoch": 0.9075482093663911, "grad_norm": 5.9461894035339355, "learning_rate": 2.136518915658914e-07, "loss": 0.3664, "step": 8236 }, { "epoch": 0.9076584022038567, "grad_norm": 6.129619121551514, "learning_rate": 2.1314654141336878e-07, "loss": 0.3769, "step": 8237 }, { "epoch": 0.9077685950413223, "grad_norm": 6.220506191253662, "learning_rate": 2.1264177660378972e-07, "loss": 0.4206, "step": 8238 }, { "epoch": 0.9078787878787878, "grad_norm": 10.714601516723633, "learning_rate": 2.121375971988765e-07, "loss": 0.3468, "step": 8239 }, { "epoch": 0.9079889807162534, "grad_norm": 6.325436115264893, "learning_rate": 2.1163400326028204e-07, "loss": 0.3695, "step": 8240 }, { "epoch": 0.908099173553719, "grad_norm": 8.421700477600098, "learning_rate": 2.11130994849586e-07, "loss": 0.3944, "step": 8241 }, { "epoch": 0.9082093663911845, "grad_norm": 5.89466667175293, "learning_rate": 2.106285720282969e-07, "loss": 0.3764, "step": 8242 }, { "epoch": 0.9083195592286502, "grad_norm": 5.76004695892334, "learning_rate": 2.1012673485785173e-07, "loss": 0.4143, "step": 8243 }, { "epoch": 0.9084297520661156, "grad_norm": 6.659377098083496, "learning_rate": 2.0962548339961586e-07, "loss": 0.3692, "step": 8244 }, { "epoch": 0.9085399449035813, "grad_norm": 6.028273582458496, "learning_rate": 2.09124817714883e-07, "loss": 0.3318, "step": 8245 }, { "epoch": 0.9086501377410469, "grad_norm": 5.327060222625732, "learning_rate": 2.086247378648748e-07, "loss": 0.3372, "step": 8246 }, { "epoch": 0.9087603305785124, "grad_norm": 11.709897994995117, "learning_rate": 2.0812524391074285e-07, "loss": 0.4626, "step": 8247 }, { "epoch": 0.908870523415978, "grad_norm": 7.009006023406982, "learning_rate": 2.076263359135644e-07, "loss": 0.3802, "step": 8248 }, { "epoch": 0.9089807162534436, "grad_norm": 6.697656154632568, "learning_rate": 2.071280139343479e-07, "loss": 0.3865, "step": 8249 }, { "epoch": 0.9090909090909091, "grad_norm": 7.680671691894531, "learning_rate": 2.066302780340279e-07, "loss": 0.3975, "step": 8250 }, { "epoch": 0.9092011019283747, "grad_norm": 15.613241195678711, "learning_rate": 2.0613312827346908e-07, "loss": 0.3827, "step": 8251 }, { "epoch": 0.9093112947658403, "grad_norm": 10.426063537597656, "learning_rate": 2.0563656471346338e-07, "loss": 0.4593, "step": 8252 }, { "epoch": 0.9094214876033058, "grad_norm": 8.83703899383545, "learning_rate": 2.0514058741473053e-07, "loss": 0.4043, "step": 8253 }, { "epoch": 0.9095316804407714, "grad_norm": 5.748766899108887, "learning_rate": 2.046451964379198e-07, "loss": 0.3937, "step": 8254 }, { "epoch": 0.9096418732782369, "grad_norm": 5.267103672027588, "learning_rate": 2.0415039184360884e-07, "loss": 0.356, "step": 8255 }, { "epoch": 0.9097520661157025, "grad_norm": 9.736839294433594, "learning_rate": 2.0365617369230205e-07, "loss": 0.4508, "step": 8256 }, { "epoch": 0.9098622589531681, "grad_norm": 6.641547679901123, "learning_rate": 2.0316254204443332e-07, "loss": 0.4355, "step": 8257 }, { "epoch": 0.9099724517906336, "grad_norm": 6.056273460388184, "learning_rate": 2.0266949696036543e-07, "loss": 0.3821, "step": 8258 }, { "epoch": 0.9100826446280992, "grad_norm": 6.898715496063232, "learning_rate": 2.021770385003874e-07, "loss": 0.4678, "step": 8259 }, { "epoch": 0.9101928374655648, "grad_norm": 4.9509172439575195, "learning_rate": 2.0168516672471828e-07, "loss": 0.4205, "step": 8260 }, { "epoch": 0.9103030303030303, "grad_norm": 15.809843063354492, "learning_rate": 2.011938816935055e-07, "loss": 0.5226, "step": 8261 }, { "epoch": 0.9104132231404959, "grad_norm": 4.871439456939697, "learning_rate": 2.0070318346682272e-07, "loss": 0.3979, "step": 8262 }, { "epoch": 0.9105234159779614, "grad_norm": 8.149961471557617, "learning_rate": 2.002130721046741e-07, "loss": 0.38, "step": 8263 }, { "epoch": 0.910633608815427, "grad_norm": 7.782983303070068, "learning_rate": 1.997235476669912e-07, "loss": 0.4124, "step": 8264 }, { "epoch": 0.9107438016528926, "grad_norm": 7.894191741943359, "learning_rate": 1.9923461021363334e-07, "loss": 0.4377, "step": 8265 }, { "epoch": 0.9108539944903581, "grad_norm": 6.815659523010254, "learning_rate": 1.987462598043882e-07, "loss": 0.4063, "step": 8266 }, { "epoch": 0.9109641873278237, "grad_norm": 7.375637531280518, "learning_rate": 1.9825849649897255e-07, "loss": 0.366, "step": 8267 }, { "epoch": 0.9110743801652893, "grad_norm": 9.88588809967041, "learning_rate": 1.977713203570303e-07, "loss": 0.4498, "step": 8268 }, { "epoch": 0.9111845730027548, "grad_norm": 7.845414638519287, "learning_rate": 1.9728473143813432e-07, "loss": 0.3645, "step": 8269 }, { "epoch": 0.9112947658402204, "grad_norm": 8.823968887329102, "learning_rate": 1.9679872980178483e-07, "loss": 0.4212, "step": 8270 }, { "epoch": 0.911404958677686, "grad_norm": 6.635942459106445, "learning_rate": 1.9631331550741207e-07, "loss": 0.3881, "step": 8271 }, { "epoch": 0.9115151515151515, "grad_norm": 5.339030742645264, "learning_rate": 1.958284886143713e-07, "loss": 0.3459, "step": 8272 }, { "epoch": 0.9116253443526171, "grad_norm": 7.035585880279541, "learning_rate": 1.9534424918194906e-07, "loss": 0.4295, "step": 8273 }, { "epoch": 0.9117355371900826, "grad_norm": 4.605419635772705, "learning_rate": 1.9486059726935903e-07, "loss": 0.3746, "step": 8274 }, { "epoch": 0.9118457300275482, "grad_norm": 8.786975860595703, "learning_rate": 1.9437753293574225e-07, "loss": 0.4245, "step": 8275 }, { "epoch": 0.9119559228650138, "grad_norm": 8.458885192871094, "learning_rate": 1.9389505624016758e-07, "loss": 0.4328, "step": 8276 }, { "epoch": 0.9120661157024793, "grad_norm": 5.858531951904297, "learning_rate": 1.9341316724163506e-07, "loss": 0.4012, "step": 8277 }, { "epoch": 0.9121763085399449, "grad_norm": 8.922369003295898, "learning_rate": 1.929318659990692e-07, "loss": 0.4947, "step": 8278 }, { "epoch": 0.9122865013774105, "grad_norm": 5.19242000579834, "learning_rate": 1.9245115257132351e-07, "loss": 0.3481, "step": 8279 }, { "epoch": 0.912396694214876, "grad_norm": 8.85990047454834, "learning_rate": 1.9197102701718263e-07, "loss": 0.4135, "step": 8280 }, { "epoch": 0.9125068870523416, "grad_norm": 5.6612420082092285, "learning_rate": 1.9149148939535568e-07, "loss": 0.3619, "step": 8281 }, { "epoch": 0.9126170798898071, "grad_norm": 4.2848381996154785, "learning_rate": 1.910125397644802e-07, "loss": 0.3566, "step": 8282 }, { "epoch": 0.9127272727272727, "grad_norm": 11.075757026672363, "learning_rate": 1.905341781831238e-07, "loss": 0.4707, "step": 8283 }, { "epoch": 0.9128374655647383, "grad_norm": 10.635125160217285, "learning_rate": 1.9005640470978137e-07, "loss": 0.4806, "step": 8284 }, { "epoch": 0.9129476584022038, "grad_norm": 7.729781627655029, "learning_rate": 1.895792194028756e-07, "loss": 0.3553, "step": 8285 }, { "epoch": 0.9130578512396694, "grad_norm": 5.757815361022949, "learning_rate": 1.8910262232075706e-07, "loss": 0.4058, "step": 8286 }, { "epoch": 0.913168044077135, "grad_norm": 9.050127983093262, "learning_rate": 1.8862661352170465e-07, "loss": 0.3369, "step": 8287 }, { "epoch": 0.9132782369146005, "grad_norm": 16.06497573852539, "learning_rate": 1.8815119306392625e-07, "loss": 0.4096, "step": 8288 }, { "epoch": 0.9133884297520661, "grad_norm": 5.088180065155029, "learning_rate": 1.8767636100555543e-07, "loss": 0.3421, "step": 8289 }, { "epoch": 0.9134986225895316, "grad_norm": 7.645425796508789, "learning_rate": 1.872021174046562e-07, "loss": 0.4557, "step": 8290 }, { "epoch": 0.9136088154269972, "grad_norm": 11.184514045715332, "learning_rate": 1.8672846231922005e-07, "loss": 0.4466, "step": 8291 }, { "epoch": 0.9137190082644628, "grad_norm": 6.974379539489746, "learning_rate": 1.86255395807165e-07, "loss": 0.4206, "step": 8292 }, { "epoch": 0.9138292011019283, "grad_norm": 8.65231990814209, "learning_rate": 1.857829179263393e-07, "loss": 0.4017, "step": 8293 }, { "epoch": 0.9139393939393939, "grad_norm": 6.22916841506958, "learning_rate": 1.8531102873451834e-07, "loss": 0.4092, "step": 8294 }, { "epoch": 0.9140495867768595, "grad_norm": 7.040229797363281, "learning_rate": 1.8483972828940434e-07, "loss": 0.3962, "step": 8295 }, { "epoch": 0.914159779614325, "grad_norm": 14.772039413452148, "learning_rate": 1.843690166486295e-07, "loss": 0.5156, "step": 8296 }, { "epoch": 0.9142699724517906, "grad_norm": 6.841568470001221, "learning_rate": 1.8389889386975279e-07, "loss": 0.4296, "step": 8297 }, { "epoch": 0.9143801652892563, "grad_norm": 4.574252128601074, "learning_rate": 1.8342936001026101e-07, "loss": 0.3238, "step": 8298 }, { "epoch": 0.9144903581267217, "grad_norm": 12.451244354248047, "learning_rate": 1.8296041512756934e-07, "loss": 0.4644, "step": 8299 }, { "epoch": 0.9146005509641874, "grad_norm": 5.818710803985596, "learning_rate": 1.8249205927902247e-07, "loss": 0.3551, "step": 8300 }, { "epoch": 0.9147107438016528, "grad_norm": 5.565225124359131, "learning_rate": 1.8202429252188957e-07, "loss": 0.3945, "step": 8301 }, { "epoch": 0.9148209366391185, "grad_norm": 8.565671920776367, "learning_rate": 1.81557114913371e-07, "loss": 0.4937, "step": 8302 }, { "epoch": 0.9149311294765841, "grad_norm": 6.482603549957275, "learning_rate": 1.8109052651059444e-07, "loss": 0.4699, "step": 8303 }, { "epoch": 0.9150413223140496, "grad_norm": 5.407090663909912, "learning_rate": 1.806245273706131e-07, "loss": 0.4133, "step": 8304 }, { "epoch": 0.9151515151515152, "grad_norm": 10.727090835571289, "learning_rate": 1.8015911755041137e-07, "loss": 0.4985, "step": 8305 }, { "epoch": 0.9152617079889808, "grad_norm": 7.957470417022705, "learning_rate": 1.7969429710689989e-07, "loss": 0.3101, "step": 8306 }, { "epoch": 0.9153719008264463, "grad_norm": 7.423542022705078, "learning_rate": 1.7923006609691761e-07, "loss": 0.3959, "step": 8307 }, { "epoch": 0.9154820936639119, "grad_norm": 5.17576265335083, "learning_rate": 1.7876642457723136e-07, "loss": 0.3963, "step": 8308 }, { "epoch": 0.9155922865013774, "grad_norm": 8.73119831085205, "learning_rate": 1.7830337260453523e-07, "loss": 0.3828, "step": 8309 }, { "epoch": 0.915702479338843, "grad_norm": 5.12293815612793, "learning_rate": 1.778409102354528e-07, "loss": 0.3701, "step": 8310 }, { "epoch": 0.9158126721763086, "grad_norm": 8.923873901367188, "learning_rate": 1.7737903752653386e-07, "loss": 0.4444, "step": 8311 }, { "epoch": 0.9159228650137741, "grad_norm": 24.163288116455078, "learning_rate": 1.7691775453425653e-07, "loss": 0.423, "step": 8312 }, { "epoch": 0.9160330578512397, "grad_norm": 4.271527290344238, "learning_rate": 1.7645706131502904e-07, "loss": 0.3025, "step": 8313 }, { "epoch": 0.9161432506887053, "grad_norm": 8.020672798156738, "learning_rate": 1.7599695792518356e-07, "loss": 0.4385, "step": 8314 }, { "epoch": 0.9162534435261708, "grad_norm": 4.3559393882751465, "learning_rate": 1.7553744442098285e-07, "loss": 0.3572, "step": 8315 }, { "epoch": 0.9163636363636364, "grad_norm": 6.599991321563721, "learning_rate": 1.7507852085861642e-07, "loss": 0.4171, "step": 8316 }, { "epoch": 0.9164738292011019, "grad_norm": 5.609654903411865, "learning_rate": 1.7462018729420326e-07, "loss": 0.3031, "step": 8317 }, { "epoch": 0.9165840220385675, "grad_norm": 8.524760246276855, "learning_rate": 1.7416244378378745e-07, "loss": 0.4674, "step": 8318 }, { "epoch": 0.9166942148760331, "grad_norm": 7.917247772216797, "learning_rate": 1.737052903833436e-07, "loss": 0.3783, "step": 8319 }, { "epoch": 0.9168044077134986, "grad_norm": 5.46023416519165, "learning_rate": 1.7324872714877317e-07, "loss": 0.2716, "step": 8320 }, { "epoch": 0.9169146005509642, "grad_norm": 7.115887641906738, "learning_rate": 1.7279275413590425e-07, "loss": 0.4426, "step": 8321 }, { "epoch": 0.9170247933884298, "grad_norm": 5.3829874992370605, "learning_rate": 1.7233737140049445e-07, "loss": 0.3992, "step": 8322 }, { "epoch": 0.9171349862258953, "grad_norm": 7.402966022491455, "learning_rate": 1.7188257899822868e-07, "loss": 0.4134, "step": 8323 }, { "epoch": 0.9172451790633609, "grad_norm": 5.992671489715576, "learning_rate": 1.714283769847197e-07, "loss": 0.3665, "step": 8324 }, { "epoch": 0.9173553719008265, "grad_norm": 8.363330841064453, "learning_rate": 1.7097476541550751e-07, "loss": 0.4608, "step": 8325 }, { "epoch": 0.917465564738292, "grad_norm": 13.325737953186035, "learning_rate": 1.705217443460605e-07, "loss": 0.4234, "step": 8326 }, { "epoch": 0.9175757575757576, "grad_norm": 5.575732707977295, "learning_rate": 1.7006931383177548e-07, "loss": 0.3845, "step": 8327 }, { "epoch": 0.9176859504132231, "grad_norm": 7.288289546966553, "learning_rate": 1.6961747392797488e-07, "loss": 0.4315, "step": 8328 }, { "epoch": 0.9177961432506887, "grad_norm": 6.849255084991455, "learning_rate": 1.6916622468991118e-07, "loss": 0.3797, "step": 8329 }, { "epoch": 0.9179063360881543, "grad_norm": 8.203755378723145, "learning_rate": 1.6871556617276407e-07, "loss": 0.3878, "step": 8330 }, { "epoch": 0.9180165289256198, "grad_norm": 7.336549282073975, "learning_rate": 1.682654984316401e-07, "loss": 0.3361, "step": 8331 }, { "epoch": 0.9181267217630854, "grad_norm": 4.305370330810547, "learning_rate": 1.678160215215735e-07, "loss": 0.3609, "step": 8332 }, { "epoch": 0.918236914600551, "grad_norm": 6.776370048522949, "learning_rate": 1.6736713549752815e-07, "loss": 0.4287, "step": 8333 }, { "epoch": 0.9183471074380165, "grad_norm": 4.999423980712891, "learning_rate": 1.6691884041439455e-07, "loss": 0.4315, "step": 8334 }, { "epoch": 0.9184573002754821, "grad_norm": 20.000083923339844, "learning_rate": 1.6647113632698886e-07, "loss": 0.458, "step": 8335 }, { "epoch": 0.9185674931129476, "grad_norm": 6.919802188873291, "learning_rate": 1.660240232900595e-07, "loss": 0.4631, "step": 8336 }, { "epoch": 0.9186776859504132, "grad_norm": 11.445610046386719, "learning_rate": 1.6557750135827833e-07, "loss": 0.4419, "step": 8337 }, { "epoch": 0.9187878787878788, "grad_norm": 12.312527656555176, "learning_rate": 1.6513157058624662e-07, "loss": 0.439, "step": 8338 }, { "epoch": 0.9188980716253443, "grad_norm": 6.393393516540527, "learning_rate": 1.6468623102849523e-07, "loss": 0.41, "step": 8339 }, { "epoch": 0.9190082644628099, "grad_norm": 6.301146984100342, "learning_rate": 1.6424148273947892e-07, "loss": 0.4067, "step": 8340 }, { "epoch": 0.9191184573002755, "grad_norm": 16.659996032714844, "learning_rate": 1.6379732577358366e-07, "loss": 0.439, "step": 8341 }, { "epoch": 0.919228650137741, "grad_norm": 4.91457462310791, "learning_rate": 1.6335376018511984e-07, "loss": 0.3293, "step": 8342 }, { "epoch": 0.9193388429752066, "grad_norm": 7.265914440155029, "learning_rate": 1.62910786028328e-07, "loss": 0.4245, "step": 8343 }, { "epoch": 0.9194490358126721, "grad_norm": 8.63418197631836, "learning_rate": 1.6246840335737646e-07, "loss": 0.433, "step": 8344 }, { "epoch": 0.9195592286501377, "grad_norm": 6.614242076873779, "learning_rate": 1.6202661222635917e-07, "loss": 0.3203, "step": 8345 }, { "epoch": 0.9196694214876033, "grad_norm": 8.162196159362793, "learning_rate": 1.6158541268929962e-07, "loss": 0.4583, "step": 8346 }, { "epoch": 0.9197796143250688, "grad_norm": 7.244357109069824, "learning_rate": 1.6114480480014905e-07, "loss": 0.3071, "step": 8347 }, { "epoch": 0.9198898071625344, "grad_norm": 7.062908172607422, "learning_rate": 1.6070478861278327e-07, "loss": 0.4074, "step": 8348 }, { "epoch": 0.92, "grad_norm": 9.556879997253418, "learning_rate": 1.6026536418101034e-07, "loss": 0.5522, "step": 8349 }, { "epoch": 0.9201101928374655, "grad_norm": 6.477383613586426, "learning_rate": 1.5982653155856287e-07, "loss": 0.3977, "step": 8350 }, { "epoch": 0.9202203856749311, "grad_norm": 4.523920059204102, "learning_rate": 1.5938829079910122e-07, "loss": 0.3486, "step": 8351 }, { "epoch": 0.9203305785123967, "grad_norm": 4.436920166015625, "learning_rate": 1.5895064195621478e-07, "loss": 0.405, "step": 8352 }, { "epoch": 0.9204407713498622, "grad_norm": 7.761414051055908, "learning_rate": 1.5851358508342074e-07, "loss": 0.4385, "step": 8353 }, { "epoch": 0.9205509641873278, "grad_norm": 8.559017181396484, "learning_rate": 1.5807712023416078e-07, "loss": 0.4268, "step": 8354 }, { "epoch": 0.9206611570247933, "grad_norm": 6.970315933227539, "learning_rate": 1.5764124746180832e-07, "loss": 0.3736, "step": 8355 }, { "epoch": 0.920771349862259, "grad_norm": 6.862329006195068, "learning_rate": 1.572059668196618e-07, "loss": 0.3573, "step": 8356 }, { "epoch": 0.9208815426997246, "grad_norm": 6.494803428649902, "learning_rate": 1.5677127836094763e-07, "loss": 0.4031, "step": 8357 }, { "epoch": 0.92099173553719, "grad_norm": 8.824837684631348, "learning_rate": 1.5633718213882097e-07, "loss": 0.4843, "step": 8358 }, { "epoch": 0.9211019283746557, "grad_norm": 5.581061840057373, "learning_rate": 1.5590367820636276e-07, "loss": 0.4371, "step": 8359 }, { "epoch": 0.9212121212121213, "grad_norm": 8.021149635314941, "learning_rate": 1.5547076661658279e-07, "loss": 0.4531, "step": 8360 }, { "epoch": 0.9213223140495868, "grad_norm": 4.6611127853393555, "learning_rate": 1.5503844742241813e-07, "loss": 0.3755, "step": 8361 }, { "epoch": 0.9214325068870524, "grad_norm": 6.004344463348389, "learning_rate": 1.5460672067673376e-07, "loss": 0.3695, "step": 8362 }, { "epoch": 0.9215426997245179, "grad_norm": 6.207559585571289, "learning_rate": 1.5417558643232077e-07, "loss": 0.3423, "step": 8363 }, { "epoch": 0.9216528925619835, "grad_norm": 7.601280212402344, "learning_rate": 1.5374504474190033e-07, "loss": 0.4347, "step": 8364 }, { "epoch": 0.9217630853994491, "grad_norm": 4.724337577819824, "learning_rate": 1.533150956581181e-07, "loss": 0.3406, "step": 8365 }, { "epoch": 0.9218732782369146, "grad_norm": 8.186506271362305, "learning_rate": 1.5288573923354976e-07, "loss": 0.432, "step": 8366 }, { "epoch": 0.9219834710743802, "grad_norm": 7.820041656494141, "learning_rate": 1.5245697552069782e-07, "loss": 0.3995, "step": 8367 }, { "epoch": 0.9220936639118458, "grad_norm": 4.091879844665527, "learning_rate": 1.520288045719903e-07, "loss": 0.3506, "step": 8368 }, { "epoch": 0.9222038567493113, "grad_norm": 4.473921775817871, "learning_rate": 1.5160122643978703e-07, "loss": 0.3674, "step": 8369 }, { "epoch": 0.9223140495867769, "grad_norm": 6.545633792877197, "learning_rate": 1.511742411763717e-07, "loss": 0.2819, "step": 8370 }, { "epoch": 0.9224242424242424, "grad_norm": 11.863525390625, "learning_rate": 1.5074784883395587e-07, "loss": 0.4053, "step": 8371 }, { "epoch": 0.922534435261708, "grad_norm": 4.6853814125061035, "learning_rate": 1.503220494646812e-07, "loss": 0.3621, "step": 8372 }, { "epoch": 0.9226446280991736, "grad_norm": 8.450872421264648, "learning_rate": 1.498968431206138e-07, "loss": 0.3981, "step": 8373 }, { "epoch": 0.9227548209366391, "grad_norm": 5.5612359046936035, "learning_rate": 1.494722298537482e-07, "loss": 0.3576, "step": 8374 }, { "epoch": 0.9228650137741047, "grad_norm": 10.44948959350586, "learning_rate": 1.4904820971600676e-07, "loss": 0.4276, "step": 8375 }, { "epoch": 0.9229752066115703, "grad_norm": 4.357065200805664, "learning_rate": 1.486247827592402e-07, "loss": 0.3567, "step": 8376 }, { "epoch": 0.9230853994490358, "grad_norm": 6.663888931274414, "learning_rate": 1.482019490352249e-07, "loss": 0.3808, "step": 8377 }, { "epoch": 0.9231955922865014, "grad_norm": 6.435294151306152, "learning_rate": 1.477797085956656e-07, "loss": 0.3919, "step": 8378 }, { "epoch": 0.923305785123967, "grad_norm": 5.487656593322754, "learning_rate": 1.4735806149219544e-07, "loss": 0.386, "step": 8379 }, { "epoch": 0.9234159779614325, "grad_norm": 8.581289291381836, "learning_rate": 1.4693700777637265e-07, "loss": 0.4092, "step": 8380 }, { "epoch": 0.9235261707988981, "grad_norm": 8.062307357788086, "learning_rate": 1.4651654749968436e-07, "loss": 0.4001, "step": 8381 }, { "epoch": 0.9236363636363636, "grad_norm": 4.797517776489258, "learning_rate": 1.460966807135461e-07, "loss": 0.3722, "step": 8382 }, { "epoch": 0.9237465564738292, "grad_norm": 6.919247150421143, "learning_rate": 1.4567740746929904e-07, "loss": 0.3781, "step": 8383 }, { "epoch": 0.9238567493112948, "grad_norm": 7.119813919067383, "learning_rate": 1.4525872781821215e-07, "loss": 0.3846, "step": 8384 }, { "epoch": 0.9239669421487603, "grad_norm": 6.6954498291015625, "learning_rate": 1.4484064181148283e-07, "loss": 0.3339, "step": 8385 }, { "epoch": 0.9240771349862259, "grad_norm": 6.583959579467773, "learning_rate": 1.4442314950023517e-07, "loss": 0.3822, "step": 8386 }, { "epoch": 0.9241873278236915, "grad_norm": 6.830016136169434, "learning_rate": 1.4400625093552e-07, "loss": 0.4408, "step": 8387 }, { "epoch": 0.924297520661157, "grad_norm": 6.698701858520508, "learning_rate": 1.4358994616831656e-07, "loss": 0.3444, "step": 8388 }, { "epoch": 0.9244077134986226, "grad_norm": 4.193092346191406, "learning_rate": 1.431742352495319e-07, "loss": 0.4458, "step": 8389 }, { "epoch": 0.9245179063360881, "grad_norm": 7.711377143859863, "learning_rate": 1.4275911822999922e-07, "loss": 0.4317, "step": 8390 }, { "epoch": 0.9246280991735537, "grad_norm": 7.1049370765686035, "learning_rate": 1.423445951604785e-07, "loss": 0.3267, "step": 8391 }, { "epoch": 0.9247382920110193, "grad_norm": 6.219496250152588, "learning_rate": 1.4193066609165972e-07, "loss": 0.3452, "step": 8392 }, { "epoch": 0.9248484848484848, "grad_norm": 4.745777130126953, "learning_rate": 1.4151733107415855e-07, "loss": 0.3863, "step": 8393 }, { "epoch": 0.9249586776859504, "grad_norm": 6.9581990242004395, "learning_rate": 1.4110459015851675e-07, "loss": 0.4266, "step": 8394 }, { "epoch": 0.925068870523416, "grad_norm": 6.962114334106445, "learning_rate": 1.4069244339520672e-07, "loss": 0.4283, "step": 8395 }, { "epoch": 0.9251790633608815, "grad_norm": 7.494615077972412, "learning_rate": 1.4028089083462482e-07, "loss": 0.3962, "step": 8396 }, { "epoch": 0.9252892561983471, "grad_norm": 7.1831560134887695, "learning_rate": 1.3986993252709747e-07, "loss": 0.4074, "step": 8397 }, { "epoch": 0.9253994490358127, "grad_norm": 5.726095676422119, "learning_rate": 1.394595685228761e-07, "loss": 0.3096, "step": 8398 }, { "epoch": 0.9255096418732782, "grad_norm": 6.874561309814453, "learning_rate": 1.3904979887214064e-07, "loss": 0.3626, "step": 8399 }, { "epoch": 0.9256198347107438, "grad_norm": 6.559597015380859, "learning_rate": 1.3864062362499987e-07, "loss": 0.3821, "step": 8400 }, { "epoch": 0.9257300275482093, "grad_norm": 6.934940814971924, "learning_rate": 1.3823204283148651e-07, "loss": 0.3554, "step": 8401 }, { "epoch": 0.9258402203856749, "grad_norm": 9.219505310058594, "learning_rate": 1.3782405654156284e-07, "loss": 0.4705, "step": 8402 }, { "epoch": 0.9259504132231405, "grad_norm": 7.8724870681762695, "learning_rate": 1.3741666480511894e-07, "loss": 0.392, "step": 8403 }, { "epoch": 0.926060606060606, "grad_norm": 4.987101078033447, "learning_rate": 1.3700986767197e-07, "loss": 0.4414, "step": 8404 }, { "epoch": 0.9261707988980716, "grad_norm": 7.214572429656982, "learning_rate": 1.3660366519185953e-07, "loss": 0.3922, "step": 8405 }, { "epoch": 0.9262809917355372, "grad_norm": 6.101369380950928, "learning_rate": 1.3619805741446e-07, "loss": 0.3524, "step": 8406 }, { "epoch": 0.9263911845730027, "grad_norm": 8.496451377868652, "learning_rate": 1.3579304438936848e-07, "loss": 0.4005, "step": 8407 }, { "epoch": 0.9265013774104683, "grad_norm": 6.085446834564209, "learning_rate": 1.3538862616611083e-07, "loss": 0.3692, "step": 8408 }, { "epoch": 0.9266115702479338, "grad_norm": 7.8222150802612305, "learning_rate": 1.3498480279414028e-07, "loss": 0.4491, "step": 8409 }, { "epoch": 0.9267217630853994, "grad_norm": 12.749969482421875, "learning_rate": 1.3458157432283626e-07, "loss": 0.4216, "step": 8410 }, { "epoch": 0.926831955922865, "grad_norm": 8.5120849609375, "learning_rate": 1.3417894080150595e-07, "loss": 0.4171, "step": 8411 }, { "epoch": 0.9269421487603305, "grad_norm": 7.881687164306641, "learning_rate": 1.33776902279385e-07, "loss": 0.3602, "step": 8412 }, { "epoch": 0.9270523415977961, "grad_norm": 14.683112144470215, "learning_rate": 1.3337545880563462e-07, "loss": 0.4831, "step": 8413 }, { "epoch": 0.9271625344352618, "grad_norm": 6.600551128387451, "learning_rate": 1.329746104293428e-07, "loss": 0.394, "step": 8414 }, { "epoch": 0.9272727272727272, "grad_norm": 11.075981140136719, "learning_rate": 1.3257435719952804e-07, "loss": 0.3995, "step": 8415 }, { "epoch": 0.9273829201101929, "grad_norm": 6.9011664390563965, "learning_rate": 1.3217469916513182e-07, "loss": 0.4076, "step": 8416 }, { "epoch": 0.9274931129476583, "grad_norm": 7.484837055206299, "learning_rate": 1.3177563637502612e-07, "loss": 0.43, "step": 8417 }, { "epoch": 0.927603305785124, "grad_norm": 10.550387382507324, "learning_rate": 1.3137716887800854e-07, "loss": 0.4981, "step": 8418 }, { "epoch": 0.9277134986225896, "grad_norm": 7.676487445831299, "learning_rate": 1.309792967228035e-07, "loss": 0.3883, "step": 8419 }, { "epoch": 0.9278236914600551, "grad_norm": 6.712424278259277, "learning_rate": 1.305820199580643e-07, "loss": 0.4221, "step": 8420 }, { "epoch": 0.9279338842975207, "grad_norm": 4.638515472412109, "learning_rate": 1.3018533863237037e-07, "loss": 0.4026, "step": 8421 }, { "epoch": 0.9280440771349863, "grad_norm": 7.031803131103516, "learning_rate": 1.2978925279422795e-07, "loss": 0.3845, "step": 8422 }, { "epoch": 0.9281542699724518, "grad_norm": 3.9012720584869385, "learning_rate": 1.2939376249207157e-07, "loss": 0.2964, "step": 8423 }, { "epoch": 0.9282644628099174, "grad_norm": 6.953731536865234, "learning_rate": 1.2899886777426096e-07, "loss": 0.3968, "step": 8424 }, { "epoch": 0.928374655647383, "grad_norm": 6.745138645172119, "learning_rate": 1.2860456868908632e-07, "loss": 0.4183, "step": 8425 }, { "epoch": 0.9284848484848485, "grad_norm": 8.751028060913086, "learning_rate": 1.2821086528476244e-07, "loss": 0.4627, "step": 8426 }, { "epoch": 0.9285950413223141, "grad_norm": 9.254036903381348, "learning_rate": 1.2781775760943026e-07, "loss": 0.3513, "step": 8427 }, { "epoch": 0.9287052341597796, "grad_norm": 6.881036758422852, "learning_rate": 1.2742524571116244e-07, "loss": 0.4159, "step": 8428 }, { "epoch": 0.9288154269972452, "grad_norm": 8.426811218261719, "learning_rate": 1.270333296379539e-07, "loss": 0.4034, "step": 8429 }, { "epoch": 0.9289256198347108, "grad_norm": 14.932004928588867, "learning_rate": 1.2664200943772853e-07, "loss": 0.442, "step": 8430 }, { "epoch": 0.9290358126721763, "grad_norm": 5.104497909545898, "learning_rate": 1.2625128515833863e-07, "loss": 0.326, "step": 8431 }, { "epoch": 0.9291460055096419, "grad_norm": 4.100868225097656, "learning_rate": 1.2586115684756205e-07, "loss": 0.3718, "step": 8432 }, { "epoch": 0.9292561983471075, "grad_norm": 5.32192850112915, "learning_rate": 1.2547162455310347e-07, "loss": 0.3859, "step": 8433 }, { "epoch": 0.929366391184573, "grad_norm": 5.072726249694824, "learning_rate": 1.2508268832259585e-07, "loss": 0.449, "step": 8434 }, { "epoch": 0.9294765840220386, "grad_norm": 8.323136329650879, "learning_rate": 1.2469434820360005e-07, "loss": 0.4656, "step": 8435 }, { "epoch": 0.9295867768595041, "grad_norm": 5.533694744110107, "learning_rate": 1.2430660424360085e-07, "loss": 0.3922, "step": 8436 }, { "epoch": 0.9296969696969697, "grad_norm": 6.415494918823242, "learning_rate": 1.2391945649001314e-07, "loss": 0.3965, "step": 8437 }, { "epoch": 0.9298071625344353, "grad_norm": 4.350618362426758, "learning_rate": 1.2353290499017788e-07, "loss": 0.3766, "step": 8438 }, { "epoch": 0.9299173553719008, "grad_norm": 6.249704360961914, "learning_rate": 1.23146949791364e-07, "loss": 0.3584, "step": 8439 }, { "epoch": 0.9300275482093664, "grad_norm": 9.35655403137207, "learning_rate": 1.2276159094076479e-07, "loss": 0.3968, "step": 8440 }, { "epoch": 0.930137741046832, "grad_norm": 8.002750396728516, "learning_rate": 1.2237682848550313e-07, "loss": 0.3365, "step": 8441 }, { "epoch": 0.9302479338842975, "grad_norm": 5.87313985824585, "learning_rate": 1.219926624726292e-07, "loss": 0.3905, "step": 8442 }, { "epoch": 0.9303581267217631, "grad_norm": 7.536591053009033, "learning_rate": 1.216090929491176e-07, "loss": 0.3819, "step": 8443 }, { "epoch": 0.9304683195592286, "grad_norm": 4.820857524871826, "learning_rate": 1.212261199618736e-07, "loss": 0.3978, "step": 8444 }, { "epoch": 0.9305785123966942, "grad_norm": 7.339576721191406, "learning_rate": 1.2084374355772642e-07, "loss": 0.3865, "step": 8445 }, { "epoch": 0.9306887052341598, "grad_norm": 9.985773086547852, "learning_rate": 1.2046196378343423e-07, "loss": 0.4125, "step": 8446 }, { "epoch": 0.9307988980716253, "grad_norm": 8.914613723754883, "learning_rate": 1.2008078068568074e-07, "loss": 0.4161, "step": 8447 }, { "epoch": 0.9309090909090909, "grad_norm": 9.997947692871094, "learning_rate": 1.1970019431107926e-07, "loss": 0.4691, "step": 8448 }, { "epoch": 0.9310192837465565, "grad_norm": 7.667982578277588, "learning_rate": 1.1932020470616646e-07, "loss": 0.4159, "step": 8449 }, { "epoch": 0.931129476584022, "grad_norm": 5.538841247558594, "learning_rate": 1.1894081191740848e-07, "loss": 0.3165, "step": 8450 }, { "epoch": 0.9312396694214876, "grad_norm": 7.691782474517822, "learning_rate": 1.1856201599119876e-07, "loss": 0.3869, "step": 8451 }, { "epoch": 0.9313498622589532, "grad_norm": 5.27807092666626, "learning_rate": 1.1818381697385639e-07, "loss": 0.4573, "step": 8452 }, { "epoch": 0.9314600550964187, "grad_norm": 8.917466163635254, "learning_rate": 1.1780621491162825e-07, "loss": 0.3827, "step": 8453 }, { "epoch": 0.9315702479338843, "grad_norm": 8.276079177856445, "learning_rate": 1.1742920985068795e-07, "loss": 0.2961, "step": 8454 }, { "epoch": 0.9316804407713498, "grad_norm": 7.216142654418945, "learning_rate": 1.1705280183713641e-07, "loss": 0.4257, "step": 8455 }, { "epoch": 0.9317906336088154, "grad_norm": 5.993567943572998, "learning_rate": 1.1667699091700068e-07, "loss": 0.3158, "step": 8456 }, { "epoch": 0.931900826446281, "grad_norm": 13.740479469299316, "learning_rate": 1.1630177713623625e-07, "loss": 0.5898, "step": 8457 }, { "epoch": 0.9320110192837465, "grad_norm": 8.478474617004395, "learning_rate": 1.1592716054072361e-07, "loss": 0.427, "step": 8458 }, { "epoch": 0.9321212121212121, "grad_norm": 10.227319717407227, "learning_rate": 1.1555314117627336e-07, "loss": 0.355, "step": 8459 }, { "epoch": 0.9322314049586777, "grad_norm": 5.869755268096924, "learning_rate": 1.1517971908861892e-07, "loss": 0.3748, "step": 8460 }, { "epoch": 0.9323415977961432, "grad_norm": 5.438831329345703, "learning_rate": 1.1480689432342373e-07, "loss": 0.4089, "step": 8461 }, { "epoch": 0.9324517906336088, "grad_norm": 6.387732028961182, "learning_rate": 1.1443466692627803e-07, "loss": 0.3713, "step": 8462 }, { "epoch": 0.9325619834710743, "grad_norm": 5.7738728523254395, "learning_rate": 1.1406303694269705e-07, "loss": 0.3228, "step": 8463 }, { "epoch": 0.9326721763085399, "grad_norm": 5.7311296463012695, "learning_rate": 1.13692004418125e-07, "loss": 0.3703, "step": 8464 }, { "epoch": 0.9327823691460055, "grad_norm": 21.89881134033203, "learning_rate": 1.1332156939793282e-07, "loss": 0.4693, "step": 8465 }, { "epoch": 0.932892561983471, "grad_norm": 12.989684104919434, "learning_rate": 1.1295173192741593e-07, "loss": 0.4476, "step": 8466 }, { "epoch": 0.9330027548209366, "grad_norm": 7.674446105957031, "learning_rate": 1.1258249205179983e-07, "loss": 0.3345, "step": 8467 }, { "epoch": 0.9331129476584022, "grad_norm": 5.347735404968262, "learning_rate": 1.1221384981623618e-07, "loss": 0.3376, "step": 8468 }, { "epoch": 0.9332231404958677, "grad_norm": 4.392724514007568, "learning_rate": 1.118458052658017e-07, "loss": 0.349, "step": 8469 }, { "epoch": 0.9333333333333333, "grad_norm": 6.3683857917785645, "learning_rate": 1.1147835844550204e-07, "loss": 0.3807, "step": 8470 }, { "epoch": 0.9334435261707988, "grad_norm": 12.318037033081055, "learning_rate": 1.1111150940027016e-07, "loss": 0.4266, "step": 8471 }, { "epoch": 0.9335537190082644, "grad_norm": 9.726642608642578, "learning_rate": 1.107452581749624e-07, "loss": 0.3922, "step": 8472 }, { "epoch": 0.93366391184573, "grad_norm": 7.582922458648682, "learning_rate": 1.1037960481436682e-07, "loss": 0.389, "step": 8473 }, { "epoch": 0.9337741046831955, "grad_norm": 7.151773929595947, "learning_rate": 1.1001454936319489e-07, "loss": 0.4179, "step": 8474 }, { "epoch": 0.9338842975206612, "grad_norm": 6.1926774978637695, "learning_rate": 1.0965009186608589e-07, "loss": 0.404, "step": 8475 }, { "epoch": 0.9339944903581268, "grad_norm": 6.94793701171875, "learning_rate": 1.0928623236760694e-07, "loss": 0.3677, "step": 8476 }, { "epoch": 0.9341046831955923, "grad_norm": 8.575023651123047, "learning_rate": 1.0892297091225079e-07, "loss": 0.4431, "step": 8477 }, { "epoch": 0.9342148760330579, "grad_norm": 5.1859130859375, "learning_rate": 1.0856030754443747e-07, "loss": 0.3697, "step": 8478 }, { "epoch": 0.9343250688705235, "grad_norm": 11.38362979888916, "learning_rate": 1.0819824230851373e-07, "loss": 0.4817, "step": 8479 }, { "epoch": 0.934435261707989, "grad_norm": 6.110439300537109, "learning_rate": 1.0783677524875413e-07, "loss": 0.3407, "step": 8480 }, { "epoch": 0.9345454545454546, "grad_norm": 4.999442100524902, "learning_rate": 1.0747590640935945e-07, "loss": 0.387, "step": 8481 }, { "epoch": 0.9346556473829201, "grad_norm": 7.006037712097168, "learning_rate": 1.0711563583445717e-07, "loss": 0.3847, "step": 8482 }, { "epoch": 0.9347658402203857, "grad_norm": 9.513243675231934, "learning_rate": 1.067559635680998e-07, "loss": 0.4652, "step": 8483 }, { "epoch": 0.9348760330578513, "grad_norm": 4.773813247680664, "learning_rate": 1.0639688965427108e-07, "loss": 0.3606, "step": 8484 }, { "epoch": 0.9349862258953168, "grad_norm": 7.91803503036499, "learning_rate": 1.060384141368781e-07, "loss": 0.387, "step": 8485 }, { "epoch": 0.9350964187327824, "grad_norm": 6.70215368270874, "learning_rate": 1.0568053705975467e-07, "loss": 0.4408, "step": 8486 }, { "epoch": 0.935206611570248, "grad_norm": 4.688905239105225, "learning_rate": 1.0532325846666414e-07, "loss": 0.3805, "step": 8487 }, { "epoch": 0.9353168044077135, "grad_norm": 9.817471504211426, "learning_rate": 1.0496657840129432e-07, "loss": 0.4929, "step": 8488 }, { "epoch": 0.9354269972451791, "grad_norm": 6.609936237335205, "learning_rate": 1.0461049690726033e-07, "loss": 0.3759, "step": 8489 }, { "epoch": 0.9355371900826446, "grad_norm": 7.818711280822754, "learning_rate": 1.0425501402810457e-07, "loss": 0.4189, "step": 8490 }, { "epoch": 0.9356473829201102, "grad_norm": 8.710646629333496, "learning_rate": 1.0390012980729613e-07, "loss": 0.372, "step": 8491 }, { "epoch": 0.9357575757575758, "grad_norm": 6.422230243682861, "learning_rate": 1.0354584428823034e-07, "loss": 0.3829, "step": 8492 }, { "epoch": 0.9358677685950413, "grad_norm": 12.990571022033691, "learning_rate": 1.0319215751422973e-07, "loss": 0.4292, "step": 8493 }, { "epoch": 0.9359779614325069, "grad_norm": 13.805790901184082, "learning_rate": 1.0283906952854361e-07, "loss": 0.4625, "step": 8494 }, { "epoch": 0.9360881542699725, "grad_norm": 10.73592758178711, "learning_rate": 1.024865803743491e-07, "loss": 0.5072, "step": 8495 }, { "epoch": 0.936198347107438, "grad_norm": 6.469753265380859, "learning_rate": 1.021346900947473e-07, "loss": 0.3863, "step": 8496 }, { "epoch": 0.9363085399449036, "grad_norm": 5.712851047515869, "learning_rate": 1.0178339873276877e-07, "loss": 0.4021, "step": 8497 }, { "epoch": 0.9364187327823692, "grad_norm": 5.719577312469482, "learning_rate": 1.0143270633137026e-07, "loss": 0.3985, "step": 8498 }, { "epoch": 0.9365289256198347, "grad_norm": 11.244417190551758, "learning_rate": 1.0108261293343413e-07, "loss": 0.4558, "step": 8499 }, { "epoch": 0.9366391184573003, "grad_norm": 9.082850456237793, "learning_rate": 1.007331185817706e-07, "loss": 0.3955, "step": 8500 }, { "epoch": 0.9367493112947658, "grad_norm": 6.717166900634766, "learning_rate": 1.0038422331911657e-07, "loss": 0.4204, "step": 8501 }, { "epoch": 0.9368595041322314, "grad_norm": 10.005170822143555, "learning_rate": 1.0003592718813515e-07, "loss": 0.4111, "step": 8502 }, { "epoch": 0.936969696969697, "grad_norm": 5.7031145095825195, "learning_rate": 9.968823023141616e-08, "loss": 0.4185, "step": 8503 }, { "epoch": 0.9370798898071625, "grad_norm": 5.498226165771484, "learning_rate": 9.934113249147725e-08, "loss": 0.4084, "step": 8504 }, { "epoch": 0.9371900826446281, "grad_norm": 5.361534118652344, "learning_rate": 9.899463401076115e-08, "loss": 0.3802, "step": 8505 }, { "epoch": 0.9373002754820937, "grad_norm": 7.721748352050781, "learning_rate": 9.864873483163839e-08, "loss": 0.3692, "step": 8506 }, { "epoch": 0.9374104683195592, "grad_norm": 8.12588882446289, "learning_rate": 9.830343499640683e-08, "loss": 0.383, "step": 8507 }, { "epoch": 0.9375206611570248, "grad_norm": 6.769551753997803, "learning_rate": 9.795873454728932e-08, "loss": 0.3858, "step": 8508 }, { "epoch": 0.9376308539944903, "grad_norm": 9.868959426879883, "learning_rate": 9.761463352643608e-08, "loss": 0.4937, "step": 8509 }, { "epoch": 0.9377410468319559, "grad_norm": 6.203073501586914, "learning_rate": 9.727113197592564e-08, "loss": 0.4053, "step": 8510 }, { "epoch": 0.9378512396694215, "grad_norm": 10.969325065612793, "learning_rate": 9.692822993775996e-08, "loss": 0.3893, "step": 8511 }, { "epoch": 0.937961432506887, "grad_norm": 4.758317470550537, "learning_rate": 9.658592745387108e-08, "loss": 0.4115, "step": 8512 }, { "epoch": 0.9380716253443526, "grad_norm": 4.448451519012451, "learning_rate": 9.624422456611548e-08, "loss": 0.3227, "step": 8513 }, { "epoch": 0.9381818181818182, "grad_norm": 4.2206645011901855, "learning_rate": 9.590312131627699e-08, "loss": 0.3886, "step": 8514 }, { "epoch": 0.9382920110192837, "grad_norm": 6.712493419647217, "learning_rate": 9.556261774606668e-08, "loss": 0.3623, "step": 8515 }, { "epoch": 0.9384022038567493, "grad_norm": 7.380617141723633, "learning_rate": 9.522271389712123e-08, "loss": 0.3771, "step": 8516 }, { "epoch": 0.9385123966942148, "grad_norm": 11.71654224395752, "learning_rate": 9.488340981100463e-08, "loss": 0.5992, "step": 8517 }, { "epoch": 0.9386225895316804, "grad_norm": 6.216311454772949, "learning_rate": 9.454470552920814e-08, "loss": 0.3657, "step": 8518 }, { "epoch": 0.938732782369146, "grad_norm": 7.123051643371582, "learning_rate": 9.420660109314805e-08, "loss": 0.3396, "step": 8519 }, { "epoch": 0.9388429752066115, "grad_norm": 9.15761661529541, "learning_rate": 9.386909654416853e-08, "loss": 0.3531, "step": 8520 }, { "epoch": 0.9389531680440771, "grad_norm": 5.481376647949219, "learning_rate": 9.353219192354101e-08, "loss": 0.3669, "step": 8521 }, { "epoch": 0.9390633608815427, "grad_norm": 4.637485027313232, "learning_rate": 9.319588727246143e-08, "loss": 0.338, "step": 8522 }, { "epoch": 0.9391735537190082, "grad_norm": 12.680817604064941, "learning_rate": 9.286018263205355e-08, "loss": 0.4836, "step": 8523 }, { "epoch": 0.9392837465564738, "grad_norm": 4.465548992156982, "learning_rate": 9.252507804336897e-08, "loss": 0.4173, "step": 8524 }, { "epoch": 0.9393939393939394, "grad_norm": 4.898728370666504, "learning_rate": 9.219057354738326e-08, "loss": 0.3305, "step": 8525 }, { "epoch": 0.9395041322314049, "grad_norm": 4.7653727531433105, "learning_rate": 9.18566691850009e-08, "loss": 0.4154, "step": 8526 }, { "epoch": 0.9396143250688705, "grad_norm": 5.312608242034912, "learning_rate": 9.152336499705261e-08, "loss": 0.3742, "step": 8527 }, { "epoch": 0.939724517906336, "grad_norm": 8.836138725280762, "learning_rate": 9.119066102429464e-08, "loss": 0.3333, "step": 8528 }, { "epoch": 0.9398347107438016, "grad_norm": 7.88871431350708, "learning_rate": 9.085855730741e-08, "loss": 0.4661, "step": 8529 }, { "epoch": 0.9399449035812673, "grad_norm": 8.838181495666504, "learning_rate": 9.05270538870101e-08, "loss": 0.3756, "step": 8530 }, { "epoch": 0.9400550964187327, "grad_norm": 4.735797882080078, "learning_rate": 9.019615080363087e-08, "loss": 0.3795, "step": 8531 }, { "epoch": 0.9401652892561984, "grad_norm": 3.989769458770752, "learning_rate": 8.98658480977349e-08, "loss": 0.3852, "step": 8532 }, { "epoch": 0.940275482093664, "grad_norm": 7.275123119354248, "learning_rate": 8.953614580971381e-08, "loss": 0.4019, "step": 8533 }, { "epoch": 0.9403856749311295, "grad_norm": 5.2555718421936035, "learning_rate": 8.920704397988256e-08, "loss": 0.3471, "step": 8534 }, { "epoch": 0.9404958677685951, "grad_norm": 5.258383274078369, "learning_rate": 8.88785426484845e-08, "loss": 0.3768, "step": 8535 }, { "epoch": 0.9406060606060606, "grad_norm": 18.62480926513672, "learning_rate": 8.855064185568918e-08, "loss": 0.5789, "step": 8536 }, { "epoch": 0.9407162534435262, "grad_norm": 4.384313106536865, "learning_rate": 8.82233416415934e-08, "loss": 0.3795, "step": 8537 }, { "epoch": 0.9408264462809918, "grad_norm": 6.902232646942139, "learning_rate": 8.789664204621906e-08, "loss": 0.4671, "step": 8538 }, { "epoch": 0.9409366391184573, "grad_norm": 5.78926420211792, "learning_rate": 8.757054310951585e-08, "loss": 0.35, "step": 8539 }, { "epoch": 0.9410468319559229, "grad_norm": 6.472806930541992, "learning_rate": 8.724504487135965e-08, "loss": 0.4257, "step": 8540 }, { "epoch": 0.9411570247933885, "grad_norm": 7.819636344909668, "learning_rate": 8.692014737155307e-08, "loss": 0.4085, "step": 8541 }, { "epoch": 0.941267217630854, "grad_norm": 9.186422348022461, "learning_rate": 8.659585064982323e-08, "loss": 0.3484, "step": 8542 }, { "epoch": 0.9413774104683196, "grad_norm": 4.838017463684082, "learning_rate": 8.62721547458284e-08, "loss": 0.3866, "step": 8543 }, { "epoch": 0.9414876033057851, "grad_norm": 5.862768173217773, "learning_rate": 8.594905969914858e-08, "loss": 0.3511, "step": 8544 }, { "epoch": 0.9415977961432507, "grad_norm": 9.558072090148926, "learning_rate": 8.562656554929271e-08, "loss": 0.3256, "step": 8545 }, { "epoch": 0.9417079889807163, "grad_norm": 7.147973537445068, "learning_rate": 8.530467233569595e-08, "loss": 0.4119, "step": 8546 }, { "epoch": 0.9418181818181818, "grad_norm": 9.320735931396484, "learning_rate": 8.498338009772067e-08, "loss": 0.4266, "step": 8547 }, { "epoch": 0.9419283746556474, "grad_norm": 4.809484958648682, "learning_rate": 8.466268887465268e-08, "loss": 0.333, "step": 8548 }, { "epoch": 0.942038567493113, "grad_norm": 5.545483112335205, "learning_rate": 8.434259870570893e-08, "loss": 0.3773, "step": 8549 }, { "epoch": 0.9421487603305785, "grad_norm": 4.454220294952393, "learning_rate": 8.402310963002869e-08, "loss": 0.3499, "step": 8550 }, { "epoch": 0.9422589531680441, "grad_norm": 7.443580627441406, "learning_rate": 8.370422168668125e-08, "loss": 0.3727, "step": 8551 }, { "epoch": 0.9423691460055097, "grad_norm": 5.809983253479004, "learning_rate": 8.338593491465874e-08, "loss": 0.3427, "step": 8552 }, { "epoch": 0.9424793388429752, "grad_norm": 6.981957912445068, "learning_rate": 8.306824935288338e-08, "loss": 0.3758, "step": 8553 }, { "epoch": 0.9425895316804408, "grad_norm": 5.555224895477295, "learning_rate": 8.275116504020131e-08, "loss": 0.4115, "step": 8554 }, { "epoch": 0.9426997245179063, "grad_norm": 5.965600967407227, "learning_rate": 8.243468201538596e-08, "loss": 0.3928, "step": 8555 }, { "epoch": 0.9428099173553719, "grad_norm": 4.40320348739624, "learning_rate": 8.211880031713748e-08, "loss": 0.3729, "step": 8556 }, { "epoch": 0.9429201101928375, "grad_norm": 4.800597190856934, "learning_rate": 8.180351998408331e-08, "loss": 0.3673, "step": 8557 }, { "epoch": 0.943030303030303, "grad_norm": 5.584104061126709, "learning_rate": 8.148884105477429e-08, "loss": 0.3483, "step": 8558 }, { "epoch": 0.9431404958677686, "grad_norm": 10.89968490600586, "learning_rate": 8.117476356769127e-08, "loss": 0.4443, "step": 8559 }, { "epoch": 0.9432506887052342, "grad_norm": 7.249094009399414, "learning_rate": 8.086128756124023e-08, "loss": 0.3651, "step": 8560 }, { "epoch": 0.9433608815426997, "grad_norm": 7.102390766143799, "learning_rate": 8.054841307375217e-08, "loss": 0.4313, "step": 8561 }, { "epoch": 0.9434710743801653, "grad_norm": 4.627991199493408, "learning_rate": 8.023614014348702e-08, "loss": 0.3556, "step": 8562 }, { "epoch": 0.9435812672176308, "grad_norm": 7.0684309005737305, "learning_rate": 7.992446880862981e-08, "loss": 0.4176, "step": 8563 }, { "epoch": 0.9436914600550964, "grad_norm": 4.0046186447143555, "learning_rate": 7.961339910729115e-08, "loss": 0.3443, "step": 8564 }, { "epoch": 0.943801652892562, "grad_norm": 7.933088779449463, "learning_rate": 7.930293107751009e-08, "loss": 0.4305, "step": 8565 }, { "epoch": 0.9439118457300275, "grad_norm": 4.772091865539551, "learning_rate": 7.899306475725066e-08, "loss": 0.4325, "step": 8566 }, { "epoch": 0.9440220385674931, "grad_norm": 4.680997848510742, "learning_rate": 7.868380018440369e-08, "loss": 0.3798, "step": 8567 }, { "epoch": 0.9441322314049587, "grad_norm": 8.505746841430664, "learning_rate": 7.83751373967867e-08, "loss": 0.3907, "step": 8568 }, { "epoch": 0.9442424242424242, "grad_norm": 5.709012985229492, "learning_rate": 7.806707643214395e-08, "loss": 0.4498, "step": 8569 }, { "epoch": 0.9443526170798898, "grad_norm": 8.944786071777344, "learning_rate": 7.775961732814364e-08, "loss": 0.3283, "step": 8570 }, { "epoch": 0.9444628099173553, "grad_norm": 6.441588401794434, "learning_rate": 7.745276012238401e-08, "loss": 0.4687, "step": 8571 }, { "epoch": 0.9445730027548209, "grad_norm": 7.572641849517822, "learning_rate": 7.714650485238783e-08, "loss": 0.4429, "step": 8572 }, { "epoch": 0.9446831955922865, "grad_norm": 5.6551833152771, "learning_rate": 7.684085155560406e-08, "loss": 0.368, "step": 8573 }, { "epoch": 0.944793388429752, "grad_norm": 7.429037570953369, "learning_rate": 7.653580026940833e-08, "loss": 0.4332, "step": 8574 }, { "epoch": 0.9449035812672176, "grad_norm": 7.719753742218018, "learning_rate": 7.623135103110246e-08, "loss": 0.4786, "step": 8575 }, { "epoch": 0.9450137741046832, "grad_norm": 7.220445156097412, "learning_rate": 7.592750387791558e-08, "loss": 0.4455, "step": 8576 }, { "epoch": 0.9451239669421487, "grad_norm": 6.1490654945373535, "learning_rate": 7.562425884700241e-08, "loss": 0.3882, "step": 8577 }, { "epoch": 0.9452341597796143, "grad_norm": 7.9718804359436035, "learning_rate": 7.53216159754433e-08, "loss": 0.4391, "step": 8578 }, { "epoch": 0.9453443526170799, "grad_norm": 5.425424098968506, "learning_rate": 7.501957530024695e-08, "loss": 0.3879, "step": 8579 }, { "epoch": 0.9454545454545454, "grad_norm": 4.844546318054199, "learning_rate": 7.471813685834716e-08, "loss": 0.3323, "step": 8580 }, { "epoch": 0.945564738292011, "grad_norm": 6.59128999710083, "learning_rate": 7.441730068660336e-08, "loss": 0.3366, "step": 8581 }, { "epoch": 0.9456749311294765, "grad_norm": 5.869863986968994, "learning_rate": 7.41170668218033e-08, "loss": 0.455, "step": 8582 }, { "epoch": 0.9457851239669421, "grad_norm": 9.603310585021973, "learning_rate": 7.381743530065933e-08, "loss": 0.4076, "step": 8583 }, { "epoch": 0.9458953168044077, "grad_norm": 6.4055495262146, "learning_rate": 7.351840615981043e-08, "loss": 0.2913, "step": 8584 }, { "epoch": 0.9460055096418732, "grad_norm": 9.786589622497559, "learning_rate": 7.321997943582293e-08, "loss": 0.4402, "step": 8585 }, { "epoch": 0.9461157024793388, "grad_norm": 11.6978178024292, "learning_rate": 7.292215516518931e-08, "loss": 0.5475, "step": 8586 }, { "epoch": 0.9462258953168045, "grad_norm": 7.274374008178711, "learning_rate": 7.262493338432708e-08, "loss": 0.4702, "step": 8587 }, { "epoch": 0.94633608815427, "grad_norm": 4.72409725189209, "learning_rate": 7.232831412958053e-08, "loss": 0.4482, "step": 8588 }, { "epoch": 0.9464462809917356, "grad_norm": 6.25150203704834, "learning_rate": 7.203229743722229e-08, "loss": 0.3769, "step": 8589 }, { "epoch": 0.946556473829201, "grad_norm": 5.5450639724731445, "learning_rate": 7.173688334344841e-08, "loss": 0.3712, "step": 8590 }, { "epoch": 0.9466666666666667, "grad_norm": 5.280126571655273, "learning_rate": 7.144207188438223e-08, "loss": 0.3382, "step": 8591 }, { "epoch": 0.9467768595041323, "grad_norm": 7.815299034118652, "learning_rate": 7.114786309607491e-08, "loss": 0.3563, "step": 8592 }, { "epoch": 0.9468870523415978, "grad_norm": 7.206793785095215, "learning_rate": 7.08542570145021e-08, "loss": 0.3989, "step": 8593 }, { "epoch": 0.9469972451790634, "grad_norm": 7.927333354949951, "learning_rate": 7.056125367556566e-08, "loss": 0.3615, "step": 8594 }, { "epoch": 0.947107438016529, "grad_norm": 6.645603656768799, "learning_rate": 7.026885311509579e-08, "loss": 0.4112, "step": 8595 }, { "epoch": 0.9472176308539945, "grad_norm": 6.320545196533203, "learning_rate": 6.99770553688467e-08, "loss": 0.4128, "step": 8596 }, { "epoch": 0.9473278236914601, "grad_norm": 5.7397074699401855, "learning_rate": 6.968586047250036e-08, "loss": 0.3748, "step": 8597 }, { "epoch": 0.9474380165289257, "grad_norm": 6.543806076049805, "learning_rate": 6.939526846166333e-08, "loss": 0.3366, "step": 8598 }, { "epoch": 0.9475482093663912, "grad_norm": 7.952162742614746, "learning_rate": 6.910527937187162e-08, "loss": 0.5369, "step": 8599 }, { "epoch": 0.9476584022038568, "grad_norm": 7.1026716232299805, "learning_rate": 6.881589323858407e-08, "loss": 0.3817, "step": 8600 }, { "epoch": 0.9477685950413223, "grad_norm": 7.128777027130127, "learning_rate": 6.852711009718626e-08, "loss": 0.2803, "step": 8601 }, { "epoch": 0.9478787878787879, "grad_norm": 4.653260231018066, "learning_rate": 6.82389299829933e-08, "loss": 0.3445, "step": 8602 }, { "epoch": 0.9479889807162535, "grad_norm": 4.707342624664307, "learning_rate": 6.795135293124311e-08, "loss": 0.3503, "step": 8603 }, { "epoch": 0.948099173553719, "grad_norm": 5.059284687042236, "learning_rate": 6.766437897710032e-08, "loss": 0.3551, "step": 8604 }, { "epoch": 0.9482093663911846, "grad_norm": 7.666423320770264, "learning_rate": 6.737800815565799e-08, "loss": 0.4054, "step": 8605 }, { "epoch": 0.9483195592286502, "grad_norm": 4.72752046585083, "learning_rate": 6.709224050193252e-08, "loss": 0.3762, "step": 8606 }, { "epoch": 0.9484297520661157, "grad_norm": 6.7642316818237305, "learning_rate": 6.680707605086878e-08, "loss": 0.4859, "step": 8607 }, { "epoch": 0.9485399449035813, "grad_norm": 6.059367656707764, "learning_rate": 6.652251483733718e-08, "loss": 0.3812, "step": 8608 }, { "epoch": 0.9486501377410468, "grad_norm": 5.486311435699463, "learning_rate": 6.623855689613323e-08, "loss": 0.3266, "step": 8609 }, { "epoch": 0.9487603305785124, "grad_norm": 7.317872524261475, "learning_rate": 6.595520226198138e-08, "loss": 0.3442, "step": 8610 }, { "epoch": 0.948870523415978, "grad_norm": 4.8540778160095215, "learning_rate": 6.56724509695289e-08, "loss": 0.402, "step": 8611 }, { "epoch": 0.9489807162534435, "grad_norm": 4.781094074249268, "learning_rate": 6.539030305335147e-08, "loss": 0.403, "step": 8612 }, { "epoch": 0.9490909090909091, "grad_norm": 7.0046067237854, "learning_rate": 6.510875854795152e-08, "loss": 0.2694, "step": 8613 }, { "epoch": 0.9492011019283747, "grad_norm": 6.1152825355529785, "learning_rate": 6.48278174877559e-08, "loss": 0.4012, "step": 8614 }, { "epoch": 0.9493112947658402, "grad_norm": 6.957250595092773, "learning_rate": 6.454747990711774e-08, "loss": 0.4294, "step": 8615 }, { "epoch": 0.9494214876033058, "grad_norm": 8.402409553527832, "learning_rate": 6.426774584031902e-08, "loss": 0.4305, "step": 8616 }, { "epoch": 0.9495316804407713, "grad_norm": 8.449647903442383, "learning_rate": 6.398861532156408e-08, "loss": 0.5149, "step": 8617 }, { "epoch": 0.9496418732782369, "grad_norm": 8.074153900146484, "learning_rate": 6.371008838498616e-08, "loss": 0.3902, "step": 8618 }, { "epoch": 0.9497520661157025, "grad_norm": 6.5540876388549805, "learning_rate": 6.343216506464467e-08, "loss": 0.4076, "step": 8619 }, { "epoch": 0.949862258953168, "grad_norm": 6.830234527587891, "learning_rate": 6.315484539452299e-08, "loss": 0.3625, "step": 8620 }, { "epoch": 0.9499724517906336, "grad_norm": 7.6607584953308105, "learning_rate": 6.287812940853288e-08, "loss": 0.422, "step": 8621 }, { "epoch": 0.9500826446280992, "grad_norm": 6.200892925262451, "learning_rate": 6.260201714051229e-08, "loss": 0.3613, "step": 8622 }, { "epoch": 0.9501928374655647, "grad_norm": 12.884931564331055, "learning_rate": 6.232650862422308e-08, "loss": 0.4733, "step": 8623 }, { "epoch": 0.9503030303030303, "grad_norm": 5.730234622955322, "learning_rate": 6.205160389335552e-08, "loss": 0.4097, "step": 8624 }, { "epoch": 0.9504132231404959, "grad_norm": 4.848020076751709, "learning_rate": 6.177730298152606e-08, "loss": 0.3682, "step": 8625 }, { "epoch": 0.9505234159779614, "grad_norm": 5.113597869873047, "learning_rate": 6.150360592227511e-08, "loss": 0.41, "step": 8626 }, { "epoch": 0.950633608815427, "grad_norm": 8.638497352600098, "learning_rate": 6.123051274907199e-08, "loss": 0.4268, "step": 8627 }, { "epoch": 0.9507438016528925, "grad_norm": 7.427926063537598, "learning_rate": 6.095802349531055e-08, "loss": 0.3286, "step": 8628 }, { "epoch": 0.9508539944903581, "grad_norm": 5.209235191345215, "learning_rate": 6.068613819431079e-08, "loss": 0.308, "step": 8629 }, { "epoch": 0.9509641873278237, "grad_norm": 5.912085056304932, "learning_rate": 6.041485687931891e-08, "loss": 0.3675, "step": 8630 }, { "epoch": 0.9510743801652892, "grad_norm": 5.423869609832764, "learning_rate": 6.014417958350893e-08, "loss": 0.4198, "step": 8631 }, { "epoch": 0.9511845730027548, "grad_norm": 9.339936256408691, "learning_rate": 5.987410633997881e-08, "loss": 0.4165, "step": 8632 }, { "epoch": 0.9512947658402204, "grad_norm": 5.6121439933776855, "learning_rate": 5.960463718175324e-08, "loss": 0.395, "step": 8633 }, { "epoch": 0.9514049586776859, "grad_norm": 8.349647521972656, "learning_rate": 5.93357721417831e-08, "loss": 0.349, "step": 8634 }, { "epoch": 0.9515151515151515, "grad_norm": 9.9574556350708, "learning_rate": 5.906751125294652e-08, "loss": 0.4834, "step": 8635 }, { "epoch": 0.951625344352617, "grad_norm": 4.9349799156188965, "learning_rate": 5.8799854548046156e-08, "loss": 0.3591, "step": 8636 }, { "epoch": 0.9517355371900826, "grad_norm": 6.009603023529053, "learning_rate": 5.8532802059810825e-08, "loss": 0.3637, "step": 8637 }, { "epoch": 0.9518457300275482, "grad_norm": 7.522815227508545, "learning_rate": 5.8266353820897736e-08, "loss": 0.4404, "step": 8638 }, { "epoch": 0.9519559228650137, "grad_norm": 10.83249282836914, "learning_rate": 5.80005098638875e-08, "loss": 0.3346, "step": 8639 }, { "epoch": 0.9520661157024793, "grad_norm": 7.943075656890869, "learning_rate": 5.7735270221287444e-08, "loss": 0.4483, "step": 8640 }, { "epoch": 0.952176308539945, "grad_norm": 6.577495098114014, "learning_rate": 5.747063492553218e-08, "loss": 0.3587, "step": 8641 }, { "epoch": 0.9522865013774104, "grad_norm": 6.013138771057129, "learning_rate": 5.720660400898193e-08, "loss": 0.3738, "step": 8642 }, { "epoch": 0.952396694214876, "grad_norm": 5.224336624145508, "learning_rate": 5.694317750392142e-08, "loss": 0.408, "step": 8643 }, { "epoch": 0.9525068870523415, "grad_norm": 7.12232780456543, "learning_rate": 5.668035544256434e-08, "loss": 0.3908, "step": 8644 }, { "epoch": 0.9526170798898072, "grad_norm": 12.983247756958008, "learning_rate": 5.641813785704831e-08, "loss": 0.4187, "step": 8645 }, { "epoch": 0.9527272727272728, "grad_norm": 7.4847893714904785, "learning_rate": 5.6156524779437116e-08, "loss": 0.4703, "step": 8646 }, { "epoch": 0.9528374655647383, "grad_norm": 6.619725704193115, "learning_rate": 5.589551624172129e-08, "loss": 0.4126, "step": 8647 }, { "epoch": 0.9529476584022039, "grad_norm": 10.35739517211914, "learning_rate": 5.563511227581808e-08, "loss": 0.4818, "step": 8648 }, { "epoch": 0.9530578512396695, "grad_norm": 11.51596450805664, "learning_rate": 5.537531291356979e-08, "loss": 0.3502, "step": 8649 }, { "epoch": 0.953168044077135, "grad_norm": 5.162397384643555, "learning_rate": 5.511611818674434e-08, "loss": 0.383, "step": 8650 }, { "epoch": 0.9532782369146006, "grad_norm": 4.714761734008789, "learning_rate": 5.485752812703749e-08, "loss": 0.362, "step": 8651 }, { "epoch": 0.9533884297520662, "grad_norm": 10.624504089355469, "learning_rate": 5.4599542766069494e-08, "loss": 0.4279, "step": 8652 }, { "epoch": 0.9534986225895317, "grad_norm": 9.877367973327637, "learning_rate": 5.4342162135386236e-08, "loss": 0.4233, "step": 8653 }, { "epoch": 0.9536088154269973, "grad_norm": 5.578636646270752, "learning_rate": 5.408538626646198e-08, "loss": 0.3959, "step": 8654 }, { "epoch": 0.9537190082644628, "grad_norm": 8.448080062866211, "learning_rate": 5.38292151906955e-08, "loss": 0.3512, "step": 8655 }, { "epoch": 0.9538292011019284, "grad_norm": 5.63498592376709, "learning_rate": 5.357364893941064e-08, "loss": 0.3658, "step": 8656 }, { "epoch": 0.953939393939394, "grad_norm": 5.705527305603027, "learning_rate": 5.331868754385905e-08, "loss": 0.4013, "step": 8657 }, { "epoch": 0.9540495867768595, "grad_norm": 5.283731937408447, "learning_rate": 5.306433103521802e-08, "loss": 0.3825, "step": 8658 }, { "epoch": 0.9541597796143251, "grad_norm": 6.6954216957092285, "learning_rate": 5.2810579444590445e-08, "loss": 0.4422, "step": 8659 }, { "epoch": 0.9542699724517907, "grad_norm": 6.549747943878174, "learning_rate": 5.255743280300485e-08, "loss": 0.3703, "step": 8660 }, { "epoch": 0.9543801652892562, "grad_norm": 6.241518974304199, "learning_rate": 5.2304891141417014e-08, "loss": 0.3691, "step": 8661 }, { "epoch": 0.9544903581267218, "grad_norm": 7.21035623550415, "learning_rate": 5.205295449070835e-08, "loss": 0.4171, "step": 8662 }, { "epoch": 0.9546005509641873, "grad_norm": 6.325603485107422, "learning_rate": 5.1801622881684775e-08, "loss": 0.4035, "step": 8663 }, { "epoch": 0.9547107438016529, "grad_norm": 7.280940055847168, "learning_rate": 5.155089634508059e-08, "loss": 0.4243, "step": 8664 }, { "epoch": 0.9548209366391185, "grad_norm": 4.8911824226379395, "learning_rate": 5.130077491155461e-08, "loss": 0.3151, "step": 8665 }, { "epoch": 0.954931129476584, "grad_norm": 8.103285789489746, "learning_rate": 5.1051258611692355e-08, "loss": 0.4047, "step": 8666 }, { "epoch": 0.9550413223140496, "grad_norm": 5.535149097442627, "learning_rate": 5.0802347476004434e-08, "loss": 0.3804, "step": 8667 }, { "epoch": 0.9551515151515152, "grad_norm": 14.667068481445312, "learning_rate": 5.05540415349276e-08, "loss": 0.5715, "step": 8668 }, { "epoch": 0.9552617079889807, "grad_norm": 5.581683158874512, "learning_rate": 5.030634081882702e-08, "loss": 0.4177, "step": 8669 }, { "epoch": 0.9553719008264463, "grad_norm": 9.997817039489746, "learning_rate": 5.005924535798956e-08, "loss": 0.4213, "step": 8670 }, { "epoch": 0.9554820936639118, "grad_norm": 7.072441101074219, "learning_rate": 4.981275518263162e-08, "loss": 0.4222, "step": 8671 }, { "epoch": 0.9555922865013774, "grad_norm": 7.401980400085449, "learning_rate": 4.9566870322894645e-08, "loss": 0.3962, "step": 8672 }, { "epoch": 0.955702479338843, "grad_norm": 10.443068504333496, "learning_rate": 4.932159080884458e-08, "loss": 0.3976, "step": 8673 }, { "epoch": 0.9558126721763085, "grad_norm": 6.550655841827393, "learning_rate": 4.9076916670475206e-08, "loss": 0.3847, "step": 8674 }, { "epoch": 0.9559228650137741, "grad_norm": 10.696511268615723, "learning_rate": 4.8832847937706486e-08, "loss": 0.4171, "step": 8675 }, { "epoch": 0.9560330578512397, "grad_norm": 5.649347305297852, "learning_rate": 4.8589384640381766e-08, "loss": 0.3848, "step": 8676 }, { "epoch": 0.9561432506887052, "grad_norm": 5.724981307983398, "learning_rate": 4.834652680827334e-08, "loss": 0.4056, "step": 8677 }, { "epoch": 0.9562534435261708, "grad_norm": 7.857694625854492, "learning_rate": 4.8104274471078015e-08, "loss": 0.3987, "step": 8678 }, { "epoch": 0.9563636363636364, "grad_norm": 4.164583683013916, "learning_rate": 4.786262765841765e-08, "loss": 0.372, "step": 8679 }, { "epoch": 0.9564738292011019, "grad_norm": 14.193947792053223, "learning_rate": 4.7621586399842487e-08, "loss": 0.463, "step": 8680 }, { "epoch": 0.9565840220385675, "grad_norm": 3.8538706302642822, "learning_rate": 4.7381150724827296e-08, "loss": 0.3822, "step": 8681 }, { "epoch": 0.956694214876033, "grad_norm": 4.835615634918213, "learning_rate": 4.714132066277188e-08, "loss": 0.3613, "step": 8682 }, { "epoch": 0.9568044077134986, "grad_norm": 7.823811054229736, "learning_rate": 4.69020962430039e-08, "loss": 0.3714, "step": 8683 }, { "epoch": 0.9569146005509642, "grad_norm": 8.162225723266602, "learning_rate": 4.666347749477551e-08, "loss": 0.4376, "step": 8684 }, { "epoch": 0.9570247933884297, "grad_norm": 8.022966384887695, "learning_rate": 4.6425464447265586e-08, "loss": 0.3877, "step": 8685 }, { "epoch": 0.9571349862258953, "grad_norm": 8.46652603149414, "learning_rate": 4.6188057129578635e-08, "loss": 0.3787, "step": 8686 }, { "epoch": 0.9572451790633609, "grad_norm": 4.681698322296143, "learning_rate": 4.595125557074531e-08, "loss": 0.3885, "step": 8687 }, { "epoch": 0.9573553719008264, "grad_norm": 7.8477935791015625, "learning_rate": 4.571505979972191e-08, "loss": 0.4589, "step": 8688 }, { "epoch": 0.957465564738292, "grad_norm": 5.692086696624756, "learning_rate": 4.547946984539031e-08, "loss": 0.4306, "step": 8689 }, { "epoch": 0.9575757575757575, "grad_norm": 5.172653675079346, "learning_rate": 4.524448573655915e-08, "loss": 0.3984, "step": 8690 }, { "epoch": 0.9576859504132231, "grad_norm": 5.916308403015137, "learning_rate": 4.501010750196322e-08, "loss": 0.3823, "step": 8691 }, { "epoch": 0.9577961432506887, "grad_norm": 7.113374710083008, "learning_rate": 4.477633517026181e-08, "loss": 0.3866, "step": 8692 }, { "epoch": 0.9579063360881542, "grad_norm": 4.070824146270752, "learning_rate": 4.4543168770040946e-08, "loss": 0.3483, "step": 8693 }, { "epoch": 0.9580165289256198, "grad_norm": 5.743619441986084, "learning_rate": 4.431060832981282e-08, "loss": 0.3443, "step": 8694 }, { "epoch": 0.9581267217630854, "grad_norm": 6.238353252410889, "learning_rate": 4.40786538780158e-08, "loss": 0.3864, "step": 8695 }, { "epoch": 0.9582369146005509, "grad_norm": 4.780889987945557, "learning_rate": 4.3847305443011635e-08, "loss": 0.3364, "step": 8696 }, { "epoch": 0.9583471074380165, "grad_norm": 7.004602432250977, "learning_rate": 4.361656305309214e-08, "loss": 0.3869, "step": 8697 }, { "epoch": 0.9584573002754821, "grad_norm": 6.1174116134643555, "learning_rate": 4.338642673647198e-08, "loss": 0.3435, "step": 8698 }, { "epoch": 0.9585674931129476, "grad_norm": 5.435125827789307, "learning_rate": 4.3156896521291956e-08, "loss": 0.3154, "step": 8699 }, { "epoch": 0.9586776859504132, "grad_norm": 8.105121612548828, "learning_rate": 4.2927972435620194e-08, "loss": 0.3504, "step": 8700 }, { "epoch": 0.9587878787878787, "grad_norm": 8.5179443359375, "learning_rate": 4.2699654507449836e-08, "loss": 0.3894, "step": 8701 }, { "epoch": 0.9588980716253444, "grad_norm": 6.650347709655762, "learning_rate": 4.247194276469857e-08, "loss": 0.3452, "step": 8702 }, { "epoch": 0.95900826446281, "grad_norm": 7.823209285736084, "learning_rate": 4.2244837235213e-08, "loss": 0.4156, "step": 8703 }, { "epoch": 0.9591184573002755, "grad_norm": 4.450412273406982, "learning_rate": 4.201833794676258e-08, "loss": 0.3676, "step": 8704 }, { "epoch": 0.9592286501377411, "grad_norm": 7.199307441711426, "learning_rate": 4.179244492704515e-08, "loss": 0.4141, "step": 8705 }, { "epoch": 0.9593388429752067, "grad_norm": 6.122735500335693, "learning_rate": 4.1567158203682514e-08, "loss": 0.4061, "step": 8706 }, { "epoch": 0.9594490358126722, "grad_norm": 6.911500930786133, "learning_rate": 4.134247780422318e-08, "loss": 0.4044, "step": 8707 }, { "epoch": 0.9595592286501378, "grad_norm": 5.078444957733154, "learning_rate": 4.111840375614129e-08, "loss": 0.3493, "step": 8708 }, { "epoch": 0.9596694214876033, "grad_norm": 6.058871269226074, "learning_rate": 4.089493608683659e-08, "loss": 0.4757, "step": 8709 }, { "epoch": 0.9597796143250689, "grad_norm": 6.7376532554626465, "learning_rate": 4.0672074823635554e-08, "loss": 0.4169, "step": 8710 }, { "epoch": 0.9598898071625345, "grad_norm": 7.444267272949219, "learning_rate": 4.044981999379027e-08, "loss": 0.4361, "step": 8711 }, { "epoch": 0.96, "grad_norm": 10.424495697021484, "learning_rate": 4.022817162447734e-08, "loss": 0.3745, "step": 8712 }, { "epoch": 0.9601101928374656, "grad_norm": 17.904897689819336, "learning_rate": 4.000712974280119e-08, "loss": 0.4437, "step": 8713 }, { "epoch": 0.9602203856749312, "grad_norm": 7.5280022621154785, "learning_rate": 3.978669437579019e-08, "loss": 0.4471, "step": 8714 }, { "epoch": 0.9603305785123967, "grad_norm": 6.035186290740967, "learning_rate": 3.9566865550400566e-08, "loss": 0.3758, "step": 8715 }, { "epoch": 0.9604407713498623, "grad_norm": 5.6610918045043945, "learning_rate": 3.9347643293512485e-08, "loss": 0.4303, "step": 8716 }, { "epoch": 0.9605509641873278, "grad_norm": 5.898346900939941, "learning_rate": 3.9129027631932826e-08, "loss": 0.4415, "step": 8717 }, { "epoch": 0.9606611570247934, "grad_norm": 7.7965240478515625, "learning_rate": 3.89110185923941e-08, "loss": 0.362, "step": 8718 }, { "epoch": 0.960771349862259, "grad_norm": 4.916289329528809, "learning_rate": 3.869361620155554e-08, "loss": 0.4284, "step": 8719 }, { "epoch": 0.9608815426997245, "grad_norm": 8.657247543334961, "learning_rate": 3.847682048600088e-08, "loss": 0.4117, "step": 8720 }, { "epoch": 0.9609917355371901, "grad_norm": 4.531099319458008, "learning_rate": 3.826063147224002e-08, "loss": 0.4181, "step": 8721 }, { "epoch": 0.9611019283746557, "grad_norm": 6.3142523765563965, "learning_rate": 3.804504918670904e-08, "loss": 0.4041, "step": 8722 }, { "epoch": 0.9612121212121212, "grad_norm": 5.790463447570801, "learning_rate": 3.783007365576907e-08, "loss": 0.3302, "step": 8723 }, { "epoch": 0.9613223140495868, "grad_norm": 4.8118133544921875, "learning_rate": 3.7615704905708537e-08, "loss": 0.3303, "step": 8724 }, { "epoch": 0.9614325068870524, "grad_norm": 6.230093955993652, "learning_rate": 3.740194296274091e-08, "loss": 0.3689, "step": 8725 }, { "epoch": 0.9615426997245179, "grad_norm": 7.357944011688232, "learning_rate": 3.7188787853003614e-08, "loss": 0.4025, "step": 8726 }, { "epoch": 0.9616528925619835, "grad_norm": 5.621352672576904, "learning_rate": 3.697623960256358e-08, "loss": 0.3632, "step": 8727 }, { "epoch": 0.961763085399449, "grad_norm": 4.002316951751709, "learning_rate": 3.6764298237410014e-08, "loss": 0.377, "step": 8728 }, { "epoch": 0.9618732782369146, "grad_norm": 7.255180835723877, "learning_rate": 3.655296378346052e-08, "loss": 0.4069, "step": 8729 }, { "epoch": 0.9619834710743802, "grad_norm": 6.317800521850586, "learning_rate": 3.6342236266556085e-08, "loss": 0.3946, "step": 8730 }, { "epoch": 0.9620936639118457, "grad_norm": 12.31389331817627, "learning_rate": 3.613211571246611e-08, "loss": 0.388, "step": 8731 }, { "epoch": 0.9622038567493113, "grad_norm": 7.282164573669434, "learning_rate": 3.592260214688337e-08, "loss": 0.3554, "step": 8732 }, { "epoch": 0.9623140495867769, "grad_norm": 5.954392433166504, "learning_rate": 3.571369559542792e-08, "loss": 0.4262, "step": 8733 }, { "epoch": 0.9624242424242424, "grad_norm": 10.04466724395752, "learning_rate": 3.550539608364545e-08, "loss": 0.4081, "step": 8734 }, { "epoch": 0.962534435261708, "grad_norm": 5.788437843322754, "learning_rate": 3.529770363700613e-08, "loss": 0.3278, "step": 8735 }, { "epoch": 0.9626446280991735, "grad_norm": 7.020529270172119, "learning_rate": 3.5090618280907985e-08, "loss": 0.4064, "step": 8736 }, { "epoch": 0.9627548209366391, "grad_norm": 6.224966526031494, "learning_rate": 3.4884140040672975e-08, "loss": 0.3332, "step": 8737 }, { "epoch": 0.9628650137741047, "grad_norm": 5.927913188934326, "learning_rate": 3.4678268941549794e-08, "loss": 0.4083, "step": 8738 }, { "epoch": 0.9629752066115702, "grad_norm": 10.509246826171875, "learning_rate": 3.447300500871276e-08, "loss": 0.4871, "step": 8739 }, { "epoch": 0.9630853994490358, "grad_norm": 5.570017337799072, "learning_rate": 3.4268348267261776e-08, "loss": 0.4285, "step": 8740 }, { "epoch": 0.9631955922865014, "grad_norm": 10.697563171386719, "learning_rate": 3.406429874222239e-08, "loss": 0.3668, "step": 8741 }, { "epoch": 0.9633057851239669, "grad_norm": 4.3002424240112305, "learning_rate": 3.3860856458545754e-08, "loss": 0.3056, "step": 8742 }, { "epoch": 0.9634159779614325, "grad_norm": 5.99077033996582, "learning_rate": 3.3658021441109744e-08, "loss": 0.3781, "step": 8743 }, { "epoch": 0.963526170798898, "grad_norm": 7.563045024871826, "learning_rate": 3.345579371471674e-08, "loss": 0.4507, "step": 8744 }, { "epoch": 0.9636363636363636, "grad_norm": 7.128659725189209, "learning_rate": 3.325417330409586e-08, "loss": 0.3879, "step": 8745 }, { "epoch": 0.9637465564738292, "grad_norm": 5.0663909912109375, "learning_rate": 3.305316023390126e-08, "loss": 0.4553, "step": 8746 }, { "epoch": 0.9638567493112947, "grad_norm": 8.34847354888916, "learning_rate": 3.2852754528713285e-08, "loss": 0.4624, "step": 8747 }, { "epoch": 0.9639669421487603, "grad_norm": 6.316442966461182, "learning_rate": 3.265295621303788e-08, "loss": 0.3036, "step": 8748 }, { "epoch": 0.9640771349862259, "grad_norm": 6.613633155822754, "learning_rate": 3.245376531130551e-08, "loss": 0.3491, "step": 8749 }, { "epoch": 0.9641873278236914, "grad_norm": 6.894759178161621, "learning_rate": 3.2255181847875574e-08, "loss": 0.4662, "step": 8750 }, { "epoch": 0.964297520661157, "grad_norm": 7.324357032775879, "learning_rate": 3.205720584702976e-08, "loss": 0.3483, "step": 8751 }, { "epoch": 0.9644077134986226, "grad_norm": 5.790113925933838, "learning_rate": 3.1859837332976486e-08, "loss": 0.367, "step": 8752 }, { "epoch": 0.9645179063360881, "grad_norm": 6.1464715003967285, "learning_rate": 3.166307632985144e-08, "loss": 0.3834, "step": 8753 }, { "epoch": 0.9646280991735537, "grad_norm": 9.115376472473145, "learning_rate": 3.1466922861714266e-08, "loss": 0.4463, "step": 8754 }, { "epoch": 0.9647382920110192, "grad_norm": 10.096417427062988, "learning_rate": 3.127137695255078e-08, "loss": 0.3869, "step": 8755 }, { "epoch": 0.9648484848484848, "grad_norm": 5.323825836181641, "learning_rate": 3.107643862627241e-08, "loss": 0.4285, "step": 8756 }, { "epoch": 0.9649586776859504, "grad_norm": 7.107351303100586, "learning_rate": 3.0882107906717307e-08, "loss": 0.4017, "step": 8757 }, { "epoch": 0.965068870523416, "grad_norm": 9.90488338470459, "learning_rate": 3.0688384817647574e-08, "loss": 0.3584, "step": 8758 }, { "epoch": 0.9651790633608816, "grad_norm": 8.946452140808105, "learning_rate": 3.0495269382752046e-08, "loss": 0.3719, "step": 8759 }, { "epoch": 0.9652892561983472, "grad_norm": 8.5913667678833, "learning_rate": 3.0302761625645716e-08, "loss": 0.4018, "step": 8760 }, { "epoch": 0.9653994490358127, "grad_norm": 8.09288215637207, "learning_rate": 3.011086156986864e-08, "loss": 0.453, "step": 8761 }, { "epoch": 0.9655096418732783, "grad_norm": 5.370037078857422, "learning_rate": 2.991956923888539e-08, "loss": 0.3169, "step": 8762 }, { "epoch": 0.9656198347107438, "grad_norm": 5.912369251251221, "learning_rate": 2.9728884656088918e-08, "loss": 0.3846, "step": 8763 }, { "epoch": 0.9657300275482094, "grad_norm": 6.833310127258301, "learning_rate": 2.9538807844796124e-08, "loss": 0.3688, "step": 8764 }, { "epoch": 0.965840220385675, "grad_norm": 6.207882404327393, "learning_rate": 2.934933882824953e-08, "loss": 0.373, "step": 8765 }, { "epoch": 0.9659504132231405, "grad_norm": 4.079768657684326, "learning_rate": 2.916047762961782e-08, "loss": 0.438, "step": 8766 }, { "epoch": 0.9660606060606061, "grad_norm": 10.41829776763916, "learning_rate": 2.8972224271994755e-08, "loss": 0.4724, "step": 8767 }, { "epoch": 0.9661707988980717, "grad_norm": 7.421224594116211, "learning_rate": 2.8784578778400796e-08, "loss": 0.4165, "step": 8768 }, { "epoch": 0.9662809917355372, "grad_norm": 6.185629367828369, "learning_rate": 2.8597541171781483e-08, "loss": 0.4181, "step": 8769 }, { "epoch": 0.9663911845730028, "grad_norm": 5.071274757385254, "learning_rate": 2.8411111475007968e-08, "loss": 0.322, "step": 8770 }, { "epoch": 0.9665013774104683, "grad_norm": 5.575862884521484, "learning_rate": 2.8225289710876457e-08, "loss": 0.331, "step": 8771 }, { "epoch": 0.9666115702479339, "grad_norm": 6.873533248901367, "learning_rate": 2.8040075902109887e-08, "loss": 0.423, "step": 8772 }, { "epoch": 0.9667217630853995, "grad_norm": 5.264482021331787, "learning_rate": 2.785547007135736e-08, "loss": 0.3141, "step": 8773 }, { "epoch": 0.966831955922865, "grad_norm": 9.443242073059082, "learning_rate": 2.7671472241191376e-08, "loss": 0.3916, "step": 8774 }, { "epoch": 0.9669421487603306, "grad_norm": 8.357752799987793, "learning_rate": 2.7488082434111718e-08, "loss": 0.4151, "step": 8775 }, { "epoch": 0.9670523415977962, "grad_norm": 7.899710178375244, "learning_rate": 2.7305300672544334e-08, "loss": 0.3292, "step": 8776 }, { "epoch": 0.9671625344352617, "grad_norm": 6.588962554931641, "learning_rate": 2.7123126978839676e-08, "loss": 0.3844, "step": 8777 }, { "epoch": 0.9672727272727273, "grad_norm": 8.38394832611084, "learning_rate": 2.6941561375273818e-08, "loss": 0.4229, "step": 8778 }, { "epoch": 0.9673829201101929, "grad_norm": 13.35795783996582, "learning_rate": 2.6760603884048998e-08, "loss": 0.4688, "step": 8779 }, { "epoch": 0.9674931129476584, "grad_norm": 9.614317893981934, "learning_rate": 2.658025452729307e-08, "loss": 0.3995, "step": 8780 }, { "epoch": 0.967603305785124, "grad_norm": 7.807085990905762, "learning_rate": 2.6400513327059508e-08, "loss": 0.4585, "step": 8781 }, { "epoch": 0.9677134986225895, "grad_norm": 5.789939880371094, "learning_rate": 2.622138030532684e-08, "loss": 0.3661, "step": 8782 }, { "epoch": 0.9678236914600551, "grad_norm": 5.294877529144287, "learning_rate": 2.604285548399976e-08, "loss": 0.3886, "step": 8783 }, { "epoch": 0.9679338842975207, "grad_norm": 4.9963812828063965, "learning_rate": 2.5864938884909707e-08, "loss": 0.4206, "step": 8784 }, { "epoch": 0.9680440771349862, "grad_norm": 4.177475452423096, "learning_rate": 2.5687630529810935e-08, "loss": 0.3957, "step": 8785 }, { "epoch": 0.9681542699724518, "grad_norm": 7.786369323730469, "learning_rate": 2.5510930440385552e-08, "loss": 0.4177, "step": 8786 }, { "epoch": 0.9682644628099174, "grad_norm": 5.515124320983887, "learning_rate": 2.5334838638241268e-08, "loss": 0.3151, "step": 8787 }, { "epoch": 0.9683746556473829, "grad_norm": 8.18659496307373, "learning_rate": 2.5159355144909746e-08, "loss": 0.4789, "step": 8788 }, { "epoch": 0.9684848484848485, "grad_norm": 4.32504940032959, "learning_rate": 2.4984479981850494e-08, "loss": 0.3506, "step": 8789 }, { "epoch": 0.968595041322314, "grad_norm": 6.653140068054199, "learning_rate": 2.481021317044696e-08, "loss": 0.3292, "step": 8790 }, { "epoch": 0.9687052341597796, "grad_norm": 5.733898639678955, "learning_rate": 2.463655473200821e-08, "loss": 0.3428, "step": 8791 }, { "epoch": 0.9688154269972452, "grad_norm": 5.614349365234375, "learning_rate": 2.4463504687770035e-08, "loss": 0.3711, "step": 8792 }, { "epoch": 0.9689256198347107, "grad_norm": 6.361004829406738, "learning_rate": 2.4291063058893283e-08, "loss": 0.4352, "step": 8793 }, { "epoch": 0.9690358126721763, "grad_norm": 5.7507100105285645, "learning_rate": 2.4119229866463866e-08, "loss": 0.333, "step": 8794 }, { "epoch": 0.9691460055096419, "grad_norm": 8.004241943359375, "learning_rate": 2.3948005131494422e-08, "loss": 0.4431, "step": 8795 }, { "epoch": 0.9692561983471074, "grad_norm": 5.039779186248779, "learning_rate": 2.3777388874922092e-08, "loss": 0.3062, "step": 8796 }, { "epoch": 0.969366391184573, "grad_norm": 10.546436309814453, "learning_rate": 2.3607381117610184e-08, "loss": 0.4645, "step": 8797 }, { "epoch": 0.9694765840220385, "grad_norm": 4.041770935058594, "learning_rate": 2.3437981880347628e-08, "loss": 0.3388, "step": 8798 }, { "epoch": 0.9695867768595041, "grad_norm": 4.304748058319092, "learning_rate": 2.3269191183848405e-08, "loss": 0.3828, "step": 8799 }, { "epoch": 0.9696969696969697, "grad_norm": 5.550631046295166, "learning_rate": 2.3101009048752678e-08, "loss": 0.3803, "step": 8800 }, { "epoch": 0.9698071625344352, "grad_norm": 7.604842662811279, "learning_rate": 2.2933435495626212e-08, "loss": 0.3725, "step": 8801 }, { "epoch": 0.9699173553719008, "grad_norm": 5.92302131652832, "learning_rate": 2.276647054495984e-08, "loss": 0.415, "step": 8802 }, { "epoch": 0.9700275482093664, "grad_norm": 5.738622665405273, "learning_rate": 2.2600114217170566e-08, "loss": 0.3844, "step": 8803 }, { "epoch": 0.9701377410468319, "grad_norm": 5.6382646560668945, "learning_rate": 2.243436653260045e-08, "loss": 0.4277, "step": 8804 }, { "epoch": 0.9702479338842975, "grad_norm": 5.22743558883667, "learning_rate": 2.2269227511517167e-08, "loss": 0.3433, "step": 8805 }, { "epoch": 0.9703581267217631, "grad_norm": 4.088083267211914, "learning_rate": 2.210469717411401e-08, "loss": 0.2806, "step": 8806 }, { "epoch": 0.9704683195592286, "grad_norm": 6.981813907623291, "learning_rate": 2.1940775540510996e-08, "loss": 0.4285, "step": 8807 }, { "epoch": 0.9705785123966942, "grad_norm": 11.983031272888184, "learning_rate": 2.1777462630751533e-08, "loss": 0.4332, "step": 8808 }, { "epoch": 0.9706887052341597, "grad_norm": 12.438445091247559, "learning_rate": 2.1614758464806316e-08, "loss": 0.4587, "step": 8809 }, { "epoch": 0.9707988980716253, "grad_norm": 8.856440544128418, "learning_rate": 2.14526630625711e-08, "loss": 0.5133, "step": 8810 }, { "epoch": 0.9709090909090909, "grad_norm": 6.0965399742126465, "learning_rate": 2.1291176443866134e-08, "loss": 0.3325, "step": 8811 }, { "epoch": 0.9710192837465564, "grad_norm": 9.841634750366211, "learning_rate": 2.113029862844007e-08, "loss": 0.4037, "step": 8812 }, { "epoch": 0.971129476584022, "grad_norm": 5.323287487030029, "learning_rate": 2.097002963596384e-08, "loss": 0.3911, "step": 8813 }, { "epoch": 0.9712396694214877, "grad_norm": 6.11491060256958, "learning_rate": 2.0810369486035652e-08, "loss": 0.3484, "step": 8814 }, { "epoch": 0.9713498622589531, "grad_norm": 8.195962905883789, "learning_rate": 2.0651318198178783e-08, "loss": 0.3972, "step": 8815 }, { "epoch": 0.9714600550964188, "grad_norm": 9.117520332336426, "learning_rate": 2.049287579184267e-08, "loss": 0.3708, "step": 8816 }, { "epoch": 0.9715702479338842, "grad_norm": 7.001363277435303, "learning_rate": 2.0335042286401817e-08, "loss": 0.3454, "step": 8817 }, { "epoch": 0.9716804407713499, "grad_norm": 8.596649169921875, "learning_rate": 2.0177817701156342e-08, "loss": 0.387, "step": 8818 }, { "epoch": 0.9717906336088155, "grad_norm": 3.672086477279663, "learning_rate": 2.0021202055331424e-08, "loss": 0.3942, "step": 8819 }, { "epoch": 0.971900826446281, "grad_norm": 8.945923805236816, "learning_rate": 1.986519536807896e-08, "loss": 0.3429, "step": 8820 }, { "epoch": 0.9720110192837466, "grad_norm": 5.04570198059082, "learning_rate": 1.9709797658474805e-08, "loss": 0.4346, "step": 8821 }, { "epoch": 0.9721212121212122, "grad_norm": 10.192368507385254, "learning_rate": 1.9555008945521536e-08, "loss": 0.4353, "step": 8822 }, { "epoch": 0.9722314049586777, "grad_norm": 6.014553546905518, "learning_rate": 1.9400829248147902e-08, "loss": 0.4043, "step": 8823 }, { "epoch": 0.9723415977961433, "grad_norm": 13.046184539794922, "learning_rate": 1.9247258585205488e-08, "loss": 0.504, "step": 8824 }, { "epoch": 0.9724517906336089, "grad_norm": 8.074289321899414, "learning_rate": 1.9094296975474268e-08, "loss": 0.3953, "step": 8825 }, { "epoch": 0.9725619834710744, "grad_norm": 7.343116283416748, "learning_rate": 1.894194443765873e-08, "loss": 0.4039, "step": 8826 }, { "epoch": 0.97267217630854, "grad_norm": 5.112274169921875, "learning_rate": 1.8790200990387863e-08, "loss": 0.4343, "step": 8827 }, { "epoch": 0.9727823691460055, "grad_norm": 4.739718437194824, "learning_rate": 1.8639066652217375e-08, "loss": 0.387, "step": 8828 }, { "epoch": 0.9728925619834711, "grad_norm": 4.407110691070557, "learning_rate": 1.8488541441628593e-08, "loss": 0.2905, "step": 8829 }, { "epoch": 0.9730027548209367, "grad_norm": 10.422987937927246, "learning_rate": 1.8338625377027907e-08, "loss": 0.5067, "step": 8830 }, { "epoch": 0.9731129476584022, "grad_norm": 7.39470911026001, "learning_rate": 1.8189318476746764e-08, "loss": 0.3879, "step": 8831 }, { "epoch": 0.9732231404958678, "grad_norm": 9.564068794250488, "learning_rate": 1.804062075904278e-08, "loss": 0.4198, "step": 8832 }, { "epoch": 0.9733333333333334, "grad_norm": 5.889505863189697, "learning_rate": 1.7892532242099192e-08, "loss": 0.3871, "step": 8833 }, { "epoch": 0.9734435261707989, "grad_norm": 9.963391304016113, "learning_rate": 1.774505294402429e-08, "loss": 0.3672, "step": 8834 }, { "epoch": 0.9735537190082645, "grad_norm": 9.059921264648438, "learning_rate": 1.7598182882851977e-08, "loss": 0.3408, "step": 8835 }, { "epoch": 0.97366391184573, "grad_norm": 6.291049480438232, "learning_rate": 1.7451922076541783e-08, "loss": 0.3148, "step": 8836 }, { "epoch": 0.9737741046831956, "grad_norm": 5.076037883758545, "learning_rate": 1.7306270542978288e-08, "loss": 0.4341, "step": 8837 }, { "epoch": 0.9738842975206612, "grad_norm": 6.406747341156006, "learning_rate": 1.7161228299973354e-08, "loss": 0.3532, "step": 8838 }, { "epoch": 0.9739944903581267, "grad_norm": 6.340034008026123, "learning_rate": 1.701679536526113e-08, "loss": 0.4543, "step": 8839 }, { "epoch": 0.9741046831955923, "grad_norm": 7.013032913208008, "learning_rate": 1.6872971756504153e-08, "loss": 0.3869, "step": 8840 }, { "epoch": 0.9742148760330579, "grad_norm": 5.976755142211914, "learning_rate": 1.672975749128891e-08, "loss": 0.4204, "step": 8841 }, { "epoch": 0.9743250688705234, "grad_norm": 6.865273952484131, "learning_rate": 1.6587152587128064e-08, "loss": 0.4589, "step": 8842 }, { "epoch": 0.974435261707989, "grad_norm": 7.500924587249756, "learning_rate": 1.6445157061459883e-08, "loss": 0.4156, "step": 8843 }, { "epoch": 0.9745454545454545, "grad_norm": 15.058416366577148, "learning_rate": 1.6303770931647146e-08, "loss": 0.3318, "step": 8844 }, { "epoch": 0.9746556473829201, "grad_norm": 5.671866416931152, "learning_rate": 1.61629942149788e-08, "loss": 0.3541, "step": 8845 }, { "epoch": 0.9747658402203857, "grad_norm": 5.84348726272583, "learning_rate": 1.6022826928669964e-08, "loss": 0.4222, "step": 8846 }, { "epoch": 0.9748760330578512, "grad_norm": 5.512253284454346, "learning_rate": 1.5883269089859155e-08, "loss": 0.3553, "step": 8847 }, { "epoch": 0.9749862258953168, "grad_norm": 6.648393154144287, "learning_rate": 1.5744320715612716e-08, "loss": 0.4339, "step": 8848 }, { "epoch": 0.9750964187327824, "grad_norm": 5.175326347351074, "learning_rate": 1.56059818229215e-08, "loss": 0.3948, "step": 8849 }, { "epoch": 0.9752066115702479, "grad_norm": 7.846436977386475, "learning_rate": 1.5468252428701425e-08, "loss": 0.3716, "step": 8850 }, { "epoch": 0.9753168044077135, "grad_norm": 4.911636829376221, "learning_rate": 1.5331132549794014e-08, "loss": 0.3354, "step": 8851 }, { "epoch": 0.9754269972451791, "grad_norm": 8.698405265808105, "learning_rate": 1.5194622202966968e-08, "loss": 0.3774, "step": 8852 }, { "epoch": 0.9755371900826446, "grad_norm": 5.8944501876831055, "learning_rate": 1.505872140491249e-08, "loss": 0.4486, "step": 8853 }, { "epoch": 0.9756473829201102, "grad_norm": 6.094091415405273, "learning_rate": 1.4923430172248953e-08, "loss": 0.4023, "step": 8854 }, { "epoch": 0.9757575757575757, "grad_norm": 10.681119918823242, "learning_rate": 1.4788748521519792e-08, "loss": 0.4874, "step": 8855 }, { "epoch": 0.9758677685950413, "grad_norm": 5.61772346496582, "learning_rate": 1.4654676469194607e-08, "loss": 0.3438, "step": 8856 }, { "epoch": 0.9759779614325069, "grad_norm": 4.298542499542236, "learning_rate": 1.4521214031666952e-08, "loss": 0.374, "step": 8857 }, { "epoch": 0.9760881542699724, "grad_norm": 6.038506031036377, "learning_rate": 1.4388361225257663e-08, "loss": 0.3459, "step": 8858 }, { "epoch": 0.976198347107438, "grad_norm": 5.182193756103516, "learning_rate": 1.4256118066212077e-08, "loss": 0.3743, "step": 8859 }, { "epoch": 0.9763085399449036, "grad_norm": 8.860308647155762, "learning_rate": 1.4124484570700591e-08, "loss": 0.4173, "step": 8860 }, { "epoch": 0.9764187327823691, "grad_norm": 7.584855556488037, "learning_rate": 1.3993460754819777e-08, "loss": 0.3871, "step": 8861 }, { "epoch": 0.9765289256198347, "grad_norm": 10.860026359558105, "learning_rate": 1.3863046634591815e-08, "loss": 0.3235, "step": 8862 }, { "epoch": 0.9766391184573002, "grad_norm": 5.567005634307861, "learning_rate": 1.373324222596284e-08, "loss": 0.3589, "step": 8863 }, { "epoch": 0.9767493112947658, "grad_norm": 8.486008644104004, "learning_rate": 1.360404754480682e-08, "loss": 0.4191, "step": 8864 }, { "epoch": 0.9768595041322314, "grad_norm": 10.701818466186523, "learning_rate": 1.347546260692112e-08, "loss": 0.5152, "step": 8865 }, { "epoch": 0.9769696969696969, "grad_norm": 10.07162857055664, "learning_rate": 1.3347487428029272e-08, "loss": 0.4344, "step": 8866 }, { "epoch": 0.9770798898071625, "grad_norm": 6.04201078414917, "learning_rate": 1.3220122023779869e-08, "loss": 0.3542, "step": 8867 }, { "epoch": 0.9771900826446281, "grad_norm": 8.38823127746582, "learning_rate": 1.3093366409748232e-08, "loss": 0.4219, "step": 8868 }, { "epoch": 0.9773002754820936, "grad_norm": 8.828341484069824, "learning_rate": 1.2967220601434183e-08, "loss": 0.4607, "step": 8869 }, { "epoch": 0.9774104683195592, "grad_norm": 8.483080863952637, "learning_rate": 1.2841684614262052e-08, "loss": 0.4064, "step": 8870 }, { "epoch": 0.9775206611570247, "grad_norm": 5.314237594604492, "learning_rate": 1.2716758463583444e-08, "loss": 0.3547, "step": 8871 }, { "epoch": 0.9776308539944903, "grad_norm": 5.820374011993408, "learning_rate": 1.2592442164673923e-08, "loss": 0.3678, "step": 8872 }, { "epoch": 0.977741046831956, "grad_norm": 5.87931489944458, "learning_rate": 1.2468735732735215e-08, "loss": 0.3456, "step": 8873 }, { "epoch": 0.9778512396694214, "grad_norm": 4.359767913818359, "learning_rate": 1.2345639182894664e-08, "loss": 0.4238, "step": 8874 }, { "epoch": 0.977961432506887, "grad_norm": 6.766221523284912, "learning_rate": 1.2223152530204118e-08, "loss": 0.3743, "step": 8875 }, { "epoch": 0.9780716253443527, "grad_norm": 8.950994491577148, "learning_rate": 1.2101275789642152e-08, "loss": 0.4784, "step": 8876 }, { "epoch": 0.9781818181818182, "grad_norm": 5.21338415145874, "learning_rate": 1.198000897611129e-08, "loss": 0.398, "step": 8877 }, { "epoch": 0.9782920110192838, "grad_norm": 8.412769317626953, "learning_rate": 1.1859352104440225e-08, "loss": 0.4103, "step": 8878 }, { "epoch": 0.9784022038567494, "grad_norm": 5.464070796966553, "learning_rate": 1.173930518938382e-08, "loss": 0.3167, "step": 8879 }, { "epoch": 0.9785123966942149, "grad_norm": 6.9967803955078125, "learning_rate": 1.161986824562089e-08, "loss": 0.3065, "step": 8880 }, { "epoch": 0.9786225895316805, "grad_norm": 11.746569633483887, "learning_rate": 1.150104128775642e-08, "loss": 0.3764, "step": 8881 }, { "epoch": 0.978732782369146, "grad_norm": 5.9719648361206055, "learning_rate": 1.1382824330321007e-08, "loss": 0.3483, "step": 8882 }, { "epoch": 0.9788429752066116, "grad_norm": 5.727097511291504, "learning_rate": 1.126521738777031e-08, "loss": 0.3614, "step": 8883 }, { "epoch": 0.9789531680440772, "grad_norm": 6.904728412628174, "learning_rate": 1.1148220474485049e-08, "loss": 0.3706, "step": 8884 }, { "epoch": 0.9790633608815427, "grad_norm": 5.867455959320068, "learning_rate": 1.1031833604772113e-08, "loss": 0.3982, "step": 8885 }, { "epoch": 0.9791735537190083, "grad_norm": 11.182619094848633, "learning_rate": 1.091605679286345e-08, "loss": 0.5638, "step": 8886 }, { "epoch": 0.9792837465564739, "grad_norm": 5.756790637969971, "learning_rate": 1.0800890052916623e-08, "loss": 0.3807, "step": 8887 }, { "epoch": 0.9793939393939394, "grad_norm": 4.721381187438965, "learning_rate": 1.0686333399013704e-08, "loss": 0.394, "step": 8888 }, { "epoch": 0.979504132231405, "grad_norm": 5.011298179626465, "learning_rate": 1.0572386845163485e-08, "loss": 0.4153, "step": 8889 }, { "epoch": 0.9796143250688705, "grad_norm": 5.2296142578125, "learning_rate": 1.0459050405299265e-08, "loss": 0.3156, "step": 8890 }, { "epoch": 0.9797245179063361, "grad_norm": 5.576880931854248, "learning_rate": 1.0346324093279958e-08, "loss": 0.3834, "step": 8891 }, { "epoch": 0.9798347107438017, "grad_norm": 8.7191162109375, "learning_rate": 1.0234207922890094e-08, "loss": 0.4275, "step": 8892 }, { "epoch": 0.9799449035812672, "grad_norm": 6.978935241699219, "learning_rate": 1.012270190783926e-08, "loss": 0.4068, "step": 8893 }, { "epoch": 0.9800550964187328, "grad_norm": 6.131322383880615, "learning_rate": 1.0011806061762109e-08, "loss": 0.3177, "step": 8894 }, { "epoch": 0.9801652892561984, "grad_norm": 5.406473636627197, "learning_rate": 9.90152039822001e-09, "loss": 0.3968, "step": 8895 }, { "epoch": 0.9802754820936639, "grad_norm": 11.630650520324707, "learning_rate": 9.791844930697736e-09, "loss": 0.4068, "step": 8896 }, { "epoch": 0.9803856749311295, "grad_norm": 3.7398698329925537, "learning_rate": 9.68277967260789e-09, "loss": 0.3398, "step": 8897 }, { "epoch": 0.980495867768595, "grad_norm": 5.201544284820557, "learning_rate": 9.574324637285915e-09, "loss": 0.4375, "step": 8898 }, { "epoch": 0.9806060606060606, "grad_norm": 15.65612506866455, "learning_rate": 9.466479837994536e-09, "loss": 0.407, "step": 8899 }, { "epoch": 0.9807162534435262, "grad_norm": 5.909888744354248, "learning_rate": 9.359245287920981e-09, "loss": 0.4024, "step": 8900 }, { "epoch": 0.9808264462809917, "grad_norm": 5.173218727111816, "learning_rate": 9.252621000178097e-09, "loss": 0.4007, "step": 8901 }, { "epoch": 0.9809366391184573, "grad_norm": 4.32283353805542, "learning_rate": 9.146606987804341e-09, "loss": 0.3089, "step": 8902 }, { "epoch": 0.9810468319559229, "grad_norm": 10.846911430358887, "learning_rate": 9.041203263762122e-09, "loss": 0.4318, "step": 8903 }, { "epoch": 0.9811570247933884, "grad_norm": 6.127249717712402, "learning_rate": 8.936409840941685e-09, "loss": 0.4559, "step": 8904 }, { "epoch": 0.981267217630854, "grad_norm": 6.813745975494385, "learning_rate": 8.832226732156668e-09, "loss": 0.4043, "step": 8905 }, { "epoch": 0.9813774104683196, "grad_norm": 5.514586925506592, "learning_rate": 8.728653950146326e-09, "loss": 0.4176, "step": 8906 }, { "epoch": 0.9814876033057851, "grad_norm": 7.442978858947754, "learning_rate": 8.625691507576638e-09, "loss": 0.4868, "step": 8907 }, { "epoch": 0.9815977961432507, "grad_norm": 6.240289211273193, "learning_rate": 8.523339417037535e-09, "loss": 0.4179, "step": 8908 }, { "epoch": 0.9817079889807162, "grad_norm": 5.748809814453125, "learning_rate": 8.421597691044559e-09, "loss": 0.4019, "step": 8909 }, { "epoch": 0.9818181818181818, "grad_norm": 6.198007106781006, "learning_rate": 8.320466342038868e-09, "loss": 0.3816, "step": 8910 }, { "epoch": 0.9819283746556474, "grad_norm": 4.0680928230285645, "learning_rate": 8.21994538238724e-09, "loss": 0.3458, "step": 8911 }, { "epoch": 0.9820385674931129, "grad_norm": 7.397459030151367, "learning_rate": 8.120034824381506e-09, "loss": 0.4114, "step": 8912 }, { "epoch": 0.9821487603305785, "grad_norm": 5.5714430809021, "learning_rate": 8.020734680238562e-09, "loss": 0.3178, "step": 8913 }, { "epoch": 0.9822589531680441, "grad_norm": 5.759981632232666, "learning_rate": 7.922044962100916e-09, "loss": 0.3357, "step": 8914 }, { "epoch": 0.9823691460055096, "grad_norm": 8.685111045837402, "learning_rate": 7.823965682037249e-09, "loss": 0.4521, "step": 8915 }, { "epoch": 0.9824793388429752, "grad_norm": 6.009195327758789, "learning_rate": 7.726496852039633e-09, "loss": 0.3254, "step": 8916 }, { "epoch": 0.9825895316804407, "grad_norm": 6.0467143058776855, "learning_rate": 7.629638484027424e-09, "loss": 0.2981, "step": 8917 }, { "epoch": 0.9826997245179063, "grad_norm": 6.970085144042969, "learning_rate": 7.533390589845035e-09, "loss": 0.3373, "step": 8918 }, { "epoch": 0.9828099173553719, "grad_norm": 8.984983444213867, "learning_rate": 7.437753181260831e-09, "loss": 0.3906, "step": 8919 }, { "epoch": 0.9829201101928374, "grad_norm": 12.746138572692871, "learning_rate": 7.342726269969902e-09, "loss": 0.513, "step": 8920 }, { "epoch": 0.983030303030303, "grad_norm": 7.483966827392578, "learning_rate": 7.2483098675923955e-09, "loss": 0.4206, "step": 8921 }, { "epoch": 0.9831404958677686, "grad_norm": 11.717601776123047, "learning_rate": 7.154503985673522e-09, "loss": 0.4531, "step": 8922 }, { "epoch": 0.9832506887052341, "grad_norm": 5.9319353103637695, "learning_rate": 7.061308635684105e-09, "loss": 0.3361, "step": 8923 }, { "epoch": 0.9833608815426997, "grad_norm": 8.37967586517334, "learning_rate": 6.96872382902003e-09, "loss": 0.3765, "step": 8924 }, { "epoch": 0.9834710743801653, "grad_norm": 5.437576770782471, "learning_rate": 6.876749577002795e-09, "loss": 0.3946, "step": 8925 }, { "epoch": 0.9835812672176308, "grad_norm": 4.543797492980957, "learning_rate": 6.7853858908784046e-09, "loss": 0.4174, "step": 8926 }, { "epoch": 0.9836914600550964, "grad_norm": 5.815567970275879, "learning_rate": 6.694632781820698e-09, "loss": 0.4073, "step": 8927 }, { "epoch": 0.9838016528925619, "grad_norm": 9.523588180541992, "learning_rate": 6.604490260925245e-09, "loss": 0.4364, "step": 8928 }, { "epoch": 0.9839118457300275, "grad_norm": 4.060024261474609, "learning_rate": 6.514958339216004e-09, "loss": 0.3589, "step": 8929 }, { "epoch": 0.9840220385674932, "grad_norm": 6.048622131347656, "learning_rate": 6.4260370276408855e-09, "loss": 0.3405, "step": 8930 }, { "epoch": 0.9841322314049586, "grad_norm": 5.9720282554626465, "learning_rate": 6.3377263370728585e-09, "loss": 0.4164, "step": 8931 }, { "epoch": 0.9842424242424243, "grad_norm": 5.029387474060059, "learning_rate": 6.250026278310506e-09, "loss": 0.3852, "step": 8932 }, { "epoch": 0.9843526170798899, "grad_norm": 4.894454479217529, "learning_rate": 6.162936862078583e-09, "loss": 0.337, "step": 8933 }, { "epoch": 0.9844628099173554, "grad_norm": 6.65284538269043, "learning_rate": 6.07645809902635e-09, "loss": 0.3496, "step": 8934 }, { "epoch": 0.984573002754821, "grad_norm": 5.625543594360352, "learning_rate": 5.9905899997281244e-09, "loss": 0.3761, "step": 8935 }, { "epoch": 0.9846831955922865, "grad_norm": 11.028755187988281, "learning_rate": 5.905332574684397e-09, "loss": 0.5301, "step": 8936 }, { "epoch": 0.9847933884297521, "grad_norm": 8.856451988220215, "learning_rate": 5.820685834320161e-09, "loss": 0.4884, "step": 8937 }, { "epoch": 0.9849035812672177, "grad_norm": 8.534035682678223, "learning_rate": 5.736649788986581e-09, "loss": 0.4377, "step": 8938 }, { "epoch": 0.9850137741046832, "grad_norm": 6.711327075958252, "learning_rate": 5.653224448959882e-09, "loss": 0.3666, "step": 8939 }, { "epoch": 0.9851239669421488, "grad_norm": 7.608486175537109, "learning_rate": 5.570409824440237e-09, "loss": 0.3626, "step": 8940 }, { "epoch": 0.9852341597796144, "grad_norm": 6.2303786277771, "learning_rate": 5.488205925555656e-09, "loss": 0.342, "step": 8941 }, { "epoch": 0.9853443526170799, "grad_norm": 9.190738677978516, "learning_rate": 5.406612762357544e-09, "loss": 0.445, "step": 8942 }, { "epoch": 0.9854545454545455, "grad_norm": 6.4787278175354, "learning_rate": 5.325630344823474e-09, "loss": 0.4057, "step": 8943 }, { "epoch": 0.985564738292011, "grad_norm": 5.341314792633057, "learning_rate": 5.24525868285608e-09, "loss": 0.4303, "step": 8944 }, { "epoch": 0.9856749311294766, "grad_norm": 6.1412129402160645, "learning_rate": 5.1654977862825025e-09, "loss": 0.4085, "step": 8945 }, { "epoch": 0.9857851239669422, "grad_norm": 5.3331732749938965, "learning_rate": 5.0863476648571605e-09, "loss": 0.3269, "step": 8946 }, { "epoch": 0.9858953168044077, "grad_norm": 6.190547466278076, "learning_rate": 5.007808328258423e-09, "loss": 0.4189, "step": 8947 }, { "epoch": 0.9860055096418733, "grad_norm": 13.57688045501709, "learning_rate": 4.9298797860891645e-09, "loss": 0.5056, "step": 8948 }, { "epoch": 0.9861157024793389, "grad_norm": 6.407979488372803, "learning_rate": 4.852562047879539e-09, "loss": 0.2966, "step": 8949 }, { "epoch": 0.9862258953168044, "grad_norm": 5.984018325805664, "learning_rate": 4.775855123084206e-09, "loss": 0.3656, "step": 8950 }, { "epoch": 0.98633608815427, "grad_norm": 8.00353717803955, "learning_rate": 4.699759021082328e-09, "loss": 0.4132, "step": 8951 }, { "epoch": 0.9864462809917356, "grad_norm": 6.895720958709717, "learning_rate": 4.624273751179797e-09, "loss": 0.4028, "step": 8952 }, { "epoch": 0.9865564738292011, "grad_norm": 9.572421073913574, "learning_rate": 4.549399322606451e-09, "loss": 0.4381, "step": 8953 }, { "epoch": 0.9866666666666667, "grad_norm": 5.052114963531494, "learning_rate": 4.4751357445177445e-09, "loss": 0.3868, "step": 8954 }, { "epoch": 0.9867768595041322, "grad_norm": 5.700187683105469, "learning_rate": 4.4014830259958565e-09, "loss": 0.4035, "step": 8955 }, { "epoch": 0.9868870523415978, "grad_norm": 4.689455509185791, "learning_rate": 4.328441176045806e-09, "loss": 0.3585, "step": 8956 }, { "epoch": 0.9869972451790634, "grad_norm": 5.007065296173096, "learning_rate": 4.256010203600447e-09, "loss": 0.3385, "step": 8957 }, { "epoch": 0.9871074380165289, "grad_norm": 8.176401138305664, "learning_rate": 4.184190117516029e-09, "loss": 0.4321, "step": 8958 }, { "epoch": 0.9872176308539945, "grad_norm": 9.480592727661133, "learning_rate": 4.112980926574972e-09, "loss": 0.37, "step": 8959 }, { "epoch": 0.9873278236914601, "grad_norm": 5.8697733879089355, "learning_rate": 4.04238263948531e-09, "loss": 0.3528, "step": 8960 }, { "epoch": 0.9874380165289256, "grad_norm": 8.033915519714355, "learning_rate": 3.972395264879026e-09, "loss": 0.3632, "step": 8961 }, { "epoch": 0.9875482093663912, "grad_norm": 5.839441776275635, "learning_rate": 3.9030188113142745e-09, "loss": 0.3946, "step": 8962 }, { "epoch": 0.9876584022038567, "grad_norm": 7.490885257720947, "learning_rate": 3.83425328727538e-09, "loss": 0.3771, "step": 8963 }, { "epoch": 0.9877685950413223, "grad_norm": 5.09083890914917, "learning_rate": 3.766098701170617e-09, "loss": 0.3528, "step": 8964 }, { "epoch": 0.9878787878787879, "grad_norm": 4.408278942108154, "learning_rate": 3.69855506133443e-09, "loss": 0.3541, "step": 8965 }, { "epoch": 0.9879889807162534, "grad_norm": 13.632762908935547, "learning_rate": 3.6316223760252125e-09, "loss": 0.3984, "step": 8966 }, { "epoch": 0.988099173553719, "grad_norm": 10.03503704071045, "learning_rate": 3.5653006534280833e-09, "loss": 0.4507, "step": 8967 }, { "epoch": 0.9882093663911846, "grad_norm": 11.511576652526855, "learning_rate": 3.499589901653222e-09, "loss": 0.5408, "step": 8968 }, { "epoch": 0.9883195592286501, "grad_norm": 5.304442405700684, "learning_rate": 3.4344901287353126e-09, "loss": 0.3568, "step": 8969 }, { "epoch": 0.9884297520661157, "grad_norm": 68.96772766113281, "learning_rate": 3.3700013426352096e-09, "loss": 0.4214, "step": 8970 }, { "epoch": 0.9885399449035812, "grad_norm": 6.820242404937744, "learning_rate": 3.3061235512388267e-09, "loss": 0.3921, "step": 8971 }, { "epoch": 0.9886501377410468, "grad_norm": 4.5322442054748535, "learning_rate": 3.242856762356583e-09, "loss": 0.4278, "step": 8972 }, { "epoch": 0.9887603305785124, "grad_norm": 8.196949005126953, "learning_rate": 3.180200983725623e-09, "loss": 0.4112, "step": 8973 }, { "epoch": 0.9888705234159779, "grad_norm": 8.82950496673584, "learning_rate": 3.1181562230070406e-09, "loss": 0.5031, "step": 8974 }, { "epoch": 0.9889807162534435, "grad_norm": 7.549552917480469, "learning_rate": 3.0567224877875445e-09, "loss": 0.4718, "step": 8975 }, { "epoch": 0.9890909090909091, "grad_norm": 7.072907447814941, "learning_rate": 2.9958997855805693e-09, "loss": 0.3899, "step": 8976 }, { "epoch": 0.9892011019283746, "grad_norm": 4.553850173950195, "learning_rate": 2.935688123821834e-09, "loss": 0.3323, "step": 8977 }, { "epoch": 0.9893112947658402, "grad_norm": 6.005707263946533, "learning_rate": 2.876087509875447e-09, "loss": 0.3726, "step": 8978 }, { "epoch": 0.9894214876033058, "grad_norm": 6.978837013244629, "learning_rate": 2.8170979510289133e-09, "loss": 0.3745, "step": 8979 }, { "epoch": 0.9895316804407713, "grad_norm": 15.969650268554688, "learning_rate": 2.758719454495906e-09, "loss": 0.3653, "step": 8980 }, { "epoch": 0.9896418732782369, "grad_norm": 11.831104278564453, "learning_rate": 2.7009520274146052e-09, "loss": 0.4508, "step": 8981 }, { "epoch": 0.9897520661157024, "grad_norm": 5.531377792358398, "learning_rate": 2.643795676848804e-09, "loss": 0.4591, "step": 8982 }, { "epoch": 0.989862258953168, "grad_norm": 7.935631275177002, "learning_rate": 2.5872504097884664e-09, "loss": 0.3249, "step": 8983 }, { "epoch": 0.9899724517906336, "grad_norm": 7.079875469207764, "learning_rate": 2.5313162331469522e-09, "loss": 0.4057, "step": 8984 }, { "epoch": 0.9900826446280991, "grad_norm": 5.866529941558838, "learning_rate": 2.4759931537648998e-09, "loss": 0.4286, "step": 8985 }, { "epoch": 0.9901928374655647, "grad_norm": 4.35883092880249, "learning_rate": 2.4212811784063427e-09, "loss": 0.4128, "step": 8986 }, { "epoch": 0.9903030303030304, "grad_norm": 8.639993667602539, "learning_rate": 2.367180313762596e-09, "loss": 0.4323, "step": 8987 }, { "epoch": 0.9904132231404958, "grad_norm": 7.270481586456299, "learning_rate": 2.3136905664483676e-09, "loss": 0.3641, "step": 8988 }, { "epoch": 0.9905234159779615, "grad_norm": 5.585358142852783, "learning_rate": 2.2608119430045373e-09, "loss": 0.4149, "step": 8989 }, { "epoch": 0.990633608815427, "grad_norm": 5.513559341430664, "learning_rate": 2.2085444498975984e-09, "loss": 0.4228, "step": 8990 }, { "epoch": 0.9907438016528926, "grad_norm": 9.119502067565918, "learning_rate": 2.156888093519105e-09, "loss": 0.4291, "step": 8991 }, { "epoch": 0.9908539944903582, "grad_norm": 5.719584941864014, "learning_rate": 2.105842880184561e-09, "loss": 0.3416, "step": 8992 }, { "epoch": 0.9909641873278237, "grad_norm": 5.899689674377441, "learning_rate": 2.0554088161367503e-09, "loss": 0.3913, "step": 8993 }, { "epoch": 0.9910743801652893, "grad_norm": 14.344022750854492, "learning_rate": 2.005585907542962e-09, "loss": 0.3655, "step": 8994 }, { "epoch": 0.9911845730027549, "grad_norm": 6.823244571685791, "learning_rate": 1.9563741604949893e-09, "loss": 0.3427, "step": 8995 }, { "epoch": 0.9912947658402204, "grad_norm": 4.921267509460449, "learning_rate": 1.9077735810107967e-09, "loss": 0.3867, "step": 8996 }, { "epoch": 0.991404958677686, "grad_norm": 5.286651134490967, "learning_rate": 1.8597841750328528e-09, "loss": 0.4136, "step": 8997 }, { "epoch": 0.9915151515151515, "grad_norm": 8.979911804199219, "learning_rate": 1.8124059484303513e-09, "loss": 0.4196, "step": 8998 }, { "epoch": 0.9916253443526171, "grad_norm": 5.602459907531738, "learning_rate": 1.7656389069958812e-09, "loss": 0.4028, "step": 8999 }, { "epoch": 0.9917355371900827, "grad_norm": 7.817157745361328, "learning_rate": 1.7194830564487564e-09, "loss": 0.3055, "step": 9000 }, { "epoch": 0.9918457300275482, "grad_norm": 6.696260929107666, "learning_rate": 1.6739384024327953e-09, "loss": 0.3815, "step": 9001 }, { "epoch": 0.9919559228650138, "grad_norm": 5.939305782318115, "learning_rate": 1.629004950516877e-09, "loss": 0.3609, "step": 9002 }, { "epoch": 0.9920661157024794, "grad_norm": 5.6286797523498535, "learning_rate": 1.5846827061960501e-09, "loss": 0.3164, "step": 9003 }, { "epoch": 0.9921763085399449, "grad_norm": 7.445435047149658, "learning_rate": 1.5409716748898684e-09, "loss": 0.4076, "step": 9004 }, { "epoch": 0.9922865013774105, "grad_norm": 4.20958137512207, "learning_rate": 1.4978718619435007e-09, "loss": 0.3721, "step": 9005 }, { "epoch": 0.9923966942148761, "grad_norm": 5.760605812072754, "learning_rate": 1.4553832726271755e-09, "loss": 0.3617, "step": 9006 }, { "epoch": 0.9925068870523416, "grad_norm": 4.79775333404541, "learning_rate": 1.4135059121361815e-09, "loss": 0.3678, "step": 9007 }, { "epoch": 0.9926170798898072, "grad_norm": 5.942553997039795, "learning_rate": 1.3722397855919778e-09, "loss": 0.4377, "step": 9008 }, { "epoch": 0.9927272727272727, "grad_norm": 8.425210952758789, "learning_rate": 1.3315848980399726e-09, "loss": 0.4446, "step": 9009 }, { "epoch": 0.9928374655647383, "grad_norm": 7.769210338592529, "learning_rate": 1.2915412544517447e-09, "loss": 0.4648, "step": 9010 }, { "epoch": 0.9929476584022039, "grad_norm": 4.9520134925842285, "learning_rate": 1.2521088597239328e-09, "loss": 0.3496, "step": 9011 }, { "epoch": 0.9930578512396694, "grad_norm": 5.601205348968506, "learning_rate": 1.2132877186787906e-09, "loss": 0.3704, "step": 9012 }, { "epoch": 0.993168044077135, "grad_norm": 7.059554576873779, "learning_rate": 1.1750778360625214e-09, "loss": 0.3442, "step": 9013 }, { "epoch": 0.9932782369146006, "grad_norm": 5.612921714782715, "learning_rate": 1.1374792165486093e-09, "loss": 0.3001, "step": 9014 }, { "epoch": 0.9933884297520661, "grad_norm": 9.377395629882812, "learning_rate": 1.1004918647333773e-09, "loss": 0.4434, "step": 9015 }, { "epoch": 0.9934986225895317, "grad_norm": 5.162177562713623, "learning_rate": 1.0641157851404294e-09, "loss": 0.3526, "step": 9016 }, { "epoch": 0.9936088154269972, "grad_norm": 6.366061210632324, "learning_rate": 1.028350982217874e-09, "loss": 0.367, "step": 9017 }, { "epoch": 0.9937190082644628, "grad_norm": 6.468604564666748, "learning_rate": 9.931974603394345e-10, "loss": 0.3281, "step": 9018 }, { "epoch": 0.9938292011019284, "grad_norm": 5.354187965393066, "learning_rate": 9.586552238027847e-10, "loss": 0.3617, "step": 9019 }, { "epoch": 0.9939393939393939, "grad_norm": 8.523183822631836, "learning_rate": 9.247242768323228e-10, "loss": 0.469, "step": 9020 }, { "epoch": 0.9940495867768595, "grad_norm": 7.622811317443848, "learning_rate": 8.914046235775076e-10, "loss": 0.3906, "step": 9021 }, { "epoch": 0.9941597796143251, "grad_norm": 7.261738300323486, "learning_rate": 8.586962681117472e-10, "loss": 0.4557, "step": 9022 }, { "epoch": 0.9942699724517906, "grad_norm": 5.034419059753418, "learning_rate": 8.265992144357304e-10, "loss": 0.356, "step": 9023 }, { "epoch": 0.9943801652892562, "grad_norm": 5.880802631378174, "learning_rate": 7.951134664740956e-10, "loss": 0.3586, "step": 9024 }, { "epoch": 0.9944903581267218, "grad_norm": 11.561360359191895, "learning_rate": 7.642390280759859e-10, "loss": 0.3061, "step": 9025 }, { "epoch": 0.9946005509641873, "grad_norm": 7.9841766357421875, "learning_rate": 7.339759030183802e-10, "loss": 0.3877, "step": 9026 }, { "epoch": 0.9947107438016529, "grad_norm": 8.438712120056152, "learning_rate": 7.043240950005414e-10, "loss": 0.3889, "step": 9027 }, { "epoch": 0.9948209366391184, "grad_norm": 12.29823112487793, "learning_rate": 6.752836076484581e-10, "loss": 0.4049, "step": 9028 }, { "epoch": 0.994931129476584, "grad_norm": 8.459811210632324, "learning_rate": 6.468544445142888e-10, "loss": 0.4202, "step": 9029 }, { "epoch": 0.9950413223140496, "grad_norm": 7.202066421508789, "learning_rate": 6.190366090735866e-10, "loss": 0.3433, "step": 9030 }, { "epoch": 0.9951515151515151, "grad_norm": 5.43256950378418, "learning_rate": 5.91830104728075e-10, "loss": 0.287, "step": 9031 }, { "epoch": 0.9952617079889807, "grad_norm": 6.569697380065918, "learning_rate": 5.652349348045372e-10, "loss": 0.3686, "step": 9032 }, { "epoch": 0.9953719008264463, "grad_norm": 5.745114326477051, "learning_rate": 5.392511025548164e-10, "loss": 0.3929, "step": 9033 }, { "epoch": 0.9954820936639118, "grad_norm": 5.564207553863525, "learning_rate": 5.13878611156926e-10, "loss": 0.3982, "step": 9034 }, { "epoch": 0.9955922865013774, "grad_norm": 4.55740213394165, "learning_rate": 4.891174637128294e-10, "loss": 0.3603, "step": 9035 }, { "epoch": 0.9957024793388429, "grad_norm": 6.228816986083984, "learning_rate": 4.6496766325121454e-10, "loss": 0.3726, "step": 9036 }, { "epoch": 0.9958126721763085, "grad_norm": 7.958637237548828, "learning_rate": 4.414292127241648e-10, "loss": 0.3639, "step": 9037 }, { "epoch": 0.9959228650137741, "grad_norm": 8.71097183227539, "learning_rate": 4.185021150099333e-10, "loss": 0.451, "step": 9038 }, { "epoch": 0.9960330578512396, "grad_norm": 4.552064418792725, "learning_rate": 3.9618637291349846e-10, "loss": 0.3394, "step": 9039 }, { "epoch": 0.9961432506887052, "grad_norm": 4.1845855712890625, "learning_rate": 3.7448198916212317e-10, "loss": 0.3896, "step": 9040 }, { "epoch": 0.9962534435261708, "grad_norm": 7.433754920959473, "learning_rate": 3.533889664103507e-10, "loss": 0.427, "step": 9041 }, { "epoch": 0.9963636363636363, "grad_norm": 14.062396049499512, "learning_rate": 3.329073072377842e-10, "loss": 0.4165, "step": 9042 }, { "epoch": 0.996473829201102, "grad_norm": 6.495471954345703, "learning_rate": 3.1303701414853174e-10, "loss": 0.3897, "step": 9043 }, { "epoch": 0.9965840220385674, "grad_norm": 5.049517631530762, "learning_rate": 2.9377808957231636e-10, "loss": 0.3929, "step": 9044 }, { "epoch": 0.996694214876033, "grad_norm": 5.869344234466553, "learning_rate": 2.7513053586447624e-10, "loss": 0.388, "step": 9045 }, { "epoch": 0.9968044077134987, "grad_norm": 8.759241104125977, "learning_rate": 2.570943553054095e-10, "loss": 0.3316, "step": 9046 }, { "epoch": 0.9969146005509641, "grad_norm": 5.056518077850342, "learning_rate": 2.3966955010001903e-10, "loss": 0.3831, "step": 9047 }, { "epoch": 0.9970247933884298, "grad_norm": 5.458576679229736, "learning_rate": 2.2285612237937793e-10, "loss": 0.3921, "step": 9048 }, { "epoch": 0.9971349862258954, "grad_norm": 6.452542781829834, "learning_rate": 2.0665407419961925e-10, "loss": 0.4179, "step": 9049 }, { "epoch": 0.9972451790633609, "grad_norm": 7.256555080413818, "learning_rate": 1.9106340754138086e-10, "loss": 0.357, "step": 9050 }, { "epoch": 0.9973553719008265, "grad_norm": 9.529255867004395, "learning_rate": 1.7608412431202592e-10, "loss": 0.439, "step": 9051 }, { "epoch": 0.9974655647382921, "grad_norm": 7.108154296875, "learning_rate": 1.6171622634231222e-10, "loss": 0.3922, "step": 9052 }, { "epoch": 0.9975757575757576, "grad_norm": 5.342614650726318, "learning_rate": 1.4795971538972276e-10, "loss": 0.4049, "step": 9053 }, { "epoch": 0.9976859504132232, "grad_norm": 5.520908355712891, "learning_rate": 1.3481459313624544e-10, "loss": 0.3641, "step": 9054 }, { "epoch": 0.9977961432506887, "grad_norm": 7.096041202545166, "learning_rate": 1.2228086118892812e-10, "loss": 0.4094, "step": 9055 }, { "epoch": 0.9979063360881543, "grad_norm": 5.964447021484375, "learning_rate": 1.1035852108154388e-10, "loss": 0.3797, "step": 9056 }, { "epoch": 0.9980165289256199, "grad_norm": 7.609488487243652, "learning_rate": 9.904757427070533e-11, "loss": 0.2776, "step": 9057 }, { "epoch": 0.9981267217630854, "grad_norm": 7.307365894317627, "learning_rate": 8.834802213975036e-11, "loss": 0.4681, "step": 9058 }, { "epoch": 0.998236914600551, "grad_norm": 18.94178009033203, "learning_rate": 7.825986599763191e-11, "loss": 0.5068, "step": 9059 }, { "epoch": 0.9983471074380166, "grad_norm": 12.599600791931152, "learning_rate": 6.878310707780777e-11, "loss": 0.3579, "step": 9060 }, { "epoch": 0.9984573002754821, "grad_norm": 6.1359639167785645, "learning_rate": 5.991774653879568e-11, "loss": 0.3525, "step": 9061 }, { "epoch": 0.9985674931129477, "grad_norm": 6.206957817077637, "learning_rate": 5.166378546472839e-11, "loss": 0.4174, "step": 9062 }, { "epoch": 0.9986776859504132, "grad_norm": 4.911308288574219, "learning_rate": 4.402122486479865e-11, "loss": 0.3078, "step": 9063 }, { "epoch": 0.9987878787878788, "grad_norm": 5.652713775634766, "learning_rate": 3.699006567381425e-11, "loss": 0.3946, "step": 9064 }, { "epoch": 0.9988980716253444, "grad_norm": 9.284574508666992, "learning_rate": 3.0570308751642906e-11, "loss": 0.4012, "step": 9065 }, { "epoch": 0.9990082644628099, "grad_norm": 10.509775161743164, "learning_rate": 2.4761954883212312e-11, "loss": 0.5294, "step": 9066 }, { "epoch": 0.9991184573002755, "grad_norm": 10.87788200378418, "learning_rate": 1.956500477851009e-11, "loss": 0.3956, "step": 9067 }, { "epoch": 0.9992286501377411, "grad_norm": 7.179844856262207, "learning_rate": 1.4979459073138913e-11, "loss": 0.3698, "step": 9068 }, { "epoch": 0.9993388429752066, "grad_norm": 6.903147220611572, "learning_rate": 1.1005318327761417e-11, "loss": 0.3662, "step": 9069 }, { "epoch": 0.9994490358126722, "grad_norm": 8.5108642578125, "learning_rate": 7.642583028655282e-12, "loss": 0.4146, "step": 9070 }, { "epoch": 0.9995592286501377, "grad_norm": 20.710254669189453, "learning_rate": 4.89125358715814e-12, "loss": 0.4132, "step": 9071 }, { "epoch": 0.9996694214876033, "grad_norm": 7.372518062591553, "learning_rate": 2.751330339112457e-12, "loss": 0.3995, "step": 9072 }, { "epoch": 0.9997796143250689, "grad_norm": 4.217400550842285, "learning_rate": 1.2228135465308654e-12, "loss": 0.3204, "step": 9073 }, { "epoch": 0.9998898071625344, "grad_norm": 35.320533752441406, "learning_rate": 3.057033959308342e-13, "loss": 0.4807, "step": 9074 }, { "epoch": 1.0, "grad_norm": 7.732905864715576, "learning_rate": 0.0, "loss": 0.4039, "step": 9075 }, { "epoch": 1.0, "step": 9075, "total_flos": 2.806207392566477e+18, "train_loss": 0.43036781519897715, "train_runtime": 19668.4772, "train_samples_per_second": 3.691, "train_steps_per_second": 0.461 } ], "logging_steps": 1, "max_steps": 9075, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9075, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.806207392566477e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }