{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.7748197448696617, "eval_steps": 500, "global_step": 3200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011092623405435386, "grad_norm": 80.64155578613281, "learning_rate": 3.6697247706422022e-06, "loss": 13.4594, "step": 2 }, { "epoch": 0.0022185246810870773, "grad_norm": 66.91927337646484, "learning_rate": 7.3394495412844045e-06, "loss": 13.8607, "step": 4 }, { "epoch": 0.0033277870216306157, "grad_norm": 63.7406005859375, "learning_rate": 1.1009174311926607e-05, "loss": 13.0338, "step": 6 }, { "epoch": 0.004437049362174155, "grad_norm": 61.71236801147461, "learning_rate": 1.4678899082568809e-05, "loss": 12.9429, "step": 8 }, { "epoch": 0.005546311702717693, "grad_norm": 63.84341812133789, "learning_rate": 1.834862385321101e-05, "loss": 12.5906, "step": 10 }, { "epoch": 0.0066555740432612314, "grad_norm": 44.9824333190918, "learning_rate": 2.2018348623853213e-05, "loss": 11.0436, "step": 12 }, { "epoch": 0.00776483638380477, "grad_norm": 34.28094482421875, "learning_rate": 2.5688073394495416e-05, "loss": 9.9942, "step": 14 }, { "epoch": 0.00887409872434831, "grad_norm": 28.417001724243164, "learning_rate": 2.9357798165137618e-05, "loss": 9.6041, "step": 16 }, { "epoch": 0.009983361064891847, "grad_norm": 26.446611404418945, "learning_rate": 3.302752293577982e-05, "loss": 8.4555, "step": 18 }, { "epoch": 0.011092623405435386, "grad_norm": 29.56342315673828, "learning_rate": 3.669724770642202e-05, "loss": 7.7889, "step": 20 }, { "epoch": 0.012201885745978924, "grad_norm": 62.20895004272461, "learning_rate": 4.036697247706422e-05, "loss": 6.0004, "step": 22 }, { "epoch": 0.013311148086522463, "grad_norm": 26.70697784423828, "learning_rate": 4.403669724770643e-05, "loss": 5.2102, "step": 24 }, { "epoch": 0.014420410427066, "grad_norm": 20.650150299072266, "learning_rate": 4.7706422018348626e-05, "loss": 3.786, "step": 26 }, { "epoch": 0.01552967276760954, "grad_norm": 18.51991081237793, "learning_rate": 5.137614678899083e-05, "loss": 2.7969, "step": 28 }, { "epoch": 0.016638935108153077, "grad_norm": 11.624157905578613, "learning_rate": 5.504587155963303e-05, "loss": 1.9075, "step": 30 }, { "epoch": 0.01774819744869662, "grad_norm": 4.947307109832764, "learning_rate": 5.8715596330275236e-05, "loss": 1.4263, "step": 32 }, { "epoch": 0.018857459789240156, "grad_norm": 2.2392783164978027, "learning_rate": 6.238532110091744e-05, "loss": 1.3292, "step": 34 }, { "epoch": 0.019966722129783693, "grad_norm": 1.7417014837265015, "learning_rate": 6.605504587155963e-05, "loss": 1.128, "step": 36 }, { "epoch": 0.02107598447032723, "grad_norm": 1.061747431755066, "learning_rate": 6.972477064220184e-05, "loss": 0.9133, "step": 38 }, { "epoch": 0.022185246810870772, "grad_norm": 1.0621497631072998, "learning_rate": 7.339449541284404e-05, "loss": 0.8753, "step": 40 }, { "epoch": 0.02329450915141431, "grad_norm": 0.772523045539856, "learning_rate": 7.706422018348625e-05, "loss": 0.6989, "step": 42 }, { "epoch": 0.024403771491957847, "grad_norm": 0.6044029593467712, "learning_rate": 8.073394495412844e-05, "loss": 0.7296, "step": 44 }, { "epoch": 0.025513033832501388, "grad_norm": 0.5906920433044434, "learning_rate": 8.440366972477065e-05, "loss": 0.6963, "step": 46 }, { "epoch": 0.026622296173044926, "grad_norm": 0.5078668594360352, "learning_rate": 8.807339449541285e-05, "loss": 0.6978, "step": 48 }, { "epoch": 0.027731558513588463, "grad_norm": 0.43668779730796814, "learning_rate": 9.174311926605506e-05, "loss": 0.6444, "step": 50 }, { "epoch": 0.028840820854132, "grad_norm": 0.5769929885864258, "learning_rate": 9.541284403669725e-05, "loss": 0.6321, "step": 52 }, { "epoch": 0.029950083194675542, "grad_norm": 0.6032134890556335, "learning_rate": 9.908256880733946e-05, "loss": 0.6399, "step": 54 }, { "epoch": 0.03105934553521908, "grad_norm": 0.5968932509422302, "learning_rate": 0.00010275229357798166, "loss": 0.6707, "step": 56 }, { "epoch": 0.03216860787576262, "grad_norm": 0.5290119051933289, "learning_rate": 0.00010642201834862387, "loss": 0.8516, "step": 58 }, { "epoch": 0.033277870216306155, "grad_norm": 0.4980998635292053, "learning_rate": 0.00011009174311926606, "loss": 0.6211, "step": 60 }, { "epoch": 0.03438713255684969, "grad_norm": 0.6519685387611389, "learning_rate": 0.00011376146788990827, "loss": 0.7251, "step": 62 }, { "epoch": 0.03549639489739324, "grad_norm": 0.6212429404258728, "learning_rate": 0.00011743119266055047, "loss": 0.4878, "step": 64 }, { "epoch": 0.036605657237936774, "grad_norm": 0.7839425802230835, "learning_rate": 0.00012110091743119268, "loss": 0.6157, "step": 66 }, { "epoch": 0.03771491957848031, "grad_norm": 0.5585289597511292, "learning_rate": 0.00012477064220183488, "loss": 0.6334, "step": 68 }, { "epoch": 0.03882418191902385, "grad_norm": 0.4180700480937958, "learning_rate": 0.00012844036697247707, "loss": 0.639, "step": 70 }, { "epoch": 0.03993344425956739, "grad_norm": 0.4288474917411804, "learning_rate": 0.00013211009174311927, "loss": 0.6489, "step": 72 }, { "epoch": 0.041042706600110924, "grad_norm": 0.391681045293808, "learning_rate": 0.00013577981651376149, "loss": 0.53, "step": 74 }, { "epoch": 0.04215196894065446, "grad_norm": 0.4582154154777527, "learning_rate": 0.00013944954128440368, "loss": 0.6368, "step": 76 }, { "epoch": 0.04326123128119801, "grad_norm": 0.5818586945533752, "learning_rate": 0.0001431192660550459, "loss": 0.5243, "step": 78 }, { "epoch": 0.044370493621741544, "grad_norm": 0.5001092553138733, "learning_rate": 0.0001467889908256881, "loss": 0.4512, "step": 80 }, { "epoch": 0.04547975596228508, "grad_norm": 0.37270447611808777, "learning_rate": 0.00015045871559633028, "loss": 0.582, "step": 82 }, { "epoch": 0.04658901830282862, "grad_norm": 0.3746323585510254, "learning_rate": 0.0001541284403669725, "loss": 0.4406, "step": 84 }, { "epoch": 0.04769828064337216, "grad_norm": 0.3230627775192261, "learning_rate": 0.0001577981651376147, "loss": 0.4755, "step": 86 }, { "epoch": 0.048807542983915694, "grad_norm": 0.46364355087280273, "learning_rate": 0.00016146788990825688, "loss": 0.7201, "step": 88 }, { "epoch": 0.04991680532445923, "grad_norm": 0.37719470262527466, "learning_rate": 0.0001651376146788991, "loss": 0.5318, "step": 90 }, { "epoch": 0.051026067665002776, "grad_norm": 0.3767435550689697, "learning_rate": 0.0001688073394495413, "loss": 0.4638, "step": 92 }, { "epoch": 0.052135330005546314, "grad_norm": 0.42447295784950256, "learning_rate": 0.00017247706422018351, "loss": 0.6399, "step": 94 }, { "epoch": 0.05324459234608985, "grad_norm": 0.33117732405662537, "learning_rate": 0.0001761467889908257, "loss": 0.4514, "step": 96 }, { "epoch": 0.05435385468663339, "grad_norm": 0.32511624693870544, "learning_rate": 0.0001798165137614679, "loss": 0.5366, "step": 98 }, { "epoch": 0.05546311702717693, "grad_norm": 0.34356439113616943, "learning_rate": 0.00018348623853211012, "loss": 0.6371, "step": 100 }, { "epoch": 0.056572379367720464, "grad_norm": 0.4009954035282135, "learning_rate": 0.0001871559633027523, "loss": 0.5844, "step": 102 }, { "epoch": 0.057681641708264, "grad_norm": 0.2896111309528351, "learning_rate": 0.0001908256880733945, "loss": 0.4478, "step": 104 }, { "epoch": 0.058790904048807546, "grad_norm": 0.31993618607521057, "learning_rate": 0.00019449541284403672, "loss": 0.3886, "step": 106 }, { "epoch": 0.059900166389351084, "grad_norm": 0.44810792803764343, "learning_rate": 0.0001981651376146789, "loss": 0.5448, "step": 108 }, { "epoch": 0.06100942872989462, "grad_norm": 0.31323954463005066, "learning_rate": 0.00019999995964675577, "loss": 0.5082, "step": 110 }, { "epoch": 0.06211869107043816, "grad_norm": 0.3845660090446472, "learning_rate": 0.00019999963682099735, "loss": 0.52, "step": 112 }, { "epoch": 0.0632279534109817, "grad_norm": 0.41813963651657104, "learning_rate": 0.00019999899117052264, "loss": 0.6146, "step": 114 }, { "epoch": 0.06433721575152523, "grad_norm": 0.35359835624694824, "learning_rate": 0.000199998022697416, "loss": 0.5561, "step": 116 }, { "epoch": 0.06544647809206877, "grad_norm": 0.3362332284450531, "learning_rate": 0.0001999967314048039, "loss": 0.4836, "step": 118 }, { "epoch": 0.06655574043261231, "grad_norm": 0.5304926633834839, "learning_rate": 0.000199995117296855, "loss": 0.5111, "step": 120 }, { "epoch": 0.06766500277315585, "grad_norm": 0.2606920599937439, "learning_rate": 0.00019999318037877995, "loss": 0.5379, "step": 122 }, { "epoch": 0.06877426511369938, "grad_norm": 0.47892895340919495, "learning_rate": 0.0001999909206568318, "loss": 0.5963, "step": 124 }, { "epoch": 0.06988352745424292, "grad_norm": 0.30522894859313965, "learning_rate": 0.00019998833813830534, "loss": 0.6176, "step": 126 }, { "epoch": 0.07099278979478647, "grad_norm": 0.41433125734329224, "learning_rate": 0.00019998543283153772, "loss": 0.5134, "step": 128 }, { "epoch": 0.07210205213533001, "grad_norm": 0.27291831374168396, "learning_rate": 0.000199982204745908, "loss": 0.5248, "step": 130 }, { "epoch": 0.07321131447587355, "grad_norm": 0.26184335350990295, "learning_rate": 0.0001999786538918372, "loss": 0.4302, "step": 132 }, { "epoch": 0.07432057681641709, "grad_norm": 0.2925998866558075, "learning_rate": 0.00019997478028078853, "loss": 0.4643, "step": 134 }, { "epoch": 0.07542983915696062, "grad_norm": 0.49747079610824585, "learning_rate": 0.0001999705839252669, "loss": 0.6133, "step": 136 }, { "epoch": 0.07653910149750416, "grad_norm": 0.37335023283958435, "learning_rate": 0.0001999660648388193, "loss": 0.508, "step": 138 }, { "epoch": 0.0776483638380477, "grad_norm": 0.383662611246109, "learning_rate": 0.00019996122303603446, "loss": 0.5049, "step": 140 }, { "epoch": 0.07875762617859124, "grad_norm": 0.28099021315574646, "learning_rate": 0.000199956058532543, "loss": 0.4487, "step": 142 }, { "epoch": 0.07986688851913477, "grad_norm": 0.2824121415615082, "learning_rate": 0.00019995057134501726, "loss": 0.4544, "step": 144 }, { "epoch": 0.08097615085967831, "grad_norm": 0.33720389008522034, "learning_rate": 0.00019994476149117133, "loss": 0.5987, "step": 146 }, { "epoch": 0.08208541320022185, "grad_norm": 6.722891330718994, "learning_rate": 0.0001999386289897609, "loss": 0.705, "step": 148 }, { "epoch": 0.08319467554076539, "grad_norm": 0.3853927254676819, "learning_rate": 0.00019993217386058326, "loss": 0.6514, "step": 150 }, { "epoch": 0.08430393788130892, "grad_norm": 0.32647258043289185, "learning_rate": 0.0001999253961244773, "loss": 0.4882, "step": 152 }, { "epoch": 0.08541320022185246, "grad_norm": 0.33191072940826416, "learning_rate": 0.0001999182958033232, "loss": 0.55, "step": 154 }, { "epoch": 0.08652246256239601, "grad_norm": 0.3368849456310272, "learning_rate": 0.00019991087292004273, "loss": 0.4264, "step": 156 }, { "epoch": 0.08763172490293955, "grad_norm": 0.2839064598083496, "learning_rate": 0.00019990312749859886, "loss": 0.4845, "step": 158 }, { "epoch": 0.08874098724348309, "grad_norm": 0.34918418526649475, "learning_rate": 0.00019989505956399578, "loss": 0.4774, "step": 160 }, { "epoch": 0.08985024958402663, "grad_norm": 0.5855952501296997, "learning_rate": 0.0001998866691422789, "loss": 0.5082, "step": 162 }, { "epoch": 0.09095951192457016, "grad_norm": 0.3832476735115051, "learning_rate": 0.00019987795626053468, "loss": 0.5372, "step": 164 }, { "epoch": 0.0920687742651137, "grad_norm": 0.2780646085739136, "learning_rate": 0.00019986892094689052, "loss": 0.4893, "step": 166 }, { "epoch": 0.09317803660565724, "grad_norm": 0.3530760109424591, "learning_rate": 0.00019985956323051478, "loss": 0.4235, "step": 168 }, { "epoch": 0.09428729894620078, "grad_norm": 0.3618020713329315, "learning_rate": 0.00019984988314161658, "loss": 0.4818, "step": 170 }, { "epoch": 0.09539656128674431, "grad_norm": 0.360500693321228, "learning_rate": 0.00019983988071144574, "loss": 0.602, "step": 172 }, { "epoch": 0.09650582362728785, "grad_norm": 0.2537604570388794, "learning_rate": 0.00019982955597229275, "loss": 0.5386, "step": 174 }, { "epoch": 0.09761508596783139, "grad_norm": 0.31803157925605774, "learning_rate": 0.0001998189089574885, "loss": 0.4198, "step": 176 }, { "epoch": 0.09872434830837493, "grad_norm": 0.404077410697937, "learning_rate": 0.0001998079397014043, "loss": 0.4918, "step": 178 }, { "epoch": 0.09983361064891846, "grad_norm": 0.3178037703037262, "learning_rate": 0.00019979664823945178, "loss": 0.5349, "step": 180 }, { "epoch": 0.100942872989462, "grad_norm": 0.3245227336883545, "learning_rate": 0.0001997850346080827, "loss": 0.4801, "step": 182 }, { "epoch": 0.10205213533000555, "grad_norm": 0.33926212787628174, "learning_rate": 0.00019977309884478879, "loss": 0.4323, "step": 184 }, { "epoch": 0.10316139767054909, "grad_norm": 0.41488340497016907, "learning_rate": 0.0001997608409881019, "loss": 0.557, "step": 186 }, { "epoch": 0.10427066001109263, "grad_norm": 0.40327388048171997, "learning_rate": 0.0001997482610775935, "loss": 0.4811, "step": 188 }, { "epoch": 0.10537992235163617, "grad_norm": 0.2919729948043823, "learning_rate": 0.0001997353591538748, "loss": 0.3776, "step": 190 }, { "epoch": 0.1064891846921797, "grad_norm": 0.280122846364975, "learning_rate": 0.00019972213525859658, "loss": 0.4697, "step": 192 }, { "epoch": 0.10759844703272324, "grad_norm": 0.41433635354042053, "learning_rate": 0.00019970858943444897, "loss": 0.591, "step": 194 }, { "epoch": 0.10870770937326678, "grad_norm": 0.33640623092651367, "learning_rate": 0.00019969472172516142, "loss": 0.523, "step": 196 }, { "epoch": 0.10981697171381032, "grad_norm": 0.3238784670829773, "learning_rate": 0.0001996805321755024, "loss": 0.5112, "step": 198 }, { "epoch": 0.11092623405435385, "grad_norm": 0.33250951766967773, "learning_rate": 0.0001996660208312796, "loss": 0.4554, "step": 200 }, { "epoch": 0.11203549639489739, "grad_norm": 0.2348429262638092, "learning_rate": 0.0001996511877393393, "loss": 0.4454, "step": 202 }, { "epoch": 0.11314475873544093, "grad_norm": 0.3479189872741699, "learning_rate": 0.00019963603294756657, "loss": 0.4648, "step": 204 }, { "epoch": 0.11425402107598447, "grad_norm": 0.2339608371257782, "learning_rate": 0.00019962055650488502, "loss": 0.4383, "step": 206 }, { "epoch": 0.115363283416528, "grad_norm": 0.3076520562171936, "learning_rate": 0.0001996047584612566, "loss": 0.4886, "step": 208 }, { "epoch": 0.11647254575707154, "grad_norm": 0.3302302658557892, "learning_rate": 0.00019958863886768147, "loss": 0.5678, "step": 210 }, { "epoch": 0.11758180809761509, "grad_norm": 0.3337957561016083, "learning_rate": 0.00019957219777619786, "loss": 0.4573, "step": 212 }, { "epoch": 0.11869107043815863, "grad_norm": 0.31091246008872986, "learning_rate": 0.0001995554352398819, "loss": 0.5596, "step": 214 }, { "epoch": 0.11980033277870217, "grad_norm": 0.27653515338897705, "learning_rate": 0.00019953835131284738, "loss": 0.4732, "step": 216 }, { "epoch": 0.1209095951192457, "grad_norm": 0.45777037739753723, "learning_rate": 0.00019952094605024562, "loss": 0.5798, "step": 218 }, { "epoch": 0.12201885745978924, "grad_norm": 0.30545416474342346, "learning_rate": 0.00019950321950826534, "loss": 0.4263, "step": 220 }, { "epoch": 0.12312811980033278, "grad_norm": 0.29590895771980286, "learning_rate": 0.00019948517174413238, "loss": 0.4797, "step": 222 }, { "epoch": 0.12423738214087632, "grad_norm": 0.387412965297699, "learning_rate": 0.0001994668028161096, "loss": 0.4482, "step": 224 }, { "epoch": 0.12534664448141986, "grad_norm": 0.337591290473938, "learning_rate": 0.00019944811278349667, "loss": 0.5011, "step": 226 }, { "epoch": 0.1264559068219634, "grad_norm": 0.3025722801685333, "learning_rate": 0.00019942910170662987, "loss": 0.6154, "step": 228 }, { "epoch": 0.12756516916250693, "grad_norm": 0.2529117465019226, "learning_rate": 0.00019940976964688182, "loss": 0.4412, "step": 230 }, { "epoch": 0.12867443150305047, "grad_norm": 0.3442631959915161, "learning_rate": 0.0001993901166666615, "loss": 0.5209, "step": 232 }, { "epoch": 0.129783693843594, "grad_norm": 0.30931201577186584, "learning_rate": 0.00019937014282941373, "loss": 0.6154, "step": 234 }, { "epoch": 0.13089295618413754, "grad_norm": 0.3555050790309906, "learning_rate": 0.00019934984819961927, "loss": 0.5173, "step": 236 }, { "epoch": 0.13200221852468108, "grad_norm": 0.44570550322532654, "learning_rate": 0.00019932923284279446, "loss": 0.5319, "step": 238 }, { "epoch": 0.13311148086522462, "grad_norm": 0.3192990720272064, "learning_rate": 0.00019930829682549095, "loss": 0.581, "step": 240 }, { "epoch": 0.13422074320576816, "grad_norm": 0.24486322700977325, "learning_rate": 0.00019928704021529567, "loss": 0.403, "step": 242 }, { "epoch": 0.1353300055463117, "grad_norm": 0.2888510823249817, "learning_rate": 0.00019926546308083047, "loss": 0.4869, "step": 244 }, { "epoch": 0.13643926788685523, "grad_norm": 0.3472958207130432, "learning_rate": 0.00019924356549175188, "loss": 0.6019, "step": 246 }, { "epoch": 0.13754853022739877, "grad_norm": 0.2682150900363922, "learning_rate": 0.00019922134751875102, "loss": 0.4816, "step": 248 }, { "epoch": 0.1386577925679423, "grad_norm": 0.24147269129753113, "learning_rate": 0.00019919880923355323, "loss": 0.4681, "step": 250 }, { "epoch": 0.13976705490848584, "grad_norm": 0.3548056185245514, "learning_rate": 0.00019917595070891798, "loss": 0.568, "step": 252 }, { "epoch": 0.1408763172490294, "grad_norm": 0.299034982919693, "learning_rate": 0.00019915277201863844, "loss": 0.4479, "step": 254 }, { "epoch": 0.14198557958957295, "grad_norm": 0.32234734296798706, "learning_rate": 0.00019912927323754146, "loss": 0.542, "step": 256 }, { "epoch": 0.14309484193011648, "grad_norm": 0.31738007068634033, "learning_rate": 0.00019910545444148722, "loss": 0.4458, "step": 258 }, { "epoch": 0.14420410427066002, "grad_norm": 0.2500552833080292, "learning_rate": 0.0001990813157073689, "loss": 0.4774, "step": 260 }, { "epoch": 0.14531336661120356, "grad_norm": 0.2912147641181946, "learning_rate": 0.0001990568571131126, "loss": 0.523, "step": 262 }, { "epoch": 0.1464226289517471, "grad_norm": 0.35038962960243225, "learning_rate": 0.00019903207873767705, "loss": 0.5325, "step": 264 }, { "epoch": 0.14753189129229063, "grad_norm": 0.6067700386047363, "learning_rate": 0.00019900698066105317, "loss": 0.5304, "step": 266 }, { "epoch": 0.14864115363283417, "grad_norm": 0.8355000615119934, "learning_rate": 0.00019898156296426414, "loss": 0.6244, "step": 268 }, { "epoch": 0.1497504159733777, "grad_norm": 0.2898505926132202, "learning_rate": 0.00019895582572936475, "loss": 0.4361, "step": 270 }, { "epoch": 0.15085967831392125, "grad_norm": 0.2776695191860199, "learning_rate": 0.0001989297690394416, "loss": 0.5019, "step": 272 }, { "epoch": 0.15196894065446478, "grad_norm": 0.310939222574234, "learning_rate": 0.0001989033929786123, "loss": 0.44, "step": 274 }, { "epoch": 0.15307820299500832, "grad_norm": 0.37729912996292114, "learning_rate": 0.00019887669763202567, "loss": 0.6571, "step": 276 }, { "epoch": 0.15418746533555186, "grad_norm": 0.3895605802536011, "learning_rate": 0.0001988496830858612, "loss": 0.4787, "step": 278 }, { "epoch": 0.1552967276760954, "grad_norm": 0.27666375041007996, "learning_rate": 0.00019882234942732882, "loss": 0.4677, "step": 280 }, { "epoch": 0.15640599001663893, "grad_norm": 0.5543416142463684, "learning_rate": 0.00019879469674466868, "loss": 0.5272, "step": 282 }, { "epoch": 0.15751525235718247, "grad_norm": 0.46799716353416443, "learning_rate": 0.00019876672512715078, "loss": 0.3936, "step": 284 }, { "epoch": 0.158624514697726, "grad_norm": 0.47091013193130493, "learning_rate": 0.00019873843466507475, "loss": 0.5349, "step": 286 }, { "epoch": 0.15973377703826955, "grad_norm": 0.2690078020095825, "learning_rate": 0.0001987098254497695, "loss": 0.4377, "step": 288 }, { "epoch": 0.16084303937881309, "grad_norm": 0.8880047798156738, "learning_rate": 0.000198680897573593, "loss": 0.4537, "step": 290 }, { "epoch": 0.16195230171935662, "grad_norm": 0.25852563977241516, "learning_rate": 0.00019865165112993195, "loss": 0.4001, "step": 292 }, { "epoch": 0.16306156405990016, "grad_norm": 0.3986980617046356, "learning_rate": 0.00019862208621320142, "loss": 0.5538, "step": 294 }, { "epoch": 0.1641708264004437, "grad_norm": 0.2550656795501709, "learning_rate": 0.00019859220291884458, "loss": 0.4261, "step": 296 }, { "epoch": 0.16528008874098724, "grad_norm": 0.2604806125164032, "learning_rate": 0.0001985620013433325, "loss": 0.5003, "step": 298 }, { "epoch": 0.16638935108153077, "grad_norm": 0.3848886787891388, "learning_rate": 0.0001985314815841637, "loss": 0.4393, "step": 300 }, { "epoch": 0.1674986134220743, "grad_norm": 0.37041744589805603, "learning_rate": 0.00019850064373986377, "loss": 0.4831, "step": 302 }, { "epoch": 0.16860787576261785, "grad_norm": 0.4813826382160187, "learning_rate": 0.0001984694879099853, "loss": 0.5145, "step": 304 }, { "epoch": 0.16971713810316139, "grad_norm": 0.3015052080154419, "learning_rate": 0.00019843801419510744, "loss": 0.4991, "step": 306 }, { "epoch": 0.17082640044370492, "grad_norm": 0.33228328824043274, "learning_rate": 0.00019840622269683538, "loss": 0.4475, "step": 308 }, { "epoch": 0.1719356627842485, "grad_norm": 0.3386059105396271, "learning_rate": 0.00019837411351780038, "loss": 0.5565, "step": 310 }, { "epoch": 0.17304492512479203, "grad_norm": 0.38623613119125366, "learning_rate": 0.00019834168676165917, "loss": 0.5547, "step": 312 }, { "epoch": 0.17415418746533556, "grad_norm": 0.32846319675445557, "learning_rate": 0.0001983089425330937, "loss": 0.547, "step": 314 }, { "epoch": 0.1752634498058791, "grad_norm": 0.3607962429523468, "learning_rate": 0.00019827588093781083, "loss": 0.5914, "step": 316 }, { "epoch": 0.17637271214642264, "grad_norm": 0.36206066608428955, "learning_rate": 0.00019824250208254194, "loss": 0.4694, "step": 318 }, { "epoch": 0.17748197448696618, "grad_norm": 0.24287384748458862, "learning_rate": 0.0001982088060750426, "loss": 0.3787, "step": 320 }, { "epoch": 0.17859123682750971, "grad_norm": 0.35104668140411377, "learning_rate": 0.00019817479302409227, "loss": 0.4743, "step": 322 }, { "epoch": 0.17970049916805325, "grad_norm": 0.29966557025909424, "learning_rate": 0.0001981404630394939, "loss": 0.5124, "step": 324 }, { "epoch": 0.1808097615085968, "grad_norm": 0.35917168855667114, "learning_rate": 0.0001981058162320735, "loss": 0.5524, "step": 326 }, { "epoch": 0.18191902384914033, "grad_norm": 0.38302454352378845, "learning_rate": 0.00019807085271368005, "loss": 0.6019, "step": 328 }, { "epoch": 0.18302828618968386, "grad_norm": 0.24732793867588043, "learning_rate": 0.0001980355725971847, "loss": 0.4952, "step": 330 }, { "epoch": 0.1841375485302274, "grad_norm": 0.279240220785141, "learning_rate": 0.0001979999759964809, "loss": 0.3949, "step": 332 }, { "epoch": 0.18524681087077094, "grad_norm": 0.2902393341064453, "learning_rate": 0.00019796406302648368, "loss": 0.4938, "step": 334 }, { "epoch": 0.18635607321131448, "grad_norm": 0.3428916037082672, "learning_rate": 0.00019792783380312936, "loss": 0.4401, "step": 336 }, { "epoch": 0.18746533555185801, "grad_norm": 0.38825634121894836, "learning_rate": 0.00019789128844337528, "loss": 0.5968, "step": 338 }, { "epoch": 0.18857459789240155, "grad_norm": 0.26601892709732056, "learning_rate": 0.0001978544270651993, "loss": 0.3993, "step": 340 }, { "epoch": 0.1896838602329451, "grad_norm": 0.29738616943359375, "learning_rate": 0.00019781724978759955, "loss": 0.4975, "step": 342 }, { "epoch": 0.19079312257348863, "grad_norm": 0.3405514061450958, "learning_rate": 0.00019777975673059383, "loss": 0.5674, "step": 344 }, { "epoch": 0.19190238491403216, "grad_norm": 0.29235440492630005, "learning_rate": 0.00019774194801521947, "loss": 0.4854, "step": 346 }, { "epoch": 0.1930116472545757, "grad_norm": 0.279852032661438, "learning_rate": 0.00019770382376353284, "loss": 0.3933, "step": 348 }, { "epoch": 0.19412090959511924, "grad_norm": 0.3054843544960022, "learning_rate": 0.00019766538409860882, "loss": 0.5125, "step": 350 }, { "epoch": 0.19523017193566278, "grad_norm": 0.2959388494491577, "learning_rate": 0.00019762662914454065, "loss": 0.5114, "step": 352 }, { "epoch": 0.19633943427620631, "grad_norm": 0.42934173345565796, "learning_rate": 0.0001975875590264393, "loss": 0.5157, "step": 354 }, { "epoch": 0.19744869661674985, "grad_norm": 0.28414642810821533, "learning_rate": 0.00019754817387043327, "loss": 0.4952, "step": 356 }, { "epoch": 0.1985579589572934, "grad_norm": 0.27035775780677795, "learning_rate": 0.00019750847380366806, "loss": 0.3946, "step": 358 }, { "epoch": 0.19966722129783693, "grad_norm": 0.23628903925418854, "learning_rate": 0.0001974684589543057, "loss": 0.3691, "step": 360 }, { "epoch": 0.20077648363838047, "grad_norm": 0.3877003788948059, "learning_rate": 0.0001974281294515245, "loss": 0.5729, "step": 362 }, { "epoch": 0.201885745978924, "grad_norm": 0.3051539659500122, "learning_rate": 0.00019738748542551861, "loss": 0.4378, "step": 364 }, { "epoch": 0.20299500831946754, "grad_norm": 0.27406755089759827, "learning_rate": 0.00019734652700749737, "loss": 0.505, "step": 366 }, { "epoch": 0.2041042706600111, "grad_norm": 2.0898008346557617, "learning_rate": 0.0001973052543296852, "loss": 0.5647, "step": 368 }, { "epoch": 0.20521353300055464, "grad_norm": 0.24537453055381775, "learning_rate": 0.000197263667525321, "loss": 0.4543, "step": 370 }, { "epoch": 0.20632279534109818, "grad_norm": 0.26961207389831543, "learning_rate": 0.0001972217667286577, "loss": 0.4595, "step": 372 }, { "epoch": 0.20743205768164172, "grad_norm": 0.24330930411815643, "learning_rate": 0.00019717955207496196, "loss": 0.4383, "step": 374 }, { "epoch": 0.20854132002218526, "grad_norm": 0.42495131492614746, "learning_rate": 0.0001971370237005136, "loss": 0.6703, "step": 376 }, { "epoch": 0.2096505823627288, "grad_norm": 0.28645631670951843, "learning_rate": 0.0001970941817426052, "loss": 0.4693, "step": 378 }, { "epoch": 0.21075984470327233, "grad_norm": 0.27761217951774597, "learning_rate": 0.00019705102633954172, "loss": 0.5217, "step": 380 }, { "epoch": 0.21186910704381587, "grad_norm": 0.27868810296058655, "learning_rate": 0.00019700755763063998, "loss": 0.4173, "step": 382 }, { "epoch": 0.2129783693843594, "grad_norm": 0.3163412809371948, "learning_rate": 0.0001969637757562282, "loss": 0.5144, "step": 384 }, { "epoch": 0.21408763172490294, "grad_norm": 0.2901851534843445, "learning_rate": 0.00019691968085764562, "loss": 0.5128, "step": 386 }, { "epoch": 0.21519689406544648, "grad_norm": 0.3356168270111084, "learning_rate": 0.00019687527307724197, "loss": 0.4398, "step": 388 }, { "epoch": 0.21630615640599002, "grad_norm": 0.33736830949783325, "learning_rate": 0.0001968305525583771, "loss": 0.4771, "step": 390 }, { "epoch": 0.21741541874653356, "grad_norm": 0.2211776226758957, "learning_rate": 0.00019678551944542037, "loss": 0.3975, "step": 392 }, { "epoch": 0.2185246810870771, "grad_norm": 0.2705392837524414, "learning_rate": 0.00019674017388375038, "loss": 0.4347, "step": 394 }, { "epoch": 0.21963394342762063, "grad_norm": 0.2636098861694336, "learning_rate": 0.0001966945160197543, "loss": 0.5036, "step": 396 }, { "epoch": 0.22074320576816417, "grad_norm": 0.26003068685531616, "learning_rate": 0.00019664854600082756, "loss": 0.458, "step": 398 }, { "epoch": 0.2218524681087077, "grad_norm": 0.1831740140914917, "learning_rate": 0.00019660226397537326, "loss": 0.4322, "step": 400 }, { "epoch": 0.22296173044925124, "grad_norm": 1.4381052255630493, "learning_rate": 0.00019655567009280178, "loss": 0.545, "step": 402 }, { "epoch": 0.22407099278979478, "grad_norm": 0.3401663303375244, "learning_rate": 0.00019650876450353022, "loss": 0.5524, "step": 404 }, { "epoch": 0.22518025513033832, "grad_norm": 0.294527530670166, "learning_rate": 0.00019646154735898202, "loss": 0.4059, "step": 406 }, { "epoch": 0.22628951747088186, "grad_norm": 0.3033091127872467, "learning_rate": 0.00019641401881158625, "loss": 0.4119, "step": 408 }, { "epoch": 0.2273987798114254, "grad_norm": 0.3446100950241089, "learning_rate": 0.00019636617901477746, "loss": 0.4299, "step": 410 }, { "epoch": 0.22850804215196893, "grad_norm": 0.2955077886581421, "learning_rate": 0.00019631802812299483, "loss": 0.4496, "step": 412 }, { "epoch": 0.22961730449251247, "grad_norm": 0.2663453221321106, "learning_rate": 0.00019626956629168192, "loss": 0.5373, "step": 414 }, { "epoch": 0.230726566833056, "grad_norm": 0.31393304467201233, "learning_rate": 0.0001962207936772861, "loss": 0.4856, "step": 416 }, { "epoch": 0.23183582917359954, "grad_norm": 0.3716754615306854, "learning_rate": 0.00019617171043725796, "loss": 0.6161, "step": 418 }, { "epoch": 0.23294509151414308, "grad_norm": 0.3159737288951874, "learning_rate": 0.00019612231673005092, "loss": 0.4493, "step": 420 }, { "epoch": 0.23405435385468662, "grad_norm": 0.3315045237541199, "learning_rate": 0.00019607261271512068, "loss": 0.4695, "step": 422 }, { "epoch": 0.23516361619523019, "grad_norm": 0.27597951889038086, "learning_rate": 0.0001960225985529246, "loss": 0.3391, "step": 424 }, { "epoch": 0.23627287853577372, "grad_norm": 0.4410407543182373, "learning_rate": 0.00019597227440492143, "loss": 0.5034, "step": 426 }, { "epoch": 0.23738214087631726, "grad_norm": 0.2749500572681427, "learning_rate": 0.00019592164043357046, "loss": 0.4934, "step": 428 }, { "epoch": 0.2384914032168608, "grad_norm": 0.32692408561706543, "learning_rate": 0.00019587069680233134, "loss": 0.4589, "step": 430 }, { "epoch": 0.23960066555740434, "grad_norm": 0.2124568372964859, "learning_rate": 0.00019581944367566326, "loss": 0.4367, "step": 432 }, { "epoch": 0.24070992789794787, "grad_norm": 0.26193922758102417, "learning_rate": 0.00019576788121902457, "loss": 0.4172, "step": 434 }, { "epoch": 0.2418191902384914, "grad_norm": 0.25930801033973694, "learning_rate": 0.00019571600959887223, "loss": 0.3983, "step": 436 }, { "epoch": 0.24292845257903495, "grad_norm": 0.30770912766456604, "learning_rate": 0.0001956638289826613, "loss": 0.4245, "step": 438 }, { "epoch": 0.24403771491957849, "grad_norm": 0.26078179478645325, "learning_rate": 0.00019561133953884427, "loss": 0.3807, "step": 440 }, { "epoch": 0.24514697726012202, "grad_norm": 0.3434695303440094, "learning_rate": 0.00019555854143687068, "loss": 0.6136, "step": 442 }, { "epoch": 0.24625623960066556, "grad_norm": 0.3655042350292206, "learning_rate": 0.00019550543484718648, "loss": 0.4969, "step": 444 }, { "epoch": 0.2473655019412091, "grad_norm": 0.2804737389087677, "learning_rate": 0.00019545201994123344, "loss": 0.4154, "step": 446 }, { "epoch": 0.24847476428175264, "grad_norm": 0.23757784068584442, "learning_rate": 0.00019539829689144876, "loss": 0.467, "step": 448 }, { "epoch": 0.24958402662229617, "grad_norm": 0.3346647024154663, "learning_rate": 0.00019534426587126434, "loss": 0.5979, "step": 450 }, { "epoch": 0.2506932889628397, "grad_norm": 0.3275079131126404, "learning_rate": 0.00019528992705510629, "loss": 0.4006, "step": 452 }, { "epoch": 0.25180255130338325, "grad_norm": 0.286476731300354, "learning_rate": 0.00019523528061839436, "loss": 0.4051, "step": 454 }, { "epoch": 0.2529118136439268, "grad_norm": 0.3157908022403717, "learning_rate": 0.00019518032673754144, "loss": 0.5217, "step": 456 }, { "epoch": 0.2540210759844703, "grad_norm": 0.30794093012809753, "learning_rate": 0.00019512506558995286, "loss": 0.554, "step": 458 }, { "epoch": 0.25513033832501386, "grad_norm": 0.4166387915611267, "learning_rate": 0.00019506949735402588, "loss": 0.4359, "step": 460 }, { "epoch": 0.2562396006655574, "grad_norm": 0.37193942070007324, "learning_rate": 0.0001950136222091492, "loss": 0.6409, "step": 462 }, { "epoch": 0.25734886300610094, "grad_norm": 0.20224644243717194, "learning_rate": 0.00019495744033570222, "loss": 0.4221, "step": 464 }, { "epoch": 0.2584581253466445, "grad_norm": 0.3197399973869324, "learning_rate": 0.00019490095191505461, "loss": 0.5052, "step": 466 }, { "epoch": 0.259567387687188, "grad_norm": 0.29040077328681946, "learning_rate": 0.0001948441571295656, "loss": 0.5, "step": 468 }, { "epoch": 0.26067665002773155, "grad_norm": 0.28463155031204224, "learning_rate": 0.00019478705616258343, "loss": 0.4063, "step": 470 }, { "epoch": 0.2617859123682751, "grad_norm": 0.2545261085033417, "learning_rate": 0.00019472964919844485, "loss": 0.4336, "step": 472 }, { "epoch": 0.2628951747088186, "grad_norm": 0.3148619532585144, "learning_rate": 0.00019467193642247436, "loss": 0.5194, "step": 474 }, { "epoch": 0.26400443704936216, "grad_norm": 0.26055407524108887, "learning_rate": 0.00019461391802098378, "loss": 0.5101, "step": 476 }, { "epoch": 0.2651136993899057, "grad_norm": 0.2619602382183075, "learning_rate": 0.00019455559418127147, "loss": 0.4795, "step": 478 }, { "epoch": 0.26622296173044924, "grad_norm": 0.33671075105667114, "learning_rate": 0.00019449696509162196, "loss": 0.4891, "step": 480 }, { "epoch": 0.2673322240709928, "grad_norm": 0.3849608302116394, "learning_rate": 0.0001944380309413051, "loss": 0.3657, "step": 482 }, { "epoch": 0.2684414864115363, "grad_norm": 0.26096463203430176, "learning_rate": 0.00019437879192057557, "loss": 0.4491, "step": 484 }, { "epoch": 0.26955074875207985, "grad_norm": 0.3250510096549988, "learning_rate": 0.0001943192482206723, "loss": 0.5835, "step": 486 }, { "epoch": 0.2706600110926234, "grad_norm": 0.38445812463760376, "learning_rate": 0.00019425940003381771, "loss": 0.547, "step": 488 }, { "epoch": 0.2717692734331669, "grad_norm": 0.5909857153892517, "learning_rate": 0.00019419924755321728, "loss": 0.4433, "step": 490 }, { "epoch": 0.27287853577371046, "grad_norm": 0.28175419569015503, "learning_rate": 0.0001941387909730588, "loss": 0.5281, "step": 492 }, { "epoch": 0.273987798114254, "grad_norm": 0.2964307367801666, "learning_rate": 0.00019407803048851173, "loss": 0.5071, "step": 494 }, { "epoch": 0.27509706045479754, "grad_norm": 0.23891805112361908, "learning_rate": 0.00019401696629572666, "loss": 0.403, "step": 496 }, { "epoch": 0.2762063227953411, "grad_norm": 0.2703591287136078, "learning_rate": 0.00019395559859183462, "loss": 0.4754, "step": 498 }, { "epoch": 0.2773155851358846, "grad_norm": 0.34853506088256836, "learning_rate": 0.00019389392757494645, "loss": 0.4713, "step": 500 }, { "epoch": 0.27842484747642815, "grad_norm": 0.33245861530303955, "learning_rate": 0.00019383195344415216, "loss": 0.4999, "step": 502 }, { "epoch": 0.2795341098169717, "grad_norm": 0.33471912145614624, "learning_rate": 0.00019376967639952024, "loss": 0.469, "step": 504 }, { "epoch": 0.2806433721575152, "grad_norm": 0.2644645869731903, "learning_rate": 0.00019370709664209715, "loss": 0.4846, "step": 506 }, { "epoch": 0.2817526344980588, "grad_norm": 0.3247945010662079, "learning_rate": 0.0001936442143739066, "loss": 0.5381, "step": 508 }, { "epoch": 0.28286189683860236, "grad_norm": 0.37427282333374023, "learning_rate": 0.0001935810297979487, "loss": 0.5024, "step": 510 }, { "epoch": 0.2839711591791459, "grad_norm": 0.34435346722602844, "learning_rate": 0.00019351754311819976, "loss": 0.5401, "step": 512 }, { "epoch": 0.28508042151968943, "grad_norm": 0.26365646719932556, "learning_rate": 0.00019345375453961114, "loss": 0.4533, "step": 514 }, { "epoch": 0.28618968386023297, "grad_norm": 0.2864663302898407, "learning_rate": 0.0001933896642681089, "loss": 0.4295, "step": 516 }, { "epoch": 0.2872989462007765, "grad_norm": 0.25872141122817993, "learning_rate": 0.00019332527251059303, "loss": 0.4223, "step": 518 }, { "epoch": 0.28840820854132004, "grad_norm": 0.30672091245651245, "learning_rate": 0.0001932605794749368, "loss": 0.433, "step": 520 }, { "epoch": 0.2895174708818636, "grad_norm": 0.30173277854919434, "learning_rate": 0.00019319558536998605, "loss": 0.4861, "step": 522 }, { "epoch": 0.2906267332224071, "grad_norm": 0.5812121629714966, "learning_rate": 0.0001931302904055586, "loss": 0.5444, "step": 524 }, { "epoch": 0.29173599556295066, "grad_norm": 0.27071669697761536, "learning_rate": 0.0001930646947924435, "loss": 0.4302, "step": 526 }, { "epoch": 0.2928452579034942, "grad_norm": 0.2868422865867615, "learning_rate": 0.0001929987987424004, "loss": 0.4401, "step": 528 }, { "epoch": 0.29395452024403773, "grad_norm": 0.3282202482223511, "learning_rate": 0.0001929326024681587, "loss": 0.5343, "step": 530 }, { "epoch": 0.29506378258458127, "grad_norm": 0.30158400535583496, "learning_rate": 0.00019286610618341726, "loss": 0.4712, "step": 532 }, { "epoch": 0.2961730449251248, "grad_norm": 0.3179633617401123, "learning_rate": 0.00019279931010284323, "loss": 0.5509, "step": 534 }, { "epoch": 0.29728230726566834, "grad_norm": 0.2874014675617218, "learning_rate": 0.00019273221444207161, "loss": 0.4022, "step": 536 }, { "epoch": 0.2983915696062119, "grad_norm": 0.3315957486629486, "learning_rate": 0.0001926648194177046, "loss": 0.6074, "step": 538 }, { "epoch": 0.2995008319467554, "grad_norm": 0.3043927550315857, "learning_rate": 0.00019259712524731084, "loss": 0.4971, "step": 540 }, { "epoch": 0.30061009428729896, "grad_norm": 0.31024664640426636, "learning_rate": 0.00019252913214942455, "loss": 0.4318, "step": 542 }, { "epoch": 0.3017193566278425, "grad_norm": 0.2936079204082489, "learning_rate": 0.0001924608403435451, "loss": 0.5195, "step": 544 }, { "epoch": 0.30282861896838603, "grad_norm": 0.2543775737285614, "learning_rate": 0.0001923922500501361, "loss": 0.4005, "step": 546 }, { "epoch": 0.30393788130892957, "grad_norm": 0.3738788962364197, "learning_rate": 0.0001923233614906248, "loss": 0.5765, "step": 548 }, { "epoch": 0.3050471436494731, "grad_norm": 0.2852479815483093, "learning_rate": 0.00019225417488740127, "loss": 0.5106, "step": 550 }, { "epoch": 0.30615640599001664, "grad_norm": 0.37130045890808105, "learning_rate": 0.0001921846904638178, "loss": 0.4207, "step": 552 }, { "epoch": 0.3072656683305602, "grad_norm": 0.3376319408416748, "learning_rate": 0.0001921149084441881, "loss": 0.3901, "step": 554 }, { "epoch": 0.3083749306711037, "grad_norm": 0.31291213631629944, "learning_rate": 0.00019204482905378658, "loss": 0.4886, "step": 556 }, { "epoch": 0.30948419301164726, "grad_norm": 0.3333558738231659, "learning_rate": 0.00019197445251884765, "loss": 0.493, "step": 558 }, { "epoch": 0.3105934553521908, "grad_norm": 0.2883855104446411, "learning_rate": 0.000191903779066565, "loss": 0.4213, "step": 560 }, { "epoch": 0.31170271769273433, "grad_norm": 0.38979244232177734, "learning_rate": 0.00019183280892509083, "loss": 0.6501, "step": 562 }, { "epoch": 0.31281198003327787, "grad_norm": 0.2953646779060364, "learning_rate": 0.00019176154232353513, "loss": 0.4218, "step": 564 }, { "epoch": 0.3139212423738214, "grad_norm": 0.2818152904510498, "learning_rate": 0.00019168997949196495, "loss": 0.4319, "step": 566 }, { "epoch": 0.31503050471436495, "grad_norm": 0.29279059171676636, "learning_rate": 0.00019161812066140362, "loss": 0.5014, "step": 568 }, { "epoch": 0.3161397670549085, "grad_norm": 0.2888440787792206, "learning_rate": 0.00019154596606383003, "loss": 0.4381, "step": 570 }, { "epoch": 0.317249029395452, "grad_norm": 0.3210539221763611, "learning_rate": 0.00019147351593217792, "loss": 0.405, "step": 572 }, { "epoch": 0.31835829173599556, "grad_norm": 0.29606014490127563, "learning_rate": 0.0001914007705003351, "loss": 0.4297, "step": 574 }, { "epoch": 0.3194675540765391, "grad_norm": 0.2536727786064148, "learning_rate": 0.00019132773000314263, "loss": 0.4009, "step": 576 }, { "epoch": 0.32057681641708263, "grad_norm": 0.31337520480155945, "learning_rate": 0.00019125439467639416, "loss": 0.554, "step": 578 }, { "epoch": 0.32168607875762617, "grad_norm": 0.31429773569107056, "learning_rate": 0.00019118076475683506, "loss": 0.4452, "step": 580 }, { "epoch": 0.3227953410981697, "grad_norm": 0.22459080815315247, "learning_rate": 0.00019110684048216184, "loss": 0.4353, "step": 582 }, { "epoch": 0.32390460343871325, "grad_norm": 0.34493422508239746, "learning_rate": 0.0001910326220910211, "loss": 0.4338, "step": 584 }, { "epoch": 0.3250138657792568, "grad_norm": 0.29713448882102966, "learning_rate": 0.00019095810982300914, "loss": 0.4684, "step": 586 }, { "epoch": 0.3261231281198003, "grad_norm": 0.3797847330570221, "learning_rate": 0.0001908833039186708, "loss": 0.5029, "step": 588 }, { "epoch": 0.32723239046034386, "grad_norm": 0.3142600357532501, "learning_rate": 0.00019080820461949886, "loss": 0.4978, "step": 590 }, { "epoch": 0.3283416528008874, "grad_norm": 0.3005797863006592, "learning_rate": 0.0001907328121679334, "loss": 0.4405, "step": 592 }, { "epoch": 0.32945091514143093, "grad_norm": 0.4218408763408661, "learning_rate": 0.00019065712680736067, "loss": 0.4554, "step": 594 }, { "epoch": 0.33056017748197447, "grad_norm": 0.22723990678787231, "learning_rate": 0.00019058114878211266, "loss": 0.4094, "step": 596 }, { "epoch": 0.331669439822518, "grad_norm": 0.3160037398338318, "learning_rate": 0.0001905048783374661, "loss": 0.557, "step": 598 }, { "epoch": 0.33277870216306155, "grad_norm": 0.31518688797950745, "learning_rate": 0.00019042831571964173, "loss": 0.4034, "step": 600 }, { "epoch": 0.3338879645036051, "grad_norm": 0.28441253304481506, "learning_rate": 0.0001903514611758035, "loss": 0.4561, "step": 602 }, { "epoch": 0.3349972268441486, "grad_norm": 0.26106494665145874, "learning_rate": 0.00019027431495405778, "loss": 0.3955, "step": 604 }, { "epoch": 0.33610648918469216, "grad_norm": 0.22984157502651215, "learning_rate": 0.00019019687730345252, "loss": 0.373, "step": 606 }, { "epoch": 0.3372157515252357, "grad_norm": 0.30182814598083496, "learning_rate": 0.00019011914847397654, "loss": 0.4447, "step": 608 }, { "epoch": 0.33832501386577923, "grad_norm": 0.37010881304740906, "learning_rate": 0.0001900411287165586, "loss": 0.4138, "step": 610 }, { "epoch": 0.33943427620632277, "grad_norm": 0.27557408809661865, "learning_rate": 0.00018996281828306667, "loss": 0.4136, "step": 612 }, { "epoch": 0.3405435385468663, "grad_norm": 0.29684868454933167, "learning_rate": 0.00018988421742630713, "loss": 0.436, "step": 614 }, { "epoch": 0.34165280088740985, "grad_norm": 0.3333280086517334, "learning_rate": 0.0001898053264000239, "loss": 0.5294, "step": 616 }, { "epoch": 0.3427620632279534, "grad_norm": 0.35531410574913025, "learning_rate": 0.00018972614545889756, "loss": 0.4914, "step": 618 }, { "epoch": 0.343871325568497, "grad_norm": 0.41459953784942627, "learning_rate": 0.00018964667485854483, "loss": 0.4388, "step": 620 }, { "epoch": 0.3449805879090405, "grad_norm": 0.29694780707359314, "learning_rate": 0.0001895669148555172, "loss": 0.4568, "step": 622 }, { "epoch": 0.34608985024958405, "grad_norm": 0.398698627948761, "learning_rate": 0.00018948686570730076, "loss": 0.4192, "step": 624 }, { "epoch": 0.3471991125901276, "grad_norm": 0.3701520562171936, "learning_rate": 0.00018940652767231484, "loss": 0.6006, "step": 626 }, { "epoch": 0.3483083749306711, "grad_norm": 0.25342532992362976, "learning_rate": 0.00018932590100991135, "loss": 0.3716, "step": 628 }, { "epoch": 0.34941763727121466, "grad_norm": 0.2552190124988556, "learning_rate": 0.00018924498598037412, "loss": 0.4289, "step": 630 }, { "epoch": 0.3505268996117582, "grad_norm": 0.2539176940917969, "learning_rate": 0.00018916378284491775, "loss": 0.4231, "step": 632 }, { "epoch": 0.35163616195230174, "grad_norm": 0.30955174565315247, "learning_rate": 0.00018908229186568706, "loss": 0.4843, "step": 634 }, { "epoch": 0.3527454242928453, "grad_norm": 0.3081991672515869, "learning_rate": 0.00018900051330575593, "loss": 0.3903, "step": 636 }, { "epoch": 0.3538546866333888, "grad_norm": 0.4423012137413025, "learning_rate": 0.00018891844742912679, "loss": 0.4496, "step": 638 }, { "epoch": 0.35496394897393235, "grad_norm": 0.27272704243659973, "learning_rate": 0.0001888360945007295, "loss": 0.4673, "step": 640 }, { "epoch": 0.3560732113144759, "grad_norm": 0.3022926449775696, "learning_rate": 0.00018875345478642068, "loss": 0.4034, "step": 642 }, { "epoch": 0.35718247365501943, "grad_norm": 0.2414725422859192, "learning_rate": 0.00018867052855298265, "loss": 0.4186, "step": 644 }, { "epoch": 0.35829173599556297, "grad_norm": 0.3422887921333313, "learning_rate": 0.00018858731606812286, "loss": 0.4808, "step": 646 }, { "epoch": 0.3594009983361065, "grad_norm": 0.28430530428886414, "learning_rate": 0.00018850381760047264, "loss": 0.3808, "step": 648 }, { "epoch": 0.36051026067665004, "grad_norm": 0.43009158968925476, "learning_rate": 0.00018842003341958675, "loss": 0.4426, "step": 650 }, { "epoch": 0.3616195230171936, "grad_norm": 1.1762899160385132, "learning_rate": 0.00018833596379594215, "loss": 0.4216, "step": 652 }, { "epoch": 0.3627287853577371, "grad_norm": 0.2741614878177643, "learning_rate": 0.0001882516090009374, "loss": 0.5421, "step": 654 }, { "epoch": 0.36383804769828065, "grad_norm": 0.2537575364112854, "learning_rate": 0.0001881669693068916, "loss": 0.3817, "step": 656 }, { "epoch": 0.3649473100388242, "grad_norm": 0.7648308277130127, "learning_rate": 0.00018808204498704358, "loss": 0.3751, "step": 658 }, { "epoch": 0.36605657237936773, "grad_norm": 0.7363317012786865, "learning_rate": 0.00018799683631555105, "loss": 0.4518, "step": 660 }, { "epoch": 0.36716583471991127, "grad_norm": 0.2524745762348175, "learning_rate": 0.00018791134356748964, "loss": 0.3931, "step": 662 }, { "epoch": 0.3682750970604548, "grad_norm": 0.27496540546417236, "learning_rate": 0.00018782556701885212, "loss": 0.4645, "step": 664 }, { "epoch": 0.36938435940099834, "grad_norm": 0.389417827129364, "learning_rate": 0.00018773950694654735, "loss": 0.5246, "step": 666 }, { "epoch": 0.3704936217415419, "grad_norm": 0.3206813335418701, "learning_rate": 0.00018765316362839954, "loss": 0.3842, "step": 668 }, { "epoch": 0.3716028840820854, "grad_norm": 0.36864709854125977, "learning_rate": 0.00018756653734314723, "loss": 0.5298, "step": 670 }, { "epoch": 0.37271214642262895, "grad_norm": 0.30406880378723145, "learning_rate": 0.00018747962837044256, "loss": 0.5117, "step": 672 }, { "epoch": 0.3738214087631725, "grad_norm": 0.39465242624282837, "learning_rate": 0.0001873924369908502, "loss": 0.4595, "step": 674 }, { "epoch": 0.37493067110371603, "grad_norm": 0.23132115602493286, "learning_rate": 0.00018730496348584645, "loss": 0.4108, "step": 676 }, { "epoch": 0.37603993344425957, "grad_norm": 0.3196391761302948, "learning_rate": 0.00018721720813781842, "loss": 0.6211, "step": 678 }, { "epoch": 0.3771491957848031, "grad_norm": 0.2508983314037323, "learning_rate": 0.00018712917123006316, "loss": 0.4121, "step": 680 }, { "epoch": 0.37825845812534664, "grad_norm": 0.3900391459465027, "learning_rate": 0.0001870408530467865, "loss": 0.523, "step": 682 }, { "epoch": 0.3793677204658902, "grad_norm": 0.3014692962169647, "learning_rate": 0.00018695225387310252, "loss": 0.4602, "step": 684 }, { "epoch": 0.3804769828064337, "grad_norm": 0.25084492564201355, "learning_rate": 0.00018686337399503219, "loss": 0.4827, "step": 686 }, { "epoch": 0.38158624514697725, "grad_norm": 0.3503668010234833, "learning_rate": 0.00018677421369950282, "loss": 0.4804, "step": 688 }, { "epoch": 0.3826955074875208, "grad_norm": 0.4252930283546448, "learning_rate": 0.00018668477327434687, "loss": 0.5005, "step": 690 }, { "epoch": 0.38380476982806433, "grad_norm": 0.3079698979854584, "learning_rate": 0.00018659505300830124, "loss": 0.5406, "step": 692 }, { "epoch": 0.38491403216860787, "grad_norm": 0.4625144600868225, "learning_rate": 0.00018650505319100618, "loss": 0.5213, "step": 694 }, { "epoch": 0.3860232945091514, "grad_norm": 0.35006484389305115, "learning_rate": 0.00018641477411300442, "loss": 0.4865, "step": 696 }, { "epoch": 0.38713255684969494, "grad_norm": 0.34998831152915955, "learning_rate": 0.0001863242160657401, "loss": 0.5338, "step": 698 }, { "epoch": 0.3882418191902385, "grad_norm": 0.40413737297058105, "learning_rate": 0.00018623337934155814, "loss": 0.4186, "step": 700 }, { "epoch": 0.389351081530782, "grad_norm": 0.29122504591941833, "learning_rate": 0.0001861422642337029, "loss": 0.4655, "step": 702 }, { "epoch": 0.39046034387132555, "grad_norm": 0.39020445942878723, "learning_rate": 0.00018605087103631767, "loss": 0.5498, "step": 704 }, { "epoch": 0.3915696062118691, "grad_norm": 0.26732391119003296, "learning_rate": 0.00018595920004444325, "loss": 0.4595, "step": 706 }, { "epoch": 0.39267886855241263, "grad_norm": 0.22082771360874176, "learning_rate": 0.00018586725155401735, "loss": 0.4043, "step": 708 }, { "epoch": 0.39378813089295617, "grad_norm": 0.2727007567882538, "learning_rate": 0.00018577502586187354, "loss": 0.5069, "step": 710 }, { "epoch": 0.3948973932334997, "grad_norm": 0.35480618476867676, "learning_rate": 0.0001856825232657402, "loss": 0.3777, "step": 712 }, { "epoch": 0.39600665557404324, "grad_norm": 0.4109397530555725, "learning_rate": 0.00018558974406423967, "loss": 0.5574, "step": 714 }, { "epoch": 0.3971159179145868, "grad_norm": 0.2954327166080475, "learning_rate": 0.00018549668855688723, "loss": 0.452, "step": 716 }, { "epoch": 0.3982251802551303, "grad_norm": 0.234983429312706, "learning_rate": 0.00018540335704409012, "loss": 0.4585, "step": 718 }, { "epoch": 0.39933444259567386, "grad_norm": 0.23228606581687927, "learning_rate": 0.00018530974982714667, "loss": 0.4854, "step": 720 }, { "epoch": 0.4004437049362174, "grad_norm": 0.4279260039329529, "learning_rate": 0.00018521586720824518, "loss": 0.39, "step": 722 }, { "epoch": 0.40155296727676093, "grad_norm": 0.4057956039905548, "learning_rate": 0.00018512170949046305, "loss": 0.5139, "step": 724 }, { "epoch": 0.40266222961730447, "grad_norm": 0.6070723533630371, "learning_rate": 0.0001850272769777658, "loss": 0.435, "step": 726 }, { "epoch": 0.403771491957848, "grad_norm": 0.31548893451690674, "learning_rate": 0.000184932569975006, "loss": 0.3534, "step": 728 }, { "epoch": 0.40488075429839154, "grad_norm": 0.25872763991355896, "learning_rate": 0.0001848375887879224, "loss": 0.3636, "step": 730 }, { "epoch": 0.4059900166389351, "grad_norm": 0.3243245780467987, "learning_rate": 0.00018474233372313878, "loss": 0.4528, "step": 732 }, { "epoch": 0.4070992789794787, "grad_norm": 0.2900272309780121, "learning_rate": 0.00018464680508816324, "loss": 0.4874, "step": 734 }, { "epoch": 0.4082085413200222, "grad_norm": 0.29125553369522095, "learning_rate": 0.00018455100319138694, "loss": 0.5529, "step": 736 }, { "epoch": 0.40931780366056575, "grad_norm": 0.36948704719543457, "learning_rate": 0.00018445492834208318, "loss": 0.4883, "step": 738 }, { "epoch": 0.4104270660011093, "grad_norm": 0.27613064646720886, "learning_rate": 0.00018435858085040643, "loss": 0.4219, "step": 740 }, { "epoch": 0.4115363283416528, "grad_norm": 0.33854255080223083, "learning_rate": 0.00018426196102739134, "loss": 0.4645, "step": 742 }, { "epoch": 0.41264559068219636, "grad_norm": 0.3527907729148865, "learning_rate": 0.00018416506918495176, "loss": 0.4726, "step": 744 }, { "epoch": 0.4137548530227399, "grad_norm": 0.45844894647598267, "learning_rate": 0.00018406790563587957, "loss": 0.5893, "step": 746 }, { "epoch": 0.41486411536328344, "grad_norm": 0.450692743062973, "learning_rate": 0.00018397047069384392, "loss": 0.5054, "step": 748 }, { "epoch": 0.415973377703827, "grad_norm": 0.2807348668575287, "learning_rate": 0.00018387276467338997, "loss": 0.3518, "step": 750 }, { "epoch": 0.4170826400443705, "grad_norm": 0.33243051171302795, "learning_rate": 0.00018377478788993813, "loss": 0.4834, "step": 752 }, { "epoch": 0.41819190238491405, "grad_norm": 0.27179285883903503, "learning_rate": 0.00018367654065978278, "loss": 0.4649, "step": 754 }, { "epoch": 0.4193011647254576, "grad_norm": 0.27518779039382935, "learning_rate": 0.00018357802330009137, "loss": 0.4289, "step": 756 }, { "epoch": 0.4204104270660011, "grad_norm": 0.7208437323570251, "learning_rate": 0.0001834792361289035, "loss": 0.6417, "step": 758 }, { "epoch": 0.42151968940654466, "grad_norm": 0.41777920722961426, "learning_rate": 0.0001833801794651297, "loss": 0.5582, "step": 760 }, { "epoch": 0.4226289517470882, "grad_norm": 0.3758167624473572, "learning_rate": 0.00018328085362855057, "loss": 0.4747, "step": 762 }, { "epoch": 0.42373821408763174, "grad_norm": 0.42126914858818054, "learning_rate": 0.00018318125893981556, "loss": 0.4996, "step": 764 }, { "epoch": 0.4248474764281753, "grad_norm": 0.3479858338832855, "learning_rate": 0.00018308139572044215, "loss": 0.4798, "step": 766 }, { "epoch": 0.4259567387687188, "grad_norm": 0.287246972322464, "learning_rate": 0.00018298126429281469, "loss": 0.4337, "step": 768 }, { "epoch": 0.42706600110926235, "grad_norm": 0.23056888580322266, "learning_rate": 0.00018288086498018327, "loss": 0.3374, "step": 770 }, { "epoch": 0.4281752634498059, "grad_norm": 0.25939542055130005, "learning_rate": 0.00018278019810666295, "loss": 0.4086, "step": 772 }, { "epoch": 0.4292845257903494, "grad_norm": 0.3149937391281128, "learning_rate": 0.0001826792639972324, "loss": 0.4979, "step": 774 }, { "epoch": 0.43039378813089296, "grad_norm": 0.27951163053512573, "learning_rate": 0.0001825780629777331, "loss": 0.4328, "step": 776 }, { "epoch": 0.4315030504714365, "grad_norm": 0.3050474524497986, "learning_rate": 0.00018247659537486813, "loss": 0.3905, "step": 778 }, { "epoch": 0.43261231281198004, "grad_norm": 0.34463128447532654, "learning_rate": 0.00018237486151620114, "loss": 0.4865, "step": 780 }, { "epoch": 0.4337215751525236, "grad_norm": 0.33969131112098694, "learning_rate": 0.0001822728617301554, "loss": 0.559, "step": 782 }, { "epoch": 0.4348308374930671, "grad_norm": 0.2980121970176697, "learning_rate": 0.0001821705963460126, "loss": 0.4415, "step": 784 }, { "epoch": 0.43594009983361065, "grad_norm": 0.2601833939552307, "learning_rate": 0.0001820680656939119, "loss": 0.3262, "step": 786 }, { "epoch": 0.4370493621741542, "grad_norm": 0.3651084005832672, "learning_rate": 0.0001819652701048488, "loss": 0.4405, "step": 788 }, { "epoch": 0.4381586245146977, "grad_norm": 0.2805362343788147, "learning_rate": 0.00018186220991067408, "loss": 0.3434, "step": 790 }, { "epoch": 0.43926788685524126, "grad_norm": 0.26046329736709595, "learning_rate": 0.00018175888544409266, "loss": 0.4936, "step": 792 }, { "epoch": 0.4403771491957848, "grad_norm": 0.3160867393016815, "learning_rate": 0.00018165529703866273, "loss": 0.5399, "step": 794 }, { "epoch": 0.44148641153632834, "grad_norm": 0.2804858088493347, "learning_rate": 0.00018155144502879445, "loss": 0.4354, "step": 796 }, { "epoch": 0.4425956738768719, "grad_norm": 0.3244550824165344, "learning_rate": 0.00018144732974974902, "loss": 0.4432, "step": 798 }, { "epoch": 0.4437049362174154, "grad_norm": 0.3002081513404846, "learning_rate": 0.00018134295153763747, "loss": 0.473, "step": 800 }, { "epoch": 0.44481419855795895, "grad_norm": 0.3061865270137787, "learning_rate": 0.00018123831072941978, "loss": 0.4445, "step": 802 }, { "epoch": 0.4459234608985025, "grad_norm": 0.2574765384197235, "learning_rate": 0.00018113340766290345, "loss": 0.3834, "step": 804 }, { "epoch": 0.447032723239046, "grad_norm": 0.4501776397228241, "learning_rate": 0.00018102824267674284, "loss": 0.5404, "step": 806 }, { "epoch": 0.44814198557958956, "grad_norm": 0.3008052706718445, "learning_rate": 0.00018092281611043767, "loss": 0.5067, "step": 808 }, { "epoch": 0.4492512479201331, "grad_norm": 0.2804155647754669, "learning_rate": 0.00018081712830433227, "loss": 0.502, "step": 810 }, { "epoch": 0.45036051026067664, "grad_norm": 0.28651684522628784, "learning_rate": 0.00018071117959961416, "loss": 0.4172, "step": 812 }, { "epoch": 0.4514697726012202, "grad_norm": 0.23672421276569366, "learning_rate": 0.00018060497033831326, "loss": 0.4926, "step": 814 }, { "epoch": 0.4525790349417637, "grad_norm": 0.2747003436088562, "learning_rate": 0.00018049850086330048, "loss": 0.392, "step": 816 }, { "epoch": 0.45368829728230725, "grad_norm": 0.33606404066085815, "learning_rate": 0.00018039177151828693, "loss": 0.4872, "step": 818 }, { "epoch": 0.4547975596228508, "grad_norm": 0.2584729790687561, "learning_rate": 0.00018028478264782254, "loss": 0.4253, "step": 820 }, { "epoch": 0.4559068219633943, "grad_norm": 0.25303879380226135, "learning_rate": 0.00018017753459729504, "loss": 0.3802, "step": 822 }, { "epoch": 0.45701608430393786, "grad_norm": 0.305148184299469, "learning_rate": 0.00018007002771292899, "loss": 0.4692, "step": 824 }, { "epoch": 0.4581253466444814, "grad_norm": 0.2929096519947052, "learning_rate": 0.00017996226234178435, "loss": 0.52, "step": 826 }, { "epoch": 0.45923460898502494, "grad_norm": 0.31055986881256104, "learning_rate": 0.00017985423883175569, "loss": 0.4284, "step": 828 }, { "epoch": 0.4603438713255685, "grad_norm": 0.27399498224258423, "learning_rate": 0.00017974595753157083, "loss": 0.4484, "step": 830 }, { "epoch": 0.461453133666112, "grad_norm": 0.27760255336761475, "learning_rate": 0.00017963741879078988, "loss": 0.3668, "step": 832 }, { "epoch": 0.46256239600665555, "grad_norm": 0.4046492278575897, "learning_rate": 0.000179528622959804, "loss": 0.4709, "step": 834 }, { "epoch": 0.4636716583471991, "grad_norm": 0.2615452706813812, "learning_rate": 0.00017941957038983427, "loss": 0.385, "step": 836 }, { "epoch": 0.4647809206877426, "grad_norm": 0.2847132086753845, "learning_rate": 0.0001793102614329306, "loss": 0.4561, "step": 838 }, { "epoch": 0.46589018302828616, "grad_norm": 0.26887843012809753, "learning_rate": 0.00017920069644197063, "loss": 0.4318, "step": 840 }, { "epoch": 0.4669994453688297, "grad_norm": 0.30276423692703247, "learning_rate": 0.00017909087577065852, "loss": 0.457, "step": 842 }, { "epoch": 0.46810870770937324, "grad_norm": 0.29458096623420715, "learning_rate": 0.00017898079977352383, "loss": 0.4328, "step": 844 }, { "epoch": 0.46921797004991683, "grad_norm": 0.3137955665588379, "learning_rate": 0.00017887046880592035, "loss": 0.5369, "step": 846 }, { "epoch": 0.47032723239046037, "grad_norm": 0.29104721546173096, "learning_rate": 0.00017875988322402505, "loss": 0.4281, "step": 848 }, { "epoch": 0.4714364947310039, "grad_norm": 0.3767394423484802, "learning_rate": 0.00017864904338483676, "loss": 0.4719, "step": 850 }, { "epoch": 0.47254575707154745, "grad_norm": 0.3240392804145813, "learning_rate": 0.00017853794964617522, "loss": 0.473, "step": 852 }, { "epoch": 0.473655019412091, "grad_norm": 0.28845375776290894, "learning_rate": 0.00017842660236667976, "loss": 0.5162, "step": 854 }, { "epoch": 0.4747642817526345, "grad_norm": 0.305284321308136, "learning_rate": 0.00017831500190580826, "loss": 0.4475, "step": 856 }, { "epoch": 0.47587354409317806, "grad_norm": 0.33257827162742615, "learning_rate": 0.00017820314862383586, "loss": 0.5563, "step": 858 }, { "epoch": 0.4769828064337216, "grad_norm": 0.3269551396369934, "learning_rate": 0.0001780910428818539, "loss": 0.466, "step": 860 }, { "epoch": 0.47809206877426513, "grad_norm": 0.3051519989967346, "learning_rate": 0.00017797868504176875, "loss": 0.4077, "step": 862 }, { "epoch": 0.47920133111480867, "grad_norm": 0.27613380551338196, "learning_rate": 0.00017786607546630063, "loss": 0.4272, "step": 864 }, { "epoch": 0.4803105934553522, "grad_norm": 0.29356688261032104, "learning_rate": 0.00017775321451898242, "loss": 0.449, "step": 866 }, { "epoch": 0.48141985579589575, "grad_norm": 0.2650798559188843, "learning_rate": 0.0001776401025641584, "loss": 0.3956, "step": 868 }, { "epoch": 0.4825291181364393, "grad_norm": 0.3456670641899109, "learning_rate": 0.00017752673996698328, "loss": 0.516, "step": 870 }, { "epoch": 0.4836383804769828, "grad_norm": 0.23135840892791748, "learning_rate": 0.0001774131270934209, "loss": 0.4079, "step": 872 }, { "epoch": 0.48474764281752636, "grad_norm": 0.24187476933002472, "learning_rate": 0.00017729926431024303, "loss": 0.338, "step": 874 }, { "epoch": 0.4858569051580699, "grad_norm": 0.32144275307655334, "learning_rate": 0.00017718515198502814, "loss": 0.4518, "step": 876 }, { "epoch": 0.48696616749861343, "grad_norm": 0.336337149143219, "learning_rate": 0.00017707079048616045, "loss": 0.4441, "step": 878 }, { "epoch": 0.48807542983915697, "grad_norm": 0.3350709080696106, "learning_rate": 0.00017695618018282843, "loss": 0.4252, "step": 880 }, { "epoch": 0.4891846921797005, "grad_norm": 0.37555792927742004, "learning_rate": 0.00017684132144502382, "loss": 0.5188, "step": 882 }, { "epoch": 0.49029395452024405, "grad_norm": 0.3198007047176361, "learning_rate": 0.00017672621464354034, "loss": 0.4292, "step": 884 }, { "epoch": 0.4914032168607876, "grad_norm": 0.3064371943473816, "learning_rate": 0.0001766108601499726, "loss": 0.4173, "step": 886 }, { "epoch": 0.4925124792013311, "grad_norm": 0.3241244852542877, "learning_rate": 0.00017649525833671477, "loss": 0.4137, "step": 888 }, { "epoch": 0.49362174154187466, "grad_norm": 0.348093181848526, "learning_rate": 0.00017637940957695934, "loss": 0.5232, "step": 890 }, { "epoch": 0.4947310038824182, "grad_norm": 0.2986682653427124, "learning_rate": 0.00017626331424469617, "loss": 0.403, "step": 892 }, { "epoch": 0.49584026622296173, "grad_norm": 0.2748506963253021, "learning_rate": 0.00017614697271471107, "loss": 0.453, "step": 894 }, { "epoch": 0.49694952856350527, "grad_norm": 0.3349149227142334, "learning_rate": 0.00017603038536258457, "loss": 0.5059, "step": 896 }, { "epoch": 0.4980587909040488, "grad_norm": 0.2706896662712097, "learning_rate": 0.0001759135525646908, "loss": 0.3997, "step": 898 }, { "epoch": 0.49916805324459235, "grad_norm": 0.283390074968338, "learning_rate": 0.00017579647469819634, "loss": 0.4285, "step": 900 }, { "epoch": 0.5002773155851359, "grad_norm": 0.33636030554771423, "learning_rate": 0.00017567915214105882, "loss": 0.4528, "step": 902 }, { "epoch": 0.5013865779256794, "grad_norm": 0.27481749653816223, "learning_rate": 0.00017556158527202585, "loss": 0.369, "step": 904 }, { "epoch": 0.502495840266223, "grad_norm": 0.2789921164512634, "learning_rate": 0.00017544377447063374, "loss": 0.4782, "step": 906 }, { "epoch": 0.5036051026067665, "grad_norm": 0.26627886295318604, "learning_rate": 0.00017532572011720617, "loss": 0.404, "step": 908 }, { "epoch": 0.50471436494731, "grad_norm": 0.27739235758781433, "learning_rate": 0.00017520742259285324, "loss": 0.4869, "step": 910 }, { "epoch": 0.5058236272878536, "grad_norm": 0.37541431188583374, "learning_rate": 0.00017508888227946994, "loss": 0.5678, "step": 912 }, { "epoch": 0.5069328896283971, "grad_norm": 0.42174577713012695, "learning_rate": 0.00017497009955973512, "loss": 0.5259, "step": 914 }, { "epoch": 0.5080421519689406, "grad_norm": 0.2786255180835724, "learning_rate": 0.00017485107481711012, "loss": 0.3499, "step": 916 }, { "epoch": 0.5091514143094842, "grad_norm": 0.35035160183906555, "learning_rate": 0.00017473180843583763, "loss": 0.4238, "step": 918 }, { "epoch": 0.5102606766500277, "grad_norm": 0.27141204476356506, "learning_rate": 0.00017461230080094043, "loss": 0.4411, "step": 920 }, { "epoch": 0.5113699389905713, "grad_norm": 0.33987727761268616, "learning_rate": 0.0001744925522982201, "loss": 0.5017, "step": 922 }, { "epoch": 0.5124792013311148, "grad_norm": 0.28170111775398254, "learning_rate": 0.0001743725633142558, "loss": 0.3683, "step": 924 }, { "epoch": 0.5135884636716583, "grad_norm": 0.3536321222782135, "learning_rate": 0.000174252334236403, "loss": 0.4578, "step": 926 }, { "epoch": 0.5146977260122019, "grad_norm": 0.2716483175754547, "learning_rate": 0.0001741318654527923, "loss": 0.4402, "step": 928 }, { "epoch": 0.5158069883527454, "grad_norm": 0.27619215846061707, "learning_rate": 0.00017401115735232817, "loss": 0.4859, "step": 930 }, { "epoch": 0.516916250693289, "grad_norm": 0.3302086591720581, "learning_rate": 0.0001738902103246876, "loss": 0.4283, "step": 932 }, { "epoch": 0.5180255130338325, "grad_norm": 0.3179870843887329, "learning_rate": 0.00017376902476031885, "loss": 0.4857, "step": 934 }, { "epoch": 0.519134775374376, "grad_norm": 0.2627505660057068, "learning_rate": 0.00017364760105044032, "loss": 0.4503, "step": 936 }, { "epoch": 0.5202440377149196, "grad_norm": 0.3984716832637787, "learning_rate": 0.00017352593958703922, "loss": 0.5252, "step": 938 }, { "epoch": 0.5213533000554631, "grad_norm": 0.2471695989370346, "learning_rate": 0.00017340404076287023, "loss": 0.4228, "step": 940 }, { "epoch": 0.5224625623960066, "grad_norm": 0.27779459953308105, "learning_rate": 0.00017328190497145428, "loss": 0.3583, "step": 942 }, { "epoch": 0.5235718247365502, "grad_norm": 0.30559295415878296, "learning_rate": 0.00017315953260707735, "loss": 0.4199, "step": 944 }, { "epoch": 0.5246810870770937, "grad_norm": 0.2473781406879425, "learning_rate": 0.0001730369240647891, "loss": 0.4046, "step": 946 }, { "epoch": 0.5257903494176372, "grad_norm": 0.2680453956127167, "learning_rate": 0.00017291407974040168, "loss": 0.4313, "step": 948 }, { "epoch": 0.5268996117581808, "grad_norm": 0.3109484612941742, "learning_rate": 0.00017279100003048833, "loss": 0.524, "step": 950 }, { "epoch": 0.5280088740987243, "grad_norm": 0.3450772166252136, "learning_rate": 0.00017266768533238228, "loss": 0.4348, "step": 952 }, { "epoch": 0.5291181364392679, "grad_norm": 0.3616882562637329, "learning_rate": 0.0001725441360441752, "loss": 0.4318, "step": 954 }, { "epoch": 0.5302273987798114, "grad_norm": 0.3496112823486328, "learning_rate": 0.00017242035256471625, "loss": 0.4405, "step": 956 }, { "epoch": 0.5313366611203549, "grad_norm": 0.33496448397636414, "learning_rate": 0.00017229633529361054, "loss": 0.4737, "step": 958 }, { "epoch": 0.5324459234608985, "grad_norm": 0.30328652262687683, "learning_rate": 0.0001721720846312179, "loss": 0.4184, "step": 960 }, { "epoch": 0.533555185801442, "grad_norm": 0.23888161778450012, "learning_rate": 0.00017204760097865168, "loss": 0.4652, "step": 962 }, { "epoch": 0.5346644481419855, "grad_norm": 0.33510059118270874, "learning_rate": 0.00017192288473777731, "loss": 0.4666, "step": 964 }, { "epoch": 0.5357737104825291, "grad_norm": 0.2732909619808197, "learning_rate": 0.00017179793631121108, "loss": 0.4723, "step": 966 }, { "epoch": 0.5368829728230726, "grad_norm": 0.2259574830532074, "learning_rate": 0.0001716727561023189, "loss": 0.3925, "step": 968 }, { "epoch": 0.5379922351636162, "grad_norm": 0.28999099135398865, "learning_rate": 0.0001715473445152149, "loss": 0.4329, "step": 970 }, { "epoch": 0.5391014975041597, "grad_norm": 0.26387131214141846, "learning_rate": 0.00017142170195476005, "loss": 0.4403, "step": 972 }, { "epoch": 0.5402107598447032, "grad_norm": 0.32671770453453064, "learning_rate": 0.00017129582882656122, "loss": 0.4605, "step": 974 }, { "epoch": 0.5413200221852468, "grad_norm": 0.2896054685115814, "learning_rate": 0.00017116972553696933, "loss": 0.4452, "step": 976 }, { "epoch": 0.5424292845257903, "grad_norm": 0.3383495509624481, "learning_rate": 0.0001710433924930785, "loss": 0.476, "step": 978 }, { "epoch": 0.5435385468663338, "grad_norm": 0.3241276741027832, "learning_rate": 0.00017091683010272447, "loss": 0.4016, "step": 980 }, { "epoch": 0.5446478092068774, "grad_norm": 0.24984405934810638, "learning_rate": 0.00017079003877448344, "loss": 0.3818, "step": 982 }, { "epoch": 0.5457570715474209, "grad_norm": 0.2572356164455414, "learning_rate": 0.00017066301891767061, "loss": 0.3587, "step": 984 }, { "epoch": 0.5468663338879645, "grad_norm": 0.2701364755630493, "learning_rate": 0.000170535770942339, "loss": 0.3153, "step": 986 }, { "epoch": 0.547975596228508, "grad_norm": 0.4156802296638489, "learning_rate": 0.00017040829525927798, "loss": 0.5246, "step": 988 }, { "epoch": 0.5490848585690515, "grad_norm": 0.33432650566101074, "learning_rate": 0.00017028059228001207, "loss": 0.4401, "step": 990 }, { "epoch": 0.5501941209095951, "grad_norm": 0.22204925119876862, "learning_rate": 0.00017015266241679952, "loss": 0.387, "step": 992 }, { "epoch": 0.5513033832501386, "grad_norm": 0.39056891202926636, "learning_rate": 0.0001700245060826311, "loss": 0.4271, "step": 994 }, { "epoch": 0.5524126455906821, "grad_norm": 0.36917203664779663, "learning_rate": 0.0001698961236912286, "loss": 0.4976, "step": 996 }, { "epoch": 0.5535219079312257, "grad_norm": 0.2684764564037323, "learning_rate": 0.00016976751565704362, "loss": 0.4287, "step": 998 }, { "epoch": 0.5546311702717692, "grad_norm": 0.7510117888450623, "learning_rate": 0.00016963868239525622, "loss": 0.4789, "step": 1000 }, { "epoch": 0.5557404326123128, "grad_norm": 0.2503146231174469, "learning_rate": 0.00016950962432177348, "loss": 0.3128, "step": 1002 }, { "epoch": 0.5568496949528563, "grad_norm": 0.271791011095047, "learning_rate": 0.0001693803418532283, "loss": 0.4387, "step": 1004 }, { "epoch": 0.5579589572933998, "grad_norm": 0.2682740092277527, "learning_rate": 0.0001692508354069779, "loss": 0.357, "step": 1006 }, { "epoch": 0.5590682196339434, "grad_norm": 0.2809149920940399, "learning_rate": 0.00016912110540110272, "loss": 0.4949, "step": 1008 }, { "epoch": 0.5601774819744869, "grad_norm": 0.300929456949234, "learning_rate": 0.0001689911522544047, "loss": 0.4333, "step": 1010 }, { "epoch": 0.5612867443150305, "grad_norm": 0.30407261848449707, "learning_rate": 0.00016886097638640633, "loss": 0.3449, "step": 1012 }, { "epoch": 0.562396006655574, "grad_norm": 0.24028092622756958, "learning_rate": 0.00016873057821734897, "loss": 0.3714, "step": 1014 }, { "epoch": 0.5635052689961176, "grad_norm": 0.19116805493831635, "learning_rate": 0.00016859995816819167, "loss": 0.3797, "step": 1016 }, { "epoch": 0.5646145313366612, "grad_norm": 0.30579307675361633, "learning_rate": 0.00016846911666060975, "loss": 0.408, "step": 1018 }, { "epoch": 0.5657237936772047, "grad_norm": 0.27811676263809204, "learning_rate": 0.00016833805411699347, "loss": 0.5091, "step": 1020 }, { "epoch": 0.5668330560177482, "grad_norm": 0.2803473174571991, "learning_rate": 0.00016820677096044667, "loss": 0.4186, "step": 1022 }, { "epoch": 0.5679423183582918, "grad_norm": 0.32603880763053894, "learning_rate": 0.00016807526761478535, "loss": 0.4373, "step": 1024 }, { "epoch": 0.5690515806988353, "grad_norm": 0.3556148111820221, "learning_rate": 0.0001679435445045364, "loss": 0.4877, "step": 1026 }, { "epoch": 0.5701608430393789, "grad_norm": 0.3248284161090851, "learning_rate": 0.00016781160205493607, "loss": 0.4164, "step": 1028 }, { "epoch": 0.5712701053799224, "grad_norm": 0.21463936567306519, "learning_rate": 0.00016767944069192878, "loss": 0.3279, "step": 1030 }, { "epoch": 0.5723793677204659, "grad_norm": 0.2522839605808258, "learning_rate": 0.00016754706084216555, "loss": 0.353, "step": 1032 }, { "epoch": 0.5734886300610095, "grad_norm": 0.2605576515197754, "learning_rate": 0.00016741446293300292, "loss": 0.4001, "step": 1034 }, { "epoch": 0.574597892401553, "grad_norm": 0.37828049063682556, "learning_rate": 0.0001672816473925012, "loss": 0.4332, "step": 1036 }, { "epoch": 0.5757071547420965, "grad_norm": 0.35154885053634644, "learning_rate": 0.00016714861464942336, "loss": 0.4899, "step": 1038 }, { "epoch": 0.5768164170826401, "grad_norm": 0.29194343090057373, "learning_rate": 0.0001670153651332335, "loss": 0.3842, "step": 1040 }, { "epoch": 0.5779256794231836, "grad_norm": 0.2817918658256531, "learning_rate": 0.0001668818992740956, "loss": 0.4277, "step": 1042 }, { "epoch": 0.5790349417637272, "grad_norm": 0.30157536268234253, "learning_rate": 0.00016674821750287198, "loss": 0.5127, "step": 1044 }, { "epoch": 0.5801442041042707, "grad_norm": 0.27585723996162415, "learning_rate": 0.00016661432025112202, "loss": 0.3875, "step": 1046 }, { "epoch": 0.5812534664448142, "grad_norm": 0.3143744468688965, "learning_rate": 0.00016648020795110072, "loss": 0.4697, "step": 1048 }, { "epoch": 0.5823627287853578, "grad_norm": 0.31697356700897217, "learning_rate": 0.00016634588103575727, "loss": 0.4986, "step": 1050 }, { "epoch": 0.5834719911259013, "grad_norm": 0.2747526168823242, "learning_rate": 0.00016621133993873372, "loss": 0.3629, "step": 1052 }, { "epoch": 0.5845812534664449, "grad_norm": 0.3277036249637604, "learning_rate": 0.0001660765850943636, "loss": 0.3862, "step": 1054 }, { "epoch": 0.5856905158069884, "grad_norm": 0.2825765311717987, "learning_rate": 0.00016594161693767048, "loss": 0.4167, "step": 1056 }, { "epoch": 0.5867997781475319, "grad_norm": 0.25938060879707336, "learning_rate": 0.00016580643590436643, "loss": 0.3921, "step": 1058 }, { "epoch": 0.5879090404880755, "grad_norm": 0.25034335255622864, "learning_rate": 0.00016567104243085082, "loss": 0.3272, "step": 1060 }, { "epoch": 0.589018302828619, "grad_norm": 0.3136093318462372, "learning_rate": 0.0001655354369542089, "loss": 0.389, "step": 1062 }, { "epoch": 0.5901275651691625, "grad_norm": 0.21099263429641724, "learning_rate": 0.00016539961991221018, "loss": 0.3347, "step": 1064 }, { "epoch": 0.5912368275097061, "grad_norm": 0.28475597500801086, "learning_rate": 0.0001652635917433073, "loss": 0.4019, "step": 1066 }, { "epoch": 0.5923460898502496, "grad_norm": 0.3473896384239197, "learning_rate": 0.00016512735288663435, "loss": 0.5109, "step": 1068 }, { "epoch": 0.5934553521907932, "grad_norm": 0.29283806681632996, "learning_rate": 0.00016499090378200565, "loss": 0.3714, "step": 1070 }, { "epoch": 0.5945646145313367, "grad_norm": 0.3305450677871704, "learning_rate": 0.00016485424486991428, "loss": 0.4171, "step": 1072 }, { "epoch": 0.5956738768718802, "grad_norm": 0.2901533544063568, "learning_rate": 0.00016471737659153054, "loss": 0.4221, "step": 1074 }, { "epoch": 0.5967831392124238, "grad_norm": 0.27300742268562317, "learning_rate": 0.00016458029938870063, "loss": 0.3829, "step": 1076 }, { "epoch": 0.5978924015529673, "grad_norm": 0.2941387891769409, "learning_rate": 0.00016444301370394535, "loss": 0.3936, "step": 1078 }, { "epoch": 0.5990016638935108, "grad_norm": 0.34587568044662476, "learning_rate": 0.00016430551998045833, "loss": 0.4583, "step": 1080 }, { "epoch": 0.6001109262340544, "grad_norm": 0.2715665400028229, "learning_rate": 0.00016416781866210496, "loss": 0.3773, "step": 1082 }, { "epoch": 0.6012201885745979, "grad_norm": 0.3803468942642212, "learning_rate": 0.00016402991019342074, "loss": 0.4065, "step": 1084 }, { "epoch": 0.6023294509151415, "grad_norm": 0.34928134083747864, "learning_rate": 0.0001638917950196099, "loss": 0.4144, "step": 1086 }, { "epoch": 0.603438713255685, "grad_norm": 0.312226802110672, "learning_rate": 0.000163753473586544, "loss": 0.5789, "step": 1088 }, { "epoch": 0.6045479755962285, "grad_norm": 0.28748974204063416, "learning_rate": 0.00016361494634076036, "loss": 0.4137, "step": 1090 }, { "epoch": 0.6056572379367721, "grad_norm": 0.2908242642879486, "learning_rate": 0.00016347621372946088, "loss": 0.5035, "step": 1092 }, { "epoch": 0.6067665002773156, "grad_norm": 0.2888891100883484, "learning_rate": 0.00016333727620051032, "loss": 0.4518, "step": 1094 }, { "epoch": 0.6078757626178591, "grad_norm": 0.40034082531929016, "learning_rate": 0.00016319813420243498, "loss": 0.5334, "step": 1096 }, { "epoch": 0.6089850249584027, "grad_norm": 0.32615357637405396, "learning_rate": 0.00016305878818442124, "loss": 0.3839, "step": 1098 }, { "epoch": 0.6100942872989462, "grad_norm": 0.213283509016037, "learning_rate": 0.00016291923859631418, "loss": 0.3875, "step": 1100 }, { "epoch": 0.6112035496394898, "grad_norm": 0.3383128345012665, "learning_rate": 0.0001627794858886159, "loss": 0.4749, "step": 1102 }, { "epoch": 0.6123128119800333, "grad_norm": 0.25966745615005493, "learning_rate": 0.0001626395305124844, "loss": 0.4304, "step": 1104 }, { "epoch": 0.6134220743205768, "grad_norm": 0.23151825368404388, "learning_rate": 0.00016249937291973186, "loss": 0.5318, "step": 1106 }, { "epoch": 0.6145313366611204, "grad_norm": 0.323313444852829, "learning_rate": 0.00016235901356282325, "loss": 0.4639, "step": 1108 }, { "epoch": 0.6156405990016639, "grad_norm": 0.30469194054603577, "learning_rate": 0.00016221845289487492, "loss": 0.4229, "step": 1110 }, { "epoch": 0.6167498613422074, "grad_norm": 0.2814004123210907, "learning_rate": 0.00016207769136965307, "loss": 0.3136, "step": 1112 }, { "epoch": 0.617859123682751, "grad_norm": 0.2871707081794739, "learning_rate": 0.00016193672944157241, "loss": 0.3771, "step": 1114 }, { "epoch": 0.6189683860232945, "grad_norm": 0.3541894555091858, "learning_rate": 0.0001617955675656945, "loss": 0.4944, "step": 1116 }, { "epoch": 0.620077648363838, "grad_norm": 0.2884300947189331, "learning_rate": 0.00016165420619772638, "loss": 0.447, "step": 1118 }, { "epoch": 0.6211869107043816, "grad_norm": 0.2753572165966034, "learning_rate": 0.0001615126457940192, "loss": 0.3346, "step": 1120 }, { "epoch": 0.6222961730449251, "grad_norm": 0.27947622537612915, "learning_rate": 0.00016137088681156654, "loss": 0.4008, "step": 1122 }, { "epoch": 0.6234054353854687, "grad_norm": 0.3177914023399353, "learning_rate": 0.00016122892970800318, "loss": 0.5574, "step": 1124 }, { "epoch": 0.6245146977260122, "grad_norm": 0.29654982686042786, "learning_rate": 0.0001610867749416033, "loss": 0.4083, "step": 1126 }, { "epoch": 0.6256239600665557, "grad_norm": 0.2599790394306183, "learning_rate": 0.00016094442297127936, "loss": 0.3301, "step": 1128 }, { "epoch": 0.6267332224070993, "grad_norm": 0.2212723195552826, "learning_rate": 0.00016080187425658035, "loss": 0.3485, "step": 1130 }, { "epoch": 0.6278424847476428, "grad_norm": 0.3127800226211548, "learning_rate": 0.0001606591292576904, "loss": 0.3452, "step": 1132 }, { "epoch": 0.6289517470881864, "grad_norm": 0.3709440529346466, "learning_rate": 0.00016051618843542737, "loss": 0.5686, "step": 1134 }, { "epoch": 0.6300610094287299, "grad_norm": 0.34709852933883667, "learning_rate": 0.0001603730522512412, "loss": 0.4726, "step": 1136 }, { "epoch": 0.6311702717692734, "grad_norm": 0.41093623638153076, "learning_rate": 0.00016022972116721258, "loss": 0.4736, "step": 1138 }, { "epoch": 0.632279534109817, "grad_norm": 0.34302660822868347, "learning_rate": 0.00016008619564605132, "loss": 0.4422, "step": 1140 }, { "epoch": 0.6333887964503605, "grad_norm": 0.25461357831954956, "learning_rate": 0.000159942476151095, "loss": 0.4575, "step": 1142 }, { "epoch": 0.634498058790904, "grad_norm": 0.29111289978027344, "learning_rate": 0.00015979856314630731, "loss": 0.4227, "step": 1144 }, { "epoch": 0.6356073211314476, "grad_norm": 0.2498675435781479, "learning_rate": 0.00015965445709627672, "loss": 0.4112, "step": 1146 }, { "epoch": 0.6367165834719911, "grad_norm": 0.23331956565380096, "learning_rate": 0.00015951015846621484, "loss": 0.3565, "step": 1148 }, { "epoch": 0.6378258458125347, "grad_norm": 0.19519895315170288, "learning_rate": 0.00015936566772195506, "loss": 0.3559, "step": 1150 }, { "epoch": 0.6389351081530782, "grad_norm": 0.27646857500076294, "learning_rate": 0.00015922098532995083, "loss": 0.5526, "step": 1152 }, { "epoch": 0.6400443704936217, "grad_norm": 0.2813306152820587, "learning_rate": 0.00015907611175727443, "loss": 0.4691, "step": 1154 }, { "epoch": 0.6411536328341653, "grad_norm": 0.27971866726875305, "learning_rate": 0.00015893104747161525, "loss": 0.4229, "step": 1156 }, { "epoch": 0.6422628951747088, "grad_norm": 0.2833665907382965, "learning_rate": 0.00015878579294127833, "loss": 0.3703, "step": 1158 }, { "epoch": 0.6433721575152523, "grad_norm": 0.2487824559211731, "learning_rate": 0.00015864034863518294, "loss": 0.4713, "step": 1160 }, { "epoch": 0.6444814198557959, "grad_norm": 0.2890799641609192, "learning_rate": 0.0001584947150228609, "loss": 0.4088, "step": 1162 }, { "epoch": 0.6455906821963394, "grad_norm": 0.38614341616630554, "learning_rate": 0.00015834889257445526, "loss": 0.6233, "step": 1164 }, { "epoch": 0.646699944536883, "grad_norm": 0.32349148392677307, "learning_rate": 0.00015820288176071861, "loss": 0.4304, "step": 1166 }, { "epoch": 0.6478092068774265, "grad_norm": 0.3689476549625397, "learning_rate": 0.0001580566830530117, "loss": 0.4791, "step": 1168 }, { "epoch": 0.64891846921797, "grad_norm": 0.33342450857162476, "learning_rate": 0.00015791029692330174, "loss": 0.5544, "step": 1170 }, { "epoch": 0.6500277315585136, "grad_norm": 0.27900078892707825, "learning_rate": 0.00015776372384416107, "loss": 0.3984, "step": 1172 }, { "epoch": 0.6511369938990571, "grad_norm": 0.2558038532733917, "learning_rate": 0.00015761696428876558, "loss": 0.3949, "step": 1174 }, { "epoch": 0.6522462562396006, "grad_norm": 0.3070181906223297, "learning_rate": 0.00015747001873089305, "loss": 0.4118, "step": 1176 }, { "epoch": 0.6533555185801442, "grad_norm": 0.3025922477245331, "learning_rate": 0.00015732288764492184, "loss": 0.4185, "step": 1178 }, { "epoch": 0.6544647809206877, "grad_norm": 0.2864340543746948, "learning_rate": 0.0001571755715058292, "loss": 0.4129, "step": 1180 }, { "epoch": 0.6555740432612313, "grad_norm": 0.30463624000549316, "learning_rate": 0.00015702807078918967, "loss": 0.3837, "step": 1182 }, { "epoch": 0.6566833056017748, "grad_norm": 0.36540600657463074, "learning_rate": 0.0001568803859711738, "loss": 0.4379, "step": 1184 }, { "epoch": 0.6577925679423183, "grad_norm": 0.30166247487068176, "learning_rate": 0.00015673251752854644, "loss": 0.4748, "step": 1186 }, { "epoch": 0.6589018302828619, "grad_norm": 0.2592626214027405, "learning_rate": 0.00015658446593866518, "loss": 0.4211, "step": 1188 }, { "epoch": 0.6600110926234054, "grad_norm": 0.3025457262992859, "learning_rate": 0.0001564362316794789, "loss": 0.4014, "step": 1190 }, { "epoch": 0.6611203549639489, "grad_norm": 0.3164603114128113, "learning_rate": 0.00015628781522952613, "loss": 0.3479, "step": 1192 }, { "epoch": 0.6622296173044925, "grad_norm": 0.3951297700405121, "learning_rate": 0.00015613921706793363, "loss": 0.5119, "step": 1194 }, { "epoch": 0.663338879645036, "grad_norm": 0.3717001974582672, "learning_rate": 0.00015599043767441473, "loss": 0.6534, "step": 1196 }, { "epoch": 0.6644481419855796, "grad_norm": 0.3157028555870056, "learning_rate": 0.0001558414775292678, "loss": 0.4616, "step": 1198 }, { "epoch": 0.6655574043261231, "grad_norm": 0.3201621472835541, "learning_rate": 0.00015569233711337476, "loss": 0.5525, "step": 1200 }, { "epoch": 0.6666666666666666, "grad_norm": 0.2802707254886627, "learning_rate": 0.00015554301690819952, "loss": 0.371, "step": 1202 }, { "epoch": 0.6677759290072102, "grad_norm": 0.2569611668586731, "learning_rate": 0.00015539351739578632, "loss": 0.4024, "step": 1204 }, { "epoch": 0.6688851913477537, "grad_norm": 0.20614203810691833, "learning_rate": 0.0001552438390587583, "loss": 0.4023, "step": 1206 }, { "epoch": 0.6699944536882972, "grad_norm": 0.28919216990470886, "learning_rate": 0.00015509398238031588, "loss": 0.5277, "step": 1208 }, { "epoch": 0.6711037160288408, "grad_norm": 0.3089144825935364, "learning_rate": 0.00015494394784423525, "loss": 0.4952, "step": 1210 }, { "epoch": 0.6722129783693843, "grad_norm": 0.3368174433708191, "learning_rate": 0.00015479373593486667, "loss": 0.5854, "step": 1212 }, { "epoch": 0.6733222407099279, "grad_norm": 0.45089298486709595, "learning_rate": 0.0001546433471371331, "loss": 0.4141, "step": 1214 }, { "epoch": 0.6744315030504714, "grad_norm": 0.3095158040523529, "learning_rate": 0.00015449278193652854, "loss": 0.4969, "step": 1216 }, { "epoch": 0.6755407653910149, "grad_norm": 0.36336401104927063, "learning_rate": 0.00015434204081911642, "loss": 0.4679, "step": 1218 }, { "epoch": 0.6766500277315585, "grad_norm": 0.3150896430015564, "learning_rate": 0.00015419112427152807, "loss": 0.5038, "step": 1220 }, { "epoch": 0.677759290072102, "grad_norm": 0.29561394453048706, "learning_rate": 0.0001540400327809612, "loss": 0.4697, "step": 1222 }, { "epoch": 0.6788685524126455, "grad_norm": 0.2975095212459564, "learning_rate": 0.00015388876683517826, "loss": 0.5544, "step": 1224 }, { "epoch": 0.6799778147531891, "grad_norm": 0.2124488651752472, "learning_rate": 0.00015373732692250486, "loss": 0.3321, "step": 1226 }, { "epoch": 0.6810870770937326, "grad_norm": 0.37978988885879517, "learning_rate": 0.00015358571353182824, "loss": 0.5268, "step": 1228 }, { "epoch": 0.6821963394342762, "grad_norm": 0.32258403301239014, "learning_rate": 0.0001534339271525957, "loss": 0.4983, "step": 1230 }, { "epoch": 0.6833056017748197, "grad_norm": 0.3182342052459717, "learning_rate": 0.00015328196827481302, "loss": 0.4742, "step": 1232 }, { "epoch": 0.6844148641153632, "grad_norm": 0.26553046703338623, "learning_rate": 0.0001531298373890427, "loss": 0.4627, "step": 1234 }, { "epoch": 0.6855241264559068, "grad_norm": 0.3853413164615631, "learning_rate": 0.0001529775349864026, "loss": 0.5493, "step": 1236 }, { "epoch": 0.6866333887964503, "grad_norm": 0.26181191205978394, "learning_rate": 0.0001528250615585644, "loss": 0.4308, "step": 1238 }, { "epoch": 0.687742651136994, "grad_norm": 0.29632505774497986, "learning_rate": 0.0001526724175977518, "loss": 0.3972, "step": 1240 }, { "epoch": 0.6888519134775375, "grad_norm": 0.21180076897144318, "learning_rate": 0.000152519603596739, "loss": 0.2871, "step": 1242 }, { "epoch": 0.689961175818081, "grad_norm": 0.3028866946697235, "learning_rate": 0.00015236662004884912, "loss": 0.5045, "step": 1244 }, { "epoch": 0.6910704381586246, "grad_norm": 0.22230632603168488, "learning_rate": 0.0001522134674479527, "loss": 0.3913, "step": 1246 }, { "epoch": 0.6921797004991681, "grad_norm": 0.3115563690662384, "learning_rate": 0.00015206014628846594, "loss": 0.4612, "step": 1248 }, { "epoch": 0.6932889628397116, "grad_norm": 0.30068281292915344, "learning_rate": 0.00015190665706534925, "loss": 0.4224, "step": 1250 }, { "epoch": 0.6943982251802552, "grad_norm": 0.3180410861968994, "learning_rate": 0.00015175300027410566, "loss": 0.4094, "step": 1252 }, { "epoch": 0.6955074875207987, "grad_norm": 0.3450130224227905, "learning_rate": 0.00015159917641077895, "loss": 0.5635, "step": 1254 }, { "epoch": 0.6966167498613423, "grad_norm": 0.27590128779411316, "learning_rate": 0.00015144518597195243, "loss": 0.4893, "step": 1256 }, { "epoch": 0.6977260122018858, "grad_norm": 0.25257548689842224, "learning_rate": 0.0001512910294547471, "loss": 0.4252, "step": 1258 }, { "epoch": 0.6988352745424293, "grad_norm": 0.3455219864845276, "learning_rate": 0.00015113670735682013, "loss": 0.5274, "step": 1260 }, { "epoch": 0.6999445368829729, "grad_norm": 0.3079371750354767, "learning_rate": 0.0001509822201763632, "loss": 0.3667, "step": 1262 }, { "epoch": 0.7010537992235164, "grad_norm": 0.27760395407676697, "learning_rate": 0.00015082756841210086, "loss": 0.4693, "step": 1264 }, { "epoch": 0.7021630615640599, "grad_norm": 0.3691544830799103, "learning_rate": 0.0001506727525632891, "loss": 0.4975, "step": 1266 }, { "epoch": 0.7032723239046035, "grad_norm": 0.30312782526016235, "learning_rate": 0.00015051777312971357, "loss": 0.4377, "step": 1268 }, { "epoch": 0.704381586245147, "grad_norm": 0.31264883279800415, "learning_rate": 0.00015036263061168797, "loss": 0.3841, "step": 1270 }, { "epoch": 0.7054908485856906, "grad_norm": 0.27693864703178406, "learning_rate": 0.0001502073255100525, "loss": 0.4347, "step": 1272 }, { "epoch": 0.7066001109262341, "grad_norm": 0.3322737514972687, "learning_rate": 0.0001500518583261723, "loss": 0.4424, "step": 1274 }, { "epoch": 0.7077093732667776, "grad_norm": 0.2735716998577118, "learning_rate": 0.0001498962295619356, "loss": 0.3577, "step": 1276 }, { "epoch": 0.7088186356073212, "grad_norm": 0.2565724849700928, "learning_rate": 0.00014974043971975243, "loss": 0.4086, "step": 1278 }, { "epoch": 0.7099278979478647, "grad_norm": 0.4010816216468811, "learning_rate": 0.00014958448930255265, "loss": 0.5353, "step": 1280 }, { "epoch": 0.7110371602884082, "grad_norm": 0.29970696568489075, "learning_rate": 0.00014942837881378465, "loss": 0.4261, "step": 1282 }, { "epoch": 0.7121464226289518, "grad_norm": 0.36919617652893066, "learning_rate": 0.00014927210875741347, "loss": 0.4935, "step": 1284 }, { "epoch": 0.7132556849694953, "grad_norm": 0.3045351505279541, "learning_rate": 0.00014911567963791928, "loss": 0.4191, "step": 1286 }, { "epoch": 0.7143649473100389, "grad_norm": 0.30813243985176086, "learning_rate": 0.00014895909196029585, "loss": 0.3992, "step": 1288 }, { "epoch": 0.7154742096505824, "grad_norm": 0.3444893956184387, "learning_rate": 0.00014880234623004866, "loss": 0.4351, "step": 1290 }, { "epoch": 0.7165834719911259, "grad_norm": 0.29215720295906067, "learning_rate": 0.00014864544295319356, "loss": 0.4323, "step": 1292 }, { "epoch": 0.7176927343316695, "grad_norm": 0.37148571014404297, "learning_rate": 0.00014848838263625496, "loss": 0.4463, "step": 1294 }, { "epoch": 0.718801996672213, "grad_norm": 0.38779592514038086, "learning_rate": 0.00014833116578626417, "loss": 0.5293, "step": 1296 }, { "epoch": 0.7199112590127565, "grad_norm": 0.31231850385665894, "learning_rate": 0.00014817379291075792, "loss": 0.4575, "step": 1298 }, { "epoch": 0.7210205213533001, "grad_norm": 0.2966194450855255, "learning_rate": 0.00014801626451777658, "loss": 0.4337, "step": 1300 }, { "epoch": 0.7221297836938436, "grad_norm": 0.2993515133857727, "learning_rate": 0.00014785858111586258, "loss": 0.3994, "step": 1302 }, { "epoch": 0.7232390460343872, "grad_norm": 0.27827030420303345, "learning_rate": 0.00014770074321405878, "loss": 0.4079, "step": 1304 }, { "epoch": 0.7243483083749307, "grad_norm": 0.2576323449611664, "learning_rate": 0.00014754275132190678, "loss": 0.3798, "step": 1306 }, { "epoch": 0.7254575707154742, "grad_norm": 0.24843810498714447, "learning_rate": 0.0001473846059494453, "loss": 0.4564, "step": 1308 }, { "epoch": 0.7265668330560178, "grad_norm": 0.2799087166786194, "learning_rate": 0.00014722630760720856, "loss": 0.3906, "step": 1310 }, { "epoch": 0.7276760953965613, "grad_norm": 0.3635210394859314, "learning_rate": 0.00014706785680622462, "loss": 0.4409, "step": 1312 }, { "epoch": 0.7287853577371048, "grad_norm": 0.30838248133659363, "learning_rate": 0.0001469092540580136, "loss": 0.4268, "step": 1314 }, { "epoch": 0.7298946200776484, "grad_norm": 0.2162405252456665, "learning_rate": 0.0001467504998745863, "loss": 0.4085, "step": 1316 }, { "epoch": 0.7310038824181919, "grad_norm": 0.30635640025138855, "learning_rate": 0.00014659159476844232, "loss": 0.4089, "step": 1318 }, { "epoch": 0.7321131447587355, "grad_norm": 0.4048236012458801, "learning_rate": 0.00014643253925256846, "loss": 0.5283, "step": 1320 }, { "epoch": 0.733222407099279, "grad_norm": 0.4575350284576416, "learning_rate": 0.00014627333384043713, "loss": 0.4675, "step": 1322 }, { "epoch": 0.7343316694398225, "grad_norm": 0.3289128243923187, "learning_rate": 0.00014611397904600458, "loss": 0.4777, "step": 1324 }, { "epoch": 0.7354409317803661, "grad_norm": 0.29231977462768555, "learning_rate": 0.00014595447538370935, "loss": 0.3851, "step": 1326 }, { "epoch": 0.7365501941209096, "grad_norm": 0.33795663714408875, "learning_rate": 0.00014579482336847058, "loss": 0.475, "step": 1328 }, { "epoch": 0.7376594564614531, "grad_norm": 0.20978425443172455, "learning_rate": 0.00014563502351568625, "loss": 0.3455, "step": 1330 }, { "epoch": 0.7387687188019967, "grad_norm": 0.4113425016403198, "learning_rate": 0.00014547507634123176, "loss": 0.3741, "step": 1332 }, { "epoch": 0.7398779811425402, "grad_norm": 0.38009563088417053, "learning_rate": 0.0001453149823614579, "loss": 0.53, "step": 1334 }, { "epoch": 0.7409872434830838, "grad_norm": 0.2306888997554779, "learning_rate": 0.00014515474209318948, "loss": 0.4229, "step": 1336 }, { "epoch": 0.7420965058236273, "grad_norm": 0.326107382774353, "learning_rate": 0.00014499435605372366, "loss": 0.3731, "step": 1338 }, { "epoch": 0.7432057681641708, "grad_norm": 0.26597580313682556, "learning_rate": 0.00014483382476082802, "loss": 0.3841, "step": 1340 }, { "epoch": 0.7443150305047144, "grad_norm": 0.3690161406993866, "learning_rate": 0.00014467314873273918, "loss": 0.4814, "step": 1342 }, { "epoch": 0.7454242928452579, "grad_norm": 0.30208808183670044, "learning_rate": 0.0001445123284881609, "loss": 0.4522, "step": 1344 }, { "epoch": 0.7465335551858014, "grad_norm": 0.28505003452301025, "learning_rate": 0.00014435136454626264, "loss": 0.4283, "step": 1346 }, { "epoch": 0.747642817526345, "grad_norm": 0.2772189974784851, "learning_rate": 0.0001441902574266776, "loss": 0.3964, "step": 1348 }, { "epoch": 0.7487520798668885, "grad_norm": 0.4741387665271759, "learning_rate": 0.0001440290076495013, "loss": 0.4577, "step": 1350 }, { "epoch": 0.7498613422074321, "grad_norm": 0.3234037160873413, "learning_rate": 0.00014386761573528976, "loss": 0.4542, "step": 1352 }, { "epoch": 0.7509706045479756, "grad_norm": 0.3096826374530792, "learning_rate": 0.0001437060822050579, "loss": 0.3892, "step": 1354 }, { "epoch": 0.7520798668885191, "grad_norm": 0.2626829743385315, "learning_rate": 0.00014354440758027772, "loss": 0.4441, "step": 1356 }, { "epoch": 0.7531891292290627, "grad_norm": 0.2835614085197449, "learning_rate": 0.00014338259238287678, "loss": 0.4905, "step": 1358 }, { "epoch": 0.7542983915696062, "grad_norm": 0.23285934329032898, "learning_rate": 0.00014322063713523647, "loss": 0.3911, "step": 1360 }, { "epoch": 0.7554076539101497, "grad_norm": 0.2746225595474243, "learning_rate": 0.00014305854236019018, "loss": 0.4681, "step": 1362 }, { "epoch": 0.7565169162506933, "grad_norm": 0.30551669001579285, "learning_rate": 0.0001428963085810219, "loss": 0.3883, "step": 1364 }, { "epoch": 0.7576261785912368, "grad_norm": 0.3372795283794403, "learning_rate": 0.0001427339363214642, "loss": 0.4452, "step": 1366 }, { "epoch": 0.7587354409317804, "grad_norm": 0.2697629928588867, "learning_rate": 0.00014257142610569682, "loss": 0.3853, "step": 1368 }, { "epoch": 0.7598447032723239, "grad_norm": 0.29290249943733215, "learning_rate": 0.00014240877845834472, "loss": 0.4694, "step": 1370 }, { "epoch": 0.7609539656128674, "grad_norm": 0.298874169588089, "learning_rate": 0.00014224599390447672, "loss": 0.4644, "step": 1372 }, { "epoch": 0.762063227953411, "grad_norm": 0.23659248650074005, "learning_rate": 0.00014208307296960344, "loss": 0.395, "step": 1374 }, { "epoch": 0.7631724902939545, "grad_norm": 0.2965550124645233, "learning_rate": 0.00014192001617967587, "loss": 0.5332, "step": 1376 }, { "epoch": 0.764281752634498, "grad_norm": 0.3338853716850281, "learning_rate": 0.00014175682406108352, "loss": 0.5176, "step": 1378 }, { "epoch": 0.7653910149750416, "grad_norm": 0.24134789407253265, "learning_rate": 0.0001415934971406528, "loss": 0.4224, "step": 1380 }, { "epoch": 0.7665002773155851, "grad_norm": 0.3920575678348541, "learning_rate": 0.00014143003594564528, "loss": 0.4627, "step": 1382 }, { "epoch": 0.7676095396561287, "grad_norm": 0.3521714508533478, "learning_rate": 0.00014126644100375603, "loss": 0.446, "step": 1384 }, { "epoch": 0.7687188019966722, "grad_norm": 0.2819899022579193, "learning_rate": 0.0001411027128431119, "loss": 0.3637, "step": 1386 }, { "epoch": 0.7698280643372157, "grad_norm": 0.1896730363368988, "learning_rate": 0.00014093885199226972, "loss": 0.3206, "step": 1388 }, { "epoch": 0.7709373266777593, "grad_norm": 0.21066512167453766, "learning_rate": 0.0001407748589802148, "loss": 0.3081, "step": 1390 }, { "epoch": 0.7720465890183028, "grad_norm": 0.2417469471693039, "learning_rate": 0.000140610734336359, "loss": 0.3954, "step": 1392 }, { "epoch": 0.7731558513588463, "grad_norm": 0.41810843348503113, "learning_rate": 0.00014044647859053915, "loss": 0.521, "step": 1394 }, { "epoch": 0.7742651136993899, "grad_norm": 0.21894732117652893, "learning_rate": 0.00014028209227301533, "loss": 0.342, "step": 1396 }, { "epoch": 0.7753743760399334, "grad_norm": 0.31191563606262207, "learning_rate": 0.00014011757591446918, "loss": 0.4173, "step": 1398 }, { "epoch": 0.776483638380477, "grad_norm": 0.34966176748275757, "learning_rate": 0.00013995293004600206, "loss": 0.4993, "step": 1400 }, { "epoch": 0.7775929007210205, "grad_norm": 0.3630419671535492, "learning_rate": 0.00013978815519913345, "loss": 0.4331, "step": 1402 }, { "epoch": 0.778702163061564, "grad_norm": 0.2934836447238922, "learning_rate": 0.00013962325190579919, "loss": 0.4618, "step": 1404 }, { "epoch": 0.7798114254021076, "grad_norm": 0.330842524766922, "learning_rate": 0.00013945822069834983, "loss": 0.4437, "step": 1406 }, { "epoch": 0.7809206877426511, "grad_norm": 0.24101948738098145, "learning_rate": 0.0001392930621095489, "loss": 0.4065, "step": 1408 }, { "epoch": 0.7820299500831946, "grad_norm": 0.5105953216552734, "learning_rate": 0.00013912777667257094, "loss": 0.5135, "step": 1410 }, { "epoch": 0.7831392124237382, "grad_norm": 0.2571849226951599, "learning_rate": 0.00013896236492100025, "loss": 0.4153, "step": 1412 }, { "epoch": 0.7842484747642817, "grad_norm": 0.27226725220680237, "learning_rate": 0.00013879682738882873, "loss": 0.3522, "step": 1414 }, { "epoch": 0.7853577371048253, "grad_norm": 0.2498832792043686, "learning_rate": 0.0001386311646104544, "loss": 0.4186, "step": 1416 }, { "epoch": 0.7864669994453688, "grad_norm": 0.18799692392349243, "learning_rate": 0.00013846537712067962, "loss": 0.3278, "step": 1418 }, { "epoch": 0.7875762617859123, "grad_norm": 0.3741225600242615, "learning_rate": 0.0001382994654547093, "loss": 0.4816, "step": 1420 }, { "epoch": 0.7886855241264559, "grad_norm": 0.34847941994667053, "learning_rate": 0.00013813343014814925, "loss": 0.5673, "step": 1422 }, { "epoch": 0.7897947864669994, "grad_norm": 0.38350898027420044, "learning_rate": 0.00013796727173700444, "loss": 0.5009, "step": 1424 }, { "epoch": 0.790904048807543, "grad_norm": 0.2870427668094635, "learning_rate": 0.0001378009907576772, "loss": 0.4333, "step": 1426 }, { "epoch": 0.7920133111480865, "grad_norm": 0.2194262146949768, "learning_rate": 0.00013763458774696563, "loss": 0.3433, "step": 1428 }, { "epoch": 0.79312257348863, "grad_norm": 0.26900714635849, "learning_rate": 0.00013746806324206173, "loss": 0.5099, "step": 1430 }, { "epoch": 0.7942318358291736, "grad_norm": 0.2423945814371109, "learning_rate": 0.00013730141778054962, "loss": 0.333, "step": 1432 }, { "epoch": 0.7953410981697171, "grad_norm": 0.31677019596099854, "learning_rate": 0.00013713465190040415, "loss": 0.4285, "step": 1434 }, { "epoch": 0.7964503605102606, "grad_norm": 0.33169832825660706, "learning_rate": 0.0001369677661399886, "loss": 0.4058, "step": 1436 }, { "epoch": 0.7975596228508042, "grad_norm": 0.3489621579647064, "learning_rate": 0.0001368007610380535, "loss": 0.4153, "step": 1438 }, { "epoch": 0.7986688851913477, "grad_norm": 0.2639998495578766, "learning_rate": 0.00013663363713373454, "loss": 0.3959, "step": 1440 }, { "epoch": 0.7997781475318912, "grad_norm": 0.27134037017822266, "learning_rate": 0.0001364663949665509, "loss": 0.4331, "step": 1442 }, { "epoch": 0.8008874098724348, "grad_norm": 0.28077998757362366, "learning_rate": 0.00013629903507640369, "loss": 0.5282, "step": 1444 }, { "epoch": 0.8019966722129783, "grad_norm": 0.2948680818080902, "learning_rate": 0.00013613155800357385, "loss": 0.3951, "step": 1446 }, { "epoch": 0.8031059345535219, "grad_norm": 0.3196534812450409, "learning_rate": 0.0001359639642887208, "loss": 0.4191, "step": 1448 }, { "epoch": 0.8042151968940654, "grad_norm": 0.26202476024627686, "learning_rate": 0.00013579625447288044, "loss": 0.352, "step": 1450 }, { "epoch": 0.8053244592346089, "grad_norm": 0.24866290390491486, "learning_rate": 0.00013562842909746342, "loss": 0.3203, "step": 1452 }, { "epoch": 0.8064337215751525, "grad_norm": 0.41027557849884033, "learning_rate": 0.00013546048870425356, "loss": 0.4187, "step": 1454 }, { "epoch": 0.807542983915696, "grad_norm": 0.34640318155288696, "learning_rate": 0.0001352924338354059, "loss": 0.3204, "step": 1456 }, { "epoch": 0.8086522462562395, "grad_norm": 0.3971330523490906, "learning_rate": 0.0001351242650334451, "loss": 0.4598, "step": 1458 }, { "epoch": 0.8097615085967831, "grad_norm": 0.3868078887462616, "learning_rate": 0.0001349559828412635, "loss": 0.3641, "step": 1460 }, { "epoch": 0.8108707709373266, "grad_norm": 0.26136353611946106, "learning_rate": 0.00013478758780211965, "loss": 0.4286, "step": 1462 }, { "epoch": 0.8119800332778702, "grad_norm": 0.2987557351589203, "learning_rate": 0.00013461908045963634, "loss": 0.4286, "step": 1464 }, { "epoch": 0.8130892956184138, "grad_norm": 0.33275967836380005, "learning_rate": 0.00013445046135779885, "loss": 0.3616, "step": 1466 }, { "epoch": 0.8141985579589573, "grad_norm": 0.2795950770378113, "learning_rate": 0.00013428173104095331, "loss": 0.5246, "step": 1468 }, { "epoch": 0.8153078202995009, "grad_norm": 0.3052369952201843, "learning_rate": 0.00013411289005380494, "loss": 0.3672, "step": 1470 }, { "epoch": 0.8164170826400444, "grad_norm": 0.2728108763694763, "learning_rate": 0.00013394393894141605, "loss": 0.4897, "step": 1472 }, { "epoch": 0.817526344980588, "grad_norm": 0.3310782015323639, "learning_rate": 0.00013377487824920459, "loss": 0.5144, "step": 1474 }, { "epoch": 0.8186356073211315, "grad_norm": 0.4153352677822113, "learning_rate": 0.00013360570852294227, "loss": 0.5313, "step": 1476 }, { "epoch": 0.819744869661675, "grad_norm": 0.4107600450515747, "learning_rate": 0.00013343643030875276, "loss": 0.4873, "step": 1478 }, { "epoch": 0.8208541320022186, "grad_norm": 0.2750249207019806, "learning_rate": 0.00013326704415311, "loss": 0.4373, "step": 1480 }, { "epoch": 0.8219633943427621, "grad_norm": 0.2961103618144989, "learning_rate": 0.00013309755060283626, "loss": 0.4252, "step": 1482 }, { "epoch": 0.8230726566833056, "grad_norm": 0.25156864523887634, "learning_rate": 0.00013292795020510066, "loss": 0.307, "step": 1484 }, { "epoch": 0.8241819190238492, "grad_norm": 0.2984169125556946, "learning_rate": 0.00013275824350741716, "loss": 0.4416, "step": 1486 }, { "epoch": 0.8252911813643927, "grad_norm": 0.27711641788482666, "learning_rate": 0.00013258843105764297, "loss": 0.3382, "step": 1488 }, { "epoch": 0.8264004437049363, "grad_norm": 0.3105831742286682, "learning_rate": 0.00013241851340397656, "loss": 0.4276, "step": 1490 }, { "epoch": 0.8275097060454798, "grad_norm": 0.3303448557853699, "learning_rate": 0.00013224849109495622, "loss": 0.4679, "step": 1492 }, { "epoch": 0.8286189683860233, "grad_norm": 0.3527246415615082, "learning_rate": 0.00013207836467945785, "loss": 0.5059, "step": 1494 }, { "epoch": 0.8297282307265669, "grad_norm": 0.41127604246139526, "learning_rate": 0.00013190813470669363, "loss": 0.5412, "step": 1496 }, { "epoch": 0.8308374930671104, "grad_norm": 0.24502909183502197, "learning_rate": 0.00013173780172620999, "loss": 0.3072, "step": 1498 }, { "epoch": 0.831946755407654, "grad_norm": 0.21016305685043335, "learning_rate": 0.00013156736628788584, "loss": 0.3889, "step": 1500 }, { "epoch": 0.8330560177481975, "grad_norm": 0.5300863981246948, "learning_rate": 0.000131396828941931, "loss": 0.5858, "step": 1502 }, { "epoch": 0.834165280088741, "grad_norm": 0.2194632738828659, "learning_rate": 0.00013122619023888402, "loss": 0.3433, "step": 1504 }, { "epoch": 0.8352745424292846, "grad_norm": 0.7686997056007385, "learning_rate": 0.00013105545072961093, "loss": 0.5005, "step": 1506 }, { "epoch": 0.8363838047698281, "grad_norm": 0.2888992428779602, "learning_rate": 0.00013088461096530304, "loss": 0.3987, "step": 1508 }, { "epoch": 0.8374930671103716, "grad_norm": 0.2540237009525299, "learning_rate": 0.00013071367149747535, "loss": 0.4531, "step": 1510 }, { "epoch": 0.8386023294509152, "grad_norm": 0.3219870924949646, "learning_rate": 0.00013054263287796465, "loss": 0.561, "step": 1512 }, { "epoch": 0.8397115917914587, "grad_norm": 0.24539148807525635, "learning_rate": 0.00013037149565892794, "loss": 0.3225, "step": 1514 }, { "epoch": 0.8408208541320022, "grad_norm": 0.23382632434368134, "learning_rate": 0.00013020026039284045, "loss": 0.3557, "step": 1516 }, { "epoch": 0.8419301164725458, "grad_norm": 0.2739519476890564, "learning_rate": 0.00013002892763249398, "loss": 0.3635, "step": 1518 }, { "epoch": 0.8430393788130893, "grad_norm": 0.23864848911762238, "learning_rate": 0.000129857497930995, "loss": 0.393, "step": 1520 }, { "epoch": 0.8441486411536329, "grad_norm": 0.3150426149368286, "learning_rate": 0.00012968597184176298, "loss": 0.4337, "step": 1522 }, { "epoch": 0.8452579034941764, "grad_norm": 0.25435131788253784, "learning_rate": 0.00012951434991852857, "loss": 0.3681, "step": 1524 }, { "epoch": 0.8463671658347199, "grad_norm": 0.3123234808444977, "learning_rate": 0.0001293426327153317, "loss": 0.4089, "step": 1526 }, { "epoch": 0.8474764281752635, "grad_norm": 0.52870112657547, "learning_rate": 0.00012917082078652, "loss": 0.4779, "step": 1528 }, { "epoch": 0.848585690515807, "grad_norm": 0.2784457206726074, "learning_rate": 0.00012899891468674688, "loss": 0.4163, "step": 1530 }, { "epoch": 0.8496949528563505, "grad_norm": 0.26158082485198975, "learning_rate": 0.0001288269149709697, "loss": 0.464, "step": 1532 }, { "epoch": 0.8508042151968941, "grad_norm": 0.2595478594303131, "learning_rate": 0.00012865482219444804, "loss": 0.3473, "step": 1534 }, { "epoch": 0.8519134775374376, "grad_norm": 0.28629809617996216, "learning_rate": 0.000128482636912742, "loss": 0.4626, "step": 1536 }, { "epoch": 0.8530227398779812, "grad_norm": 0.2769101560115814, "learning_rate": 0.00012831035968171025, "loss": 0.5318, "step": 1538 }, { "epoch": 0.8541320022185247, "grad_norm": 0.32585474848747253, "learning_rate": 0.00012813799105750823, "loss": 0.4428, "step": 1540 }, { "epoch": 0.8552412645590682, "grad_norm": 0.2759438157081604, "learning_rate": 0.00012796553159658653, "loss": 0.3853, "step": 1542 }, { "epoch": 0.8563505268996118, "grad_norm": 0.34193405508995056, "learning_rate": 0.0001277929818556889, "loss": 0.5144, "step": 1544 }, { "epoch": 0.8574597892401553, "grad_norm": 0.3385421335697174, "learning_rate": 0.00012762034239185063, "loss": 0.4466, "step": 1546 }, { "epoch": 0.8585690515806988, "grad_norm": 0.3747256398200989, "learning_rate": 0.00012744761376239655, "loss": 0.517, "step": 1548 }, { "epoch": 0.8596783139212424, "grad_norm": 0.31231391429901123, "learning_rate": 0.00012727479652493943, "loss": 0.3797, "step": 1550 }, { "epoch": 0.8607875762617859, "grad_norm": 0.2660077214241028, "learning_rate": 0.00012710189123737802, "loss": 0.3481, "step": 1552 }, { "epoch": 0.8618968386023295, "grad_norm": 0.34666526317596436, "learning_rate": 0.00012692889845789538, "loss": 0.4369, "step": 1554 }, { "epoch": 0.863006100942873, "grad_norm": 0.26898959279060364, "learning_rate": 0.00012675581874495697, "loss": 0.4428, "step": 1556 }, { "epoch": 0.8641153632834165, "grad_norm": 0.378214567899704, "learning_rate": 0.0001265826526573089, "loss": 0.5425, "step": 1558 }, { "epoch": 0.8652246256239601, "grad_norm": 0.3740502595901489, "learning_rate": 0.0001264094007539762, "loss": 0.5318, "step": 1560 }, { "epoch": 0.8663338879645036, "grad_norm": 0.25904256105422974, "learning_rate": 0.00012623606359426077, "loss": 0.4032, "step": 1562 }, { "epoch": 0.8674431503050472, "grad_norm": 0.29247143864631653, "learning_rate": 0.00012606264173773988, "loss": 0.4446, "step": 1564 }, { "epoch": 0.8685524126455907, "grad_norm": 0.2748325765132904, "learning_rate": 0.0001258891357442642, "loss": 0.4435, "step": 1566 }, { "epoch": 0.8696616749861342, "grad_norm": 0.4215485155582428, "learning_rate": 0.00012571554617395598, "loss": 0.4285, "step": 1568 }, { "epoch": 0.8707709373266778, "grad_norm": 0.2875049412250519, "learning_rate": 0.00012554187358720725, "loss": 0.4743, "step": 1570 }, { "epoch": 0.8718801996672213, "grad_norm": 0.3484318256378174, "learning_rate": 0.00012536811854467817, "loss": 0.4997, "step": 1572 }, { "epoch": 0.8729894620077648, "grad_norm": 0.33628854155540466, "learning_rate": 0.0001251942816072949, "loss": 0.5044, "step": 1574 }, { "epoch": 0.8740987243483084, "grad_norm": 0.28208592534065247, "learning_rate": 0.00012502036333624815, "loss": 0.3405, "step": 1576 }, { "epoch": 0.8752079866888519, "grad_norm": 0.2113669067621231, "learning_rate": 0.00012484636429299114, "loss": 0.3996, "step": 1578 }, { "epoch": 0.8763172490293955, "grad_norm": 0.2464189976453781, "learning_rate": 0.00012467228503923773, "loss": 0.3572, "step": 1580 }, { "epoch": 0.877426511369939, "grad_norm": 0.391923189163208, "learning_rate": 0.00012449812613696094, "loss": 0.6324, "step": 1582 }, { "epoch": 0.8785357737104825, "grad_norm": 0.2782968282699585, "learning_rate": 0.0001243238881483907, "loss": 0.352, "step": 1584 }, { "epoch": 0.8796450360510261, "grad_norm": 0.30209866166114807, "learning_rate": 0.00012414957163601236, "loss": 0.4611, "step": 1586 }, { "epoch": 0.8807542983915696, "grad_norm": 0.2990707457065582, "learning_rate": 0.0001239751771625648, "loss": 0.5419, "step": 1588 }, { "epoch": 0.8818635607321131, "grad_norm": 0.25346022844314575, "learning_rate": 0.00012380070529103852, "loss": 0.4205, "step": 1590 }, { "epoch": 0.8829728230726567, "grad_norm": 0.33787423372268677, "learning_rate": 0.00012362615658467377, "loss": 0.4186, "step": 1592 }, { "epoch": 0.8840820854132002, "grad_norm": 0.23708541691303253, "learning_rate": 0.00012345153160695917, "loss": 0.3261, "step": 1594 }, { "epoch": 0.8851913477537438, "grad_norm": 0.22642351686954498, "learning_rate": 0.00012327683092162918, "loss": 0.3415, "step": 1596 }, { "epoch": 0.8863006100942873, "grad_norm": 0.27935388684272766, "learning_rate": 0.00012310205509266292, "loss": 0.3471, "step": 1598 }, { "epoch": 0.8874098724348308, "grad_norm": 0.23772071301937103, "learning_rate": 0.000122927204684282, "loss": 0.3386, "step": 1600 }, { "epoch": 0.8885191347753744, "grad_norm": 0.3254069983959198, "learning_rate": 0.00012275228026094881, "loss": 0.5074, "step": 1602 }, { "epoch": 0.8896283971159179, "grad_norm": 0.37417536973953247, "learning_rate": 0.00012257728238736467, "loss": 0.5318, "step": 1604 }, { "epoch": 0.8907376594564614, "grad_norm": 0.35727638006210327, "learning_rate": 0.000122402211628468, "loss": 0.4729, "step": 1606 }, { "epoch": 0.891846921797005, "grad_norm": 0.26733312010765076, "learning_rate": 0.00012222706854943255, "loss": 0.421, "step": 1608 }, { "epoch": 0.8929561841375485, "grad_norm": 0.22087961435317993, "learning_rate": 0.00012205185371566554, "loss": 0.3354, "step": 1610 }, { "epoch": 0.894065446478092, "grad_norm": 0.4256139099597931, "learning_rate": 0.00012187656769280578, "loss": 0.4, "step": 1612 }, { "epoch": 0.8951747088186356, "grad_norm": 0.2818162441253662, "learning_rate": 0.00012170121104672196, "loss": 0.4098, "step": 1614 }, { "epoch": 0.8962839711591791, "grad_norm": 0.2936331331729889, "learning_rate": 0.00012152578434351071, "loss": 0.436, "step": 1616 }, { "epoch": 0.8973932334997227, "grad_norm": 0.2814910113811493, "learning_rate": 0.00012135028814949487, "loss": 0.4096, "step": 1618 }, { "epoch": 0.8985024958402662, "grad_norm": 0.3062569797039032, "learning_rate": 0.00012117472303122157, "loss": 0.4595, "step": 1620 }, { "epoch": 0.8996117581808097, "grad_norm": 0.3199828565120697, "learning_rate": 0.00012099908955546044, "loss": 0.4696, "step": 1622 }, { "epoch": 0.9007210205213533, "grad_norm": 0.35935017466545105, "learning_rate": 0.00012082338828920185, "loss": 0.5822, "step": 1624 }, { "epoch": 0.9018302828618968, "grad_norm": 0.2030808925628662, "learning_rate": 0.00012064761979965497, "loss": 0.3524, "step": 1626 }, { "epoch": 0.9029395452024404, "grad_norm": 0.29535773396492004, "learning_rate": 0.00012047178465424596, "loss": 0.3698, "step": 1628 }, { "epoch": 0.9040488075429839, "grad_norm": 0.26572179794311523, "learning_rate": 0.00012029588342061621, "loss": 0.3789, "step": 1630 }, { "epoch": 0.9051580698835274, "grad_norm": 0.4271789491176605, "learning_rate": 0.00012011991666662044, "loss": 0.5669, "step": 1632 }, { "epoch": 0.906267332224071, "grad_norm": 0.35716575384140015, "learning_rate": 0.00011994388496032487, "loss": 0.4521, "step": 1634 }, { "epoch": 0.9073765945646145, "grad_norm": 0.2956486642360687, "learning_rate": 0.00011976778887000543, "loss": 0.3755, "step": 1636 }, { "epoch": 0.908485856905158, "grad_norm": 0.3578818738460541, "learning_rate": 0.0001195916289641459, "loss": 0.4935, "step": 1638 }, { "epoch": 0.9095951192457016, "grad_norm": 0.3232196867465973, "learning_rate": 0.00011941540581143608, "loss": 0.4826, "step": 1640 }, { "epoch": 0.9107043815862451, "grad_norm": 0.2944696247577667, "learning_rate": 0.00011923911998076988, "loss": 0.3827, "step": 1642 }, { "epoch": 0.9118136439267887, "grad_norm": 0.27748194336891174, "learning_rate": 0.00011906277204124363, "loss": 0.5143, "step": 1644 }, { "epoch": 0.9129229062673322, "grad_norm": 0.2819176912307739, "learning_rate": 0.00011888636256215413, "loss": 0.4159, "step": 1646 }, { "epoch": 0.9140321686078757, "grad_norm": 0.3712371289730072, "learning_rate": 0.00011870989211299686, "loss": 0.5419, "step": 1648 }, { "epoch": 0.9151414309484193, "grad_norm": 0.2566871643066406, "learning_rate": 0.00011853336126346406, "loss": 0.4926, "step": 1650 }, { "epoch": 0.9162506932889628, "grad_norm": 0.38075196743011475, "learning_rate": 0.0001183567705834431, "loss": 0.4784, "step": 1652 }, { "epoch": 0.9173599556295063, "grad_norm": 0.30149558186531067, "learning_rate": 0.00011818012064301433, "loss": 0.3791, "step": 1654 }, { "epoch": 0.9184692179700499, "grad_norm": 0.37024736404418945, "learning_rate": 0.00011800341201244954, "loss": 0.4495, "step": 1656 }, { "epoch": 0.9195784803105934, "grad_norm": 0.30697816610336304, "learning_rate": 0.00011782664526220992, "loss": 0.385, "step": 1658 }, { "epoch": 0.920687742651137, "grad_norm": 0.48160433769226074, "learning_rate": 0.00011764982096294432, "loss": 0.3435, "step": 1660 }, { "epoch": 0.9217970049916805, "grad_norm": 0.3319704234600067, "learning_rate": 0.00011747293968548734, "loss": 0.4893, "step": 1662 }, { "epoch": 0.922906267332224, "grad_norm": 0.2384938895702362, "learning_rate": 0.00011729600200085752, "loss": 0.4826, "step": 1664 }, { "epoch": 0.9240155296727676, "grad_norm": 0.2830295264720917, "learning_rate": 0.00011711900848025555, "loss": 0.5185, "step": 1666 }, { "epoch": 0.9251247920133111, "grad_norm": 0.3078785836696625, "learning_rate": 0.0001169419596950623, "loss": 0.6303, "step": 1668 }, { "epoch": 0.9262340543538546, "grad_norm": 0.2661837637424469, "learning_rate": 0.00011676485621683713, "loss": 0.4059, "step": 1670 }, { "epoch": 0.9273433166943982, "grad_norm": 0.2619323134422302, "learning_rate": 0.00011658769861731584, "loss": 0.3383, "step": 1672 }, { "epoch": 0.9284525790349417, "grad_norm": 0.3200634717941284, "learning_rate": 0.00011641048746840912, "loss": 0.42, "step": 1674 }, { "epoch": 0.9295618413754853, "grad_norm": 0.29915183782577515, "learning_rate": 0.00011623322334220038, "loss": 0.4156, "step": 1676 }, { "epoch": 0.9306711037160288, "grad_norm": 0.36972782015800476, "learning_rate": 0.0001160559068109441, "loss": 0.3698, "step": 1678 }, { "epoch": 0.9317803660565723, "grad_norm": 0.24870358407497406, "learning_rate": 0.00011587853844706397, "loss": 0.4126, "step": 1680 }, { "epoch": 0.9328896283971159, "grad_norm": 0.2665979862213135, "learning_rate": 0.000115701118823151, "loss": 0.4097, "step": 1682 }, { "epoch": 0.9339988907376594, "grad_norm": 0.34804749488830566, "learning_rate": 0.00011552364851196167, "loss": 0.3956, "step": 1684 }, { "epoch": 0.9351081530782029, "grad_norm": 0.2750212550163269, "learning_rate": 0.00011534612808641603, "loss": 0.3434, "step": 1686 }, { "epoch": 0.9362174154187465, "grad_norm": 0.249044269323349, "learning_rate": 0.00011516855811959604, "loss": 0.4786, "step": 1688 }, { "epoch": 0.93732667775929, "grad_norm": 0.29392093420028687, "learning_rate": 0.00011499093918474348, "loss": 0.3028, "step": 1690 }, { "epoch": 0.9384359400998337, "grad_norm": 0.2747836112976074, "learning_rate": 0.00011481327185525828, "loss": 0.4296, "step": 1692 }, { "epoch": 0.9395452024403772, "grad_norm": 0.3494579493999481, "learning_rate": 0.00011463555670469657, "loss": 0.3412, "step": 1694 }, { "epoch": 0.9406544647809207, "grad_norm": 0.28468579053878784, "learning_rate": 0.00011445779430676884, "loss": 0.5185, "step": 1696 }, { "epoch": 0.9417637271214643, "grad_norm": 0.27110087871551514, "learning_rate": 0.0001142799852353382, "loss": 0.3075, "step": 1698 }, { "epoch": 0.9428729894620078, "grad_norm": 0.38002222776412964, "learning_rate": 0.00011410213006441827, "loss": 0.6445, "step": 1700 }, { "epoch": 0.9439822518025514, "grad_norm": 0.27994948625564575, "learning_rate": 0.00011392422936817166, "loss": 0.3741, "step": 1702 }, { "epoch": 0.9450915141430949, "grad_norm": 0.26837414503097534, "learning_rate": 0.00011374628372090783, "loss": 0.3902, "step": 1704 }, { "epoch": 0.9462007764836384, "grad_norm": 0.2525213062763214, "learning_rate": 0.00011356829369708146, "loss": 0.397, "step": 1706 }, { "epoch": 0.947310038824182, "grad_norm": 0.24402830004692078, "learning_rate": 0.00011339025987129032, "loss": 0.349, "step": 1708 }, { "epoch": 0.9484193011647255, "grad_norm": 0.2694087624549866, "learning_rate": 0.0001132121828182738, "loss": 0.4212, "step": 1710 }, { "epoch": 0.949528563505269, "grad_norm": 0.24677637219429016, "learning_rate": 0.00011303406311291065, "loss": 0.4076, "step": 1712 }, { "epoch": 0.9506378258458126, "grad_norm": 0.23484551906585693, "learning_rate": 0.00011285590133021741, "loss": 0.3533, "step": 1714 }, { "epoch": 0.9517470881863561, "grad_norm": 0.2949685752391815, "learning_rate": 0.00011267769804534647, "loss": 0.4117, "step": 1716 }, { "epoch": 0.9528563505268997, "grad_norm": 0.9004955887794495, "learning_rate": 0.00011249945383358414, "loss": 0.4805, "step": 1718 }, { "epoch": 0.9539656128674432, "grad_norm": 0.33412787318229675, "learning_rate": 0.00011232116927034893, "loss": 0.5482, "step": 1720 }, { "epoch": 0.9550748752079867, "grad_norm": 0.24728718400001526, "learning_rate": 0.00011214284493118948, "loss": 0.329, "step": 1722 }, { "epoch": 0.9561841375485303, "grad_norm": 0.3215670883655548, "learning_rate": 0.00011196448139178298, "loss": 0.4933, "step": 1724 }, { "epoch": 0.9572933998890738, "grad_norm": 0.21227394044399261, "learning_rate": 0.00011178607922793307, "loss": 0.3171, "step": 1726 }, { "epoch": 0.9584026622296173, "grad_norm": 0.22638051211833954, "learning_rate": 0.0001116076390155682, "loss": 0.4248, "step": 1728 }, { "epoch": 0.9595119245701609, "grad_norm": 0.24086622893810272, "learning_rate": 0.00011142916133073948, "loss": 0.3039, "step": 1730 }, { "epoch": 0.9606211869107044, "grad_norm": 0.25115931034088135, "learning_rate": 0.00011125064674961913, "loss": 0.3836, "step": 1732 }, { "epoch": 0.961730449251248, "grad_norm": 0.274597704410553, "learning_rate": 0.00011107209584849845, "loss": 0.393, "step": 1734 }, { "epoch": 0.9628397115917915, "grad_norm": 0.2955986559391022, "learning_rate": 0.00011089350920378592, "loss": 0.3905, "step": 1736 }, { "epoch": 0.963948973932335, "grad_norm": 0.3580799400806427, "learning_rate": 0.00011071488739200551, "loss": 0.3933, "step": 1738 }, { "epoch": 0.9650582362728786, "grad_norm": 0.3148622512817383, "learning_rate": 0.00011053623098979465, "loss": 0.5094, "step": 1740 }, { "epoch": 0.9661674986134221, "grad_norm": 0.23635894060134888, "learning_rate": 0.00011035754057390247, "loss": 0.4101, "step": 1742 }, { "epoch": 0.9672767609539656, "grad_norm": 0.3264128863811493, "learning_rate": 0.00011017881672118786, "loss": 0.3513, "step": 1744 }, { "epoch": 0.9683860232945092, "grad_norm": 0.2713950276374817, "learning_rate": 0.00011000006000861771, "loss": 0.41, "step": 1746 }, { "epoch": 0.9694952856350527, "grad_norm": 0.4210665822029114, "learning_rate": 0.00010982127101326498, "loss": 0.3747, "step": 1748 }, { "epoch": 0.9706045479755963, "grad_norm": 0.21227708458900452, "learning_rate": 0.00010964245031230684, "loss": 0.4087, "step": 1750 }, { "epoch": 0.9717138103161398, "grad_norm": 0.4090538024902344, "learning_rate": 0.00010946359848302275, "loss": 0.5652, "step": 1752 }, { "epoch": 0.9728230726566833, "grad_norm": 0.2974146604537964, "learning_rate": 0.00010928471610279278, "loss": 0.3913, "step": 1754 }, { "epoch": 0.9739323349972269, "grad_norm": 0.2675114870071411, "learning_rate": 0.00010910580374909551, "loss": 0.4146, "step": 1756 }, { "epoch": 0.9750415973377704, "grad_norm": 0.2641230821609497, "learning_rate": 0.0001089268619995064, "loss": 0.4093, "step": 1758 }, { "epoch": 0.9761508596783139, "grad_norm": 0.30527427792549133, "learning_rate": 0.00010874789143169568, "loss": 0.4269, "step": 1760 }, { "epoch": 0.9772601220188575, "grad_norm": 0.25456637144088745, "learning_rate": 0.0001085688926234267, "loss": 0.4085, "step": 1762 }, { "epoch": 0.978369384359401, "grad_norm": 0.2829885184764862, "learning_rate": 0.00010838986615255397, "loss": 0.4017, "step": 1764 }, { "epoch": 0.9794786466999446, "grad_norm": 0.31530699133872986, "learning_rate": 0.00010821081259702128, "loss": 0.3994, "step": 1766 }, { "epoch": 0.9805879090404881, "grad_norm": 0.26515138149261475, "learning_rate": 0.00010803173253485983, "loss": 0.3345, "step": 1768 }, { "epoch": 0.9816971713810316, "grad_norm": 0.26404955983161926, "learning_rate": 0.00010785262654418647, "loss": 0.3681, "step": 1770 }, { "epoch": 0.9828064337215752, "grad_norm": 0.35473427176475525, "learning_rate": 0.00010767349520320167, "loss": 0.4161, "step": 1772 }, { "epoch": 0.9839156960621187, "grad_norm": 0.26969531178474426, "learning_rate": 0.00010749433909018778, "loss": 0.4034, "step": 1774 }, { "epoch": 0.9850249584026622, "grad_norm": 0.949966311454773, "learning_rate": 0.0001073151587835071, "loss": 0.4101, "step": 1776 }, { "epoch": 0.9861342207432058, "grad_norm": 0.26887133717536926, "learning_rate": 0.00010713595486160013, "loss": 0.365, "step": 1778 }, { "epoch": 0.9872434830837493, "grad_norm": 0.2992730140686035, "learning_rate": 0.00010695672790298341, "loss": 0.4104, "step": 1780 }, { "epoch": 0.9883527454242929, "grad_norm": 0.29282405972480774, "learning_rate": 0.00010677747848624804, "loss": 0.3768, "step": 1782 }, { "epoch": 0.9894620077648364, "grad_norm": 0.3003996014595032, "learning_rate": 0.00010659820719005748, "loss": 0.3454, "step": 1784 }, { "epoch": 0.9905712701053799, "grad_norm": 0.3009726107120514, "learning_rate": 0.00010641891459314597, "loss": 0.3189, "step": 1786 }, { "epoch": 0.9916805324459235, "grad_norm": 0.32993796467781067, "learning_rate": 0.00010623960127431636, "loss": 0.3873, "step": 1788 }, { "epoch": 0.992789794786467, "grad_norm": 0.28225094079971313, "learning_rate": 0.00010606026781243847, "loss": 0.3549, "step": 1790 }, { "epoch": 0.9938990571270105, "grad_norm": 0.30678993463516235, "learning_rate": 0.00010588091478644715, "loss": 0.3856, "step": 1792 }, { "epoch": 0.9950083194675541, "grad_norm": 0.25429633259773254, "learning_rate": 0.00010570154277534042, "loss": 0.3276, "step": 1794 }, { "epoch": 0.9961175818080976, "grad_norm": 0.20127154886722565, "learning_rate": 0.00010552215235817754, "loss": 0.3675, "step": 1796 }, { "epoch": 0.9972268441486412, "grad_norm": 0.2773531973361969, "learning_rate": 0.00010534274411407725, "loss": 0.3993, "step": 1798 }, { "epoch": 0.9983361064891847, "grad_norm": 0.3768630921840668, "learning_rate": 0.00010516331862221582, "loss": 0.4841, "step": 1800 }, { "epoch": 0.9994453688297282, "grad_norm": 0.3159618079662323, "learning_rate": 0.00010498387646182512, "loss": 0.4443, "step": 1802 }, { "epoch": 1.0, "eval_loss": 0.38768309354782104, "eval_runtime": 38.6863, "eval_samples_per_second": 58.031, "eval_steps_per_second": 29.028, "step": 1803 }, { "epoch": 1.0005546311702718, "grad_norm": 0.18592089414596558, "learning_rate": 0.00010480441821219096, "loss": 0.3043, "step": 1804 }, { "epoch": 1.0016638935108153, "grad_norm": 0.23864133656024933, "learning_rate": 0.00010462494445265098, "loss": 0.3108, "step": 1806 }, { "epoch": 1.0027731558513588, "grad_norm": 0.33824145793914795, "learning_rate": 0.00010444545576259304, "loss": 0.3777, "step": 1808 }, { "epoch": 1.0038824181919024, "grad_norm": 0.2805129289627075, "learning_rate": 0.00010426595272145296, "loss": 0.3282, "step": 1810 }, { "epoch": 1.004991680532446, "grad_norm": 0.2824130058288574, "learning_rate": 0.00010408643590871312, "loss": 0.3395, "step": 1812 }, { "epoch": 1.0061009428729895, "grad_norm": 0.2482519894838333, "learning_rate": 0.00010390690590390023, "loss": 0.3296, "step": 1814 }, { "epoch": 1.007210205213533, "grad_norm": 0.31125134229660034, "learning_rate": 0.00010372736328658363, "loss": 0.3626, "step": 1816 }, { "epoch": 1.0083194675540765, "grad_norm": 0.2817044258117676, "learning_rate": 0.00010354780863637339, "loss": 0.3466, "step": 1818 }, { "epoch": 1.00942872989462, "grad_norm": 0.28125834465026855, "learning_rate": 0.00010336824253291837, "loss": 0.3274, "step": 1820 }, { "epoch": 1.0105379922351636, "grad_norm": 0.18501298129558563, "learning_rate": 0.0001031886655559045, "loss": 0.2943, "step": 1822 }, { "epoch": 1.0116472545757071, "grad_norm": 0.3577679395675659, "learning_rate": 0.00010300907828505269, "loss": 0.3546, "step": 1824 }, { "epoch": 1.0127565169162507, "grad_norm": 0.26521244645118713, "learning_rate": 0.00010282948130011715, "loss": 0.3626, "step": 1826 }, { "epoch": 1.0138657792567942, "grad_norm": 0.25927355885505676, "learning_rate": 0.00010264987518088347, "loss": 0.4119, "step": 1828 }, { "epoch": 1.0149750415973378, "grad_norm": 0.2581847608089447, "learning_rate": 0.0001024702605071667, "loss": 0.3306, "step": 1830 }, { "epoch": 1.0160843039378813, "grad_norm": 0.2600712478160858, "learning_rate": 0.00010229063785880948, "loss": 0.3486, "step": 1832 }, { "epoch": 1.0171935662784248, "grad_norm": 0.2579622268676758, "learning_rate": 0.00010211100781568024, "loss": 0.3191, "step": 1834 }, { "epoch": 1.0183028286189684, "grad_norm": 0.3741399645805359, "learning_rate": 0.00010193137095767125, "loss": 0.4502, "step": 1836 }, { "epoch": 1.019412090959512, "grad_norm": 0.31590569019317627, "learning_rate": 0.0001017517278646968, "loss": 0.3766, "step": 1838 }, { "epoch": 1.0205213533000554, "grad_norm": 0.2550656199455261, "learning_rate": 0.00010157207911669132, "loss": 0.3151, "step": 1840 }, { "epoch": 1.021630615640599, "grad_norm": 0.28483080863952637, "learning_rate": 0.00010139242529360744, "loss": 0.3957, "step": 1842 }, { "epoch": 1.0227398779811425, "grad_norm": 0.27653104066848755, "learning_rate": 0.00010121276697541427, "loss": 0.4507, "step": 1844 }, { "epoch": 1.023849140321686, "grad_norm": 0.24907124042510986, "learning_rate": 0.00010103310474209528, "loss": 0.4098, "step": 1846 }, { "epoch": 1.0249584026622296, "grad_norm": 0.26746734976768494, "learning_rate": 0.00010085343917364675, "loss": 0.376, "step": 1848 }, { "epoch": 1.0260676650027731, "grad_norm": 0.3076433539390564, "learning_rate": 0.00010067377085007561, "loss": 0.4063, "step": 1850 }, { "epoch": 1.0271769273433167, "grad_norm": 0.2736207842826843, "learning_rate": 0.00010049410035139774, "loss": 0.3646, "step": 1852 }, { "epoch": 1.0282861896838602, "grad_norm": 0.2720719277858734, "learning_rate": 0.00010031442825763603, "loss": 0.361, "step": 1854 }, { "epoch": 1.0293954520244037, "grad_norm": 0.2995122969150543, "learning_rate": 0.00010013475514881852, "loss": 0.3633, "step": 1856 }, { "epoch": 1.0305047143649473, "grad_norm": 0.3213959038257599, "learning_rate": 9.99550816049765e-05, "loss": 0.393, "step": 1858 }, { "epoch": 1.0316139767054908, "grad_norm": 0.2832462191581726, "learning_rate": 9.977540820614266e-05, "loss": 0.372, "step": 1860 }, { "epoch": 1.0327232390460344, "grad_norm": 0.2622850239276886, "learning_rate": 9.959573553234931e-05, "loss": 0.3433, "step": 1862 }, { "epoch": 1.033832501386578, "grad_norm": 0.2624775469303131, "learning_rate": 9.941606416362629e-05, "loss": 0.2845, "step": 1864 }, { "epoch": 1.0349417637271214, "grad_norm": 0.2570095956325531, "learning_rate": 9.92363946799993e-05, "loss": 0.3519, "step": 1866 }, { "epoch": 1.036051026067665, "grad_norm": 0.342472642660141, "learning_rate": 9.9056727661488e-05, "loss": 0.4085, "step": 1868 }, { "epoch": 1.0371602884082085, "grad_norm": 0.22180654108524323, "learning_rate": 9.887706368810403e-05, "loss": 0.3064, "step": 1870 }, { "epoch": 1.038269550748752, "grad_norm": 0.2561916410923004, "learning_rate": 9.869740333984917e-05, "loss": 0.469, "step": 1872 }, { "epoch": 1.0393788130892956, "grad_norm": 0.278629332780838, "learning_rate": 9.851774719671355e-05, "loss": 0.3506, "step": 1874 }, { "epoch": 1.0404880754298391, "grad_norm": 0.2417076975107193, "learning_rate": 9.833809583867374e-05, "loss": 0.3787, "step": 1876 }, { "epoch": 1.0415973377703827, "grad_norm": 0.3406512141227722, "learning_rate": 9.815844984569079e-05, "loss": 0.4375, "step": 1878 }, { "epoch": 1.0427066001109262, "grad_norm": 0.4049924612045288, "learning_rate": 9.797880979770845e-05, "loss": 0.3977, "step": 1880 }, { "epoch": 1.0438158624514697, "grad_norm": 0.2261233627796173, "learning_rate": 9.779917627465139e-05, "loss": 0.3227, "step": 1882 }, { "epoch": 1.0449251247920133, "grad_norm": 0.4143763780593872, "learning_rate": 9.761954985642308e-05, "loss": 0.4072, "step": 1884 }, { "epoch": 1.0460343871325568, "grad_norm": 0.42435726523399353, "learning_rate": 9.743993112290408e-05, "loss": 0.4488, "step": 1886 }, { "epoch": 1.0471436494731003, "grad_norm": 0.3034514784812927, "learning_rate": 9.726032065395022e-05, "loss": 0.4157, "step": 1888 }, { "epoch": 1.0482529118136439, "grad_norm": 0.34837406873703003, "learning_rate": 9.708071902939054e-05, "loss": 0.4582, "step": 1890 }, { "epoch": 1.0493621741541874, "grad_norm": 16.98454475402832, "learning_rate": 9.690112682902558e-05, "loss": 0.487, "step": 1892 }, { "epoch": 1.050471436494731, "grad_norm": 0.32965266704559326, "learning_rate": 9.672154463262545e-05, "loss": 0.4454, "step": 1894 }, { "epoch": 1.0515806988352745, "grad_norm": 0.3266843855381012, "learning_rate": 9.654197301992805e-05, "loss": 0.4016, "step": 1896 }, { "epoch": 1.052689961175818, "grad_norm": 0.28000080585479736, "learning_rate": 9.636241257063697e-05, "loss": 0.2961, "step": 1898 }, { "epoch": 1.0537992235163616, "grad_norm": 0.26942679286003113, "learning_rate": 9.618286386441981e-05, "loss": 0.4189, "step": 1900 }, { "epoch": 1.054908485856905, "grad_norm": 0.30495479702949524, "learning_rate": 9.600332748090633e-05, "loss": 0.3766, "step": 1902 }, { "epoch": 1.0560177481974486, "grad_norm": 0.41964420676231384, "learning_rate": 9.582380399968643e-05, "loss": 0.454, "step": 1904 }, { "epoch": 1.0571270105379922, "grad_norm": 0.28320369124412537, "learning_rate": 9.564429400030837e-05, "loss": 0.3732, "step": 1906 }, { "epoch": 1.0582362728785357, "grad_norm": 0.2792145907878876, "learning_rate": 9.546479806227691e-05, "loss": 0.356, "step": 1908 }, { "epoch": 1.0593455352190793, "grad_norm": 0.29180654883384705, "learning_rate": 9.52853167650514e-05, "loss": 0.436, "step": 1910 }, { "epoch": 1.0604547975596228, "grad_norm": 0.23264792561531067, "learning_rate": 9.510585068804394e-05, "loss": 0.3064, "step": 1912 }, { "epoch": 1.0615640599001663, "grad_norm": 0.3678199350833893, "learning_rate": 9.492640041061752e-05, "loss": 0.4138, "step": 1914 }, { "epoch": 1.0626733222407099, "grad_norm": 0.4804568886756897, "learning_rate": 9.474696651208406e-05, "loss": 0.4117, "step": 1916 }, { "epoch": 1.0637825845812534, "grad_norm": 0.30506354570388794, "learning_rate": 9.456754957170262e-05, "loss": 0.328, "step": 1918 }, { "epoch": 1.064891846921797, "grad_norm": 0.28367388248443604, "learning_rate": 9.438815016867757e-05, "loss": 0.3548, "step": 1920 }, { "epoch": 1.0660011092623405, "grad_norm": 0.2342025488615036, "learning_rate": 9.420876888215661e-05, "loss": 0.3227, "step": 1922 }, { "epoch": 1.067110371602884, "grad_norm": 0.3206954598426819, "learning_rate": 9.402940629122894e-05, "loss": 0.4336, "step": 1924 }, { "epoch": 1.0682196339434276, "grad_norm": 0.35998135805130005, "learning_rate": 9.38500629749235e-05, "loss": 0.4726, "step": 1926 }, { "epoch": 1.069328896283971, "grad_norm": 0.25793230533599854, "learning_rate": 9.367073951220693e-05, "loss": 0.3685, "step": 1928 }, { "epoch": 1.0704381586245146, "grad_norm": 0.27258947491645813, "learning_rate": 9.349143648198176e-05, "loss": 0.3911, "step": 1930 }, { "epoch": 1.0715474209650582, "grad_norm": 0.31704410910606384, "learning_rate": 9.33121544630846e-05, "loss": 0.4008, "step": 1932 }, { "epoch": 1.0726566833056017, "grad_norm": 0.28461799025535583, "learning_rate": 9.313289403428427e-05, "loss": 0.4084, "step": 1934 }, { "epoch": 1.0737659456461452, "grad_norm": 0.32281526923179626, "learning_rate": 9.295365577427976e-05, "loss": 0.3423, "step": 1936 }, { "epoch": 1.0748752079866888, "grad_norm": 0.2618202865123749, "learning_rate": 9.27744402616986e-05, "loss": 0.4544, "step": 1938 }, { "epoch": 1.0759844703272323, "grad_norm": 0.24186968803405762, "learning_rate": 9.259524807509491e-05, "loss": 0.2999, "step": 1940 }, { "epoch": 1.0770937326677759, "grad_norm": 0.3159604072570801, "learning_rate": 9.241607979294745e-05, "loss": 0.3966, "step": 1942 }, { "epoch": 1.0782029950083194, "grad_norm": 0.2324291318655014, "learning_rate": 9.223693599365777e-05, "loss": 0.3959, "step": 1944 }, { "epoch": 1.079312257348863, "grad_norm": 0.3115401566028595, "learning_rate": 9.205781725554849e-05, "loss": 0.4472, "step": 1946 }, { "epoch": 1.0804215196894065, "grad_norm": 0.23825664818286896, "learning_rate": 9.18787241568612e-05, "loss": 0.3472, "step": 1948 }, { "epoch": 1.08153078202995, "grad_norm": 0.22805361449718475, "learning_rate": 9.169965727575482e-05, "loss": 0.3072, "step": 1950 }, { "epoch": 1.0826400443704935, "grad_norm": 0.26474443078041077, "learning_rate": 9.152061719030364e-05, "loss": 0.394, "step": 1952 }, { "epoch": 1.083749306711037, "grad_norm": 0.3261571526527405, "learning_rate": 9.134160447849534e-05, "loss": 0.4263, "step": 1954 }, { "epoch": 1.0848585690515806, "grad_norm": 0.22368104755878448, "learning_rate": 9.116261971822932e-05, "loss": 0.2821, "step": 1956 }, { "epoch": 1.0859678313921242, "grad_norm": 0.2612870931625366, "learning_rate": 9.098366348731476e-05, "loss": 0.3488, "step": 1958 }, { "epoch": 1.0870770937326677, "grad_norm": 0.3046448230743408, "learning_rate": 9.080473636346869e-05, "loss": 0.3153, "step": 1960 }, { "epoch": 1.0881863560732112, "grad_norm": 0.2403235137462616, "learning_rate": 9.062583892431414e-05, "loss": 0.301, "step": 1962 }, { "epoch": 1.0892956184137548, "grad_norm": 0.28794658184051514, "learning_rate": 9.044697174737843e-05, "loss": 0.3576, "step": 1964 }, { "epoch": 1.0904048807542983, "grad_norm": 0.2567608654499054, "learning_rate": 9.026813541009104e-05, "loss": 0.3865, "step": 1966 }, { "epoch": 1.0915141430948418, "grad_norm": 0.2786438465118408, "learning_rate": 9.008933048978198e-05, "loss": 0.2924, "step": 1968 }, { "epoch": 1.0926234054353854, "grad_norm": 0.31907153129577637, "learning_rate": 8.991055756367988e-05, "loss": 0.3104, "step": 1970 }, { "epoch": 1.093732667775929, "grad_norm": 0.24627485871315002, "learning_rate": 8.973181720891e-05, "loss": 0.3708, "step": 1972 }, { "epoch": 1.0948419301164725, "grad_norm": 0.3072524070739746, "learning_rate": 8.955311000249244e-05, "loss": 0.3699, "step": 1974 }, { "epoch": 1.095951192457016, "grad_norm": 0.3048143684864044, "learning_rate": 8.937443652134037e-05, "loss": 0.5596, "step": 1976 }, { "epoch": 1.0970604547975595, "grad_norm": 0.24488745629787445, "learning_rate": 8.919579734225803e-05, "loss": 0.327, "step": 1978 }, { "epoch": 1.098169717138103, "grad_norm": 0.30899879336357117, "learning_rate": 8.901719304193894e-05, "loss": 0.3743, "step": 1980 }, { "epoch": 1.0992789794786466, "grad_norm": 0.3292604982852936, "learning_rate": 8.883862419696398e-05, "loss": 0.414, "step": 1982 }, { "epoch": 1.1003882418191901, "grad_norm": 0.339619517326355, "learning_rate": 8.866009138379967e-05, "loss": 0.3926, "step": 1984 }, { "epoch": 1.1014975041597337, "grad_norm": 0.23006640374660492, "learning_rate": 8.848159517879616e-05, "loss": 0.318, "step": 1986 }, { "epoch": 1.1026067665002772, "grad_norm": 0.24744057655334473, "learning_rate": 8.830313615818535e-05, "loss": 0.3816, "step": 1988 }, { "epoch": 1.1037160288408208, "grad_norm": 0.2574522793292999, "learning_rate": 8.812471489807921e-05, "loss": 0.3559, "step": 1990 }, { "epoch": 1.1048252911813643, "grad_norm": 0.27559810876846313, "learning_rate": 8.79463319744677e-05, "loss": 0.3162, "step": 1992 }, { "epoch": 1.1059345535219078, "grad_norm": 0.2662334144115448, "learning_rate": 8.776798796321715e-05, "loss": 0.3153, "step": 1994 }, { "epoch": 1.1070438158624514, "grad_norm": 0.5994970798492432, "learning_rate": 8.758968344006812e-05, "loss": 0.4635, "step": 1996 }, { "epoch": 1.108153078202995, "grad_norm": 0.24415504932403564, "learning_rate": 8.741141898063386e-05, "loss": 0.2697, "step": 1998 }, { "epoch": 1.1092623405435384, "grad_norm": 0.26829585433006287, "learning_rate": 8.723319516039813e-05, "loss": 0.3761, "step": 2000 }, { "epoch": 1.110371602884082, "grad_norm": 0.2905944585800171, "learning_rate": 8.705501255471357e-05, "loss": 0.295, "step": 2002 }, { "epoch": 1.1114808652246255, "grad_norm": 0.2600457966327667, "learning_rate": 8.687687173879981e-05, "loss": 0.3804, "step": 2004 }, { "epoch": 1.112590127565169, "grad_norm": 0.2356429547071457, "learning_rate": 8.669877328774146e-05, "loss": 0.3239, "step": 2006 }, { "epoch": 1.1136993899057126, "grad_norm": 0.3630295991897583, "learning_rate": 8.652071777648646e-05, "loss": 0.3918, "step": 2008 }, { "epoch": 1.1148086522462561, "grad_norm": 0.26642143726348877, "learning_rate": 8.634270577984411e-05, "loss": 0.2999, "step": 2010 }, { "epoch": 1.1159179145867997, "grad_norm": 0.18832941353321075, "learning_rate": 8.616473787248314e-05, "loss": 0.3064, "step": 2012 }, { "epoch": 1.1170271769273432, "grad_norm": 0.3464590013027191, "learning_rate": 8.598681462893012e-05, "loss": 0.4163, "step": 2014 }, { "epoch": 1.1181364392678868, "grad_norm": 0.2680515646934509, "learning_rate": 8.580893662356731e-05, "loss": 0.4601, "step": 2016 }, { "epoch": 1.1192457016084303, "grad_norm": 0.30981600284576416, "learning_rate": 8.563110443063098e-05, "loss": 0.3951, "step": 2018 }, { "epoch": 1.1203549639489738, "grad_norm": 0.26390036940574646, "learning_rate": 8.545331862420944e-05, "loss": 0.3546, "step": 2020 }, { "epoch": 1.1214642262895174, "grad_norm": 0.2888481616973877, "learning_rate": 8.527557977824137e-05, "loss": 0.317, "step": 2022 }, { "epoch": 1.122573488630061, "grad_norm": 0.2808952331542969, "learning_rate": 8.50978884665137e-05, "loss": 0.4164, "step": 2024 }, { "epoch": 1.1236827509706044, "grad_norm": 0.19364522397518158, "learning_rate": 8.492024526265999e-05, "loss": 0.3127, "step": 2026 }, { "epoch": 1.124792013311148, "grad_norm": 0.28427523374557495, "learning_rate": 8.474265074015857e-05, "loss": 0.3471, "step": 2028 }, { "epoch": 1.1259012756516915, "grad_norm": 0.37454891204833984, "learning_rate": 8.45651054723305e-05, "loss": 0.3614, "step": 2030 }, { "epoch": 1.127010537992235, "grad_norm": 0.33179330825805664, "learning_rate": 8.438761003233784e-05, "loss": 0.3646, "step": 2032 }, { "epoch": 1.1281198003327786, "grad_norm": 0.35652342438697815, "learning_rate": 8.421016499318185e-05, "loss": 0.4751, "step": 2034 }, { "epoch": 1.1292290626733221, "grad_norm": 0.26760151982307434, "learning_rate": 8.403277092770106e-05, "loss": 0.3846, "step": 2036 }, { "epoch": 1.1303383250138657, "grad_norm": 0.3007453978061676, "learning_rate": 8.385542840856939e-05, "loss": 0.3132, "step": 2038 }, { "epoch": 1.1314475873544092, "grad_norm": 0.24367092549800873, "learning_rate": 8.367813800829443e-05, "loss": 0.3935, "step": 2040 }, { "epoch": 1.1325568496949527, "grad_norm": 0.27487999200820923, "learning_rate": 8.35009002992155e-05, "loss": 0.3476, "step": 2042 }, { "epoch": 1.1336661120354963, "grad_norm": 0.29086869955062866, "learning_rate": 8.332371585350186e-05, "loss": 0.3752, "step": 2044 }, { "epoch": 1.1347753743760398, "grad_norm": 0.29050594568252563, "learning_rate": 8.314658524315069e-05, "loss": 0.3457, "step": 2046 }, { "epoch": 1.1358846367165834, "grad_norm": 0.36133047938346863, "learning_rate": 8.29695090399855e-05, "loss": 0.3433, "step": 2048 }, { "epoch": 1.1369938990571269, "grad_norm": 0.25438520312309265, "learning_rate": 8.279248781565407e-05, "loss": 0.3761, "step": 2050 }, { "epoch": 1.1381031613976704, "grad_norm": 0.28709906339645386, "learning_rate": 8.261552214162679e-05, "loss": 0.3383, "step": 2052 }, { "epoch": 1.139212423738214, "grad_norm": 0.36977148056030273, "learning_rate": 8.243861258919466e-05, "loss": 0.3919, "step": 2054 }, { "epoch": 1.1403216860787575, "grad_norm": 0.2531401216983795, "learning_rate": 8.226175972946746e-05, "loss": 0.349, "step": 2056 }, { "epoch": 1.1414309484193013, "grad_norm": 0.2995207607746124, "learning_rate": 8.20849641333721e-05, "loss": 0.4156, "step": 2058 }, { "epoch": 1.1425402107598448, "grad_norm": 0.27246829867362976, "learning_rate": 8.190822637165047e-05, "loss": 0.4034, "step": 2060 }, { "epoch": 1.1436494731003883, "grad_norm": 0.3243946135044098, "learning_rate": 8.173154701485787e-05, "loss": 0.3872, "step": 2062 }, { "epoch": 1.1447587354409319, "grad_norm": 0.4028877317905426, "learning_rate": 8.155492663336094e-05, "loss": 0.4174, "step": 2064 }, { "epoch": 1.1458679977814754, "grad_norm": 0.23149579763412476, "learning_rate": 8.137836579733606e-05, "loss": 0.3133, "step": 2066 }, { "epoch": 1.146977260122019, "grad_norm": 0.36684414744377136, "learning_rate": 8.120186507676724e-05, "loss": 0.4227, "step": 2068 }, { "epoch": 1.1480865224625625, "grad_norm": 0.3168555498123169, "learning_rate": 8.102542504144455e-05, "loss": 0.3852, "step": 2070 }, { "epoch": 1.149195784803106, "grad_norm": 0.27771732211112976, "learning_rate": 8.08490462609621e-05, "loss": 0.3955, "step": 2072 }, { "epoch": 1.1503050471436496, "grad_norm": 0.3797549605369568, "learning_rate": 8.06727293047163e-05, "loss": 0.3989, "step": 2074 }, { "epoch": 1.151414309484193, "grad_norm": 0.28603595495224, "learning_rate": 8.049647474190384e-05, "loss": 0.3518, "step": 2076 }, { "epoch": 1.1525235718247366, "grad_norm": 0.3149471879005432, "learning_rate": 8.032028314152013e-05, "loss": 0.4822, "step": 2078 }, { "epoch": 1.1536328341652802, "grad_norm": 0.307466059923172, "learning_rate": 8.014415507235728e-05, "loss": 0.4285, "step": 2080 }, { "epoch": 1.1547420965058237, "grad_norm": 0.27271440625190735, "learning_rate": 7.996809110300226e-05, "loss": 0.3527, "step": 2082 }, { "epoch": 1.1558513588463672, "grad_norm": 0.2561948299407959, "learning_rate": 7.979209180183515e-05, "loss": 0.3103, "step": 2084 }, { "epoch": 1.1569606211869108, "grad_norm": 0.3454659879207611, "learning_rate": 7.961615773702727e-05, "loss": 0.4885, "step": 2086 }, { "epoch": 1.1580698835274543, "grad_norm": 0.41174525022506714, "learning_rate": 7.944028947653936e-05, "loss": 0.4334, "step": 2088 }, { "epoch": 1.1591791458679979, "grad_norm": 0.346825510263443, "learning_rate": 7.926448758811964e-05, "loss": 0.5005, "step": 2090 }, { "epoch": 1.1602884082085414, "grad_norm": 0.2304445505142212, "learning_rate": 7.908875263930214e-05, "loss": 0.3255, "step": 2092 }, { "epoch": 1.161397670549085, "grad_norm": 0.32904666662216187, "learning_rate": 7.891308519740479e-05, "loss": 0.3873, "step": 2094 }, { "epoch": 1.1625069328896285, "grad_norm": 0.31631243228912354, "learning_rate": 7.873748582952753e-05, "loss": 0.3743, "step": 2096 }, { "epoch": 1.163616195230172, "grad_norm": 0.2747056484222412, "learning_rate": 7.856195510255059e-05, "loss": 0.3566, "step": 2098 }, { "epoch": 1.1647254575707155, "grad_norm": 0.25638052821159363, "learning_rate": 7.838649358313262e-05, "loss": 0.3426, "step": 2100 }, { "epoch": 1.165834719911259, "grad_norm": 0.2546238303184509, "learning_rate": 7.821110183770884e-05, "loss": 0.3575, "step": 2102 }, { "epoch": 1.1669439822518026, "grad_norm": 0.25636041164398193, "learning_rate": 7.803578043248918e-05, "loss": 0.3488, "step": 2104 }, { "epoch": 1.1680532445923462, "grad_norm": 0.4820869266986847, "learning_rate": 7.786052993345656e-05, "loss": 0.5524, "step": 2106 }, { "epoch": 1.1691625069328897, "grad_norm": 0.23651207983493805, "learning_rate": 7.76853509063649e-05, "loss": 0.3091, "step": 2108 }, { "epoch": 1.1702717692734332, "grad_norm": 0.3688579797744751, "learning_rate": 7.75102439167375e-05, "loss": 0.5254, "step": 2110 }, { "epoch": 1.1713810316139768, "grad_norm": 0.2631542384624481, "learning_rate": 7.733520952986506e-05, "loss": 0.3707, "step": 2112 }, { "epoch": 1.1724902939545203, "grad_norm": 0.2712019383907318, "learning_rate": 7.716024831080383e-05, "loss": 0.2887, "step": 2114 }, { "epoch": 1.1735995562950639, "grad_norm": 0.3026323616504669, "learning_rate": 7.6985360824374e-05, "loss": 0.3424, "step": 2116 }, { "epoch": 1.1747088186356074, "grad_norm": 0.3098190426826477, "learning_rate": 7.68105476351576e-05, "loss": 0.4582, "step": 2118 }, { "epoch": 1.175818080976151, "grad_norm": 0.2829030454158783, "learning_rate": 7.663580930749693e-05, "loss": 0.481, "step": 2120 }, { "epoch": 1.1769273433166945, "grad_norm": 0.35570159554481506, "learning_rate": 7.646114640549246e-05, "loss": 0.3777, "step": 2122 }, { "epoch": 1.178036605657238, "grad_norm": 0.30424460768699646, "learning_rate": 7.628655949300133e-05, "loss": 0.3827, "step": 2124 }, { "epoch": 1.1791458679977815, "grad_norm": 0.35098132491111755, "learning_rate": 7.611204913363524e-05, "loss": 0.3636, "step": 2126 }, { "epoch": 1.180255130338325, "grad_norm": 0.27268752455711365, "learning_rate": 7.593761589075879e-05, "loss": 0.2896, "step": 2128 }, { "epoch": 1.1813643926788686, "grad_norm": 0.31950294971466064, "learning_rate": 7.576326032748772e-05, "loss": 0.4369, "step": 2130 }, { "epoch": 1.1824736550194122, "grad_norm": 0.3007744252681732, "learning_rate": 7.558898300668691e-05, "loss": 0.3927, "step": 2132 }, { "epoch": 1.1835829173599557, "grad_norm": 0.3327229917049408, "learning_rate": 7.541478449096861e-05, "loss": 0.4977, "step": 2134 }, { "epoch": 1.1846921797004992, "grad_norm": 0.2749979794025421, "learning_rate": 7.524066534269079e-05, "loss": 0.3875, "step": 2136 }, { "epoch": 1.1858014420410428, "grad_norm": 0.325305312871933, "learning_rate": 7.506662612395514e-05, "loss": 0.4653, "step": 2138 }, { "epoch": 1.1869107043815863, "grad_norm": 0.2727961242198944, "learning_rate": 7.489266739660524e-05, "loss": 0.357, "step": 2140 }, { "epoch": 1.1880199667221298, "grad_norm": 0.2543491721153259, "learning_rate": 7.471878972222495e-05, "loss": 0.3108, "step": 2142 }, { "epoch": 1.1891292290626734, "grad_norm": 0.30424436926841736, "learning_rate": 7.454499366213642e-05, "loss": 0.4161, "step": 2144 }, { "epoch": 1.190238491403217, "grad_norm": 0.35166969895362854, "learning_rate": 7.437127977739835e-05, "loss": 0.3729, "step": 2146 }, { "epoch": 1.1913477537437605, "grad_norm": 0.3051074743270874, "learning_rate": 7.419764862880408e-05, "loss": 0.3194, "step": 2148 }, { "epoch": 1.192457016084304, "grad_norm": 0.42905622720718384, "learning_rate": 7.402410077687993e-05, "loss": 0.4673, "step": 2150 }, { "epoch": 1.1935662784248475, "grad_norm": 0.273519903421402, "learning_rate": 7.38506367818833e-05, "loss": 0.4068, "step": 2152 }, { "epoch": 1.194675540765391, "grad_norm": 0.35866236686706543, "learning_rate": 7.367725720380087e-05, "loss": 0.4065, "step": 2154 }, { "epoch": 1.1957848031059346, "grad_norm": 0.23525385558605194, "learning_rate": 7.350396260234681e-05, "loss": 0.4038, "step": 2156 }, { "epoch": 1.1968940654464781, "grad_norm": 0.2770351767539978, "learning_rate": 7.333075353696089e-05, "loss": 0.3598, "step": 2158 }, { "epoch": 1.1980033277870217, "grad_norm": 0.2863808870315552, "learning_rate": 7.315763056680694e-05, "loss": 0.4592, "step": 2160 }, { "epoch": 1.1991125901275652, "grad_norm": 0.2790552079677582, "learning_rate": 7.298459425077064e-05, "loss": 0.3431, "step": 2162 }, { "epoch": 1.2002218524681088, "grad_norm": 0.3501036763191223, "learning_rate": 7.281164514745806e-05, "loss": 0.4216, "step": 2164 }, { "epoch": 1.2013311148086523, "grad_norm": 0.35109075903892517, "learning_rate": 7.263878381519365e-05, "loss": 0.3759, "step": 2166 }, { "epoch": 1.2024403771491958, "grad_norm": 0.23262017965316772, "learning_rate": 7.246601081201851e-05, "loss": 0.3056, "step": 2168 }, { "epoch": 1.2035496394897394, "grad_norm": 0.2854996919631958, "learning_rate": 7.229332669568871e-05, "loss": 0.3539, "step": 2170 }, { "epoch": 1.204658901830283, "grad_norm": 0.31174349784851074, "learning_rate": 7.21207320236732e-05, "loss": 0.4587, "step": 2172 }, { "epoch": 1.2057681641708264, "grad_norm": 0.23841983079910278, "learning_rate": 7.194822735315229e-05, "loss": 0.428, "step": 2174 }, { "epoch": 1.20687742651137, "grad_norm": 0.2881643772125244, "learning_rate": 7.177581324101576e-05, "loss": 0.3366, "step": 2176 }, { "epoch": 1.2079866888519135, "grad_norm": 0.25862303376197815, "learning_rate": 7.160349024386095e-05, "loss": 0.3604, "step": 2178 }, { "epoch": 1.209095951192457, "grad_norm": 0.2757657468318939, "learning_rate": 7.143125891799112e-05, "loss": 0.3398, "step": 2180 }, { "epoch": 1.2102052135330006, "grad_norm": 0.30355626344680786, "learning_rate": 7.12591198194136e-05, "loss": 0.3497, "step": 2182 }, { "epoch": 1.2113144758735441, "grad_norm": 0.2825808525085449, "learning_rate": 7.108707350383792e-05, "loss": 0.3918, "step": 2184 }, { "epoch": 1.2124237382140877, "grad_norm": 0.22622592747211456, "learning_rate": 7.091512052667413e-05, "loss": 0.2614, "step": 2186 }, { "epoch": 1.2135330005546312, "grad_norm": 0.23999890685081482, "learning_rate": 7.074326144303101e-05, "loss": 0.339, "step": 2188 }, { "epoch": 1.2146422628951747, "grad_norm": 0.3575845956802368, "learning_rate": 7.057149680771413e-05, "loss": 0.3956, "step": 2190 }, { "epoch": 1.2157515252357183, "grad_norm": 0.2398807853460312, "learning_rate": 7.039982717522422e-05, "loss": 0.3376, "step": 2192 }, { "epoch": 1.2168607875762618, "grad_norm": 0.2710171043872833, "learning_rate": 7.02282530997553e-05, "loss": 0.3466, "step": 2194 }, { "epoch": 1.2179700499168054, "grad_norm": 0.3714485764503479, "learning_rate": 7.005677513519288e-05, "loss": 0.3518, "step": 2196 }, { "epoch": 1.219079312257349, "grad_norm": 0.3260788321495056, "learning_rate": 6.988539383511224e-05, "loss": 0.4465, "step": 2198 }, { "epoch": 1.2201885745978924, "grad_norm": 0.34494900703430176, "learning_rate": 6.971410975277655e-05, "loss": 0.4824, "step": 2200 }, { "epoch": 1.221297836938436, "grad_norm": 0.3417797386646271, "learning_rate": 6.954292344113522e-05, "loss": 0.4585, "step": 2202 }, { "epoch": 1.2224070992789795, "grad_norm": 0.3638070821762085, "learning_rate": 6.937183545282199e-05, "loss": 0.4599, "step": 2204 }, { "epoch": 1.223516361619523, "grad_norm": 0.30623239278793335, "learning_rate": 6.920084634015314e-05, "loss": 0.2944, "step": 2206 }, { "epoch": 1.2246256239600666, "grad_norm": 0.26289814710617065, "learning_rate": 6.902995665512581e-05, "loss": 0.2996, "step": 2208 }, { "epoch": 1.2257348863006101, "grad_norm": 0.30422243475914, "learning_rate": 6.885916694941612e-05, "loss": 0.3768, "step": 2210 }, { "epoch": 1.2268441486411537, "grad_norm": 0.3084448575973511, "learning_rate": 6.868847777437748e-05, "loss": 0.3898, "step": 2212 }, { "epoch": 1.2279534109816972, "grad_norm": 0.27117034792900085, "learning_rate": 6.851788968103876e-05, "loss": 0.32, "step": 2214 }, { "epoch": 1.2290626733222407, "grad_norm": 0.31548282504081726, "learning_rate": 6.834740322010241e-05, "loss": 0.4279, "step": 2216 }, { "epoch": 1.2301719356627843, "grad_norm": 0.2990844249725342, "learning_rate": 6.817701894194294e-05, "loss": 0.3003, "step": 2218 }, { "epoch": 1.2312811980033278, "grad_norm": 0.31102094054222107, "learning_rate": 6.800673739660488e-05, "loss": 0.526, "step": 2220 }, { "epoch": 1.2323904603438713, "grad_norm": 0.3252849876880646, "learning_rate": 6.783655913380115e-05, "loss": 0.3848, "step": 2222 }, { "epoch": 1.2334997226844149, "grad_norm": 0.296875536441803, "learning_rate": 6.766648470291124e-05, "loss": 0.3791, "step": 2224 }, { "epoch": 1.2346089850249584, "grad_norm": 0.251607209444046, "learning_rate": 6.749651465297943e-05, "loss": 0.4348, "step": 2226 }, { "epoch": 1.235718247365502, "grad_norm": 0.3017038106918335, "learning_rate": 6.732664953271306e-05, "loss": 0.4041, "step": 2228 }, { "epoch": 1.2368275097060455, "grad_norm": 0.4208067059516907, "learning_rate": 6.715688989048066e-05, "loss": 0.4914, "step": 2230 }, { "epoch": 1.237936772046589, "grad_norm": 0.2521570324897766, "learning_rate": 6.698723627431038e-05, "loss": 0.2871, "step": 2232 }, { "epoch": 1.2390460343871326, "grad_norm": 0.2669265866279602, "learning_rate": 6.681768923188799e-05, "loss": 0.3533, "step": 2234 }, { "epoch": 1.240155296727676, "grad_norm": 0.27364587783813477, "learning_rate": 6.664824931055522e-05, "loss": 0.3058, "step": 2236 }, { "epoch": 1.2412645590682196, "grad_norm": 0.24516661465168, "learning_rate": 6.647891705730802e-05, "loss": 0.3059, "step": 2238 }, { "epoch": 1.2423738214087632, "grad_norm": 0.346921443939209, "learning_rate": 6.630969301879474e-05, "loss": 0.4405, "step": 2240 }, { "epoch": 1.2434830837493067, "grad_norm": 0.36792489886283875, "learning_rate": 6.614057774131437e-05, "loss": 0.5472, "step": 2242 }, { "epoch": 1.2445923460898503, "grad_norm": 0.3431490361690521, "learning_rate": 6.597157177081477e-05, "loss": 0.454, "step": 2244 }, { "epoch": 1.2457016084303938, "grad_norm": 0.3037920594215393, "learning_rate": 6.580267565289106e-05, "loss": 0.4218, "step": 2246 }, { "epoch": 1.2468108707709373, "grad_norm": 0.30732932686805725, "learning_rate": 6.56338899327836e-05, "loss": 0.3228, "step": 2248 }, { "epoch": 1.2479201331114809, "grad_norm": 0.28394436836242676, "learning_rate": 6.546521515537636e-05, "loss": 0.3824, "step": 2250 }, { "epoch": 1.2490293954520244, "grad_norm": 0.3070717751979828, "learning_rate": 6.52966518651952e-05, "loss": 0.4145, "step": 2252 }, { "epoch": 1.250138657792568, "grad_norm": 0.2168722301721573, "learning_rate": 6.512820060640607e-05, "loss": 0.3676, "step": 2254 }, { "epoch": 1.2512479201331115, "grad_norm": 0.3195607364177704, "learning_rate": 6.495986192281325e-05, "loss": 0.3998, "step": 2256 }, { "epoch": 1.252357182473655, "grad_norm": 0.23793111741542816, "learning_rate": 6.479163635785759e-05, "loss": 0.2888, "step": 2258 }, { "epoch": 1.2534664448141986, "grad_norm": 0.32127097249031067, "learning_rate": 6.462352445461469e-05, "loss": 0.3023, "step": 2260 }, { "epoch": 1.254575707154742, "grad_norm": 0.3184230625629425, "learning_rate": 6.445552675579341e-05, "loss": 0.3412, "step": 2262 }, { "epoch": 1.2556849694952856, "grad_norm": 0.3016446828842163, "learning_rate": 6.428764380373376e-05, "loss": 0.3669, "step": 2264 }, { "epoch": 1.2567942318358292, "grad_norm": 0.39701253175735474, "learning_rate": 6.41198761404054e-05, "loss": 0.515, "step": 2266 }, { "epoch": 1.2579034941763727, "grad_norm": 0.23390258848667145, "learning_rate": 6.395222430740573e-05, "loss": 0.3064, "step": 2268 }, { "epoch": 1.2590127565169162, "grad_norm": 0.3082529604434967, "learning_rate": 6.37846888459583e-05, "loss": 0.4432, "step": 2270 }, { "epoch": 1.2601220188574598, "grad_norm": 0.34247317910194397, "learning_rate": 6.361727029691096e-05, "loss": 0.4047, "step": 2272 }, { "epoch": 1.2612312811980033, "grad_norm": 0.37517714500427246, "learning_rate": 6.34499692007341e-05, "loss": 0.466, "step": 2274 }, { "epoch": 1.2623405435385469, "grad_norm": 0.2715705931186676, "learning_rate": 6.328278609751898e-05, "loss": 0.4771, "step": 2276 }, { "epoch": 1.2634498058790904, "grad_norm": 0.2877407670021057, "learning_rate": 6.311572152697598e-05, "loss": 0.4258, "step": 2278 }, { "epoch": 1.264559068219634, "grad_norm": 0.3554646074771881, "learning_rate": 6.294877602843275e-05, "loss": 0.5173, "step": 2280 }, { "epoch": 1.2656683305601775, "grad_norm": 0.27891042828559875, "learning_rate": 6.278195014083257e-05, "loss": 0.3525, "step": 2282 }, { "epoch": 1.266777592900721, "grad_norm": 0.27851444482803345, "learning_rate": 6.261524440273263e-05, "loss": 0.3509, "step": 2284 }, { "epoch": 1.2678868552412645, "grad_norm": 0.23373687267303467, "learning_rate": 6.244865935230215e-05, "loss": 0.3873, "step": 2286 }, { "epoch": 1.268996117581808, "grad_norm": 0.32285311818122864, "learning_rate": 6.228219552732083e-05, "loss": 0.4167, "step": 2288 }, { "epoch": 1.2701053799223516, "grad_norm": 0.30920663475990295, "learning_rate": 6.211585346517701e-05, "loss": 0.3945, "step": 2290 }, { "epoch": 1.2712146422628952, "grad_norm": 0.3399757444858551, "learning_rate": 6.194963370286595e-05, "loss": 0.481, "step": 2292 }, { "epoch": 1.2723239046034387, "grad_norm": 0.3406103849411011, "learning_rate": 6.178353677698801e-05, "loss": 0.3728, "step": 2294 }, { "epoch": 1.2734331669439822, "grad_norm": 0.28863847255706787, "learning_rate": 6.16175632237471e-05, "loss": 0.3582, "step": 2296 }, { "epoch": 1.2745424292845258, "grad_norm": 0.2774694859981537, "learning_rate": 6.145171357894885e-05, "loss": 0.2823, "step": 2298 }, { "epoch": 1.2756516916250693, "grad_norm": 0.312102735042572, "learning_rate": 6.12859883779988e-05, "loss": 0.3745, "step": 2300 }, { "epoch": 1.2767609539656128, "grad_norm": 0.2844535708427429, "learning_rate": 6.11203881559008e-05, "loss": 0.3743, "step": 2302 }, { "epoch": 1.2778702163061564, "grad_norm": 0.348458856344223, "learning_rate": 6.095491344725527e-05, "loss": 0.3885, "step": 2304 }, { "epoch": 1.2789794786467, "grad_norm": 0.2553083300590515, "learning_rate": 6.0789564786257425e-05, "loss": 0.3227, "step": 2306 }, { "epoch": 1.2800887409872435, "grad_norm": 0.32947540283203125, "learning_rate": 6.06243427066955e-05, "loss": 0.3981, "step": 2308 }, { "epoch": 1.281198003327787, "grad_norm": 0.3036400377750397, "learning_rate": 6.0459247741949166e-05, "loss": 0.4151, "step": 2310 }, { "epoch": 1.2823072656683305, "grad_norm": 0.3431527316570282, "learning_rate": 6.0294280424987724e-05, "loss": 0.363, "step": 2312 }, { "epoch": 1.283416528008874, "grad_norm": 0.3487553000450134, "learning_rate": 6.012944128836835e-05, "loss": 0.3893, "step": 2314 }, { "epoch": 1.2845257903494176, "grad_norm": 0.35408324003219604, "learning_rate": 5.99647308642345e-05, "loss": 0.5589, "step": 2316 }, { "epoch": 1.2856350526899611, "grad_norm": 0.1972445398569107, "learning_rate": 5.980014968431396e-05, "loss": 0.3754, "step": 2318 }, { "epoch": 1.2867443150305047, "grad_norm": 0.3481127619743347, "learning_rate": 5.963569827991752e-05, "loss": 0.4485, "step": 2320 }, { "epoch": 1.2878535773710482, "grad_norm": 0.2831405699253082, "learning_rate": 5.947137718193681e-05, "loss": 0.4482, "step": 2322 }, { "epoch": 1.2889628397115918, "grad_norm": 0.3052551746368408, "learning_rate": 5.930718692084289e-05, "loss": 0.4148, "step": 2324 }, { "epoch": 1.2900721020521353, "grad_norm": 0.2522166967391968, "learning_rate": 5.914312802668445e-05, "loss": 0.3882, "step": 2326 }, { "epoch": 1.2911813643926788, "grad_norm": 0.28503984212875366, "learning_rate": 5.897920102908603e-05, "loss": 0.3439, "step": 2328 }, { "epoch": 1.2922906267332224, "grad_norm": 0.36347198486328125, "learning_rate": 5.881540645724646e-05, "loss": 0.315, "step": 2330 }, { "epoch": 1.293399889073766, "grad_norm": 0.300930380821228, "learning_rate": 5.865174483993696e-05, "loss": 0.4299, "step": 2332 }, { "epoch": 1.2945091514143094, "grad_norm": 0.37335050106048584, "learning_rate": 5.8488216705499675e-05, "loss": 0.438, "step": 2334 }, { "epoch": 1.295618413754853, "grad_norm": 0.2799355983734131, "learning_rate": 5.832482258184575e-05, "loss": 0.3494, "step": 2336 }, { "epoch": 1.2967276760953965, "grad_norm": 0.33066707849502563, "learning_rate": 5.816156299645364e-05, "loss": 0.351, "step": 2338 }, { "epoch": 1.29783693843594, "grad_norm": 0.3257594704627991, "learning_rate": 5.799843847636766e-05, "loss": 0.3275, "step": 2340 }, { "epoch": 1.2989462007764836, "grad_norm": 0.31791582703590393, "learning_rate": 5.783544954819592e-05, "loss": 0.32, "step": 2342 }, { "epoch": 1.3000554631170271, "grad_norm": 0.18980185687541962, "learning_rate": 5.76725967381089e-05, "loss": 0.3309, "step": 2344 }, { "epoch": 1.3011647254575707, "grad_norm": 0.243652805685997, "learning_rate": 5.750988057183755e-05, "loss": 0.4156, "step": 2346 }, { "epoch": 1.3022739877981142, "grad_norm": 0.2872309386730194, "learning_rate": 5.7347301574671944e-05, "loss": 0.4077, "step": 2348 }, { "epoch": 1.3033832501386577, "grad_norm": 0.24327610433101654, "learning_rate": 5.718486027145906e-05, "loss": 0.348, "step": 2350 }, { "epoch": 1.3044925124792013, "grad_norm": 0.3590594530105591, "learning_rate": 5.702255718660149e-05, "loss": 0.4491, "step": 2352 }, { "epoch": 1.3056017748197448, "grad_norm": 0.27354371547698975, "learning_rate": 5.686039284405564e-05, "loss": 0.3492, "step": 2354 }, { "epoch": 1.3067110371602884, "grad_norm": 0.2964899241924286, "learning_rate": 5.6698367767329995e-05, "loss": 0.4393, "step": 2356 }, { "epoch": 1.307820299500832, "grad_norm": 0.2882802188396454, "learning_rate": 5.6536482479483424e-05, "loss": 0.3667, "step": 2358 }, { "epoch": 1.3089295618413754, "grad_norm": 0.288278728723526, "learning_rate": 5.63747375031235e-05, "loss": 0.3295, "step": 2360 }, { "epoch": 1.310038824181919, "grad_norm": 0.30691561102867126, "learning_rate": 5.6213133360404946e-05, "loss": 0.3473, "step": 2362 }, { "epoch": 1.3111480865224625, "grad_norm": 0.3028589189052582, "learning_rate": 5.605167057302778e-05, "loss": 0.3748, "step": 2364 }, { "epoch": 1.312257348863006, "grad_norm": 0.24918228387832642, "learning_rate": 5.589034966223568e-05, "loss": 0.4768, "step": 2366 }, { "epoch": 1.3133666112035496, "grad_norm": 0.24106858670711517, "learning_rate": 5.572917114881422e-05, "loss": 0.3659, "step": 2368 }, { "epoch": 1.3144758735440931, "grad_norm": 0.2618839740753174, "learning_rate": 5.5568135553089485e-05, "loss": 0.3333, "step": 2370 }, { "epoch": 1.3155851358846367, "grad_norm": 0.245490163564682, "learning_rate": 5.540724339492602e-05, "loss": 0.2983, "step": 2372 }, { "epoch": 1.3166943982251802, "grad_norm": 0.3315598666667938, "learning_rate": 5.5246495193725326e-05, "loss": 0.4043, "step": 2374 }, { "epoch": 1.3178036605657237, "grad_norm": 0.27348592877388, "learning_rate": 5.5085891468424245e-05, "loss": 0.4301, "step": 2376 }, { "epoch": 1.3189129229062673, "grad_norm": 0.2549262046813965, "learning_rate": 5.492543273749322e-05, "loss": 0.3471, "step": 2378 }, { "epoch": 1.3200221852468108, "grad_norm": 0.2704502046108246, "learning_rate": 5.476511951893454e-05, "loss": 0.3546, "step": 2380 }, { "epoch": 1.3211314475873543, "grad_norm": 0.3413357138633728, "learning_rate": 5.460495233028074e-05, "loss": 0.4929, "step": 2382 }, { "epoch": 1.3222407099278979, "grad_norm": 0.4276898205280304, "learning_rate": 5.444493168859304e-05, "loss": 0.4312, "step": 2384 }, { "epoch": 1.3233499722684414, "grad_norm": 0.36291375756263733, "learning_rate": 5.428505811045948e-05, "loss": 0.5678, "step": 2386 }, { "epoch": 1.324459234608985, "grad_norm": 0.18385890126228333, "learning_rate": 5.412533211199329e-05, "loss": 0.2451, "step": 2388 }, { "epoch": 1.3255684969495285, "grad_norm": 0.372714102268219, "learning_rate": 5.396575420883141e-05, "loss": 0.3532, "step": 2390 }, { "epoch": 1.326677759290072, "grad_norm": 0.4298381805419922, "learning_rate": 5.380632491613265e-05, "loss": 0.3826, "step": 2392 }, { "epoch": 1.3277870216306156, "grad_norm": 0.27926844358444214, "learning_rate": 5.3647044748576e-05, "loss": 0.3808, "step": 2394 }, { "epoch": 1.328896283971159, "grad_norm": 0.528834342956543, "learning_rate": 5.3487914220359035e-05, "loss": 0.3296, "step": 2396 }, { "epoch": 1.3300055463117026, "grad_norm": 0.32054948806762695, "learning_rate": 5.332893384519639e-05, "loss": 0.4404, "step": 2398 }, { "epoch": 1.3311148086522462, "grad_norm": 0.29870322346687317, "learning_rate": 5.317010413631782e-05, "loss": 0.3545, "step": 2400 }, { "epoch": 1.3322240709927897, "grad_norm": 2.708609104156494, "learning_rate": 5.301142560646677e-05, "loss": 0.3227, "step": 2402 }, { "epoch": 1.3333333333333333, "grad_norm": 0.2612524628639221, "learning_rate": 5.285289876789849e-05, "loss": 0.3167, "step": 2404 }, { "epoch": 1.3344425956738768, "grad_norm": 0.2819161117076874, "learning_rate": 5.269452413237885e-05, "loss": 0.3429, "step": 2406 }, { "epoch": 1.3355518580144203, "grad_norm": 0.3466659486293793, "learning_rate": 5.25363022111821e-05, "loss": 0.3428, "step": 2408 }, { "epoch": 1.3366611203549639, "grad_norm": 0.25839924812316895, "learning_rate": 5.237823351508953e-05, "loss": 0.2604, "step": 2410 }, { "epoch": 1.3377703826955074, "grad_norm": 0.30123212933540344, "learning_rate": 5.22203185543878e-05, "loss": 0.3768, "step": 2412 }, { "epoch": 1.338879645036051, "grad_norm": 0.30337926745414734, "learning_rate": 5.2062557838867354e-05, "loss": 0.3955, "step": 2414 }, { "epoch": 1.3399889073765945, "grad_norm": 0.29621419310569763, "learning_rate": 5.190495187782059e-05, "loss": 0.3334, "step": 2416 }, { "epoch": 1.341098169717138, "grad_norm": 0.3272341191768646, "learning_rate": 5.174750118004029e-05, "loss": 0.4122, "step": 2418 }, { "epoch": 1.3422074320576816, "grad_norm": 0.2955860197544098, "learning_rate": 5.159020625381814e-05, "loss": 0.3571, "step": 2420 }, { "epoch": 1.343316694398225, "grad_norm": 0.3072221875190735, "learning_rate": 5.1433067606942905e-05, "loss": 0.3084, "step": 2422 }, { "epoch": 1.3444259567387689, "grad_norm": 0.3217550814151764, "learning_rate": 5.12760857466988e-05, "loss": 0.4189, "step": 2424 }, { "epoch": 1.3455352190793124, "grad_norm": 0.2994224727153778, "learning_rate": 5.111926117986383e-05, "loss": 0.3359, "step": 2426 }, { "epoch": 1.346644481419856, "grad_norm": 0.35402461886405945, "learning_rate": 5.096259441270842e-05, "loss": 0.4055, "step": 2428 }, { "epoch": 1.3477537437603995, "grad_norm": 0.32511135935783386, "learning_rate": 5.080608595099339e-05, "loss": 0.4424, "step": 2430 }, { "epoch": 1.348863006100943, "grad_norm": 0.30743321776390076, "learning_rate": 5.064973629996853e-05, "loss": 0.3594, "step": 2432 }, { "epoch": 1.3499722684414865, "grad_norm": 0.2623654007911682, "learning_rate": 5.0493545964371036e-05, "loss": 0.3297, "step": 2434 }, { "epoch": 1.35108153078203, "grad_norm": 0.9932175874710083, "learning_rate": 5.03375154484238e-05, "loss": 0.4032, "step": 2436 }, { "epoch": 1.3521907931225736, "grad_norm": 0.2910614013671875, "learning_rate": 5.018164525583367e-05, "loss": 0.3441, "step": 2438 }, { "epoch": 1.3533000554631172, "grad_norm": 0.33267906308174133, "learning_rate": 5.0025935889789924e-05, "loss": 0.4265, "step": 2440 }, { "epoch": 1.3544093178036607, "grad_norm": 0.3080008327960968, "learning_rate": 4.987038785296281e-05, "loss": 0.4539, "step": 2442 }, { "epoch": 1.3555185801442042, "grad_norm": 0.3214891850948334, "learning_rate": 4.9715001647501614e-05, "loss": 0.3456, "step": 2444 }, { "epoch": 1.3566278424847478, "grad_norm": 0.3001169264316559, "learning_rate": 4.955977777503319e-05, "loss": 0.3448, "step": 2446 }, { "epoch": 1.3577371048252913, "grad_norm": 0.2781212031841278, "learning_rate": 4.940471673666043e-05, "loss": 0.404, "step": 2448 }, { "epoch": 1.3588463671658348, "grad_norm": 0.3622058928012848, "learning_rate": 4.9249819032960555e-05, "loss": 0.4242, "step": 2450 }, { "epoch": 1.3599556295063784, "grad_norm": 0.29672783613204956, "learning_rate": 4.909508516398339e-05, "loss": 0.4158, "step": 2452 }, { "epoch": 1.361064891846922, "grad_norm": 0.2889052927494049, "learning_rate": 4.8940515629249905e-05, "loss": 0.3568, "step": 2454 }, { "epoch": 1.3621741541874655, "grad_norm": 0.275849312543869, "learning_rate": 4.878611092775065e-05, "loss": 0.4308, "step": 2456 }, { "epoch": 1.363283416528009, "grad_norm": 0.32597821950912476, "learning_rate": 4.863187155794393e-05, "loss": 0.4067, "step": 2458 }, { "epoch": 1.3643926788685525, "grad_norm": 0.3108210861682892, "learning_rate": 4.847779801775436e-05, "loss": 0.4421, "step": 2460 }, { "epoch": 1.365501941209096, "grad_norm": 0.32195425033569336, "learning_rate": 4.832389080457118e-05, "loss": 0.4182, "step": 2462 }, { "epoch": 1.3666112035496396, "grad_norm": 0.2620703876018524, "learning_rate": 4.817015041524676e-05, "loss": 0.3928, "step": 2464 }, { "epoch": 1.3677204658901831, "grad_norm": 0.24564792215824127, "learning_rate": 4.801657734609492e-05, "loss": 0.2702, "step": 2466 }, { "epoch": 1.3688297282307267, "grad_norm": 0.2738599479198456, "learning_rate": 4.786317209288923e-05, "loss": 0.3073, "step": 2468 }, { "epoch": 1.3699389905712702, "grad_norm": 0.4185931086540222, "learning_rate": 4.7709935150861526e-05, "loss": 0.4577, "step": 2470 }, { "epoch": 1.3710482529118138, "grad_norm": 0.29279306530952454, "learning_rate": 4.7556867014700435e-05, "loss": 0.3537, "step": 2472 }, { "epoch": 1.3721575152523573, "grad_norm": 0.31645312905311584, "learning_rate": 4.740396817854945e-05, "loss": 0.3374, "step": 2474 }, { "epoch": 1.3732667775929008, "grad_norm": 0.27415481209754944, "learning_rate": 4.7251239136005586e-05, "loss": 0.3386, "step": 2476 }, { "epoch": 1.3743760399334444, "grad_norm": 0.3509266674518585, "learning_rate": 4.709868038011777e-05, "loss": 0.411, "step": 2478 }, { "epoch": 1.375485302273988, "grad_norm": 0.31094875931739807, "learning_rate": 4.694629240338517e-05, "loss": 0.3523, "step": 2480 }, { "epoch": 1.3765945646145314, "grad_norm": 0.37327978014945984, "learning_rate": 4.6794075697755626e-05, "loss": 0.3622, "step": 2482 }, { "epoch": 1.377703826955075, "grad_norm": 0.2905229330062866, "learning_rate": 4.6642030754624e-05, "loss": 0.3576, "step": 2484 }, { "epoch": 1.3788130892956185, "grad_norm": 0.2602270245552063, "learning_rate": 4.6490158064830834e-05, "loss": 0.3093, "step": 2486 }, { "epoch": 1.379922351636162, "grad_norm": 0.23383758962154388, "learning_rate": 4.6338458118660434e-05, "loss": 0.4123, "step": 2488 }, { "epoch": 1.3810316139767056, "grad_norm": 0.2982844114303589, "learning_rate": 4.618693140583946e-05, "loss": 0.3948, "step": 2490 }, { "epoch": 1.3821408763172491, "grad_norm": 0.2675088047981262, "learning_rate": 4.603557841553542e-05, "loss": 0.4248, "step": 2492 }, { "epoch": 1.3832501386577927, "grad_norm": 0.28989455103874207, "learning_rate": 4.588439963635498e-05, "loss": 0.3647, "step": 2494 }, { "epoch": 1.3843594009983362, "grad_norm": 0.2427791953086853, "learning_rate": 4.573339555634235e-05, "loss": 0.2877, "step": 2496 }, { "epoch": 1.3854686633388797, "grad_norm": 0.36453622579574585, "learning_rate": 4.558256666297773e-05, "loss": 0.4094, "step": 2498 }, { "epoch": 1.3865779256794233, "grad_norm": 0.35472291707992554, "learning_rate": 4.543191344317594e-05, "loss": 0.4244, "step": 2500 }, { "epoch": 1.3876871880199668, "grad_norm": 0.33136647939682007, "learning_rate": 4.5281436383284525e-05, "loss": 0.3849, "step": 2502 }, { "epoch": 1.3887964503605104, "grad_norm": 0.33231866359710693, "learning_rate": 4.513113596908235e-05, "loss": 0.3873, "step": 2504 }, { "epoch": 1.389905712701054, "grad_norm": 0.22978711128234863, "learning_rate": 4.49810126857781e-05, "loss": 0.2924, "step": 2506 }, { "epoch": 1.3910149750415974, "grad_norm": 0.2566686272621155, "learning_rate": 4.483106701800864e-05, "loss": 0.2569, "step": 2508 }, { "epoch": 1.392124237382141, "grad_norm": 0.29530206322669983, "learning_rate": 4.468129944983738e-05, "loss": 0.4219, "step": 2510 }, { "epoch": 1.3932334997226845, "grad_norm": 0.3155916929244995, "learning_rate": 4.453171046475274e-05, "loss": 0.4482, "step": 2512 }, { "epoch": 1.394342762063228, "grad_norm": 0.36319971084594727, "learning_rate": 4.438230054566678e-05, "loss": 0.4071, "step": 2514 }, { "epoch": 1.3954520244037716, "grad_norm": 0.3203721344470978, "learning_rate": 4.423307017491336e-05, "loss": 0.3511, "step": 2516 }, { "epoch": 1.3965612867443151, "grad_norm": 0.297139048576355, "learning_rate": 4.4084019834246746e-05, "loss": 0.3582, "step": 2518 }, { "epoch": 1.3976705490848587, "grad_norm": 0.5080666542053223, "learning_rate": 4.3935150004839996e-05, "loss": 0.5732, "step": 2520 }, { "epoch": 1.3987798114254022, "grad_norm": 0.3271535336971283, "learning_rate": 4.3786461167283496e-05, "loss": 0.3288, "step": 2522 }, { "epoch": 1.3998890737659457, "grad_norm": 0.3107675015926361, "learning_rate": 4.3637953801583344e-05, "loss": 0.3273, "step": 2524 }, { "epoch": 1.4009983361064893, "grad_norm": 0.25614145398139954, "learning_rate": 4.3489628387159706e-05, "loss": 0.3979, "step": 2526 }, { "epoch": 1.4021075984470328, "grad_norm": 0.3792392313480377, "learning_rate": 4.334148540284542e-05, "loss": 0.3923, "step": 2528 }, { "epoch": 1.4032168607875763, "grad_norm": 0.440168172121048, "learning_rate": 4.3193525326884435e-05, "loss": 0.395, "step": 2530 }, { "epoch": 1.4043261231281199, "grad_norm": 0.3226903975009918, "learning_rate": 4.304574863693015e-05, "loss": 0.4261, "step": 2532 }, { "epoch": 1.4054353854686634, "grad_norm": 0.2869783937931061, "learning_rate": 4.289815581004396e-05, "loss": 0.3517, "step": 2534 }, { "epoch": 1.406544647809207, "grad_norm": 0.2879214584827423, "learning_rate": 4.275074732269373e-05, "loss": 0.4079, "step": 2536 }, { "epoch": 1.4076539101497505, "grad_norm": 0.38455456495285034, "learning_rate": 4.260352365075226e-05, "loss": 0.3773, "step": 2538 }, { "epoch": 1.408763172490294, "grad_norm": 0.33934473991394043, "learning_rate": 4.245648526949567e-05, "loss": 0.326, "step": 2540 }, { "epoch": 1.4098724348308376, "grad_norm": 0.35121288895606995, "learning_rate": 4.230963265360185e-05, "loss": 0.3952, "step": 2542 }, { "epoch": 1.410981697171381, "grad_norm": 0.3085687756538391, "learning_rate": 4.216296627714915e-05, "loss": 0.3655, "step": 2544 }, { "epoch": 1.4120909595119246, "grad_norm": 0.320854127407074, "learning_rate": 4.201648661361457e-05, "loss": 0.426, "step": 2546 }, { "epoch": 1.4132002218524682, "grad_norm": 0.24805665016174316, "learning_rate": 4.187019413587234e-05, "loss": 0.2548, "step": 2548 }, { "epoch": 1.4143094841930117, "grad_norm": 0.3249068260192871, "learning_rate": 4.172408931619249e-05, "loss": 0.3629, "step": 2550 }, { "epoch": 1.4154187465335553, "grad_norm": 0.3463628888130188, "learning_rate": 4.1578172626239245e-05, "loss": 0.4273, "step": 2552 }, { "epoch": 1.4165280088740988, "grad_norm": 0.3546282649040222, "learning_rate": 4.143244453706941e-05, "loss": 0.4464, "step": 2554 }, { "epoch": 1.4176372712146423, "grad_norm": 0.2811850607395172, "learning_rate": 4.1286905519130955e-05, "loss": 0.3781, "step": 2556 }, { "epoch": 1.4187465335551859, "grad_norm": 0.2609008252620697, "learning_rate": 4.114155604226159e-05, "loss": 0.3267, "step": 2558 }, { "epoch": 1.4198557958957294, "grad_norm": 0.35078784823417664, "learning_rate": 4.0996396575687e-05, "loss": 0.4347, "step": 2560 }, { "epoch": 1.420965058236273, "grad_norm": 0.26544249057769775, "learning_rate": 4.085142758801953e-05, "loss": 0.341, "step": 2562 }, { "epoch": 1.4220743205768165, "grad_norm": 0.25733861327171326, "learning_rate": 4.070664954725657e-05, "loss": 0.3998, "step": 2564 }, { "epoch": 1.42318358291736, "grad_norm": 0.21705186367034912, "learning_rate": 4.056206292077915e-05, "loss": 0.3024, "step": 2566 }, { "epoch": 1.4242928452579036, "grad_norm": 0.2453695386648178, "learning_rate": 4.0417668175350365e-05, "loss": 0.3176, "step": 2568 }, { "epoch": 1.425402107598447, "grad_norm": 0.31507784128189087, "learning_rate": 4.0273465777113804e-05, "loss": 0.4491, "step": 2570 }, { "epoch": 1.4265113699389906, "grad_norm": 0.2840031087398529, "learning_rate": 4.0129456191592106e-05, "loss": 0.3364, "step": 2572 }, { "epoch": 1.4276206322795342, "grad_norm": 0.2707609236240387, "learning_rate": 3.9985639883685566e-05, "loss": 0.3686, "step": 2574 }, { "epoch": 1.4287298946200777, "grad_norm": 0.30946969985961914, "learning_rate": 3.984201731767042e-05, "loss": 0.3125, "step": 2576 }, { "epoch": 1.4298391569606212, "grad_norm": 0.3130171597003937, "learning_rate": 3.9698588957197456e-05, "loss": 0.3413, "step": 2578 }, { "epoch": 1.4309484193011648, "grad_norm": 0.3146243989467621, "learning_rate": 3.9555355265290605e-05, "loss": 0.3603, "step": 2580 }, { "epoch": 1.4320576816417083, "grad_norm": 0.3555578887462616, "learning_rate": 3.9412316704345307e-05, "loss": 0.4793, "step": 2582 }, { "epoch": 1.4331669439822519, "grad_norm": 0.26377207040786743, "learning_rate": 3.9269473736127075e-05, "loss": 0.421, "step": 2584 }, { "epoch": 1.4342762063227954, "grad_norm": 0.3223215937614441, "learning_rate": 3.9126826821769916e-05, "loss": 0.4443, "step": 2586 }, { "epoch": 1.435385468663339, "grad_norm": 0.36294499039649963, "learning_rate": 3.898437642177508e-05, "loss": 0.4517, "step": 2588 }, { "epoch": 1.4364947310038825, "grad_norm": 0.3447701036930084, "learning_rate": 3.8842122996009324e-05, "loss": 0.4231, "step": 2590 }, { "epoch": 1.437603993344426, "grad_norm": 0.27772748470306396, "learning_rate": 3.8700067003703474e-05, "loss": 0.4101, "step": 2592 }, { "epoch": 1.4387132556849695, "grad_norm": 0.28603261709213257, "learning_rate": 3.8558208903451096e-05, "loss": 0.3622, "step": 2594 }, { "epoch": 1.439822518025513, "grad_norm": 0.3110034763813019, "learning_rate": 3.84165491532069e-05, "loss": 0.3487, "step": 2596 }, { "epoch": 1.4409317803660566, "grad_norm": 0.41178080439567566, "learning_rate": 3.8275088210285195e-05, "loss": 0.4317, "step": 2598 }, { "epoch": 1.4420410427066002, "grad_norm": 0.35106709599494934, "learning_rate": 3.813382653135849e-05, "loss": 0.3458, "step": 2600 }, { "epoch": 1.4431503050471437, "grad_norm": 0.28827354311943054, "learning_rate": 3.799276457245612e-05, "loss": 0.4995, "step": 2602 }, { "epoch": 1.4442595673876872, "grad_norm": 0.2418784201145172, "learning_rate": 3.785190278896258e-05, "loss": 0.3832, "step": 2604 }, { "epoch": 1.4453688297282308, "grad_norm": 0.2937265932559967, "learning_rate": 3.771124163561614e-05, "loss": 0.4129, "step": 2606 }, { "epoch": 1.4464780920687743, "grad_norm": 0.31382623314857483, "learning_rate": 3.757078156650745e-05, "loss": 0.3812, "step": 2608 }, { "epoch": 1.4475873544093179, "grad_norm": 0.4727577269077301, "learning_rate": 3.7430523035078016e-05, "loss": 0.331, "step": 2610 }, { "epoch": 1.4486966167498614, "grad_norm": 0.2999882102012634, "learning_rate": 3.729046649411865e-05, "loss": 0.442, "step": 2612 }, { "epoch": 1.449805879090405, "grad_norm": 0.23084048926830292, "learning_rate": 3.715061239576809e-05, "loss": 0.4015, "step": 2614 }, { "epoch": 1.4509151414309485, "grad_norm": 0.303265780210495, "learning_rate": 3.701096119151165e-05, "loss": 0.3631, "step": 2616 }, { "epoch": 1.452024403771492, "grad_norm": 0.27064260840415955, "learning_rate": 3.687151333217952e-05, "loss": 0.3674, "step": 2618 }, { "epoch": 1.4531336661120355, "grad_norm": 0.3085685074329376, "learning_rate": 3.6732269267945506e-05, "loss": 0.3967, "step": 2620 }, { "epoch": 1.454242928452579, "grad_norm": 0.30328476428985596, "learning_rate": 3.659322944832545e-05, "loss": 0.3723, "step": 2622 }, { "epoch": 1.4553521907931226, "grad_norm": 0.4076519012451172, "learning_rate": 3.645439432217593e-05, "loss": 0.3435, "step": 2624 }, { "epoch": 1.4564614531336662, "grad_norm": 0.32473883032798767, "learning_rate": 3.63157643376927e-05, "loss": 0.3348, "step": 2626 }, { "epoch": 1.4575707154742097, "grad_norm": 0.37467846274375916, "learning_rate": 3.617733994240921e-05, "loss": 0.4531, "step": 2628 }, { "epoch": 1.4586799778147532, "grad_norm": 0.25045719742774963, "learning_rate": 3.6039121583195224e-05, "loss": 0.3328, "step": 2630 }, { "epoch": 1.4597892401552968, "grad_norm": 0.28525876998901367, "learning_rate": 3.590110970625543e-05, "loss": 0.2818, "step": 2632 }, { "epoch": 1.4608985024958403, "grad_norm": 0.3426540791988373, "learning_rate": 3.576330475712788e-05, "loss": 0.4207, "step": 2634 }, { "epoch": 1.4620077648363838, "grad_norm": 0.2515394985675812, "learning_rate": 3.562570718068259e-05, "loss": 0.2807, "step": 2636 }, { "epoch": 1.4631170271769274, "grad_norm": 0.33639976382255554, "learning_rate": 3.5488317421120174e-05, "loss": 0.4283, "step": 2638 }, { "epoch": 1.464226289517471, "grad_norm": 0.3443042039871216, "learning_rate": 3.535113592197041e-05, "loss": 0.4462, "step": 2640 }, { "epoch": 1.4653355518580145, "grad_norm": 0.26121675968170166, "learning_rate": 3.521416312609064e-05, "loss": 0.3072, "step": 2642 }, { "epoch": 1.466444814198558, "grad_norm": 0.2611722946166992, "learning_rate": 3.507739947566447e-05, "loss": 0.3332, "step": 2644 }, { "epoch": 1.4675540765391015, "grad_norm": 0.28175342082977295, "learning_rate": 3.4940845412200465e-05, "loss": 0.3816, "step": 2646 }, { "epoch": 1.468663338879645, "grad_norm": 0.26533833146095276, "learning_rate": 3.480450137653043e-05, "loss": 0.3493, "step": 2648 }, { "epoch": 1.4697726012201886, "grad_norm": 0.28460225462913513, "learning_rate": 3.466836780880818e-05, "loss": 0.351, "step": 2650 }, { "epoch": 1.4708818635607321, "grad_norm": 0.2972380518913269, "learning_rate": 3.4532445148508164e-05, "loss": 0.3784, "step": 2652 }, { "epoch": 1.4719911259012757, "grad_norm": 0.23668786883354187, "learning_rate": 3.439673383442393e-05, "loss": 0.289, "step": 2654 }, { "epoch": 1.4731003882418192, "grad_norm": 0.3169480264186859, "learning_rate": 3.426123430466672e-05, "loss": 0.4497, "step": 2656 }, { "epoch": 1.4742096505823628, "grad_norm": 0.33888205885887146, "learning_rate": 3.412594699666406e-05, "loss": 0.3623, "step": 2658 }, { "epoch": 1.4753189129229063, "grad_norm": 0.32237014174461365, "learning_rate": 3.399087234715846e-05, "loss": 0.3569, "step": 2660 }, { "epoch": 1.4764281752634498, "grad_norm": 0.29429054260253906, "learning_rate": 3.3856010792205836e-05, "loss": 0.3411, "step": 2662 }, { "epoch": 1.4775374376039934, "grad_norm": 0.2668110728263855, "learning_rate": 3.372136276717417e-05, "loss": 0.4503, "step": 2664 }, { "epoch": 1.478646699944537, "grad_norm": 0.30392691493034363, "learning_rate": 3.358692870674219e-05, "loss": 0.406, "step": 2666 }, { "epoch": 1.4797559622850804, "grad_norm": 0.3019475042819977, "learning_rate": 3.34527090448978e-05, "loss": 0.2844, "step": 2668 }, { "epoch": 1.480865224625624, "grad_norm": 0.3453599214553833, "learning_rate": 3.331870421493688e-05, "loss": 0.5458, "step": 2670 }, { "epoch": 1.4819744869661675, "grad_norm": 0.36470338702201843, "learning_rate": 3.318491464946163e-05, "loss": 0.3794, "step": 2672 }, { "epoch": 1.483083749306711, "grad_norm": 0.3321688771247864, "learning_rate": 3.3051340780379494e-05, "loss": 0.4129, "step": 2674 }, { "epoch": 1.4841930116472546, "grad_norm": 0.34594064950942993, "learning_rate": 3.291798303890146e-05, "loss": 0.3418, "step": 2676 }, { "epoch": 1.4853022739877981, "grad_norm": 0.33681973814964294, "learning_rate": 3.2784841855540835e-05, "loss": 0.3854, "step": 2678 }, { "epoch": 1.4864115363283417, "grad_norm": 0.2645164430141449, "learning_rate": 3.265191766011181e-05, "loss": 0.3878, "step": 2680 }, { "epoch": 1.4875207986688852, "grad_norm": 0.29455429315567017, "learning_rate": 3.2519210881728114e-05, "loss": 0.4093, "step": 2682 }, { "epoch": 1.4886300610094287, "grad_norm": 0.3346690833568573, "learning_rate": 3.238672194880162e-05, "loss": 0.5245, "step": 2684 }, { "epoch": 1.4897393233499723, "grad_norm": 0.24734847247600555, "learning_rate": 3.2254451289040886e-05, "loss": 0.3321, "step": 2686 }, { "epoch": 1.4908485856905158, "grad_norm": 0.29895398020744324, "learning_rate": 3.212239932944979e-05, "loss": 0.4684, "step": 2688 }, { "epoch": 1.4919578480310594, "grad_norm": 0.2465728223323822, "learning_rate": 3.1990566496326333e-05, "loss": 0.3415, "step": 2690 }, { "epoch": 1.493067110371603, "grad_norm": 0.4069317579269409, "learning_rate": 3.185895321526099e-05, "loss": 0.3942, "step": 2692 }, { "epoch": 1.4941763727121464, "grad_norm": 0.38210323452949524, "learning_rate": 3.1727559911135464e-05, "loss": 0.4437, "step": 2694 }, { "epoch": 1.49528563505269, "grad_norm": 0.34570619463920593, "learning_rate": 3.1596387008121385e-05, "loss": 0.357, "step": 2696 }, { "epoch": 1.4963948973932335, "grad_norm": 0.27611759305000305, "learning_rate": 3.146543492967889e-05, "loss": 0.4378, "step": 2698 }, { "epoch": 1.497504159733777, "grad_norm": 0.3507147431373596, "learning_rate": 3.133470409855516e-05, "loss": 0.4252, "step": 2700 }, { "epoch": 1.4986134220743206, "grad_norm": 0.25004035234451294, "learning_rate": 3.1204194936783114e-05, "loss": 0.3186, "step": 2702 }, { "epoch": 1.4997226844148641, "grad_norm": 0.28456154465675354, "learning_rate": 3.1073907865680195e-05, "loss": 0.4283, "step": 2704 }, { "epoch": 1.5008319467554077, "grad_norm": 0.22293393313884735, "learning_rate": 3.094384330584674e-05, "loss": 0.3121, "step": 2706 }, { "epoch": 1.5019412090959512, "grad_norm": 0.3094489872455597, "learning_rate": 3.0814001677164816e-05, "loss": 0.4233, "step": 2708 }, { "epoch": 1.5030504714364947, "grad_norm": 0.3050517141819, "learning_rate": 3.0684383398796834e-05, "loss": 0.4525, "step": 2710 }, { "epoch": 1.5041597337770383, "grad_norm": 0.372335821390152, "learning_rate": 3.055498888918419e-05, "loss": 0.4682, "step": 2712 }, { "epoch": 1.5052689961175818, "grad_norm": 0.24653975665569305, "learning_rate": 3.042581856604583e-05, "loss": 0.3376, "step": 2714 }, { "epoch": 1.5063782584581253, "grad_norm": 0.3160128891468048, "learning_rate": 3.0296872846376945e-05, "loss": 0.4083, "step": 2716 }, { "epoch": 1.5074875207986689, "grad_norm": 0.4075262248516083, "learning_rate": 3.016815214644778e-05, "loss": 0.3918, "step": 2718 }, { "epoch": 1.5085967831392124, "grad_norm": 0.2748975157737732, "learning_rate": 3.003965688180206e-05, "loss": 0.425, "step": 2720 }, { "epoch": 1.509706045479756, "grad_norm": 0.3711773753166199, "learning_rate": 2.9911387467255734e-05, "loss": 0.3966, "step": 2722 }, { "epoch": 1.5108153078202995, "grad_norm": 0.2733793258666992, "learning_rate": 2.978334431689568e-05, "loss": 0.4595, "step": 2724 }, { "epoch": 1.511924570160843, "grad_norm": 0.33144858479499817, "learning_rate": 2.9655527844078345e-05, "loss": 0.3786, "step": 2726 }, { "epoch": 1.5130338325013866, "grad_norm": 0.308682918548584, "learning_rate": 2.9527938461428428e-05, "loss": 0.3316, "step": 2728 }, { "epoch": 1.51414309484193, "grad_norm": 0.23612940311431885, "learning_rate": 2.940057658083747e-05, "loss": 0.3468, "step": 2730 }, { "epoch": 1.5152523571824736, "grad_norm": 0.27691999077796936, "learning_rate": 2.9273442613462543e-05, "loss": 0.4282, "step": 2732 }, { "epoch": 1.5163616195230172, "grad_norm": 0.31364646553993225, "learning_rate": 2.914653696972508e-05, "loss": 0.3863, "step": 2734 }, { "epoch": 1.5174708818635607, "grad_norm": 0.3351525366306305, "learning_rate": 2.9019860059309335e-05, "loss": 0.3861, "step": 2736 }, { "epoch": 1.5185801442041043, "grad_norm": 0.24499474465847015, "learning_rate": 2.8893412291161114e-05, "loss": 0.3362, "step": 2738 }, { "epoch": 1.5196894065446478, "grad_norm": 0.249772846698761, "learning_rate": 2.876719407348659e-05, "loss": 0.3336, "step": 2740 }, { "epoch": 1.5207986688851913, "grad_norm": 0.2366907149553299, "learning_rate": 2.864120581375088e-05, "loss": 0.4717, "step": 2742 }, { "epoch": 1.5219079312257349, "grad_norm": 0.2543250322341919, "learning_rate": 2.8515447918676664e-05, "loss": 0.338, "step": 2744 }, { "epoch": 1.5230171935662784, "grad_norm": 0.3489428162574768, "learning_rate": 2.8389920794242963e-05, "loss": 0.4358, "step": 2746 }, { "epoch": 1.524126455906822, "grad_norm": 0.32319262623786926, "learning_rate": 2.8264624845683894e-05, "loss": 0.4148, "step": 2748 }, { "epoch": 1.5252357182473655, "grad_norm": 0.32093530893325806, "learning_rate": 2.813956047748717e-05, "loss": 0.4165, "step": 2750 }, { "epoch": 1.526344980587909, "grad_norm": 0.28289756178855896, "learning_rate": 2.801472809339294e-05, "loss": 0.3601, "step": 2752 }, { "epoch": 1.5274542429284526, "grad_norm": 0.28103169798851013, "learning_rate": 2.7890128096392477e-05, "loss": 0.3864, "step": 2754 }, { "epoch": 1.528563505268996, "grad_norm": 0.3099244236946106, "learning_rate": 2.7765760888726855e-05, "loss": 0.3917, "step": 2756 }, { "epoch": 1.5296727676095396, "grad_norm": 0.23606978356838226, "learning_rate": 2.7641626871885596e-05, "loss": 0.3313, "step": 2758 }, { "epoch": 1.5307820299500832, "grad_norm": 0.36333397030830383, "learning_rate": 2.7517726446605406e-05, "loss": 0.3982, "step": 2760 }, { "epoch": 1.5318912922906267, "grad_norm": 0.27382388710975647, "learning_rate": 2.7394060012868995e-05, "loss": 0.2713, "step": 2762 }, { "epoch": 1.5330005546311702, "grad_norm": 0.2855754494667053, "learning_rate": 2.7270627969903608e-05, "loss": 0.4792, "step": 2764 }, { "epoch": 1.5341098169717138, "grad_norm": 0.35454511642456055, "learning_rate": 2.714743071617979e-05, "loss": 0.3661, "step": 2766 }, { "epoch": 1.5352190793122573, "grad_norm": 0.22000765800476074, "learning_rate": 2.7024468649410228e-05, "loss": 0.3621, "step": 2768 }, { "epoch": 1.5363283416528009, "grad_norm": 0.28072547912597656, "learning_rate": 2.6901742166548262e-05, "loss": 0.3846, "step": 2770 }, { "epoch": 1.5374376039933444, "grad_norm": 0.2560584545135498, "learning_rate": 2.6779251663786797e-05, "loss": 0.4105, "step": 2772 }, { "epoch": 1.538546866333888, "grad_norm": 0.33404773473739624, "learning_rate": 2.665699753655684e-05, "loss": 0.3561, "step": 2774 }, { "epoch": 1.5396561286744315, "grad_norm": 0.33240342140197754, "learning_rate": 2.6534980179526415e-05, "loss": 0.3972, "step": 2776 }, { "epoch": 1.540765391014975, "grad_norm": 0.26221776008605957, "learning_rate": 2.6413199986599112e-05, "loss": 0.3542, "step": 2778 }, { "epoch": 1.5418746533555185, "grad_norm": 0.2851394712924957, "learning_rate": 2.6291657350912923e-05, "loss": 0.3402, "step": 2780 }, { "epoch": 1.542983915696062, "grad_norm": 0.27777722477912903, "learning_rate": 2.6170352664838903e-05, "loss": 0.4094, "step": 2782 }, { "epoch": 1.5440931780366056, "grad_norm": 0.32692790031433105, "learning_rate": 2.6049286319980014e-05, "loss": 0.4145, "step": 2784 }, { "epoch": 1.5452024403771492, "grad_norm": 0.37069231271743774, "learning_rate": 2.5928458707169813e-05, "loss": 0.4012, "step": 2786 }, { "epoch": 1.5463117027176927, "grad_norm": 0.28681105375289917, "learning_rate": 2.5807870216471052e-05, "loss": 0.4338, "step": 2788 }, { "epoch": 1.5474209650582362, "grad_norm": 0.3061560094356537, "learning_rate": 2.5687521237174584e-05, "loss": 0.4096, "step": 2790 }, { "epoch": 1.5485302273987798, "grad_norm": 0.3024190664291382, "learning_rate": 2.5567412157798133e-05, "loss": 0.3737, "step": 2792 }, { "epoch": 1.5496394897393233, "grad_norm": 0.22082455456256866, "learning_rate": 2.544754336608486e-05, "loss": 0.3517, "step": 2794 }, { "epoch": 1.5507487520798668, "grad_norm": 0.23570817708969116, "learning_rate": 2.5327915249002245e-05, "loss": 0.358, "step": 2796 }, { "epoch": 1.5518580144204104, "grad_norm": 0.28938063979148865, "learning_rate": 2.5208528192740834e-05, "loss": 0.3861, "step": 2798 }, { "epoch": 1.552967276760954, "grad_norm": 0.22857078909873962, "learning_rate": 2.5089382582712994e-05, "loss": 0.3072, "step": 2800 }, { "epoch": 1.5540765391014975, "grad_norm": 0.28918081521987915, "learning_rate": 2.4970478803551565e-05, "loss": 0.3366, "step": 2802 }, { "epoch": 1.555185801442041, "grad_norm": 0.28605908155441284, "learning_rate": 2.4851817239108688e-05, "loss": 0.31, "step": 2804 }, { "epoch": 1.5562950637825845, "grad_norm": 0.26103734970092773, "learning_rate": 2.4733398272454687e-05, "loss": 0.3324, "step": 2806 }, { "epoch": 1.557404326123128, "grad_norm": 0.3307429850101471, "learning_rate": 2.4615222285876616e-05, "loss": 0.3568, "step": 2808 }, { "epoch": 1.5585135884636716, "grad_norm": 0.2729584574699402, "learning_rate": 2.449728966087712e-05, "loss": 0.3475, "step": 2810 }, { "epoch": 1.5596228508042151, "grad_norm": 0.27880561351776123, "learning_rate": 2.437960077817326e-05, "loss": 0.371, "step": 2812 }, { "epoch": 1.5607321131447587, "grad_norm": 0.29016315937042236, "learning_rate": 2.426215601769526e-05, "loss": 0.3247, "step": 2814 }, { "epoch": 1.5618413754853022, "grad_norm": 0.3246505558490753, "learning_rate": 2.4144955758585184e-05, "loss": 0.4428, "step": 2816 }, { "epoch": 1.5629506378258458, "grad_norm": 0.27316877245903015, "learning_rate": 2.402800037919578e-05, "loss": 0.3025, "step": 2818 }, { "epoch": 1.5640599001663893, "grad_norm": 0.2579948902130127, "learning_rate": 2.3911290257089348e-05, "loss": 0.3673, "step": 2820 }, { "epoch": 1.5651691625069328, "grad_norm": 0.3941158354282379, "learning_rate": 2.3794825769036334e-05, "loss": 0.4028, "step": 2822 }, { "epoch": 1.5662784248474764, "grad_norm": 0.2645871341228485, "learning_rate": 2.3678607291014242e-05, "loss": 0.3511, "step": 2824 }, { "epoch": 1.56738768718802, "grad_norm": 0.2745266854763031, "learning_rate": 2.356263519820647e-05, "loss": 0.3726, "step": 2826 }, { "epoch": 1.5684969495285634, "grad_norm": 0.4434897303581238, "learning_rate": 2.3446909865000886e-05, "loss": 0.5269, "step": 2828 }, { "epoch": 1.569606211869107, "grad_norm": 0.27076244354248047, "learning_rate": 2.333143166498889e-05, "loss": 0.3558, "step": 2830 }, { "epoch": 1.5707154742096505, "grad_norm": 0.3606158196926117, "learning_rate": 2.3216200970963954e-05, "loss": 0.4266, "step": 2832 }, { "epoch": 1.571824736550194, "grad_norm": 0.2903146743774414, "learning_rate": 2.3101218154920633e-05, "loss": 0.3087, "step": 2834 }, { "epoch": 1.5729339988907376, "grad_norm": 0.3455762565135956, "learning_rate": 2.298648358805322e-05, "loss": 0.389, "step": 2836 }, { "epoch": 1.5740432612312811, "grad_norm": 0.2828775942325592, "learning_rate": 2.2871997640754572e-05, "loss": 0.3795, "step": 2838 }, { "epoch": 1.5751525235718247, "grad_norm": 0.3083617389202118, "learning_rate": 2.275776068261495e-05, "loss": 0.3764, "step": 2840 }, { "epoch": 1.5762617859123682, "grad_norm": 0.32137981057167053, "learning_rate": 2.264377308242086e-05, "loss": 0.3609, "step": 2842 }, { "epoch": 1.5773710482529117, "grad_norm": 0.2916238605976105, "learning_rate": 2.2530035208153822e-05, "loss": 0.3584, "step": 2844 }, { "epoch": 1.5784803105934553, "grad_norm": 0.26702216267585754, "learning_rate": 2.241654742698909e-05, "loss": 0.3635, "step": 2846 }, { "epoch": 1.5795895729339988, "grad_norm": 0.3438095152378082, "learning_rate": 2.2303310105294582e-05, "loss": 0.372, "step": 2848 }, { "epoch": 1.5806988352745424, "grad_norm": 0.31688249111175537, "learning_rate": 2.219032360862976e-05, "loss": 0.3912, "step": 2850 }, { "epoch": 1.581808097615086, "grad_norm": 0.2813704013824463, "learning_rate": 2.2077588301744233e-05, "loss": 0.3545, "step": 2852 }, { "epoch": 1.5829173599556294, "grad_norm": 0.23737509548664093, "learning_rate": 2.1965104548576753e-05, "loss": 0.3507, "step": 2854 }, { "epoch": 1.584026622296173, "grad_norm": 0.32858365774154663, "learning_rate": 2.1852872712254002e-05, "loss": 0.3221, "step": 2856 }, { "epoch": 1.5851358846367165, "grad_norm": 0.2847982943058014, "learning_rate": 2.1740893155089447e-05, "loss": 0.3456, "step": 2858 }, { "epoch": 1.58624514697726, "grad_norm": 0.28835567831993103, "learning_rate": 2.1629166238582056e-05, "loss": 0.3682, "step": 2860 }, { "epoch": 1.5873544093178036, "grad_norm": 0.2693901062011719, "learning_rate": 2.1517692323415205e-05, "loss": 0.3503, "step": 2862 }, { "epoch": 1.5884636716583471, "grad_norm": 0.2496192455291748, "learning_rate": 2.1406471769455615e-05, "loss": 0.3414, "step": 2864 }, { "epoch": 1.5895729339988907, "grad_norm": 0.2739793658256531, "learning_rate": 2.129550493575201e-05, "loss": 0.4304, "step": 2866 }, { "epoch": 1.5906821963394342, "grad_norm": 0.2115955650806427, "learning_rate": 2.118479218053401e-05, "loss": 0.3131, "step": 2868 }, { "epoch": 1.5917914586799777, "grad_norm": 0.283636212348938, "learning_rate": 2.1074333861211103e-05, "loss": 0.4183, "step": 2870 }, { "epoch": 1.5929007210205213, "grad_norm": 0.2762402594089508, "learning_rate": 2.096413033437131e-05, "loss": 0.3805, "step": 2872 }, { "epoch": 1.5940099833610648, "grad_norm": 0.27344250679016113, "learning_rate": 2.0854181955780183e-05, "loss": 0.3537, "step": 2874 }, { "epoch": 1.5951192457016083, "grad_norm": 0.3143325448036194, "learning_rate": 2.0744489080379504e-05, "loss": 0.3461, "step": 2876 }, { "epoch": 1.5962285080421519, "grad_norm": 0.26111075282096863, "learning_rate": 2.063505206228632e-05, "loss": 0.3634, "step": 2878 }, { "epoch": 1.5973377703826954, "grad_norm": 0.32173627614974976, "learning_rate": 2.0525871254791627e-05, "loss": 0.3973, "step": 2880 }, { "epoch": 1.598447032723239, "grad_norm": 0.2806760370731354, "learning_rate": 2.0416947010359355e-05, "loss": 0.3786, "step": 2882 }, { "epoch": 1.5995562950637825, "grad_norm": 0.30123627185821533, "learning_rate": 2.030827968062513e-05, "loss": 0.427, "step": 2884 }, { "epoch": 1.600665557404326, "grad_norm": 0.322729229927063, "learning_rate": 2.019986961639524e-05, "loss": 0.353, "step": 2886 }, { "epoch": 1.6017748197448696, "grad_norm": 0.2584727108478546, "learning_rate": 2.0091717167645475e-05, "loss": 0.2905, "step": 2888 }, { "epoch": 1.602884082085413, "grad_norm": 0.2751784026622772, "learning_rate": 1.9983822683519915e-05, "loss": 0.3394, "step": 2890 }, { "epoch": 1.6039933444259566, "grad_norm": 0.29693764448165894, "learning_rate": 1.9876186512329853e-05, "loss": 0.4027, "step": 2892 }, { "epoch": 1.6051026067665002, "grad_norm": 0.2711539566516876, "learning_rate": 1.9768809001552768e-05, "loss": 0.349, "step": 2894 }, { "epoch": 1.6062118691070437, "grad_norm": 0.25827860832214355, "learning_rate": 1.9661690497831053e-05, "loss": 0.4183, "step": 2896 }, { "epoch": 1.6073211314475873, "grad_norm": 0.34938088059425354, "learning_rate": 1.9554831346970925e-05, "loss": 0.3684, "step": 2898 }, { "epoch": 1.6084303937881308, "grad_norm": 0.26432278752326965, "learning_rate": 1.9448231893941414e-05, "loss": 0.4979, "step": 2900 }, { "epoch": 1.6095396561286743, "grad_norm": 0.32702112197875977, "learning_rate": 1.9341892482873192e-05, "loss": 0.3844, "step": 2902 }, { "epoch": 1.6106489184692179, "grad_norm": 0.36097395420074463, "learning_rate": 1.923581345705736e-05, "loss": 0.3576, "step": 2904 }, { "epoch": 1.6117581808097614, "grad_norm": 0.3077182471752167, "learning_rate": 1.912999515894448e-05, "loss": 0.5143, "step": 2906 }, { "epoch": 1.612867443150305, "grad_norm": 0.2704939544200897, "learning_rate": 1.9024437930143435e-05, "loss": 0.3342, "step": 2908 }, { "epoch": 1.6139767054908485, "grad_norm": 0.22881537675857544, "learning_rate": 1.8919142111420284e-05, "loss": 0.3769, "step": 2910 }, { "epoch": 1.615085967831392, "grad_norm": 0.29385611414909363, "learning_rate": 1.8814108042697144e-05, "loss": 0.3847, "step": 2912 }, { "epoch": 1.6161952301719356, "grad_norm": 0.4236384630203247, "learning_rate": 1.870933606305122e-05, "loss": 0.4581, "step": 2914 }, { "epoch": 1.617304492512479, "grad_norm": 0.2979065477848053, "learning_rate": 1.8604826510713613e-05, "loss": 0.4182, "step": 2916 }, { "epoch": 1.6184137548530226, "grad_norm": 0.335405170917511, "learning_rate": 1.8500579723068177e-05, "loss": 0.3544, "step": 2918 }, { "epoch": 1.6195230171935662, "grad_norm": 0.2822960615158081, "learning_rate": 1.8396596036650514e-05, "loss": 0.336, "step": 2920 }, { "epoch": 1.6206322795341097, "grad_norm": 0.3513801395893097, "learning_rate": 1.8292875787146946e-05, "loss": 0.4, "step": 2922 }, { "epoch": 1.6217415418746532, "grad_norm": 0.2501135766506195, "learning_rate": 1.8189419309393242e-05, "loss": 0.3641, "step": 2924 }, { "epoch": 1.6228508042151968, "grad_norm": 0.3006201684474945, "learning_rate": 1.8086226937373674e-05, "loss": 0.4112, "step": 2926 }, { "epoch": 1.6239600665557403, "grad_norm": 0.2748831808567047, "learning_rate": 1.798329900422e-05, "loss": 0.31, "step": 2928 }, { "epoch": 1.6250693288962839, "grad_norm": 0.3650710880756378, "learning_rate": 1.788063584221017e-05, "loss": 0.3872, "step": 2930 }, { "epoch": 1.6261785912368274, "grad_norm": 0.3932930827140808, "learning_rate": 1.7778237782767504e-05, "loss": 0.4484, "step": 2932 }, { "epoch": 1.627287853577371, "grad_norm": 0.25739145278930664, "learning_rate": 1.7676105156459398e-05, "loss": 0.3541, "step": 2934 }, { "epoch": 1.6283971159179145, "grad_norm": 0.22192710638046265, "learning_rate": 1.7574238292996458e-05, "loss": 0.3301, "step": 2936 }, { "epoch": 1.629506378258458, "grad_norm": 0.2925964593887329, "learning_rate": 1.7472637521231283e-05, "loss": 0.4855, "step": 2938 }, { "epoch": 1.6306156405990015, "grad_norm": 0.2878285050392151, "learning_rate": 1.737130316915744e-05, "loss": 0.4119, "step": 2940 }, { "epoch": 1.631724902939545, "grad_norm": 0.2855752110481262, "learning_rate": 1.7270235563908443e-05, "loss": 0.4221, "step": 2942 }, { "epoch": 1.6328341652800886, "grad_norm": 0.30537575483322144, "learning_rate": 1.716943503175671e-05, "loss": 0.4187, "step": 2944 }, { "epoch": 1.6339434276206322, "grad_norm": 0.32603779435157776, "learning_rate": 1.7068901898112478e-05, "loss": 0.4118, "step": 2946 }, { "epoch": 1.6350526899611757, "grad_norm": 0.21832433342933655, "learning_rate": 1.6968636487522705e-05, "loss": 0.3122, "step": 2948 }, { "epoch": 1.6361619523017192, "grad_norm": 0.30126479268074036, "learning_rate": 1.686863912367006e-05, "loss": 0.322, "step": 2950 }, { "epoch": 1.6372712146422628, "grad_norm": 0.27455347776412964, "learning_rate": 1.6768910129371986e-05, "loss": 0.3588, "step": 2952 }, { "epoch": 1.6383804769828063, "grad_norm": 0.26136961579322815, "learning_rate": 1.6669449826579464e-05, "loss": 0.3672, "step": 2954 }, { "epoch": 1.6394897393233498, "grad_norm": 0.24132628738880157, "learning_rate": 1.6570258536376083e-05, "loss": 0.3935, "step": 2956 }, { "epoch": 1.6405990016638934, "grad_norm": 0.38164663314819336, "learning_rate": 1.6471336578977016e-05, "loss": 0.4923, "step": 2958 }, { "epoch": 1.641708264004437, "grad_norm": 0.3024519979953766, "learning_rate": 1.637268427372799e-05, "loss": 0.4043, "step": 2960 }, { "epoch": 1.6428175263449805, "grad_norm": 0.29123973846435547, "learning_rate": 1.627430193910414e-05, "loss": 0.3372, "step": 2962 }, { "epoch": 1.643926788685524, "grad_norm": 0.2549437880516052, "learning_rate": 1.6176189892709127e-05, "loss": 0.2834, "step": 2964 }, { "epoch": 1.6450360510260675, "grad_norm": 0.3285108804702759, "learning_rate": 1.607834845127405e-05, "loss": 0.3657, "step": 2966 }, { "epoch": 1.646145313366611, "grad_norm": 0.24914546310901642, "learning_rate": 1.59807779306564e-05, "loss": 0.3498, "step": 2968 }, { "epoch": 1.6472545757071546, "grad_norm": 0.2854565978050232, "learning_rate": 1.5883478645839045e-05, "loss": 0.3597, "step": 2970 }, { "epoch": 1.6483638380476981, "grad_norm": 0.24184933304786682, "learning_rate": 1.578645091092933e-05, "loss": 0.3682, "step": 2972 }, { "epoch": 1.6494731003882417, "grad_norm": 0.30457058548927307, "learning_rate": 1.5689695039157848e-05, "loss": 0.3172, "step": 2974 }, { "epoch": 1.6505823627287852, "grad_norm": 0.23675574362277985, "learning_rate": 1.5593211342877645e-05, "loss": 0.27, "step": 2976 }, { "epoch": 1.6516916250693288, "grad_norm": 0.3284320533275604, "learning_rate": 1.5497000133563022e-05, "loss": 0.4104, "step": 2978 }, { "epoch": 1.6528008874098723, "grad_norm": 0.4357747435569763, "learning_rate": 1.540106172180873e-05, "loss": 0.4127, "step": 2980 }, { "epoch": 1.6539101497504158, "grad_norm": 0.3309295177459717, "learning_rate": 1.5305396417328756e-05, "loss": 0.4256, "step": 2982 }, { "epoch": 1.6550194120909594, "grad_norm": 0.27238425612449646, "learning_rate": 1.5210004528955468e-05, "loss": 0.359, "step": 2984 }, { "epoch": 1.656128674431503, "grad_norm": 0.30173739790916443, "learning_rate": 1.5114886364638614e-05, "loss": 0.4343, "step": 2986 }, { "epoch": 1.6572379367720464, "grad_norm": 0.29943424463272095, "learning_rate": 1.5020042231444197e-05, "loss": 0.344, "step": 2988 }, { "epoch": 1.65834719911259, "grad_norm": 0.34404435753822327, "learning_rate": 1.4925472435553701e-05, "loss": 0.3992, "step": 2990 }, { "epoch": 1.6594564614531335, "grad_norm": 0.3563268184661865, "learning_rate": 1.4831177282262842e-05, "loss": 0.4014, "step": 2992 }, { "epoch": 1.660565723793677, "grad_norm": 0.2387107014656067, "learning_rate": 1.4737157075980845e-05, "loss": 0.3141, "step": 2994 }, { "epoch": 1.6616749861342206, "grad_norm": 0.3576110899448395, "learning_rate": 1.4643412120229262e-05, "loss": 0.3765, "step": 2996 }, { "epoch": 1.6627842484747641, "grad_norm": 0.335438072681427, "learning_rate": 1.4549942717641052e-05, "loss": 0.3619, "step": 2998 }, { "epoch": 1.6638935108153077, "grad_norm": 0.2912905216217041, "learning_rate": 1.4456749169959648e-05, "loss": 0.3389, "step": 3000 }, { "epoch": 1.6650027731558512, "grad_norm": 0.2544459104537964, "learning_rate": 1.4363831778037961e-05, "loss": 0.2778, "step": 3002 }, { "epoch": 1.6661120354963947, "grad_norm": 0.36533600091934204, "learning_rate": 1.42711908418374e-05, "loss": 0.3798, "step": 3004 }, { "epoch": 1.6672212978369383, "grad_norm": 0.2710284888744354, "learning_rate": 1.4178826660426891e-05, "loss": 0.305, "step": 3006 }, { "epoch": 1.6683305601774818, "grad_norm": 0.21859732270240784, "learning_rate": 1.4086739531981885e-05, "loss": 0.4432, "step": 3008 }, { "epoch": 1.6694398225180254, "grad_norm": 0.22141209244728088, "learning_rate": 1.3994929753783515e-05, "loss": 0.3012, "step": 3010 }, { "epoch": 1.670549084858569, "grad_norm": 0.3187447786331177, "learning_rate": 1.3903397622217506e-05, "loss": 0.3794, "step": 3012 }, { "epoch": 1.6716583471991124, "grad_norm": 0.37787067890167236, "learning_rate": 1.381214343277324e-05, "loss": 0.3672, "step": 3014 }, { "epoch": 1.672767609539656, "grad_norm": 0.3910747766494751, "learning_rate": 1.3721167480042885e-05, "loss": 0.4577, "step": 3016 }, { "epoch": 1.6738768718801995, "grad_norm": 0.3260791003704071, "learning_rate": 1.3630470057720402e-05, "loss": 0.4624, "step": 3018 }, { "epoch": 1.674986134220743, "grad_norm": 0.3197901248931885, "learning_rate": 1.3540051458600523e-05, "loss": 0.3861, "step": 3020 }, { "epoch": 1.6760953965612866, "grad_norm": 0.3465018570423126, "learning_rate": 1.3449911974577877e-05, "loss": 0.5036, "step": 3022 }, { "epoch": 1.6772046589018301, "grad_norm": 0.32030799984931946, "learning_rate": 1.3360051896646086e-05, "loss": 0.3244, "step": 3024 }, { "epoch": 1.6783139212423737, "grad_norm": 0.2779349088668823, "learning_rate": 1.3270471514896743e-05, "loss": 0.3362, "step": 3026 }, { "epoch": 1.6794231835829172, "grad_norm": 0.312095046043396, "learning_rate": 1.3181171118518465e-05, "loss": 0.5161, "step": 3028 }, { "epoch": 1.6805324459234607, "grad_norm": 0.2570739984512329, "learning_rate": 1.3092150995796115e-05, "loss": 0.3741, "step": 3030 }, { "epoch": 1.6816417082640043, "grad_norm": 0.27533021569252014, "learning_rate": 1.3003411434109647e-05, "loss": 0.3173, "step": 3032 }, { "epoch": 1.6827509706045478, "grad_norm": 0.2738421559333801, "learning_rate": 1.2914952719933371e-05, "loss": 0.4167, "step": 3034 }, { "epoch": 1.6838602329450914, "grad_norm": 0.21936124563217163, "learning_rate": 1.282677513883489e-05, "loss": 0.3356, "step": 3036 }, { "epoch": 1.6849694952856349, "grad_norm": 0.2971390187740326, "learning_rate": 1.2738878975474288e-05, "loss": 0.3919, "step": 3038 }, { "epoch": 1.6860787576261784, "grad_norm": 0.3661748766899109, "learning_rate": 1.2651264513603134e-05, "loss": 0.3864, "step": 3040 }, { "epoch": 1.687188019966722, "grad_norm": 0.3551200330257416, "learning_rate": 1.2563932036063586e-05, "loss": 0.3555, "step": 3042 }, { "epoch": 1.6882972823072657, "grad_norm": 0.27041590213775635, "learning_rate": 1.2476881824787467e-05, "loss": 0.295, "step": 3044 }, { "epoch": 1.6894065446478093, "grad_norm": 0.2313155084848404, "learning_rate": 1.2390114160795419e-05, "loss": 0.3177, "step": 3046 }, { "epoch": 1.6905158069883528, "grad_norm": 0.3004077970981598, "learning_rate": 1.2303629324195943e-05, "loss": 0.3845, "step": 3048 }, { "epoch": 1.6916250693288963, "grad_norm": 0.29577240347862244, "learning_rate": 1.2217427594184461e-05, "loss": 0.3376, "step": 3050 }, { "epoch": 1.6927343316694399, "grad_norm": 0.363438218832016, "learning_rate": 1.213150924904245e-05, "loss": 0.465, "step": 3052 }, { "epoch": 1.6938435940099834, "grad_norm": 0.2636345624923706, "learning_rate": 1.2045874566136617e-05, "loss": 0.2845, "step": 3054 }, { "epoch": 1.694952856350527, "grad_norm": 0.3315665125846863, "learning_rate": 1.1960523821917868e-05, "loss": 0.4179, "step": 3056 }, { "epoch": 1.6960621186910705, "grad_norm": 0.27641746401786804, "learning_rate": 1.1875457291920477e-05, "loss": 0.3542, "step": 3058 }, { "epoch": 1.697171381031614, "grad_norm": 0.39690592885017395, "learning_rate": 1.1790675250761263e-05, "loss": 0.4511, "step": 3060 }, { "epoch": 1.6982806433721576, "grad_norm": 0.1926700472831726, "learning_rate": 1.1706177972138599e-05, "loss": 0.2946, "step": 3062 }, { "epoch": 1.699389905712701, "grad_norm": 0.27746477723121643, "learning_rate": 1.1621965728831564e-05, "loss": 0.3691, "step": 3064 }, { "epoch": 1.7004991680532446, "grad_norm": 0.2863025367259979, "learning_rate": 1.1538038792699068e-05, "loss": 0.3466, "step": 3066 }, { "epoch": 1.7016084303937882, "grad_norm": 0.31509512662887573, "learning_rate": 1.1454397434679021e-05, "loss": 0.3379, "step": 3068 }, { "epoch": 1.7027176927343317, "grad_norm": 0.3157186806201935, "learning_rate": 1.1371041924787362e-05, "loss": 0.3854, "step": 3070 }, { "epoch": 1.7038269550748752, "grad_norm": 0.32956090569496155, "learning_rate": 1.128797253211723e-05, "loss": 0.3036, "step": 3072 }, { "epoch": 1.7049362174154188, "grad_norm": 0.24164661765098572, "learning_rate": 1.120518952483819e-05, "loss": 0.3209, "step": 3074 }, { "epoch": 1.7060454797559623, "grad_norm": 0.34098076820373535, "learning_rate": 1.1122693170195164e-05, "loss": 0.446, "step": 3076 }, { "epoch": 1.7071547420965059, "grad_norm": 0.3181568384170532, "learning_rate": 1.1040483734507789e-05, "loss": 0.3758, "step": 3078 }, { "epoch": 1.7082640044370494, "grad_norm": 0.2597646415233612, "learning_rate": 1.095856148316936e-05, "loss": 0.35, "step": 3080 }, { "epoch": 1.709373266777593, "grad_norm": 0.27917012572288513, "learning_rate": 1.087692668064616e-05, "loss": 0.3619, "step": 3082 }, { "epoch": 1.7104825291181365, "grad_norm": 0.2992468774318695, "learning_rate": 1.0795579590476445e-05, "loss": 0.37, "step": 3084 }, { "epoch": 1.71159179145868, "grad_norm": 0.3110543191432953, "learning_rate": 1.0714520475269652e-05, "loss": 0.3318, "step": 3086 }, { "epoch": 1.7127010537992235, "grad_norm": 0.393775075674057, "learning_rate": 1.0633749596705645e-05, "loss": 0.4044, "step": 3088 }, { "epoch": 1.713810316139767, "grad_norm": 0.32126861810684204, "learning_rate": 1.055326721553368e-05, "loss": 0.4077, "step": 3090 }, { "epoch": 1.7149195784803106, "grad_norm": 0.316629558801651, "learning_rate": 1.0473073591571758e-05, "loss": 0.3887, "step": 3092 }, { "epoch": 1.7160288408208542, "grad_norm": 0.24358634650707245, "learning_rate": 1.0393168983705626e-05, "loss": 0.3439, "step": 3094 }, { "epoch": 1.7171381031613977, "grad_norm": 0.3309425413608551, "learning_rate": 1.0313553649888074e-05, "loss": 0.3894, "step": 3096 }, { "epoch": 1.7182473655019412, "grad_norm": 0.3401065468788147, "learning_rate": 1.0234227847138011e-05, "loss": 0.376, "step": 3098 }, { "epoch": 1.7193566278424848, "grad_norm": 0.33251863718032837, "learning_rate": 1.0155191831539645e-05, "loss": 0.4203, "step": 3100 }, { "epoch": 1.7204658901830283, "grad_norm": 0.3005315363407135, "learning_rate": 1.0076445858241679e-05, "loss": 0.2993, "step": 3102 }, { "epoch": 1.7215751525235718, "grad_norm": 0.2971371114253998, "learning_rate": 9.997990181456528e-06, "loss": 0.3881, "step": 3104 }, { "epoch": 1.7226844148641154, "grad_norm": 0.2904921770095825, "learning_rate": 9.919825054459442e-06, "loss": 0.3812, "step": 3106 }, { "epoch": 1.723793677204659, "grad_norm": 0.3357609212398529, "learning_rate": 9.841950729587668e-06, "loss": 0.4121, "step": 3108 }, { "epoch": 1.7249029395452025, "grad_norm": 0.2711123526096344, "learning_rate": 9.764367458239677e-06, "loss": 0.3789, "step": 3110 }, { "epoch": 1.726012201885746, "grad_norm": 0.24408982694149017, "learning_rate": 9.687075490874376e-06, "loss": 0.3457, "step": 3112 }, { "epoch": 1.7271214642262895, "grad_norm": 0.25458860397338867, "learning_rate": 9.61007507701024e-06, "loss": 0.3098, "step": 3114 }, { "epoch": 1.728230726566833, "grad_norm": 0.2704317569732666, "learning_rate": 9.533366465224514e-06, "loss": 0.3471, "step": 3116 }, { "epoch": 1.7293399889073766, "grad_norm": 0.2258918136358261, "learning_rate": 9.456949903152478e-06, "loss": 0.4087, "step": 3118 }, { "epoch": 1.7304492512479202, "grad_norm": 0.20709431171417236, "learning_rate": 9.38082563748659e-06, "loss": 0.3383, "step": 3120 }, { "epoch": 1.7315585135884637, "grad_norm": 0.24197116494178772, "learning_rate": 9.30499391397568e-06, "loss": 0.3323, "step": 3122 }, { "epoch": 1.7326677759290072, "grad_norm": 0.30395829677581787, "learning_rate": 9.229454977424157e-06, "loss": 0.378, "step": 3124 }, { "epoch": 1.7337770382695508, "grad_norm": 0.2813956141471863, "learning_rate": 9.154209071691289e-06, "loss": 0.3326, "step": 3126 }, { "epoch": 1.7348863006100943, "grad_norm": 0.3281961679458618, "learning_rate": 9.079256439690354e-06, "loss": 0.3518, "step": 3128 }, { "epoch": 1.7359955629506378, "grad_norm": 0.3628225326538086, "learning_rate": 9.004597323387798e-06, "loss": 0.4188, "step": 3130 }, { "epoch": 1.7371048252911814, "grad_norm": 0.3164060711860657, "learning_rate": 8.930231963802637e-06, "loss": 0.3381, "step": 3132 }, { "epoch": 1.738214087631725, "grad_norm": 0.27229782938957214, "learning_rate": 8.856160601005459e-06, "loss": 0.3767, "step": 3134 }, { "epoch": 1.7393233499722685, "grad_norm": 0.34024956822395325, "learning_rate": 8.782383474117838e-06, "loss": 0.4573, "step": 3136 }, { "epoch": 1.740432612312812, "grad_norm": 0.32661277055740356, "learning_rate": 8.708900821311405e-06, "loss": 0.5145, "step": 3138 }, { "epoch": 1.7415418746533555, "grad_norm": 0.24198585748672485, "learning_rate": 8.635712879807222e-06, "loss": 0.2969, "step": 3140 }, { "epoch": 1.742651136993899, "grad_norm": 0.37718066573143005, "learning_rate": 8.562819885874884e-06, "loss": 0.5287, "step": 3142 }, { "epoch": 1.7437603993344426, "grad_norm": 0.3092913329601288, "learning_rate": 8.490222074831845e-06, "loss": 0.3534, "step": 3144 }, { "epoch": 1.7448696616749861, "grad_norm": 0.2609056830406189, "learning_rate": 8.417919681042652e-06, "loss": 0.3774, "step": 3146 }, { "epoch": 1.7459789240155297, "grad_norm": 0.3176262080669403, "learning_rate": 8.345912937918121e-06, "loss": 0.3448, "step": 3148 }, { "epoch": 1.7470881863560732, "grad_norm": 0.3105904757976532, "learning_rate": 8.274202077914705e-06, "loss": 0.3949, "step": 3150 }, { "epoch": 1.7481974486966168, "grad_norm": 0.3904447555541992, "learning_rate": 8.20278733253359e-06, "loss": 0.45, "step": 3152 }, { "epoch": 1.7493067110371603, "grad_norm": 0.27570340037345886, "learning_rate": 8.13166893232008e-06, "loss": 0.4282, "step": 3154 }, { "epoch": 1.7504159733777038, "grad_norm": 0.2809303104877472, "learning_rate": 8.060847106862779e-06, "loss": 0.3358, "step": 3156 }, { "epoch": 1.7515252357182474, "grad_norm": 0.43461307883262634, "learning_rate": 7.990322084792867e-06, "loss": 0.3352, "step": 3158 }, { "epoch": 1.752634498058791, "grad_norm": 0.3733227550983429, "learning_rate": 7.92009409378337e-06, "loss": 0.4386, "step": 3160 }, { "epoch": 1.7537437603993344, "grad_norm": 0.22569668292999268, "learning_rate": 7.850163360548424e-06, "loss": 0.2785, "step": 3162 }, { "epoch": 1.754853022739878, "grad_norm": 0.286538690328598, "learning_rate": 7.780530110842565e-06, "loss": 0.312, "step": 3164 }, { "epoch": 1.7559622850804215, "grad_norm": 0.2738610804080963, "learning_rate": 7.711194569459934e-06, "loss": 0.3244, "step": 3166 }, { "epoch": 1.757071547420965, "grad_norm": 0.30075690150260925, "learning_rate": 7.642156960233592e-06, "loss": 0.3691, "step": 3168 }, { "epoch": 1.7581808097615086, "grad_norm": 0.2853529453277588, "learning_rate": 7.573417506034852e-06, "loss": 0.3259, "step": 3170 }, { "epoch": 1.7592900721020521, "grad_norm": 0.23462392389774323, "learning_rate": 7.504976428772437e-06, "loss": 0.3671, "step": 3172 }, { "epoch": 1.7603993344425957, "grad_norm": 0.365106999874115, "learning_rate": 7.436833949391853e-06, "loss": 0.3698, "step": 3174 }, { "epoch": 1.7615085967831392, "grad_norm": 0.2944175899028778, "learning_rate": 7.368990287874711e-06, "loss": 0.3515, "step": 3176 }, { "epoch": 1.7626178591236827, "grad_norm": 0.2920864224433899, "learning_rate": 7.301445663237861e-06, "loss": 0.3424, "step": 3178 }, { "epoch": 1.7637271214642263, "grad_norm": 0.26654571294784546, "learning_rate": 7.234200293532889e-06, "loss": 0.3553, "step": 3180 }, { "epoch": 1.7648363838047698, "grad_norm": 0.2544094920158386, "learning_rate": 7.167254395845202e-06, "loss": 0.3715, "step": 3182 }, { "epoch": 1.7659456461453134, "grad_norm": 0.2914319932460785, "learning_rate": 7.1006081862935444e-06, "loss": 0.4023, "step": 3184 }, { "epoch": 1.767054908485857, "grad_norm": 0.3055804371833801, "learning_rate": 7.034261880029114e-06, "loss": 0.3967, "step": 3186 }, { "epoch": 1.7681641708264004, "grad_norm": 0.2863101661205292, "learning_rate": 6.968215691234936e-06, "loss": 0.3853, "step": 3188 }, { "epoch": 1.769273433166944, "grad_norm": 0.28304606676101685, "learning_rate": 6.902469833125236e-06, "loss": 0.3937, "step": 3190 }, { "epoch": 1.7703826955074875, "grad_norm": 0.2828314006328583, "learning_rate": 6.837024517944657e-06, "loss": 0.3907, "step": 3192 }, { "epoch": 1.771491957848031, "grad_norm": 0.2963877022266388, "learning_rate": 6.77187995696763e-06, "loss": 0.3885, "step": 3194 }, { "epoch": 1.7726012201885746, "grad_norm": 0.24497413635253906, "learning_rate": 6.707036360497632e-06, "loss": 0.4195, "step": 3196 }, { "epoch": 1.7737104825291181, "grad_norm": 0.25655171275138855, "learning_rate": 6.642493937866623e-06, "loss": 0.3315, "step": 3198 }, { "epoch": 1.7748197448696617, "grad_norm": 0.3175029456615448, "learning_rate": 6.578252897434223e-06, "loss": 0.464, "step": 3200 } ], "logging_steps": 2, "max_steps": 3606, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 64, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7794204280750080.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }