{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999822547158093, "eval_steps": 500, "global_step": 14088, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007098113676290526, "grad_norm": 7.898312201157063, "learning_rate": 9.995030526764163e-06, "loss": 1.7908, "step": 10 }, { "epoch": 0.0014196227352581052, "grad_norm": 5.391248052357186, "learning_rate": 9.987931279284396e-06, "loss": 0.7438, "step": 20 }, { "epoch": 0.002129434102887158, "grad_norm": 5.37248606279888, "learning_rate": 9.980832031804629e-06, "loss": 0.6486, "step": 30 }, { "epoch": 0.0028392454705162104, "grad_norm": 6.374151691345343, "learning_rate": 9.973732784324862e-06, "loss": 0.6267, "step": 40 }, { "epoch": 0.003549056838145263, "grad_norm": 8.880583324938707, "learning_rate": 9.966633536845095e-06, "loss": 0.5867, "step": 50 }, { "epoch": 0.004258868205774316, "grad_norm": 6.684640112932781, "learning_rate": 9.959534289365328e-06, "loss": 0.5473, "step": 60 }, { "epoch": 0.004968679573403368, "grad_norm": 9.329177151842533, "learning_rate": 9.952435041885561e-06, "loss": 0.549, "step": 70 }, { "epoch": 0.005678490941032421, "grad_norm": 4.2697386372932575, "learning_rate": 9.945335794405794e-06, "loss": 0.527, "step": 80 }, { "epoch": 0.006388302308661473, "grad_norm": 4.981195314782428, "learning_rate": 9.938236546926027e-06, "loss": 0.5015, "step": 90 }, { "epoch": 0.007098113676290526, "grad_norm": 3.5718526890722155, "learning_rate": 9.931137299446259e-06, "loss": 0.5148, "step": 100 }, { "epoch": 0.007807925043919578, "grad_norm": 5.35860863602349, "learning_rate": 9.924038051966492e-06, "loss": 0.5116, "step": 110 }, { "epoch": 0.008517736411548632, "grad_norm": 3.2550821239727434, "learning_rate": 9.916938804486725e-06, "loss": 0.4992, "step": 120 }, { "epoch": 0.009227547779177683, "grad_norm": 3.4354498076448032, "learning_rate": 9.909839557006958e-06, "loss": 0.5088, "step": 130 }, { "epoch": 0.009937359146806737, "grad_norm": 4.331679140736939, "learning_rate": 9.902740309527191e-06, "loss": 0.5139, "step": 140 }, { "epoch": 0.010647170514435788, "grad_norm": 3.4337848966487265, "learning_rate": 9.895641062047424e-06, "loss": 0.5041, "step": 150 }, { "epoch": 0.011356981882064842, "grad_norm": 8.243351710682422, "learning_rate": 9.888541814567657e-06, "loss": 0.5142, "step": 160 }, { "epoch": 0.012066793249693893, "grad_norm": 4.091704188438657, "learning_rate": 9.88144256708789e-06, "loss": 0.4807, "step": 170 }, { "epoch": 0.012776604617322947, "grad_norm": 21.564891334339755, "learning_rate": 9.874343319608124e-06, "loss": 0.5092, "step": 180 }, { "epoch": 0.013486415984951998, "grad_norm": 3.1521060424258973, "learning_rate": 9.867244072128355e-06, "loss": 0.4787, "step": 190 }, { "epoch": 0.014196227352581052, "grad_norm": 3.986481726421801, "learning_rate": 9.860144824648588e-06, "loss": 0.4827, "step": 200 }, { "epoch": 0.014906038720210105, "grad_norm": 4.774263941683351, "learning_rate": 9.853045577168821e-06, "loss": 0.4775, "step": 210 }, { "epoch": 0.015615850087839157, "grad_norm": 7.968327682274227, "learning_rate": 9.845946329689053e-06, "loss": 0.4716, "step": 220 }, { "epoch": 0.01632566145546821, "grad_norm": 10.121205974855524, "learning_rate": 9.838847082209286e-06, "loss": 0.4969, "step": 230 }, { "epoch": 0.017035472823097263, "grad_norm": 7.454679720256471, "learning_rate": 9.831747834729519e-06, "loss": 0.4923, "step": 240 }, { "epoch": 0.017745284190726313, "grad_norm": 17.103084568275037, "learning_rate": 9.824648587249752e-06, "loss": 0.4701, "step": 250 }, { "epoch": 0.018455095558355367, "grad_norm": 4.48293929960256, "learning_rate": 9.817549339769985e-06, "loss": 0.4734, "step": 260 }, { "epoch": 0.01916490692598442, "grad_norm": 5.345114387506581, "learning_rate": 9.810450092290218e-06, "loss": 0.4894, "step": 270 }, { "epoch": 0.019874718293613473, "grad_norm": 19.40561032433512, "learning_rate": 9.803350844810451e-06, "loss": 0.4791, "step": 280 }, { "epoch": 0.020584529661242523, "grad_norm": 14.25299022016476, "learning_rate": 9.796251597330684e-06, "loss": 0.4699, "step": 290 }, { "epoch": 0.021294341028871577, "grad_norm": 8.257072932675099, "learning_rate": 9.789152349850918e-06, "loss": 0.4712, "step": 300 }, { "epoch": 0.02200415239650063, "grad_norm": 7.954026403143938, "learning_rate": 9.782053102371149e-06, "loss": 0.4703, "step": 310 }, { "epoch": 0.022713963764129683, "grad_norm": 11.392767049791958, "learning_rate": 9.774953854891382e-06, "loss": 0.4991, "step": 320 }, { "epoch": 0.023423775131758737, "grad_norm": 3.6589701257251392, "learning_rate": 9.767854607411615e-06, "loss": 0.48, "step": 330 }, { "epoch": 0.024133586499387787, "grad_norm": 2.8317614498971095, "learning_rate": 9.760755359931848e-06, "loss": 0.473, "step": 340 }, { "epoch": 0.02484339786701684, "grad_norm": 3.3672884329345467, "learning_rate": 9.753656112452081e-06, "loss": 0.4807, "step": 350 }, { "epoch": 0.025553209234645893, "grad_norm": 2.918860353664653, "learning_rate": 9.746556864972314e-06, "loss": 0.474, "step": 360 }, { "epoch": 0.026263020602274947, "grad_norm": 3.985430160063577, "learning_rate": 9.739457617492548e-06, "loss": 0.4606, "step": 370 }, { "epoch": 0.026972831969903997, "grad_norm": 3.8499162197950216, "learning_rate": 9.73235837001278e-06, "loss": 0.474, "step": 380 }, { "epoch": 0.02768264333753305, "grad_norm": 2.955339700163119, "learning_rate": 9.725259122533012e-06, "loss": 0.472, "step": 390 }, { "epoch": 0.028392454705162103, "grad_norm": 5.589731350821559, "learning_rate": 9.718159875053245e-06, "loss": 0.4698, "step": 400 }, { "epoch": 0.029102266072791157, "grad_norm": 3.9824871173931973, "learning_rate": 9.711060627573478e-06, "loss": 0.4581, "step": 410 }, { "epoch": 0.02981207744042021, "grad_norm": 2.524559409598369, "learning_rate": 9.70396138009371e-06, "loss": 0.4478, "step": 420 }, { "epoch": 0.03052188880804926, "grad_norm": 2.970731368598553, "learning_rate": 9.696862132613943e-06, "loss": 0.4508, "step": 430 }, { "epoch": 0.031231700175678313, "grad_norm": 2.893829595170148, "learning_rate": 9.689762885134176e-06, "loss": 0.4379, "step": 440 }, { "epoch": 0.03194151154330736, "grad_norm": 22.795684932698137, "learning_rate": 9.682663637654409e-06, "loss": 0.4482, "step": 450 }, { "epoch": 0.03265132291093642, "grad_norm": 3.2812945854632236, "learning_rate": 9.675564390174642e-06, "loss": 0.4599, "step": 460 }, { "epoch": 0.03336113427856547, "grad_norm": 11.615453520589618, "learning_rate": 9.668465142694875e-06, "loss": 0.4417, "step": 470 }, { "epoch": 0.03407094564619453, "grad_norm": 7.726986291359829, "learning_rate": 9.661365895215108e-06, "loss": 0.4594, "step": 480 }, { "epoch": 0.03478075701382358, "grad_norm": 4.365039492938302, "learning_rate": 9.654266647735341e-06, "loss": 0.4669, "step": 490 }, { "epoch": 0.03549056838145263, "grad_norm": 6.54988906481092, "learning_rate": 9.647167400255574e-06, "loss": 0.4567, "step": 500 }, { "epoch": 0.03620037974908168, "grad_norm": 8.933278546995766, "learning_rate": 9.640068152775806e-06, "loss": 0.4519, "step": 510 }, { "epoch": 0.03691019111671073, "grad_norm": 3.7761657369108907, "learning_rate": 9.632968905296039e-06, "loss": 0.4501, "step": 520 }, { "epoch": 0.03762000248433979, "grad_norm": 3.9418116527565377, "learning_rate": 9.625869657816272e-06, "loss": 0.4561, "step": 530 }, { "epoch": 0.03832981385196884, "grad_norm": 3.5489889583606438, "learning_rate": 9.618770410336505e-06, "loss": 0.4598, "step": 540 }, { "epoch": 0.03903962521959789, "grad_norm": 3.5164230189602548, "learning_rate": 9.611671162856738e-06, "loss": 0.4717, "step": 550 }, { "epoch": 0.03974943658722695, "grad_norm": 2.1822863392109206, "learning_rate": 9.604571915376971e-06, "loss": 0.48, "step": 560 }, { "epoch": 0.040459247954856, "grad_norm": 2.5677413826305413, "learning_rate": 9.597472667897204e-06, "loss": 0.4605, "step": 570 }, { "epoch": 0.041169059322485047, "grad_norm": 3.011759104822335, "learning_rate": 9.590373420417438e-06, "loss": 0.4605, "step": 580 }, { "epoch": 0.0418788706901141, "grad_norm": 2.56502573080614, "learning_rate": 9.58327417293767e-06, "loss": 0.4494, "step": 590 }, { "epoch": 0.04258868205774315, "grad_norm": 3.2396125278123806, "learning_rate": 9.576174925457902e-06, "loss": 0.4542, "step": 600 }, { "epoch": 0.04329849342537221, "grad_norm": 3.480681910714182, "learning_rate": 9.569075677978135e-06, "loss": 0.4548, "step": 610 }, { "epoch": 0.04400830479300126, "grad_norm": 2.623695100630613, "learning_rate": 9.561976430498368e-06, "loss": 0.4594, "step": 620 }, { "epoch": 0.04471811616063031, "grad_norm": 3.042303011325611, "learning_rate": 9.5548771830186e-06, "loss": 0.4557, "step": 630 }, { "epoch": 0.04542792752825937, "grad_norm": 2.8781600946277863, "learning_rate": 9.547777935538833e-06, "loss": 0.484, "step": 640 }, { "epoch": 0.04613773889588842, "grad_norm": 3.3284195205047684, "learning_rate": 9.540678688059066e-06, "loss": 0.4481, "step": 650 }, { "epoch": 0.04684755026351747, "grad_norm": 3.5159109068224987, "learning_rate": 9.533579440579299e-06, "loss": 0.4665, "step": 660 }, { "epoch": 0.04755736163114652, "grad_norm": 6.322136362721481, "learning_rate": 9.526480193099532e-06, "loss": 0.4585, "step": 670 }, { "epoch": 0.04826717299877557, "grad_norm": 21.902103769968996, "learning_rate": 9.519380945619765e-06, "loss": 0.4446, "step": 680 }, { "epoch": 0.04897698436640463, "grad_norm": 3.6046359318609356, "learning_rate": 9.512281698139998e-06, "loss": 0.4519, "step": 690 }, { "epoch": 0.04968679573403368, "grad_norm": 3.039690187186011, "learning_rate": 9.505182450660231e-06, "loss": 0.4448, "step": 700 }, { "epoch": 0.05039660710166273, "grad_norm": 2.608964873836775, "learning_rate": 9.498083203180465e-06, "loss": 0.4486, "step": 710 }, { "epoch": 0.05110641846929179, "grad_norm": 3.368889371027321, "learning_rate": 9.490983955700696e-06, "loss": 0.4617, "step": 720 }, { "epoch": 0.05181622983692084, "grad_norm": 4.094036998235093, "learning_rate": 9.483884708220929e-06, "loss": 0.4569, "step": 730 }, { "epoch": 0.05252604120454989, "grad_norm": 2.979892302450325, "learning_rate": 9.476785460741162e-06, "loss": 0.4645, "step": 740 }, { "epoch": 0.05323585257217894, "grad_norm": 3.676607621277054, "learning_rate": 9.469686213261395e-06, "loss": 0.4407, "step": 750 }, { "epoch": 0.05394566393980799, "grad_norm": 359.9140493382262, "learning_rate": 9.462586965781628e-06, "loss": 0.4262, "step": 760 }, { "epoch": 0.05465547530743705, "grad_norm": 4.447118089247447, "learning_rate": 9.455487718301861e-06, "loss": 0.4344, "step": 770 }, { "epoch": 0.0553652866750661, "grad_norm": 4.569754671227615, "learning_rate": 9.448388470822095e-06, "loss": 0.4462, "step": 780 }, { "epoch": 0.05607509804269516, "grad_norm": 2.3728524211263067, "learning_rate": 9.441289223342328e-06, "loss": 0.4386, "step": 790 }, { "epoch": 0.05678490941032421, "grad_norm": 2.5997362569615903, "learning_rate": 9.434189975862559e-06, "loss": 0.4537, "step": 800 }, { "epoch": 0.057494720777953257, "grad_norm": 4.859327134293274, "learning_rate": 9.427090728382792e-06, "loss": 0.4514, "step": 810 }, { "epoch": 0.05820453214558231, "grad_norm": 2.6304161060559905, "learning_rate": 9.419991480903025e-06, "loss": 0.4306, "step": 820 }, { "epoch": 0.05891434351321136, "grad_norm": 3.504607730078166, "learning_rate": 9.412892233423258e-06, "loss": 0.454, "step": 830 }, { "epoch": 0.05962415488084042, "grad_norm": 3.3227222733710864, "learning_rate": 9.40579298594349e-06, "loss": 0.4407, "step": 840 }, { "epoch": 0.06033396624846947, "grad_norm": 3.328718377292454, "learning_rate": 9.398693738463723e-06, "loss": 0.4581, "step": 850 }, { "epoch": 0.06104377761609852, "grad_norm": 3.4977954338913864, "learning_rate": 9.391594490983956e-06, "loss": 0.4284, "step": 860 }, { "epoch": 0.06175358898372758, "grad_norm": 3.228432256709841, "learning_rate": 9.384495243504189e-06, "loss": 0.4373, "step": 870 }, { "epoch": 0.06246340035135663, "grad_norm": 3.1586832054050964, "learning_rate": 9.377395996024422e-06, "loss": 0.4348, "step": 880 }, { "epoch": 0.06317321171898568, "grad_norm": 13.155465477764636, "learning_rate": 9.370296748544655e-06, "loss": 0.4217, "step": 890 }, { "epoch": 0.06388302308661473, "grad_norm": 15.543581430412525, "learning_rate": 9.363197501064888e-06, "loss": 0.4593, "step": 900 }, { "epoch": 0.06459283445424378, "grad_norm": 8.921864061523843, "learning_rate": 9.356098253585121e-06, "loss": 0.4608, "step": 910 }, { "epoch": 0.06530264582187284, "grad_norm": 5.3983003526617335, "learning_rate": 9.348999006105353e-06, "loss": 0.4514, "step": 920 }, { "epoch": 0.06601245718950188, "grad_norm": 7.595139513838182, "learning_rate": 9.341899758625586e-06, "loss": 0.4273, "step": 930 }, { "epoch": 0.06672226855713094, "grad_norm": 3.2331459925046815, "learning_rate": 9.334800511145819e-06, "loss": 0.422, "step": 940 }, { "epoch": 0.06743207992476, "grad_norm": 3.8699272404865686, "learning_rate": 9.327701263666052e-06, "loss": 0.4477, "step": 950 }, { "epoch": 0.06814189129238905, "grad_norm": 2.68446192265652, "learning_rate": 9.320602016186285e-06, "loss": 0.4449, "step": 960 }, { "epoch": 0.0688517026600181, "grad_norm": 2.637260503772899, "learning_rate": 9.313502768706518e-06, "loss": 0.4532, "step": 970 }, { "epoch": 0.06956151402764715, "grad_norm": 3.9618993923437085, "learning_rate": 9.306403521226751e-06, "loss": 0.4534, "step": 980 }, { "epoch": 0.07027132539527621, "grad_norm": 3.429568261104227, "learning_rate": 9.299304273746985e-06, "loss": 0.452, "step": 990 }, { "epoch": 0.07098113676290525, "grad_norm": 3.663179434126313, "learning_rate": 9.292205026267218e-06, "loss": 0.439, "step": 1000 }, { "epoch": 0.07169094813053431, "grad_norm": 4.408975026773321, "learning_rate": 9.285105778787449e-06, "loss": 0.4184, "step": 1010 }, { "epoch": 0.07240075949816337, "grad_norm": 2.415108601943808, "learning_rate": 9.278006531307682e-06, "loss": 0.4342, "step": 1020 }, { "epoch": 0.07311057086579241, "grad_norm": 6.698239896408658, "learning_rate": 9.270907283827915e-06, "loss": 0.4535, "step": 1030 }, { "epoch": 0.07382038223342147, "grad_norm": 11.189940656850219, "learning_rate": 9.263808036348147e-06, "loss": 0.4192, "step": 1040 }, { "epoch": 0.07453019360105052, "grad_norm": 3.85625217339617, "learning_rate": 9.25670878886838e-06, "loss": 0.4278, "step": 1050 }, { "epoch": 0.07524000496867958, "grad_norm": 32.21212360326382, "learning_rate": 9.249609541388613e-06, "loss": 0.4509, "step": 1060 }, { "epoch": 0.07594981633630862, "grad_norm": 5.919396215012425, "learning_rate": 9.242510293908846e-06, "loss": 0.4525, "step": 1070 }, { "epoch": 0.07665962770393768, "grad_norm": 5.904196801283348, "learning_rate": 9.235411046429079e-06, "loss": 0.4422, "step": 1080 }, { "epoch": 0.07736943907156674, "grad_norm": 4.486326467883555, "learning_rate": 9.228311798949312e-06, "loss": 0.4685, "step": 1090 }, { "epoch": 0.07807925043919578, "grad_norm": 11.745437972621287, "learning_rate": 9.221212551469545e-06, "loss": 0.4646, "step": 1100 }, { "epoch": 0.07878906180682484, "grad_norm": 6.5181010077573145, "learning_rate": 9.214113303989778e-06, "loss": 0.443, "step": 1110 }, { "epoch": 0.0794988731744539, "grad_norm": 11.270983163134655, "learning_rate": 9.207014056510012e-06, "loss": 0.4605, "step": 1120 }, { "epoch": 0.08020868454208294, "grad_norm": 3.7069012881976975, "learning_rate": 9.199914809030243e-06, "loss": 0.4459, "step": 1130 }, { "epoch": 0.080918495909712, "grad_norm": 8.667969696855055, "learning_rate": 9.192815561550476e-06, "loss": 0.4556, "step": 1140 }, { "epoch": 0.08162830727734105, "grad_norm": 7.559635091166787, "learning_rate": 9.185716314070709e-06, "loss": 0.4357, "step": 1150 }, { "epoch": 0.08233811864497009, "grad_norm": 17.430750080762536, "learning_rate": 9.178617066590942e-06, "loss": 0.4301, "step": 1160 }, { "epoch": 0.08304793001259915, "grad_norm": 4.351276343100192, "learning_rate": 9.171517819111175e-06, "loss": 0.4184, "step": 1170 }, { "epoch": 0.0837577413802282, "grad_norm": 6.471581804191342, "learning_rate": 9.164418571631408e-06, "loss": 0.4516, "step": 1180 }, { "epoch": 0.08446755274785726, "grad_norm": 4.3294841586504855, "learning_rate": 9.157319324151642e-06, "loss": 0.4211, "step": 1190 }, { "epoch": 0.0851773641154863, "grad_norm": 4.385208474639979, "learning_rate": 9.150220076671875e-06, "loss": 0.4203, "step": 1200 }, { "epoch": 0.08588717548311536, "grad_norm": 5.8972560031050065, "learning_rate": 9.143120829192106e-06, "loss": 0.4284, "step": 1210 }, { "epoch": 0.08659698685074442, "grad_norm": 4.604861487503107, "learning_rate": 9.136021581712339e-06, "loss": 0.4277, "step": 1220 }, { "epoch": 0.08730679821837346, "grad_norm": 4.321101106082931, "learning_rate": 9.128922334232572e-06, "loss": 0.4216, "step": 1230 }, { "epoch": 0.08801660958600252, "grad_norm": 11.04681514927992, "learning_rate": 9.121823086752805e-06, "loss": 0.4181, "step": 1240 }, { "epoch": 0.08872642095363158, "grad_norm": 4.31849841935359, "learning_rate": 9.114723839273037e-06, "loss": 0.4264, "step": 1250 }, { "epoch": 0.08943623232126062, "grad_norm": 4.674845237449041, "learning_rate": 9.10762459179327e-06, "loss": 0.4281, "step": 1260 }, { "epoch": 0.09014604368888968, "grad_norm": 3.447760098274006, "learning_rate": 9.100525344313503e-06, "loss": 0.4304, "step": 1270 }, { "epoch": 0.09085585505651873, "grad_norm": 7.189274212443334, "learning_rate": 9.093426096833736e-06, "loss": 0.4252, "step": 1280 }, { "epoch": 0.09156566642414778, "grad_norm": 19.69024332171456, "learning_rate": 9.08632684935397e-06, "loss": 0.4336, "step": 1290 }, { "epoch": 0.09227547779177683, "grad_norm": 55.22992334000048, "learning_rate": 9.079227601874202e-06, "loss": 0.4256, "step": 1300 }, { "epoch": 0.09298528915940589, "grad_norm": 5.066816349007046, "learning_rate": 9.072128354394435e-06, "loss": 0.407, "step": 1310 }, { "epoch": 0.09369510052703495, "grad_norm": 6.37711035743208, "learning_rate": 9.065029106914668e-06, "loss": 0.4257, "step": 1320 }, { "epoch": 0.09440491189466399, "grad_norm": 4.696335985596692, "learning_rate": 9.057929859434902e-06, "loss": 0.4188, "step": 1330 }, { "epoch": 0.09511472326229305, "grad_norm": 15.82313293688476, "learning_rate": 9.050830611955133e-06, "loss": 0.433, "step": 1340 }, { "epoch": 0.0958245346299221, "grad_norm": 5.692904308794704, "learning_rate": 9.043731364475366e-06, "loss": 0.4269, "step": 1350 }, { "epoch": 0.09653434599755115, "grad_norm": 15.303387309564082, "learning_rate": 9.0366321169956e-06, "loss": 0.4174, "step": 1360 }, { "epoch": 0.0972441573651802, "grad_norm": 3.9801928029461666, "learning_rate": 9.029532869515832e-06, "loss": 0.4132, "step": 1370 }, { "epoch": 0.09795396873280926, "grad_norm": 3.523690216407914, "learning_rate": 9.022433622036065e-06, "loss": 0.4281, "step": 1380 }, { "epoch": 0.0986637801004383, "grad_norm": 7.099888052775042, "learning_rate": 9.015334374556298e-06, "loss": 0.4431, "step": 1390 }, { "epoch": 0.09937359146806736, "grad_norm": 6.538985360116972, "learning_rate": 9.008235127076532e-06, "loss": 0.4172, "step": 1400 }, { "epoch": 0.10008340283569642, "grad_norm": 7.959800060910741, "learning_rate": 9.001135879596763e-06, "loss": 0.4243, "step": 1410 }, { "epoch": 0.10079321420332546, "grad_norm": 8.790445771142394, "learning_rate": 8.994036632116996e-06, "loss": 0.4254, "step": 1420 }, { "epoch": 0.10150302557095452, "grad_norm": 4.285966498899181, "learning_rate": 8.98693738463723e-06, "loss": 0.4122, "step": 1430 }, { "epoch": 0.10221283693858357, "grad_norm": 6.286806035291326, "learning_rate": 8.979838137157462e-06, "loss": 0.433, "step": 1440 }, { "epoch": 0.10292264830621263, "grad_norm": 7.3066834855049345, "learning_rate": 8.972738889677695e-06, "loss": 0.4258, "step": 1450 }, { "epoch": 0.10363245967384167, "grad_norm": 6.5695520214785565, "learning_rate": 8.965639642197927e-06, "loss": 0.4164, "step": 1460 }, { "epoch": 0.10434227104147073, "grad_norm": 20.93641513291179, "learning_rate": 8.95854039471816e-06, "loss": 0.4095, "step": 1470 }, { "epoch": 0.10505208240909979, "grad_norm": 5.657042957398901, "learning_rate": 8.951441147238393e-06, "loss": 0.4168, "step": 1480 }, { "epoch": 0.10576189377672883, "grad_norm": 6.076726326140851, "learning_rate": 8.944341899758626e-06, "loss": 0.4112, "step": 1490 }, { "epoch": 0.10647170514435789, "grad_norm": 5.092565408624009, "learning_rate": 8.93724265227886e-06, "loss": 0.4269, "step": 1500 }, { "epoch": 0.10718151651198694, "grad_norm": 2.894012289515038, "learning_rate": 8.930143404799092e-06, "loss": 0.4239, "step": 1510 }, { "epoch": 0.10789132787961599, "grad_norm": 3.7173915295575637, "learning_rate": 8.923044157319325e-06, "loss": 0.4288, "step": 1520 }, { "epoch": 0.10860113924724504, "grad_norm": 3.025402596869208, "learning_rate": 8.915944909839559e-06, "loss": 0.4421, "step": 1530 }, { "epoch": 0.1093109506148741, "grad_norm": 8.212502187483185, "learning_rate": 8.90884566235979e-06, "loss": 0.4241, "step": 1540 }, { "epoch": 0.11002076198250314, "grad_norm": 5.773771344339805, "learning_rate": 8.901746414880023e-06, "loss": 0.4355, "step": 1550 }, { "epoch": 0.1107305733501322, "grad_norm": 4.158426885786249, "learning_rate": 8.894647167400256e-06, "loss": 0.436, "step": 1560 }, { "epoch": 0.11144038471776126, "grad_norm": 6.56740526603354, "learning_rate": 8.88754791992049e-06, "loss": 0.4397, "step": 1570 }, { "epoch": 0.11215019608539031, "grad_norm": 8.263663970839248, "learning_rate": 8.880448672440722e-06, "loss": 0.4201, "step": 1580 }, { "epoch": 0.11286000745301936, "grad_norm": 2.424368072981463, "learning_rate": 8.873349424960955e-06, "loss": 0.4235, "step": 1590 }, { "epoch": 0.11356981882064841, "grad_norm": 6.489454078474153, "learning_rate": 8.866250177481189e-06, "loss": 0.4243, "step": 1600 }, { "epoch": 0.11427963018827747, "grad_norm": 3.541006640864803, "learning_rate": 8.859150930001422e-06, "loss": 0.4313, "step": 1610 }, { "epoch": 0.11498944155590651, "grad_norm": 12.323605643567065, "learning_rate": 8.852051682521653e-06, "loss": 0.4253, "step": 1620 }, { "epoch": 0.11569925292353557, "grad_norm": 4.600225981753095, "learning_rate": 8.844952435041886e-06, "loss": 0.42, "step": 1630 }, { "epoch": 0.11640906429116463, "grad_norm": 8.589796661850784, "learning_rate": 8.83785318756212e-06, "loss": 0.4219, "step": 1640 }, { "epoch": 0.11711887565879367, "grad_norm": 10.182911442610934, "learning_rate": 8.830753940082352e-06, "loss": 0.4285, "step": 1650 }, { "epoch": 0.11782868702642273, "grad_norm": 5.186284643440543, "learning_rate": 8.823654692602584e-06, "loss": 0.4139, "step": 1660 }, { "epoch": 0.11853849839405178, "grad_norm": 5.23154203196852, "learning_rate": 8.816555445122817e-06, "loss": 0.4251, "step": 1670 }, { "epoch": 0.11924830976168084, "grad_norm": 6.9839536559537505, "learning_rate": 8.80945619764305e-06, "loss": 0.4233, "step": 1680 }, { "epoch": 0.11995812112930988, "grad_norm": 6.376179671333375, "learning_rate": 8.802356950163283e-06, "loss": 0.4089, "step": 1690 }, { "epoch": 0.12066793249693894, "grad_norm": 3.824113092644885, "learning_rate": 8.795257702683516e-06, "loss": 0.4347, "step": 1700 }, { "epoch": 0.121377743864568, "grad_norm": 11.282936555631686, "learning_rate": 8.78815845520375e-06, "loss": 0.423, "step": 1710 }, { "epoch": 0.12208755523219704, "grad_norm": 4.218268240264897, "learning_rate": 8.781059207723982e-06, "loss": 0.4188, "step": 1720 }, { "epoch": 0.1227973665998261, "grad_norm": 3.943582749857493, "learning_rate": 8.773959960244215e-06, "loss": 0.4276, "step": 1730 }, { "epoch": 0.12350717796745515, "grad_norm": 9.679933576473074, "learning_rate": 8.766860712764449e-06, "loss": 0.42, "step": 1740 }, { "epoch": 0.1242169893350842, "grad_norm": 15.414309701859608, "learning_rate": 8.75976146528468e-06, "loss": 0.4316, "step": 1750 }, { "epoch": 0.12492680070271325, "grad_norm": 9.429737278511919, "learning_rate": 8.752662217804913e-06, "loss": 0.422, "step": 1760 }, { "epoch": 0.1256366120703423, "grad_norm": 23.10494354556988, "learning_rate": 8.745562970325146e-06, "loss": 0.4276, "step": 1770 }, { "epoch": 0.12634642343797137, "grad_norm": 13.541923724604345, "learning_rate": 8.73846372284538e-06, "loss": 0.4271, "step": 1780 }, { "epoch": 0.1270562348056004, "grad_norm": 2.846694152973873, "learning_rate": 8.731364475365612e-06, "loss": 0.4151, "step": 1790 }, { "epoch": 0.12776604617322945, "grad_norm": 6.934597145753292, "learning_rate": 8.724265227885845e-06, "loss": 0.4247, "step": 1800 }, { "epoch": 0.12847585754085852, "grad_norm": 3.435112347451886, "learning_rate": 8.717165980406079e-06, "loss": 0.4225, "step": 1810 }, { "epoch": 0.12918566890848757, "grad_norm": 3.4829699382867823, "learning_rate": 8.71006673292631e-06, "loss": 0.4458, "step": 1820 }, { "epoch": 0.1298954802761166, "grad_norm": 5.077072978235785, "learning_rate": 8.702967485446543e-06, "loss": 0.4283, "step": 1830 }, { "epoch": 0.13060529164374568, "grad_norm": 5.917300462616358, "learning_rate": 8.695868237966776e-06, "loss": 0.4119, "step": 1840 }, { "epoch": 0.13131510301137472, "grad_norm": 10.693397543481625, "learning_rate": 8.68876899048701e-06, "loss": 0.4305, "step": 1850 }, { "epoch": 0.13202491437900377, "grad_norm": 3.6456780546239456, "learning_rate": 8.681669743007242e-06, "loss": 0.4391, "step": 1860 }, { "epoch": 0.13273472574663284, "grad_norm": 14.68038430401678, "learning_rate": 8.674570495527474e-06, "loss": 0.4111, "step": 1870 }, { "epoch": 0.13344453711426188, "grad_norm": 5.101838800313352, "learning_rate": 8.667471248047707e-06, "loss": 0.4323, "step": 1880 }, { "epoch": 0.13415434848189095, "grad_norm": 4.497686869632987, "learning_rate": 8.66037200056794e-06, "loss": 0.4154, "step": 1890 }, { "epoch": 0.13486415984952, "grad_norm": 9.511227824879294, "learning_rate": 8.653272753088173e-06, "loss": 0.4295, "step": 1900 }, { "epoch": 0.13557397121714904, "grad_norm": 5.344003791146658, "learning_rate": 8.646173505608406e-06, "loss": 0.4254, "step": 1910 }, { "epoch": 0.1362837825847781, "grad_norm": 8.10132953922794, "learning_rate": 8.63907425812864e-06, "loss": 0.4219, "step": 1920 }, { "epoch": 0.13699359395240715, "grad_norm": 8.840386508572838, "learning_rate": 8.631975010648872e-06, "loss": 0.416, "step": 1930 }, { "epoch": 0.1377034053200362, "grad_norm": 5.639143297883941, "learning_rate": 8.624875763169106e-06, "loss": 0.4246, "step": 1940 }, { "epoch": 0.13841321668766526, "grad_norm": 5.375177742256173, "learning_rate": 8.617776515689339e-06, "loss": 0.4263, "step": 1950 }, { "epoch": 0.1391230280552943, "grad_norm": 13.872628674699765, "learning_rate": 8.61067726820957e-06, "loss": 0.4368, "step": 1960 }, { "epoch": 0.13983283942292335, "grad_norm": 6.612051924514802, "learning_rate": 8.603578020729803e-06, "loss": 0.4235, "step": 1970 }, { "epoch": 0.14054265079055242, "grad_norm": 7.420592038738273, "learning_rate": 8.596478773250036e-06, "loss": 0.4315, "step": 1980 }, { "epoch": 0.14125246215818146, "grad_norm": 3.883491154973528, "learning_rate": 8.58937952577027e-06, "loss": 0.4394, "step": 1990 }, { "epoch": 0.1419622735258105, "grad_norm": 4.031594828995353, "learning_rate": 8.582280278290502e-06, "loss": 0.4274, "step": 2000 }, { "epoch": 0.14267208489343958, "grad_norm": 6.272786134188022, "learning_rate": 8.575181030810736e-06, "loss": 0.42, "step": 2010 }, { "epoch": 0.14338189626106862, "grad_norm": 8.45570312290703, "learning_rate": 8.568081783330967e-06, "loss": 0.4336, "step": 2020 }, { "epoch": 0.14409170762869766, "grad_norm": 3.8497660341027693, "learning_rate": 8.5609825358512e-06, "loss": 0.4259, "step": 2030 }, { "epoch": 0.14480151899632673, "grad_norm": 10.12069309920438, "learning_rate": 8.553883288371433e-06, "loss": 0.4208, "step": 2040 }, { "epoch": 0.14551133036395578, "grad_norm": 5.128975578462212, "learning_rate": 8.546784040891666e-06, "loss": 0.4215, "step": 2050 }, { "epoch": 0.14622114173158482, "grad_norm": 4.45602583843403, "learning_rate": 8.5396847934119e-06, "loss": 0.4135, "step": 2060 }, { "epoch": 0.1469309530992139, "grad_norm": 5.172069700283945, "learning_rate": 8.53258554593213e-06, "loss": 0.4122, "step": 2070 }, { "epoch": 0.14764076446684293, "grad_norm": 7.147216717746435, "learning_rate": 8.525486298452364e-06, "loss": 0.4423, "step": 2080 }, { "epoch": 0.14835057583447198, "grad_norm": 14.946527022046613, "learning_rate": 8.518387050972597e-06, "loss": 0.4094, "step": 2090 }, { "epoch": 0.14906038720210105, "grad_norm": 8.460267496546166, "learning_rate": 8.51128780349283e-06, "loss": 0.4186, "step": 2100 }, { "epoch": 0.1497701985697301, "grad_norm": 8.93023218882671, "learning_rate": 8.504188556013063e-06, "loss": 0.4062, "step": 2110 }, { "epoch": 0.15048000993735916, "grad_norm": 3.213343020811049, "learning_rate": 8.497089308533296e-06, "loss": 0.3994, "step": 2120 }, { "epoch": 0.1511898213049882, "grad_norm": 8.718801113577726, "learning_rate": 8.48999006105353e-06, "loss": 0.4232, "step": 2130 }, { "epoch": 0.15189963267261725, "grad_norm": 2.832643819770658, "learning_rate": 8.482890813573762e-06, "loss": 0.4261, "step": 2140 }, { "epoch": 0.15260944404024632, "grad_norm": 3.2673324405839255, "learning_rate": 8.475791566093996e-06, "loss": 0.42, "step": 2150 }, { "epoch": 0.15331925540787536, "grad_norm": 3.2621489770969214, "learning_rate": 8.468692318614227e-06, "loss": 0.4282, "step": 2160 }, { "epoch": 0.1540290667755044, "grad_norm": 17.34420036770468, "learning_rate": 8.46159307113446e-06, "loss": 0.4198, "step": 2170 }, { "epoch": 0.15473887814313347, "grad_norm": 3.6148665582762094, "learning_rate": 8.454493823654693e-06, "loss": 0.4157, "step": 2180 }, { "epoch": 0.15544868951076252, "grad_norm": 2.775836768166624, "learning_rate": 8.447394576174926e-06, "loss": 0.417, "step": 2190 }, { "epoch": 0.15615850087839156, "grad_norm": 5.052761832862739, "learning_rate": 8.44029532869516e-06, "loss": 0.4035, "step": 2200 }, { "epoch": 0.15686831224602063, "grad_norm": 4.778779661514333, "learning_rate": 8.433196081215393e-06, "loss": 0.4445, "step": 2210 }, { "epoch": 0.15757812361364967, "grad_norm": 4.6274782338902325, "learning_rate": 8.426096833735626e-06, "loss": 0.4147, "step": 2220 }, { "epoch": 0.15828793498127872, "grad_norm": 4.310225523508245, "learning_rate": 8.418997586255857e-06, "loss": 0.4167, "step": 2230 }, { "epoch": 0.1589977463489078, "grad_norm": 4.802519845626961, "learning_rate": 8.41189833877609e-06, "loss": 0.4052, "step": 2240 }, { "epoch": 0.15970755771653683, "grad_norm": 3.949892413625005, "learning_rate": 8.404799091296323e-06, "loss": 0.4263, "step": 2250 }, { "epoch": 0.16041736908416587, "grad_norm": 5.685661053410237, "learning_rate": 8.397699843816556e-06, "loss": 0.4148, "step": 2260 }, { "epoch": 0.16112718045179494, "grad_norm": 4.337480471983148, "learning_rate": 8.39060059633679e-06, "loss": 0.4101, "step": 2270 }, { "epoch": 0.161836991819424, "grad_norm": 4.809277499740254, "learning_rate": 8.38350134885702e-06, "loss": 0.4071, "step": 2280 }, { "epoch": 0.16254680318705303, "grad_norm": 7.364507480899371, "learning_rate": 8.376402101377254e-06, "loss": 0.4021, "step": 2290 }, { "epoch": 0.1632566145546821, "grad_norm": 5.408145626972555, "learning_rate": 8.369302853897487e-06, "loss": 0.4154, "step": 2300 }, { "epoch": 0.16396642592231114, "grad_norm": 2.9449217220121784, "learning_rate": 8.36220360641772e-06, "loss": 0.4296, "step": 2310 }, { "epoch": 0.16467623728994019, "grad_norm": 3.843647555602573, "learning_rate": 8.355104358937953e-06, "loss": 0.4197, "step": 2320 }, { "epoch": 0.16538604865756926, "grad_norm": 5.843629733774891, "learning_rate": 8.348005111458186e-06, "loss": 0.4052, "step": 2330 }, { "epoch": 0.1660958600251983, "grad_norm": 4.182196885965926, "learning_rate": 8.34090586397842e-06, "loss": 0.4304, "step": 2340 }, { "epoch": 0.16680567139282734, "grad_norm": 12.343897765958163, "learning_rate": 8.333806616498653e-06, "loss": 0.4057, "step": 2350 }, { "epoch": 0.1675154827604564, "grad_norm": 4.52770872028285, "learning_rate": 8.326707369018886e-06, "loss": 0.4234, "step": 2360 }, { "epoch": 0.16822529412808546, "grad_norm": 5.473115632671873, "learning_rate": 8.319608121539117e-06, "loss": 0.4127, "step": 2370 }, { "epoch": 0.16893510549571453, "grad_norm": 5.243162829393595, "learning_rate": 8.31250887405935e-06, "loss": 0.4148, "step": 2380 }, { "epoch": 0.16964491686334357, "grad_norm": 9.638919529909746, "learning_rate": 8.305409626579583e-06, "loss": 0.4244, "step": 2390 }, { "epoch": 0.1703547282309726, "grad_norm": 5.824204497516263, "learning_rate": 8.298310379099816e-06, "loss": 0.3991, "step": 2400 }, { "epoch": 0.17106453959860168, "grad_norm": 8.92013550945478, "learning_rate": 8.29121113162005e-06, "loss": 0.4107, "step": 2410 }, { "epoch": 0.17177435096623073, "grad_norm": 4.310339052965044, "learning_rate": 8.284111884140283e-06, "loss": 0.4198, "step": 2420 }, { "epoch": 0.17248416233385977, "grad_norm": 3.674140188675587, "learning_rate": 8.277012636660514e-06, "loss": 0.4066, "step": 2430 }, { "epoch": 0.17319397370148884, "grad_norm": 3.2816580938205986, "learning_rate": 8.269913389180747e-06, "loss": 0.3948, "step": 2440 }, { "epoch": 0.17390378506911788, "grad_norm": 3.119520711268051, "learning_rate": 8.26281414170098e-06, "loss": 0.4236, "step": 2450 }, { "epoch": 0.17461359643674693, "grad_norm": 3.9529990200341216, "learning_rate": 8.255714894221213e-06, "loss": 0.4028, "step": 2460 }, { "epoch": 0.175323407804376, "grad_norm": 6.5624619571577, "learning_rate": 8.248615646741446e-06, "loss": 0.4207, "step": 2470 }, { "epoch": 0.17603321917200504, "grad_norm": 6.563862400109423, "learning_rate": 8.24151639926168e-06, "loss": 0.4234, "step": 2480 }, { "epoch": 0.17674303053963408, "grad_norm": 4.124646423199101, "learning_rate": 8.234417151781911e-06, "loss": 0.421, "step": 2490 }, { "epoch": 0.17745284190726315, "grad_norm": 8.460797246337737, "learning_rate": 8.227317904302144e-06, "loss": 0.4169, "step": 2500 }, { "epoch": 0.1781626532748922, "grad_norm": 4.636207121737827, "learning_rate": 8.220218656822377e-06, "loss": 0.4154, "step": 2510 }, { "epoch": 0.17887246464252124, "grad_norm": 15.193279765427832, "learning_rate": 8.21311940934261e-06, "loss": 0.4, "step": 2520 }, { "epoch": 0.1795822760101503, "grad_norm": 8.394690912531237, "learning_rate": 8.206020161862843e-06, "loss": 0.3994, "step": 2530 }, { "epoch": 0.18029208737777935, "grad_norm": 11.829872869588135, "learning_rate": 8.198920914383076e-06, "loss": 0.4045, "step": 2540 }, { "epoch": 0.1810018987454084, "grad_norm": 10.598164946336963, "learning_rate": 8.19182166690331e-06, "loss": 0.4167, "step": 2550 }, { "epoch": 0.18171171011303747, "grad_norm": 8.644167493937724, "learning_rate": 8.184722419423543e-06, "loss": 0.4193, "step": 2560 }, { "epoch": 0.1824215214806665, "grad_norm": 5.532113862418252, "learning_rate": 8.177623171943776e-06, "loss": 0.4134, "step": 2570 }, { "epoch": 0.18313133284829555, "grad_norm": 8.962347784457894, "learning_rate": 8.170523924464007e-06, "loss": 0.4231, "step": 2580 }, { "epoch": 0.18384114421592462, "grad_norm": 4.789480578365759, "learning_rate": 8.16342467698424e-06, "loss": 0.4056, "step": 2590 }, { "epoch": 0.18455095558355367, "grad_norm": 7.463666547462272, "learning_rate": 8.156325429504473e-06, "loss": 0.4082, "step": 2600 }, { "epoch": 0.1852607669511827, "grad_norm": 3.543632295285487, "learning_rate": 8.149226182024706e-06, "loss": 0.3957, "step": 2610 }, { "epoch": 0.18597057831881178, "grad_norm": 10.128862482609126, "learning_rate": 8.14212693454494e-06, "loss": 0.4104, "step": 2620 }, { "epoch": 0.18668038968644082, "grad_norm": 2.279815139257822, "learning_rate": 8.135027687065171e-06, "loss": 0.4023, "step": 2630 }, { "epoch": 0.1873902010540699, "grad_norm": 5.651432220535337, "learning_rate": 8.127928439585404e-06, "loss": 0.4174, "step": 2640 }, { "epoch": 0.18810001242169894, "grad_norm": 2.764126752423827, "learning_rate": 8.120829192105637e-06, "loss": 0.4316, "step": 2650 }, { "epoch": 0.18880982378932798, "grad_norm": 2.2008942019632443, "learning_rate": 8.11372994462587e-06, "loss": 0.3998, "step": 2660 }, { "epoch": 0.18951963515695705, "grad_norm": 2.6464894767494194, "learning_rate": 8.106630697146103e-06, "loss": 0.4152, "step": 2670 }, { "epoch": 0.1902294465245861, "grad_norm": 2.9891233500309697, "learning_rate": 8.099531449666336e-06, "loss": 0.4065, "step": 2680 }, { "epoch": 0.19093925789221514, "grad_norm": 3.2947192783933303, "learning_rate": 8.092432202186568e-06, "loss": 0.4096, "step": 2690 }, { "epoch": 0.1916490692598442, "grad_norm": 2.6266501022263093, "learning_rate": 8.085332954706801e-06, "loss": 0.4079, "step": 2700 }, { "epoch": 0.19235888062747325, "grad_norm": 2.0600161188196258, "learning_rate": 8.078233707227034e-06, "loss": 0.4245, "step": 2710 }, { "epoch": 0.1930686919951023, "grad_norm": 3.4259686474049587, "learning_rate": 8.071134459747267e-06, "loss": 0.4168, "step": 2720 }, { "epoch": 0.19377850336273136, "grad_norm": 4.184352662206747, "learning_rate": 8.0640352122675e-06, "loss": 0.4265, "step": 2730 }, { "epoch": 0.1944883147303604, "grad_norm": 3.7320888080359174, "learning_rate": 8.056935964787733e-06, "loss": 0.4172, "step": 2740 }, { "epoch": 0.19519812609798945, "grad_norm": 3.750448672171502, "learning_rate": 8.049836717307966e-06, "loss": 0.4327, "step": 2750 }, { "epoch": 0.19590793746561852, "grad_norm": 3.0158382271152564, "learning_rate": 8.0427374698282e-06, "loss": 0.4284, "step": 2760 }, { "epoch": 0.19661774883324756, "grad_norm": 2.438159262347708, "learning_rate": 8.035638222348433e-06, "loss": 0.4117, "step": 2770 }, { "epoch": 0.1973275602008766, "grad_norm": 4.795802800628808, "learning_rate": 8.028538974868664e-06, "loss": 0.4207, "step": 2780 }, { "epoch": 0.19803737156850568, "grad_norm": 2.5291141301554405, "learning_rate": 8.021439727388897e-06, "loss": 0.4146, "step": 2790 }, { "epoch": 0.19874718293613472, "grad_norm": 2.4740979454164727, "learning_rate": 8.01434047990913e-06, "loss": 0.3999, "step": 2800 }, { "epoch": 0.19945699430376376, "grad_norm": 3.4467777684569927, "learning_rate": 8.007241232429363e-06, "loss": 0.4151, "step": 2810 }, { "epoch": 0.20016680567139283, "grad_norm": 2.741445348023422, "learning_rate": 8.000141984949596e-06, "loss": 0.4165, "step": 2820 }, { "epoch": 0.20087661703902188, "grad_norm": 2.977547725757033, "learning_rate": 7.99304273746983e-06, "loss": 0.4137, "step": 2830 }, { "epoch": 0.20158642840665092, "grad_norm": 3.493123708582949, "learning_rate": 7.985943489990061e-06, "loss": 0.4095, "step": 2840 }, { "epoch": 0.20229623977428, "grad_norm": 9.43644672917822, "learning_rate": 7.978844242510294e-06, "loss": 0.4066, "step": 2850 }, { "epoch": 0.20300605114190903, "grad_norm": 4.050870492633986, "learning_rate": 7.971744995030527e-06, "loss": 0.4079, "step": 2860 }, { "epoch": 0.2037158625095381, "grad_norm": 7.830134940271083, "learning_rate": 7.96464574755076e-06, "loss": 0.3896, "step": 2870 }, { "epoch": 0.20442567387716715, "grad_norm": 7.557535176254197, "learning_rate": 7.957546500070993e-06, "loss": 0.4096, "step": 2880 }, { "epoch": 0.2051354852447962, "grad_norm": 4.715465621080843, "learning_rate": 7.950447252591226e-06, "loss": 0.3907, "step": 2890 }, { "epoch": 0.20584529661242526, "grad_norm": 30.299863630729803, "learning_rate": 7.943348005111458e-06, "loss": 0.4142, "step": 2900 }, { "epoch": 0.2065551079800543, "grad_norm": 13.362349279952854, "learning_rate": 7.936248757631691e-06, "loss": 0.4211, "step": 2910 }, { "epoch": 0.20726491934768335, "grad_norm": 7.166470527615742, "learning_rate": 7.929149510151924e-06, "loss": 0.4038, "step": 2920 }, { "epoch": 0.20797473071531242, "grad_norm": 218.37559359733393, "learning_rate": 7.922050262672157e-06, "loss": 0.3814, "step": 2930 }, { "epoch": 0.20868454208294146, "grad_norm": 4.776318350142146, "learning_rate": 7.91495101519239e-06, "loss": 0.4033, "step": 2940 }, { "epoch": 0.2093943534505705, "grad_norm": 6.050705359465637, "learning_rate": 7.907851767712623e-06, "loss": 0.4006, "step": 2950 }, { "epoch": 0.21010416481819957, "grad_norm": 7.0609749250244125, "learning_rate": 7.900752520232857e-06, "loss": 0.3996, "step": 2960 }, { "epoch": 0.21081397618582862, "grad_norm": 5.2294105499183985, "learning_rate": 7.89365327275309e-06, "loss": 0.3906, "step": 2970 }, { "epoch": 0.21152378755345766, "grad_norm": 5.037453517661707, "learning_rate": 7.886554025273323e-06, "loss": 0.3925, "step": 2980 }, { "epoch": 0.21223359892108673, "grad_norm": 4.329367488091813, "learning_rate": 7.879454777793554e-06, "loss": 0.4005, "step": 2990 }, { "epoch": 0.21294341028871577, "grad_norm": 4.587934783884384, "learning_rate": 7.872355530313787e-06, "loss": 0.3949, "step": 3000 }, { "epoch": 0.21365322165634482, "grad_norm": 4.34538375508175, "learning_rate": 7.86525628283402e-06, "loss": 0.3963, "step": 3010 }, { "epoch": 0.2143630330239739, "grad_norm": 14.538466945533717, "learning_rate": 7.858157035354253e-06, "loss": 0.4145, "step": 3020 }, { "epoch": 0.21507284439160293, "grad_norm": 5.725604081866674, "learning_rate": 7.851057787874487e-06, "loss": 0.397, "step": 3030 }, { "epoch": 0.21578265575923197, "grad_norm": 4.100595238075657, "learning_rate": 7.843958540394718e-06, "loss": 0.407, "step": 3040 }, { "epoch": 0.21649246712686104, "grad_norm": 3.6102459737641452, "learning_rate": 7.836859292914951e-06, "loss": 0.3941, "step": 3050 }, { "epoch": 0.2172022784944901, "grad_norm": 9.48884086833176, "learning_rate": 7.829760045435184e-06, "loss": 0.3981, "step": 3060 }, { "epoch": 0.21791208986211913, "grad_norm": 5.265598040684193, "learning_rate": 7.822660797955417e-06, "loss": 0.3865, "step": 3070 }, { "epoch": 0.2186219012297482, "grad_norm": 5.853395704700518, "learning_rate": 7.81556155047565e-06, "loss": 0.4089, "step": 3080 }, { "epoch": 0.21933171259737724, "grad_norm": 2.867041909768411, "learning_rate": 7.808462302995883e-06, "loss": 0.411, "step": 3090 }, { "epoch": 0.22004152396500629, "grad_norm": 6.447556295363806, "learning_rate": 7.801363055516117e-06, "loss": 0.4054, "step": 3100 }, { "epoch": 0.22075133533263536, "grad_norm": 6.665403407542621, "learning_rate": 7.794263808036348e-06, "loss": 0.4331, "step": 3110 }, { "epoch": 0.2214611467002644, "grad_norm": 3.740543632288075, "learning_rate": 7.787164560556581e-06, "loss": 0.4132, "step": 3120 }, { "epoch": 0.22217095806789347, "grad_norm": 19.12212944661018, "learning_rate": 7.780065313076814e-06, "loss": 0.4229, "step": 3130 }, { "epoch": 0.2228807694355225, "grad_norm": 5.646216224084272, "learning_rate": 7.772966065597047e-06, "loss": 0.4123, "step": 3140 }, { "epoch": 0.22359058080315156, "grad_norm": 12.549975615460761, "learning_rate": 7.76586681811728e-06, "loss": 0.4156, "step": 3150 }, { "epoch": 0.22430039217078063, "grad_norm": 5.34509934381609, "learning_rate": 7.758767570637513e-06, "loss": 0.3935, "step": 3160 }, { "epoch": 0.22501020353840967, "grad_norm": 4.868356423660982, "learning_rate": 7.751668323157747e-06, "loss": 0.4121, "step": 3170 }, { "epoch": 0.2257200149060387, "grad_norm": 3.604594374317723, "learning_rate": 7.74456907567798e-06, "loss": 0.3949, "step": 3180 }, { "epoch": 0.22642982627366778, "grad_norm": 2.6762060130385565, "learning_rate": 7.737469828198211e-06, "loss": 0.4192, "step": 3190 }, { "epoch": 0.22713963764129683, "grad_norm": 3.7277037964888957, "learning_rate": 7.730370580718444e-06, "loss": 0.4063, "step": 3200 }, { "epoch": 0.22784944900892587, "grad_norm": 4.2017308560808395, "learning_rate": 7.723271333238677e-06, "loss": 0.3983, "step": 3210 }, { "epoch": 0.22855926037655494, "grad_norm": 6.82717398390433, "learning_rate": 7.71617208575891e-06, "loss": 0.4003, "step": 3220 }, { "epoch": 0.22926907174418398, "grad_norm": 3.3720424392184865, "learning_rate": 7.709072838279143e-06, "loss": 0.384, "step": 3230 }, { "epoch": 0.22997888311181303, "grad_norm": 15.234041629621501, "learning_rate": 7.701973590799375e-06, "loss": 0.3936, "step": 3240 }, { "epoch": 0.2306886944794421, "grad_norm": 6.450291645106787, "learning_rate": 7.694874343319608e-06, "loss": 0.4153, "step": 3250 }, { "epoch": 0.23139850584707114, "grad_norm": 5.0596647748479056, "learning_rate": 7.687775095839841e-06, "loss": 0.4098, "step": 3260 }, { "epoch": 0.23210831721470018, "grad_norm": 6.351369993733097, "learning_rate": 7.680675848360074e-06, "loss": 0.4036, "step": 3270 }, { "epoch": 0.23281812858232925, "grad_norm": 7.706709044787595, "learning_rate": 7.673576600880307e-06, "loss": 0.4137, "step": 3280 }, { "epoch": 0.2335279399499583, "grad_norm": 6.111103199878706, "learning_rate": 7.66647735340054e-06, "loss": 0.4163, "step": 3290 }, { "epoch": 0.23423775131758734, "grad_norm": 3.182362422678598, "learning_rate": 7.659378105920773e-06, "loss": 0.4007, "step": 3300 }, { "epoch": 0.2349475626852164, "grad_norm": 3.929827344563346, "learning_rate": 7.652278858441005e-06, "loss": 0.4011, "step": 3310 }, { "epoch": 0.23565737405284545, "grad_norm": 6.606808853169358, "learning_rate": 7.645179610961238e-06, "loss": 0.4113, "step": 3320 }, { "epoch": 0.2363671854204745, "grad_norm": 7.983975561443669, "learning_rate": 7.638080363481471e-06, "loss": 0.3941, "step": 3330 }, { "epoch": 0.23707699678810357, "grad_norm": 2.551810232754013, "learning_rate": 7.630981116001704e-06, "loss": 0.3987, "step": 3340 }, { "epoch": 0.2377868081557326, "grad_norm": 16.325804366695763, "learning_rate": 7.623881868521937e-06, "loss": 0.3814, "step": 3350 }, { "epoch": 0.23849661952336168, "grad_norm": 17.86582631307272, "learning_rate": 7.61678262104217e-06, "loss": 0.4065, "step": 3360 }, { "epoch": 0.23920643089099072, "grad_norm": 4.439905284094514, "learning_rate": 7.6096833735624035e-06, "loss": 0.4079, "step": 3370 }, { "epoch": 0.23991624225861977, "grad_norm": 13.632710588001641, "learning_rate": 7.602584126082636e-06, "loss": 0.4075, "step": 3380 }, { "epoch": 0.24062605362624884, "grad_norm": 7.4557485788963405, "learning_rate": 7.595484878602869e-06, "loss": 0.399, "step": 3390 }, { "epoch": 0.24133586499387788, "grad_norm": 6.032057911933067, "learning_rate": 7.588385631123102e-06, "loss": 0.3892, "step": 3400 }, { "epoch": 0.24204567636150692, "grad_norm": 5.1424876309924, "learning_rate": 7.581286383643335e-06, "loss": 0.396, "step": 3410 }, { "epoch": 0.242755487729136, "grad_norm": 3.6691932120100987, "learning_rate": 7.574187136163567e-06, "loss": 0.4108, "step": 3420 }, { "epoch": 0.24346529909676504, "grad_norm": 2.8083232656002033, "learning_rate": 7.5670878886838004e-06, "loss": 0.3984, "step": 3430 }, { "epoch": 0.24417511046439408, "grad_norm": 13.589049355107566, "learning_rate": 7.5599886412040335e-06, "loss": 0.3957, "step": 3440 }, { "epoch": 0.24488492183202315, "grad_norm": 6.813624263530042, "learning_rate": 7.552889393724265e-06, "loss": 0.4105, "step": 3450 }, { "epoch": 0.2455947331996522, "grad_norm": 13.609829369379536, "learning_rate": 7.545790146244498e-06, "loss": 0.4175, "step": 3460 }, { "epoch": 0.24630454456728124, "grad_norm": 5.1258006881261915, "learning_rate": 7.538690898764731e-06, "loss": 0.3966, "step": 3470 }, { "epoch": 0.2470143559349103, "grad_norm": 40.31962236147607, "learning_rate": 7.531591651284964e-06, "loss": 0.3839, "step": 3480 }, { "epoch": 0.24772416730253935, "grad_norm": 6.537768993909155, "learning_rate": 7.524492403805197e-06, "loss": 0.4122, "step": 3490 }, { "epoch": 0.2484339786701684, "grad_norm": 17.652356012021233, "learning_rate": 7.51739315632543e-06, "loss": 0.3948, "step": 3500 }, { "epoch": 0.24914379003779746, "grad_norm": 3.85528406182526, "learning_rate": 7.510293908845663e-06, "loss": 0.3938, "step": 3510 }, { "epoch": 0.2498536014054265, "grad_norm": 125.62304184951121, "learning_rate": 7.503194661365896e-06, "loss": 0.389, "step": 3520 }, { "epoch": 0.25056341277305555, "grad_norm": 8.558355724038593, "learning_rate": 7.496095413886129e-06, "loss": 0.3787, "step": 3530 }, { "epoch": 0.2512732241406846, "grad_norm": 4.216427070872869, "learning_rate": 7.488996166406361e-06, "loss": 0.3835, "step": 3540 }, { "epoch": 0.2519830355083137, "grad_norm": 4.314131483032103, "learning_rate": 7.481896918926594e-06, "loss": 0.3946, "step": 3550 }, { "epoch": 0.25269284687594273, "grad_norm": 4.159823786853909, "learning_rate": 7.474797671446827e-06, "loss": 0.3972, "step": 3560 }, { "epoch": 0.2534026582435718, "grad_norm": 3.4947296702394586, "learning_rate": 7.4676984239670605e-06, "loss": 0.4165, "step": 3570 }, { "epoch": 0.2541124696112008, "grad_norm": 4.022241190948728, "learning_rate": 7.4605991764872936e-06, "loss": 0.3988, "step": 3580 }, { "epoch": 0.25482228097882986, "grad_norm": 3.4849637281174006, "learning_rate": 7.453499929007526e-06, "loss": 0.4106, "step": 3590 }, { "epoch": 0.2555320923464589, "grad_norm": 5.338306458076586, "learning_rate": 7.446400681527759e-06, "loss": 0.4082, "step": 3600 }, { "epoch": 0.256241903714088, "grad_norm": 4.970005106695202, "learning_rate": 7.439301434047992e-06, "loss": 0.3914, "step": 3610 }, { "epoch": 0.25695171508171705, "grad_norm": 6.355373029038747, "learning_rate": 7.432202186568225e-06, "loss": 0.3989, "step": 3620 }, { "epoch": 0.2576615264493461, "grad_norm": 5.996742366501121, "learning_rate": 7.425102939088457e-06, "loss": 0.3999, "step": 3630 }, { "epoch": 0.25837133781697513, "grad_norm": 6.966686936423967, "learning_rate": 7.4180036916086905e-06, "loss": 0.3831, "step": 3640 }, { "epoch": 0.2590811491846042, "grad_norm": 4.185121399245409, "learning_rate": 7.410904444128923e-06, "loss": 0.408, "step": 3650 }, { "epoch": 0.2597909605522332, "grad_norm": 2.2056616209460866, "learning_rate": 7.403805196649155e-06, "loss": 0.3931, "step": 3660 }, { "epoch": 0.2605007719198623, "grad_norm": 4.176248780095696, "learning_rate": 7.396705949169388e-06, "loss": 0.409, "step": 3670 }, { "epoch": 0.26121058328749136, "grad_norm": 2.47926985794175, "learning_rate": 7.389606701689621e-06, "loss": 0.4091, "step": 3680 }, { "epoch": 0.2619203946551204, "grad_norm": 3.02240842448802, "learning_rate": 7.382507454209854e-06, "loss": 0.4102, "step": 3690 }, { "epoch": 0.26263020602274945, "grad_norm": 2.0291710541228816, "learning_rate": 7.3754082067300866e-06, "loss": 0.382, "step": 3700 }, { "epoch": 0.2633400173903785, "grad_norm": 2.1912303159611084, "learning_rate": 7.36830895925032e-06, "loss": 0.3974, "step": 3710 }, { "epoch": 0.26404982875800753, "grad_norm": 2.964541482780821, "learning_rate": 7.361209711770553e-06, "loss": 0.4096, "step": 3720 }, { "epoch": 0.26475964012563663, "grad_norm": 5.810099164313448, "learning_rate": 7.354110464290786e-06, "loss": 0.4092, "step": 3730 }, { "epoch": 0.2654694514932657, "grad_norm": 4.879409457746285, "learning_rate": 7.347011216811019e-06, "loss": 0.4034, "step": 3740 }, { "epoch": 0.2661792628608947, "grad_norm": 2.761287928392515, "learning_rate": 7.339911969331251e-06, "loss": 0.3971, "step": 3750 }, { "epoch": 0.26688907422852376, "grad_norm": 14.80879239487425, "learning_rate": 7.332812721851484e-06, "loss": 0.4203, "step": 3760 }, { "epoch": 0.2675988855961528, "grad_norm": 2.589550559546521, "learning_rate": 7.325713474371717e-06, "loss": 0.4065, "step": 3770 }, { "epoch": 0.2683086969637819, "grad_norm": 2.1908148156089204, "learning_rate": 7.3186142268919505e-06, "loss": 0.4001, "step": 3780 }, { "epoch": 0.26901850833141094, "grad_norm": 3.614429975395643, "learning_rate": 7.311514979412183e-06, "loss": 0.3949, "step": 3790 }, { "epoch": 0.26972831969904, "grad_norm": 8.199581604131074, "learning_rate": 7.304415731932416e-06, "loss": 0.4027, "step": 3800 }, { "epoch": 0.27043813106666903, "grad_norm": 1.9841735875976263, "learning_rate": 7.297316484452649e-06, "loss": 0.3803, "step": 3810 }, { "epoch": 0.2711479424342981, "grad_norm": 1.7818490390141006, "learning_rate": 7.290217236972882e-06, "loss": 0.3979, "step": 3820 }, { "epoch": 0.2718577538019271, "grad_norm": 2.664420697627613, "learning_rate": 7.283117989493115e-06, "loss": 0.4112, "step": 3830 }, { "epoch": 0.2725675651695562, "grad_norm": 7.6015896940216345, "learning_rate": 7.2760187420133474e-06, "loss": 0.3978, "step": 3840 }, { "epoch": 0.27327737653718526, "grad_norm": 5.109710356060471, "learning_rate": 7.2689194945335805e-06, "loss": 0.3911, "step": 3850 }, { "epoch": 0.2739871879048143, "grad_norm": 1.8719451344781273, "learning_rate": 7.261820247053813e-06, "loss": 0.4039, "step": 3860 }, { "epoch": 0.27469699927244334, "grad_norm": 7.834590688589366, "learning_rate": 7.254720999574045e-06, "loss": 0.3972, "step": 3870 }, { "epoch": 0.2754068106400724, "grad_norm": 3.4725606354409915, "learning_rate": 7.247621752094278e-06, "loss": 0.4106, "step": 3880 }, { "epoch": 0.27611662200770143, "grad_norm": 2.131887069098727, "learning_rate": 7.240522504614511e-06, "loss": 0.3921, "step": 3890 }, { "epoch": 0.2768264333753305, "grad_norm": 3.840712773368679, "learning_rate": 7.233423257134744e-06, "loss": 0.3963, "step": 3900 }, { "epoch": 0.27753624474295957, "grad_norm": 1.8435607174327202, "learning_rate": 7.226324009654977e-06, "loss": 0.4171, "step": 3910 }, { "epoch": 0.2782460561105886, "grad_norm": 2.927315889095762, "learning_rate": 7.21922476217521e-06, "loss": 0.4078, "step": 3920 }, { "epoch": 0.27895586747821766, "grad_norm": 2.4533548064235955, "learning_rate": 7.212125514695443e-06, "loss": 0.4018, "step": 3930 }, { "epoch": 0.2796656788458467, "grad_norm": 2.6808622987821424, "learning_rate": 7.205026267215676e-06, "loss": 0.3952, "step": 3940 }, { "epoch": 0.28037549021347574, "grad_norm": 2.006870713713202, "learning_rate": 7.197927019735908e-06, "loss": 0.4041, "step": 3950 }, { "epoch": 0.28108530158110484, "grad_norm": 4.1552921396903955, "learning_rate": 7.190827772256141e-06, "loss": 0.3815, "step": 3960 }, { "epoch": 0.2817951129487339, "grad_norm": 3.088912130241367, "learning_rate": 7.183728524776374e-06, "loss": 0.4018, "step": 3970 }, { "epoch": 0.2825049243163629, "grad_norm": 2.9619382181530853, "learning_rate": 7.1766292772966075e-06, "loss": 0.4071, "step": 3980 }, { "epoch": 0.28321473568399197, "grad_norm": 3.194525382034512, "learning_rate": 7.1695300298168406e-06, "loss": 0.3861, "step": 3990 }, { "epoch": 0.283924547051621, "grad_norm": 2.58824315637412, "learning_rate": 7.162430782337073e-06, "loss": 0.4022, "step": 4000 }, { "epoch": 0.2846343584192501, "grad_norm": 1.6807083864960135, "learning_rate": 7.155331534857306e-06, "loss": 0.3953, "step": 4010 }, { "epoch": 0.28534416978687915, "grad_norm": 2.9052226494936706, "learning_rate": 7.148232287377539e-06, "loss": 0.3803, "step": 4020 }, { "epoch": 0.2860539811545082, "grad_norm": 1.9518486816171219, "learning_rate": 7.141133039897772e-06, "loss": 0.4076, "step": 4030 }, { "epoch": 0.28676379252213724, "grad_norm": 2.223176862483651, "learning_rate": 7.134033792418004e-06, "loss": 0.4058, "step": 4040 }, { "epoch": 0.2874736038897663, "grad_norm": 2.2196780309614854, "learning_rate": 7.1269345449382375e-06, "loss": 0.3926, "step": 4050 }, { "epoch": 0.2881834152573953, "grad_norm": 6.524368077094248, "learning_rate": 7.11983529745847e-06, "loss": 0.4172, "step": 4060 }, { "epoch": 0.2888932266250244, "grad_norm": 5.292339769504148, "learning_rate": 7.112736049978702e-06, "loss": 0.3908, "step": 4070 }, { "epoch": 0.28960303799265347, "grad_norm": 2.3067804343233282, "learning_rate": 7.105636802498935e-06, "loss": 0.3899, "step": 4080 }, { "epoch": 0.2903128493602825, "grad_norm": 3.23451698379491, "learning_rate": 7.098537555019168e-06, "loss": 0.4078, "step": 4090 }, { "epoch": 0.29102266072791155, "grad_norm": 1.9975711149406958, "learning_rate": 7.091438307539401e-06, "loss": 0.3892, "step": 4100 }, { "epoch": 0.2917324720955406, "grad_norm": 2.172457996529036, "learning_rate": 7.084339060059634e-06, "loss": 0.4024, "step": 4110 }, { "epoch": 0.29244228346316964, "grad_norm": 4.2611345539293985, "learning_rate": 7.077239812579867e-06, "loss": 0.4051, "step": 4120 }, { "epoch": 0.29315209483079874, "grad_norm": 4.8499954927547915, "learning_rate": 7.0701405651001e-06, "loss": 0.4051, "step": 4130 }, { "epoch": 0.2938619061984278, "grad_norm": 3.133374032170856, "learning_rate": 7.063041317620333e-06, "loss": 0.4113, "step": 4140 }, { "epoch": 0.2945717175660568, "grad_norm": 3.0408556337828667, "learning_rate": 7.055942070140566e-06, "loss": 0.3918, "step": 4150 }, { "epoch": 0.29528152893368587, "grad_norm": 2.967610716656761, "learning_rate": 7.048842822660798e-06, "loss": 0.3935, "step": 4160 }, { "epoch": 0.2959913403013149, "grad_norm": 4.089654504142007, "learning_rate": 7.041743575181031e-06, "loss": 0.3812, "step": 4170 }, { "epoch": 0.29670115166894395, "grad_norm": 6.123820735815897, "learning_rate": 7.0346443277012644e-06, "loss": 0.3894, "step": 4180 }, { "epoch": 0.29741096303657305, "grad_norm": 9.52031358542494, "learning_rate": 7.0275450802214975e-06, "loss": 0.3933, "step": 4190 }, { "epoch": 0.2981207744042021, "grad_norm": 4.241656002923987, "learning_rate": 7.02044583274173e-06, "loss": 0.3938, "step": 4200 }, { "epoch": 0.29883058577183114, "grad_norm": 10.364254693083032, "learning_rate": 7.013346585261963e-06, "loss": 0.3939, "step": 4210 }, { "epoch": 0.2995403971394602, "grad_norm": 2.493001703497579, "learning_rate": 7.006247337782196e-06, "loss": 0.3904, "step": 4220 }, { "epoch": 0.3002502085070892, "grad_norm": 2.372260556132136, "learning_rate": 6.999148090302429e-06, "loss": 0.4002, "step": 4230 }, { "epoch": 0.3009600198747183, "grad_norm": 4.447948099801884, "learning_rate": 6.992048842822662e-06, "loss": 0.3894, "step": 4240 }, { "epoch": 0.30166983124234736, "grad_norm": 2.4733723007039847, "learning_rate": 6.9849495953428944e-06, "loss": 0.3863, "step": 4250 }, { "epoch": 0.3023796426099764, "grad_norm": 11.318740156291982, "learning_rate": 6.977850347863127e-06, "loss": 0.3881, "step": 4260 }, { "epoch": 0.30308945397760545, "grad_norm": 3.6328999006662563, "learning_rate": 6.97075110038336e-06, "loss": 0.3894, "step": 4270 }, { "epoch": 0.3037992653452345, "grad_norm": 2.0376811180198353, "learning_rate": 6.963651852903592e-06, "loss": 0.3993, "step": 4280 }, { "epoch": 0.30450907671286354, "grad_norm": 2.1376755414320625, "learning_rate": 6.956552605423825e-06, "loss": 0.3903, "step": 4290 }, { "epoch": 0.30521888808049263, "grad_norm": 2.883515618882684, "learning_rate": 6.949453357944058e-06, "loss": 0.4082, "step": 4300 }, { "epoch": 0.3059286994481217, "grad_norm": 2.0964398516334444, "learning_rate": 6.942354110464291e-06, "loss": 0.3857, "step": 4310 }, { "epoch": 0.3066385108157507, "grad_norm": 5.410779818418891, "learning_rate": 6.935254862984524e-06, "loss": 0.391, "step": 4320 }, { "epoch": 0.30734832218337976, "grad_norm": 4.439425532620099, "learning_rate": 6.928155615504757e-06, "loss": 0.4099, "step": 4330 }, { "epoch": 0.3080581335510088, "grad_norm": 12.275643206811255, "learning_rate": 6.92105636802499e-06, "loss": 0.3953, "step": 4340 }, { "epoch": 0.30876794491863785, "grad_norm": 5.947992733400443, "learning_rate": 6.913957120545223e-06, "loss": 0.3945, "step": 4350 }, { "epoch": 0.30947775628626695, "grad_norm": 3.4397054213510843, "learning_rate": 6.906857873065456e-06, "loss": 0.3875, "step": 4360 }, { "epoch": 0.310187567653896, "grad_norm": 41.88563893552131, "learning_rate": 6.899758625585688e-06, "loss": 0.3928, "step": 4370 }, { "epoch": 0.31089737902152503, "grad_norm": 3.227989243444744, "learning_rate": 6.892659378105921e-06, "loss": 0.3908, "step": 4380 }, { "epoch": 0.3116071903891541, "grad_norm": 22.897381721878148, "learning_rate": 6.8855601306261545e-06, "loss": 0.391, "step": 4390 }, { "epoch": 0.3123170017567831, "grad_norm": 3.3630974135990406, "learning_rate": 6.878460883146388e-06, "loss": 0.374, "step": 4400 }, { "epoch": 0.31302681312441216, "grad_norm": 4.877401136832981, "learning_rate": 6.87136163566662e-06, "loss": 0.3923, "step": 4410 }, { "epoch": 0.31373662449204126, "grad_norm": 6.179682561885886, "learning_rate": 6.864262388186853e-06, "loss": 0.3865, "step": 4420 }, { "epoch": 0.3144464358596703, "grad_norm": 4.8910756460648885, "learning_rate": 6.857163140707086e-06, "loss": 0.3865, "step": 4430 }, { "epoch": 0.31515624722729935, "grad_norm": 3.260915462621521, "learning_rate": 6.850063893227319e-06, "loss": 0.3982, "step": 4440 }, { "epoch": 0.3158660585949284, "grad_norm": 4.599472395508018, "learning_rate": 6.842964645747551e-06, "loss": 0.3961, "step": 4450 }, { "epoch": 0.31657586996255743, "grad_norm": 7.776943140920524, "learning_rate": 6.8358653982677845e-06, "loss": 0.3873, "step": 4460 }, { "epoch": 0.3172856813301865, "grad_norm": 3.0126570398502723, "learning_rate": 6.828766150788017e-06, "loss": 0.3859, "step": 4470 }, { "epoch": 0.3179954926978156, "grad_norm": 1.935360939609241, "learning_rate": 6.82166690330825e-06, "loss": 0.3893, "step": 4480 }, { "epoch": 0.3187053040654446, "grad_norm": 2.8545870894952055, "learning_rate": 6.814567655828482e-06, "loss": 0.3963, "step": 4490 }, { "epoch": 0.31941511543307366, "grad_norm": 4.70013317139999, "learning_rate": 6.807468408348715e-06, "loss": 0.3939, "step": 4500 }, { "epoch": 0.3201249268007027, "grad_norm": 3.264719904276936, "learning_rate": 6.800369160868948e-06, "loss": 0.3851, "step": 4510 }, { "epoch": 0.32083473816833175, "grad_norm": 19.735683632874615, "learning_rate": 6.793269913389181e-06, "loss": 0.3722, "step": 4520 }, { "epoch": 0.32154454953596084, "grad_norm": 2.501896594333183, "learning_rate": 6.786170665909414e-06, "loss": 0.3744, "step": 4530 }, { "epoch": 0.3222543609035899, "grad_norm": 6.776418259400934, "learning_rate": 6.779071418429647e-06, "loss": 0.3868, "step": 4540 }, { "epoch": 0.32296417227121893, "grad_norm": 7.759324029832955, "learning_rate": 6.77197217094988e-06, "loss": 0.3978, "step": 4550 }, { "epoch": 0.323673983638848, "grad_norm": 5.1020465787210805, "learning_rate": 6.764872923470113e-06, "loss": 0.3756, "step": 4560 }, { "epoch": 0.324383795006477, "grad_norm": 4.584721636805871, "learning_rate": 6.757773675990345e-06, "loss": 0.3962, "step": 4570 }, { "epoch": 0.32509360637410606, "grad_norm": 5.227400251430727, "learning_rate": 6.750674428510578e-06, "loss": 0.3934, "step": 4580 }, { "epoch": 0.32580341774173516, "grad_norm": 6.3055606292098645, "learning_rate": 6.7435751810308114e-06, "loss": 0.3921, "step": 4590 }, { "epoch": 0.3265132291093642, "grad_norm": 3.6872617865325914, "learning_rate": 6.7364759335510445e-06, "loss": 0.3818, "step": 4600 }, { "epoch": 0.32722304047699324, "grad_norm": 2.007884918336012, "learning_rate": 6.729376686071278e-06, "loss": 0.4005, "step": 4610 }, { "epoch": 0.3279328518446223, "grad_norm": 5.042964957635144, "learning_rate": 6.72227743859151e-06, "loss": 0.3934, "step": 4620 }, { "epoch": 0.32864266321225133, "grad_norm": 4.122572427444757, "learning_rate": 6.715178191111743e-06, "loss": 0.3835, "step": 4630 }, { "epoch": 0.32935247457988037, "grad_norm": 4.528744366296638, "learning_rate": 6.708078943631976e-06, "loss": 0.3781, "step": 4640 }, { "epoch": 0.33006228594750947, "grad_norm": 3.0405586193089107, "learning_rate": 6.700979696152209e-06, "loss": 0.4013, "step": 4650 }, { "epoch": 0.3307720973151385, "grad_norm": 2.497528895602537, "learning_rate": 6.6938804486724415e-06, "loss": 0.4012, "step": 4660 }, { "epoch": 0.33148190868276756, "grad_norm": 3.949569099861772, "learning_rate": 6.686781201192674e-06, "loss": 0.3791, "step": 4670 }, { "epoch": 0.3321917200503966, "grad_norm": 2.9026740036563714, "learning_rate": 6.679681953712907e-06, "loss": 0.379, "step": 4680 }, { "epoch": 0.33290153141802564, "grad_norm": 4.750694201369016, "learning_rate": 6.672582706233139e-06, "loss": 0.3962, "step": 4690 }, { "epoch": 0.3336113427856547, "grad_norm": 4.9647752226572655, "learning_rate": 6.665483458753372e-06, "loss": 0.4014, "step": 4700 }, { "epoch": 0.3343211541532838, "grad_norm": 5.007567374826438, "learning_rate": 6.658384211273605e-06, "loss": 0.386, "step": 4710 }, { "epoch": 0.3350309655209128, "grad_norm": 24.665793733036637, "learning_rate": 6.651284963793838e-06, "loss": 0.3904, "step": 4720 }, { "epoch": 0.33574077688854187, "grad_norm": 8.807448982539153, "learning_rate": 6.6441857163140715e-06, "loss": 0.3817, "step": 4730 }, { "epoch": 0.3364505882561709, "grad_norm": 5.649488918187287, "learning_rate": 6.637086468834304e-06, "loss": 0.3952, "step": 4740 }, { "epoch": 0.33716039962379996, "grad_norm": 10.030238684862177, "learning_rate": 6.629987221354537e-06, "loss": 0.3894, "step": 4750 }, { "epoch": 0.33787021099142905, "grad_norm": 8.229307584465264, "learning_rate": 6.62288797387477e-06, "loss": 0.3777, "step": 4760 }, { "epoch": 0.3385800223590581, "grad_norm": 4.702015980686352, "learning_rate": 6.615788726395003e-06, "loss": 0.3846, "step": 4770 }, { "epoch": 0.33928983372668714, "grad_norm": 7.609531980298162, "learning_rate": 6.608689478915235e-06, "loss": 0.3876, "step": 4780 }, { "epoch": 0.3399996450943162, "grad_norm": 9.359016840144466, "learning_rate": 6.601590231435468e-06, "loss": 0.3912, "step": 4790 }, { "epoch": 0.3407094564619452, "grad_norm": 6.921512932106153, "learning_rate": 6.5944909839557015e-06, "loss": 0.3808, "step": 4800 }, { "epoch": 0.34141926782957427, "grad_norm": 7.896921462163668, "learning_rate": 6.587391736475935e-06, "loss": 0.3822, "step": 4810 }, { "epoch": 0.34212907919720337, "grad_norm": 41.265653283488135, "learning_rate": 6.580292488996167e-06, "loss": 0.3704, "step": 4820 }, { "epoch": 0.3428388905648324, "grad_norm": 22.410728414840314, "learning_rate": 6.5731932415164e-06, "loss": 0.3879, "step": 4830 }, { "epoch": 0.34354870193246145, "grad_norm": 28.36796548695283, "learning_rate": 6.566093994036633e-06, "loss": 0.3819, "step": 4840 }, { "epoch": 0.3442585133000905, "grad_norm": 5.964443376270807, "learning_rate": 6.558994746556866e-06, "loss": 0.3793, "step": 4850 }, { "epoch": 0.34496832466771954, "grad_norm": 4.876522423500047, "learning_rate": 6.551895499077099e-06, "loss": 0.3882, "step": 4860 }, { "epoch": 0.3456781360353486, "grad_norm": 4.871742533391797, "learning_rate": 6.544796251597331e-06, "loss": 0.3896, "step": 4870 }, { "epoch": 0.3463879474029777, "grad_norm": 11.91690423514364, "learning_rate": 6.537697004117564e-06, "loss": 0.3736, "step": 4880 }, { "epoch": 0.3470977587706067, "grad_norm": 5.986322327762981, "learning_rate": 6.530597756637797e-06, "loss": 0.368, "step": 4890 }, { "epoch": 0.34780757013823577, "grad_norm": 4.671637222361169, "learning_rate": 6.523498509158029e-06, "loss": 0.3722, "step": 4900 }, { "epoch": 0.3485173815058648, "grad_norm": 16.438976188514197, "learning_rate": 6.516399261678262e-06, "loss": 0.3776, "step": 4910 }, { "epoch": 0.34922719287349385, "grad_norm": 11.76911671905372, "learning_rate": 6.509300014198495e-06, "loss": 0.3987, "step": 4920 }, { "epoch": 0.3499370042411229, "grad_norm": 12.380867918847773, "learning_rate": 6.502200766718728e-06, "loss": 0.3949, "step": 4930 }, { "epoch": 0.350646815608752, "grad_norm": 8.367704037629133, "learning_rate": 6.495101519238961e-06, "loss": 0.3767, "step": 4940 }, { "epoch": 0.35135662697638104, "grad_norm": 74.35690108296033, "learning_rate": 6.488002271759194e-06, "loss": 0.3819, "step": 4950 }, { "epoch": 0.3520664383440101, "grad_norm": 16.231219614665278, "learning_rate": 6.480903024279427e-06, "loss": 0.3859, "step": 4960 }, { "epoch": 0.3527762497116391, "grad_norm": 9.060846103909238, "learning_rate": 6.47380377679966e-06, "loss": 0.394, "step": 4970 }, { "epoch": 0.35348606107926817, "grad_norm": 21.88016531222193, "learning_rate": 6.466704529319893e-06, "loss": 0.4167, "step": 4980 }, { "epoch": 0.35419587244689726, "grad_norm": 9.919040843315045, "learning_rate": 6.459605281840125e-06, "loss": 0.4192, "step": 4990 }, { "epoch": 0.3549056838145263, "grad_norm": 5.183299722151934, "learning_rate": 6.4525060343603584e-06, "loss": 0.4249, "step": 5000 }, { "epoch": 0.35561549518215535, "grad_norm": 8.847185946354221, "learning_rate": 6.4454067868805915e-06, "loss": 0.4112, "step": 5010 }, { "epoch": 0.3563253065497844, "grad_norm": 11.864215621262682, "learning_rate": 6.438307539400825e-06, "loss": 0.4165, "step": 5020 }, { "epoch": 0.35703511791741344, "grad_norm": 3.3703428369603503, "learning_rate": 6.431208291921057e-06, "loss": 0.3978, "step": 5030 }, { "epoch": 0.3577449292850425, "grad_norm": 5.015316577294299, "learning_rate": 6.42410904444129e-06, "loss": 0.3872, "step": 5040 }, { "epoch": 0.3584547406526716, "grad_norm": 4.2137919102595305, "learning_rate": 6.417009796961523e-06, "loss": 0.3766, "step": 5050 }, { "epoch": 0.3591645520203006, "grad_norm": 3.0372315306510056, "learning_rate": 6.409910549481756e-06, "loss": 0.3842, "step": 5060 }, { "epoch": 0.35987436338792966, "grad_norm": 2.7515400586423318, "learning_rate": 6.4028113020019885e-06, "loss": 0.3993, "step": 5070 }, { "epoch": 0.3605841747555587, "grad_norm": 9.185207292504243, "learning_rate": 6.395712054522221e-06, "loss": 0.3875, "step": 5080 }, { "epoch": 0.36129398612318775, "grad_norm": 19.515842749867563, "learning_rate": 6.388612807042454e-06, "loss": 0.4035, "step": 5090 }, { "epoch": 0.3620037974908168, "grad_norm": 12.30636697197178, "learning_rate": 6.381513559562686e-06, "loss": 0.4035, "step": 5100 }, { "epoch": 0.3627136088584459, "grad_norm": 6.732979846623905, "learning_rate": 6.374414312082919e-06, "loss": 0.4079, "step": 5110 }, { "epoch": 0.36342342022607493, "grad_norm": 6.642326962423095, "learning_rate": 6.367315064603152e-06, "loss": 0.3945, "step": 5120 }, { "epoch": 0.364133231593704, "grad_norm": 6.314154234087903, "learning_rate": 6.360215817123385e-06, "loss": 0.394, "step": 5130 }, { "epoch": 0.364843042961333, "grad_norm": 4.760512258914551, "learning_rate": 6.3531165696436185e-06, "loss": 0.3863, "step": 5140 }, { "epoch": 0.36555285432896206, "grad_norm": 4.048747245175314, "learning_rate": 6.346017322163851e-06, "loss": 0.3863, "step": 5150 }, { "epoch": 0.3662626656965911, "grad_norm": 4.190578946223062, "learning_rate": 6.338918074684084e-06, "loss": 0.3723, "step": 5160 }, { "epoch": 0.3669724770642202, "grad_norm": 4.175965799380943, "learning_rate": 6.331818827204317e-06, "loss": 0.3889, "step": 5170 }, { "epoch": 0.36768228843184925, "grad_norm": 4.807186811656143, "learning_rate": 6.32471957972455e-06, "loss": 0.3874, "step": 5180 }, { "epoch": 0.3683920997994783, "grad_norm": 6.659345248185456, "learning_rate": 6.317620332244782e-06, "loss": 0.3711, "step": 5190 }, { "epoch": 0.36910191116710733, "grad_norm": 7.2186380945453905, "learning_rate": 6.310521084765015e-06, "loss": 0.3827, "step": 5200 }, { "epoch": 0.3698117225347364, "grad_norm": 5.005630183658748, "learning_rate": 6.3034218372852485e-06, "loss": 0.3983, "step": 5210 }, { "epoch": 0.3705215339023654, "grad_norm": 3.527405153009429, "learning_rate": 6.296322589805482e-06, "loss": 0.367, "step": 5220 }, { "epoch": 0.3712313452699945, "grad_norm": 3.882199465110045, "learning_rate": 6.289223342325715e-06, "loss": 0.3883, "step": 5230 }, { "epoch": 0.37194115663762356, "grad_norm": 7.463055050907344, "learning_rate": 6.282124094845947e-06, "loss": 0.3823, "step": 5240 }, { "epoch": 0.3726509680052526, "grad_norm": 8.000906237369843, "learning_rate": 6.27502484736618e-06, "loss": 0.383, "step": 5250 }, { "epoch": 0.37336077937288165, "grad_norm": 8.362063303535368, "learning_rate": 6.267925599886413e-06, "loss": 0.3893, "step": 5260 }, { "epoch": 0.3740705907405107, "grad_norm": 4.721914441661691, "learning_rate": 6.260826352406646e-06, "loss": 0.3763, "step": 5270 }, { "epoch": 0.3747804021081398, "grad_norm": 12.175797518430029, "learning_rate": 6.253727104926878e-06, "loss": 0.3977, "step": 5280 }, { "epoch": 0.37549021347576883, "grad_norm": 9.814402397906687, "learning_rate": 6.246627857447111e-06, "loss": 0.3716, "step": 5290 }, { "epoch": 0.3762000248433979, "grad_norm": 47.1450002499556, "learning_rate": 6.239528609967344e-06, "loss": 0.3792, "step": 5300 }, { "epoch": 0.3769098362110269, "grad_norm": 27.513481595283608, "learning_rate": 6.232429362487576e-06, "loss": 0.3734, "step": 5310 }, { "epoch": 0.37761964757865596, "grad_norm": 48.09984812385904, "learning_rate": 6.225330115007809e-06, "loss": 0.3873, "step": 5320 }, { "epoch": 0.378329458946285, "grad_norm": 5.065884658180426, "learning_rate": 6.218230867528042e-06, "loss": 0.39, "step": 5330 }, { "epoch": 0.3790392703139141, "grad_norm": 9.226418902203303, "learning_rate": 6.2111316200482754e-06, "loss": 0.3819, "step": 5340 }, { "epoch": 0.37974908168154314, "grad_norm": 6.998201025336219, "learning_rate": 6.204032372568508e-06, "loss": 0.3818, "step": 5350 }, { "epoch": 0.3804588930491722, "grad_norm": 4.086309894015096, "learning_rate": 6.196933125088741e-06, "loss": 0.3573, "step": 5360 }, { "epoch": 0.38116870441680123, "grad_norm": 8.280993749723958, "learning_rate": 6.189833877608974e-06, "loss": 0.3763, "step": 5370 }, { "epoch": 0.3818785157844303, "grad_norm": 4.086208683086361, "learning_rate": 6.182734630129207e-06, "loss": 0.3754, "step": 5380 }, { "epoch": 0.3825883271520593, "grad_norm": 5.958244425553627, "learning_rate": 6.17563538264944e-06, "loss": 0.3844, "step": 5390 }, { "epoch": 0.3832981385196884, "grad_norm": 3.580000162662889, "learning_rate": 6.168536135169672e-06, "loss": 0.382, "step": 5400 }, { "epoch": 0.38400794988731746, "grad_norm": 2.986600327490101, "learning_rate": 6.1614368876899054e-06, "loss": 0.3722, "step": 5410 }, { "epoch": 0.3847177612549465, "grad_norm": 3.253411703330411, "learning_rate": 6.1543376402101386e-06, "loss": 0.3723, "step": 5420 }, { "epoch": 0.38542757262257554, "grad_norm": 5.02266916683139, "learning_rate": 6.147238392730372e-06, "loss": 0.353, "step": 5430 }, { "epoch": 0.3861373839902046, "grad_norm": 6.509810117314743, "learning_rate": 6.140139145250604e-06, "loss": 0.3859, "step": 5440 }, { "epoch": 0.38684719535783363, "grad_norm": 3.024955665262126, "learning_rate": 6.133039897770837e-06, "loss": 0.3929, "step": 5450 }, { "epoch": 0.3875570067254627, "grad_norm": 3.1517938939602206, "learning_rate": 6.12594065029107e-06, "loss": 0.3899, "step": 5460 }, { "epoch": 0.38826681809309177, "grad_norm": 4.545747430477116, "learning_rate": 6.118841402811303e-06, "loss": 0.376, "step": 5470 }, { "epoch": 0.3889766294607208, "grad_norm": 4.069699163399179, "learning_rate": 6.111742155331535e-06, "loss": 0.3813, "step": 5480 }, { "epoch": 0.38968644082834986, "grad_norm": 3.562062075517251, "learning_rate": 6.104642907851768e-06, "loss": 0.383, "step": 5490 }, { "epoch": 0.3903962521959789, "grad_norm": 9.15980720106711, "learning_rate": 6.097543660372001e-06, "loss": 0.3921, "step": 5500 }, { "epoch": 0.391106063563608, "grad_norm": 4.449111409231249, "learning_rate": 6.090444412892234e-06, "loss": 0.3823, "step": 5510 }, { "epoch": 0.39181587493123704, "grad_norm": 12.724861852641904, "learning_rate": 6.083345165412466e-06, "loss": 0.3851, "step": 5520 }, { "epoch": 0.3925256862988661, "grad_norm": 6.615402324691555, "learning_rate": 6.076245917932699e-06, "loss": 0.3667, "step": 5530 }, { "epoch": 0.3932354976664951, "grad_norm": 8.817203015753774, "learning_rate": 6.069146670452932e-06, "loss": 0.3886, "step": 5540 }, { "epoch": 0.39394530903412417, "grad_norm": 9.192960733910674, "learning_rate": 6.0620474229731655e-06, "loss": 0.3794, "step": 5550 }, { "epoch": 0.3946551204017532, "grad_norm": 4.825188565131958, "learning_rate": 6.054948175493398e-06, "loss": 0.3786, "step": 5560 }, { "epoch": 0.3953649317693823, "grad_norm": 6.68078822940831, "learning_rate": 6.047848928013631e-06, "loss": 0.3835, "step": 5570 }, { "epoch": 0.39607474313701135, "grad_norm": 2.6400726840916175, "learning_rate": 6.040749680533864e-06, "loss": 0.381, "step": 5580 }, { "epoch": 0.3967845545046404, "grad_norm": 3.6668671304324967, "learning_rate": 6.033650433054097e-06, "loss": 0.3745, "step": 5590 }, { "epoch": 0.39749436587226944, "grad_norm": 2.639833206365908, "learning_rate": 6.026551185574329e-06, "loss": 0.3777, "step": 5600 }, { "epoch": 0.3982041772398985, "grad_norm": 3.79888213287165, "learning_rate": 6.019451938094562e-06, "loss": 0.3911, "step": 5610 }, { "epoch": 0.3989139886075275, "grad_norm": 5.09183422587413, "learning_rate": 6.0123526906147955e-06, "loss": 0.3832, "step": 5620 }, { "epoch": 0.3996237999751566, "grad_norm": 3.3401895175000926, "learning_rate": 6.005253443135029e-06, "loss": 0.3862, "step": 5630 }, { "epoch": 0.40033361134278567, "grad_norm": 2.5702329959348726, "learning_rate": 5.998154195655262e-06, "loss": 0.3934, "step": 5640 }, { "epoch": 0.4010434227104147, "grad_norm": 3.0044071678975937, "learning_rate": 5.991054948175494e-06, "loss": 0.3826, "step": 5650 }, { "epoch": 0.40175323407804375, "grad_norm": 2.412654779599852, "learning_rate": 5.983955700695727e-06, "loss": 0.3969, "step": 5660 }, { "epoch": 0.4024630454456728, "grad_norm": 3.0767944703908356, "learning_rate": 5.97685645321596e-06, "loss": 0.3961, "step": 5670 }, { "epoch": 0.40317285681330184, "grad_norm": 2.8053230371522124, "learning_rate": 5.969757205736193e-06, "loss": 0.3869, "step": 5680 }, { "epoch": 0.40388266818093094, "grad_norm": 7.472643121749521, "learning_rate": 5.962657958256425e-06, "loss": 0.3851, "step": 5690 }, { "epoch": 0.40459247954856, "grad_norm": 14.585388143398843, "learning_rate": 5.955558710776658e-06, "loss": 0.3905, "step": 5700 }, { "epoch": 0.405302290916189, "grad_norm": 4.416692599365141, "learning_rate": 5.948459463296891e-06, "loss": 0.3862, "step": 5710 }, { "epoch": 0.40601210228381807, "grad_norm": 3.4729116521336776, "learning_rate": 5.941360215817123e-06, "loss": 0.402, "step": 5720 }, { "epoch": 0.4067219136514471, "grad_norm": 3.5423705326787114, "learning_rate": 5.934260968337356e-06, "loss": 0.3884, "step": 5730 }, { "epoch": 0.4074317250190762, "grad_norm": 3.1365000657861497, "learning_rate": 5.927161720857589e-06, "loss": 0.3825, "step": 5740 }, { "epoch": 0.40814153638670525, "grad_norm": 5.219488757508086, "learning_rate": 5.9200624733778224e-06, "loss": 0.3894, "step": 5750 }, { "epoch": 0.4088513477543343, "grad_norm": 3.596909048940233, "learning_rate": 5.9129632258980555e-06, "loss": 0.3831, "step": 5760 }, { "epoch": 0.40956115912196334, "grad_norm": 2.476134224023759, "learning_rate": 5.905863978418288e-06, "loss": 0.3825, "step": 5770 }, { "epoch": 0.4102709704895924, "grad_norm": 3.407930958961138, "learning_rate": 5.898764730938521e-06, "loss": 0.3714, "step": 5780 }, { "epoch": 0.4109807818572214, "grad_norm": 3.6349280667767636, "learning_rate": 5.891665483458754e-06, "loss": 0.3949, "step": 5790 }, { "epoch": 0.4116905932248505, "grad_norm": 10.032880290815127, "learning_rate": 5.884566235978987e-06, "loss": 0.3827, "step": 5800 }, { "epoch": 0.41240040459247956, "grad_norm": 4.403552459945297, "learning_rate": 5.877466988499219e-06, "loss": 0.3738, "step": 5810 }, { "epoch": 0.4131102159601086, "grad_norm": 3.2630803210797086, "learning_rate": 5.8703677410194525e-06, "loss": 0.3947, "step": 5820 }, { "epoch": 0.41382002732773765, "grad_norm": 11.228663057773362, "learning_rate": 5.8632684935396856e-06, "loss": 0.3825, "step": 5830 }, { "epoch": 0.4145298386953667, "grad_norm": 18.33844649221444, "learning_rate": 5.856169246059919e-06, "loss": 0.381, "step": 5840 }, { "epoch": 0.41523965006299574, "grad_norm": 14.576257048715338, "learning_rate": 5.849069998580152e-06, "loss": 0.389, "step": 5850 }, { "epoch": 0.41594946143062483, "grad_norm": 3.3799659706310177, "learning_rate": 5.841970751100384e-06, "loss": 0.3687, "step": 5860 }, { "epoch": 0.4166592727982539, "grad_norm": 4.306786145673671, "learning_rate": 5.834871503620617e-06, "loss": 0.3846, "step": 5870 }, { "epoch": 0.4173690841658829, "grad_norm": 2.71585444285802, "learning_rate": 5.82777225614085e-06, "loss": 0.397, "step": 5880 }, { "epoch": 0.41807889553351196, "grad_norm": 4.530639455269193, "learning_rate": 5.820673008661082e-06, "loss": 0.3633, "step": 5890 }, { "epoch": 0.418788706901141, "grad_norm": 5.299365856406392, "learning_rate": 5.813573761181315e-06, "loss": 0.3854, "step": 5900 }, { "epoch": 0.41949851826877005, "grad_norm": 3.5533453867575786, "learning_rate": 5.806474513701548e-06, "loss": 0.3855, "step": 5910 }, { "epoch": 0.42020832963639915, "grad_norm": 9.388008852057116, "learning_rate": 5.799375266221781e-06, "loss": 0.3911, "step": 5920 }, { "epoch": 0.4209181410040282, "grad_norm": 3.378607546141685, "learning_rate": 5.792276018742013e-06, "loss": 0.3751, "step": 5930 }, { "epoch": 0.42162795237165723, "grad_norm": 12.222073948575716, "learning_rate": 5.785176771262246e-06, "loss": 0.3778, "step": 5940 }, { "epoch": 0.4223377637392863, "grad_norm": 4.297952573306613, "learning_rate": 5.778077523782479e-06, "loss": 0.3827, "step": 5950 }, { "epoch": 0.4230475751069153, "grad_norm": 9.764464171752504, "learning_rate": 5.7709782763027125e-06, "loss": 0.3893, "step": 5960 }, { "epoch": 0.4237573864745444, "grad_norm": 3.7569225597805658, "learning_rate": 5.763879028822945e-06, "loss": 0.3901, "step": 5970 }, { "epoch": 0.42446719784217346, "grad_norm": 3.0005485619903824, "learning_rate": 5.756779781343178e-06, "loss": 0.3753, "step": 5980 }, { "epoch": 0.4251770092098025, "grad_norm": 6.457104695432505, "learning_rate": 5.749680533863411e-06, "loss": 0.3585, "step": 5990 }, { "epoch": 0.42588682057743155, "grad_norm": 4.252684527352716, "learning_rate": 5.742581286383644e-06, "loss": 0.3745, "step": 6000 }, { "epoch": 0.4265966319450606, "grad_norm": 3.3319349737549673, "learning_rate": 5.735482038903877e-06, "loss": 0.3836, "step": 6010 }, { "epoch": 0.42730644331268963, "grad_norm": 4.333001859655407, "learning_rate": 5.728382791424109e-06, "loss": 0.3698, "step": 6020 }, { "epoch": 0.42801625468031873, "grad_norm": 3.9838864194561343, "learning_rate": 5.7212835439443425e-06, "loss": 0.3686, "step": 6030 }, { "epoch": 0.4287260660479478, "grad_norm": 3.206673737162168, "learning_rate": 5.714184296464576e-06, "loss": 0.374, "step": 6040 }, { "epoch": 0.4294358774155768, "grad_norm": 7.910008181832549, "learning_rate": 5.707085048984809e-06, "loss": 0.3731, "step": 6050 }, { "epoch": 0.43014568878320586, "grad_norm": 11.533279860672804, "learning_rate": 5.699985801505041e-06, "loss": 0.3842, "step": 6060 }, { "epoch": 0.4308555001508349, "grad_norm": 4.06817553254219, "learning_rate": 5.692886554025274e-06, "loss": 0.3717, "step": 6070 }, { "epoch": 0.43156531151846395, "grad_norm": 12.082596102938004, "learning_rate": 5.685787306545507e-06, "loss": 0.3971, "step": 6080 }, { "epoch": 0.43227512288609304, "grad_norm": 2.685455478240202, "learning_rate": 5.678688059065739e-06, "loss": 0.3822, "step": 6090 }, { "epoch": 0.4329849342537221, "grad_norm": 3.1399973614222643, "learning_rate": 5.671588811585972e-06, "loss": 0.3774, "step": 6100 }, { "epoch": 0.43369474562135113, "grad_norm": 3.518374812592983, "learning_rate": 5.664489564106205e-06, "loss": 0.3781, "step": 6110 }, { "epoch": 0.4344045569889802, "grad_norm": 4.803932844471321, "learning_rate": 5.657390316626438e-06, "loss": 0.3757, "step": 6120 }, { "epoch": 0.4351143683566092, "grad_norm": 12.690594810777407, "learning_rate": 5.650291069146671e-06, "loss": 0.3747, "step": 6130 }, { "epoch": 0.43582417972423826, "grad_norm": 10.80688099347966, "learning_rate": 5.643191821666903e-06, "loss": 0.3676, "step": 6140 }, { "epoch": 0.43653399109186736, "grad_norm": 4.232034052682343, "learning_rate": 5.636092574187136e-06, "loss": 0.395, "step": 6150 }, { "epoch": 0.4372438024594964, "grad_norm": 3.422739256279243, "learning_rate": 5.6289933267073694e-06, "loss": 0.3693, "step": 6160 }, { "epoch": 0.43795361382712544, "grad_norm": 32.06006758689784, "learning_rate": 5.6218940792276025e-06, "loss": 0.3782, "step": 6170 }, { "epoch": 0.4386634251947545, "grad_norm": 5.623034465377633, "learning_rate": 5.614794831747835e-06, "loss": 0.3813, "step": 6180 }, { "epoch": 0.43937323656238353, "grad_norm": 10.612805886316337, "learning_rate": 5.607695584268068e-06, "loss": 0.3702, "step": 6190 }, { "epoch": 0.44008304793001257, "grad_norm": 6.077674805742986, "learning_rate": 5.600596336788301e-06, "loss": 0.3643, "step": 6200 }, { "epoch": 0.44079285929764167, "grad_norm": 7.053795971115957, "learning_rate": 5.593497089308534e-06, "loss": 0.3911, "step": 6210 }, { "epoch": 0.4415026706652707, "grad_norm": 6.212842792838621, "learning_rate": 5.586397841828766e-06, "loss": 0.3774, "step": 6220 }, { "epoch": 0.44221248203289976, "grad_norm": 7.598832178623656, "learning_rate": 5.5792985943489995e-06, "loss": 0.3808, "step": 6230 }, { "epoch": 0.4429222934005288, "grad_norm": 14.834315377312098, "learning_rate": 5.5721993468692326e-06, "loss": 0.3765, "step": 6240 }, { "epoch": 0.44363210476815784, "grad_norm": 15.459970963070427, "learning_rate": 5.565100099389466e-06, "loss": 0.3863, "step": 6250 }, { "epoch": 0.44434191613578694, "grad_norm": 5.002895033502256, "learning_rate": 5.558000851909699e-06, "loss": 0.3718, "step": 6260 }, { "epoch": 0.445051727503416, "grad_norm": 4.67592371180372, "learning_rate": 5.550901604429931e-06, "loss": 0.3869, "step": 6270 }, { "epoch": 0.445761538871045, "grad_norm": 4.246040554798665, "learning_rate": 5.543802356950164e-06, "loss": 0.3673, "step": 6280 }, { "epoch": 0.44647135023867407, "grad_norm": 5.698576828390134, "learning_rate": 5.536703109470397e-06, "loss": 0.3733, "step": 6290 }, { "epoch": 0.4471811616063031, "grad_norm": 4.890818923695549, "learning_rate": 5.529603861990629e-06, "loss": 0.3917, "step": 6300 }, { "epoch": 0.44789097297393216, "grad_norm": 3.5954099385229, "learning_rate": 5.522504614510862e-06, "loss": 0.387, "step": 6310 }, { "epoch": 0.44860078434156125, "grad_norm": 5.819667912733057, "learning_rate": 5.515405367031095e-06, "loss": 0.3772, "step": 6320 }, { "epoch": 0.4493105957091903, "grad_norm": 4.924613328068802, "learning_rate": 5.508306119551328e-06, "loss": 0.3691, "step": 6330 }, { "epoch": 0.45002040707681934, "grad_norm": 4.077670226838275, "learning_rate": 5.50120687207156e-06, "loss": 0.3606, "step": 6340 }, { "epoch": 0.4507302184444484, "grad_norm": 4.7425966011878815, "learning_rate": 5.494107624591793e-06, "loss": 0.3712, "step": 6350 }, { "epoch": 0.4514400298120774, "grad_norm": 3.7724063921848, "learning_rate": 5.487008377112026e-06, "loss": 0.3707, "step": 6360 }, { "epoch": 0.45214984117970647, "grad_norm": 2.8597041255348183, "learning_rate": 5.4799091296322595e-06, "loss": 0.364, "step": 6370 }, { "epoch": 0.45285965254733557, "grad_norm": 5.386440052681094, "learning_rate": 5.472809882152493e-06, "loss": 0.3785, "step": 6380 }, { "epoch": 0.4535694639149646, "grad_norm": 4.20147189666546, "learning_rate": 5.465710634672725e-06, "loss": 0.384, "step": 6390 }, { "epoch": 0.45427927528259365, "grad_norm": 5.4360613411555185, "learning_rate": 5.458611387192958e-06, "loss": 0.3676, "step": 6400 }, { "epoch": 0.4549890866502227, "grad_norm": 7.4543272167324846, "learning_rate": 5.451512139713191e-06, "loss": 0.3973, "step": 6410 }, { "epoch": 0.45569889801785174, "grad_norm": 5.302161729787796, "learning_rate": 5.444412892233424e-06, "loss": 0.3878, "step": 6420 }, { "epoch": 0.4564087093854808, "grad_norm": 4.774927845954586, "learning_rate": 5.437313644753656e-06, "loss": 0.368, "step": 6430 }, { "epoch": 0.4571185207531099, "grad_norm": 4.733108202290537, "learning_rate": 5.4302143972738895e-06, "loss": 0.3841, "step": 6440 }, { "epoch": 0.4578283321207389, "grad_norm": 4.581655513075473, "learning_rate": 5.423115149794123e-06, "loss": 0.3805, "step": 6450 }, { "epoch": 0.45853814348836797, "grad_norm": 2.4364404744853445, "learning_rate": 5.416015902314356e-06, "loss": 0.3587, "step": 6460 }, { "epoch": 0.459247954855997, "grad_norm": 5.16394378928267, "learning_rate": 5.408916654834588e-06, "loss": 0.3793, "step": 6470 }, { "epoch": 0.45995776622362605, "grad_norm": 8.232574335670192, "learning_rate": 5.401817407354821e-06, "loss": 0.3794, "step": 6480 }, { "epoch": 0.46066757759125515, "grad_norm": 10.509485180483269, "learning_rate": 5.394718159875054e-06, "loss": 0.3742, "step": 6490 }, { "epoch": 0.4613773889588842, "grad_norm": 3.418180521754276, "learning_rate": 5.387618912395286e-06, "loss": 0.3733, "step": 6500 }, { "epoch": 0.46208720032651324, "grad_norm": 4.2689703556593495, "learning_rate": 5.380519664915519e-06, "loss": 0.374, "step": 6510 }, { "epoch": 0.4627970116941423, "grad_norm": 7.896842999549548, "learning_rate": 5.373420417435752e-06, "loss": 0.3799, "step": 6520 }, { "epoch": 0.4635068230617713, "grad_norm": 3.4870838077093893, "learning_rate": 5.366321169955985e-06, "loss": 0.3712, "step": 6530 }, { "epoch": 0.46421663442940037, "grad_norm": 27.778526824166995, "learning_rate": 5.359221922476218e-06, "loss": 0.3655, "step": 6540 }, { "epoch": 0.46492644579702946, "grad_norm": 16.796202092439216, "learning_rate": 5.35212267499645e-06, "loss": 0.3846, "step": 6550 }, { "epoch": 0.4656362571646585, "grad_norm": 5.698856930659158, "learning_rate": 5.345023427516683e-06, "loss": 0.3877, "step": 6560 }, { "epoch": 0.46634606853228755, "grad_norm": 8.694016798434083, "learning_rate": 5.3379241800369165e-06, "loss": 0.3607, "step": 6570 }, { "epoch": 0.4670558798999166, "grad_norm": 3.617969654098083, "learning_rate": 5.3308249325571496e-06, "loss": 0.36, "step": 6580 }, { "epoch": 0.46776569126754564, "grad_norm": 7.181014577384461, "learning_rate": 5.323725685077382e-06, "loss": 0.3783, "step": 6590 }, { "epoch": 0.4684755026351747, "grad_norm": 9.52331650225055, "learning_rate": 5.316626437597615e-06, "loss": 0.3707, "step": 6600 }, { "epoch": 0.4691853140028038, "grad_norm": 5.927560976046885, "learning_rate": 5.309527190117848e-06, "loss": 0.3747, "step": 6610 }, { "epoch": 0.4698951253704328, "grad_norm": 33.354649195054265, "learning_rate": 5.302427942638081e-06, "loss": 0.3622, "step": 6620 }, { "epoch": 0.47060493673806186, "grad_norm": 5.109478632635811, "learning_rate": 5.295328695158314e-06, "loss": 0.3702, "step": 6630 }, { "epoch": 0.4713147481056909, "grad_norm": 62.14127099005149, "learning_rate": 5.2882294476785465e-06, "loss": 0.3718, "step": 6640 }, { "epoch": 0.47202455947331995, "grad_norm": 3.9646315343813674, "learning_rate": 5.2811302001987796e-06, "loss": 0.3579, "step": 6650 }, { "epoch": 0.472734370840949, "grad_norm": 5.822229945732986, "learning_rate": 5.274030952719013e-06, "loss": 0.358, "step": 6660 }, { "epoch": 0.4734441822085781, "grad_norm": 3.0706990453586607, "learning_rate": 5.266931705239246e-06, "loss": 0.3712, "step": 6670 }, { "epoch": 0.47415399357620713, "grad_norm": 2.763541771977754, "learning_rate": 5.259832457759478e-06, "loss": 0.3862, "step": 6680 }, { "epoch": 0.4748638049438362, "grad_norm": 2.8054880505902746, "learning_rate": 5.252733210279711e-06, "loss": 0.3609, "step": 6690 }, { "epoch": 0.4755736163114652, "grad_norm": 3.5455500616555864, "learning_rate": 5.245633962799943e-06, "loss": 0.3845, "step": 6700 }, { "epoch": 0.47628342767909426, "grad_norm": 6.871049315984216, "learning_rate": 5.238534715320176e-06, "loss": 0.3681, "step": 6710 }, { "epoch": 0.47699323904672336, "grad_norm": 4.626136895991325, "learning_rate": 5.231435467840409e-06, "loss": 0.3694, "step": 6720 }, { "epoch": 0.4777030504143524, "grad_norm": 4.1689737774582385, "learning_rate": 5.224336220360642e-06, "loss": 0.3722, "step": 6730 }, { "epoch": 0.47841286178198145, "grad_norm": 2.345831388882716, "learning_rate": 5.217236972880875e-06, "loss": 0.3778, "step": 6740 }, { "epoch": 0.4791226731496105, "grad_norm": 5.181993551246977, "learning_rate": 5.210137725401107e-06, "loss": 0.3649, "step": 6750 }, { "epoch": 0.47983248451723953, "grad_norm": 4.144025528380454, "learning_rate": 5.20303847792134e-06, "loss": 0.3854, "step": 6760 }, { "epoch": 0.4805422958848686, "grad_norm": 4.0013049178877536, "learning_rate": 5.195939230441573e-06, "loss": 0.3832, "step": 6770 }, { "epoch": 0.4812521072524977, "grad_norm": 4.375334224867565, "learning_rate": 5.1888399829618065e-06, "loss": 0.3678, "step": 6780 }, { "epoch": 0.4819619186201267, "grad_norm": 2.8158913555106926, "learning_rate": 5.18174073548204e-06, "loss": 0.3735, "step": 6790 }, { "epoch": 0.48267172998775576, "grad_norm": 4.286259213586135, "learning_rate": 5.174641488002272e-06, "loss": 0.3824, "step": 6800 }, { "epoch": 0.4833815413553848, "grad_norm": 2.917255310557774, "learning_rate": 5.167542240522505e-06, "loss": 0.367, "step": 6810 }, { "epoch": 0.48409135272301385, "grad_norm": 2.9474809991081194, "learning_rate": 5.160442993042738e-06, "loss": 0.37, "step": 6820 }, { "epoch": 0.4848011640906429, "grad_norm": 8.0892973566849, "learning_rate": 5.153343745562971e-06, "loss": 0.385, "step": 6830 }, { "epoch": 0.485510975458272, "grad_norm": 5.46237208189901, "learning_rate": 5.1462444980832034e-06, "loss": 0.3723, "step": 6840 }, { "epoch": 0.48622078682590103, "grad_norm": 4.813397707683654, "learning_rate": 5.1391452506034365e-06, "loss": 0.3847, "step": 6850 }, { "epoch": 0.4869305981935301, "grad_norm": 3.839632822272105, "learning_rate": 5.13204600312367e-06, "loss": 0.3994, "step": 6860 }, { "epoch": 0.4876404095611591, "grad_norm": 2.731217984269613, "learning_rate": 5.124946755643903e-06, "loss": 0.3928, "step": 6870 }, { "epoch": 0.48835022092878816, "grad_norm": 7.062296596699752, "learning_rate": 5.117847508164136e-06, "loss": 0.4141, "step": 6880 }, { "epoch": 0.4890600322964172, "grad_norm": 3.0471865890050034, "learning_rate": 5.110748260684368e-06, "loss": 0.3712, "step": 6890 }, { "epoch": 0.4897698436640463, "grad_norm": 8.240874357274272, "learning_rate": 5.103649013204601e-06, "loss": 0.3828, "step": 6900 }, { "epoch": 0.49047965503167534, "grad_norm": 4.557814239490917, "learning_rate": 5.0965497657248334e-06, "loss": 0.3794, "step": 6910 }, { "epoch": 0.4911894663993044, "grad_norm": 6.50934729087624, "learning_rate": 5.089450518245066e-06, "loss": 0.3655, "step": 6920 }, { "epoch": 0.49189927776693343, "grad_norm": 2.7892154452796696, "learning_rate": 5.082351270765299e-06, "loss": 0.3477, "step": 6930 }, { "epoch": 0.4926090891345625, "grad_norm": 4.296820022815862, "learning_rate": 5.075252023285532e-06, "loss": 0.3917, "step": 6940 }, { "epoch": 0.4933189005021915, "grad_norm": 3.7811542108069514, "learning_rate": 5.068152775805765e-06, "loss": 0.3846, "step": 6950 }, { "epoch": 0.4940287118698206, "grad_norm": 12.150770506288081, "learning_rate": 5.061053528325997e-06, "loss": 0.3991, "step": 6960 }, { "epoch": 0.49473852323744966, "grad_norm": 8.737862487013935, "learning_rate": 5.05395428084623e-06, "loss": 0.376, "step": 6970 }, { "epoch": 0.4954483346050787, "grad_norm": 4.705086993153889, "learning_rate": 5.0468550333664635e-06, "loss": 0.3774, "step": 6980 }, { "epoch": 0.49615814597270774, "grad_norm": 3.95177864719572, "learning_rate": 5.0397557858866966e-06, "loss": 0.3867, "step": 6990 }, { "epoch": 0.4968679573403368, "grad_norm": 4.9228476674024995, "learning_rate": 5.03265653840693e-06, "loss": 0.3868, "step": 7000 }, { "epoch": 0.4975777687079659, "grad_norm": 7.598944675436029, "learning_rate": 5.025557290927162e-06, "loss": 0.3791, "step": 7010 }, { "epoch": 0.4982875800755949, "grad_norm": 3.948022335506646, "learning_rate": 5.018458043447395e-06, "loss": 0.3878, "step": 7020 }, { "epoch": 0.49899739144322397, "grad_norm": 2.97600555704115, "learning_rate": 5.011358795967628e-06, "loss": 0.3891, "step": 7030 }, { "epoch": 0.499707202810853, "grad_norm": 7.322058927387839, "learning_rate": 5.004259548487861e-06, "loss": 0.3739, "step": 7040 }, { "epoch": 0.5004170141784821, "grad_norm": 4.054563164115399, "learning_rate": 4.9971603010080935e-06, "loss": 0.3654, "step": 7050 }, { "epoch": 0.5011268255461111, "grad_norm": 6.433797069878189, "learning_rate": 4.990061053528326e-06, "loss": 0.3769, "step": 7060 }, { "epoch": 0.5018366369137401, "grad_norm": 6.244381336548628, "learning_rate": 4.982961806048559e-06, "loss": 0.3698, "step": 7070 }, { "epoch": 0.5025464482813692, "grad_norm": 4.649812061123292, "learning_rate": 4.975862558568792e-06, "loss": 0.3597, "step": 7080 }, { "epoch": 0.5032562596489982, "grad_norm": 13.131635539716475, "learning_rate": 4.968763311089025e-06, "loss": 0.3737, "step": 7090 }, { "epoch": 0.5039660710166274, "grad_norm": 11.654767208116397, "learning_rate": 4.961664063609258e-06, "loss": 0.3809, "step": 7100 }, { "epoch": 0.5046758823842564, "grad_norm": 5.54405844933368, "learning_rate": 4.95456481612949e-06, "loss": 0.3668, "step": 7110 }, { "epoch": 0.5053856937518855, "grad_norm": 17.63140898183613, "learning_rate": 4.9474655686497235e-06, "loss": 0.3751, "step": 7120 }, { "epoch": 0.5060955051195145, "grad_norm": 4.735270750917372, "learning_rate": 4.940366321169957e-06, "loss": 0.3759, "step": 7130 }, { "epoch": 0.5068053164871436, "grad_norm": 3.6005983980475214, "learning_rate": 4.93326707369019e-06, "loss": 0.3932, "step": 7140 }, { "epoch": 0.5075151278547726, "grad_norm": 5.073652881259414, "learning_rate": 4.926167826210422e-06, "loss": 0.3689, "step": 7150 }, { "epoch": 0.5082249392224016, "grad_norm": 6.515311066715168, "learning_rate": 4.919068578730655e-06, "loss": 0.3675, "step": 7160 }, { "epoch": 0.5089347505900307, "grad_norm": 12.98913332417653, "learning_rate": 4.911969331250887e-06, "loss": 0.3861, "step": 7170 }, { "epoch": 0.5096445619576597, "grad_norm": 5.1500756291258005, "learning_rate": 4.90487008377112e-06, "loss": 0.3731, "step": 7180 }, { "epoch": 0.5103543733252888, "grad_norm": 5.833801547579832, "learning_rate": 4.8977708362913535e-06, "loss": 0.3831, "step": 7190 }, { "epoch": 0.5110641846929178, "grad_norm": 8.343761477251691, "learning_rate": 4.890671588811587e-06, "loss": 0.3716, "step": 7200 }, { "epoch": 0.511773996060547, "grad_norm": 6.740845613760958, "learning_rate": 4.883572341331819e-06, "loss": 0.377, "step": 7210 }, { "epoch": 0.512483807428176, "grad_norm": 6.834960096187304, "learning_rate": 4.876473093852052e-06, "loss": 0.3774, "step": 7220 }, { "epoch": 0.513193618795805, "grad_norm": 6.333904565562881, "learning_rate": 4.869373846372285e-06, "loss": 0.3786, "step": 7230 }, { "epoch": 0.5139034301634341, "grad_norm": 7.380378873059882, "learning_rate": 4.862274598892518e-06, "loss": 0.3641, "step": 7240 }, { "epoch": 0.5146132415310631, "grad_norm": 8.15711157363267, "learning_rate": 4.855175351412751e-06, "loss": 0.354, "step": 7250 }, { "epoch": 0.5153230528986922, "grad_norm": 5.298194233144714, "learning_rate": 4.8480761039329835e-06, "loss": 0.3648, "step": 7260 }, { "epoch": 0.5160328642663212, "grad_norm": 6.169565228174972, "learning_rate": 4.840976856453216e-06, "loss": 0.3606, "step": 7270 }, { "epoch": 0.5167426756339503, "grad_norm": 4.633952354333419, "learning_rate": 4.833877608973449e-06, "loss": 0.3627, "step": 7280 }, { "epoch": 0.5174524870015793, "grad_norm": 7.754370375548218, "learning_rate": 4.826778361493682e-06, "loss": 0.384, "step": 7290 }, { "epoch": 0.5181622983692084, "grad_norm": 4.628647672477682, "learning_rate": 4.819679114013915e-06, "loss": 0.3717, "step": 7300 }, { "epoch": 0.5188721097368374, "grad_norm": 4.6108119740619165, "learning_rate": 4.812579866534147e-06, "loss": 0.3531, "step": 7310 }, { "epoch": 0.5195819211044664, "grad_norm": 3.777480319775288, "learning_rate": 4.8054806190543805e-06, "loss": 0.3735, "step": 7320 }, { "epoch": 0.5202917324720956, "grad_norm": 6.455151414772601, "learning_rate": 4.7983813715746136e-06, "loss": 0.3845, "step": 7330 }, { "epoch": 0.5210015438397246, "grad_norm": 5.0016880570007, "learning_rate": 4.791282124094847e-06, "loss": 0.3588, "step": 7340 }, { "epoch": 0.5217113552073537, "grad_norm": 3.596195253014758, "learning_rate": 4.78418287661508e-06, "loss": 0.3664, "step": 7350 }, { "epoch": 0.5224211665749827, "grad_norm": 4.6111563525428005, "learning_rate": 4.777083629135312e-06, "loss": 0.3815, "step": 7360 }, { "epoch": 0.5231309779426118, "grad_norm": 3.81079107236397, "learning_rate": 4.769984381655544e-06, "loss": 0.3603, "step": 7370 }, { "epoch": 0.5238407893102408, "grad_norm": 10.081677733455512, "learning_rate": 4.762885134175777e-06, "loss": 0.3748, "step": 7380 }, { "epoch": 0.5245506006778698, "grad_norm": 4.011909680570432, "learning_rate": 4.7557858866960105e-06, "loss": 0.3736, "step": 7390 }, { "epoch": 0.5252604120454989, "grad_norm": 4.008812937992125, "learning_rate": 4.7486866392162436e-06, "loss": 0.3718, "step": 7400 }, { "epoch": 0.5259702234131279, "grad_norm": 8.895014071619777, "learning_rate": 4.741587391736477e-06, "loss": 0.3747, "step": 7410 }, { "epoch": 0.526680034780757, "grad_norm": 2.5646865204368394, "learning_rate": 4.734488144256709e-06, "loss": 0.3593, "step": 7420 }, { "epoch": 0.527389846148386, "grad_norm": 2.8583907278858147, "learning_rate": 4.727388896776942e-06, "loss": 0.387, "step": 7430 }, { "epoch": 0.5280996575160151, "grad_norm": 2.8626323560816296, "learning_rate": 4.720289649297175e-06, "loss": 0.3756, "step": 7440 }, { "epoch": 0.5288094688836442, "grad_norm": 7.38191434335366, "learning_rate": 4.713190401817408e-06, "loss": 0.3715, "step": 7450 }, { "epoch": 0.5295192802512733, "grad_norm": 3.187699709665762, "learning_rate": 4.7060911543376405e-06, "loss": 0.3763, "step": 7460 }, { "epoch": 0.5302290916189023, "grad_norm": 2.2423385405265366, "learning_rate": 4.698991906857874e-06, "loss": 0.367, "step": 7470 }, { "epoch": 0.5309389029865313, "grad_norm": 3.5525056364166465, "learning_rate": 4.691892659378106e-06, "loss": 0.3639, "step": 7480 }, { "epoch": 0.5316487143541604, "grad_norm": 2.5840538292895405, "learning_rate": 4.684793411898339e-06, "loss": 0.3713, "step": 7490 }, { "epoch": 0.5323585257217894, "grad_norm": 3.6015272776951366, "learning_rate": 4.677694164418572e-06, "loss": 0.3672, "step": 7500 }, { "epoch": 0.5330683370894185, "grad_norm": 2.958338857599813, "learning_rate": 4.670594916938805e-06, "loss": 0.364, "step": 7510 }, { "epoch": 0.5337781484570475, "grad_norm": 2.6780802400700248, "learning_rate": 4.663495669459037e-06, "loss": 0.3871, "step": 7520 }, { "epoch": 0.5344879598246766, "grad_norm": 2.141486624042336, "learning_rate": 4.6563964219792705e-06, "loss": 0.3918, "step": 7530 }, { "epoch": 0.5351977711923056, "grad_norm": 11.627725180923038, "learning_rate": 4.649297174499504e-06, "loss": 0.369, "step": 7540 }, { "epoch": 0.5359075825599346, "grad_norm": 2.164302320101156, "learning_rate": 4.642197927019737e-06, "loss": 0.3763, "step": 7550 }, { "epoch": 0.5366173939275638, "grad_norm": 2.5355641201406716, "learning_rate": 4.63509867953997e-06, "loss": 0.3709, "step": 7560 }, { "epoch": 0.5373272052951928, "grad_norm": 1.7486780225096559, "learning_rate": 4.627999432060202e-06, "loss": 0.3778, "step": 7570 }, { "epoch": 0.5380370166628219, "grad_norm": 2.1996857828607066, "learning_rate": 4.620900184580434e-06, "loss": 0.3878, "step": 7580 }, { "epoch": 0.5387468280304509, "grad_norm": 2.2718302971034325, "learning_rate": 4.613800937100667e-06, "loss": 0.3691, "step": 7590 }, { "epoch": 0.53945663939808, "grad_norm": 2.247788269458988, "learning_rate": 4.6067016896209005e-06, "loss": 0.3764, "step": 7600 }, { "epoch": 0.540166450765709, "grad_norm": 4.951241532022136, "learning_rate": 4.599602442141134e-06, "loss": 0.3696, "step": 7610 }, { "epoch": 0.5408762621333381, "grad_norm": 18.87723312065313, "learning_rate": 4.592503194661366e-06, "loss": 0.3752, "step": 7620 }, { "epoch": 0.5415860735009671, "grad_norm": 4.839150391451601, "learning_rate": 4.585403947181599e-06, "loss": 0.3704, "step": 7630 }, { "epoch": 0.5422958848685961, "grad_norm": 3.252448644894675, "learning_rate": 4.578304699701832e-06, "loss": 0.3662, "step": 7640 }, { "epoch": 0.5430056962362252, "grad_norm": 4.636061450249123, "learning_rate": 4.571205452222065e-06, "loss": 0.3695, "step": 7650 }, { "epoch": 0.5437155076038542, "grad_norm": 2.217398025384477, "learning_rate": 4.564106204742298e-06, "loss": 0.381, "step": 7660 }, { "epoch": 0.5444253189714833, "grad_norm": 3.2864797627789764, "learning_rate": 4.5570069572625305e-06, "loss": 0.3766, "step": 7670 }, { "epoch": 0.5451351303391124, "grad_norm": 2.5595280528292346, "learning_rate": 4.549907709782763e-06, "loss": 0.3753, "step": 7680 }, { "epoch": 0.5458449417067415, "grad_norm": 3.5869951931087356, "learning_rate": 4.542808462302996e-06, "loss": 0.3649, "step": 7690 }, { "epoch": 0.5465547530743705, "grad_norm": 2.878804286325741, "learning_rate": 4.535709214823229e-06, "loss": 0.365, "step": 7700 }, { "epoch": 0.5472645644419996, "grad_norm": 3.835428702840037, "learning_rate": 4.528609967343462e-06, "loss": 0.388, "step": 7710 }, { "epoch": 0.5479743758096286, "grad_norm": 3.3115804743584225, "learning_rate": 4.521510719863695e-06, "loss": 0.3498, "step": 7720 }, { "epoch": 0.5486841871772576, "grad_norm": 2.155325207710473, "learning_rate": 4.5144114723839275e-06, "loss": 0.3638, "step": 7730 }, { "epoch": 0.5493939985448867, "grad_norm": 2.9473064158817506, "learning_rate": 4.5073122249041606e-06, "loss": 0.3756, "step": 7740 }, { "epoch": 0.5501038099125157, "grad_norm": 3.977038197892431, "learning_rate": 4.500212977424394e-06, "loss": 0.3674, "step": 7750 }, { "epoch": 0.5508136212801448, "grad_norm": 5.638630944163406, "learning_rate": 4.493113729944627e-06, "loss": 0.3528, "step": 7760 }, { "epoch": 0.5515234326477738, "grad_norm": 2.8534926361264286, "learning_rate": 4.486014482464859e-06, "loss": 0.3697, "step": 7770 }, { "epoch": 0.5522332440154029, "grad_norm": 6.069502646886042, "learning_rate": 4.478915234985092e-06, "loss": 0.3843, "step": 7780 }, { "epoch": 0.552943055383032, "grad_norm": 4.343605351910854, "learning_rate": 4.471815987505324e-06, "loss": 0.3783, "step": 7790 }, { "epoch": 0.553652866750661, "grad_norm": 2.104465858436518, "learning_rate": 4.4647167400255575e-06, "loss": 0.3601, "step": 7800 }, { "epoch": 0.5543626781182901, "grad_norm": 3.0902122663518448, "learning_rate": 4.457617492545791e-06, "loss": 0.3801, "step": 7810 }, { "epoch": 0.5550724894859191, "grad_norm": 4.573352955842933, "learning_rate": 4.450518245066024e-06, "loss": 0.3835, "step": 7820 }, { "epoch": 0.5557823008535482, "grad_norm": 2.9707860507790924, "learning_rate": 4.443418997586256e-06, "loss": 0.3709, "step": 7830 }, { "epoch": 0.5564921122211772, "grad_norm": 2.5687241689417806, "learning_rate": 4.436319750106489e-06, "loss": 0.3835, "step": 7840 }, { "epoch": 0.5572019235888063, "grad_norm": 3.347322471582433, "learning_rate": 4.429220502626722e-06, "loss": 0.3735, "step": 7850 }, { "epoch": 0.5579117349564353, "grad_norm": 6.431823861299619, "learning_rate": 4.422121255146955e-06, "loss": 0.3704, "step": 7860 }, { "epoch": 0.5586215463240644, "grad_norm": 3.050115422109329, "learning_rate": 4.4150220076671875e-06, "loss": 0.3822, "step": 7870 }, { "epoch": 0.5593313576916934, "grad_norm": 1.7811591664189523, "learning_rate": 4.407922760187421e-06, "loss": 0.3658, "step": 7880 }, { "epoch": 0.5600411690593224, "grad_norm": 3.442846796158278, "learning_rate": 4.400823512707653e-06, "loss": 0.3621, "step": 7890 }, { "epoch": 0.5607509804269515, "grad_norm": 7.2461896738177, "learning_rate": 4.393724265227886e-06, "loss": 0.3526, "step": 7900 }, { "epoch": 0.5614607917945806, "grad_norm": 2.0219408065827875, "learning_rate": 4.386625017748119e-06, "loss": 0.3659, "step": 7910 }, { "epoch": 0.5621706031622097, "grad_norm": 4.896944413168855, "learning_rate": 4.379525770268352e-06, "loss": 0.3765, "step": 7920 }, { "epoch": 0.5628804145298387, "grad_norm": 2.1094695887191848, "learning_rate": 4.372426522788584e-06, "loss": 0.3644, "step": 7930 }, { "epoch": 0.5635902258974678, "grad_norm": 5.596991296221292, "learning_rate": 4.3653272753088175e-06, "loss": 0.3835, "step": 7940 }, { "epoch": 0.5643000372650968, "grad_norm": 2.373450501523087, "learning_rate": 4.358228027829051e-06, "loss": 0.3756, "step": 7950 }, { "epoch": 0.5650098486327259, "grad_norm": 4.1947432157390026, "learning_rate": 4.351128780349284e-06, "loss": 0.3787, "step": 7960 }, { "epoch": 0.5657196600003549, "grad_norm": 2.921985411820113, "learning_rate": 4.344029532869517e-06, "loss": 0.3746, "step": 7970 }, { "epoch": 0.5664294713679839, "grad_norm": 13.63904398617421, "learning_rate": 4.336930285389749e-06, "loss": 0.3535, "step": 7980 }, { "epoch": 0.567139282735613, "grad_norm": 2.6665592498045037, "learning_rate": 4.329831037909981e-06, "loss": 0.3668, "step": 7990 }, { "epoch": 0.567849094103242, "grad_norm": 2.7866449972058795, "learning_rate": 4.3227317904302144e-06, "loss": 0.3747, "step": 8000 }, { "epoch": 0.5685589054708711, "grad_norm": 2.795372211208224, "learning_rate": 4.3156325429504475e-06, "loss": 0.3737, "step": 8010 }, { "epoch": 0.5692687168385002, "grad_norm": 2.829992387736084, "learning_rate": 4.308533295470681e-06, "loss": 0.3813, "step": 8020 }, { "epoch": 0.5699785282061293, "grad_norm": 3.8835793195310706, "learning_rate": 4.301434047990914e-06, "loss": 0.3934, "step": 8030 }, { "epoch": 0.5706883395737583, "grad_norm": 2.157944880021205, "learning_rate": 4.294334800511146e-06, "loss": 0.3619, "step": 8040 }, { "epoch": 0.5713981509413874, "grad_norm": 2.576031100575868, "learning_rate": 4.287235553031379e-06, "loss": 0.3654, "step": 8050 }, { "epoch": 0.5721079623090164, "grad_norm": 2.1013120962560445, "learning_rate": 4.280136305551612e-06, "loss": 0.3808, "step": 8060 }, { "epoch": 0.5728177736766454, "grad_norm": 8.72915943640877, "learning_rate": 4.273037058071845e-06, "loss": 0.3865, "step": 8070 }, { "epoch": 0.5735275850442745, "grad_norm": 3.1373379205439123, "learning_rate": 4.2659378105920776e-06, "loss": 0.3631, "step": 8080 }, { "epoch": 0.5742373964119035, "grad_norm": 10.697527972561883, "learning_rate": 4.258838563112311e-06, "loss": 0.3597, "step": 8090 }, { "epoch": 0.5749472077795326, "grad_norm": 3.6970932139238095, "learning_rate": 4.251739315632543e-06, "loss": 0.3635, "step": 8100 }, { "epoch": 0.5756570191471616, "grad_norm": 2.4203467674630206, "learning_rate": 4.244640068152776e-06, "loss": 0.359, "step": 8110 }, { "epoch": 0.5763668305147907, "grad_norm": 2.9395692807103035, "learning_rate": 4.237540820673009e-06, "loss": 0.3603, "step": 8120 }, { "epoch": 0.5770766418824197, "grad_norm": 3.012599979258794, "learning_rate": 4.230441573193242e-06, "loss": 0.3568, "step": 8130 }, { "epoch": 0.5777864532500488, "grad_norm": 6.667370402568531, "learning_rate": 4.2233423257134745e-06, "loss": 0.3629, "step": 8140 }, { "epoch": 0.5784962646176779, "grad_norm": 4.471487834006219, "learning_rate": 4.2162430782337076e-06, "loss": 0.3683, "step": 8150 }, { "epoch": 0.5792060759853069, "grad_norm": 3.599804032694662, "learning_rate": 4.209143830753941e-06, "loss": 0.3554, "step": 8160 }, { "epoch": 0.579915887352936, "grad_norm": 2.9142466980850985, "learning_rate": 4.202044583274174e-06, "loss": 0.3524, "step": 8170 }, { "epoch": 0.580625698720565, "grad_norm": 3.8569199714753295, "learning_rate": 4.194945335794406e-06, "loss": 0.3663, "step": 8180 }, { "epoch": 0.5813355100881941, "grad_norm": 2.4068975949006077, "learning_rate": 4.187846088314639e-06, "loss": 0.3747, "step": 8190 }, { "epoch": 0.5820453214558231, "grad_norm": 6.174322801188514, "learning_rate": 4.180746840834871e-06, "loss": 0.372, "step": 8200 }, { "epoch": 0.5827551328234521, "grad_norm": 2.888969982284499, "learning_rate": 4.1736475933551045e-06, "loss": 0.361, "step": 8210 }, { "epoch": 0.5834649441910812, "grad_norm": 4.910093339119916, "learning_rate": 4.166548345875338e-06, "loss": 0.3574, "step": 8220 }, { "epoch": 0.5841747555587102, "grad_norm": 5.1058356496999755, "learning_rate": 4.159449098395571e-06, "loss": 0.3786, "step": 8230 }, { "epoch": 0.5848845669263393, "grad_norm": 14.081326767892058, "learning_rate": 4.152349850915803e-06, "loss": 0.3729, "step": 8240 }, { "epoch": 0.5855943782939684, "grad_norm": 4.958684438886047, "learning_rate": 4.145250603436036e-06, "loss": 0.3566, "step": 8250 }, { "epoch": 0.5863041896615975, "grad_norm": 3.9438637049329075, "learning_rate": 4.138151355956269e-06, "loss": 0.3861, "step": 8260 }, { "epoch": 0.5870140010292265, "grad_norm": 2.9499712942928107, "learning_rate": 4.131052108476502e-06, "loss": 0.3439, "step": 8270 }, { "epoch": 0.5877238123968556, "grad_norm": 3.332966504823502, "learning_rate": 4.1239528609967345e-06, "loss": 0.3788, "step": 8280 }, { "epoch": 0.5884336237644846, "grad_norm": 27.970854056782667, "learning_rate": 4.116853613516968e-06, "loss": 0.3591, "step": 8290 }, { "epoch": 0.5891434351321136, "grad_norm": 4.487327484061174, "learning_rate": 4.1097543660372e-06, "loss": 0.3625, "step": 8300 }, { "epoch": 0.5898532464997427, "grad_norm": 3.8006981727665496, "learning_rate": 4.102655118557433e-06, "loss": 0.3709, "step": 8310 }, { "epoch": 0.5905630578673717, "grad_norm": 3.463457513521014, "learning_rate": 4.095555871077666e-06, "loss": 0.3641, "step": 8320 }, { "epoch": 0.5912728692350008, "grad_norm": 7.640707242523127, "learning_rate": 4.088456623597899e-06, "loss": 0.3648, "step": 8330 }, { "epoch": 0.5919826806026298, "grad_norm": 2.8614936603096295, "learning_rate": 4.081357376118132e-06, "loss": 0.3616, "step": 8340 }, { "epoch": 0.5926924919702589, "grad_norm": 3.296737746561609, "learning_rate": 4.0742581286383645e-06, "loss": 0.3808, "step": 8350 }, { "epoch": 0.5934023033378879, "grad_norm": 3.2426352432246976, "learning_rate": 4.067158881158598e-06, "loss": 0.3583, "step": 8360 }, { "epoch": 0.5941121147055171, "grad_norm": 3.4522007032736806, "learning_rate": 4.060059633678831e-06, "loss": 0.365, "step": 8370 }, { "epoch": 0.5948219260731461, "grad_norm": 3.9166457660699145, "learning_rate": 4.052960386199063e-06, "loss": 0.3692, "step": 8380 }, { "epoch": 0.5955317374407751, "grad_norm": 2.9039677495535874, "learning_rate": 4.045861138719296e-06, "loss": 0.3468, "step": 8390 }, { "epoch": 0.5962415488084042, "grad_norm": 3.187977468656372, "learning_rate": 4.038761891239529e-06, "loss": 0.359, "step": 8400 }, { "epoch": 0.5969513601760332, "grad_norm": 4.529576318117622, "learning_rate": 4.0316626437597614e-06, "loss": 0.3452, "step": 8410 }, { "epoch": 0.5976611715436623, "grad_norm": 6.601726345536697, "learning_rate": 4.0245633962799945e-06, "loss": 0.3713, "step": 8420 }, { "epoch": 0.5983709829112913, "grad_norm": 2.4278158486667576, "learning_rate": 4.017464148800228e-06, "loss": 0.3628, "step": 8430 }, { "epoch": 0.5990807942789204, "grad_norm": 2.76630569189727, "learning_rate": 4.010364901320461e-06, "loss": 0.3704, "step": 8440 }, { "epoch": 0.5997906056465494, "grad_norm": 6.7843620715556545, "learning_rate": 4.003265653840693e-06, "loss": 0.3682, "step": 8450 }, { "epoch": 0.6005004170141784, "grad_norm": 2.9403338895288336, "learning_rate": 3.996166406360926e-06, "loss": 0.3608, "step": 8460 }, { "epoch": 0.6012102283818075, "grad_norm": 4.301178222098619, "learning_rate": 3.989067158881159e-06, "loss": 0.3595, "step": 8470 }, { "epoch": 0.6019200397494366, "grad_norm": 3.0914199152912696, "learning_rate": 3.981967911401392e-06, "loss": 0.3718, "step": 8480 }, { "epoch": 0.6026298511170657, "grad_norm": 2.753384437967004, "learning_rate": 3.9748686639216246e-06, "loss": 0.3672, "step": 8490 }, { "epoch": 0.6033396624846947, "grad_norm": 2.576321546323924, "learning_rate": 3.967769416441858e-06, "loss": 0.3706, "step": 8500 }, { "epoch": 0.6040494738523238, "grad_norm": 2.617904283815147, "learning_rate": 3.96067016896209e-06, "loss": 0.3539, "step": 8510 }, { "epoch": 0.6047592852199528, "grad_norm": 4.862875127190094, "learning_rate": 3.953570921482323e-06, "loss": 0.3763, "step": 8520 }, { "epoch": 0.6054690965875819, "grad_norm": 4.741023889550647, "learning_rate": 3.946471674002556e-06, "loss": 0.3611, "step": 8530 }, { "epoch": 0.6061789079552109, "grad_norm": 6.394478684199079, "learning_rate": 3.939372426522789e-06, "loss": 0.3615, "step": 8540 }, { "epoch": 0.6068887193228399, "grad_norm": 4.045100357410319, "learning_rate": 3.9322731790430215e-06, "loss": 0.3648, "step": 8550 }, { "epoch": 0.607598530690469, "grad_norm": 3.756852697194425, "learning_rate": 3.925173931563255e-06, "loss": 0.3689, "step": 8560 }, { "epoch": 0.608308342058098, "grad_norm": 4.04897373953826, "learning_rate": 3.918074684083488e-06, "loss": 0.3644, "step": 8570 }, { "epoch": 0.6090181534257271, "grad_norm": 4.036663207362448, "learning_rate": 3.910975436603721e-06, "loss": 0.366, "step": 8580 }, { "epoch": 0.6097279647933561, "grad_norm": 4.156260594948616, "learning_rate": 3.903876189123953e-06, "loss": 0.3554, "step": 8590 }, { "epoch": 0.6104377761609853, "grad_norm": 3.398605568980307, "learning_rate": 3.896776941644186e-06, "loss": 0.3717, "step": 8600 }, { "epoch": 0.6111475875286143, "grad_norm": 3.5114677948249065, "learning_rate": 3.889677694164418e-06, "loss": 0.3677, "step": 8610 }, { "epoch": 0.6118573988962434, "grad_norm": 4.753605099187553, "learning_rate": 3.8825784466846515e-06, "loss": 0.3547, "step": 8620 }, { "epoch": 0.6125672102638724, "grad_norm": 3.4243729659259334, "learning_rate": 3.875479199204885e-06, "loss": 0.3762, "step": 8630 }, { "epoch": 0.6132770216315014, "grad_norm": 5.94912381861312, "learning_rate": 3.868379951725118e-06, "loss": 0.359, "step": 8640 }, { "epoch": 0.6139868329991305, "grad_norm": 6.590267176028699, "learning_rate": 3.861280704245351e-06, "loss": 0.3758, "step": 8650 }, { "epoch": 0.6146966443667595, "grad_norm": 3.3256854782540497, "learning_rate": 3.854181456765583e-06, "loss": 0.3562, "step": 8660 }, { "epoch": 0.6154064557343886, "grad_norm": 3.6453120360212816, "learning_rate": 3.847082209285816e-06, "loss": 0.3619, "step": 8670 }, { "epoch": 0.6161162671020176, "grad_norm": 13.965716037023453, "learning_rate": 3.839982961806049e-06, "loss": 0.3646, "step": 8680 }, { "epoch": 0.6168260784696467, "grad_norm": 7.837860273774759, "learning_rate": 3.8328837143262815e-06, "loss": 0.3457, "step": 8690 }, { "epoch": 0.6175358898372757, "grad_norm": 4.729547574214101, "learning_rate": 3.825784466846515e-06, "loss": 0.3565, "step": 8700 }, { "epoch": 0.6182457012049049, "grad_norm": 2.5619385732076987, "learning_rate": 3.818685219366748e-06, "loss": 0.3676, "step": 8710 }, { "epoch": 0.6189555125725339, "grad_norm": 6.790019325573497, "learning_rate": 3.8115859718869804e-06, "loss": 0.3646, "step": 8720 }, { "epoch": 0.6196653239401629, "grad_norm": 3.3195434105048665, "learning_rate": 3.804486724407213e-06, "loss": 0.3575, "step": 8730 }, { "epoch": 0.620375135307792, "grad_norm": 3.805294873305076, "learning_rate": 3.797387476927446e-06, "loss": 0.3657, "step": 8740 }, { "epoch": 0.621084946675421, "grad_norm": 5.59682650769057, "learning_rate": 3.790288229447679e-06, "loss": 0.3609, "step": 8750 }, { "epoch": 0.6217947580430501, "grad_norm": 4.89958212672841, "learning_rate": 3.783188981967912e-06, "loss": 0.3669, "step": 8760 }, { "epoch": 0.6225045694106791, "grad_norm": 8.274929479843232, "learning_rate": 3.7760897344881446e-06, "loss": 0.3581, "step": 8770 }, { "epoch": 0.6232143807783082, "grad_norm": 3.2978821299433445, "learning_rate": 3.7689904870083777e-06, "loss": 0.3679, "step": 8780 }, { "epoch": 0.6239241921459372, "grad_norm": 12.435473632592815, "learning_rate": 3.76189123952861e-06, "loss": 0.3677, "step": 8790 }, { "epoch": 0.6246340035135662, "grad_norm": 4.195421567773733, "learning_rate": 3.754791992048843e-06, "loss": 0.3492, "step": 8800 }, { "epoch": 0.6253438148811953, "grad_norm": 4.406904963403177, "learning_rate": 3.7476927445690758e-06, "loss": 0.3597, "step": 8810 }, { "epoch": 0.6260536262488243, "grad_norm": 4.199730218503971, "learning_rate": 3.740593497089309e-06, "loss": 0.3797, "step": 8820 }, { "epoch": 0.6267634376164535, "grad_norm": 3.3446382282646705, "learning_rate": 3.7334942496095415e-06, "loss": 0.3638, "step": 8830 }, { "epoch": 0.6274732489840825, "grad_norm": 4.862585068251522, "learning_rate": 3.7263950021297747e-06, "loss": 0.3573, "step": 8840 }, { "epoch": 0.6281830603517116, "grad_norm": 8.107090011887513, "learning_rate": 3.7192957546500073e-06, "loss": 0.3672, "step": 8850 }, { "epoch": 0.6288928717193406, "grad_norm": 4.3962651782052005, "learning_rate": 3.7121965071702404e-06, "loss": 0.3412, "step": 8860 }, { "epoch": 0.6296026830869697, "grad_norm": 4.6424143973536935, "learning_rate": 3.705097259690473e-06, "loss": 0.3667, "step": 8870 }, { "epoch": 0.6303124944545987, "grad_norm": 3.840268427443435, "learning_rate": 3.697998012210706e-06, "loss": 0.3557, "step": 8880 }, { "epoch": 0.6310223058222277, "grad_norm": 3.6388205049600018, "learning_rate": 3.6908987647309385e-06, "loss": 0.3631, "step": 8890 }, { "epoch": 0.6317321171898568, "grad_norm": 5.233530712843461, "learning_rate": 3.6837995172511716e-06, "loss": 0.3648, "step": 8900 }, { "epoch": 0.6324419285574858, "grad_norm": 3.781452701492992, "learning_rate": 3.6767002697714042e-06, "loss": 0.3788, "step": 8910 }, { "epoch": 0.6331517399251149, "grad_norm": 6.068345043524154, "learning_rate": 3.6696010222916373e-06, "loss": 0.3566, "step": 8920 }, { "epoch": 0.6338615512927439, "grad_norm": 5.599734595118006, "learning_rate": 3.66250177481187e-06, "loss": 0.349, "step": 8930 }, { "epoch": 0.634571362660373, "grad_norm": 10.428150341049763, "learning_rate": 3.655402527332103e-06, "loss": 0.3584, "step": 8940 }, { "epoch": 0.6352811740280021, "grad_norm": 17.681698800577582, "learning_rate": 3.648303279852336e-06, "loss": 0.3458, "step": 8950 }, { "epoch": 0.6359909853956311, "grad_norm": 6.591627899287575, "learning_rate": 3.641204032372569e-06, "loss": 0.3643, "step": 8960 }, { "epoch": 0.6367007967632602, "grad_norm": 31.04186356298661, "learning_rate": 3.634104784892802e-06, "loss": 0.3577, "step": 8970 }, { "epoch": 0.6374106081308892, "grad_norm": 8.824274787999325, "learning_rate": 3.6270055374130347e-06, "loss": 0.3618, "step": 8980 }, { "epoch": 0.6381204194985183, "grad_norm": 4.7185603252655826, "learning_rate": 3.619906289933267e-06, "loss": 0.3598, "step": 8990 }, { "epoch": 0.6388302308661473, "grad_norm": 5.394376788444082, "learning_rate": 3.6128070424535e-06, "loss": 0.362, "step": 9000 }, { "epoch": 0.6395400422337764, "grad_norm": 7.158347387403476, "learning_rate": 3.6057077949737327e-06, "loss": 0.3694, "step": 9010 }, { "epoch": 0.6402498536014054, "grad_norm": 8.033101525768098, "learning_rate": 3.598608547493966e-06, "loss": 0.3626, "step": 9020 }, { "epoch": 0.6409596649690344, "grad_norm": 2.7105647455701667, "learning_rate": 3.591509300014199e-06, "loss": 0.3462, "step": 9030 }, { "epoch": 0.6416694763366635, "grad_norm": 6.3548259889750955, "learning_rate": 3.5844100525344316e-06, "loss": 0.3632, "step": 9040 }, { "epoch": 0.6423792877042925, "grad_norm": 7.341190059846113, "learning_rate": 3.5773108050546647e-06, "loss": 0.3653, "step": 9050 }, { "epoch": 0.6430890990719217, "grad_norm": 3.8869033025489723, "learning_rate": 3.5702115575748974e-06, "loss": 0.3412, "step": 9060 }, { "epoch": 0.6437989104395507, "grad_norm": 4.918908181105817, "learning_rate": 3.5631123100951305e-06, "loss": 0.3616, "step": 9070 }, { "epoch": 0.6445087218071798, "grad_norm": 6.124064792410853, "learning_rate": 3.556013062615363e-06, "loss": 0.3585, "step": 9080 }, { "epoch": 0.6452185331748088, "grad_norm": 3.6806357015000764, "learning_rate": 3.5489138151355963e-06, "loss": 0.3668, "step": 9090 }, { "epoch": 0.6459283445424379, "grad_norm": 5.193254667513745, "learning_rate": 3.5418145676558285e-06, "loss": 0.3669, "step": 9100 }, { "epoch": 0.6466381559100669, "grad_norm": 10.978524486328482, "learning_rate": 3.5347153201760616e-06, "loss": 0.3597, "step": 9110 }, { "epoch": 0.647347967277696, "grad_norm": 4.6611361687349175, "learning_rate": 3.5276160726962943e-06, "loss": 0.3695, "step": 9120 }, { "epoch": 0.648057778645325, "grad_norm": 5.205492428214056, "learning_rate": 3.5205168252165274e-06, "loss": 0.3663, "step": 9130 }, { "epoch": 0.648767590012954, "grad_norm": 5.139991204646184, "learning_rate": 3.51341757773676e-06, "loss": 0.3551, "step": 9140 }, { "epoch": 0.6494774013805831, "grad_norm": 16.35255401640736, "learning_rate": 3.506318330256993e-06, "loss": 0.3553, "step": 9150 }, { "epoch": 0.6501872127482121, "grad_norm": 10.145378264655722, "learning_rate": 3.499219082777226e-06, "loss": 0.3583, "step": 9160 }, { "epoch": 0.6508970241158412, "grad_norm": 24.878144093372033, "learning_rate": 3.492119835297459e-06, "loss": 0.3555, "step": 9170 }, { "epoch": 0.6516068354834703, "grad_norm": 3.902743241561423, "learning_rate": 3.4850205878176916e-06, "loss": 0.3723, "step": 9180 }, { "epoch": 0.6523166468510994, "grad_norm": 4.458085439514939, "learning_rate": 3.4779213403379247e-06, "loss": 0.3701, "step": 9190 }, { "epoch": 0.6530264582187284, "grad_norm": 4.717552266761064, "learning_rate": 3.470822092858157e-06, "loss": 0.3618, "step": 9200 }, { "epoch": 0.6537362695863574, "grad_norm": 4.427364622798698, "learning_rate": 3.46372284537839e-06, "loss": 0.3614, "step": 9210 }, { "epoch": 0.6544460809539865, "grad_norm": 8.323851654330221, "learning_rate": 3.4566235978986228e-06, "loss": 0.3678, "step": 9220 }, { "epoch": 0.6551558923216155, "grad_norm": 4.966094347637934, "learning_rate": 3.449524350418856e-06, "loss": 0.3688, "step": 9230 }, { "epoch": 0.6558657036892446, "grad_norm": 4.930577227679058, "learning_rate": 3.4424251029390886e-06, "loss": 0.3503, "step": 9240 }, { "epoch": 0.6565755150568736, "grad_norm": 5.52399635730182, "learning_rate": 3.4353258554593217e-06, "loss": 0.3696, "step": 9250 }, { "epoch": 0.6572853264245027, "grad_norm": 4.590670373221129, "learning_rate": 3.4282266079795543e-06, "loss": 0.3685, "step": 9260 }, { "epoch": 0.6579951377921317, "grad_norm": 8.264828163926657, "learning_rate": 3.4211273604997874e-06, "loss": 0.3575, "step": 9270 }, { "epoch": 0.6587049491597607, "grad_norm": 8.133262914973033, "learning_rate": 3.4140281130200205e-06, "loss": 0.3713, "step": 9280 }, { "epoch": 0.6594147605273899, "grad_norm": 5.742760195932282, "learning_rate": 3.4069288655402532e-06, "loss": 0.3725, "step": 9290 }, { "epoch": 0.6601245718950189, "grad_norm": 8.53035579823295, "learning_rate": 3.3998296180604855e-06, "loss": 0.3599, "step": 9300 }, { "epoch": 0.660834383262648, "grad_norm": 4.142002947123207, "learning_rate": 3.3927303705807186e-06, "loss": 0.3661, "step": 9310 }, { "epoch": 0.661544194630277, "grad_norm": 6.246166093324293, "learning_rate": 3.3856311231009513e-06, "loss": 0.351, "step": 9320 }, { "epoch": 0.6622540059979061, "grad_norm": 16.243950855343193, "learning_rate": 3.3785318756211844e-06, "loss": 0.3479, "step": 9330 }, { "epoch": 0.6629638173655351, "grad_norm": 6.147144910165458, "learning_rate": 3.3714326281414175e-06, "loss": 0.3543, "step": 9340 }, { "epoch": 0.6636736287331642, "grad_norm": 4.099934401177817, "learning_rate": 3.36433338066165e-06, "loss": 0.3636, "step": 9350 }, { "epoch": 0.6643834401007932, "grad_norm": 4.17019707869721, "learning_rate": 3.3572341331818832e-06, "loss": 0.351, "step": 9360 }, { "epoch": 0.6650932514684222, "grad_norm": 4.102146778496878, "learning_rate": 3.350134885702116e-06, "loss": 0.3737, "step": 9370 }, { "epoch": 0.6658030628360513, "grad_norm": 4.155164161456904, "learning_rate": 3.343035638222349e-06, "loss": 0.3505, "step": 9380 }, { "epoch": 0.6665128742036803, "grad_norm": 4.042739251178277, "learning_rate": 3.3359363907425817e-06, "loss": 0.3578, "step": 9390 }, { "epoch": 0.6672226855713094, "grad_norm": 3.4724621327513057, "learning_rate": 3.328837143262814e-06, "loss": 0.3733, "step": 9400 }, { "epoch": 0.6679324969389385, "grad_norm": 3.284294254497063, "learning_rate": 3.321737895783047e-06, "loss": 0.361, "step": 9410 }, { "epoch": 0.6686423083065676, "grad_norm": 5.224665667041366, "learning_rate": 3.31463864830328e-06, "loss": 0.3597, "step": 9420 }, { "epoch": 0.6693521196741966, "grad_norm": 13.317891191179472, "learning_rate": 3.307539400823513e-06, "loss": 0.36, "step": 9430 }, { "epoch": 0.6700619310418257, "grad_norm": 8.338179465785696, "learning_rate": 3.300440153343746e-06, "loss": 0.3708, "step": 9440 }, { "epoch": 0.6707717424094547, "grad_norm": 4.022884248031831, "learning_rate": 3.2933409058639786e-06, "loss": 0.357, "step": 9450 }, { "epoch": 0.6714815537770837, "grad_norm": 2.816929350582557, "learning_rate": 3.2862416583842117e-06, "loss": 0.3618, "step": 9460 }, { "epoch": 0.6721913651447128, "grad_norm": 3.2609706893982278, "learning_rate": 3.2791424109044444e-06, "loss": 0.3566, "step": 9470 }, { "epoch": 0.6729011765123418, "grad_norm": 2.0212043627509177, "learning_rate": 3.2720431634246775e-06, "loss": 0.3631, "step": 9480 }, { "epoch": 0.6736109878799709, "grad_norm": 3.472359881135022, "learning_rate": 3.26494391594491e-06, "loss": 0.3465, "step": 9490 }, { "epoch": 0.6743207992475999, "grad_norm": 2.365708920981696, "learning_rate": 3.257844668465143e-06, "loss": 0.36, "step": 9500 }, { "epoch": 0.675030610615229, "grad_norm": 6.47059083775482, "learning_rate": 3.2507454209853755e-06, "loss": 0.3589, "step": 9510 }, { "epoch": 0.6757404219828581, "grad_norm": 2.9761715896390872, "learning_rate": 3.2436461735056086e-06, "loss": 0.3737, "step": 9520 }, { "epoch": 0.6764502333504872, "grad_norm": 3.2920710102385375, "learning_rate": 3.2365469260258413e-06, "loss": 0.3631, "step": 9530 }, { "epoch": 0.6771600447181162, "grad_norm": 2.24517655258034, "learning_rate": 3.2294476785460744e-06, "loss": 0.3565, "step": 9540 }, { "epoch": 0.6778698560857452, "grad_norm": 4.585199424065417, "learning_rate": 3.222348431066307e-06, "loss": 0.3587, "step": 9550 }, { "epoch": 0.6785796674533743, "grad_norm": 2.616245813772314, "learning_rate": 3.21524918358654e-06, "loss": 0.3641, "step": 9560 }, { "epoch": 0.6792894788210033, "grad_norm": 6.790868775160296, "learning_rate": 3.208149936106773e-06, "loss": 0.3542, "step": 9570 }, { "epoch": 0.6799992901886324, "grad_norm": 4.6720875235574955, "learning_rate": 3.201050688627006e-06, "loss": 0.3724, "step": 9580 }, { "epoch": 0.6807091015562614, "grad_norm": 2.929891653919803, "learning_rate": 3.193951441147239e-06, "loss": 0.355, "step": 9590 }, { "epoch": 0.6814189129238905, "grad_norm": 2.5935885874594935, "learning_rate": 3.1868521936674717e-06, "loss": 0.3477, "step": 9600 }, { "epoch": 0.6821287242915195, "grad_norm": 4.16743323358689, "learning_rate": 3.179752946187704e-06, "loss": 0.3732, "step": 9610 }, { "epoch": 0.6828385356591485, "grad_norm": 3.119963047712144, "learning_rate": 3.172653698707937e-06, "loss": 0.3583, "step": 9620 }, { "epoch": 0.6835483470267776, "grad_norm": 4.025619816942283, "learning_rate": 3.1655544512281698e-06, "loss": 0.3814, "step": 9630 }, { "epoch": 0.6842581583944067, "grad_norm": 10.60216606667068, "learning_rate": 3.158455203748403e-06, "loss": 0.3599, "step": 9640 }, { "epoch": 0.6849679697620358, "grad_norm": 4.461108822226996, "learning_rate": 3.1513559562686356e-06, "loss": 0.3619, "step": 9650 }, { "epoch": 0.6856777811296648, "grad_norm": 2.7381838956818596, "learning_rate": 3.1442567087888687e-06, "loss": 0.361, "step": 9660 }, { "epoch": 0.6863875924972939, "grad_norm": 3.3932603213636536, "learning_rate": 3.1371574613091018e-06, "loss": 0.3722, "step": 9670 }, { "epoch": 0.6870974038649229, "grad_norm": 3.0238463961256556, "learning_rate": 3.1300582138293344e-06, "loss": 0.3677, "step": 9680 }, { "epoch": 0.687807215232552, "grad_norm": 2.9020326019536236, "learning_rate": 3.1229589663495675e-06, "loss": 0.3587, "step": 9690 }, { "epoch": 0.688517026600181, "grad_norm": 3.4182793620767313, "learning_rate": 3.1158597188698002e-06, "loss": 0.3958, "step": 9700 }, { "epoch": 0.68922683796781, "grad_norm": 2.7346693208831123, "learning_rate": 3.1087604713900325e-06, "loss": 0.3746, "step": 9710 }, { "epoch": 0.6899366493354391, "grad_norm": 2.7001110030197184, "learning_rate": 3.1016612239102656e-06, "loss": 0.3596, "step": 9720 }, { "epoch": 0.6906464607030681, "grad_norm": 3.8786526590857706, "learning_rate": 3.0945619764304987e-06, "loss": 0.3677, "step": 9730 }, { "epoch": 0.6913562720706972, "grad_norm": 3.601819125137747, "learning_rate": 3.0874627289507314e-06, "loss": 0.3599, "step": 9740 }, { "epoch": 0.6920660834383263, "grad_norm": 4.257577712986774, "learning_rate": 3.0803634814709645e-06, "loss": 0.3653, "step": 9750 }, { "epoch": 0.6927758948059554, "grad_norm": 16.2562479732823, "learning_rate": 3.073264233991197e-06, "loss": 0.3786, "step": 9760 }, { "epoch": 0.6934857061735844, "grad_norm": 2.8308341290836037, "learning_rate": 3.0661649865114302e-06, "loss": 0.347, "step": 9770 }, { "epoch": 0.6941955175412134, "grad_norm": 2.386467475595729, "learning_rate": 3.059065739031663e-06, "loss": 0.3785, "step": 9780 }, { "epoch": 0.6949053289088425, "grad_norm": 3.11594441686047, "learning_rate": 3.051966491551896e-06, "loss": 0.3613, "step": 9790 }, { "epoch": 0.6956151402764715, "grad_norm": 3.4457140851193677, "learning_rate": 3.0448672440721287e-06, "loss": 0.3592, "step": 9800 }, { "epoch": 0.6963249516441006, "grad_norm": 6.7733834909511135, "learning_rate": 3.0377679965923614e-06, "loss": 0.3503, "step": 9810 }, { "epoch": 0.6970347630117296, "grad_norm": 2.552293405448118, "learning_rate": 3.030668749112594e-06, "loss": 0.3565, "step": 9820 }, { "epoch": 0.6977445743793587, "grad_norm": 7.3573968999972985, "learning_rate": 3.023569501632827e-06, "loss": 0.3534, "step": 9830 }, { "epoch": 0.6984543857469877, "grad_norm": 2.2835556419626286, "learning_rate": 3.01647025415306e-06, "loss": 0.3627, "step": 9840 }, { "epoch": 0.6991641971146167, "grad_norm": 4.158935806681915, "learning_rate": 3.009371006673293e-06, "loss": 0.3676, "step": 9850 }, { "epoch": 0.6998740084822458, "grad_norm": 3.444386024390724, "learning_rate": 3.0022717591935256e-06, "loss": 0.3498, "step": 9860 }, { "epoch": 0.700583819849875, "grad_norm": 76.68033690471103, "learning_rate": 2.9951725117137587e-06, "loss": 0.3465, "step": 9870 }, { "epoch": 0.701293631217504, "grad_norm": 2.753848553217651, "learning_rate": 2.9880732642339914e-06, "loss": 0.3579, "step": 9880 }, { "epoch": 0.702003442585133, "grad_norm": 6.8770901385155465, "learning_rate": 2.9809740167542245e-06, "loss": 0.3644, "step": 9890 }, { "epoch": 0.7027132539527621, "grad_norm": 8.050770443325867, "learning_rate": 2.9738747692744576e-06, "loss": 0.3534, "step": 9900 }, { "epoch": 0.7034230653203911, "grad_norm": 6.2381173840397794, "learning_rate": 2.96677552179469e-06, "loss": 0.3799, "step": 9910 }, { "epoch": 0.7041328766880202, "grad_norm": 2.527197221067041, "learning_rate": 2.9596762743149225e-06, "loss": 0.3702, "step": 9920 }, { "epoch": 0.7048426880556492, "grad_norm": 3.365675129758323, "learning_rate": 2.9525770268351556e-06, "loss": 0.3618, "step": 9930 }, { "epoch": 0.7055524994232782, "grad_norm": 3.7307831294643323, "learning_rate": 2.9454777793553883e-06, "loss": 0.3552, "step": 9940 }, { "epoch": 0.7062623107909073, "grad_norm": 10.13055799757591, "learning_rate": 2.9383785318756214e-06, "loss": 0.369, "step": 9950 }, { "epoch": 0.7069721221585363, "grad_norm": 3.79159989826404, "learning_rate": 2.931279284395854e-06, "loss": 0.3393, "step": 9960 }, { "epoch": 0.7076819335261654, "grad_norm": 11.361319554472407, "learning_rate": 2.924180036916087e-06, "loss": 0.3726, "step": 9970 }, { "epoch": 0.7083917448937945, "grad_norm": 2.2727709813242, "learning_rate": 2.9170807894363203e-06, "loss": 0.3558, "step": 9980 }, { "epoch": 0.7091015562614236, "grad_norm": 13.54783288221351, "learning_rate": 2.909981541956553e-06, "loss": 0.3522, "step": 9990 }, { "epoch": 0.7098113676290526, "grad_norm": 3.4738198913190037, "learning_rate": 2.902882294476786e-06, "loss": 0.3636, "step": 10000 }, { "epoch": 0.7105211789966817, "grad_norm": 2.599196507580769, "learning_rate": 2.8957830469970183e-06, "loss": 0.373, "step": 10010 }, { "epoch": 0.7112309903643107, "grad_norm": 4.846340487255633, "learning_rate": 2.888683799517251e-06, "loss": 0.364, "step": 10020 }, { "epoch": 0.7119408017319397, "grad_norm": 4.14481835106229, "learning_rate": 2.881584552037484e-06, "loss": 0.3565, "step": 10030 }, { "epoch": 0.7126506130995688, "grad_norm": 3.12959687042078, "learning_rate": 2.8744853045577172e-06, "loss": 0.3597, "step": 10040 }, { "epoch": 0.7133604244671978, "grad_norm": 2.0499607045489157, "learning_rate": 2.86738605707795e-06, "loss": 0.3665, "step": 10050 }, { "epoch": 0.7140702358348269, "grad_norm": 3.4345739303394964, "learning_rate": 2.860286809598183e-06, "loss": 0.3406, "step": 10060 }, { "epoch": 0.7147800472024559, "grad_norm": 3.2507549549593677, "learning_rate": 2.8531875621184157e-06, "loss": 0.3691, "step": 10070 }, { "epoch": 0.715489858570085, "grad_norm": 3.088999571380729, "learning_rate": 2.8460883146386488e-06, "loss": 0.3512, "step": 10080 }, { "epoch": 0.716199669937714, "grad_norm": 3.992697102415428, "learning_rate": 2.8389890671588815e-06, "loss": 0.3584, "step": 10090 }, { "epoch": 0.7169094813053432, "grad_norm": 8.327520697203159, "learning_rate": 2.8318898196791146e-06, "loss": 0.3604, "step": 10100 }, { "epoch": 0.7176192926729722, "grad_norm": 4.600972082353797, "learning_rate": 2.824790572199347e-06, "loss": 0.3641, "step": 10110 }, { "epoch": 0.7183291040406012, "grad_norm": 3.6403983429872384, "learning_rate": 2.81769132471958e-06, "loss": 0.3496, "step": 10120 }, { "epoch": 0.7190389154082303, "grad_norm": 2.831902492470625, "learning_rate": 2.8105920772398126e-06, "loss": 0.3611, "step": 10130 }, { "epoch": 0.7197487267758593, "grad_norm": 4.428260390842955, "learning_rate": 2.8034928297600457e-06, "loss": 0.3572, "step": 10140 }, { "epoch": 0.7204585381434884, "grad_norm": 5.5528766539260825, "learning_rate": 2.7963935822802784e-06, "loss": 0.3605, "step": 10150 }, { "epoch": 0.7211683495111174, "grad_norm": 3.3271150324051124, "learning_rate": 2.7892943348005115e-06, "loss": 0.3646, "step": 10160 }, { "epoch": 0.7218781608787465, "grad_norm": 4.353636452465487, "learning_rate": 2.782195087320744e-06, "loss": 0.3745, "step": 10170 }, { "epoch": 0.7225879722463755, "grad_norm": 4.938483709090633, "learning_rate": 2.7750958398409773e-06, "loss": 0.3586, "step": 10180 }, { "epoch": 0.7232977836140045, "grad_norm": 4.667393928494558, "learning_rate": 2.76799659236121e-06, "loss": 0.3526, "step": 10190 }, { "epoch": 0.7240075949816336, "grad_norm": 5.312814121573459, "learning_rate": 2.760897344881443e-06, "loss": 0.3539, "step": 10200 }, { "epoch": 0.7247174063492627, "grad_norm": 3.102848391211554, "learning_rate": 2.7537980974016757e-06, "loss": 0.3453, "step": 10210 }, { "epoch": 0.7254272177168918, "grad_norm": 3.036840145081599, "learning_rate": 2.7466988499219084e-06, "loss": 0.3627, "step": 10220 }, { "epoch": 0.7261370290845208, "grad_norm": 5.647990352632265, "learning_rate": 2.739599602442141e-06, "loss": 0.3555, "step": 10230 }, { "epoch": 0.7268468404521499, "grad_norm": 4.66342024342857, "learning_rate": 2.732500354962374e-06, "loss": 0.3722, "step": 10240 }, { "epoch": 0.7275566518197789, "grad_norm": 3.168307885423117, "learning_rate": 2.725401107482607e-06, "loss": 0.3673, "step": 10250 }, { "epoch": 0.728266463187408, "grad_norm": 4.968172759395676, "learning_rate": 2.71830186000284e-06, "loss": 0.3556, "step": 10260 }, { "epoch": 0.728976274555037, "grad_norm": 3.5154935991341123, "learning_rate": 2.7112026125230726e-06, "loss": 0.3593, "step": 10270 }, { "epoch": 0.729686085922666, "grad_norm": 5.0083468168620655, "learning_rate": 2.7041033650433057e-06, "loss": 0.3592, "step": 10280 }, { "epoch": 0.7303958972902951, "grad_norm": 3.379094612224907, "learning_rate": 2.697004117563539e-06, "loss": 0.3643, "step": 10290 }, { "epoch": 0.7311057086579241, "grad_norm": 4.180270451928424, "learning_rate": 2.6899048700837715e-06, "loss": 0.3574, "step": 10300 }, { "epoch": 0.7318155200255532, "grad_norm": 4.640198570927561, "learning_rate": 2.6828056226040046e-06, "loss": 0.3578, "step": 10310 }, { "epoch": 0.7325253313931822, "grad_norm": 10.365125402351024, "learning_rate": 2.675706375124237e-06, "loss": 0.3614, "step": 10320 }, { "epoch": 0.7332351427608114, "grad_norm": 15.355341780635097, "learning_rate": 2.6686071276444695e-06, "loss": 0.3631, "step": 10330 }, { "epoch": 0.7339449541284404, "grad_norm": 6.738981517513828, "learning_rate": 2.6615078801647026e-06, "loss": 0.3493, "step": 10340 }, { "epoch": 0.7346547654960695, "grad_norm": 7.55570609393924, "learning_rate": 2.6544086326849357e-06, "loss": 0.371, "step": 10350 }, { "epoch": 0.7353645768636985, "grad_norm": 2.6482961979611526, "learning_rate": 2.6473093852051684e-06, "loss": 0.3591, "step": 10360 }, { "epoch": 0.7360743882313275, "grad_norm": 8.054548870993123, "learning_rate": 2.6402101377254015e-06, "loss": 0.3577, "step": 10370 }, { "epoch": 0.7367841995989566, "grad_norm": 7.370207938746124, "learning_rate": 2.633110890245634e-06, "loss": 0.3509, "step": 10380 }, { "epoch": 0.7374940109665856, "grad_norm": 8.915363239178143, "learning_rate": 2.6260116427658673e-06, "loss": 0.3595, "step": 10390 }, { "epoch": 0.7382038223342147, "grad_norm": 6.453539668987391, "learning_rate": 2.6189123952861e-06, "loss": 0.3735, "step": 10400 }, { "epoch": 0.7389136337018437, "grad_norm": 13.429374820990935, "learning_rate": 2.611813147806333e-06, "loss": 0.343, "step": 10410 }, { "epoch": 0.7396234450694728, "grad_norm": 4.019465503184252, "learning_rate": 2.6047139003265653e-06, "loss": 0.3619, "step": 10420 }, { "epoch": 0.7403332564371018, "grad_norm": 4.77728942914678, "learning_rate": 2.5976146528467984e-06, "loss": 0.3602, "step": 10430 }, { "epoch": 0.7410430678047308, "grad_norm": 16.82021280745509, "learning_rate": 2.590515405367031e-06, "loss": 0.3765, "step": 10440 }, { "epoch": 0.74175287917236, "grad_norm": 4.7659520678895735, "learning_rate": 2.5834161578872642e-06, "loss": 0.3557, "step": 10450 }, { "epoch": 0.742462690539989, "grad_norm": 5.846901706253607, "learning_rate": 2.576316910407497e-06, "loss": 0.3574, "step": 10460 }, { "epoch": 0.7431725019076181, "grad_norm": 5.00717365628058, "learning_rate": 2.56921766292773e-06, "loss": 0.371, "step": 10470 }, { "epoch": 0.7438823132752471, "grad_norm": 12.812616706907704, "learning_rate": 2.5621184154479627e-06, "loss": 0.3612, "step": 10480 }, { "epoch": 0.7445921246428762, "grad_norm": 2.7312101929568375, "learning_rate": 2.5550191679681958e-06, "loss": 0.3551, "step": 10490 }, { "epoch": 0.7453019360105052, "grad_norm": 3.0759041075210782, "learning_rate": 2.5479199204884285e-06, "loss": 0.3574, "step": 10500 }, { "epoch": 0.7460117473781342, "grad_norm": 7.165278043719281, "learning_rate": 2.5408206730086616e-06, "loss": 0.3605, "step": 10510 }, { "epoch": 0.7467215587457633, "grad_norm": 4.908665990783306, "learning_rate": 2.533721425528894e-06, "loss": 0.3479, "step": 10520 }, { "epoch": 0.7474313701133923, "grad_norm": 3.4583261557450227, "learning_rate": 2.526622178049127e-06, "loss": 0.3542, "step": 10530 }, { "epoch": 0.7481411814810214, "grad_norm": 11.387458565670322, "learning_rate": 2.5195229305693596e-06, "loss": 0.3619, "step": 10540 }, { "epoch": 0.7488509928486504, "grad_norm": 10.198798372329442, "learning_rate": 2.5124236830895927e-06, "loss": 0.3434, "step": 10550 }, { "epoch": 0.7495608042162796, "grad_norm": 3.893599380410888, "learning_rate": 2.5053244356098254e-06, "loss": 0.362, "step": 10560 }, { "epoch": 0.7502706155839086, "grad_norm": 5.107597028464082, "learning_rate": 2.4982251881300585e-06, "loss": 0.3688, "step": 10570 }, { "epoch": 0.7509804269515377, "grad_norm": 4.219068583835792, "learning_rate": 2.491125940650291e-06, "loss": 0.3649, "step": 10580 }, { "epoch": 0.7516902383191667, "grad_norm": 4.535592066198855, "learning_rate": 2.4840266931705243e-06, "loss": 0.37, "step": 10590 }, { "epoch": 0.7524000496867957, "grad_norm": 3.541264339618074, "learning_rate": 2.476927445690757e-06, "loss": 0.3679, "step": 10600 }, { "epoch": 0.7531098610544248, "grad_norm": 4.7884449114332845, "learning_rate": 2.4698281982109896e-06, "loss": 0.3472, "step": 10610 }, { "epoch": 0.7538196724220538, "grad_norm": 8.667808097909838, "learning_rate": 2.4627289507312227e-06, "loss": 0.3704, "step": 10620 }, { "epoch": 0.7545294837896829, "grad_norm": 4.925434074834849, "learning_rate": 2.455629703251456e-06, "loss": 0.3701, "step": 10630 }, { "epoch": 0.7552392951573119, "grad_norm": 3.8594886335750807, "learning_rate": 2.4485304557716885e-06, "loss": 0.3662, "step": 10640 }, { "epoch": 0.755949106524941, "grad_norm": 4.971536391123703, "learning_rate": 2.441431208291921e-06, "loss": 0.35, "step": 10650 }, { "epoch": 0.75665891789257, "grad_norm": 15.055144352578429, "learning_rate": 2.434331960812154e-06, "loss": 0.3584, "step": 10660 }, { "epoch": 0.757368729260199, "grad_norm": 14.432076661811932, "learning_rate": 2.427232713332387e-06, "loss": 0.3621, "step": 10670 }, { "epoch": 0.7580785406278282, "grad_norm": 9.810669772230819, "learning_rate": 2.42013346585262e-06, "loss": 0.3588, "step": 10680 }, { "epoch": 0.7587883519954572, "grad_norm": 5.765479927608821, "learning_rate": 2.4130342183728527e-06, "loss": 0.3549, "step": 10690 }, { "epoch": 0.7594981633630863, "grad_norm": 13.617197754978974, "learning_rate": 2.4059349708930854e-06, "loss": 0.3759, "step": 10700 }, { "epoch": 0.7602079747307153, "grad_norm": 5.614482278416453, "learning_rate": 2.3988357234133185e-06, "loss": 0.3376, "step": 10710 }, { "epoch": 0.7609177860983444, "grad_norm": 17.701642596831444, "learning_rate": 2.391736475933551e-06, "loss": 0.3647, "step": 10720 }, { "epoch": 0.7616275974659734, "grad_norm": 4.910333781437824, "learning_rate": 2.3846372284537843e-06, "loss": 0.3643, "step": 10730 }, { "epoch": 0.7623374088336025, "grad_norm": 3.415309685272355, "learning_rate": 2.377537980974017e-06, "loss": 0.3488, "step": 10740 }, { "epoch": 0.7630472202012315, "grad_norm": 4.350903829153794, "learning_rate": 2.3704387334942497e-06, "loss": 0.3577, "step": 10750 }, { "epoch": 0.7637570315688605, "grad_norm": 3.9361079752185435, "learning_rate": 2.3633394860144828e-06, "loss": 0.3591, "step": 10760 }, { "epoch": 0.7644668429364896, "grad_norm": 5.913083445040196, "learning_rate": 2.3562402385347154e-06, "loss": 0.3486, "step": 10770 }, { "epoch": 0.7651766543041186, "grad_norm": 5.982161931863015, "learning_rate": 2.3491409910549485e-06, "loss": 0.3714, "step": 10780 }, { "epoch": 0.7658864656717478, "grad_norm": 4.5231254195655906, "learning_rate": 2.3420417435751812e-06, "loss": 0.3534, "step": 10790 }, { "epoch": 0.7665962770393768, "grad_norm": 5.099871081954513, "learning_rate": 2.334942496095414e-06, "loss": 0.3509, "step": 10800 }, { "epoch": 0.7673060884070059, "grad_norm": 3.361247181502804, "learning_rate": 2.327843248615647e-06, "loss": 0.3692, "step": 10810 }, { "epoch": 0.7680158997746349, "grad_norm": 6.553423618292367, "learning_rate": 2.3207440011358797e-06, "loss": 0.353, "step": 10820 }, { "epoch": 0.768725711142264, "grad_norm": 2.985537513367268, "learning_rate": 2.3136447536561128e-06, "loss": 0.3498, "step": 10830 }, { "epoch": 0.769435522509893, "grad_norm": 3.0266471519507427, "learning_rate": 2.3065455061763455e-06, "loss": 0.3563, "step": 10840 }, { "epoch": 0.770145333877522, "grad_norm": 17.644165005698888, "learning_rate": 2.299446258696578e-06, "loss": 0.3662, "step": 10850 }, { "epoch": 0.7708551452451511, "grad_norm": 3.1894412768611016, "learning_rate": 2.2923470112168112e-06, "loss": 0.3503, "step": 10860 }, { "epoch": 0.7715649566127801, "grad_norm": 4.492544324422795, "learning_rate": 2.285247763737044e-06, "loss": 0.3436, "step": 10870 }, { "epoch": 0.7722747679804092, "grad_norm": 4.173829674998731, "learning_rate": 2.278148516257277e-06, "loss": 0.363, "step": 10880 }, { "epoch": 0.7729845793480382, "grad_norm": 3.114718418646357, "learning_rate": 2.2710492687775097e-06, "loss": 0.3368, "step": 10890 }, { "epoch": 0.7736943907156673, "grad_norm": 2.6323429503484443, "learning_rate": 2.2639500212977424e-06, "loss": 0.3489, "step": 10900 }, { "epoch": 0.7744042020832964, "grad_norm": 2.8865277064459223, "learning_rate": 2.2568507738179755e-06, "loss": 0.3571, "step": 10910 }, { "epoch": 0.7751140134509255, "grad_norm": 8.888602826244627, "learning_rate": 2.249751526338208e-06, "loss": 0.3399, "step": 10920 }, { "epoch": 0.7758238248185545, "grad_norm": 3.532724353902858, "learning_rate": 2.2426522788584412e-06, "loss": 0.3493, "step": 10930 }, { "epoch": 0.7765336361861835, "grad_norm": 3.6781547439101883, "learning_rate": 2.235553031378674e-06, "loss": 0.3462, "step": 10940 }, { "epoch": 0.7772434475538126, "grad_norm": 13.16004359433701, "learning_rate": 2.2284537838989066e-06, "loss": 0.3649, "step": 10950 }, { "epoch": 0.7779532589214416, "grad_norm": 9.642968589987298, "learning_rate": 2.2213545364191397e-06, "loss": 0.3582, "step": 10960 }, { "epoch": 0.7786630702890707, "grad_norm": 6.16050392324128, "learning_rate": 2.2142552889393724e-06, "loss": 0.3624, "step": 10970 }, { "epoch": 0.7793728816566997, "grad_norm": 4.012346442724565, "learning_rate": 2.2071560414596055e-06, "loss": 0.3448, "step": 10980 }, { "epoch": 0.7800826930243288, "grad_norm": 2.6066193255622956, "learning_rate": 2.2000567939798386e-06, "loss": 0.3644, "step": 10990 }, { "epoch": 0.7807925043919578, "grad_norm": 7.331639609512875, "learning_rate": 2.1929575465000713e-06, "loss": 0.3515, "step": 11000 }, { "epoch": 0.7815023157595868, "grad_norm": 2.990816174000455, "learning_rate": 2.185858299020304e-06, "loss": 0.3505, "step": 11010 }, { "epoch": 0.782212127127216, "grad_norm": 3.6112792490950554, "learning_rate": 2.178759051540537e-06, "loss": 0.3548, "step": 11020 }, { "epoch": 0.782921938494845, "grad_norm": 3.8221043132066286, "learning_rate": 2.1716598040607697e-06, "loss": 0.3571, "step": 11030 }, { "epoch": 0.7836317498624741, "grad_norm": 7.476265982563856, "learning_rate": 2.164560556581003e-06, "loss": 0.3428, "step": 11040 }, { "epoch": 0.7843415612301031, "grad_norm": 5.554911455235443, "learning_rate": 2.1574613091012355e-06, "loss": 0.354, "step": 11050 }, { "epoch": 0.7850513725977322, "grad_norm": 2.9298081851011117, "learning_rate": 2.150362061621468e-06, "loss": 0.3597, "step": 11060 }, { "epoch": 0.7857611839653612, "grad_norm": 5.325097733237352, "learning_rate": 2.1432628141417013e-06, "loss": 0.3486, "step": 11070 }, { "epoch": 0.7864709953329903, "grad_norm": 3.5814394523109114, "learning_rate": 2.136163566661934e-06, "loss": 0.3544, "step": 11080 }, { "epoch": 0.7871808067006193, "grad_norm": 3.6972554376986, "learning_rate": 2.129064319182167e-06, "loss": 0.3546, "step": 11090 }, { "epoch": 0.7878906180682483, "grad_norm": 6.754098899246775, "learning_rate": 2.1219650717023997e-06, "loss": 0.3537, "step": 11100 }, { "epoch": 0.7886004294358774, "grad_norm": 3.3122898855719876, "learning_rate": 2.1148658242226324e-06, "loss": 0.3645, "step": 11110 }, { "epoch": 0.7893102408035064, "grad_norm": 2.8223728276754128, "learning_rate": 2.1077665767428655e-06, "loss": 0.3599, "step": 11120 }, { "epoch": 0.7900200521711355, "grad_norm": 2.5012481292133937, "learning_rate": 2.100667329263098e-06, "loss": 0.3486, "step": 11130 }, { "epoch": 0.7907298635387646, "grad_norm": 11.033197138630223, "learning_rate": 2.0935680817833313e-06, "loss": 0.3467, "step": 11140 }, { "epoch": 0.7914396749063937, "grad_norm": 3.730389968284293, "learning_rate": 2.086468834303564e-06, "loss": 0.3544, "step": 11150 }, { "epoch": 0.7921494862740227, "grad_norm": 5.898064410181565, "learning_rate": 2.0793695868237967e-06, "loss": 0.3477, "step": 11160 }, { "epoch": 0.7928592976416518, "grad_norm": 4.55198088261442, "learning_rate": 2.0722703393440298e-06, "loss": 0.3527, "step": 11170 }, { "epoch": 0.7935691090092808, "grad_norm": 5.318762071563834, "learning_rate": 2.0651710918642624e-06, "loss": 0.3478, "step": 11180 }, { "epoch": 0.7942789203769098, "grad_norm": 6.161214607463883, "learning_rate": 2.0580718443844955e-06, "loss": 0.3546, "step": 11190 }, { "epoch": 0.7949887317445389, "grad_norm": 3.1236830623318537, "learning_rate": 2.0509725969047282e-06, "loss": 0.3565, "step": 11200 }, { "epoch": 0.7956985431121679, "grad_norm": 4.197839999078878, "learning_rate": 2.043873349424961e-06, "loss": 0.3496, "step": 11210 }, { "epoch": 0.796408354479797, "grad_norm": 3.2762330861667515, "learning_rate": 2.036774101945194e-06, "loss": 0.348, "step": 11220 }, { "epoch": 0.797118165847426, "grad_norm": 5.961140258537488, "learning_rate": 2.0296748544654267e-06, "loss": 0.3637, "step": 11230 }, { "epoch": 0.797827977215055, "grad_norm": 2.0964322412177263, "learning_rate": 2.0225756069856598e-06, "loss": 0.341, "step": 11240 }, { "epoch": 0.7985377885826842, "grad_norm": 11.078753928620895, "learning_rate": 2.0154763595058925e-06, "loss": 0.3582, "step": 11250 }, { "epoch": 0.7992475999503132, "grad_norm": 11.615859636107096, "learning_rate": 2.008377112026125e-06, "loss": 0.3504, "step": 11260 }, { "epoch": 0.7999574113179423, "grad_norm": 9.267486623233392, "learning_rate": 2.0012778645463582e-06, "loss": 0.3585, "step": 11270 }, { "epoch": 0.8006672226855713, "grad_norm": 3.7638868565818613, "learning_rate": 1.994178617066591e-06, "loss": 0.3572, "step": 11280 }, { "epoch": 0.8013770340532004, "grad_norm": 4.274096264509613, "learning_rate": 1.987079369586824e-06, "loss": 0.352, "step": 11290 }, { "epoch": 0.8020868454208294, "grad_norm": 3.0651382288741824, "learning_rate": 1.979980122107057e-06, "loss": 0.3487, "step": 11300 }, { "epoch": 0.8027966567884585, "grad_norm": 2.585139354778811, "learning_rate": 1.9728808746272894e-06, "loss": 0.3509, "step": 11310 }, { "epoch": 0.8035064681560875, "grad_norm": 3.4507245702670013, "learning_rate": 1.9657816271475225e-06, "loss": 0.3605, "step": 11320 }, { "epoch": 0.8042162795237165, "grad_norm": 2.168473869134373, "learning_rate": 1.9586823796677556e-06, "loss": 0.3473, "step": 11330 }, { "epoch": 0.8049260908913456, "grad_norm": 3.3138804394827126, "learning_rate": 1.9515831321879883e-06, "loss": 0.3451, "step": 11340 }, { "epoch": 0.8056359022589746, "grad_norm": 2.9967871033094284, "learning_rate": 1.9444838847082214e-06, "loss": 0.3586, "step": 11350 }, { "epoch": 0.8063457136266037, "grad_norm": 2.218098420224771, "learning_rate": 1.9373846372284536e-06, "loss": 0.3629, "step": 11360 }, { "epoch": 0.8070555249942328, "grad_norm": 4.124703498173868, "learning_rate": 1.9302853897486867e-06, "loss": 0.349, "step": 11370 }, { "epoch": 0.8077653363618619, "grad_norm": 4.336301638014139, "learning_rate": 1.92318614226892e-06, "loss": 0.3474, "step": 11380 }, { "epoch": 0.8084751477294909, "grad_norm": 5.67446885361532, "learning_rate": 1.9160868947891525e-06, "loss": 0.3577, "step": 11390 }, { "epoch": 0.80918495909712, "grad_norm": 5.496735292829206, "learning_rate": 1.9089876473093856e-06, "loss": 0.3606, "step": 11400 }, { "epoch": 0.809894770464749, "grad_norm": 2.3181036706188505, "learning_rate": 1.901888399829618e-06, "loss": 0.3573, "step": 11410 }, { "epoch": 0.810604581832378, "grad_norm": 4.2823563842257695, "learning_rate": 1.894789152349851e-06, "loss": 0.3456, "step": 11420 }, { "epoch": 0.8113143932000071, "grad_norm": 9.041186743139388, "learning_rate": 1.8876899048700838e-06, "loss": 0.3493, "step": 11430 }, { "epoch": 0.8120242045676361, "grad_norm": 2.135565041402105, "learning_rate": 1.8805906573903167e-06, "loss": 0.3573, "step": 11440 }, { "epoch": 0.8127340159352652, "grad_norm": 4.2654812969837295, "learning_rate": 1.8734914099105498e-06, "loss": 0.3462, "step": 11450 }, { "epoch": 0.8134438273028942, "grad_norm": 3.0226693302416465, "learning_rate": 1.8663921624307823e-06, "loss": 0.3399, "step": 11460 }, { "epoch": 0.8141536386705233, "grad_norm": 5.674429424631266, "learning_rate": 1.8592929149510152e-06, "loss": 0.3445, "step": 11470 }, { "epoch": 0.8148634500381524, "grad_norm": 5.107735874370569, "learning_rate": 1.852193667471248e-06, "loss": 0.3498, "step": 11480 }, { "epoch": 0.8155732614057815, "grad_norm": 4.211595369240753, "learning_rate": 1.8450944199914812e-06, "loss": 0.3509, "step": 11490 }, { "epoch": 0.8162830727734105, "grad_norm": 3.2874196387814485, "learning_rate": 1.837995172511714e-06, "loss": 0.352, "step": 11500 }, { "epoch": 0.8169928841410395, "grad_norm": 2.51051446421893, "learning_rate": 1.8308959250319465e-06, "loss": 0.3445, "step": 11510 }, { "epoch": 0.8177026955086686, "grad_norm": 13.267874952448258, "learning_rate": 1.8237966775521796e-06, "loss": 0.354, "step": 11520 }, { "epoch": 0.8184125068762976, "grad_norm": 4.900767095828628, "learning_rate": 1.8166974300724125e-06, "loss": 0.3594, "step": 11530 }, { "epoch": 0.8191223182439267, "grad_norm": 8.3230418317363, "learning_rate": 1.8095981825926454e-06, "loss": 0.3471, "step": 11540 }, { "epoch": 0.8198321296115557, "grad_norm": 2.8346340256917815, "learning_rate": 1.8024989351128783e-06, "loss": 0.3695, "step": 11550 }, { "epoch": 0.8205419409791848, "grad_norm": 5.533189262204602, "learning_rate": 1.795399687633111e-06, "loss": 0.3728, "step": 11560 }, { "epoch": 0.8212517523468138, "grad_norm": 3.187071233846852, "learning_rate": 1.7883004401533439e-06, "loss": 0.3464, "step": 11570 }, { "epoch": 0.8219615637144428, "grad_norm": 3.9314257894883937, "learning_rate": 1.7812011926735768e-06, "loss": 0.3532, "step": 11580 }, { "epoch": 0.8226713750820719, "grad_norm": 3.6730541227348277, "learning_rate": 1.7741019451938097e-06, "loss": 0.3565, "step": 11590 }, { "epoch": 0.823381186449701, "grad_norm": 2.9136274666194306, "learning_rate": 1.7670026977140426e-06, "loss": 0.3603, "step": 11600 }, { "epoch": 0.8240909978173301, "grad_norm": 6.106992201577366, "learning_rate": 1.7599034502342754e-06, "loss": 0.3484, "step": 11610 }, { "epoch": 0.8248008091849591, "grad_norm": 4.230462903274037, "learning_rate": 1.7528042027545081e-06, "loss": 0.35, "step": 11620 }, { "epoch": 0.8255106205525882, "grad_norm": 3.376064932155992, "learning_rate": 1.745704955274741e-06, "loss": 0.35, "step": 11630 }, { "epoch": 0.8262204319202172, "grad_norm": 2.8424779046250612, "learning_rate": 1.738605707794974e-06, "loss": 0.3552, "step": 11640 }, { "epoch": 0.8269302432878463, "grad_norm": 3.6044824322491347, "learning_rate": 1.7315064603152068e-06, "loss": 0.3633, "step": 11650 }, { "epoch": 0.8276400546554753, "grad_norm": 3.3041226058016324, "learning_rate": 1.7244072128354397e-06, "loss": 0.3453, "step": 11660 }, { "epoch": 0.8283498660231043, "grad_norm": 3.461976575510189, "learning_rate": 1.7173079653556724e-06, "loss": 0.3607, "step": 11670 }, { "epoch": 0.8290596773907334, "grad_norm": 3.96624408516477, "learning_rate": 1.7102087178759052e-06, "loss": 0.3431, "step": 11680 }, { "epoch": 0.8297694887583624, "grad_norm": 10.446490548963004, "learning_rate": 1.7031094703961381e-06, "loss": 0.3518, "step": 11690 }, { "epoch": 0.8304793001259915, "grad_norm": 2.4894424633296888, "learning_rate": 1.696010222916371e-06, "loss": 0.3618, "step": 11700 }, { "epoch": 0.8311891114936206, "grad_norm": 3.7097939930537494, "learning_rate": 1.688910975436604e-06, "loss": 0.3577, "step": 11710 }, { "epoch": 0.8318989228612497, "grad_norm": 2.591589818986439, "learning_rate": 1.6818117279568366e-06, "loss": 0.3454, "step": 11720 }, { "epoch": 0.8326087342288787, "grad_norm": 3.0415000039562816, "learning_rate": 1.6747124804770695e-06, "loss": 0.3514, "step": 11730 }, { "epoch": 0.8333185455965078, "grad_norm": 3.185465708245909, "learning_rate": 1.6676132329973024e-06, "loss": 0.3437, "step": 11740 }, { "epoch": 0.8340283569641368, "grad_norm": 8.153250864972724, "learning_rate": 1.6605139855175353e-06, "loss": 0.3418, "step": 11750 }, { "epoch": 0.8347381683317658, "grad_norm": 17.15311701699765, "learning_rate": 1.6534147380377682e-06, "loss": 0.3533, "step": 11760 }, { "epoch": 0.8354479796993949, "grad_norm": 2.956498750624732, "learning_rate": 1.6463154905580008e-06, "loss": 0.3539, "step": 11770 }, { "epoch": 0.8361577910670239, "grad_norm": 5.182422880739596, "learning_rate": 1.6392162430782337e-06, "loss": 0.3543, "step": 11780 }, { "epoch": 0.836867602434653, "grad_norm": 5.245759433932608, "learning_rate": 1.6321169955984666e-06, "loss": 0.3506, "step": 11790 }, { "epoch": 0.837577413802282, "grad_norm": 2.8777113855306, "learning_rate": 1.6250177481186997e-06, "loss": 0.351, "step": 11800 }, { "epoch": 0.838287225169911, "grad_norm": 3.317900354948997, "learning_rate": 1.6179185006389326e-06, "loss": 0.3426, "step": 11810 }, { "epoch": 0.8389970365375401, "grad_norm": 2.7259998460321295, "learning_rate": 1.610819253159165e-06, "loss": 0.3416, "step": 11820 }, { "epoch": 0.8397068479051693, "grad_norm": 7.203501395811214, "learning_rate": 1.603720005679398e-06, "loss": 0.346, "step": 11830 }, { "epoch": 0.8404166592727983, "grad_norm": 3.5281319520469343, "learning_rate": 1.596620758199631e-06, "loss": 0.3415, "step": 11840 }, { "epoch": 0.8411264706404273, "grad_norm": 2.8068995456792085, "learning_rate": 1.589521510719864e-06, "loss": 0.3506, "step": 11850 }, { "epoch": 0.8418362820080564, "grad_norm": 5.8571413992691, "learning_rate": 1.5824222632400968e-06, "loss": 0.3492, "step": 11860 }, { "epoch": 0.8425460933756854, "grad_norm": 2.8473277239745625, "learning_rate": 1.5753230157603295e-06, "loss": 0.3464, "step": 11870 }, { "epoch": 0.8432559047433145, "grad_norm": 2.743001963303042, "learning_rate": 1.5682237682805624e-06, "loss": 0.3457, "step": 11880 }, { "epoch": 0.8439657161109435, "grad_norm": 10.213481491528695, "learning_rate": 1.5611245208007953e-06, "loss": 0.3578, "step": 11890 }, { "epoch": 0.8446755274785726, "grad_norm": 3.735755256117381, "learning_rate": 1.5540252733210282e-06, "loss": 0.3503, "step": 11900 }, { "epoch": 0.8453853388462016, "grad_norm": 4.459890794830131, "learning_rate": 1.546926025841261e-06, "loss": 0.3409, "step": 11910 }, { "epoch": 0.8460951502138306, "grad_norm": 4.8029617986261295, "learning_rate": 1.5398267783614938e-06, "loss": 0.3538, "step": 11920 }, { "epoch": 0.8468049615814597, "grad_norm": 7.056776646894436, "learning_rate": 1.5327275308817267e-06, "loss": 0.346, "step": 11930 }, { "epoch": 0.8475147729490888, "grad_norm": 7.364554673266408, "learning_rate": 1.5256282834019595e-06, "loss": 0.3478, "step": 11940 }, { "epoch": 0.8482245843167179, "grad_norm": 3.605377806044163, "learning_rate": 1.5185290359221924e-06, "loss": 0.3499, "step": 11950 }, { "epoch": 0.8489343956843469, "grad_norm": 2.452400869581193, "learning_rate": 1.5114297884424253e-06, "loss": 0.339, "step": 11960 }, { "epoch": 0.849644207051976, "grad_norm": 2.870621078183671, "learning_rate": 1.504330540962658e-06, "loss": 0.3441, "step": 11970 }, { "epoch": 0.850354018419605, "grad_norm": 4.473314694561015, "learning_rate": 1.4972312934828909e-06, "loss": 0.3559, "step": 11980 }, { "epoch": 0.851063829787234, "grad_norm": 5.114834992133615, "learning_rate": 1.4901320460031238e-06, "loss": 0.3541, "step": 11990 }, { "epoch": 0.8517736411548631, "grad_norm": 12.083657543428806, "learning_rate": 1.4830327985233567e-06, "loss": 0.358, "step": 12000 }, { "epoch": 0.8524834525224921, "grad_norm": 3.7361409384047923, "learning_rate": 1.4759335510435896e-06, "loss": 0.3395, "step": 12010 }, { "epoch": 0.8531932638901212, "grad_norm": 3.4424635097779657, "learning_rate": 1.4688343035638222e-06, "loss": 0.3593, "step": 12020 }, { "epoch": 0.8539030752577502, "grad_norm": 1.9645069008952134, "learning_rate": 1.4617350560840551e-06, "loss": 0.3508, "step": 12030 }, { "epoch": 0.8546128866253793, "grad_norm": 4.627652849790996, "learning_rate": 1.454635808604288e-06, "loss": 0.3408, "step": 12040 }, { "epoch": 0.8553226979930083, "grad_norm": 3.831924600437753, "learning_rate": 1.447536561124521e-06, "loss": 0.3487, "step": 12050 }, { "epoch": 0.8560325093606375, "grad_norm": 4.570169273747359, "learning_rate": 1.4404373136447538e-06, "loss": 0.3415, "step": 12060 }, { "epoch": 0.8567423207282665, "grad_norm": 4.6135182738223595, "learning_rate": 1.4333380661649865e-06, "loss": 0.3604, "step": 12070 }, { "epoch": 0.8574521320958955, "grad_norm": 4.751574062951781, "learning_rate": 1.4262388186852194e-06, "loss": 0.3636, "step": 12080 }, { "epoch": 0.8581619434635246, "grad_norm": 3.378379003665899, "learning_rate": 1.4191395712054523e-06, "loss": 0.3432, "step": 12090 }, { "epoch": 0.8588717548311536, "grad_norm": 16.540688675093385, "learning_rate": 1.4120403237256851e-06, "loss": 0.3396, "step": 12100 }, { "epoch": 0.8595815661987827, "grad_norm": 4.814104030359969, "learning_rate": 1.404941076245918e-06, "loss": 0.3461, "step": 12110 }, { "epoch": 0.8602913775664117, "grad_norm": 10.051601410520883, "learning_rate": 1.3978418287661507e-06, "loss": 0.3447, "step": 12120 }, { "epoch": 0.8610011889340408, "grad_norm": 2.642610961406552, "learning_rate": 1.3907425812863836e-06, "loss": 0.3361, "step": 12130 }, { "epoch": 0.8617110003016698, "grad_norm": 4.614329866790318, "learning_rate": 1.3836433338066165e-06, "loss": 0.3528, "step": 12140 }, { "epoch": 0.8624208116692988, "grad_norm": 5.744791519089807, "learning_rate": 1.3765440863268496e-06, "loss": 0.3607, "step": 12150 }, { "epoch": 0.8631306230369279, "grad_norm": 3.9315757108747618, "learning_rate": 1.3694448388470825e-06, "loss": 0.3598, "step": 12160 }, { "epoch": 0.8638404344045569, "grad_norm": 5.812032059514415, "learning_rate": 1.3623455913673154e-06, "loss": 0.3406, "step": 12170 }, { "epoch": 0.8645502457721861, "grad_norm": 3.1863830261887784, "learning_rate": 1.3552463438875478e-06, "loss": 0.3435, "step": 12180 }, { "epoch": 0.8652600571398151, "grad_norm": 3.164333810889643, "learning_rate": 1.348147096407781e-06, "loss": 0.3477, "step": 12190 }, { "epoch": 0.8659698685074442, "grad_norm": 4.132090281780686, "learning_rate": 1.3410478489280138e-06, "loss": 0.3476, "step": 12200 }, { "epoch": 0.8666796798750732, "grad_norm": 3.050674443165291, "learning_rate": 1.3339486014482467e-06, "loss": 0.3451, "step": 12210 }, { "epoch": 0.8673894912427023, "grad_norm": 5.9765372634611476, "learning_rate": 1.3268493539684796e-06, "loss": 0.3516, "step": 12220 }, { "epoch": 0.8680993026103313, "grad_norm": 10.801904177839997, "learning_rate": 1.3197501064887123e-06, "loss": 0.3525, "step": 12230 }, { "epoch": 0.8688091139779603, "grad_norm": 10.795290079471496, "learning_rate": 1.3126508590089452e-06, "loss": 0.3458, "step": 12240 }, { "epoch": 0.8695189253455894, "grad_norm": 5.185082480943749, "learning_rate": 1.305551611529178e-06, "loss": 0.3471, "step": 12250 }, { "epoch": 0.8702287367132184, "grad_norm": 5.967453058115287, "learning_rate": 1.298452364049411e-06, "loss": 0.3593, "step": 12260 }, { "epoch": 0.8709385480808475, "grad_norm": 2.9260514202439807, "learning_rate": 1.2913531165696439e-06, "loss": 0.3401, "step": 12270 }, { "epoch": 0.8716483594484765, "grad_norm": 3.5904246593138924, "learning_rate": 1.2842538690898765e-06, "loss": 0.3407, "step": 12280 }, { "epoch": 0.8723581708161057, "grad_norm": 5.983622275696177, "learning_rate": 1.2771546216101094e-06, "loss": 0.3453, "step": 12290 }, { "epoch": 0.8730679821837347, "grad_norm": 4.330501853746522, "learning_rate": 1.2700553741303423e-06, "loss": 0.3494, "step": 12300 }, { "epoch": 0.8737777935513638, "grad_norm": 3.642467957948953, "learning_rate": 1.2629561266505752e-06, "loss": 0.3458, "step": 12310 }, { "epoch": 0.8744876049189928, "grad_norm": 5.610238111701037, "learning_rate": 1.255856879170808e-06, "loss": 0.3533, "step": 12320 }, { "epoch": 0.8751974162866218, "grad_norm": 5.47126817738485, "learning_rate": 1.248757631691041e-06, "loss": 0.3685, "step": 12330 }, { "epoch": 0.8759072276542509, "grad_norm": 2.9438005039273953, "learning_rate": 1.2416583842112737e-06, "loss": 0.3325, "step": 12340 }, { "epoch": 0.8766170390218799, "grad_norm": 3.7896440417507415, "learning_rate": 1.2345591367315065e-06, "loss": 0.3445, "step": 12350 }, { "epoch": 0.877326850389509, "grad_norm": 5.754468251004695, "learning_rate": 1.2274598892517394e-06, "loss": 0.3374, "step": 12360 }, { "epoch": 0.878036661757138, "grad_norm": 4.267624406753751, "learning_rate": 1.2203606417719723e-06, "loss": 0.341, "step": 12370 }, { "epoch": 0.8787464731247671, "grad_norm": 3.1963277785921993, "learning_rate": 1.2132613942922052e-06, "loss": 0.3381, "step": 12380 }, { "epoch": 0.8794562844923961, "grad_norm": 6.653906616284059, "learning_rate": 1.206162146812438e-06, "loss": 0.3506, "step": 12390 }, { "epoch": 0.8801660958600251, "grad_norm": 3.897977105597471, "learning_rate": 1.1990628993326708e-06, "loss": 0.3475, "step": 12400 }, { "epoch": 0.8808759072276543, "grad_norm": 4.962651576299262, "learning_rate": 1.1919636518529037e-06, "loss": 0.349, "step": 12410 }, { "epoch": 0.8815857185952833, "grad_norm": 5.136741390825168, "learning_rate": 1.1848644043731366e-06, "loss": 0.3465, "step": 12420 }, { "epoch": 0.8822955299629124, "grad_norm": 4.445543310701251, "learning_rate": 1.1777651568933695e-06, "loss": 0.3548, "step": 12430 }, { "epoch": 0.8830053413305414, "grad_norm": 20.40372637998409, "learning_rate": 1.1706659094136021e-06, "loss": 0.3583, "step": 12440 }, { "epoch": 0.8837151526981705, "grad_norm": 3.982374880512643, "learning_rate": 1.163566661933835e-06, "loss": 0.3317, "step": 12450 }, { "epoch": 0.8844249640657995, "grad_norm": 32.55413999411799, "learning_rate": 1.156467414454068e-06, "loss": 0.3514, "step": 12460 }, { "epoch": 0.8851347754334286, "grad_norm": 5.420145750098025, "learning_rate": 1.1493681669743008e-06, "loss": 0.3318, "step": 12470 }, { "epoch": 0.8858445868010576, "grad_norm": 3.685854173880656, "learning_rate": 1.1422689194945337e-06, "loss": 0.3429, "step": 12480 }, { "epoch": 0.8865543981686866, "grad_norm": 4.6974765931702605, "learning_rate": 1.1351696720147664e-06, "loss": 0.357, "step": 12490 }, { "epoch": 0.8872642095363157, "grad_norm": 6.795504660900696, "learning_rate": 1.1280704245349995e-06, "loss": 0.3531, "step": 12500 }, { "epoch": 0.8879740209039447, "grad_norm": 4.927867549600845, "learning_rate": 1.1209711770552324e-06, "loss": 0.3647, "step": 12510 }, { "epoch": 0.8886838322715739, "grad_norm": 70.3319920713418, "learning_rate": 1.113871929575465e-06, "loss": 0.3481, "step": 12520 }, { "epoch": 0.8893936436392029, "grad_norm": 29.187269789239732, "learning_rate": 1.106772682095698e-06, "loss": 0.3487, "step": 12530 }, { "epoch": 0.890103455006832, "grad_norm": 2.619165987059257, "learning_rate": 1.0996734346159308e-06, "loss": 0.3557, "step": 12540 }, { "epoch": 0.890813266374461, "grad_norm": 5.724483375383932, "learning_rate": 1.0925741871361637e-06, "loss": 0.3587, "step": 12550 }, { "epoch": 0.89152307774209, "grad_norm": 4.2668973076468, "learning_rate": 1.0854749396563966e-06, "loss": 0.3462, "step": 12560 }, { "epoch": 0.8922328891097191, "grad_norm": 9.234745768295488, "learning_rate": 1.0783756921766293e-06, "loss": 0.3537, "step": 12570 }, { "epoch": 0.8929427004773481, "grad_norm": 3.665665785771113, "learning_rate": 1.0712764446968622e-06, "loss": 0.3643, "step": 12580 }, { "epoch": 0.8936525118449772, "grad_norm": 2.6258893539339656, "learning_rate": 1.064177197217095e-06, "loss": 0.3338, "step": 12590 }, { "epoch": 0.8943623232126062, "grad_norm": 3.154491930622594, "learning_rate": 1.057077949737328e-06, "loss": 0.3444, "step": 12600 }, { "epoch": 0.8950721345802353, "grad_norm": 7.836052713310002, "learning_rate": 1.0499787022575608e-06, "loss": 0.3628, "step": 12610 }, { "epoch": 0.8957819459478643, "grad_norm": 3.8943175763479996, "learning_rate": 1.0428794547777935e-06, "loss": 0.3403, "step": 12620 }, { "epoch": 0.8964917573154934, "grad_norm": 15.29553673398478, "learning_rate": 1.0357802072980264e-06, "loss": 0.3521, "step": 12630 }, { "epoch": 0.8972015686831225, "grad_norm": 4.442650541355824, "learning_rate": 1.0286809598182595e-06, "loss": 0.3342, "step": 12640 }, { "epoch": 0.8979113800507516, "grad_norm": 3.9047310665092247, "learning_rate": 1.0215817123384922e-06, "loss": 0.3427, "step": 12650 }, { "epoch": 0.8986211914183806, "grad_norm": 2.1332446352398544, "learning_rate": 1.014482464858725e-06, "loss": 0.349, "step": 12660 }, { "epoch": 0.8993310027860096, "grad_norm": 2.8714716164962923, "learning_rate": 1.0073832173789578e-06, "loss": 0.357, "step": 12670 }, { "epoch": 0.9000408141536387, "grad_norm": 5.513019742153847, "learning_rate": 1.0002839698991909e-06, "loss": 0.3404, "step": 12680 }, { "epoch": 0.9007506255212677, "grad_norm": 3.940129513886605, "learning_rate": 9.931847224194237e-07, "loss": 0.3637, "step": 12690 }, { "epoch": 0.9014604368888968, "grad_norm": 3.9515535744587256, "learning_rate": 9.860854749396564e-07, "loss": 0.3498, "step": 12700 }, { "epoch": 0.9021702482565258, "grad_norm": 3.0069372274862234, "learning_rate": 9.789862274598893e-07, "loss": 0.3398, "step": 12710 }, { "epoch": 0.9028800596241549, "grad_norm": 3.5043049442535072, "learning_rate": 9.718869799801222e-07, "loss": 0.339, "step": 12720 }, { "epoch": 0.9035898709917839, "grad_norm": 4.7818413498969825, "learning_rate": 9.64787732500355e-07, "loss": 0.3482, "step": 12730 }, { "epoch": 0.9042996823594129, "grad_norm": 2.9143937043517485, "learning_rate": 9.57688485020588e-07, "loss": 0.3289, "step": 12740 }, { "epoch": 0.9050094937270421, "grad_norm": 3.530470062388488, "learning_rate": 9.505892375408208e-07, "loss": 0.3406, "step": 12750 }, { "epoch": 0.9057193050946711, "grad_norm": 3.6289940943514245, "learning_rate": 9.434899900610537e-07, "loss": 0.343, "step": 12760 }, { "epoch": 0.9064291164623002, "grad_norm": 11.92232636233806, "learning_rate": 9.363907425812864e-07, "loss": 0.3538, "step": 12770 }, { "epoch": 0.9071389278299292, "grad_norm": 3.3864038291963787, "learning_rate": 9.292914951015193e-07, "loss": 0.3361, "step": 12780 }, { "epoch": 0.9078487391975583, "grad_norm": 4.345114007441839, "learning_rate": 9.221922476217522e-07, "loss": 0.3307, "step": 12790 }, { "epoch": 0.9085585505651873, "grad_norm": 3.2046183568204687, "learning_rate": 9.15093000141985e-07, "loss": 0.3467, "step": 12800 }, { "epoch": 0.9092683619328163, "grad_norm": 3.030859855481088, "learning_rate": 9.079937526622179e-07, "loss": 0.3467, "step": 12810 }, { "epoch": 0.9099781733004454, "grad_norm": 4.579582289306875, "learning_rate": 9.008945051824507e-07, "loss": 0.3232, "step": 12820 }, { "epoch": 0.9106879846680744, "grad_norm": 3.760749336756688, "learning_rate": 8.937952577026836e-07, "loss": 0.3467, "step": 12830 }, { "epoch": 0.9113977960357035, "grad_norm": 3.179418594295822, "learning_rate": 8.866960102229165e-07, "loss": 0.3473, "step": 12840 }, { "epoch": 0.9121076074033325, "grad_norm": 3.983021666456075, "learning_rate": 8.795967627431492e-07, "loss": 0.3587, "step": 12850 }, { "epoch": 0.9128174187709616, "grad_norm": 2.6025747411648243, "learning_rate": 8.724975152633821e-07, "loss": 0.3462, "step": 12860 }, { "epoch": 0.9135272301385907, "grad_norm": 4.3088037403974315, "learning_rate": 8.65398267783615e-07, "loss": 0.3428, "step": 12870 }, { "epoch": 0.9142370415062198, "grad_norm": 3.7771085521562644, "learning_rate": 8.582990203038478e-07, "loss": 0.3398, "step": 12880 }, { "epoch": 0.9149468528738488, "grad_norm": 2.5115102656996853, "learning_rate": 8.511997728240808e-07, "loss": 0.3419, "step": 12890 }, { "epoch": 0.9156566642414778, "grad_norm": 2.646423568943871, "learning_rate": 8.441005253443135e-07, "loss": 0.3326, "step": 12900 }, { "epoch": 0.9163664756091069, "grad_norm": 4.308215071259538, "learning_rate": 8.370012778645465e-07, "loss": 0.3383, "step": 12910 }, { "epoch": 0.9170762869767359, "grad_norm": 7.273858221430791, "learning_rate": 8.299020303847794e-07, "loss": 0.3411, "step": 12920 }, { "epoch": 0.917786098344365, "grad_norm": 3.1600055981634183, "learning_rate": 8.228027829050122e-07, "loss": 0.3577, "step": 12930 }, { "epoch": 0.918495909711994, "grad_norm": 6.08255963796338, "learning_rate": 8.15703535425245e-07, "loss": 0.3589, "step": 12940 }, { "epoch": 0.9192057210796231, "grad_norm": 4.397885394689723, "learning_rate": 8.086042879454778e-07, "loss": 0.3492, "step": 12950 }, { "epoch": 0.9199155324472521, "grad_norm": 227.99760672787355, "learning_rate": 8.015050404657107e-07, "loss": 0.3346, "step": 12960 }, { "epoch": 0.9206253438148811, "grad_norm": 2.2307237070418853, "learning_rate": 7.944057929859436e-07, "loss": 0.3441, "step": 12970 }, { "epoch": 0.9213351551825103, "grad_norm": 5.180228064847272, "learning_rate": 7.873065455061764e-07, "loss": 0.3465, "step": 12980 }, { "epoch": 0.9220449665501393, "grad_norm": 3.2003044967213836, "learning_rate": 7.802072980264093e-07, "loss": 0.3425, "step": 12990 }, { "epoch": 0.9227547779177684, "grad_norm": 2.734492726273123, "learning_rate": 7.731080505466421e-07, "loss": 0.3403, "step": 13000 }, { "epoch": 0.9234645892853974, "grad_norm": 2.825363146947483, "learning_rate": 7.66008803066875e-07, "loss": 0.3644, "step": 13010 }, { "epoch": 0.9241744006530265, "grad_norm": 6.94935444401322, "learning_rate": 7.589095555871078e-07, "loss": 0.3498, "step": 13020 }, { "epoch": 0.9248842120206555, "grad_norm": 2.8121909722558924, "learning_rate": 7.518103081073406e-07, "loss": 0.356, "step": 13030 }, { "epoch": 0.9255940233882846, "grad_norm": 2.7024231170054946, "learning_rate": 7.447110606275735e-07, "loss": 0.3415, "step": 13040 }, { "epoch": 0.9263038347559136, "grad_norm": 2.9617596087956195, "learning_rate": 7.376118131478063e-07, "loss": 0.3372, "step": 13050 }, { "epoch": 0.9270136461235426, "grad_norm": 42.5976926609076, "learning_rate": 7.305125656680392e-07, "loss": 0.3541, "step": 13060 }, { "epoch": 0.9277234574911717, "grad_norm": 3.769476187835692, "learning_rate": 7.234133181882722e-07, "loss": 0.3594, "step": 13070 }, { "epoch": 0.9284332688588007, "grad_norm": 3.749361674379726, "learning_rate": 7.163140707085049e-07, "loss": 0.3348, "step": 13080 }, { "epoch": 0.9291430802264298, "grad_norm": 2.5267280447133937, "learning_rate": 7.092148232287379e-07, "loss": 0.3579, "step": 13090 }, { "epoch": 0.9298528915940589, "grad_norm": 3.0968195473762097, "learning_rate": 7.021155757489707e-07, "loss": 0.3392, "step": 13100 }, { "epoch": 0.930562702961688, "grad_norm": 3.9129176862736674, "learning_rate": 6.950163282692035e-07, "loss": 0.3533, "step": 13110 }, { "epoch": 0.931272514329317, "grad_norm": 2.7485456874581122, "learning_rate": 6.879170807894364e-07, "loss": 0.3399, "step": 13120 }, { "epoch": 0.9319823256969461, "grad_norm": 4.769184944849367, "learning_rate": 6.808178333096692e-07, "loss": 0.3551, "step": 13130 }, { "epoch": 0.9326921370645751, "grad_norm": 2.8275717207772098, "learning_rate": 6.737185858299021e-07, "loss": 0.348, "step": 13140 }, { "epoch": 0.9334019484322041, "grad_norm": 2.1023857426151595, "learning_rate": 6.66619338350135e-07, "loss": 0.3381, "step": 13150 }, { "epoch": 0.9341117597998332, "grad_norm": 2.8745163990655125, "learning_rate": 6.595200908703678e-07, "loss": 0.3488, "step": 13160 }, { "epoch": 0.9348215711674622, "grad_norm": 3.97821451395574, "learning_rate": 6.524208433906007e-07, "loss": 0.349, "step": 13170 }, { "epoch": 0.9355313825350913, "grad_norm": 7.304369226663597, "learning_rate": 6.453215959108335e-07, "loss": 0.352, "step": 13180 }, { "epoch": 0.9362411939027203, "grad_norm": 4.654909122469299, "learning_rate": 6.382223484310663e-07, "loss": 0.3478, "step": 13190 }, { "epoch": 0.9369510052703494, "grad_norm": 3.4074758383445296, "learning_rate": 6.311231009512992e-07, "loss": 0.3265, "step": 13200 }, { "epoch": 0.9376608166379785, "grad_norm": 2.8891732151802687, "learning_rate": 6.24023853471532e-07, "loss": 0.342, "step": 13210 }, { "epoch": 0.9383706280056076, "grad_norm": 4.315712149288758, "learning_rate": 6.169246059917649e-07, "loss": 0.3542, "step": 13220 }, { "epoch": 0.9390804393732366, "grad_norm": 4.202849073092827, "learning_rate": 6.098253585119978e-07, "loss": 0.3464, "step": 13230 }, { "epoch": 0.9397902507408656, "grad_norm": 4.402135376104271, "learning_rate": 6.027261110322307e-07, "loss": 0.3493, "step": 13240 }, { "epoch": 0.9405000621084947, "grad_norm": 3.3375797449619804, "learning_rate": 5.956268635524635e-07, "loss": 0.3431, "step": 13250 }, { "epoch": 0.9412098734761237, "grad_norm": 2.58448811647569, "learning_rate": 5.885276160726964e-07, "loss": 0.3516, "step": 13260 }, { "epoch": 0.9419196848437528, "grad_norm": 3.1207357827554216, "learning_rate": 5.814283685929293e-07, "loss": 0.3469, "step": 13270 }, { "epoch": 0.9426294962113818, "grad_norm": 5.535335579042853, "learning_rate": 5.74329121113162e-07, "loss": 0.3411, "step": 13280 }, { "epoch": 0.9433393075790109, "grad_norm": 4.157192002051246, "learning_rate": 5.672298736333949e-07, "loss": 0.3357, "step": 13290 }, { "epoch": 0.9440491189466399, "grad_norm": 4.609541473632524, "learning_rate": 5.601306261536277e-07, "loss": 0.3297, "step": 13300 }, { "epoch": 0.9447589303142689, "grad_norm": 4.556290013887312, "learning_rate": 5.530313786738606e-07, "loss": 0.3268, "step": 13310 }, { "epoch": 0.945468741681898, "grad_norm": 4.334131807132338, "learning_rate": 5.459321311940935e-07, "loss": 0.3582, "step": 13320 }, { "epoch": 0.9461785530495271, "grad_norm": 4.733377355574472, "learning_rate": 5.388328837143264e-07, "loss": 0.3366, "step": 13330 }, { "epoch": 0.9468883644171562, "grad_norm": 6.762724277887754, "learning_rate": 5.317336362345592e-07, "loss": 0.345, "step": 13340 }, { "epoch": 0.9475981757847852, "grad_norm": 2.9705397730746634, "learning_rate": 5.246343887547921e-07, "loss": 0.3465, "step": 13350 }, { "epoch": 0.9483079871524143, "grad_norm": 3.195893348669726, "learning_rate": 5.175351412750249e-07, "loss": 0.3348, "step": 13360 }, { "epoch": 0.9490177985200433, "grad_norm": 7.323985518462735, "learning_rate": 5.104358937952577e-07, "loss": 0.3543, "step": 13370 }, { "epoch": 0.9497276098876724, "grad_norm": 2.799618403745627, "learning_rate": 5.033366463154906e-07, "loss": 0.3431, "step": 13380 }, { "epoch": 0.9504374212553014, "grad_norm": 2.7728876598155843, "learning_rate": 4.962373988357234e-07, "loss": 0.3249, "step": 13390 }, { "epoch": 0.9511472326229304, "grad_norm": 5.195465798306655, "learning_rate": 4.891381513559563e-07, "loss": 0.3413, "step": 13400 }, { "epoch": 0.9518570439905595, "grad_norm": 10.319650407110732, "learning_rate": 4.820389038761892e-07, "loss": 0.3289, "step": 13410 }, { "epoch": 0.9525668553581885, "grad_norm": 3.639550539774894, "learning_rate": 4.74939656396422e-07, "loss": 0.358, "step": 13420 }, { "epoch": 0.9532766667258176, "grad_norm": 3.005922518183922, "learning_rate": 4.6784040891665486e-07, "loss": 0.3483, "step": 13430 }, { "epoch": 0.9539864780934467, "grad_norm": 3.658172908229024, "learning_rate": 4.607411614368877e-07, "loss": 0.3503, "step": 13440 }, { "epoch": 0.9546962894610758, "grad_norm": 3.17836271977541, "learning_rate": 4.5364191395712053e-07, "loss": 0.32, "step": 13450 }, { "epoch": 0.9554061008287048, "grad_norm": 2.6050315565816513, "learning_rate": 4.465426664773535e-07, "loss": 0.336, "step": 13460 }, { "epoch": 0.9561159121963339, "grad_norm": 2.516963929561299, "learning_rate": 4.394434189975863e-07, "loss": 0.3461, "step": 13470 }, { "epoch": 0.9568257235639629, "grad_norm": 5.182889994348168, "learning_rate": 4.3234417151781915e-07, "loss": 0.3453, "step": 13480 }, { "epoch": 0.9575355349315919, "grad_norm": 2.2527308195923843, "learning_rate": 4.25244924038052e-07, "loss": 0.3394, "step": 13490 }, { "epoch": 0.958245346299221, "grad_norm": 5.702042483324615, "learning_rate": 4.181456765582848e-07, "loss": 0.3464, "step": 13500 }, { "epoch": 0.95895515766685, "grad_norm": 4.320082944510015, "learning_rate": 4.110464290785177e-07, "loss": 0.361, "step": 13510 }, { "epoch": 0.9596649690344791, "grad_norm": 2.7057123674561683, "learning_rate": 4.0394718159875055e-07, "loss": 0.3451, "step": 13520 }, { "epoch": 0.9603747804021081, "grad_norm": 6.179223629975322, "learning_rate": 3.968479341189834e-07, "loss": 0.3371, "step": 13530 }, { "epoch": 0.9610845917697372, "grad_norm": 2.5395758819730267, "learning_rate": 3.897486866392163e-07, "loss": 0.3587, "step": 13540 }, { "epoch": 0.9617944031373662, "grad_norm": 3.6526335466786835, "learning_rate": 3.8264943915944917e-07, "loss": 0.3439, "step": 13550 }, { "epoch": 0.9625042145049953, "grad_norm": 6.134974420857256, "learning_rate": 3.75550191679682e-07, "loss": 0.3413, "step": 13560 }, { "epoch": 0.9632140258726244, "grad_norm": 4.231152248304582, "learning_rate": 3.6845094419991484e-07, "loss": 0.3412, "step": 13570 }, { "epoch": 0.9639238372402534, "grad_norm": 19.9166049671889, "learning_rate": 3.613516967201477e-07, "loss": 0.3457, "step": 13580 }, { "epoch": 0.9646336486078825, "grad_norm": 3.0744926751867565, "learning_rate": 3.542524492403805e-07, "loss": 0.3501, "step": 13590 }, { "epoch": 0.9653434599755115, "grad_norm": 4.316210901775538, "learning_rate": 3.471532017606134e-07, "loss": 0.3391, "step": 13600 }, { "epoch": 0.9660532713431406, "grad_norm": 5.568442813862272, "learning_rate": 3.400539542808463e-07, "loss": 0.3571, "step": 13610 }, { "epoch": 0.9667630827107696, "grad_norm": 2.464997647373043, "learning_rate": 3.3295470680107913e-07, "loss": 0.3403, "step": 13620 }, { "epoch": 0.9674728940783986, "grad_norm": 9.203447351864554, "learning_rate": 3.2585545932131197e-07, "loss": 0.3372, "step": 13630 }, { "epoch": 0.9681827054460277, "grad_norm": 4.083574237624433, "learning_rate": 3.187562118415448e-07, "loss": 0.3523, "step": 13640 }, { "epoch": 0.9688925168136567, "grad_norm": 2.580899686505033, "learning_rate": 3.1165696436177764e-07, "loss": 0.3331, "step": 13650 }, { "epoch": 0.9696023281812858, "grad_norm": 4.461792584369479, "learning_rate": 3.0455771688201053e-07, "loss": 0.3436, "step": 13660 }, { "epoch": 0.9703121395489148, "grad_norm": 6.002729090963929, "learning_rate": 2.9745846940224337e-07, "loss": 0.3392, "step": 13670 }, { "epoch": 0.971021950916544, "grad_norm": 15.908649085501459, "learning_rate": 2.9035922192247626e-07, "loss": 0.3401, "step": 13680 }, { "epoch": 0.971731762284173, "grad_norm": 3.2548319133826875, "learning_rate": 2.832599744427091e-07, "loss": 0.3466, "step": 13690 }, { "epoch": 0.9724415736518021, "grad_norm": 2.810860141109629, "learning_rate": 2.76160726962942e-07, "loss": 0.3445, "step": 13700 }, { "epoch": 0.9731513850194311, "grad_norm": 5.404897398221347, "learning_rate": 2.690614794831748e-07, "loss": 0.3464, "step": 13710 }, { "epoch": 0.9738611963870601, "grad_norm": 3.07947902781157, "learning_rate": 2.6196223200340766e-07, "loss": 0.3295, "step": 13720 }, { "epoch": 0.9745710077546892, "grad_norm": 3.2905796500928814, "learning_rate": 2.548629845236405e-07, "loss": 0.3491, "step": 13730 }, { "epoch": 0.9752808191223182, "grad_norm": 4.431073995020802, "learning_rate": 2.4776373704387334e-07, "loss": 0.3483, "step": 13740 }, { "epoch": 0.9759906304899473, "grad_norm": 3.5179707782287166, "learning_rate": 2.406644895641062e-07, "loss": 0.3469, "step": 13750 }, { "epoch": 0.9767004418575763, "grad_norm": 4.221356923748856, "learning_rate": 2.3356524208433906e-07, "loss": 0.3343, "step": 13760 }, { "epoch": 0.9774102532252054, "grad_norm": 286.15418214313974, "learning_rate": 2.2646599460457195e-07, "loss": 0.3349, "step": 13770 }, { "epoch": 0.9781200645928344, "grad_norm": 3.4922335144175576, "learning_rate": 2.193667471248048e-07, "loss": 0.3485, "step": 13780 }, { "epoch": 0.9788298759604636, "grad_norm": 3.944308898398798, "learning_rate": 2.1226749964503763e-07, "loss": 0.3288, "step": 13790 }, { "epoch": 0.9795396873280926, "grad_norm": 3.16447581060814, "learning_rate": 2.0516825216527052e-07, "loss": 0.3435, "step": 13800 }, { "epoch": 0.9802494986957216, "grad_norm": 7.105988741131366, "learning_rate": 1.9806900468550335e-07, "loss": 0.342, "step": 13810 }, { "epoch": 0.9809593100633507, "grad_norm": 3.311616450653751, "learning_rate": 1.9096975720573622e-07, "loss": 0.365, "step": 13820 }, { "epoch": 0.9816691214309797, "grad_norm": 3.1283492138129128, "learning_rate": 1.8387050972596905e-07, "loss": 0.3497, "step": 13830 }, { "epoch": 0.9823789327986088, "grad_norm": 4.720800332800002, "learning_rate": 1.7677126224620194e-07, "loss": 0.3356, "step": 13840 }, { "epoch": 0.9830887441662378, "grad_norm": 5.755549723756511, "learning_rate": 1.6967201476643478e-07, "loss": 0.3534, "step": 13850 }, { "epoch": 0.9837985555338669, "grad_norm": 12.413957162417217, "learning_rate": 1.6257276728666762e-07, "loss": 0.3514, "step": 13860 }, { "epoch": 0.9845083669014959, "grad_norm": 3.7416649036415195, "learning_rate": 1.5547351980690048e-07, "loss": 0.3468, "step": 13870 }, { "epoch": 0.985218178269125, "grad_norm": 5.096087166471907, "learning_rate": 1.4837427232713335e-07, "loss": 0.3478, "step": 13880 }, { "epoch": 0.985927989636754, "grad_norm": 2.8643069595501847, "learning_rate": 1.4127502484736618e-07, "loss": 0.3307, "step": 13890 }, { "epoch": 0.986637801004383, "grad_norm": 4.161106542911394, "learning_rate": 1.3417577736759905e-07, "loss": 0.3451, "step": 13900 }, { "epoch": 0.9873476123720122, "grad_norm": 3.161705990477656, "learning_rate": 1.270765298878319e-07, "loss": 0.3389, "step": 13910 }, { "epoch": 0.9880574237396412, "grad_norm": 3.2196566259908637, "learning_rate": 1.1997728240806475e-07, "loss": 0.3508, "step": 13920 }, { "epoch": 0.9887672351072703, "grad_norm": 3.0061617959710403, "learning_rate": 1.1287803492829761e-07, "loss": 0.357, "step": 13930 }, { "epoch": 0.9894770464748993, "grad_norm": 7.195163761877952, "learning_rate": 1.0577878744853047e-07, "loss": 0.3344, "step": 13940 }, { "epoch": 0.9901868578425284, "grad_norm": 4.778295681909435, "learning_rate": 9.867953996876332e-08, "loss": 0.3404, "step": 13950 }, { "epoch": 0.9908966692101574, "grad_norm": 3.6751893575330072, "learning_rate": 9.158029248899617e-08, "loss": 0.3222, "step": 13960 }, { "epoch": 0.9916064805777864, "grad_norm": 6.066838850034421, "learning_rate": 8.448104500922902e-08, "loss": 0.3373, "step": 13970 }, { "epoch": 0.9923162919454155, "grad_norm": 5.8640066255244525, "learning_rate": 7.738179752946189e-08, "loss": 0.35, "step": 13980 }, { "epoch": 0.9930261033130445, "grad_norm": 4.063550481932921, "learning_rate": 7.028255004969474e-08, "loss": 0.3424, "step": 13990 }, { "epoch": 0.9937359146806736, "grad_norm": 6.923421784576789, "learning_rate": 6.31833025699276e-08, "loss": 0.3584, "step": 14000 }, { "epoch": 0.9944457260483026, "grad_norm": 4.621602275306591, "learning_rate": 5.6084055090160446e-08, "loss": 0.3381, "step": 14010 }, { "epoch": 0.9951555374159318, "grad_norm": 5.495946912076004, "learning_rate": 4.89848076103933e-08, "loss": 0.3557, "step": 14020 }, { "epoch": 0.9958653487835608, "grad_norm": 2.261874767912811, "learning_rate": 4.188556013062616e-08, "loss": 0.3346, "step": 14030 }, { "epoch": 0.9965751601511899, "grad_norm": 3.528699506394003, "learning_rate": 3.478631265085901e-08, "loss": 0.3284, "step": 14040 }, { "epoch": 0.9972849715188189, "grad_norm": 3.0483860239618314, "learning_rate": 2.7687065171091867e-08, "loss": 0.3341, "step": 14050 }, { "epoch": 0.9979947828864479, "grad_norm": 4.681194219809911, "learning_rate": 2.0587817691324724e-08, "loss": 0.333, "step": 14060 }, { "epoch": 0.998704594254077, "grad_norm": 5.802114485594721, "learning_rate": 1.3488570211557575e-08, "loss": 0.3457, "step": 14070 }, { "epoch": 0.999414405621706, "grad_norm": 2.8616716300198775, "learning_rate": 6.389322731790431e-09, "loss": 0.3398, "step": 14080 } ], "logging_steps": 10, "max_steps": 14088, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9975763395674112.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }