diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9889 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999822547158093, + "eval_steps": 500, + "global_step": 14088, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007098113676290526, + "grad_norm": 7.898312201157063, + "learning_rate": 9.995030526764163e-06, + "loss": 1.7908, + "step": 10 + }, + { + "epoch": 0.0014196227352581052, + "grad_norm": 5.391248052357186, + "learning_rate": 9.987931279284396e-06, + "loss": 0.7438, + "step": 20 + }, + { + "epoch": 0.002129434102887158, + "grad_norm": 5.37248606279888, + "learning_rate": 9.980832031804629e-06, + "loss": 0.6486, + "step": 30 + }, + { + "epoch": 0.0028392454705162104, + "grad_norm": 6.374151691345343, + "learning_rate": 9.973732784324862e-06, + "loss": 0.6267, + "step": 40 + }, + { + "epoch": 0.003549056838145263, + "grad_norm": 8.880583324938707, + "learning_rate": 9.966633536845095e-06, + "loss": 0.5867, + "step": 50 + }, + { + "epoch": 0.004258868205774316, + "grad_norm": 6.684640112932781, + "learning_rate": 9.959534289365328e-06, + "loss": 0.5473, + "step": 60 + }, + { + "epoch": 0.004968679573403368, + "grad_norm": 9.329177151842533, + "learning_rate": 9.952435041885561e-06, + "loss": 0.549, + "step": 70 + }, + { + "epoch": 0.005678490941032421, + "grad_norm": 4.2697386372932575, + "learning_rate": 9.945335794405794e-06, + "loss": 0.527, + "step": 80 + }, + { + "epoch": 0.006388302308661473, + "grad_norm": 4.981195314782428, + "learning_rate": 9.938236546926027e-06, + "loss": 0.5015, + "step": 90 + }, + { + "epoch": 0.007098113676290526, + "grad_norm": 3.5718526890722155, + "learning_rate": 9.931137299446259e-06, + "loss": 0.5148, + "step": 100 + }, + { + "epoch": 0.007807925043919578, + "grad_norm": 5.35860863602349, + "learning_rate": 9.924038051966492e-06, + "loss": 0.5116, + "step": 110 + }, + { + "epoch": 0.008517736411548632, + "grad_norm": 3.2550821239727434, + "learning_rate": 9.916938804486725e-06, + "loss": 0.4992, + "step": 120 + }, + { + "epoch": 0.009227547779177683, + "grad_norm": 3.4354498076448032, + "learning_rate": 9.909839557006958e-06, + "loss": 0.5088, + "step": 130 + }, + { + "epoch": 0.009937359146806737, + "grad_norm": 4.331679140736939, + "learning_rate": 9.902740309527191e-06, + "loss": 0.5139, + "step": 140 + }, + { + "epoch": 0.010647170514435788, + "grad_norm": 3.4337848966487265, + "learning_rate": 9.895641062047424e-06, + "loss": 0.5041, + "step": 150 + }, + { + "epoch": 0.011356981882064842, + "grad_norm": 8.243351710682422, + "learning_rate": 9.888541814567657e-06, + "loss": 0.5142, + "step": 160 + }, + { + "epoch": 0.012066793249693893, + "grad_norm": 4.091704188438657, + "learning_rate": 9.88144256708789e-06, + "loss": 0.4807, + "step": 170 + }, + { + "epoch": 0.012776604617322947, + "grad_norm": 21.564891334339755, + "learning_rate": 9.874343319608124e-06, + "loss": 0.5092, + "step": 180 + }, + { + "epoch": 0.013486415984951998, + "grad_norm": 3.1521060424258973, + "learning_rate": 9.867244072128355e-06, + "loss": 0.4787, + "step": 190 + }, + { + "epoch": 0.014196227352581052, + "grad_norm": 3.986481726421801, + "learning_rate": 9.860144824648588e-06, + "loss": 0.4827, + "step": 200 + }, + { + "epoch": 0.014906038720210105, + "grad_norm": 4.774263941683351, + "learning_rate": 9.853045577168821e-06, + "loss": 0.4775, + "step": 210 + }, + { + "epoch": 0.015615850087839157, + "grad_norm": 7.968327682274227, + "learning_rate": 9.845946329689053e-06, + "loss": 0.4716, + "step": 220 + }, + { + "epoch": 0.01632566145546821, + "grad_norm": 10.121205974855524, + "learning_rate": 9.838847082209286e-06, + "loss": 0.4969, + "step": 230 + }, + { + "epoch": 0.017035472823097263, + "grad_norm": 7.454679720256471, + "learning_rate": 9.831747834729519e-06, + "loss": 0.4923, + "step": 240 + }, + { + "epoch": 0.017745284190726313, + "grad_norm": 17.103084568275037, + "learning_rate": 9.824648587249752e-06, + "loss": 0.4701, + "step": 250 + }, + { + "epoch": 0.018455095558355367, + "grad_norm": 4.48293929960256, + "learning_rate": 9.817549339769985e-06, + "loss": 0.4734, + "step": 260 + }, + { + "epoch": 0.01916490692598442, + "grad_norm": 5.345114387506581, + "learning_rate": 9.810450092290218e-06, + "loss": 0.4894, + "step": 270 + }, + { + "epoch": 0.019874718293613473, + "grad_norm": 19.40561032433512, + "learning_rate": 9.803350844810451e-06, + "loss": 0.4791, + "step": 280 + }, + { + "epoch": 0.020584529661242523, + "grad_norm": 14.25299022016476, + "learning_rate": 9.796251597330684e-06, + "loss": 0.4699, + "step": 290 + }, + { + "epoch": 0.021294341028871577, + "grad_norm": 8.257072932675099, + "learning_rate": 9.789152349850918e-06, + "loss": 0.4712, + "step": 300 + }, + { + "epoch": 0.02200415239650063, + "grad_norm": 7.954026403143938, + "learning_rate": 9.782053102371149e-06, + "loss": 0.4703, + "step": 310 + }, + { + "epoch": 0.022713963764129683, + "grad_norm": 11.392767049791958, + "learning_rate": 9.774953854891382e-06, + "loss": 0.4991, + "step": 320 + }, + { + "epoch": 0.023423775131758737, + "grad_norm": 3.6589701257251392, + "learning_rate": 9.767854607411615e-06, + "loss": 0.48, + "step": 330 + }, + { + "epoch": 0.024133586499387787, + "grad_norm": 2.8317614498971095, + "learning_rate": 9.760755359931848e-06, + "loss": 0.473, + "step": 340 + }, + { + "epoch": 0.02484339786701684, + "grad_norm": 3.3672884329345467, + "learning_rate": 9.753656112452081e-06, + "loss": 0.4807, + "step": 350 + }, + { + "epoch": 0.025553209234645893, + "grad_norm": 2.918860353664653, + "learning_rate": 9.746556864972314e-06, + "loss": 0.474, + "step": 360 + }, + { + "epoch": 0.026263020602274947, + "grad_norm": 3.985430160063577, + "learning_rate": 9.739457617492548e-06, + "loss": 0.4606, + "step": 370 + }, + { + "epoch": 0.026972831969903997, + "grad_norm": 3.8499162197950216, + "learning_rate": 9.73235837001278e-06, + "loss": 0.474, + "step": 380 + }, + { + "epoch": 0.02768264333753305, + "grad_norm": 2.955339700163119, + "learning_rate": 9.725259122533012e-06, + "loss": 0.472, + "step": 390 + }, + { + "epoch": 0.028392454705162103, + "grad_norm": 5.589731350821559, + "learning_rate": 9.718159875053245e-06, + "loss": 0.4698, + "step": 400 + }, + { + "epoch": 0.029102266072791157, + "grad_norm": 3.9824871173931973, + "learning_rate": 9.711060627573478e-06, + "loss": 0.4581, + "step": 410 + }, + { + "epoch": 0.02981207744042021, + "grad_norm": 2.524559409598369, + "learning_rate": 9.70396138009371e-06, + "loss": 0.4478, + "step": 420 + }, + { + "epoch": 0.03052188880804926, + "grad_norm": 2.970731368598553, + "learning_rate": 9.696862132613943e-06, + "loss": 0.4508, + "step": 430 + }, + { + "epoch": 0.031231700175678313, + "grad_norm": 2.893829595170148, + "learning_rate": 9.689762885134176e-06, + "loss": 0.4379, + "step": 440 + }, + { + "epoch": 0.03194151154330736, + "grad_norm": 22.795684932698137, + "learning_rate": 9.682663637654409e-06, + "loss": 0.4482, + "step": 450 + }, + { + "epoch": 0.03265132291093642, + "grad_norm": 3.2812945854632236, + "learning_rate": 9.675564390174642e-06, + "loss": 0.4599, + "step": 460 + }, + { + "epoch": 0.03336113427856547, + "grad_norm": 11.615453520589618, + "learning_rate": 9.668465142694875e-06, + "loss": 0.4417, + "step": 470 + }, + { + "epoch": 0.03407094564619453, + "grad_norm": 7.726986291359829, + "learning_rate": 9.661365895215108e-06, + "loss": 0.4594, + "step": 480 + }, + { + "epoch": 0.03478075701382358, + "grad_norm": 4.365039492938302, + "learning_rate": 9.654266647735341e-06, + "loss": 0.4669, + "step": 490 + }, + { + "epoch": 0.03549056838145263, + "grad_norm": 6.54988906481092, + "learning_rate": 9.647167400255574e-06, + "loss": 0.4567, + "step": 500 + }, + { + "epoch": 0.03620037974908168, + "grad_norm": 8.933278546995766, + "learning_rate": 9.640068152775806e-06, + "loss": 0.4519, + "step": 510 + }, + { + "epoch": 0.03691019111671073, + "grad_norm": 3.7761657369108907, + "learning_rate": 9.632968905296039e-06, + "loss": 0.4501, + "step": 520 + }, + { + "epoch": 0.03762000248433979, + "grad_norm": 3.9418116527565377, + "learning_rate": 9.625869657816272e-06, + "loss": 0.4561, + "step": 530 + }, + { + "epoch": 0.03832981385196884, + "grad_norm": 3.5489889583606438, + "learning_rate": 9.618770410336505e-06, + "loss": 0.4598, + "step": 540 + }, + { + "epoch": 0.03903962521959789, + "grad_norm": 3.5164230189602548, + "learning_rate": 9.611671162856738e-06, + "loss": 0.4717, + "step": 550 + }, + { + "epoch": 0.03974943658722695, + "grad_norm": 2.1822863392109206, + "learning_rate": 9.604571915376971e-06, + "loss": 0.48, + "step": 560 + }, + { + "epoch": 0.040459247954856, + "grad_norm": 2.5677413826305413, + "learning_rate": 9.597472667897204e-06, + "loss": 0.4605, + "step": 570 + }, + { + "epoch": 0.041169059322485047, + "grad_norm": 3.011759104822335, + "learning_rate": 9.590373420417438e-06, + "loss": 0.4605, + "step": 580 + }, + { + "epoch": 0.0418788706901141, + "grad_norm": 2.56502573080614, + "learning_rate": 9.58327417293767e-06, + "loss": 0.4494, + "step": 590 + }, + { + "epoch": 0.04258868205774315, + "grad_norm": 3.2396125278123806, + "learning_rate": 9.576174925457902e-06, + "loss": 0.4542, + "step": 600 + }, + { + "epoch": 0.04329849342537221, + "grad_norm": 3.480681910714182, + "learning_rate": 9.569075677978135e-06, + "loss": 0.4548, + "step": 610 + }, + { + "epoch": 0.04400830479300126, + "grad_norm": 2.623695100630613, + "learning_rate": 9.561976430498368e-06, + "loss": 0.4594, + "step": 620 + }, + { + "epoch": 0.04471811616063031, + "grad_norm": 3.042303011325611, + "learning_rate": 9.5548771830186e-06, + "loss": 0.4557, + "step": 630 + }, + { + "epoch": 0.04542792752825937, + "grad_norm": 2.8781600946277863, + "learning_rate": 9.547777935538833e-06, + "loss": 0.484, + "step": 640 + }, + { + "epoch": 0.04613773889588842, + "grad_norm": 3.3284195205047684, + "learning_rate": 9.540678688059066e-06, + "loss": 0.4481, + "step": 650 + }, + { + "epoch": 0.04684755026351747, + "grad_norm": 3.5159109068224987, + "learning_rate": 9.533579440579299e-06, + "loss": 0.4665, + "step": 660 + }, + { + "epoch": 0.04755736163114652, + "grad_norm": 6.322136362721481, + "learning_rate": 9.526480193099532e-06, + "loss": 0.4585, + "step": 670 + }, + { + "epoch": 0.04826717299877557, + "grad_norm": 21.902103769968996, + "learning_rate": 9.519380945619765e-06, + "loss": 0.4446, + "step": 680 + }, + { + "epoch": 0.04897698436640463, + "grad_norm": 3.6046359318609356, + "learning_rate": 9.512281698139998e-06, + "loss": 0.4519, + "step": 690 + }, + { + "epoch": 0.04968679573403368, + "grad_norm": 3.039690187186011, + "learning_rate": 9.505182450660231e-06, + "loss": 0.4448, + "step": 700 + }, + { + "epoch": 0.05039660710166273, + "grad_norm": 2.608964873836775, + "learning_rate": 9.498083203180465e-06, + "loss": 0.4486, + "step": 710 + }, + { + "epoch": 0.05110641846929179, + "grad_norm": 3.368889371027321, + "learning_rate": 9.490983955700696e-06, + "loss": 0.4617, + "step": 720 + }, + { + "epoch": 0.05181622983692084, + "grad_norm": 4.094036998235093, + "learning_rate": 9.483884708220929e-06, + "loss": 0.4569, + "step": 730 + }, + { + "epoch": 0.05252604120454989, + "grad_norm": 2.979892302450325, + "learning_rate": 9.476785460741162e-06, + "loss": 0.4645, + "step": 740 + }, + { + "epoch": 0.05323585257217894, + "grad_norm": 3.676607621277054, + "learning_rate": 9.469686213261395e-06, + "loss": 0.4407, + "step": 750 + }, + { + "epoch": 0.05394566393980799, + "grad_norm": 359.9140493382262, + "learning_rate": 9.462586965781628e-06, + "loss": 0.4262, + "step": 760 + }, + { + "epoch": 0.05465547530743705, + "grad_norm": 4.447118089247447, + "learning_rate": 9.455487718301861e-06, + "loss": 0.4344, + "step": 770 + }, + { + "epoch": 0.0553652866750661, + "grad_norm": 4.569754671227615, + "learning_rate": 9.448388470822095e-06, + "loss": 0.4462, + "step": 780 + }, + { + "epoch": 0.05607509804269516, + "grad_norm": 2.3728524211263067, + "learning_rate": 9.441289223342328e-06, + "loss": 0.4386, + "step": 790 + }, + { + "epoch": 0.05678490941032421, + "grad_norm": 2.5997362569615903, + "learning_rate": 9.434189975862559e-06, + "loss": 0.4537, + "step": 800 + }, + { + "epoch": 0.057494720777953257, + "grad_norm": 4.859327134293274, + "learning_rate": 9.427090728382792e-06, + "loss": 0.4514, + "step": 810 + }, + { + "epoch": 0.05820453214558231, + "grad_norm": 2.6304161060559905, + "learning_rate": 9.419991480903025e-06, + "loss": 0.4306, + "step": 820 + }, + { + "epoch": 0.05891434351321136, + "grad_norm": 3.504607730078166, + "learning_rate": 9.412892233423258e-06, + "loss": 0.454, + "step": 830 + }, + { + "epoch": 0.05962415488084042, + "grad_norm": 3.3227222733710864, + "learning_rate": 9.40579298594349e-06, + "loss": 0.4407, + "step": 840 + }, + { + "epoch": 0.06033396624846947, + "grad_norm": 3.328718377292454, + "learning_rate": 9.398693738463723e-06, + "loss": 0.4581, + "step": 850 + }, + { + "epoch": 0.06104377761609852, + "grad_norm": 3.4977954338913864, + "learning_rate": 9.391594490983956e-06, + "loss": 0.4284, + "step": 860 + }, + { + "epoch": 0.06175358898372758, + "grad_norm": 3.228432256709841, + "learning_rate": 9.384495243504189e-06, + "loss": 0.4373, + "step": 870 + }, + { + "epoch": 0.06246340035135663, + "grad_norm": 3.1586832054050964, + "learning_rate": 9.377395996024422e-06, + "loss": 0.4348, + "step": 880 + }, + { + "epoch": 0.06317321171898568, + "grad_norm": 13.155465477764636, + "learning_rate": 9.370296748544655e-06, + "loss": 0.4217, + "step": 890 + }, + { + "epoch": 0.06388302308661473, + "grad_norm": 15.543581430412525, + "learning_rate": 9.363197501064888e-06, + "loss": 0.4593, + "step": 900 + }, + { + "epoch": 0.06459283445424378, + "grad_norm": 8.921864061523843, + "learning_rate": 9.356098253585121e-06, + "loss": 0.4608, + "step": 910 + }, + { + "epoch": 0.06530264582187284, + "grad_norm": 5.3983003526617335, + "learning_rate": 9.348999006105353e-06, + "loss": 0.4514, + "step": 920 + }, + { + "epoch": 0.06601245718950188, + "grad_norm": 7.595139513838182, + "learning_rate": 9.341899758625586e-06, + "loss": 0.4273, + "step": 930 + }, + { + "epoch": 0.06672226855713094, + "grad_norm": 3.2331459925046815, + "learning_rate": 9.334800511145819e-06, + "loss": 0.422, + "step": 940 + }, + { + "epoch": 0.06743207992476, + "grad_norm": 3.8699272404865686, + "learning_rate": 9.327701263666052e-06, + "loss": 0.4477, + "step": 950 + }, + { + "epoch": 0.06814189129238905, + "grad_norm": 2.68446192265652, + "learning_rate": 9.320602016186285e-06, + "loss": 0.4449, + "step": 960 + }, + { + "epoch": 0.0688517026600181, + "grad_norm": 2.637260503772899, + "learning_rate": 9.313502768706518e-06, + "loss": 0.4532, + "step": 970 + }, + { + "epoch": 0.06956151402764715, + "grad_norm": 3.9618993923437085, + "learning_rate": 9.306403521226751e-06, + "loss": 0.4534, + "step": 980 + }, + { + "epoch": 0.07027132539527621, + "grad_norm": 3.429568261104227, + "learning_rate": 9.299304273746985e-06, + "loss": 0.452, + "step": 990 + }, + { + "epoch": 0.07098113676290525, + "grad_norm": 3.663179434126313, + "learning_rate": 9.292205026267218e-06, + "loss": 0.439, + "step": 1000 + }, + { + "epoch": 0.07169094813053431, + "grad_norm": 4.408975026773321, + "learning_rate": 9.285105778787449e-06, + "loss": 0.4184, + "step": 1010 + }, + { + "epoch": 0.07240075949816337, + "grad_norm": 2.415108601943808, + "learning_rate": 9.278006531307682e-06, + "loss": 0.4342, + "step": 1020 + }, + { + "epoch": 0.07311057086579241, + "grad_norm": 6.698239896408658, + "learning_rate": 9.270907283827915e-06, + "loss": 0.4535, + "step": 1030 + }, + { + "epoch": 0.07382038223342147, + "grad_norm": 11.189940656850219, + "learning_rate": 9.263808036348147e-06, + "loss": 0.4192, + "step": 1040 + }, + { + "epoch": 0.07453019360105052, + "grad_norm": 3.85625217339617, + "learning_rate": 9.25670878886838e-06, + "loss": 0.4278, + "step": 1050 + }, + { + "epoch": 0.07524000496867958, + "grad_norm": 32.21212360326382, + "learning_rate": 9.249609541388613e-06, + "loss": 0.4509, + "step": 1060 + }, + { + "epoch": 0.07594981633630862, + "grad_norm": 5.919396215012425, + "learning_rate": 9.242510293908846e-06, + "loss": 0.4525, + "step": 1070 + }, + { + "epoch": 0.07665962770393768, + "grad_norm": 5.904196801283348, + "learning_rate": 9.235411046429079e-06, + "loss": 0.4422, + "step": 1080 + }, + { + "epoch": 0.07736943907156674, + "grad_norm": 4.486326467883555, + "learning_rate": 9.228311798949312e-06, + "loss": 0.4685, + "step": 1090 + }, + { + "epoch": 0.07807925043919578, + "grad_norm": 11.745437972621287, + "learning_rate": 9.221212551469545e-06, + "loss": 0.4646, + "step": 1100 + }, + { + "epoch": 0.07878906180682484, + "grad_norm": 6.5181010077573145, + "learning_rate": 9.214113303989778e-06, + "loss": 0.443, + "step": 1110 + }, + { + "epoch": 0.0794988731744539, + "grad_norm": 11.270983163134655, + "learning_rate": 9.207014056510012e-06, + "loss": 0.4605, + "step": 1120 + }, + { + "epoch": 0.08020868454208294, + "grad_norm": 3.7069012881976975, + "learning_rate": 9.199914809030243e-06, + "loss": 0.4459, + "step": 1130 + }, + { + "epoch": 0.080918495909712, + "grad_norm": 8.667969696855055, + "learning_rate": 9.192815561550476e-06, + "loss": 0.4556, + "step": 1140 + }, + { + "epoch": 0.08162830727734105, + "grad_norm": 7.559635091166787, + "learning_rate": 9.185716314070709e-06, + "loss": 0.4357, + "step": 1150 + }, + { + "epoch": 0.08233811864497009, + "grad_norm": 17.430750080762536, + "learning_rate": 9.178617066590942e-06, + "loss": 0.4301, + "step": 1160 + }, + { + "epoch": 0.08304793001259915, + "grad_norm": 4.351276343100192, + "learning_rate": 9.171517819111175e-06, + "loss": 0.4184, + "step": 1170 + }, + { + "epoch": 0.0837577413802282, + "grad_norm": 6.471581804191342, + "learning_rate": 9.164418571631408e-06, + "loss": 0.4516, + "step": 1180 + }, + { + "epoch": 0.08446755274785726, + "grad_norm": 4.3294841586504855, + "learning_rate": 9.157319324151642e-06, + "loss": 0.4211, + "step": 1190 + }, + { + "epoch": 0.0851773641154863, + "grad_norm": 4.385208474639979, + "learning_rate": 9.150220076671875e-06, + "loss": 0.4203, + "step": 1200 + }, + { + "epoch": 0.08588717548311536, + "grad_norm": 5.8972560031050065, + "learning_rate": 9.143120829192106e-06, + "loss": 0.4284, + "step": 1210 + }, + { + "epoch": 0.08659698685074442, + "grad_norm": 4.604861487503107, + "learning_rate": 9.136021581712339e-06, + "loss": 0.4277, + "step": 1220 + }, + { + "epoch": 0.08730679821837346, + "grad_norm": 4.321101106082931, + "learning_rate": 9.128922334232572e-06, + "loss": 0.4216, + "step": 1230 + }, + { + "epoch": 0.08801660958600252, + "grad_norm": 11.04681514927992, + "learning_rate": 9.121823086752805e-06, + "loss": 0.4181, + "step": 1240 + }, + { + "epoch": 0.08872642095363158, + "grad_norm": 4.31849841935359, + "learning_rate": 9.114723839273037e-06, + "loss": 0.4264, + "step": 1250 + }, + { + "epoch": 0.08943623232126062, + "grad_norm": 4.674845237449041, + "learning_rate": 9.10762459179327e-06, + "loss": 0.4281, + "step": 1260 + }, + { + "epoch": 0.09014604368888968, + "grad_norm": 3.447760098274006, + "learning_rate": 9.100525344313503e-06, + "loss": 0.4304, + "step": 1270 + }, + { + "epoch": 0.09085585505651873, + "grad_norm": 7.189274212443334, + "learning_rate": 9.093426096833736e-06, + "loss": 0.4252, + "step": 1280 + }, + { + "epoch": 0.09156566642414778, + "grad_norm": 19.69024332171456, + "learning_rate": 9.08632684935397e-06, + "loss": 0.4336, + "step": 1290 + }, + { + "epoch": 0.09227547779177683, + "grad_norm": 55.22992334000048, + "learning_rate": 9.079227601874202e-06, + "loss": 0.4256, + "step": 1300 + }, + { + "epoch": 0.09298528915940589, + "grad_norm": 5.066816349007046, + "learning_rate": 9.072128354394435e-06, + "loss": 0.407, + "step": 1310 + }, + { + "epoch": 0.09369510052703495, + "grad_norm": 6.37711035743208, + "learning_rate": 9.065029106914668e-06, + "loss": 0.4257, + "step": 1320 + }, + { + "epoch": 0.09440491189466399, + "grad_norm": 4.696335985596692, + "learning_rate": 9.057929859434902e-06, + "loss": 0.4188, + "step": 1330 + }, + { + "epoch": 0.09511472326229305, + "grad_norm": 15.82313293688476, + "learning_rate": 9.050830611955133e-06, + "loss": 0.433, + "step": 1340 + }, + { + "epoch": 0.0958245346299221, + "grad_norm": 5.692904308794704, + "learning_rate": 9.043731364475366e-06, + "loss": 0.4269, + "step": 1350 + }, + { + "epoch": 0.09653434599755115, + "grad_norm": 15.303387309564082, + "learning_rate": 9.0366321169956e-06, + "loss": 0.4174, + "step": 1360 + }, + { + "epoch": 0.0972441573651802, + "grad_norm": 3.9801928029461666, + "learning_rate": 9.029532869515832e-06, + "loss": 0.4132, + "step": 1370 + }, + { + "epoch": 0.09795396873280926, + "grad_norm": 3.523690216407914, + "learning_rate": 9.022433622036065e-06, + "loss": 0.4281, + "step": 1380 + }, + { + "epoch": 0.0986637801004383, + "grad_norm": 7.099888052775042, + "learning_rate": 9.015334374556298e-06, + "loss": 0.4431, + "step": 1390 + }, + { + "epoch": 0.09937359146806736, + "grad_norm": 6.538985360116972, + "learning_rate": 9.008235127076532e-06, + "loss": 0.4172, + "step": 1400 + }, + { + "epoch": 0.10008340283569642, + "grad_norm": 7.959800060910741, + "learning_rate": 9.001135879596763e-06, + "loss": 0.4243, + "step": 1410 + }, + { + "epoch": 0.10079321420332546, + "grad_norm": 8.790445771142394, + "learning_rate": 8.994036632116996e-06, + "loss": 0.4254, + "step": 1420 + }, + { + "epoch": 0.10150302557095452, + "grad_norm": 4.285966498899181, + "learning_rate": 8.98693738463723e-06, + "loss": 0.4122, + "step": 1430 + }, + { + "epoch": 0.10221283693858357, + "grad_norm": 6.286806035291326, + "learning_rate": 8.979838137157462e-06, + "loss": 0.433, + "step": 1440 + }, + { + "epoch": 0.10292264830621263, + "grad_norm": 7.3066834855049345, + "learning_rate": 8.972738889677695e-06, + "loss": 0.4258, + "step": 1450 + }, + { + "epoch": 0.10363245967384167, + "grad_norm": 6.5695520214785565, + "learning_rate": 8.965639642197927e-06, + "loss": 0.4164, + "step": 1460 + }, + { + "epoch": 0.10434227104147073, + "grad_norm": 20.93641513291179, + "learning_rate": 8.95854039471816e-06, + "loss": 0.4095, + "step": 1470 + }, + { + "epoch": 0.10505208240909979, + "grad_norm": 5.657042957398901, + "learning_rate": 8.951441147238393e-06, + "loss": 0.4168, + "step": 1480 + }, + { + "epoch": 0.10576189377672883, + "grad_norm": 6.076726326140851, + "learning_rate": 8.944341899758626e-06, + "loss": 0.4112, + "step": 1490 + }, + { + "epoch": 0.10647170514435789, + "grad_norm": 5.092565408624009, + "learning_rate": 8.93724265227886e-06, + "loss": 0.4269, + "step": 1500 + }, + { + "epoch": 0.10718151651198694, + "grad_norm": 2.894012289515038, + "learning_rate": 8.930143404799092e-06, + "loss": 0.4239, + "step": 1510 + }, + { + "epoch": 0.10789132787961599, + "grad_norm": 3.7173915295575637, + "learning_rate": 8.923044157319325e-06, + "loss": 0.4288, + "step": 1520 + }, + { + "epoch": 0.10860113924724504, + "grad_norm": 3.025402596869208, + "learning_rate": 8.915944909839559e-06, + "loss": 0.4421, + "step": 1530 + }, + { + "epoch": 0.1093109506148741, + "grad_norm": 8.212502187483185, + "learning_rate": 8.90884566235979e-06, + "loss": 0.4241, + "step": 1540 + }, + { + "epoch": 0.11002076198250314, + "grad_norm": 5.773771344339805, + "learning_rate": 8.901746414880023e-06, + "loss": 0.4355, + "step": 1550 + }, + { + "epoch": 0.1107305733501322, + "grad_norm": 4.158426885786249, + "learning_rate": 8.894647167400256e-06, + "loss": 0.436, + "step": 1560 + }, + { + "epoch": 0.11144038471776126, + "grad_norm": 6.56740526603354, + "learning_rate": 8.88754791992049e-06, + "loss": 0.4397, + "step": 1570 + }, + { + "epoch": 0.11215019608539031, + "grad_norm": 8.263663970839248, + "learning_rate": 8.880448672440722e-06, + "loss": 0.4201, + "step": 1580 + }, + { + "epoch": 0.11286000745301936, + "grad_norm": 2.424368072981463, + "learning_rate": 8.873349424960955e-06, + "loss": 0.4235, + "step": 1590 + }, + { + "epoch": 0.11356981882064841, + "grad_norm": 6.489454078474153, + "learning_rate": 8.866250177481189e-06, + "loss": 0.4243, + "step": 1600 + }, + { + "epoch": 0.11427963018827747, + "grad_norm": 3.541006640864803, + "learning_rate": 8.859150930001422e-06, + "loss": 0.4313, + "step": 1610 + }, + { + "epoch": 0.11498944155590651, + "grad_norm": 12.323605643567065, + "learning_rate": 8.852051682521653e-06, + "loss": 0.4253, + "step": 1620 + }, + { + "epoch": 0.11569925292353557, + "grad_norm": 4.600225981753095, + "learning_rate": 8.844952435041886e-06, + "loss": 0.42, + "step": 1630 + }, + { + "epoch": 0.11640906429116463, + "grad_norm": 8.589796661850784, + "learning_rate": 8.83785318756212e-06, + "loss": 0.4219, + "step": 1640 + }, + { + "epoch": 0.11711887565879367, + "grad_norm": 10.182911442610934, + "learning_rate": 8.830753940082352e-06, + "loss": 0.4285, + "step": 1650 + }, + { + "epoch": 0.11782868702642273, + "grad_norm": 5.186284643440543, + "learning_rate": 8.823654692602584e-06, + "loss": 0.4139, + "step": 1660 + }, + { + "epoch": 0.11853849839405178, + "grad_norm": 5.23154203196852, + "learning_rate": 8.816555445122817e-06, + "loss": 0.4251, + "step": 1670 + }, + { + "epoch": 0.11924830976168084, + "grad_norm": 6.9839536559537505, + "learning_rate": 8.80945619764305e-06, + "loss": 0.4233, + "step": 1680 + }, + { + "epoch": 0.11995812112930988, + "grad_norm": 6.376179671333375, + "learning_rate": 8.802356950163283e-06, + "loss": 0.4089, + "step": 1690 + }, + { + "epoch": 0.12066793249693894, + "grad_norm": 3.824113092644885, + "learning_rate": 8.795257702683516e-06, + "loss": 0.4347, + "step": 1700 + }, + { + "epoch": 0.121377743864568, + "grad_norm": 11.282936555631686, + "learning_rate": 8.78815845520375e-06, + "loss": 0.423, + "step": 1710 + }, + { + "epoch": 0.12208755523219704, + "grad_norm": 4.218268240264897, + "learning_rate": 8.781059207723982e-06, + "loss": 0.4188, + "step": 1720 + }, + { + "epoch": 0.1227973665998261, + "grad_norm": 3.943582749857493, + "learning_rate": 8.773959960244215e-06, + "loss": 0.4276, + "step": 1730 + }, + { + "epoch": 0.12350717796745515, + "grad_norm": 9.679933576473074, + "learning_rate": 8.766860712764449e-06, + "loss": 0.42, + "step": 1740 + }, + { + "epoch": 0.1242169893350842, + "grad_norm": 15.414309701859608, + "learning_rate": 8.75976146528468e-06, + "loss": 0.4316, + "step": 1750 + }, + { + "epoch": 0.12492680070271325, + "grad_norm": 9.429737278511919, + "learning_rate": 8.752662217804913e-06, + "loss": 0.422, + "step": 1760 + }, + { + "epoch": 0.1256366120703423, + "grad_norm": 23.10494354556988, + "learning_rate": 8.745562970325146e-06, + "loss": 0.4276, + "step": 1770 + }, + { + "epoch": 0.12634642343797137, + "grad_norm": 13.541923724604345, + "learning_rate": 8.73846372284538e-06, + "loss": 0.4271, + "step": 1780 + }, + { + "epoch": 0.1270562348056004, + "grad_norm": 2.846694152973873, + "learning_rate": 8.731364475365612e-06, + "loss": 0.4151, + "step": 1790 + }, + { + "epoch": 0.12776604617322945, + "grad_norm": 6.934597145753292, + "learning_rate": 8.724265227885845e-06, + "loss": 0.4247, + "step": 1800 + }, + { + "epoch": 0.12847585754085852, + "grad_norm": 3.435112347451886, + "learning_rate": 8.717165980406079e-06, + "loss": 0.4225, + "step": 1810 + }, + { + "epoch": 0.12918566890848757, + "grad_norm": 3.4829699382867823, + "learning_rate": 8.71006673292631e-06, + "loss": 0.4458, + "step": 1820 + }, + { + "epoch": 0.1298954802761166, + "grad_norm": 5.077072978235785, + "learning_rate": 8.702967485446543e-06, + "loss": 0.4283, + "step": 1830 + }, + { + "epoch": 0.13060529164374568, + "grad_norm": 5.917300462616358, + "learning_rate": 8.695868237966776e-06, + "loss": 0.4119, + "step": 1840 + }, + { + "epoch": 0.13131510301137472, + "grad_norm": 10.693397543481625, + "learning_rate": 8.68876899048701e-06, + "loss": 0.4305, + "step": 1850 + }, + { + "epoch": 0.13202491437900377, + "grad_norm": 3.6456780546239456, + "learning_rate": 8.681669743007242e-06, + "loss": 0.4391, + "step": 1860 + }, + { + "epoch": 0.13273472574663284, + "grad_norm": 14.68038430401678, + "learning_rate": 8.674570495527474e-06, + "loss": 0.4111, + "step": 1870 + }, + { + "epoch": 0.13344453711426188, + "grad_norm": 5.101838800313352, + "learning_rate": 8.667471248047707e-06, + "loss": 0.4323, + "step": 1880 + }, + { + "epoch": 0.13415434848189095, + "grad_norm": 4.497686869632987, + "learning_rate": 8.66037200056794e-06, + "loss": 0.4154, + "step": 1890 + }, + { + "epoch": 0.13486415984952, + "grad_norm": 9.511227824879294, + "learning_rate": 8.653272753088173e-06, + "loss": 0.4295, + "step": 1900 + }, + { + "epoch": 0.13557397121714904, + "grad_norm": 5.344003791146658, + "learning_rate": 8.646173505608406e-06, + "loss": 0.4254, + "step": 1910 + }, + { + "epoch": 0.1362837825847781, + "grad_norm": 8.10132953922794, + "learning_rate": 8.63907425812864e-06, + "loss": 0.4219, + "step": 1920 + }, + { + "epoch": 0.13699359395240715, + "grad_norm": 8.840386508572838, + "learning_rate": 8.631975010648872e-06, + "loss": 0.416, + "step": 1930 + }, + { + "epoch": 0.1377034053200362, + "grad_norm": 5.639143297883941, + "learning_rate": 8.624875763169106e-06, + "loss": 0.4246, + "step": 1940 + }, + { + "epoch": 0.13841321668766526, + "grad_norm": 5.375177742256173, + "learning_rate": 8.617776515689339e-06, + "loss": 0.4263, + "step": 1950 + }, + { + "epoch": 0.1391230280552943, + "grad_norm": 13.872628674699765, + "learning_rate": 8.61067726820957e-06, + "loss": 0.4368, + "step": 1960 + }, + { + "epoch": 0.13983283942292335, + "grad_norm": 6.612051924514802, + "learning_rate": 8.603578020729803e-06, + "loss": 0.4235, + "step": 1970 + }, + { + "epoch": 0.14054265079055242, + "grad_norm": 7.420592038738273, + "learning_rate": 8.596478773250036e-06, + "loss": 0.4315, + "step": 1980 + }, + { + "epoch": 0.14125246215818146, + "grad_norm": 3.883491154973528, + "learning_rate": 8.58937952577027e-06, + "loss": 0.4394, + "step": 1990 + }, + { + "epoch": 0.1419622735258105, + "grad_norm": 4.031594828995353, + "learning_rate": 8.582280278290502e-06, + "loss": 0.4274, + "step": 2000 + }, + { + "epoch": 0.14267208489343958, + "grad_norm": 6.272786134188022, + "learning_rate": 8.575181030810736e-06, + "loss": 0.42, + "step": 2010 + }, + { + "epoch": 0.14338189626106862, + "grad_norm": 8.45570312290703, + "learning_rate": 8.568081783330967e-06, + "loss": 0.4336, + "step": 2020 + }, + { + "epoch": 0.14409170762869766, + "grad_norm": 3.8497660341027693, + "learning_rate": 8.5609825358512e-06, + "loss": 0.4259, + "step": 2030 + }, + { + "epoch": 0.14480151899632673, + "grad_norm": 10.12069309920438, + "learning_rate": 8.553883288371433e-06, + "loss": 0.4208, + "step": 2040 + }, + { + "epoch": 0.14551133036395578, + "grad_norm": 5.128975578462212, + "learning_rate": 8.546784040891666e-06, + "loss": 0.4215, + "step": 2050 + }, + { + "epoch": 0.14622114173158482, + "grad_norm": 4.45602583843403, + "learning_rate": 8.5396847934119e-06, + "loss": 0.4135, + "step": 2060 + }, + { + "epoch": 0.1469309530992139, + "grad_norm": 5.172069700283945, + "learning_rate": 8.53258554593213e-06, + "loss": 0.4122, + "step": 2070 + }, + { + "epoch": 0.14764076446684293, + "grad_norm": 7.147216717746435, + "learning_rate": 8.525486298452364e-06, + "loss": 0.4423, + "step": 2080 + }, + { + "epoch": 0.14835057583447198, + "grad_norm": 14.946527022046613, + "learning_rate": 8.518387050972597e-06, + "loss": 0.4094, + "step": 2090 + }, + { + "epoch": 0.14906038720210105, + "grad_norm": 8.460267496546166, + "learning_rate": 8.51128780349283e-06, + "loss": 0.4186, + "step": 2100 + }, + { + "epoch": 0.1497701985697301, + "grad_norm": 8.93023218882671, + "learning_rate": 8.504188556013063e-06, + "loss": 0.4062, + "step": 2110 + }, + { + "epoch": 0.15048000993735916, + "grad_norm": 3.213343020811049, + "learning_rate": 8.497089308533296e-06, + "loss": 0.3994, + "step": 2120 + }, + { + "epoch": 0.1511898213049882, + "grad_norm": 8.718801113577726, + "learning_rate": 8.48999006105353e-06, + "loss": 0.4232, + "step": 2130 + }, + { + "epoch": 0.15189963267261725, + "grad_norm": 2.832643819770658, + "learning_rate": 8.482890813573762e-06, + "loss": 0.4261, + "step": 2140 + }, + { + "epoch": 0.15260944404024632, + "grad_norm": 3.2673324405839255, + "learning_rate": 8.475791566093996e-06, + "loss": 0.42, + "step": 2150 + }, + { + "epoch": 0.15331925540787536, + "grad_norm": 3.2621489770969214, + "learning_rate": 8.468692318614227e-06, + "loss": 0.4282, + "step": 2160 + }, + { + "epoch": 0.1540290667755044, + "grad_norm": 17.34420036770468, + "learning_rate": 8.46159307113446e-06, + "loss": 0.4198, + "step": 2170 + }, + { + "epoch": 0.15473887814313347, + "grad_norm": 3.6148665582762094, + "learning_rate": 8.454493823654693e-06, + "loss": 0.4157, + "step": 2180 + }, + { + "epoch": 0.15544868951076252, + "grad_norm": 2.775836768166624, + "learning_rate": 8.447394576174926e-06, + "loss": 0.417, + "step": 2190 + }, + { + "epoch": 0.15615850087839156, + "grad_norm": 5.052761832862739, + "learning_rate": 8.44029532869516e-06, + "loss": 0.4035, + "step": 2200 + }, + { + "epoch": 0.15686831224602063, + "grad_norm": 4.778779661514333, + "learning_rate": 8.433196081215393e-06, + "loss": 0.4445, + "step": 2210 + }, + { + "epoch": 0.15757812361364967, + "grad_norm": 4.6274782338902325, + "learning_rate": 8.426096833735626e-06, + "loss": 0.4147, + "step": 2220 + }, + { + "epoch": 0.15828793498127872, + "grad_norm": 4.310225523508245, + "learning_rate": 8.418997586255857e-06, + "loss": 0.4167, + "step": 2230 + }, + { + "epoch": 0.1589977463489078, + "grad_norm": 4.802519845626961, + "learning_rate": 8.41189833877609e-06, + "loss": 0.4052, + "step": 2240 + }, + { + "epoch": 0.15970755771653683, + "grad_norm": 3.949892413625005, + "learning_rate": 8.404799091296323e-06, + "loss": 0.4263, + "step": 2250 + }, + { + "epoch": 0.16041736908416587, + "grad_norm": 5.685661053410237, + "learning_rate": 8.397699843816556e-06, + "loss": 0.4148, + "step": 2260 + }, + { + "epoch": 0.16112718045179494, + "grad_norm": 4.337480471983148, + "learning_rate": 8.39060059633679e-06, + "loss": 0.4101, + "step": 2270 + }, + { + "epoch": 0.161836991819424, + "grad_norm": 4.809277499740254, + "learning_rate": 8.38350134885702e-06, + "loss": 0.4071, + "step": 2280 + }, + { + "epoch": 0.16254680318705303, + "grad_norm": 7.364507480899371, + "learning_rate": 8.376402101377254e-06, + "loss": 0.4021, + "step": 2290 + }, + { + "epoch": 0.1632566145546821, + "grad_norm": 5.408145626972555, + "learning_rate": 8.369302853897487e-06, + "loss": 0.4154, + "step": 2300 + }, + { + "epoch": 0.16396642592231114, + "grad_norm": 2.9449217220121784, + "learning_rate": 8.36220360641772e-06, + "loss": 0.4296, + "step": 2310 + }, + { + "epoch": 0.16467623728994019, + "grad_norm": 3.843647555602573, + "learning_rate": 8.355104358937953e-06, + "loss": 0.4197, + "step": 2320 + }, + { + "epoch": 0.16538604865756926, + "grad_norm": 5.843629733774891, + "learning_rate": 8.348005111458186e-06, + "loss": 0.4052, + "step": 2330 + }, + { + "epoch": 0.1660958600251983, + "grad_norm": 4.182196885965926, + "learning_rate": 8.34090586397842e-06, + "loss": 0.4304, + "step": 2340 + }, + { + "epoch": 0.16680567139282734, + "grad_norm": 12.343897765958163, + "learning_rate": 8.333806616498653e-06, + "loss": 0.4057, + "step": 2350 + }, + { + "epoch": 0.1675154827604564, + "grad_norm": 4.52770872028285, + "learning_rate": 8.326707369018886e-06, + "loss": 0.4234, + "step": 2360 + }, + { + "epoch": 0.16822529412808546, + "grad_norm": 5.473115632671873, + "learning_rate": 8.319608121539117e-06, + "loss": 0.4127, + "step": 2370 + }, + { + "epoch": 0.16893510549571453, + "grad_norm": 5.243162829393595, + "learning_rate": 8.31250887405935e-06, + "loss": 0.4148, + "step": 2380 + }, + { + "epoch": 0.16964491686334357, + "grad_norm": 9.638919529909746, + "learning_rate": 8.305409626579583e-06, + "loss": 0.4244, + "step": 2390 + }, + { + "epoch": 0.1703547282309726, + "grad_norm": 5.824204497516263, + "learning_rate": 8.298310379099816e-06, + "loss": 0.3991, + "step": 2400 + }, + { + "epoch": 0.17106453959860168, + "grad_norm": 8.92013550945478, + "learning_rate": 8.29121113162005e-06, + "loss": 0.4107, + "step": 2410 + }, + { + "epoch": 0.17177435096623073, + "grad_norm": 4.310339052965044, + "learning_rate": 8.284111884140283e-06, + "loss": 0.4198, + "step": 2420 + }, + { + "epoch": 0.17248416233385977, + "grad_norm": 3.674140188675587, + "learning_rate": 8.277012636660514e-06, + "loss": 0.4066, + "step": 2430 + }, + { + "epoch": 0.17319397370148884, + "grad_norm": 3.2816580938205986, + "learning_rate": 8.269913389180747e-06, + "loss": 0.3948, + "step": 2440 + }, + { + "epoch": 0.17390378506911788, + "grad_norm": 3.119520711268051, + "learning_rate": 8.26281414170098e-06, + "loss": 0.4236, + "step": 2450 + }, + { + "epoch": 0.17461359643674693, + "grad_norm": 3.9529990200341216, + "learning_rate": 8.255714894221213e-06, + "loss": 0.4028, + "step": 2460 + }, + { + "epoch": 0.175323407804376, + "grad_norm": 6.5624619571577, + "learning_rate": 8.248615646741446e-06, + "loss": 0.4207, + "step": 2470 + }, + { + "epoch": 0.17603321917200504, + "grad_norm": 6.563862400109423, + "learning_rate": 8.24151639926168e-06, + "loss": 0.4234, + "step": 2480 + }, + { + "epoch": 0.17674303053963408, + "grad_norm": 4.124646423199101, + "learning_rate": 8.234417151781911e-06, + "loss": 0.421, + "step": 2490 + }, + { + "epoch": 0.17745284190726315, + "grad_norm": 8.460797246337737, + "learning_rate": 8.227317904302144e-06, + "loss": 0.4169, + "step": 2500 + }, + { + "epoch": 0.1781626532748922, + "grad_norm": 4.636207121737827, + "learning_rate": 8.220218656822377e-06, + "loss": 0.4154, + "step": 2510 + }, + { + "epoch": 0.17887246464252124, + "grad_norm": 15.193279765427832, + "learning_rate": 8.21311940934261e-06, + "loss": 0.4, + "step": 2520 + }, + { + "epoch": 0.1795822760101503, + "grad_norm": 8.394690912531237, + "learning_rate": 8.206020161862843e-06, + "loss": 0.3994, + "step": 2530 + }, + { + "epoch": 0.18029208737777935, + "grad_norm": 11.829872869588135, + "learning_rate": 8.198920914383076e-06, + "loss": 0.4045, + "step": 2540 + }, + { + "epoch": 0.1810018987454084, + "grad_norm": 10.598164946336963, + "learning_rate": 8.19182166690331e-06, + "loss": 0.4167, + "step": 2550 + }, + { + "epoch": 0.18171171011303747, + "grad_norm": 8.644167493937724, + "learning_rate": 8.184722419423543e-06, + "loss": 0.4193, + "step": 2560 + }, + { + "epoch": 0.1824215214806665, + "grad_norm": 5.532113862418252, + "learning_rate": 8.177623171943776e-06, + "loss": 0.4134, + "step": 2570 + }, + { + "epoch": 0.18313133284829555, + "grad_norm": 8.962347784457894, + "learning_rate": 8.170523924464007e-06, + "loss": 0.4231, + "step": 2580 + }, + { + "epoch": 0.18384114421592462, + "grad_norm": 4.789480578365759, + "learning_rate": 8.16342467698424e-06, + "loss": 0.4056, + "step": 2590 + }, + { + "epoch": 0.18455095558355367, + "grad_norm": 7.463666547462272, + "learning_rate": 8.156325429504473e-06, + "loss": 0.4082, + "step": 2600 + }, + { + "epoch": 0.1852607669511827, + "grad_norm": 3.543632295285487, + "learning_rate": 8.149226182024706e-06, + "loss": 0.3957, + "step": 2610 + }, + { + "epoch": 0.18597057831881178, + "grad_norm": 10.128862482609126, + "learning_rate": 8.14212693454494e-06, + "loss": 0.4104, + "step": 2620 + }, + { + "epoch": 0.18668038968644082, + "grad_norm": 2.279815139257822, + "learning_rate": 8.135027687065171e-06, + "loss": 0.4023, + "step": 2630 + }, + { + "epoch": 0.1873902010540699, + "grad_norm": 5.651432220535337, + "learning_rate": 8.127928439585404e-06, + "loss": 0.4174, + "step": 2640 + }, + { + "epoch": 0.18810001242169894, + "grad_norm": 2.764126752423827, + "learning_rate": 8.120829192105637e-06, + "loss": 0.4316, + "step": 2650 + }, + { + "epoch": 0.18880982378932798, + "grad_norm": 2.2008942019632443, + "learning_rate": 8.11372994462587e-06, + "loss": 0.3998, + "step": 2660 + }, + { + "epoch": 0.18951963515695705, + "grad_norm": 2.6464894767494194, + "learning_rate": 8.106630697146103e-06, + "loss": 0.4152, + "step": 2670 + }, + { + "epoch": 0.1902294465245861, + "grad_norm": 2.9891233500309697, + "learning_rate": 8.099531449666336e-06, + "loss": 0.4065, + "step": 2680 + }, + { + "epoch": 0.19093925789221514, + "grad_norm": 3.2947192783933303, + "learning_rate": 8.092432202186568e-06, + "loss": 0.4096, + "step": 2690 + }, + { + "epoch": 0.1916490692598442, + "grad_norm": 2.6266501022263093, + "learning_rate": 8.085332954706801e-06, + "loss": 0.4079, + "step": 2700 + }, + { + "epoch": 0.19235888062747325, + "grad_norm": 2.0600161188196258, + "learning_rate": 8.078233707227034e-06, + "loss": 0.4245, + "step": 2710 + }, + { + "epoch": 0.1930686919951023, + "grad_norm": 3.4259686474049587, + "learning_rate": 8.071134459747267e-06, + "loss": 0.4168, + "step": 2720 + }, + { + "epoch": 0.19377850336273136, + "grad_norm": 4.184352662206747, + "learning_rate": 8.0640352122675e-06, + "loss": 0.4265, + "step": 2730 + }, + { + "epoch": 0.1944883147303604, + "grad_norm": 3.7320888080359174, + "learning_rate": 8.056935964787733e-06, + "loss": 0.4172, + "step": 2740 + }, + { + "epoch": 0.19519812609798945, + "grad_norm": 3.750448672171502, + "learning_rate": 8.049836717307966e-06, + "loss": 0.4327, + "step": 2750 + }, + { + "epoch": 0.19590793746561852, + "grad_norm": 3.0158382271152564, + "learning_rate": 8.0427374698282e-06, + "loss": 0.4284, + "step": 2760 + }, + { + "epoch": 0.19661774883324756, + "grad_norm": 2.438159262347708, + "learning_rate": 8.035638222348433e-06, + "loss": 0.4117, + "step": 2770 + }, + { + "epoch": 0.1973275602008766, + "grad_norm": 4.795802800628808, + "learning_rate": 8.028538974868664e-06, + "loss": 0.4207, + "step": 2780 + }, + { + "epoch": 0.19803737156850568, + "grad_norm": 2.5291141301554405, + "learning_rate": 8.021439727388897e-06, + "loss": 0.4146, + "step": 2790 + }, + { + "epoch": 0.19874718293613472, + "grad_norm": 2.4740979454164727, + "learning_rate": 8.01434047990913e-06, + "loss": 0.3999, + "step": 2800 + }, + { + "epoch": 0.19945699430376376, + "grad_norm": 3.4467777684569927, + "learning_rate": 8.007241232429363e-06, + "loss": 0.4151, + "step": 2810 + }, + { + "epoch": 0.20016680567139283, + "grad_norm": 2.741445348023422, + "learning_rate": 8.000141984949596e-06, + "loss": 0.4165, + "step": 2820 + }, + { + "epoch": 0.20087661703902188, + "grad_norm": 2.977547725757033, + "learning_rate": 7.99304273746983e-06, + "loss": 0.4137, + "step": 2830 + }, + { + "epoch": 0.20158642840665092, + "grad_norm": 3.493123708582949, + "learning_rate": 7.985943489990061e-06, + "loss": 0.4095, + "step": 2840 + }, + { + "epoch": 0.20229623977428, + "grad_norm": 9.43644672917822, + "learning_rate": 7.978844242510294e-06, + "loss": 0.4066, + "step": 2850 + }, + { + "epoch": 0.20300605114190903, + "grad_norm": 4.050870492633986, + "learning_rate": 7.971744995030527e-06, + "loss": 0.4079, + "step": 2860 + }, + { + "epoch": 0.2037158625095381, + "grad_norm": 7.830134940271083, + "learning_rate": 7.96464574755076e-06, + "loss": 0.3896, + "step": 2870 + }, + { + "epoch": 0.20442567387716715, + "grad_norm": 7.557535176254197, + "learning_rate": 7.957546500070993e-06, + "loss": 0.4096, + "step": 2880 + }, + { + "epoch": 0.2051354852447962, + "grad_norm": 4.715465621080843, + "learning_rate": 7.950447252591226e-06, + "loss": 0.3907, + "step": 2890 + }, + { + "epoch": 0.20584529661242526, + "grad_norm": 30.299863630729803, + "learning_rate": 7.943348005111458e-06, + "loss": 0.4142, + "step": 2900 + }, + { + "epoch": 0.2065551079800543, + "grad_norm": 13.362349279952854, + "learning_rate": 7.936248757631691e-06, + "loss": 0.4211, + "step": 2910 + }, + { + "epoch": 0.20726491934768335, + "grad_norm": 7.166470527615742, + "learning_rate": 7.929149510151924e-06, + "loss": 0.4038, + "step": 2920 + }, + { + "epoch": 0.20797473071531242, + "grad_norm": 218.37559359733393, + "learning_rate": 7.922050262672157e-06, + "loss": 0.3814, + "step": 2930 + }, + { + "epoch": 0.20868454208294146, + "grad_norm": 4.776318350142146, + "learning_rate": 7.91495101519239e-06, + "loss": 0.4033, + "step": 2940 + }, + { + "epoch": 0.2093943534505705, + "grad_norm": 6.050705359465637, + "learning_rate": 7.907851767712623e-06, + "loss": 0.4006, + "step": 2950 + }, + { + "epoch": 0.21010416481819957, + "grad_norm": 7.0609749250244125, + "learning_rate": 7.900752520232857e-06, + "loss": 0.3996, + "step": 2960 + }, + { + "epoch": 0.21081397618582862, + "grad_norm": 5.2294105499183985, + "learning_rate": 7.89365327275309e-06, + "loss": 0.3906, + "step": 2970 + }, + { + "epoch": 0.21152378755345766, + "grad_norm": 5.037453517661707, + "learning_rate": 7.886554025273323e-06, + "loss": 0.3925, + "step": 2980 + }, + { + "epoch": 0.21223359892108673, + "grad_norm": 4.329367488091813, + "learning_rate": 7.879454777793554e-06, + "loss": 0.4005, + "step": 2990 + }, + { + "epoch": 0.21294341028871577, + "grad_norm": 4.587934783884384, + "learning_rate": 7.872355530313787e-06, + "loss": 0.3949, + "step": 3000 + }, + { + "epoch": 0.21365322165634482, + "grad_norm": 4.34538375508175, + "learning_rate": 7.86525628283402e-06, + "loss": 0.3963, + "step": 3010 + }, + { + "epoch": 0.2143630330239739, + "grad_norm": 14.538466945533717, + "learning_rate": 7.858157035354253e-06, + "loss": 0.4145, + "step": 3020 + }, + { + "epoch": 0.21507284439160293, + "grad_norm": 5.725604081866674, + "learning_rate": 7.851057787874487e-06, + "loss": 0.397, + "step": 3030 + }, + { + "epoch": 0.21578265575923197, + "grad_norm": 4.100595238075657, + "learning_rate": 7.843958540394718e-06, + "loss": 0.407, + "step": 3040 + }, + { + "epoch": 0.21649246712686104, + "grad_norm": 3.6102459737641452, + "learning_rate": 7.836859292914951e-06, + "loss": 0.3941, + "step": 3050 + }, + { + "epoch": 0.2172022784944901, + "grad_norm": 9.48884086833176, + "learning_rate": 7.829760045435184e-06, + "loss": 0.3981, + "step": 3060 + }, + { + "epoch": 0.21791208986211913, + "grad_norm": 5.265598040684193, + "learning_rate": 7.822660797955417e-06, + "loss": 0.3865, + "step": 3070 + }, + { + "epoch": 0.2186219012297482, + "grad_norm": 5.853395704700518, + "learning_rate": 7.81556155047565e-06, + "loss": 0.4089, + "step": 3080 + }, + { + "epoch": 0.21933171259737724, + "grad_norm": 2.867041909768411, + "learning_rate": 7.808462302995883e-06, + "loss": 0.411, + "step": 3090 + }, + { + "epoch": 0.22004152396500629, + "grad_norm": 6.447556295363806, + "learning_rate": 7.801363055516117e-06, + "loss": 0.4054, + "step": 3100 + }, + { + "epoch": 0.22075133533263536, + "grad_norm": 6.665403407542621, + "learning_rate": 7.794263808036348e-06, + "loss": 0.4331, + "step": 3110 + }, + { + "epoch": 0.2214611467002644, + "grad_norm": 3.740543632288075, + "learning_rate": 7.787164560556581e-06, + "loss": 0.4132, + "step": 3120 + }, + { + "epoch": 0.22217095806789347, + "grad_norm": 19.12212944661018, + "learning_rate": 7.780065313076814e-06, + "loss": 0.4229, + "step": 3130 + }, + { + "epoch": 0.2228807694355225, + "grad_norm": 5.646216224084272, + "learning_rate": 7.772966065597047e-06, + "loss": 0.4123, + "step": 3140 + }, + { + "epoch": 0.22359058080315156, + "grad_norm": 12.549975615460761, + "learning_rate": 7.76586681811728e-06, + "loss": 0.4156, + "step": 3150 + }, + { + "epoch": 0.22430039217078063, + "grad_norm": 5.34509934381609, + "learning_rate": 7.758767570637513e-06, + "loss": 0.3935, + "step": 3160 + }, + { + "epoch": 0.22501020353840967, + "grad_norm": 4.868356423660982, + "learning_rate": 7.751668323157747e-06, + "loss": 0.4121, + "step": 3170 + }, + { + "epoch": 0.2257200149060387, + "grad_norm": 3.604594374317723, + "learning_rate": 7.74456907567798e-06, + "loss": 0.3949, + "step": 3180 + }, + { + "epoch": 0.22642982627366778, + "grad_norm": 2.6762060130385565, + "learning_rate": 7.737469828198211e-06, + "loss": 0.4192, + "step": 3190 + }, + { + "epoch": 0.22713963764129683, + "grad_norm": 3.7277037964888957, + "learning_rate": 7.730370580718444e-06, + "loss": 0.4063, + "step": 3200 + }, + { + "epoch": 0.22784944900892587, + "grad_norm": 4.2017308560808395, + "learning_rate": 7.723271333238677e-06, + "loss": 0.3983, + "step": 3210 + }, + { + "epoch": 0.22855926037655494, + "grad_norm": 6.82717398390433, + "learning_rate": 7.71617208575891e-06, + "loss": 0.4003, + "step": 3220 + }, + { + "epoch": 0.22926907174418398, + "grad_norm": 3.3720424392184865, + "learning_rate": 7.709072838279143e-06, + "loss": 0.384, + "step": 3230 + }, + { + "epoch": 0.22997888311181303, + "grad_norm": 15.234041629621501, + "learning_rate": 7.701973590799375e-06, + "loss": 0.3936, + "step": 3240 + }, + { + "epoch": 0.2306886944794421, + "grad_norm": 6.450291645106787, + "learning_rate": 7.694874343319608e-06, + "loss": 0.4153, + "step": 3250 + }, + { + "epoch": 0.23139850584707114, + "grad_norm": 5.0596647748479056, + "learning_rate": 7.687775095839841e-06, + "loss": 0.4098, + "step": 3260 + }, + { + "epoch": 0.23210831721470018, + "grad_norm": 6.351369993733097, + "learning_rate": 7.680675848360074e-06, + "loss": 0.4036, + "step": 3270 + }, + { + "epoch": 0.23281812858232925, + "grad_norm": 7.706709044787595, + "learning_rate": 7.673576600880307e-06, + "loss": 0.4137, + "step": 3280 + }, + { + "epoch": 0.2335279399499583, + "grad_norm": 6.111103199878706, + "learning_rate": 7.66647735340054e-06, + "loss": 0.4163, + "step": 3290 + }, + { + "epoch": 0.23423775131758734, + "grad_norm": 3.182362422678598, + "learning_rate": 7.659378105920773e-06, + "loss": 0.4007, + "step": 3300 + }, + { + "epoch": 0.2349475626852164, + "grad_norm": 3.929827344563346, + "learning_rate": 7.652278858441005e-06, + "loss": 0.4011, + "step": 3310 + }, + { + "epoch": 0.23565737405284545, + "grad_norm": 6.606808853169358, + "learning_rate": 7.645179610961238e-06, + "loss": 0.4113, + "step": 3320 + }, + { + "epoch": 0.2363671854204745, + "grad_norm": 7.983975561443669, + "learning_rate": 7.638080363481471e-06, + "loss": 0.3941, + "step": 3330 + }, + { + "epoch": 0.23707699678810357, + "grad_norm": 2.551810232754013, + "learning_rate": 7.630981116001704e-06, + "loss": 0.3987, + "step": 3340 + }, + { + "epoch": 0.2377868081557326, + "grad_norm": 16.325804366695763, + "learning_rate": 7.623881868521937e-06, + "loss": 0.3814, + "step": 3350 + }, + { + "epoch": 0.23849661952336168, + "grad_norm": 17.86582631307272, + "learning_rate": 7.61678262104217e-06, + "loss": 0.4065, + "step": 3360 + }, + { + "epoch": 0.23920643089099072, + "grad_norm": 4.439905284094514, + "learning_rate": 7.6096833735624035e-06, + "loss": 0.4079, + "step": 3370 + }, + { + "epoch": 0.23991624225861977, + "grad_norm": 13.632710588001641, + "learning_rate": 7.602584126082636e-06, + "loss": 0.4075, + "step": 3380 + }, + { + "epoch": 0.24062605362624884, + "grad_norm": 7.4557485788963405, + "learning_rate": 7.595484878602869e-06, + "loss": 0.399, + "step": 3390 + }, + { + "epoch": 0.24133586499387788, + "grad_norm": 6.032057911933067, + "learning_rate": 7.588385631123102e-06, + "loss": 0.3892, + "step": 3400 + }, + { + "epoch": 0.24204567636150692, + "grad_norm": 5.1424876309924, + "learning_rate": 7.581286383643335e-06, + "loss": 0.396, + "step": 3410 + }, + { + "epoch": 0.242755487729136, + "grad_norm": 3.6691932120100987, + "learning_rate": 7.574187136163567e-06, + "loss": 0.4108, + "step": 3420 + }, + { + "epoch": 0.24346529909676504, + "grad_norm": 2.8083232656002033, + "learning_rate": 7.5670878886838004e-06, + "loss": 0.3984, + "step": 3430 + }, + { + "epoch": 0.24417511046439408, + "grad_norm": 13.589049355107566, + "learning_rate": 7.5599886412040335e-06, + "loss": 0.3957, + "step": 3440 + }, + { + "epoch": 0.24488492183202315, + "grad_norm": 6.813624263530042, + "learning_rate": 7.552889393724265e-06, + "loss": 0.4105, + "step": 3450 + }, + { + "epoch": 0.2455947331996522, + "grad_norm": 13.609829369379536, + "learning_rate": 7.545790146244498e-06, + "loss": 0.4175, + "step": 3460 + }, + { + "epoch": 0.24630454456728124, + "grad_norm": 5.1258006881261915, + "learning_rate": 7.538690898764731e-06, + "loss": 0.3966, + "step": 3470 + }, + { + "epoch": 0.2470143559349103, + "grad_norm": 40.31962236147607, + "learning_rate": 7.531591651284964e-06, + "loss": 0.3839, + "step": 3480 + }, + { + "epoch": 0.24772416730253935, + "grad_norm": 6.537768993909155, + "learning_rate": 7.524492403805197e-06, + "loss": 0.4122, + "step": 3490 + }, + { + "epoch": 0.2484339786701684, + "grad_norm": 17.652356012021233, + "learning_rate": 7.51739315632543e-06, + "loss": 0.3948, + "step": 3500 + }, + { + "epoch": 0.24914379003779746, + "grad_norm": 3.85528406182526, + "learning_rate": 7.510293908845663e-06, + "loss": 0.3938, + "step": 3510 + }, + { + "epoch": 0.2498536014054265, + "grad_norm": 125.62304184951121, + "learning_rate": 7.503194661365896e-06, + "loss": 0.389, + "step": 3520 + }, + { + "epoch": 0.25056341277305555, + "grad_norm": 8.558355724038593, + "learning_rate": 7.496095413886129e-06, + "loss": 0.3787, + "step": 3530 + }, + { + "epoch": 0.2512732241406846, + "grad_norm": 4.216427070872869, + "learning_rate": 7.488996166406361e-06, + "loss": 0.3835, + "step": 3540 + }, + { + "epoch": 0.2519830355083137, + "grad_norm": 4.314131483032103, + "learning_rate": 7.481896918926594e-06, + "loss": 0.3946, + "step": 3550 + }, + { + "epoch": 0.25269284687594273, + "grad_norm": 4.159823786853909, + "learning_rate": 7.474797671446827e-06, + "loss": 0.3972, + "step": 3560 + }, + { + "epoch": 0.2534026582435718, + "grad_norm": 3.4947296702394586, + "learning_rate": 7.4676984239670605e-06, + "loss": 0.4165, + "step": 3570 + }, + { + "epoch": 0.2541124696112008, + "grad_norm": 4.022241190948728, + "learning_rate": 7.4605991764872936e-06, + "loss": 0.3988, + "step": 3580 + }, + { + "epoch": 0.25482228097882986, + "grad_norm": 3.4849637281174006, + "learning_rate": 7.453499929007526e-06, + "loss": 0.4106, + "step": 3590 + }, + { + "epoch": 0.2555320923464589, + "grad_norm": 5.338306458076586, + "learning_rate": 7.446400681527759e-06, + "loss": 0.4082, + "step": 3600 + }, + { + "epoch": 0.256241903714088, + "grad_norm": 4.970005106695202, + "learning_rate": 7.439301434047992e-06, + "loss": 0.3914, + "step": 3610 + }, + { + "epoch": 0.25695171508171705, + "grad_norm": 6.355373029038747, + "learning_rate": 7.432202186568225e-06, + "loss": 0.3989, + "step": 3620 + }, + { + "epoch": 0.2576615264493461, + "grad_norm": 5.996742366501121, + "learning_rate": 7.425102939088457e-06, + "loss": 0.3999, + "step": 3630 + }, + { + "epoch": 0.25837133781697513, + "grad_norm": 6.966686936423967, + "learning_rate": 7.4180036916086905e-06, + "loss": 0.3831, + "step": 3640 + }, + { + "epoch": 0.2590811491846042, + "grad_norm": 4.185121399245409, + "learning_rate": 7.410904444128923e-06, + "loss": 0.408, + "step": 3650 + }, + { + "epoch": 0.2597909605522332, + "grad_norm": 2.2056616209460866, + "learning_rate": 7.403805196649155e-06, + "loss": 0.3931, + "step": 3660 + }, + { + "epoch": 0.2605007719198623, + "grad_norm": 4.176248780095696, + "learning_rate": 7.396705949169388e-06, + "loss": 0.409, + "step": 3670 + }, + { + "epoch": 0.26121058328749136, + "grad_norm": 2.47926985794175, + "learning_rate": 7.389606701689621e-06, + "loss": 0.4091, + "step": 3680 + }, + { + "epoch": 0.2619203946551204, + "grad_norm": 3.02240842448802, + "learning_rate": 7.382507454209854e-06, + "loss": 0.4102, + "step": 3690 + }, + { + "epoch": 0.26263020602274945, + "grad_norm": 2.0291710541228816, + "learning_rate": 7.3754082067300866e-06, + "loss": 0.382, + "step": 3700 + }, + { + "epoch": 0.2633400173903785, + "grad_norm": 2.1912303159611084, + "learning_rate": 7.36830895925032e-06, + "loss": 0.3974, + "step": 3710 + }, + { + "epoch": 0.26404982875800753, + "grad_norm": 2.964541482780821, + "learning_rate": 7.361209711770553e-06, + "loss": 0.4096, + "step": 3720 + }, + { + "epoch": 0.26475964012563663, + "grad_norm": 5.810099164313448, + "learning_rate": 7.354110464290786e-06, + "loss": 0.4092, + "step": 3730 + }, + { + "epoch": 0.2654694514932657, + "grad_norm": 4.879409457746285, + "learning_rate": 7.347011216811019e-06, + "loss": 0.4034, + "step": 3740 + }, + { + "epoch": 0.2661792628608947, + "grad_norm": 2.761287928392515, + "learning_rate": 7.339911969331251e-06, + "loss": 0.3971, + "step": 3750 + }, + { + "epoch": 0.26688907422852376, + "grad_norm": 14.80879239487425, + "learning_rate": 7.332812721851484e-06, + "loss": 0.4203, + "step": 3760 + }, + { + "epoch": 0.2675988855961528, + "grad_norm": 2.589550559546521, + "learning_rate": 7.325713474371717e-06, + "loss": 0.4065, + "step": 3770 + }, + { + "epoch": 0.2683086969637819, + "grad_norm": 2.1908148156089204, + "learning_rate": 7.3186142268919505e-06, + "loss": 0.4001, + "step": 3780 + }, + { + "epoch": 0.26901850833141094, + "grad_norm": 3.614429975395643, + "learning_rate": 7.311514979412183e-06, + "loss": 0.3949, + "step": 3790 + }, + { + "epoch": 0.26972831969904, + "grad_norm": 8.199581604131074, + "learning_rate": 7.304415731932416e-06, + "loss": 0.4027, + "step": 3800 + }, + { + "epoch": 0.27043813106666903, + "grad_norm": 1.9841735875976263, + "learning_rate": 7.297316484452649e-06, + "loss": 0.3803, + "step": 3810 + }, + { + "epoch": 0.2711479424342981, + "grad_norm": 1.7818490390141006, + "learning_rate": 7.290217236972882e-06, + "loss": 0.3979, + "step": 3820 + }, + { + "epoch": 0.2718577538019271, + "grad_norm": 2.664420697627613, + "learning_rate": 7.283117989493115e-06, + "loss": 0.4112, + "step": 3830 + }, + { + "epoch": 0.2725675651695562, + "grad_norm": 7.6015896940216345, + "learning_rate": 7.2760187420133474e-06, + "loss": 0.3978, + "step": 3840 + }, + { + "epoch": 0.27327737653718526, + "grad_norm": 5.109710356060471, + "learning_rate": 7.2689194945335805e-06, + "loss": 0.3911, + "step": 3850 + }, + { + "epoch": 0.2739871879048143, + "grad_norm": 1.8719451344781273, + "learning_rate": 7.261820247053813e-06, + "loss": 0.4039, + "step": 3860 + }, + { + "epoch": 0.27469699927244334, + "grad_norm": 7.834590688589366, + "learning_rate": 7.254720999574045e-06, + "loss": 0.3972, + "step": 3870 + }, + { + "epoch": 0.2754068106400724, + "grad_norm": 3.4725606354409915, + "learning_rate": 7.247621752094278e-06, + "loss": 0.4106, + "step": 3880 + }, + { + "epoch": 0.27611662200770143, + "grad_norm": 2.131887069098727, + "learning_rate": 7.240522504614511e-06, + "loss": 0.3921, + "step": 3890 + }, + { + "epoch": 0.2768264333753305, + "grad_norm": 3.840712773368679, + "learning_rate": 7.233423257134744e-06, + "loss": 0.3963, + "step": 3900 + }, + { + "epoch": 0.27753624474295957, + "grad_norm": 1.8435607174327202, + "learning_rate": 7.226324009654977e-06, + "loss": 0.4171, + "step": 3910 + }, + { + "epoch": 0.2782460561105886, + "grad_norm": 2.927315889095762, + "learning_rate": 7.21922476217521e-06, + "loss": 0.4078, + "step": 3920 + }, + { + "epoch": 0.27895586747821766, + "grad_norm": 2.4533548064235955, + "learning_rate": 7.212125514695443e-06, + "loss": 0.4018, + "step": 3930 + }, + { + "epoch": 0.2796656788458467, + "grad_norm": 2.6808622987821424, + "learning_rate": 7.205026267215676e-06, + "loss": 0.3952, + "step": 3940 + }, + { + "epoch": 0.28037549021347574, + "grad_norm": 2.006870713713202, + "learning_rate": 7.197927019735908e-06, + "loss": 0.4041, + "step": 3950 + }, + { + "epoch": 0.28108530158110484, + "grad_norm": 4.1552921396903955, + "learning_rate": 7.190827772256141e-06, + "loss": 0.3815, + "step": 3960 + }, + { + "epoch": 0.2817951129487339, + "grad_norm": 3.088912130241367, + "learning_rate": 7.183728524776374e-06, + "loss": 0.4018, + "step": 3970 + }, + { + "epoch": 0.2825049243163629, + "grad_norm": 2.9619382181530853, + "learning_rate": 7.1766292772966075e-06, + "loss": 0.4071, + "step": 3980 + }, + { + "epoch": 0.28321473568399197, + "grad_norm": 3.194525382034512, + "learning_rate": 7.1695300298168406e-06, + "loss": 0.3861, + "step": 3990 + }, + { + "epoch": 0.283924547051621, + "grad_norm": 2.58824315637412, + "learning_rate": 7.162430782337073e-06, + "loss": 0.4022, + "step": 4000 + }, + { + "epoch": 0.2846343584192501, + "grad_norm": 1.6807083864960135, + "learning_rate": 7.155331534857306e-06, + "loss": 0.3953, + "step": 4010 + }, + { + "epoch": 0.28534416978687915, + "grad_norm": 2.9052226494936706, + "learning_rate": 7.148232287377539e-06, + "loss": 0.3803, + "step": 4020 + }, + { + "epoch": 0.2860539811545082, + "grad_norm": 1.9518486816171219, + "learning_rate": 7.141133039897772e-06, + "loss": 0.4076, + "step": 4030 + }, + { + "epoch": 0.28676379252213724, + "grad_norm": 2.223176862483651, + "learning_rate": 7.134033792418004e-06, + "loss": 0.4058, + "step": 4040 + }, + { + "epoch": 0.2874736038897663, + "grad_norm": 2.2196780309614854, + "learning_rate": 7.1269345449382375e-06, + "loss": 0.3926, + "step": 4050 + }, + { + "epoch": 0.2881834152573953, + "grad_norm": 6.524368077094248, + "learning_rate": 7.11983529745847e-06, + "loss": 0.4172, + "step": 4060 + }, + { + "epoch": 0.2888932266250244, + "grad_norm": 5.292339769504148, + "learning_rate": 7.112736049978702e-06, + "loss": 0.3908, + "step": 4070 + }, + { + "epoch": 0.28960303799265347, + "grad_norm": 2.3067804343233282, + "learning_rate": 7.105636802498935e-06, + "loss": 0.3899, + "step": 4080 + }, + { + "epoch": 0.2903128493602825, + "grad_norm": 3.23451698379491, + "learning_rate": 7.098537555019168e-06, + "loss": 0.4078, + "step": 4090 + }, + { + "epoch": 0.29102266072791155, + "grad_norm": 1.9975711149406958, + "learning_rate": 7.091438307539401e-06, + "loss": 0.3892, + "step": 4100 + }, + { + "epoch": 0.2917324720955406, + "grad_norm": 2.172457996529036, + "learning_rate": 7.084339060059634e-06, + "loss": 0.4024, + "step": 4110 + }, + { + "epoch": 0.29244228346316964, + "grad_norm": 4.2611345539293985, + "learning_rate": 7.077239812579867e-06, + "loss": 0.4051, + "step": 4120 + }, + { + "epoch": 0.29315209483079874, + "grad_norm": 4.8499954927547915, + "learning_rate": 7.0701405651001e-06, + "loss": 0.4051, + "step": 4130 + }, + { + "epoch": 0.2938619061984278, + "grad_norm": 3.133374032170856, + "learning_rate": 7.063041317620333e-06, + "loss": 0.4113, + "step": 4140 + }, + { + "epoch": 0.2945717175660568, + "grad_norm": 3.0408556337828667, + "learning_rate": 7.055942070140566e-06, + "loss": 0.3918, + "step": 4150 + }, + { + "epoch": 0.29528152893368587, + "grad_norm": 2.967610716656761, + "learning_rate": 7.048842822660798e-06, + "loss": 0.3935, + "step": 4160 + }, + { + "epoch": 0.2959913403013149, + "grad_norm": 4.089654504142007, + "learning_rate": 7.041743575181031e-06, + "loss": 0.3812, + "step": 4170 + }, + { + "epoch": 0.29670115166894395, + "grad_norm": 6.123820735815897, + "learning_rate": 7.0346443277012644e-06, + "loss": 0.3894, + "step": 4180 + }, + { + "epoch": 0.29741096303657305, + "grad_norm": 9.52031358542494, + "learning_rate": 7.0275450802214975e-06, + "loss": 0.3933, + "step": 4190 + }, + { + "epoch": 0.2981207744042021, + "grad_norm": 4.241656002923987, + "learning_rate": 7.02044583274173e-06, + "loss": 0.3938, + "step": 4200 + }, + { + "epoch": 0.29883058577183114, + "grad_norm": 10.364254693083032, + "learning_rate": 7.013346585261963e-06, + "loss": 0.3939, + "step": 4210 + }, + { + "epoch": 0.2995403971394602, + "grad_norm": 2.493001703497579, + "learning_rate": 7.006247337782196e-06, + "loss": 0.3904, + "step": 4220 + }, + { + "epoch": 0.3002502085070892, + "grad_norm": 2.372260556132136, + "learning_rate": 6.999148090302429e-06, + "loss": 0.4002, + "step": 4230 + }, + { + "epoch": 0.3009600198747183, + "grad_norm": 4.447948099801884, + "learning_rate": 6.992048842822662e-06, + "loss": 0.3894, + "step": 4240 + }, + { + "epoch": 0.30166983124234736, + "grad_norm": 2.4733723007039847, + "learning_rate": 6.9849495953428944e-06, + "loss": 0.3863, + "step": 4250 + }, + { + "epoch": 0.3023796426099764, + "grad_norm": 11.318740156291982, + "learning_rate": 6.977850347863127e-06, + "loss": 0.3881, + "step": 4260 + }, + { + "epoch": 0.30308945397760545, + "grad_norm": 3.6328999006662563, + "learning_rate": 6.97075110038336e-06, + "loss": 0.3894, + "step": 4270 + }, + { + "epoch": 0.3037992653452345, + "grad_norm": 2.0376811180198353, + "learning_rate": 6.963651852903592e-06, + "loss": 0.3993, + "step": 4280 + }, + { + "epoch": 0.30450907671286354, + "grad_norm": 2.1376755414320625, + "learning_rate": 6.956552605423825e-06, + "loss": 0.3903, + "step": 4290 + }, + { + "epoch": 0.30521888808049263, + "grad_norm": 2.883515618882684, + "learning_rate": 6.949453357944058e-06, + "loss": 0.4082, + "step": 4300 + }, + { + "epoch": 0.3059286994481217, + "grad_norm": 2.0964398516334444, + "learning_rate": 6.942354110464291e-06, + "loss": 0.3857, + "step": 4310 + }, + { + "epoch": 0.3066385108157507, + "grad_norm": 5.410779818418891, + "learning_rate": 6.935254862984524e-06, + "loss": 0.391, + "step": 4320 + }, + { + "epoch": 0.30734832218337976, + "grad_norm": 4.439425532620099, + "learning_rate": 6.928155615504757e-06, + "loss": 0.4099, + "step": 4330 + }, + { + "epoch": 0.3080581335510088, + "grad_norm": 12.275643206811255, + "learning_rate": 6.92105636802499e-06, + "loss": 0.3953, + "step": 4340 + }, + { + "epoch": 0.30876794491863785, + "grad_norm": 5.947992733400443, + "learning_rate": 6.913957120545223e-06, + "loss": 0.3945, + "step": 4350 + }, + { + "epoch": 0.30947775628626695, + "grad_norm": 3.4397054213510843, + "learning_rate": 6.906857873065456e-06, + "loss": 0.3875, + "step": 4360 + }, + { + "epoch": 0.310187567653896, + "grad_norm": 41.88563893552131, + "learning_rate": 6.899758625585688e-06, + "loss": 0.3928, + "step": 4370 + }, + { + "epoch": 0.31089737902152503, + "grad_norm": 3.227989243444744, + "learning_rate": 6.892659378105921e-06, + "loss": 0.3908, + "step": 4380 + }, + { + "epoch": 0.3116071903891541, + "grad_norm": 22.897381721878148, + "learning_rate": 6.8855601306261545e-06, + "loss": 0.391, + "step": 4390 + }, + { + "epoch": 0.3123170017567831, + "grad_norm": 3.3630974135990406, + "learning_rate": 6.878460883146388e-06, + "loss": 0.374, + "step": 4400 + }, + { + "epoch": 0.31302681312441216, + "grad_norm": 4.877401136832981, + "learning_rate": 6.87136163566662e-06, + "loss": 0.3923, + "step": 4410 + }, + { + "epoch": 0.31373662449204126, + "grad_norm": 6.179682561885886, + "learning_rate": 6.864262388186853e-06, + "loss": 0.3865, + "step": 4420 + }, + { + "epoch": 0.3144464358596703, + "grad_norm": 4.8910756460648885, + "learning_rate": 6.857163140707086e-06, + "loss": 0.3865, + "step": 4430 + }, + { + "epoch": 0.31515624722729935, + "grad_norm": 3.260915462621521, + "learning_rate": 6.850063893227319e-06, + "loss": 0.3982, + "step": 4440 + }, + { + "epoch": 0.3158660585949284, + "grad_norm": 4.599472395508018, + "learning_rate": 6.842964645747551e-06, + "loss": 0.3961, + "step": 4450 + }, + { + "epoch": 0.31657586996255743, + "grad_norm": 7.776943140920524, + "learning_rate": 6.8358653982677845e-06, + "loss": 0.3873, + "step": 4460 + }, + { + "epoch": 0.3172856813301865, + "grad_norm": 3.0126570398502723, + "learning_rate": 6.828766150788017e-06, + "loss": 0.3859, + "step": 4470 + }, + { + "epoch": 0.3179954926978156, + "grad_norm": 1.935360939609241, + "learning_rate": 6.82166690330825e-06, + "loss": 0.3893, + "step": 4480 + }, + { + "epoch": 0.3187053040654446, + "grad_norm": 2.8545870894952055, + "learning_rate": 6.814567655828482e-06, + "loss": 0.3963, + "step": 4490 + }, + { + "epoch": 0.31941511543307366, + "grad_norm": 4.70013317139999, + "learning_rate": 6.807468408348715e-06, + "loss": 0.3939, + "step": 4500 + }, + { + "epoch": 0.3201249268007027, + "grad_norm": 3.264719904276936, + "learning_rate": 6.800369160868948e-06, + "loss": 0.3851, + "step": 4510 + }, + { + "epoch": 0.32083473816833175, + "grad_norm": 19.735683632874615, + "learning_rate": 6.793269913389181e-06, + "loss": 0.3722, + "step": 4520 + }, + { + "epoch": 0.32154454953596084, + "grad_norm": 2.501896594333183, + "learning_rate": 6.786170665909414e-06, + "loss": 0.3744, + "step": 4530 + }, + { + "epoch": 0.3222543609035899, + "grad_norm": 6.776418259400934, + "learning_rate": 6.779071418429647e-06, + "loss": 0.3868, + "step": 4540 + }, + { + "epoch": 0.32296417227121893, + "grad_norm": 7.759324029832955, + "learning_rate": 6.77197217094988e-06, + "loss": 0.3978, + "step": 4550 + }, + { + "epoch": 0.323673983638848, + "grad_norm": 5.1020465787210805, + "learning_rate": 6.764872923470113e-06, + "loss": 0.3756, + "step": 4560 + }, + { + "epoch": 0.324383795006477, + "grad_norm": 4.584721636805871, + "learning_rate": 6.757773675990345e-06, + "loss": 0.3962, + "step": 4570 + }, + { + "epoch": 0.32509360637410606, + "grad_norm": 5.227400251430727, + "learning_rate": 6.750674428510578e-06, + "loss": 0.3934, + "step": 4580 + }, + { + "epoch": 0.32580341774173516, + "grad_norm": 6.3055606292098645, + "learning_rate": 6.7435751810308114e-06, + "loss": 0.3921, + "step": 4590 + }, + { + "epoch": 0.3265132291093642, + "grad_norm": 3.6872617865325914, + "learning_rate": 6.7364759335510445e-06, + "loss": 0.3818, + "step": 4600 + }, + { + "epoch": 0.32722304047699324, + "grad_norm": 2.007884918336012, + "learning_rate": 6.729376686071278e-06, + "loss": 0.4005, + "step": 4610 + }, + { + "epoch": 0.3279328518446223, + "grad_norm": 5.042964957635144, + "learning_rate": 6.72227743859151e-06, + "loss": 0.3934, + "step": 4620 + }, + { + "epoch": 0.32864266321225133, + "grad_norm": 4.122572427444757, + "learning_rate": 6.715178191111743e-06, + "loss": 0.3835, + "step": 4630 + }, + { + "epoch": 0.32935247457988037, + "grad_norm": 4.528744366296638, + "learning_rate": 6.708078943631976e-06, + "loss": 0.3781, + "step": 4640 + }, + { + "epoch": 0.33006228594750947, + "grad_norm": 3.0405586193089107, + "learning_rate": 6.700979696152209e-06, + "loss": 0.4013, + "step": 4650 + }, + { + "epoch": 0.3307720973151385, + "grad_norm": 2.497528895602537, + "learning_rate": 6.6938804486724415e-06, + "loss": 0.4012, + "step": 4660 + }, + { + "epoch": 0.33148190868276756, + "grad_norm": 3.949569099861772, + "learning_rate": 6.686781201192674e-06, + "loss": 0.3791, + "step": 4670 + }, + { + "epoch": 0.3321917200503966, + "grad_norm": 2.9026740036563714, + "learning_rate": 6.679681953712907e-06, + "loss": 0.379, + "step": 4680 + }, + { + "epoch": 0.33290153141802564, + "grad_norm": 4.750694201369016, + "learning_rate": 6.672582706233139e-06, + "loss": 0.3962, + "step": 4690 + }, + { + "epoch": 0.3336113427856547, + "grad_norm": 4.9647752226572655, + "learning_rate": 6.665483458753372e-06, + "loss": 0.4014, + "step": 4700 + }, + { + "epoch": 0.3343211541532838, + "grad_norm": 5.007567374826438, + "learning_rate": 6.658384211273605e-06, + "loss": 0.386, + "step": 4710 + }, + { + "epoch": 0.3350309655209128, + "grad_norm": 24.665793733036637, + "learning_rate": 6.651284963793838e-06, + "loss": 0.3904, + "step": 4720 + }, + { + "epoch": 0.33574077688854187, + "grad_norm": 8.807448982539153, + "learning_rate": 6.6441857163140715e-06, + "loss": 0.3817, + "step": 4730 + }, + { + "epoch": 0.3364505882561709, + "grad_norm": 5.649488918187287, + "learning_rate": 6.637086468834304e-06, + "loss": 0.3952, + "step": 4740 + }, + { + "epoch": 0.33716039962379996, + "grad_norm": 10.030238684862177, + "learning_rate": 6.629987221354537e-06, + "loss": 0.3894, + "step": 4750 + }, + { + "epoch": 0.33787021099142905, + "grad_norm": 8.229307584465264, + "learning_rate": 6.62288797387477e-06, + "loss": 0.3777, + "step": 4760 + }, + { + "epoch": 0.3385800223590581, + "grad_norm": 4.702015980686352, + "learning_rate": 6.615788726395003e-06, + "loss": 0.3846, + "step": 4770 + }, + { + "epoch": 0.33928983372668714, + "grad_norm": 7.609531980298162, + "learning_rate": 6.608689478915235e-06, + "loss": 0.3876, + "step": 4780 + }, + { + "epoch": 0.3399996450943162, + "grad_norm": 9.359016840144466, + "learning_rate": 6.601590231435468e-06, + "loss": 0.3912, + "step": 4790 + }, + { + "epoch": 0.3407094564619452, + "grad_norm": 6.921512932106153, + "learning_rate": 6.5944909839557015e-06, + "loss": 0.3808, + "step": 4800 + }, + { + "epoch": 0.34141926782957427, + "grad_norm": 7.896921462163668, + "learning_rate": 6.587391736475935e-06, + "loss": 0.3822, + "step": 4810 + }, + { + "epoch": 0.34212907919720337, + "grad_norm": 41.265653283488135, + "learning_rate": 6.580292488996167e-06, + "loss": 0.3704, + "step": 4820 + }, + { + "epoch": 0.3428388905648324, + "grad_norm": 22.410728414840314, + "learning_rate": 6.5731932415164e-06, + "loss": 0.3879, + "step": 4830 + }, + { + "epoch": 0.34354870193246145, + "grad_norm": 28.36796548695283, + "learning_rate": 6.566093994036633e-06, + "loss": 0.3819, + "step": 4840 + }, + { + "epoch": 0.3442585133000905, + "grad_norm": 5.964443376270807, + "learning_rate": 6.558994746556866e-06, + "loss": 0.3793, + "step": 4850 + }, + { + "epoch": 0.34496832466771954, + "grad_norm": 4.876522423500047, + "learning_rate": 6.551895499077099e-06, + "loss": 0.3882, + "step": 4860 + }, + { + "epoch": 0.3456781360353486, + "grad_norm": 4.871742533391797, + "learning_rate": 6.544796251597331e-06, + "loss": 0.3896, + "step": 4870 + }, + { + "epoch": 0.3463879474029777, + "grad_norm": 11.91690423514364, + "learning_rate": 6.537697004117564e-06, + "loss": 0.3736, + "step": 4880 + }, + { + "epoch": 0.3470977587706067, + "grad_norm": 5.986322327762981, + "learning_rate": 6.530597756637797e-06, + "loss": 0.368, + "step": 4890 + }, + { + "epoch": 0.34780757013823577, + "grad_norm": 4.671637222361169, + "learning_rate": 6.523498509158029e-06, + "loss": 0.3722, + "step": 4900 + }, + { + "epoch": 0.3485173815058648, + "grad_norm": 16.438976188514197, + "learning_rate": 6.516399261678262e-06, + "loss": 0.3776, + "step": 4910 + }, + { + "epoch": 0.34922719287349385, + "grad_norm": 11.76911671905372, + "learning_rate": 6.509300014198495e-06, + "loss": 0.3987, + "step": 4920 + }, + { + "epoch": 0.3499370042411229, + "grad_norm": 12.380867918847773, + "learning_rate": 6.502200766718728e-06, + "loss": 0.3949, + "step": 4930 + }, + { + "epoch": 0.350646815608752, + "grad_norm": 8.367704037629133, + "learning_rate": 6.495101519238961e-06, + "loss": 0.3767, + "step": 4940 + }, + { + "epoch": 0.35135662697638104, + "grad_norm": 74.35690108296033, + "learning_rate": 6.488002271759194e-06, + "loss": 0.3819, + "step": 4950 + }, + { + "epoch": 0.3520664383440101, + "grad_norm": 16.231219614665278, + "learning_rate": 6.480903024279427e-06, + "loss": 0.3859, + "step": 4960 + }, + { + "epoch": 0.3527762497116391, + "grad_norm": 9.060846103909238, + "learning_rate": 6.47380377679966e-06, + "loss": 0.394, + "step": 4970 + }, + { + "epoch": 0.35348606107926817, + "grad_norm": 21.88016531222193, + "learning_rate": 6.466704529319893e-06, + "loss": 0.4167, + "step": 4980 + }, + { + "epoch": 0.35419587244689726, + "grad_norm": 9.919040843315045, + "learning_rate": 6.459605281840125e-06, + "loss": 0.4192, + "step": 4990 + }, + { + "epoch": 0.3549056838145263, + "grad_norm": 5.183299722151934, + "learning_rate": 6.4525060343603584e-06, + "loss": 0.4249, + "step": 5000 + }, + { + "epoch": 0.35561549518215535, + "grad_norm": 8.847185946354221, + "learning_rate": 6.4454067868805915e-06, + "loss": 0.4112, + "step": 5010 + }, + { + "epoch": 0.3563253065497844, + "grad_norm": 11.864215621262682, + "learning_rate": 6.438307539400825e-06, + "loss": 0.4165, + "step": 5020 + }, + { + "epoch": 0.35703511791741344, + "grad_norm": 3.3703428369603503, + "learning_rate": 6.431208291921057e-06, + "loss": 0.3978, + "step": 5030 + }, + { + "epoch": 0.3577449292850425, + "grad_norm": 5.015316577294299, + "learning_rate": 6.42410904444129e-06, + "loss": 0.3872, + "step": 5040 + }, + { + "epoch": 0.3584547406526716, + "grad_norm": 4.2137919102595305, + "learning_rate": 6.417009796961523e-06, + "loss": 0.3766, + "step": 5050 + }, + { + "epoch": 0.3591645520203006, + "grad_norm": 3.0372315306510056, + "learning_rate": 6.409910549481756e-06, + "loss": 0.3842, + "step": 5060 + }, + { + "epoch": 0.35987436338792966, + "grad_norm": 2.7515400586423318, + "learning_rate": 6.4028113020019885e-06, + "loss": 0.3993, + "step": 5070 + }, + { + "epoch": 0.3605841747555587, + "grad_norm": 9.185207292504243, + "learning_rate": 6.395712054522221e-06, + "loss": 0.3875, + "step": 5080 + }, + { + "epoch": 0.36129398612318775, + "grad_norm": 19.515842749867563, + "learning_rate": 6.388612807042454e-06, + "loss": 0.4035, + "step": 5090 + }, + { + "epoch": 0.3620037974908168, + "grad_norm": 12.30636697197178, + "learning_rate": 6.381513559562686e-06, + "loss": 0.4035, + "step": 5100 + }, + { + "epoch": 0.3627136088584459, + "grad_norm": 6.732979846623905, + "learning_rate": 6.374414312082919e-06, + "loss": 0.4079, + "step": 5110 + }, + { + "epoch": 0.36342342022607493, + "grad_norm": 6.642326962423095, + "learning_rate": 6.367315064603152e-06, + "loss": 0.3945, + "step": 5120 + }, + { + "epoch": 0.364133231593704, + "grad_norm": 6.314154234087903, + "learning_rate": 6.360215817123385e-06, + "loss": 0.394, + "step": 5130 + }, + { + "epoch": 0.364843042961333, + "grad_norm": 4.760512258914551, + "learning_rate": 6.3531165696436185e-06, + "loss": 0.3863, + "step": 5140 + }, + { + "epoch": 0.36555285432896206, + "grad_norm": 4.048747245175314, + "learning_rate": 6.346017322163851e-06, + "loss": 0.3863, + "step": 5150 + }, + { + "epoch": 0.3662626656965911, + "grad_norm": 4.190578946223062, + "learning_rate": 6.338918074684084e-06, + "loss": 0.3723, + "step": 5160 + }, + { + "epoch": 0.3669724770642202, + "grad_norm": 4.175965799380943, + "learning_rate": 6.331818827204317e-06, + "loss": 0.3889, + "step": 5170 + }, + { + "epoch": 0.36768228843184925, + "grad_norm": 4.807186811656143, + "learning_rate": 6.32471957972455e-06, + "loss": 0.3874, + "step": 5180 + }, + { + "epoch": 0.3683920997994783, + "grad_norm": 6.659345248185456, + "learning_rate": 6.317620332244782e-06, + "loss": 0.3711, + "step": 5190 + }, + { + "epoch": 0.36910191116710733, + "grad_norm": 7.2186380945453905, + "learning_rate": 6.310521084765015e-06, + "loss": 0.3827, + "step": 5200 + }, + { + "epoch": 0.3698117225347364, + "grad_norm": 5.005630183658748, + "learning_rate": 6.3034218372852485e-06, + "loss": 0.3983, + "step": 5210 + }, + { + "epoch": 0.3705215339023654, + "grad_norm": 3.527405153009429, + "learning_rate": 6.296322589805482e-06, + "loss": 0.367, + "step": 5220 + }, + { + "epoch": 0.3712313452699945, + "grad_norm": 3.882199465110045, + "learning_rate": 6.289223342325715e-06, + "loss": 0.3883, + "step": 5230 + }, + { + "epoch": 0.37194115663762356, + "grad_norm": 7.463055050907344, + "learning_rate": 6.282124094845947e-06, + "loss": 0.3823, + "step": 5240 + }, + { + "epoch": 0.3726509680052526, + "grad_norm": 8.000906237369843, + "learning_rate": 6.27502484736618e-06, + "loss": 0.383, + "step": 5250 + }, + { + "epoch": 0.37336077937288165, + "grad_norm": 8.362063303535368, + "learning_rate": 6.267925599886413e-06, + "loss": 0.3893, + "step": 5260 + }, + { + "epoch": 0.3740705907405107, + "grad_norm": 4.721914441661691, + "learning_rate": 6.260826352406646e-06, + "loss": 0.3763, + "step": 5270 + }, + { + "epoch": 0.3747804021081398, + "grad_norm": 12.175797518430029, + "learning_rate": 6.253727104926878e-06, + "loss": 0.3977, + "step": 5280 + }, + { + "epoch": 0.37549021347576883, + "grad_norm": 9.814402397906687, + "learning_rate": 6.246627857447111e-06, + "loss": 0.3716, + "step": 5290 + }, + { + "epoch": 0.3762000248433979, + "grad_norm": 47.1450002499556, + "learning_rate": 6.239528609967344e-06, + "loss": 0.3792, + "step": 5300 + }, + { + "epoch": 0.3769098362110269, + "grad_norm": 27.513481595283608, + "learning_rate": 6.232429362487576e-06, + "loss": 0.3734, + "step": 5310 + }, + { + "epoch": 0.37761964757865596, + "grad_norm": 48.09984812385904, + "learning_rate": 6.225330115007809e-06, + "loss": 0.3873, + "step": 5320 + }, + { + "epoch": 0.378329458946285, + "grad_norm": 5.065884658180426, + "learning_rate": 6.218230867528042e-06, + "loss": 0.39, + "step": 5330 + }, + { + "epoch": 0.3790392703139141, + "grad_norm": 9.226418902203303, + "learning_rate": 6.2111316200482754e-06, + "loss": 0.3819, + "step": 5340 + }, + { + "epoch": 0.37974908168154314, + "grad_norm": 6.998201025336219, + "learning_rate": 6.204032372568508e-06, + "loss": 0.3818, + "step": 5350 + }, + { + "epoch": 0.3804588930491722, + "grad_norm": 4.086309894015096, + "learning_rate": 6.196933125088741e-06, + "loss": 0.3573, + "step": 5360 + }, + { + "epoch": 0.38116870441680123, + "grad_norm": 8.280993749723958, + "learning_rate": 6.189833877608974e-06, + "loss": 0.3763, + "step": 5370 + }, + { + "epoch": 0.3818785157844303, + "grad_norm": 4.086208683086361, + "learning_rate": 6.182734630129207e-06, + "loss": 0.3754, + "step": 5380 + }, + { + "epoch": 0.3825883271520593, + "grad_norm": 5.958244425553627, + "learning_rate": 6.17563538264944e-06, + "loss": 0.3844, + "step": 5390 + }, + { + "epoch": 0.3832981385196884, + "grad_norm": 3.580000162662889, + "learning_rate": 6.168536135169672e-06, + "loss": 0.382, + "step": 5400 + }, + { + "epoch": 0.38400794988731746, + "grad_norm": 2.986600327490101, + "learning_rate": 6.1614368876899054e-06, + "loss": 0.3722, + "step": 5410 + }, + { + "epoch": 0.3847177612549465, + "grad_norm": 3.253411703330411, + "learning_rate": 6.1543376402101386e-06, + "loss": 0.3723, + "step": 5420 + }, + { + "epoch": 0.38542757262257554, + "grad_norm": 5.02266916683139, + "learning_rate": 6.147238392730372e-06, + "loss": 0.353, + "step": 5430 + }, + { + "epoch": 0.3861373839902046, + "grad_norm": 6.509810117314743, + "learning_rate": 6.140139145250604e-06, + "loss": 0.3859, + "step": 5440 + }, + { + "epoch": 0.38684719535783363, + "grad_norm": 3.024955665262126, + "learning_rate": 6.133039897770837e-06, + "loss": 0.3929, + "step": 5450 + }, + { + "epoch": 0.3875570067254627, + "grad_norm": 3.1517938939602206, + "learning_rate": 6.12594065029107e-06, + "loss": 0.3899, + "step": 5460 + }, + { + "epoch": 0.38826681809309177, + "grad_norm": 4.545747430477116, + "learning_rate": 6.118841402811303e-06, + "loss": 0.376, + "step": 5470 + }, + { + "epoch": 0.3889766294607208, + "grad_norm": 4.069699163399179, + "learning_rate": 6.111742155331535e-06, + "loss": 0.3813, + "step": 5480 + }, + { + "epoch": 0.38968644082834986, + "grad_norm": 3.562062075517251, + "learning_rate": 6.104642907851768e-06, + "loss": 0.383, + "step": 5490 + }, + { + "epoch": 0.3903962521959789, + "grad_norm": 9.15980720106711, + "learning_rate": 6.097543660372001e-06, + "loss": 0.3921, + "step": 5500 + }, + { + "epoch": 0.391106063563608, + "grad_norm": 4.449111409231249, + "learning_rate": 6.090444412892234e-06, + "loss": 0.3823, + "step": 5510 + }, + { + "epoch": 0.39181587493123704, + "grad_norm": 12.724861852641904, + "learning_rate": 6.083345165412466e-06, + "loss": 0.3851, + "step": 5520 + }, + { + "epoch": 0.3925256862988661, + "grad_norm": 6.615402324691555, + "learning_rate": 6.076245917932699e-06, + "loss": 0.3667, + "step": 5530 + }, + { + "epoch": 0.3932354976664951, + "grad_norm": 8.817203015753774, + "learning_rate": 6.069146670452932e-06, + "loss": 0.3886, + "step": 5540 + }, + { + "epoch": 0.39394530903412417, + "grad_norm": 9.192960733910674, + "learning_rate": 6.0620474229731655e-06, + "loss": 0.3794, + "step": 5550 + }, + { + "epoch": 0.3946551204017532, + "grad_norm": 4.825188565131958, + "learning_rate": 6.054948175493398e-06, + "loss": 0.3786, + "step": 5560 + }, + { + "epoch": 0.3953649317693823, + "grad_norm": 6.68078822940831, + "learning_rate": 6.047848928013631e-06, + "loss": 0.3835, + "step": 5570 + }, + { + "epoch": 0.39607474313701135, + "grad_norm": 2.6400726840916175, + "learning_rate": 6.040749680533864e-06, + "loss": 0.381, + "step": 5580 + }, + { + "epoch": 0.3967845545046404, + "grad_norm": 3.6668671304324967, + "learning_rate": 6.033650433054097e-06, + "loss": 0.3745, + "step": 5590 + }, + { + "epoch": 0.39749436587226944, + "grad_norm": 2.639833206365908, + "learning_rate": 6.026551185574329e-06, + "loss": 0.3777, + "step": 5600 + }, + { + "epoch": 0.3982041772398985, + "grad_norm": 3.79888213287165, + "learning_rate": 6.019451938094562e-06, + "loss": 0.3911, + "step": 5610 + }, + { + "epoch": 0.3989139886075275, + "grad_norm": 5.09183422587413, + "learning_rate": 6.0123526906147955e-06, + "loss": 0.3832, + "step": 5620 + }, + { + "epoch": 0.3996237999751566, + "grad_norm": 3.3401895175000926, + "learning_rate": 6.005253443135029e-06, + "loss": 0.3862, + "step": 5630 + }, + { + "epoch": 0.40033361134278567, + "grad_norm": 2.5702329959348726, + "learning_rate": 5.998154195655262e-06, + "loss": 0.3934, + "step": 5640 + }, + { + "epoch": 0.4010434227104147, + "grad_norm": 3.0044071678975937, + "learning_rate": 5.991054948175494e-06, + "loss": 0.3826, + "step": 5650 + }, + { + "epoch": 0.40175323407804375, + "grad_norm": 2.412654779599852, + "learning_rate": 5.983955700695727e-06, + "loss": 0.3969, + "step": 5660 + }, + { + "epoch": 0.4024630454456728, + "grad_norm": 3.0767944703908356, + "learning_rate": 5.97685645321596e-06, + "loss": 0.3961, + "step": 5670 + }, + { + "epoch": 0.40317285681330184, + "grad_norm": 2.8053230371522124, + "learning_rate": 5.969757205736193e-06, + "loss": 0.3869, + "step": 5680 + }, + { + "epoch": 0.40388266818093094, + "grad_norm": 7.472643121749521, + "learning_rate": 5.962657958256425e-06, + "loss": 0.3851, + "step": 5690 + }, + { + "epoch": 0.40459247954856, + "grad_norm": 14.585388143398843, + "learning_rate": 5.955558710776658e-06, + "loss": 0.3905, + "step": 5700 + }, + { + "epoch": 0.405302290916189, + "grad_norm": 4.416692599365141, + "learning_rate": 5.948459463296891e-06, + "loss": 0.3862, + "step": 5710 + }, + { + "epoch": 0.40601210228381807, + "grad_norm": 3.4729116521336776, + "learning_rate": 5.941360215817123e-06, + "loss": 0.402, + "step": 5720 + }, + { + "epoch": 0.4067219136514471, + "grad_norm": 3.5423705326787114, + "learning_rate": 5.934260968337356e-06, + "loss": 0.3884, + "step": 5730 + }, + { + "epoch": 0.4074317250190762, + "grad_norm": 3.1365000657861497, + "learning_rate": 5.927161720857589e-06, + "loss": 0.3825, + "step": 5740 + }, + { + "epoch": 0.40814153638670525, + "grad_norm": 5.219488757508086, + "learning_rate": 5.9200624733778224e-06, + "loss": 0.3894, + "step": 5750 + }, + { + "epoch": 0.4088513477543343, + "grad_norm": 3.596909048940233, + "learning_rate": 5.9129632258980555e-06, + "loss": 0.3831, + "step": 5760 + }, + { + "epoch": 0.40956115912196334, + "grad_norm": 2.476134224023759, + "learning_rate": 5.905863978418288e-06, + "loss": 0.3825, + "step": 5770 + }, + { + "epoch": 0.4102709704895924, + "grad_norm": 3.407930958961138, + "learning_rate": 5.898764730938521e-06, + "loss": 0.3714, + "step": 5780 + }, + { + "epoch": 0.4109807818572214, + "grad_norm": 3.6349280667767636, + "learning_rate": 5.891665483458754e-06, + "loss": 0.3949, + "step": 5790 + }, + { + "epoch": 0.4116905932248505, + "grad_norm": 10.032880290815127, + "learning_rate": 5.884566235978987e-06, + "loss": 0.3827, + "step": 5800 + }, + { + "epoch": 0.41240040459247956, + "grad_norm": 4.403552459945297, + "learning_rate": 5.877466988499219e-06, + "loss": 0.3738, + "step": 5810 + }, + { + "epoch": 0.4131102159601086, + "grad_norm": 3.2630803210797086, + "learning_rate": 5.8703677410194525e-06, + "loss": 0.3947, + "step": 5820 + }, + { + "epoch": 0.41382002732773765, + "grad_norm": 11.228663057773362, + "learning_rate": 5.8632684935396856e-06, + "loss": 0.3825, + "step": 5830 + }, + { + "epoch": 0.4145298386953667, + "grad_norm": 18.33844649221444, + "learning_rate": 5.856169246059919e-06, + "loss": 0.381, + "step": 5840 + }, + { + "epoch": 0.41523965006299574, + "grad_norm": 14.576257048715338, + "learning_rate": 5.849069998580152e-06, + "loss": 0.389, + "step": 5850 + }, + { + "epoch": 0.41594946143062483, + "grad_norm": 3.3799659706310177, + "learning_rate": 5.841970751100384e-06, + "loss": 0.3687, + "step": 5860 + }, + { + "epoch": 0.4166592727982539, + "grad_norm": 4.306786145673671, + "learning_rate": 5.834871503620617e-06, + "loss": 0.3846, + "step": 5870 + }, + { + "epoch": 0.4173690841658829, + "grad_norm": 2.71585444285802, + "learning_rate": 5.82777225614085e-06, + "loss": 0.397, + "step": 5880 + }, + { + "epoch": 0.41807889553351196, + "grad_norm": 4.530639455269193, + "learning_rate": 5.820673008661082e-06, + "loss": 0.3633, + "step": 5890 + }, + { + "epoch": 0.418788706901141, + "grad_norm": 5.299365856406392, + "learning_rate": 5.813573761181315e-06, + "loss": 0.3854, + "step": 5900 + }, + { + "epoch": 0.41949851826877005, + "grad_norm": 3.5533453867575786, + "learning_rate": 5.806474513701548e-06, + "loss": 0.3855, + "step": 5910 + }, + { + "epoch": 0.42020832963639915, + "grad_norm": 9.388008852057116, + "learning_rate": 5.799375266221781e-06, + "loss": 0.3911, + "step": 5920 + }, + { + "epoch": 0.4209181410040282, + "grad_norm": 3.378607546141685, + "learning_rate": 5.792276018742013e-06, + "loss": 0.3751, + "step": 5930 + }, + { + "epoch": 0.42162795237165723, + "grad_norm": 12.222073948575716, + "learning_rate": 5.785176771262246e-06, + "loss": 0.3778, + "step": 5940 + }, + { + "epoch": 0.4223377637392863, + "grad_norm": 4.297952573306613, + "learning_rate": 5.778077523782479e-06, + "loss": 0.3827, + "step": 5950 + }, + { + "epoch": 0.4230475751069153, + "grad_norm": 9.764464171752504, + "learning_rate": 5.7709782763027125e-06, + "loss": 0.3893, + "step": 5960 + }, + { + "epoch": 0.4237573864745444, + "grad_norm": 3.7569225597805658, + "learning_rate": 5.763879028822945e-06, + "loss": 0.3901, + "step": 5970 + }, + { + "epoch": 0.42446719784217346, + "grad_norm": 3.0005485619903824, + "learning_rate": 5.756779781343178e-06, + "loss": 0.3753, + "step": 5980 + }, + { + "epoch": 0.4251770092098025, + "grad_norm": 6.457104695432505, + "learning_rate": 5.749680533863411e-06, + "loss": 0.3585, + "step": 5990 + }, + { + "epoch": 0.42588682057743155, + "grad_norm": 4.252684527352716, + "learning_rate": 5.742581286383644e-06, + "loss": 0.3745, + "step": 6000 + }, + { + "epoch": 0.4265966319450606, + "grad_norm": 3.3319349737549673, + "learning_rate": 5.735482038903877e-06, + "loss": 0.3836, + "step": 6010 + }, + { + "epoch": 0.42730644331268963, + "grad_norm": 4.333001859655407, + "learning_rate": 5.728382791424109e-06, + "loss": 0.3698, + "step": 6020 + }, + { + "epoch": 0.42801625468031873, + "grad_norm": 3.9838864194561343, + "learning_rate": 5.7212835439443425e-06, + "loss": 0.3686, + "step": 6030 + }, + { + "epoch": 0.4287260660479478, + "grad_norm": 3.206673737162168, + "learning_rate": 5.714184296464576e-06, + "loss": 0.374, + "step": 6040 + }, + { + "epoch": 0.4294358774155768, + "grad_norm": 7.910008181832549, + "learning_rate": 5.707085048984809e-06, + "loss": 0.3731, + "step": 6050 + }, + { + "epoch": 0.43014568878320586, + "grad_norm": 11.533279860672804, + "learning_rate": 5.699985801505041e-06, + "loss": 0.3842, + "step": 6060 + }, + { + "epoch": 0.4308555001508349, + "grad_norm": 4.06817553254219, + "learning_rate": 5.692886554025274e-06, + "loss": 0.3717, + "step": 6070 + }, + { + "epoch": 0.43156531151846395, + "grad_norm": 12.082596102938004, + "learning_rate": 5.685787306545507e-06, + "loss": 0.3971, + "step": 6080 + }, + { + "epoch": 0.43227512288609304, + "grad_norm": 2.685455478240202, + "learning_rate": 5.678688059065739e-06, + "loss": 0.3822, + "step": 6090 + }, + { + "epoch": 0.4329849342537221, + "grad_norm": 3.1399973614222643, + "learning_rate": 5.671588811585972e-06, + "loss": 0.3774, + "step": 6100 + }, + { + "epoch": 0.43369474562135113, + "grad_norm": 3.518374812592983, + "learning_rate": 5.664489564106205e-06, + "loss": 0.3781, + "step": 6110 + }, + { + "epoch": 0.4344045569889802, + "grad_norm": 4.803932844471321, + "learning_rate": 5.657390316626438e-06, + "loss": 0.3757, + "step": 6120 + }, + { + "epoch": 0.4351143683566092, + "grad_norm": 12.690594810777407, + "learning_rate": 5.650291069146671e-06, + "loss": 0.3747, + "step": 6130 + }, + { + "epoch": 0.43582417972423826, + "grad_norm": 10.80688099347966, + "learning_rate": 5.643191821666903e-06, + "loss": 0.3676, + "step": 6140 + }, + { + "epoch": 0.43653399109186736, + "grad_norm": 4.232034052682343, + "learning_rate": 5.636092574187136e-06, + "loss": 0.395, + "step": 6150 + }, + { + "epoch": 0.4372438024594964, + "grad_norm": 3.422739256279243, + "learning_rate": 5.6289933267073694e-06, + "loss": 0.3693, + "step": 6160 + }, + { + "epoch": 0.43795361382712544, + "grad_norm": 32.06006758689784, + "learning_rate": 5.6218940792276025e-06, + "loss": 0.3782, + "step": 6170 + }, + { + "epoch": 0.4386634251947545, + "grad_norm": 5.623034465377633, + "learning_rate": 5.614794831747835e-06, + "loss": 0.3813, + "step": 6180 + }, + { + "epoch": 0.43937323656238353, + "grad_norm": 10.612805886316337, + "learning_rate": 5.607695584268068e-06, + "loss": 0.3702, + "step": 6190 + }, + { + "epoch": 0.44008304793001257, + "grad_norm": 6.077674805742986, + "learning_rate": 5.600596336788301e-06, + "loss": 0.3643, + "step": 6200 + }, + { + "epoch": 0.44079285929764167, + "grad_norm": 7.053795971115957, + "learning_rate": 5.593497089308534e-06, + "loss": 0.3911, + "step": 6210 + }, + { + "epoch": 0.4415026706652707, + "grad_norm": 6.212842792838621, + "learning_rate": 5.586397841828766e-06, + "loss": 0.3774, + "step": 6220 + }, + { + "epoch": 0.44221248203289976, + "grad_norm": 7.598832178623656, + "learning_rate": 5.5792985943489995e-06, + "loss": 0.3808, + "step": 6230 + }, + { + "epoch": 0.4429222934005288, + "grad_norm": 14.834315377312098, + "learning_rate": 5.5721993468692326e-06, + "loss": 0.3765, + "step": 6240 + }, + { + "epoch": 0.44363210476815784, + "grad_norm": 15.459970963070427, + "learning_rate": 5.565100099389466e-06, + "loss": 0.3863, + "step": 6250 + }, + { + "epoch": 0.44434191613578694, + "grad_norm": 5.002895033502256, + "learning_rate": 5.558000851909699e-06, + "loss": 0.3718, + "step": 6260 + }, + { + "epoch": 0.445051727503416, + "grad_norm": 4.67592371180372, + "learning_rate": 5.550901604429931e-06, + "loss": 0.3869, + "step": 6270 + }, + { + "epoch": 0.445761538871045, + "grad_norm": 4.246040554798665, + "learning_rate": 5.543802356950164e-06, + "loss": 0.3673, + "step": 6280 + }, + { + "epoch": 0.44647135023867407, + "grad_norm": 5.698576828390134, + "learning_rate": 5.536703109470397e-06, + "loss": 0.3733, + "step": 6290 + }, + { + "epoch": 0.4471811616063031, + "grad_norm": 4.890818923695549, + "learning_rate": 5.529603861990629e-06, + "loss": 0.3917, + "step": 6300 + }, + { + "epoch": 0.44789097297393216, + "grad_norm": 3.5954099385229, + "learning_rate": 5.522504614510862e-06, + "loss": 0.387, + "step": 6310 + }, + { + "epoch": 0.44860078434156125, + "grad_norm": 5.819667912733057, + "learning_rate": 5.515405367031095e-06, + "loss": 0.3772, + "step": 6320 + }, + { + "epoch": 0.4493105957091903, + "grad_norm": 4.924613328068802, + "learning_rate": 5.508306119551328e-06, + "loss": 0.3691, + "step": 6330 + }, + { + "epoch": 0.45002040707681934, + "grad_norm": 4.077670226838275, + "learning_rate": 5.50120687207156e-06, + "loss": 0.3606, + "step": 6340 + }, + { + "epoch": 0.4507302184444484, + "grad_norm": 4.7425966011878815, + "learning_rate": 5.494107624591793e-06, + "loss": 0.3712, + "step": 6350 + }, + { + "epoch": 0.4514400298120774, + "grad_norm": 3.7724063921848, + "learning_rate": 5.487008377112026e-06, + "loss": 0.3707, + "step": 6360 + }, + { + "epoch": 0.45214984117970647, + "grad_norm": 2.8597041255348183, + "learning_rate": 5.4799091296322595e-06, + "loss": 0.364, + "step": 6370 + }, + { + "epoch": 0.45285965254733557, + "grad_norm": 5.386440052681094, + "learning_rate": 5.472809882152493e-06, + "loss": 0.3785, + "step": 6380 + }, + { + "epoch": 0.4535694639149646, + "grad_norm": 4.20147189666546, + "learning_rate": 5.465710634672725e-06, + "loss": 0.384, + "step": 6390 + }, + { + "epoch": 0.45427927528259365, + "grad_norm": 5.4360613411555185, + "learning_rate": 5.458611387192958e-06, + "loss": 0.3676, + "step": 6400 + }, + { + "epoch": 0.4549890866502227, + "grad_norm": 7.4543272167324846, + "learning_rate": 5.451512139713191e-06, + "loss": 0.3973, + "step": 6410 + }, + { + "epoch": 0.45569889801785174, + "grad_norm": 5.302161729787796, + "learning_rate": 5.444412892233424e-06, + "loss": 0.3878, + "step": 6420 + }, + { + "epoch": 0.4564087093854808, + "grad_norm": 4.774927845954586, + "learning_rate": 5.437313644753656e-06, + "loss": 0.368, + "step": 6430 + }, + { + "epoch": 0.4571185207531099, + "grad_norm": 4.733108202290537, + "learning_rate": 5.4302143972738895e-06, + "loss": 0.3841, + "step": 6440 + }, + { + "epoch": 0.4578283321207389, + "grad_norm": 4.581655513075473, + "learning_rate": 5.423115149794123e-06, + "loss": 0.3805, + "step": 6450 + }, + { + "epoch": 0.45853814348836797, + "grad_norm": 2.4364404744853445, + "learning_rate": 5.416015902314356e-06, + "loss": 0.3587, + "step": 6460 + }, + { + "epoch": 0.459247954855997, + "grad_norm": 5.16394378928267, + "learning_rate": 5.408916654834588e-06, + "loss": 0.3793, + "step": 6470 + }, + { + "epoch": 0.45995776622362605, + "grad_norm": 8.232574335670192, + "learning_rate": 5.401817407354821e-06, + "loss": 0.3794, + "step": 6480 + }, + { + "epoch": 0.46066757759125515, + "grad_norm": 10.509485180483269, + "learning_rate": 5.394718159875054e-06, + "loss": 0.3742, + "step": 6490 + }, + { + "epoch": 0.4613773889588842, + "grad_norm": 3.418180521754276, + "learning_rate": 5.387618912395286e-06, + "loss": 0.3733, + "step": 6500 + }, + { + "epoch": 0.46208720032651324, + "grad_norm": 4.2689703556593495, + "learning_rate": 5.380519664915519e-06, + "loss": 0.374, + "step": 6510 + }, + { + "epoch": 0.4627970116941423, + "grad_norm": 7.896842999549548, + "learning_rate": 5.373420417435752e-06, + "loss": 0.3799, + "step": 6520 + }, + { + "epoch": 0.4635068230617713, + "grad_norm": 3.4870838077093893, + "learning_rate": 5.366321169955985e-06, + "loss": 0.3712, + "step": 6530 + }, + { + "epoch": 0.46421663442940037, + "grad_norm": 27.778526824166995, + "learning_rate": 5.359221922476218e-06, + "loss": 0.3655, + "step": 6540 + }, + { + "epoch": 0.46492644579702946, + "grad_norm": 16.796202092439216, + "learning_rate": 5.35212267499645e-06, + "loss": 0.3846, + "step": 6550 + }, + { + "epoch": 0.4656362571646585, + "grad_norm": 5.698856930659158, + "learning_rate": 5.345023427516683e-06, + "loss": 0.3877, + "step": 6560 + }, + { + "epoch": 0.46634606853228755, + "grad_norm": 8.694016798434083, + "learning_rate": 5.3379241800369165e-06, + "loss": 0.3607, + "step": 6570 + }, + { + "epoch": 0.4670558798999166, + "grad_norm": 3.617969654098083, + "learning_rate": 5.3308249325571496e-06, + "loss": 0.36, + "step": 6580 + }, + { + "epoch": 0.46776569126754564, + "grad_norm": 7.181014577384461, + "learning_rate": 5.323725685077382e-06, + "loss": 0.3783, + "step": 6590 + }, + { + "epoch": 0.4684755026351747, + "grad_norm": 9.52331650225055, + "learning_rate": 5.316626437597615e-06, + "loss": 0.3707, + "step": 6600 + }, + { + "epoch": 0.4691853140028038, + "grad_norm": 5.927560976046885, + "learning_rate": 5.309527190117848e-06, + "loss": 0.3747, + "step": 6610 + }, + { + "epoch": 0.4698951253704328, + "grad_norm": 33.354649195054265, + "learning_rate": 5.302427942638081e-06, + "loss": 0.3622, + "step": 6620 + }, + { + "epoch": 0.47060493673806186, + "grad_norm": 5.109478632635811, + "learning_rate": 5.295328695158314e-06, + "loss": 0.3702, + "step": 6630 + }, + { + "epoch": 0.4713147481056909, + "grad_norm": 62.14127099005149, + "learning_rate": 5.2882294476785465e-06, + "loss": 0.3718, + "step": 6640 + }, + { + "epoch": 0.47202455947331995, + "grad_norm": 3.9646315343813674, + "learning_rate": 5.2811302001987796e-06, + "loss": 0.3579, + "step": 6650 + }, + { + "epoch": 0.472734370840949, + "grad_norm": 5.822229945732986, + "learning_rate": 5.274030952719013e-06, + "loss": 0.358, + "step": 6660 + }, + { + "epoch": 0.4734441822085781, + "grad_norm": 3.0706990453586607, + "learning_rate": 5.266931705239246e-06, + "loss": 0.3712, + "step": 6670 + }, + { + "epoch": 0.47415399357620713, + "grad_norm": 2.763541771977754, + "learning_rate": 5.259832457759478e-06, + "loss": 0.3862, + "step": 6680 + }, + { + "epoch": 0.4748638049438362, + "grad_norm": 2.8054880505902746, + "learning_rate": 5.252733210279711e-06, + "loss": 0.3609, + "step": 6690 + }, + { + "epoch": 0.4755736163114652, + "grad_norm": 3.5455500616555864, + "learning_rate": 5.245633962799943e-06, + "loss": 0.3845, + "step": 6700 + }, + { + "epoch": 0.47628342767909426, + "grad_norm": 6.871049315984216, + "learning_rate": 5.238534715320176e-06, + "loss": 0.3681, + "step": 6710 + }, + { + "epoch": 0.47699323904672336, + "grad_norm": 4.626136895991325, + "learning_rate": 5.231435467840409e-06, + "loss": 0.3694, + "step": 6720 + }, + { + "epoch": 0.4777030504143524, + "grad_norm": 4.1689737774582385, + "learning_rate": 5.224336220360642e-06, + "loss": 0.3722, + "step": 6730 + }, + { + "epoch": 0.47841286178198145, + "grad_norm": 2.345831388882716, + "learning_rate": 5.217236972880875e-06, + "loss": 0.3778, + "step": 6740 + }, + { + "epoch": 0.4791226731496105, + "grad_norm": 5.181993551246977, + "learning_rate": 5.210137725401107e-06, + "loss": 0.3649, + "step": 6750 + }, + { + "epoch": 0.47983248451723953, + "grad_norm": 4.144025528380454, + "learning_rate": 5.20303847792134e-06, + "loss": 0.3854, + "step": 6760 + }, + { + "epoch": 0.4805422958848686, + "grad_norm": 4.0013049178877536, + "learning_rate": 5.195939230441573e-06, + "loss": 0.3832, + "step": 6770 + }, + { + "epoch": 0.4812521072524977, + "grad_norm": 4.375334224867565, + "learning_rate": 5.1888399829618065e-06, + "loss": 0.3678, + "step": 6780 + }, + { + "epoch": 0.4819619186201267, + "grad_norm": 2.8158913555106926, + "learning_rate": 5.18174073548204e-06, + "loss": 0.3735, + "step": 6790 + }, + { + "epoch": 0.48267172998775576, + "grad_norm": 4.286259213586135, + "learning_rate": 5.174641488002272e-06, + "loss": 0.3824, + "step": 6800 + }, + { + "epoch": 0.4833815413553848, + "grad_norm": 2.917255310557774, + "learning_rate": 5.167542240522505e-06, + "loss": 0.367, + "step": 6810 + }, + { + "epoch": 0.48409135272301385, + "grad_norm": 2.9474809991081194, + "learning_rate": 5.160442993042738e-06, + "loss": 0.37, + "step": 6820 + }, + { + "epoch": 0.4848011640906429, + "grad_norm": 8.0892973566849, + "learning_rate": 5.153343745562971e-06, + "loss": 0.385, + "step": 6830 + }, + { + "epoch": 0.485510975458272, + "grad_norm": 5.46237208189901, + "learning_rate": 5.1462444980832034e-06, + "loss": 0.3723, + "step": 6840 + }, + { + "epoch": 0.48622078682590103, + "grad_norm": 4.813397707683654, + "learning_rate": 5.1391452506034365e-06, + "loss": 0.3847, + "step": 6850 + }, + { + "epoch": 0.4869305981935301, + "grad_norm": 3.839632822272105, + "learning_rate": 5.13204600312367e-06, + "loss": 0.3994, + "step": 6860 + }, + { + "epoch": 0.4876404095611591, + "grad_norm": 2.731217984269613, + "learning_rate": 5.124946755643903e-06, + "loss": 0.3928, + "step": 6870 + }, + { + "epoch": 0.48835022092878816, + "grad_norm": 7.062296596699752, + "learning_rate": 5.117847508164136e-06, + "loss": 0.4141, + "step": 6880 + }, + { + "epoch": 0.4890600322964172, + "grad_norm": 3.0471865890050034, + "learning_rate": 5.110748260684368e-06, + "loss": 0.3712, + "step": 6890 + }, + { + "epoch": 0.4897698436640463, + "grad_norm": 8.240874357274272, + "learning_rate": 5.103649013204601e-06, + "loss": 0.3828, + "step": 6900 + }, + { + "epoch": 0.49047965503167534, + "grad_norm": 4.557814239490917, + "learning_rate": 5.0965497657248334e-06, + "loss": 0.3794, + "step": 6910 + }, + { + "epoch": 0.4911894663993044, + "grad_norm": 6.50934729087624, + "learning_rate": 5.089450518245066e-06, + "loss": 0.3655, + "step": 6920 + }, + { + "epoch": 0.49189927776693343, + "grad_norm": 2.7892154452796696, + "learning_rate": 5.082351270765299e-06, + "loss": 0.3477, + "step": 6930 + }, + { + "epoch": 0.4926090891345625, + "grad_norm": 4.296820022815862, + "learning_rate": 5.075252023285532e-06, + "loss": 0.3917, + "step": 6940 + }, + { + "epoch": 0.4933189005021915, + "grad_norm": 3.7811542108069514, + "learning_rate": 5.068152775805765e-06, + "loss": 0.3846, + "step": 6950 + }, + { + "epoch": 0.4940287118698206, + "grad_norm": 12.150770506288081, + "learning_rate": 5.061053528325997e-06, + "loss": 0.3991, + "step": 6960 + }, + { + "epoch": 0.49473852323744966, + "grad_norm": 8.737862487013935, + "learning_rate": 5.05395428084623e-06, + "loss": 0.376, + "step": 6970 + }, + { + "epoch": 0.4954483346050787, + "grad_norm": 4.705086993153889, + "learning_rate": 5.0468550333664635e-06, + "loss": 0.3774, + "step": 6980 + }, + { + "epoch": 0.49615814597270774, + "grad_norm": 3.95177864719572, + "learning_rate": 5.0397557858866966e-06, + "loss": 0.3867, + "step": 6990 + }, + { + "epoch": 0.4968679573403368, + "grad_norm": 4.9228476674024995, + "learning_rate": 5.03265653840693e-06, + "loss": 0.3868, + "step": 7000 + }, + { + "epoch": 0.4975777687079659, + "grad_norm": 7.598944675436029, + "learning_rate": 5.025557290927162e-06, + "loss": 0.3791, + "step": 7010 + }, + { + "epoch": 0.4982875800755949, + "grad_norm": 3.948022335506646, + "learning_rate": 5.018458043447395e-06, + "loss": 0.3878, + "step": 7020 + }, + { + "epoch": 0.49899739144322397, + "grad_norm": 2.97600555704115, + "learning_rate": 5.011358795967628e-06, + "loss": 0.3891, + "step": 7030 + }, + { + "epoch": 0.499707202810853, + "grad_norm": 7.322058927387839, + "learning_rate": 5.004259548487861e-06, + "loss": 0.3739, + "step": 7040 + }, + { + "epoch": 0.5004170141784821, + "grad_norm": 4.054563164115399, + "learning_rate": 4.9971603010080935e-06, + "loss": 0.3654, + "step": 7050 + }, + { + "epoch": 0.5011268255461111, + "grad_norm": 6.433797069878189, + "learning_rate": 4.990061053528326e-06, + "loss": 0.3769, + "step": 7060 + }, + { + "epoch": 0.5018366369137401, + "grad_norm": 6.244381336548628, + "learning_rate": 4.982961806048559e-06, + "loss": 0.3698, + "step": 7070 + }, + { + "epoch": 0.5025464482813692, + "grad_norm": 4.649812061123292, + "learning_rate": 4.975862558568792e-06, + "loss": 0.3597, + "step": 7080 + }, + { + "epoch": 0.5032562596489982, + "grad_norm": 13.131635539716475, + "learning_rate": 4.968763311089025e-06, + "loss": 0.3737, + "step": 7090 + }, + { + "epoch": 0.5039660710166274, + "grad_norm": 11.654767208116397, + "learning_rate": 4.961664063609258e-06, + "loss": 0.3809, + "step": 7100 + }, + { + "epoch": 0.5046758823842564, + "grad_norm": 5.54405844933368, + "learning_rate": 4.95456481612949e-06, + "loss": 0.3668, + "step": 7110 + }, + { + "epoch": 0.5053856937518855, + "grad_norm": 17.63140898183613, + "learning_rate": 4.9474655686497235e-06, + "loss": 0.3751, + "step": 7120 + }, + { + "epoch": 0.5060955051195145, + "grad_norm": 4.735270750917372, + "learning_rate": 4.940366321169957e-06, + "loss": 0.3759, + "step": 7130 + }, + { + "epoch": 0.5068053164871436, + "grad_norm": 3.6005983980475214, + "learning_rate": 4.93326707369019e-06, + "loss": 0.3932, + "step": 7140 + }, + { + "epoch": 0.5075151278547726, + "grad_norm": 5.073652881259414, + "learning_rate": 4.926167826210422e-06, + "loss": 0.3689, + "step": 7150 + }, + { + "epoch": 0.5082249392224016, + "grad_norm": 6.515311066715168, + "learning_rate": 4.919068578730655e-06, + "loss": 0.3675, + "step": 7160 + }, + { + "epoch": 0.5089347505900307, + "grad_norm": 12.98913332417653, + "learning_rate": 4.911969331250887e-06, + "loss": 0.3861, + "step": 7170 + }, + { + "epoch": 0.5096445619576597, + "grad_norm": 5.1500756291258005, + "learning_rate": 4.90487008377112e-06, + "loss": 0.3731, + "step": 7180 + }, + { + "epoch": 0.5103543733252888, + "grad_norm": 5.833801547579832, + "learning_rate": 4.8977708362913535e-06, + "loss": 0.3831, + "step": 7190 + }, + { + "epoch": 0.5110641846929178, + "grad_norm": 8.343761477251691, + "learning_rate": 4.890671588811587e-06, + "loss": 0.3716, + "step": 7200 + }, + { + "epoch": 0.511773996060547, + "grad_norm": 6.740845613760958, + "learning_rate": 4.883572341331819e-06, + "loss": 0.377, + "step": 7210 + }, + { + "epoch": 0.512483807428176, + "grad_norm": 6.834960096187304, + "learning_rate": 4.876473093852052e-06, + "loss": 0.3774, + "step": 7220 + }, + { + "epoch": 0.513193618795805, + "grad_norm": 6.333904565562881, + "learning_rate": 4.869373846372285e-06, + "loss": 0.3786, + "step": 7230 + }, + { + "epoch": 0.5139034301634341, + "grad_norm": 7.380378873059882, + "learning_rate": 4.862274598892518e-06, + "loss": 0.3641, + "step": 7240 + }, + { + "epoch": 0.5146132415310631, + "grad_norm": 8.15711157363267, + "learning_rate": 4.855175351412751e-06, + "loss": 0.354, + "step": 7250 + }, + { + "epoch": 0.5153230528986922, + "grad_norm": 5.298194233144714, + "learning_rate": 4.8480761039329835e-06, + "loss": 0.3648, + "step": 7260 + }, + { + "epoch": 0.5160328642663212, + "grad_norm": 6.169565228174972, + "learning_rate": 4.840976856453216e-06, + "loss": 0.3606, + "step": 7270 + }, + { + "epoch": 0.5167426756339503, + "grad_norm": 4.633952354333419, + "learning_rate": 4.833877608973449e-06, + "loss": 0.3627, + "step": 7280 + }, + { + "epoch": 0.5174524870015793, + "grad_norm": 7.754370375548218, + "learning_rate": 4.826778361493682e-06, + "loss": 0.384, + "step": 7290 + }, + { + "epoch": 0.5181622983692084, + "grad_norm": 4.628647672477682, + "learning_rate": 4.819679114013915e-06, + "loss": 0.3717, + "step": 7300 + }, + { + "epoch": 0.5188721097368374, + "grad_norm": 4.6108119740619165, + "learning_rate": 4.812579866534147e-06, + "loss": 0.3531, + "step": 7310 + }, + { + "epoch": 0.5195819211044664, + "grad_norm": 3.777480319775288, + "learning_rate": 4.8054806190543805e-06, + "loss": 0.3735, + "step": 7320 + }, + { + "epoch": 0.5202917324720956, + "grad_norm": 6.455151414772601, + "learning_rate": 4.7983813715746136e-06, + "loss": 0.3845, + "step": 7330 + }, + { + "epoch": 0.5210015438397246, + "grad_norm": 5.0016880570007, + "learning_rate": 4.791282124094847e-06, + "loss": 0.3588, + "step": 7340 + }, + { + "epoch": 0.5217113552073537, + "grad_norm": 3.596195253014758, + "learning_rate": 4.78418287661508e-06, + "loss": 0.3664, + "step": 7350 + }, + { + "epoch": 0.5224211665749827, + "grad_norm": 4.6111563525428005, + "learning_rate": 4.777083629135312e-06, + "loss": 0.3815, + "step": 7360 + }, + { + "epoch": 0.5231309779426118, + "grad_norm": 3.81079107236397, + "learning_rate": 4.769984381655544e-06, + "loss": 0.3603, + "step": 7370 + }, + { + "epoch": 0.5238407893102408, + "grad_norm": 10.081677733455512, + "learning_rate": 4.762885134175777e-06, + "loss": 0.3748, + "step": 7380 + }, + { + "epoch": 0.5245506006778698, + "grad_norm": 4.011909680570432, + "learning_rate": 4.7557858866960105e-06, + "loss": 0.3736, + "step": 7390 + }, + { + "epoch": 0.5252604120454989, + "grad_norm": 4.008812937992125, + "learning_rate": 4.7486866392162436e-06, + "loss": 0.3718, + "step": 7400 + }, + { + "epoch": 0.5259702234131279, + "grad_norm": 8.895014071619777, + "learning_rate": 4.741587391736477e-06, + "loss": 0.3747, + "step": 7410 + }, + { + "epoch": 0.526680034780757, + "grad_norm": 2.5646865204368394, + "learning_rate": 4.734488144256709e-06, + "loss": 0.3593, + "step": 7420 + }, + { + "epoch": 0.527389846148386, + "grad_norm": 2.8583907278858147, + "learning_rate": 4.727388896776942e-06, + "loss": 0.387, + "step": 7430 + }, + { + "epoch": 0.5280996575160151, + "grad_norm": 2.8626323560816296, + "learning_rate": 4.720289649297175e-06, + "loss": 0.3756, + "step": 7440 + }, + { + "epoch": 0.5288094688836442, + "grad_norm": 7.38191434335366, + "learning_rate": 4.713190401817408e-06, + "loss": 0.3715, + "step": 7450 + }, + { + "epoch": 0.5295192802512733, + "grad_norm": 3.187699709665762, + "learning_rate": 4.7060911543376405e-06, + "loss": 0.3763, + "step": 7460 + }, + { + "epoch": 0.5302290916189023, + "grad_norm": 2.2423385405265366, + "learning_rate": 4.698991906857874e-06, + "loss": 0.367, + "step": 7470 + }, + { + "epoch": 0.5309389029865313, + "grad_norm": 3.5525056364166465, + "learning_rate": 4.691892659378106e-06, + "loss": 0.3639, + "step": 7480 + }, + { + "epoch": 0.5316487143541604, + "grad_norm": 2.5840538292895405, + "learning_rate": 4.684793411898339e-06, + "loss": 0.3713, + "step": 7490 + }, + { + "epoch": 0.5323585257217894, + "grad_norm": 3.6015272776951366, + "learning_rate": 4.677694164418572e-06, + "loss": 0.3672, + "step": 7500 + }, + { + "epoch": 0.5330683370894185, + "grad_norm": 2.958338857599813, + "learning_rate": 4.670594916938805e-06, + "loss": 0.364, + "step": 7510 + }, + { + "epoch": 0.5337781484570475, + "grad_norm": 2.6780802400700248, + "learning_rate": 4.663495669459037e-06, + "loss": 0.3871, + "step": 7520 + }, + { + "epoch": 0.5344879598246766, + "grad_norm": 2.141486624042336, + "learning_rate": 4.6563964219792705e-06, + "loss": 0.3918, + "step": 7530 + }, + { + "epoch": 0.5351977711923056, + "grad_norm": 11.627725180923038, + "learning_rate": 4.649297174499504e-06, + "loss": 0.369, + "step": 7540 + }, + { + "epoch": 0.5359075825599346, + "grad_norm": 2.164302320101156, + "learning_rate": 4.642197927019737e-06, + "loss": 0.3763, + "step": 7550 + }, + { + "epoch": 0.5366173939275638, + "grad_norm": 2.5355641201406716, + "learning_rate": 4.63509867953997e-06, + "loss": 0.3709, + "step": 7560 + }, + { + "epoch": 0.5373272052951928, + "grad_norm": 1.7486780225096559, + "learning_rate": 4.627999432060202e-06, + "loss": 0.3778, + "step": 7570 + }, + { + "epoch": 0.5380370166628219, + "grad_norm": 2.1996857828607066, + "learning_rate": 4.620900184580434e-06, + "loss": 0.3878, + "step": 7580 + }, + { + "epoch": 0.5387468280304509, + "grad_norm": 2.2718302971034325, + "learning_rate": 4.613800937100667e-06, + "loss": 0.3691, + "step": 7590 + }, + { + "epoch": 0.53945663939808, + "grad_norm": 2.247788269458988, + "learning_rate": 4.6067016896209005e-06, + "loss": 0.3764, + "step": 7600 + }, + { + "epoch": 0.540166450765709, + "grad_norm": 4.951241532022136, + "learning_rate": 4.599602442141134e-06, + "loss": 0.3696, + "step": 7610 + }, + { + "epoch": 0.5408762621333381, + "grad_norm": 18.87723312065313, + "learning_rate": 4.592503194661366e-06, + "loss": 0.3752, + "step": 7620 + }, + { + "epoch": 0.5415860735009671, + "grad_norm": 4.839150391451601, + "learning_rate": 4.585403947181599e-06, + "loss": 0.3704, + "step": 7630 + }, + { + "epoch": 0.5422958848685961, + "grad_norm": 3.252448644894675, + "learning_rate": 4.578304699701832e-06, + "loss": 0.3662, + "step": 7640 + }, + { + "epoch": 0.5430056962362252, + "grad_norm": 4.636061450249123, + "learning_rate": 4.571205452222065e-06, + "loss": 0.3695, + "step": 7650 + }, + { + "epoch": 0.5437155076038542, + "grad_norm": 2.217398025384477, + "learning_rate": 4.564106204742298e-06, + "loss": 0.381, + "step": 7660 + }, + { + "epoch": 0.5444253189714833, + "grad_norm": 3.2864797627789764, + "learning_rate": 4.5570069572625305e-06, + "loss": 0.3766, + "step": 7670 + }, + { + "epoch": 0.5451351303391124, + "grad_norm": 2.5595280528292346, + "learning_rate": 4.549907709782763e-06, + "loss": 0.3753, + "step": 7680 + }, + { + "epoch": 0.5458449417067415, + "grad_norm": 3.5869951931087356, + "learning_rate": 4.542808462302996e-06, + "loss": 0.3649, + "step": 7690 + }, + { + "epoch": 0.5465547530743705, + "grad_norm": 2.878804286325741, + "learning_rate": 4.535709214823229e-06, + "loss": 0.365, + "step": 7700 + }, + { + "epoch": 0.5472645644419996, + "grad_norm": 3.835428702840037, + "learning_rate": 4.528609967343462e-06, + "loss": 0.388, + "step": 7710 + }, + { + "epoch": 0.5479743758096286, + "grad_norm": 3.3115804743584225, + "learning_rate": 4.521510719863695e-06, + "loss": 0.3498, + "step": 7720 + }, + { + "epoch": 0.5486841871772576, + "grad_norm": 2.155325207710473, + "learning_rate": 4.5144114723839275e-06, + "loss": 0.3638, + "step": 7730 + }, + { + "epoch": 0.5493939985448867, + "grad_norm": 2.9473064158817506, + "learning_rate": 4.5073122249041606e-06, + "loss": 0.3756, + "step": 7740 + }, + { + "epoch": 0.5501038099125157, + "grad_norm": 3.977038197892431, + "learning_rate": 4.500212977424394e-06, + "loss": 0.3674, + "step": 7750 + }, + { + "epoch": 0.5508136212801448, + "grad_norm": 5.638630944163406, + "learning_rate": 4.493113729944627e-06, + "loss": 0.3528, + "step": 7760 + }, + { + "epoch": 0.5515234326477738, + "grad_norm": 2.8534926361264286, + "learning_rate": 4.486014482464859e-06, + "loss": 0.3697, + "step": 7770 + }, + { + "epoch": 0.5522332440154029, + "grad_norm": 6.069502646886042, + "learning_rate": 4.478915234985092e-06, + "loss": 0.3843, + "step": 7780 + }, + { + "epoch": 0.552943055383032, + "grad_norm": 4.343605351910854, + "learning_rate": 4.471815987505324e-06, + "loss": 0.3783, + "step": 7790 + }, + { + "epoch": 0.553652866750661, + "grad_norm": 2.104465858436518, + "learning_rate": 4.4647167400255575e-06, + "loss": 0.3601, + "step": 7800 + }, + { + "epoch": 0.5543626781182901, + "grad_norm": 3.0902122663518448, + "learning_rate": 4.457617492545791e-06, + "loss": 0.3801, + "step": 7810 + }, + { + "epoch": 0.5550724894859191, + "grad_norm": 4.573352955842933, + "learning_rate": 4.450518245066024e-06, + "loss": 0.3835, + "step": 7820 + }, + { + "epoch": 0.5557823008535482, + "grad_norm": 2.9707860507790924, + "learning_rate": 4.443418997586256e-06, + "loss": 0.3709, + "step": 7830 + }, + { + "epoch": 0.5564921122211772, + "grad_norm": 2.5687241689417806, + "learning_rate": 4.436319750106489e-06, + "loss": 0.3835, + "step": 7840 + }, + { + "epoch": 0.5572019235888063, + "grad_norm": 3.347322471582433, + "learning_rate": 4.429220502626722e-06, + "loss": 0.3735, + "step": 7850 + }, + { + "epoch": 0.5579117349564353, + "grad_norm": 6.431823861299619, + "learning_rate": 4.422121255146955e-06, + "loss": 0.3704, + "step": 7860 + }, + { + "epoch": 0.5586215463240644, + "grad_norm": 3.050115422109329, + "learning_rate": 4.4150220076671875e-06, + "loss": 0.3822, + "step": 7870 + }, + { + "epoch": 0.5593313576916934, + "grad_norm": 1.7811591664189523, + "learning_rate": 4.407922760187421e-06, + "loss": 0.3658, + "step": 7880 + }, + { + "epoch": 0.5600411690593224, + "grad_norm": 3.442846796158278, + "learning_rate": 4.400823512707653e-06, + "loss": 0.3621, + "step": 7890 + }, + { + "epoch": 0.5607509804269515, + "grad_norm": 7.2461896738177, + "learning_rate": 4.393724265227886e-06, + "loss": 0.3526, + "step": 7900 + }, + { + "epoch": 0.5614607917945806, + "grad_norm": 2.0219408065827875, + "learning_rate": 4.386625017748119e-06, + "loss": 0.3659, + "step": 7910 + }, + { + "epoch": 0.5621706031622097, + "grad_norm": 4.896944413168855, + "learning_rate": 4.379525770268352e-06, + "loss": 0.3765, + "step": 7920 + }, + { + "epoch": 0.5628804145298387, + "grad_norm": 2.1094695887191848, + "learning_rate": 4.372426522788584e-06, + "loss": 0.3644, + "step": 7930 + }, + { + "epoch": 0.5635902258974678, + "grad_norm": 5.596991296221292, + "learning_rate": 4.3653272753088175e-06, + "loss": 0.3835, + "step": 7940 + }, + { + "epoch": 0.5643000372650968, + "grad_norm": 2.373450501523087, + "learning_rate": 4.358228027829051e-06, + "loss": 0.3756, + "step": 7950 + }, + { + "epoch": 0.5650098486327259, + "grad_norm": 4.1947432157390026, + "learning_rate": 4.351128780349284e-06, + "loss": 0.3787, + "step": 7960 + }, + { + "epoch": 0.5657196600003549, + "grad_norm": 2.921985411820113, + "learning_rate": 4.344029532869517e-06, + "loss": 0.3746, + "step": 7970 + }, + { + "epoch": 0.5664294713679839, + "grad_norm": 13.63904398617421, + "learning_rate": 4.336930285389749e-06, + "loss": 0.3535, + "step": 7980 + }, + { + "epoch": 0.567139282735613, + "grad_norm": 2.6665592498045037, + "learning_rate": 4.329831037909981e-06, + "loss": 0.3668, + "step": 7990 + }, + { + "epoch": 0.567849094103242, + "grad_norm": 2.7866449972058795, + "learning_rate": 4.3227317904302144e-06, + "loss": 0.3747, + "step": 8000 + }, + { + "epoch": 0.5685589054708711, + "grad_norm": 2.795372211208224, + "learning_rate": 4.3156325429504475e-06, + "loss": 0.3737, + "step": 8010 + }, + { + "epoch": 0.5692687168385002, + "grad_norm": 2.829992387736084, + "learning_rate": 4.308533295470681e-06, + "loss": 0.3813, + "step": 8020 + }, + { + "epoch": 0.5699785282061293, + "grad_norm": 3.8835793195310706, + "learning_rate": 4.301434047990914e-06, + "loss": 0.3934, + "step": 8030 + }, + { + "epoch": 0.5706883395737583, + "grad_norm": 2.157944880021205, + "learning_rate": 4.294334800511146e-06, + "loss": 0.3619, + "step": 8040 + }, + { + "epoch": 0.5713981509413874, + "grad_norm": 2.576031100575868, + "learning_rate": 4.287235553031379e-06, + "loss": 0.3654, + "step": 8050 + }, + { + "epoch": 0.5721079623090164, + "grad_norm": 2.1013120962560445, + "learning_rate": 4.280136305551612e-06, + "loss": 0.3808, + "step": 8060 + }, + { + "epoch": 0.5728177736766454, + "grad_norm": 8.72915943640877, + "learning_rate": 4.273037058071845e-06, + "loss": 0.3865, + "step": 8070 + }, + { + "epoch": 0.5735275850442745, + "grad_norm": 3.1373379205439123, + "learning_rate": 4.2659378105920776e-06, + "loss": 0.3631, + "step": 8080 + }, + { + "epoch": 0.5742373964119035, + "grad_norm": 10.697527972561883, + "learning_rate": 4.258838563112311e-06, + "loss": 0.3597, + "step": 8090 + }, + { + "epoch": 0.5749472077795326, + "grad_norm": 3.6970932139238095, + "learning_rate": 4.251739315632543e-06, + "loss": 0.3635, + "step": 8100 + }, + { + "epoch": 0.5756570191471616, + "grad_norm": 2.4203467674630206, + "learning_rate": 4.244640068152776e-06, + "loss": 0.359, + "step": 8110 + }, + { + "epoch": 0.5763668305147907, + "grad_norm": 2.9395692807103035, + "learning_rate": 4.237540820673009e-06, + "loss": 0.3603, + "step": 8120 + }, + { + "epoch": 0.5770766418824197, + "grad_norm": 3.012599979258794, + "learning_rate": 4.230441573193242e-06, + "loss": 0.3568, + "step": 8130 + }, + { + "epoch": 0.5777864532500488, + "grad_norm": 6.667370402568531, + "learning_rate": 4.2233423257134745e-06, + "loss": 0.3629, + "step": 8140 + }, + { + "epoch": 0.5784962646176779, + "grad_norm": 4.471487834006219, + "learning_rate": 4.2162430782337076e-06, + "loss": 0.3683, + "step": 8150 + }, + { + "epoch": 0.5792060759853069, + "grad_norm": 3.599804032694662, + "learning_rate": 4.209143830753941e-06, + "loss": 0.3554, + "step": 8160 + }, + { + "epoch": 0.579915887352936, + "grad_norm": 2.9142466980850985, + "learning_rate": 4.202044583274174e-06, + "loss": 0.3524, + "step": 8170 + }, + { + "epoch": 0.580625698720565, + "grad_norm": 3.8569199714753295, + "learning_rate": 4.194945335794406e-06, + "loss": 0.3663, + "step": 8180 + }, + { + "epoch": 0.5813355100881941, + "grad_norm": 2.4068975949006077, + "learning_rate": 4.187846088314639e-06, + "loss": 0.3747, + "step": 8190 + }, + { + "epoch": 0.5820453214558231, + "grad_norm": 6.174322801188514, + "learning_rate": 4.180746840834871e-06, + "loss": 0.372, + "step": 8200 + }, + { + "epoch": 0.5827551328234521, + "grad_norm": 2.888969982284499, + "learning_rate": 4.1736475933551045e-06, + "loss": 0.361, + "step": 8210 + }, + { + "epoch": 0.5834649441910812, + "grad_norm": 4.910093339119916, + "learning_rate": 4.166548345875338e-06, + "loss": 0.3574, + "step": 8220 + }, + { + "epoch": 0.5841747555587102, + "grad_norm": 5.1058356496999755, + "learning_rate": 4.159449098395571e-06, + "loss": 0.3786, + "step": 8230 + }, + { + "epoch": 0.5848845669263393, + "grad_norm": 14.081326767892058, + "learning_rate": 4.152349850915803e-06, + "loss": 0.3729, + "step": 8240 + }, + { + "epoch": 0.5855943782939684, + "grad_norm": 4.958684438886047, + "learning_rate": 4.145250603436036e-06, + "loss": 0.3566, + "step": 8250 + }, + { + "epoch": 0.5863041896615975, + "grad_norm": 3.9438637049329075, + "learning_rate": 4.138151355956269e-06, + "loss": 0.3861, + "step": 8260 + }, + { + "epoch": 0.5870140010292265, + "grad_norm": 2.9499712942928107, + "learning_rate": 4.131052108476502e-06, + "loss": 0.3439, + "step": 8270 + }, + { + "epoch": 0.5877238123968556, + "grad_norm": 3.332966504823502, + "learning_rate": 4.1239528609967345e-06, + "loss": 0.3788, + "step": 8280 + }, + { + "epoch": 0.5884336237644846, + "grad_norm": 27.970854056782667, + "learning_rate": 4.116853613516968e-06, + "loss": 0.3591, + "step": 8290 + }, + { + "epoch": 0.5891434351321136, + "grad_norm": 4.487327484061174, + "learning_rate": 4.1097543660372e-06, + "loss": 0.3625, + "step": 8300 + }, + { + "epoch": 0.5898532464997427, + "grad_norm": 3.8006981727665496, + "learning_rate": 4.102655118557433e-06, + "loss": 0.3709, + "step": 8310 + }, + { + "epoch": 0.5905630578673717, + "grad_norm": 3.463457513521014, + "learning_rate": 4.095555871077666e-06, + "loss": 0.3641, + "step": 8320 + }, + { + "epoch": 0.5912728692350008, + "grad_norm": 7.640707242523127, + "learning_rate": 4.088456623597899e-06, + "loss": 0.3648, + "step": 8330 + }, + { + "epoch": 0.5919826806026298, + "grad_norm": 2.8614936603096295, + "learning_rate": 4.081357376118132e-06, + "loss": 0.3616, + "step": 8340 + }, + { + "epoch": 0.5926924919702589, + "grad_norm": 3.296737746561609, + "learning_rate": 4.0742581286383645e-06, + "loss": 0.3808, + "step": 8350 + }, + { + "epoch": 0.5934023033378879, + "grad_norm": 3.2426352432246976, + "learning_rate": 4.067158881158598e-06, + "loss": 0.3583, + "step": 8360 + }, + { + "epoch": 0.5941121147055171, + "grad_norm": 3.4522007032736806, + "learning_rate": 4.060059633678831e-06, + "loss": 0.365, + "step": 8370 + }, + { + "epoch": 0.5948219260731461, + "grad_norm": 3.9166457660699145, + "learning_rate": 4.052960386199063e-06, + "loss": 0.3692, + "step": 8380 + }, + { + "epoch": 0.5955317374407751, + "grad_norm": 2.9039677495535874, + "learning_rate": 4.045861138719296e-06, + "loss": 0.3468, + "step": 8390 + }, + { + "epoch": 0.5962415488084042, + "grad_norm": 3.187977468656372, + "learning_rate": 4.038761891239529e-06, + "loss": 0.359, + "step": 8400 + }, + { + "epoch": 0.5969513601760332, + "grad_norm": 4.529576318117622, + "learning_rate": 4.0316626437597614e-06, + "loss": 0.3452, + "step": 8410 + }, + { + "epoch": 0.5976611715436623, + "grad_norm": 6.601726345536697, + "learning_rate": 4.0245633962799945e-06, + "loss": 0.3713, + "step": 8420 + }, + { + "epoch": 0.5983709829112913, + "grad_norm": 2.4278158486667576, + "learning_rate": 4.017464148800228e-06, + "loss": 0.3628, + "step": 8430 + }, + { + "epoch": 0.5990807942789204, + "grad_norm": 2.76630569189727, + "learning_rate": 4.010364901320461e-06, + "loss": 0.3704, + "step": 8440 + }, + { + "epoch": 0.5997906056465494, + "grad_norm": 6.7843620715556545, + "learning_rate": 4.003265653840693e-06, + "loss": 0.3682, + "step": 8450 + }, + { + "epoch": 0.6005004170141784, + "grad_norm": 2.9403338895288336, + "learning_rate": 3.996166406360926e-06, + "loss": 0.3608, + "step": 8460 + }, + { + "epoch": 0.6012102283818075, + "grad_norm": 4.301178222098619, + "learning_rate": 3.989067158881159e-06, + "loss": 0.3595, + "step": 8470 + }, + { + "epoch": 0.6019200397494366, + "grad_norm": 3.0914199152912696, + "learning_rate": 3.981967911401392e-06, + "loss": 0.3718, + "step": 8480 + }, + { + "epoch": 0.6026298511170657, + "grad_norm": 2.753384437967004, + "learning_rate": 3.9748686639216246e-06, + "loss": 0.3672, + "step": 8490 + }, + { + "epoch": 0.6033396624846947, + "grad_norm": 2.576321546323924, + "learning_rate": 3.967769416441858e-06, + "loss": 0.3706, + "step": 8500 + }, + { + "epoch": 0.6040494738523238, + "grad_norm": 2.617904283815147, + "learning_rate": 3.96067016896209e-06, + "loss": 0.3539, + "step": 8510 + }, + { + "epoch": 0.6047592852199528, + "grad_norm": 4.862875127190094, + "learning_rate": 3.953570921482323e-06, + "loss": 0.3763, + "step": 8520 + }, + { + "epoch": 0.6054690965875819, + "grad_norm": 4.741023889550647, + "learning_rate": 3.946471674002556e-06, + "loss": 0.3611, + "step": 8530 + }, + { + "epoch": 0.6061789079552109, + "grad_norm": 6.394478684199079, + "learning_rate": 3.939372426522789e-06, + "loss": 0.3615, + "step": 8540 + }, + { + "epoch": 0.6068887193228399, + "grad_norm": 4.045100357410319, + "learning_rate": 3.9322731790430215e-06, + "loss": 0.3648, + "step": 8550 + }, + { + "epoch": 0.607598530690469, + "grad_norm": 3.756852697194425, + "learning_rate": 3.925173931563255e-06, + "loss": 0.3689, + "step": 8560 + }, + { + "epoch": 0.608308342058098, + "grad_norm": 4.04897373953826, + "learning_rate": 3.918074684083488e-06, + "loss": 0.3644, + "step": 8570 + }, + { + "epoch": 0.6090181534257271, + "grad_norm": 4.036663207362448, + "learning_rate": 3.910975436603721e-06, + "loss": 0.366, + "step": 8580 + }, + { + "epoch": 0.6097279647933561, + "grad_norm": 4.156260594948616, + "learning_rate": 3.903876189123953e-06, + "loss": 0.3554, + "step": 8590 + }, + { + "epoch": 0.6104377761609853, + "grad_norm": 3.398605568980307, + "learning_rate": 3.896776941644186e-06, + "loss": 0.3717, + "step": 8600 + }, + { + "epoch": 0.6111475875286143, + "grad_norm": 3.5114677948249065, + "learning_rate": 3.889677694164418e-06, + "loss": 0.3677, + "step": 8610 + }, + { + "epoch": 0.6118573988962434, + "grad_norm": 4.753605099187553, + "learning_rate": 3.8825784466846515e-06, + "loss": 0.3547, + "step": 8620 + }, + { + "epoch": 0.6125672102638724, + "grad_norm": 3.4243729659259334, + "learning_rate": 3.875479199204885e-06, + "loss": 0.3762, + "step": 8630 + }, + { + "epoch": 0.6132770216315014, + "grad_norm": 5.94912381861312, + "learning_rate": 3.868379951725118e-06, + "loss": 0.359, + "step": 8640 + }, + { + "epoch": 0.6139868329991305, + "grad_norm": 6.590267176028699, + "learning_rate": 3.861280704245351e-06, + "loss": 0.3758, + "step": 8650 + }, + { + "epoch": 0.6146966443667595, + "grad_norm": 3.3256854782540497, + "learning_rate": 3.854181456765583e-06, + "loss": 0.3562, + "step": 8660 + }, + { + "epoch": 0.6154064557343886, + "grad_norm": 3.6453120360212816, + "learning_rate": 3.847082209285816e-06, + "loss": 0.3619, + "step": 8670 + }, + { + "epoch": 0.6161162671020176, + "grad_norm": 13.965716037023453, + "learning_rate": 3.839982961806049e-06, + "loss": 0.3646, + "step": 8680 + }, + { + "epoch": 0.6168260784696467, + "grad_norm": 7.837860273774759, + "learning_rate": 3.8328837143262815e-06, + "loss": 0.3457, + "step": 8690 + }, + { + "epoch": 0.6175358898372757, + "grad_norm": 4.729547574214101, + "learning_rate": 3.825784466846515e-06, + "loss": 0.3565, + "step": 8700 + }, + { + "epoch": 0.6182457012049049, + "grad_norm": 2.5619385732076987, + "learning_rate": 3.818685219366748e-06, + "loss": 0.3676, + "step": 8710 + }, + { + "epoch": 0.6189555125725339, + "grad_norm": 6.790019325573497, + "learning_rate": 3.8115859718869804e-06, + "loss": 0.3646, + "step": 8720 + }, + { + "epoch": 0.6196653239401629, + "grad_norm": 3.3195434105048665, + "learning_rate": 3.804486724407213e-06, + "loss": 0.3575, + "step": 8730 + }, + { + "epoch": 0.620375135307792, + "grad_norm": 3.805294873305076, + "learning_rate": 3.797387476927446e-06, + "loss": 0.3657, + "step": 8740 + }, + { + "epoch": 0.621084946675421, + "grad_norm": 5.59682650769057, + "learning_rate": 3.790288229447679e-06, + "loss": 0.3609, + "step": 8750 + }, + { + "epoch": 0.6217947580430501, + "grad_norm": 4.89958212672841, + "learning_rate": 3.783188981967912e-06, + "loss": 0.3669, + "step": 8760 + }, + { + "epoch": 0.6225045694106791, + "grad_norm": 8.274929479843232, + "learning_rate": 3.7760897344881446e-06, + "loss": 0.3581, + "step": 8770 + }, + { + "epoch": 0.6232143807783082, + "grad_norm": 3.2978821299433445, + "learning_rate": 3.7689904870083777e-06, + "loss": 0.3679, + "step": 8780 + }, + { + "epoch": 0.6239241921459372, + "grad_norm": 12.435473632592815, + "learning_rate": 3.76189123952861e-06, + "loss": 0.3677, + "step": 8790 + }, + { + "epoch": 0.6246340035135662, + "grad_norm": 4.195421567773733, + "learning_rate": 3.754791992048843e-06, + "loss": 0.3492, + "step": 8800 + }, + { + "epoch": 0.6253438148811953, + "grad_norm": 4.406904963403177, + "learning_rate": 3.7476927445690758e-06, + "loss": 0.3597, + "step": 8810 + }, + { + "epoch": 0.6260536262488243, + "grad_norm": 4.199730218503971, + "learning_rate": 3.740593497089309e-06, + "loss": 0.3797, + "step": 8820 + }, + { + "epoch": 0.6267634376164535, + "grad_norm": 3.3446382282646705, + "learning_rate": 3.7334942496095415e-06, + "loss": 0.3638, + "step": 8830 + }, + { + "epoch": 0.6274732489840825, + "grad_norm": 4.862585068251522, + "learning_rate": 3.7263950021297747e-06, + "loss": 0.3573, + "step": 8840 + }, + { + "epoch": 0.6281830603517116, + "grad_norm": 8.107090011887513, + "learning_rate": 3.7192957546500073e-06, + "loss": 0.3672, + "step": 8850 + }, + { + "epoch": 0.6288928717193406, + "grad_norm": 4.3962651782052005, + "learning_rate": 3.7121965071702404e-06, + "loss": 0.3412, + "step": 8860 + }, + { + "epoch": 0.6296026830869697, + "grad_norm": 4.6424143973536935, + "learning_rate": 3.705097259690473e-06, + "loss": 0.3667, + "step": 8870 + }, + { + "epoch": 0.6303124944545987, + "grad_norm": 3.840268427443435, + "learning_rate": 3.697998012210706e-06, + "loss": 0.3557, + "step": 8880 + }, + { + "epoch": 0.6310223058222277, + "grad_norm": 3.6388205049600018, + "learning_rate": 3.6908987647309385e-06, + "loss": 0.3631, + "step": 8890 + }, + { + "epoch": 0.6317321171898568, + "grad_norm": 5.233530712843461, + "learning_rate": 3.6837995172511716e-06, + "loss": 0.3648, + "step": 8900 + }, + { + "epoch": 0.6324419285574858, + "grad_norm": 3.781452701492992, + "learning_rate": 3.6767002697714042e-06, + "loss": 0.3788, + "step": 8910 + }, + { + "epoch": 0.6331517399251149, + "grad_norm": 6.068345043524154, + "learning_rate": 3.6696010222916373e-06, + "loss": 0.3566, + "step": 8920 + }, + { + "epoch": 0.6338615512927439, + "grad_norm": 5.599734595118006, + "learning_rate": 3.66250177481187e-06, + "loss": 0.349, + "step": 8930 + }, + { + "epoch": 0.634571362660373, + "grad_norm": 10.428150341049763, + "learning_rate": 3.655402527332103e-06, + "loss": 0.3584, + "step": 8940 + }, + { + "epoch": 0.6352811740280021, + "grad_norm": 17.681698800577582, + "learning_rate": 3.648303279852336e-06, + "loss": 0.3458, + "step": 8950 + }, + { + "epoch": 0.6359909853956311, + "grad_norm": 6.591627899287575, + "learning_rate": 3.641204032372569e-06, + "loss": 0.3643, + "step": 8960 + }, + { + "epoch": 0.6367007967632602, + "grad_norm": 31.04186356298661, + "learning_rate": 3.634104784892802e-06, + "loss": 0.3577, + "step": 8970 + }, + { + "epoch": 0.6374106081308892, + "grad_norm": 8.824274787999325, + "learning_rate": 3.6270055374130347e-06, + "loss": 0.3618, + "step": 8980 + }, + { + "epoch": 0.6381204194985183, + "grad_norm": 4.7185603252655826, + "learning_rate": 3.619906289933267e-06, + "loss": 0.3598, + "step": 8990 + }, + { + "epoch": 0.6388302308661473, + "grad_norm": 5.394376788444082, + "learning_rate": 3.6128070424535e-06, + "loss": 0.362, + "step": 9000 + }, + { + "epoch": 0.6395400422337764, + "grad_norm": 7.158347387403476, + "learning_rate": 3.6057077949737327e-06, + "loss": 0.3694, + "step": 9010 + }, + { + "epoch": 0.6402498536014054, + "grad_norm": 8.033101525768098, + "learning_rate": 3.598608547493966e-06, + "loss": 0.3626, + "step": 9020 + }, + { + "epoch": 0.6409596649690344, + "grad_norm": 2.7105647455701667, + "learning_rate": 3.591509300014199e-06, + "loss": 0.3462, + "step": 9030 + }, + { + "epoch": 0.6416694763366635, + "grad_norm": 6.3548259889750955, + "learning_rate": 3.5844100525344316e-06, + "loss": 0.3632, + "step": 9040 + }, + { + "epoch": 0.6423792877042925, + "grad_norm": 7.341190059846113, + "learning_rate": 3.5773108050546647e-06, + "loss": 0.3653, + "step": 9050 + }, + { + "epoch": 0.6430890990719217, + "grad_norm": 3.8869033025489723, + "learning_rate": 3.5702115575748974e-06, + "loss": 0.3412, + "step": 9060 + }, + { + "epoch": 0.6437989104395507, + "grad_norm": 4.918908181105817, + "learning_rate": 3.5631123100951305e-06, + "loss": 0.3616, + "step": 9070 + }, + { + "epoch": 0.6445087218071798, + "grad_norm": 6.124064792410853, + "learning_rate": 3.556013062615363e-06, + "loss": 0.3585, + "step": 9080 + }, + { + "epoch": 0.6452185331748088, + "grad_norm": 3.6806357015000764, + "learning_rate": 3.5489138151355963e-06, + "loss": 0.3668, + "step": 9090 + }, + { + "epoch": 0.6459283445424379, + "grad_norm": 5.193254667513745, + "learning_rate": 3.5418145676558285e-06, + "loss": 0.3669, + "step": 9100 + }, + { + "epoch": 0.6466381559100669, + "grad_norm": 10.978524486328482, + "learning_rate": 3.5347153201760616e-06, + "loss": 0.3597, + "step": 9110 + }, + { + "epoch": 0.647347967277696, + "grad_norm": 4.6611361687349175, + "learning_rate": 3.5276160726962943e-06, + "loss": 0.3695, + "step": 9120 + }, + { + "epoch": 0.648057778645325, + "grad_norm": 5.205492428214056, + "learning_rate": 3.5205168252165274e-06, + "loss": 0.3663, + "step": 9130 + }, + { + "epoch": 0.648767590012954, + "grad_norm": 5.139991204646184, + "learning_rate": 3.51341757773676e-06, + "loss": 0.3551, + "step": 9140 + }, + { + "epoch": 0.6494774013805831, + "grad_norm": 16.35255401640736, + "learning_rate": 3.506318330256993e-06, + "loss": 0.3553, + "step": 9150 + }, + { + "epoch": 0.6501872127482121, + "grad_norm": 10.145378264655722, + "learning_rate": 3.499219082777226e-06, + "loss": 0.3583, + "step": 9160 + }, + { + "epoch": 0.6508970241158412, + "grad_norm": 24.878144093372033, + "learning_rate": 3.492119835297459e-06, + "loss": 0.3555, + "step": 9170 + }, + { + "epoch": 0.6516068354834703, + "grad_norm": 3.902743241561423, + "learning_rate": 3.4850205878176916e-06, + "loss": 0.3723, + "step": 9180 + }, + { + "epoch": 0.6523166468510994, + "grad_norm": 4.458085439514939, + "learning_rate": 3.4779213403379247e-06, + "loss": 0.3701, + "step": 9190 + }, + { + "epoch": 0.6530264582187284, + "grad_norm": 4.717552266761064, + "learning_rate": 3.470822092858157e-06, + "loss": 0.3618, + "step": 9200 + }, + { + "epoch": 0.6537362695863574, + "grad_norm": 4.427364622798698, + "learning_rate": 3.46372284537839e-06, + "loss": 0.3614, + "step": 9210 + }, + { + "epoch": 0.6544460809539865, + "grad_norm": 8.323851654330221, + "learning_rate": 3.4566235978986228e-06, + "loss": 0.3678, + "step": 9220 + }, + { + "epoch": 0.6551558923216155, + "grad_norm": 4.966094347637934, + "learning_rate": 3.449524350418856e-06, + "loss": 0.3688, + "step": 9230 + }, + { + "epoch": 0.6558657036892446, + "grad_norm": 4.930577227679058, + "learning_rate": 3.4424251029390886e-06, + "loss": 0.3503, + "step": 9240 + }, + { + "epoch": 0.6565755150568736, + "grad_norm": 5.52399635730182, + "learning_rate": 3.4353258554593217e-06, + "loss": 0.3696, + "step": 9250 + }, + { + "epoch": 0.6572853264245027, + "grad_norm": 4.590670373221129, + "learning_rate": 3.4282266079795543e-06, + "loss": 0.3685, + "step": 9260 + }, + { + "epoch": 0.6579951377921317, + "grad_norm": 8.264828163926657, + "learning_rate": 3.4211273604997874e-06, + "loss": 0.3575, + "step": 9270 + }, + { + "epoch": 0.6587049491597607, + "grad_norm": 8.133262914973033, + "learning_rate": 3.4140281130200205e-06, + "loss": 0.3713, + "step": 9280 + }, + { + "epoch": 0.6594147605273899, + "grad_norm": 5.742760195932282, + "learning_rate": 3.4069288655402532e-06, + "loss": 0.3725, + "step": 9290 + }, + { + "epoch": 0.6601245718950189, + "grad_norm": 8.53035579823295, + "learning_rate": 3.3998296180604855e-06, + "loss": 0.3599, + "step": 9300 + }, + { + "epoch": 0.660834383262648, + "grad_norm": 4.142002947123207, + "learning_rate": 3.3927303705807186e-06, + "loss": 0.3661, + "step": 9310 + }, + { + "epoch": 0.661544194630277, + "grad_norm": 6.246166093324293, + "learning_rate": 3.3856311231009513e-06, + "loss": 0.351, + "step": 9320 + }, + { + "epoch": 0.6622540059979061, + "grad_norm": 16.243950855343193, + "learning_rate": 3.3785318756211844e-06, + "loss": 0.3479, + "step": 9330 + }, + { + "epoch": 0.6629638173655351, + "grad_norm": 6.147144910165458, + "learning_rate": 3.3714326281414175e-06, + "loss": 0.3543, + "step": 9340 + }, + { + "epoch": 0.6636736287331642, + "grad_norm": 4.099934401177817, + "learning_rate": 3.36433338066165e-06, + "loss": 0.3636, + "step": 9350 + }, + { + "epoch": 0.6643834401007932, + "grad_norm": 4.17019707869721, + "learning_rate": 3.3572341331818832e-06, + "loss": 0.351, + "step": 9360 + }, + { + "epoch": 0.6650932514684222, + "grad_norm": 4.102146778496878, + "learning_rate": 3.350134885702116e-06, + "loss": 0.3737, + "step": 9370 + }, + { + "epoch": 0.6658030628360513, + "grad_norm": 4.155164161456904, + "learning_rate": 3.343035638222349e-06, + "loss": 0.3505, + "step": 9380 + }, + { + "epoch": 0.6665128742036803, + "grad_norm": 4.042739251178277, + "learning_rate": 3.3359363907425817e-06, + "loss": 0.3578, + "step": 9390 + }, + { + "epoch": 0.6672226855713094, + "grad_norm": 3.4724621327513057, + "learning_rate": 3.328837143262814e-06, + "loss": 0.3733, + "step": 9400 + }, + { + "epoch": 0.6679324969389385, + "grad_norm": 3.284294254497063, + "learning_rate": 3.321737895783047e-06, + "loss": 0.361, + "step": 9410 + }, + { + "epoch": 0.6686423083065676, + "grad_norm": 5.224665667041366, + "learning_rate": 3.31463864830328e-06, + "loss": 0.3597, + "step": 9420 + }, + { + "epoch": 0.6693521196741966, + "grad_norm": 13.317891191179472, + "learning_rate": 3.307539400823513e-06, + "loss": 0.36, + "step": 9430 + }, + { + "epoch": 0.6700619310418257, + "grad_norm": 8.338179465785696, + "learning_rate": 3.300440153343746e-06, + "loss": 0.3708, + "step": 9440 + }, + { + "epoch": 0.6707717424094547, + "grad_norm": 4.022884248031831, + "learning_rate": 3.2933409058639786e-06, + "loss": 0.357, + "step": 9450 + }, + { + "epoch": 0.6714815537770837, + "grad_norm": 2.816929350582557, + "learning_rate": 3.2862416583842117e-06, + "loss": 0.3618, + "step": 9460 + }, + { + "epoch": 0.6721913651447128, + "grad_norm": 3.2609706893982278, + "learning_rate": 3.2791424109044444e-06, + "loss": 0.3566, + "step": 9470 + }, + { + "epoch": 0.6729011765123418, + "grad_norm": 2.0212043627509177, + "learning_rate": 3.2720431634246775e-06, + "loss": 0.3631, + "step": 9480 + }, + { + "epoch": 0.6736109878799709, + "grad_norm": 3.472359881135022, + "learning_rate": 3.26494391594491e-06, + "loss": 0.3465, + "step": 9490 + }, + { + "epoch": 0.6743207992475999, + "grad_norm": 2.365708920981696, + "learning_rate": 3.257844668465143e-06, + "loss": 0.36, + "step": 9500 + }, + { + "epoch": 0.675030610615229, + "grad_norm": 6.47059083775482, + "learning_rate": 3.2507454209853755e-06, + "loss": 0.3589, + "step": 9510 + }, + { + "epoch": 0.6757404219828581, + "grad_norm": 2.9761715896390872, + "learning_rate": 3.2436461735056086e-06, + "loss": 0.3737, + "step": 9520 + }, + { + "epoch": 0.6764502333504872, + "grad_norm": 3.2920710102385375, + "learning_rate": 3.2365469260258413e-06, + "loss": 0.3631, + "step": 9530 + }, + { + "epoch": 0.6771600447181162, + "grad_norm": 2.24517655258034, + "learning_rate": 3.2294476785460744e-06, + "loss": 0.3565, + "step": 9540 + }, + { + "epoch": 0.6778698560857452, + "grad_norm": 4.585199424065417, + "learning_rate": 3.222348431066307e-06, + "loss": 0.3587, + "step": 9550 + }, + { + "epoch": 0.6785796674533743, + "grad_norm": 2.616245813772314, + "learning_rate": 3.21524918358654e-06, + "loss": 0.3641, + "step": 9560 + }, + { + "epoch": 0.6792894788210033, + "grad_norm": 6.790868775160296, + "learning_rate": 3.208149936106773e-06, + "loss": 0.3542, + "step": 9570 + }, + { + "epoch": 0.6799992901886324, + "grad_norm": 4.6720875235574955, + "learning_rate": 3.201050688627006e-06, + "loss": 0.3724, + "step": 9580 + }, + { + "epoch": 0.6807091015562614, + "grad_norm": 2.929891653919803, + "learning_rate": 3.193951441147239e-06, + "loss": 0.355, + "step": 9590 + }, + { + "epoch": 0.6814189129238905, + "grad_norm": 2.5935885874594935, + "learning_rate": 3.1868521936674717e-06, + "loss": 0.3477, + "step": 9600 + }, + { + "epoch": 0.6821287242915195, + "grad_norm": 4.16743323358689, + "learning_rate": 3.179752946187704e-06, + "loss": 0.3732, + "step": 9610 + }, + { + "epoch": 0.6828385356591485, + "grad_norm": 3.119963047712144, + "learning_rate": 3.172653698707937e-06, + "loss": 0.3583, + "step": 9620 + }, + { + "epoch": 0.6835483470267776, + "grad_norm": 4.025619816942283, + "learning_rate": 3.1655544512281698e-06, + "loss": 0.3814, + "step": 9630 + }, + { + "epoch": 0.6842581583944067, + "grad_norm": 10.60216606667068, + "learning_rate": 3.158455203748403e-06, + "loss": 0.3599, + "step": 9640 + }, + { + "epoch": 0.6849679697620358, + "grad_norm": 4.461108822226996, + "learning_rate": 3.1513559562686356e-06, + "loss": 0.3619, + "step": 9650 + }, + { + "epoch": 0.6856777811296648, + "grad_norm": 2.7381838956818596, + "learning_rate": 3.1442567087888687e-06, + "loss": 0.361, + "step": 9660 + }, + { + "epoch": 0.6863875924972939, + "grad_norm": 3.3932603213636536, + "learning_rate": 3.1371574613091018e-06, + "loss": 0.3722, + "step": 9670 + }, + { + "epoch": 0.6870974038649229, + "grad_norm": 3.0238463961256556, + "learning_rate": 3.1300582138293344e-06, + "loss": 0.3677, + "step": 9680 + }, + { + "epoch": 0.687807215232552, + "grad_norm": 2.9020326019536236, + "learning_rate": 3.1229589663495675e-06, + "loss": 0.3587, + "step": 9690 + }, + { + "epoch": 0.688517026600181, + "grad_norm": 3.4182793620767313, + "learning_rate": 3.1158597188698002e-06, + "loss": 0.3958, + "step": 9700 + }, + { + "epoch": 0.68922683796781, + "grad_norm": 2.7346693208831123, + "learning_rate": 3.1087604713900325e-06, + "loss": 0.3746, + "step": 9710 + }, + { + "epoch": 0.6899366493354391, + "grad_norm": 2.7001110030197184, + "learning_rate": 3.1016612239102656e-06, + "loss": 0.3596, + "step": 9720 + }, + { + "epoch": 0.6906464607030681, + "grad_norm": 3.8786526590857706, + "learning_rate": 3.0945619764304987e-06, + "loss": 0.3677, + "step": 9730 + }, + { + "epoch": 0.6913562720706972, + "grad_norm": 3.601819125137747, + "learning_rate": 3.0874627289507314e-06, + "loss": 0.3599, + "step": 9740 + }, + { + "epoch": 0.6920660834383263, + "grad_norm": 4.257577712986774, + "learning_rate": 3.0803634814709645e-06, + "loss": 0.3653, + "step": 9750 + }, + { + "epoch": 0.6927758948059554, + "grad_norm": 16.2562479732823, + "learning_rate": 3.073264233991197e-06, + "loss": 0.3786, + "step": 9760 + }, + { + "epoch": 0.6934857061735844, + "grad_norm": 2.8308341290836037, + "learning_rate": 3.0661649865114302e-06, + "loss": 0.347, + "step": 9770 + }, + { + "epoch": 0.6941955175412134, + "grad_norm": 2.386467475595729, + "learning_rate": 3.059065739031663e-06, + "loss": 0.3785, + "step": 9780 + }, + { + "epoch": 0.6949053289088425, + "grad_norm": 3.11594441686047, + "learning_rate": 3.051966491551896e-06, + "loss": 0.3613, + "step": 9790 + }, + { + "epoch": 0.6956151402764715, + "grad_norm": 3.4457140851193677, + "learning_rate": 3.0448672440721287e-06, + "loss": 0.3592, + "step": 9800 + }, + { + "epoch": 0.6963249516441006, + "grad_norm": 6.7733834909511135, + "learning_rate": 3.0377679965923614e-06, + "loss": 0.3503, + "step": 9810 + }, + { + "epoch": 0.6970347630117296, + "grad_norm": 2.552293405448118, + "learning_rate": 3.030668749112594e-06, + "loss": 0.3565, + "step": 9820 + }, + { + "epoch": 0.6977445743793587, + "grad_norm": 7.3573968999972985, + "learning_rate": 3.023569501632827e-06, + "loss": 0.3534, + "step": 9830 + }, + { + "epoch": 0.6984543857469877, + "grad_norm": 2.2835556419626286, + "learning_rate": 3.01647025415306e-06, + "loss": 0.3627, + "step": 9840 + }, + { + "epoch": 0.6991641971146167, + "grad_norm": 4.158935806681915, + "learning_rate": 3.009371006673293e-06, + "loss": 0.3676, + "step": 9850 + }, + { + "epoch": 0.6998740084822458, + "grad_norm": 3.444386024390724, + "learning_rate": 3.0022717591935256e-06, + "loss": 0.3498, + "step": 9860 + }, + { + "epoch": 0.700583819849875, + "grad_norm": 76.68033690471103, + "learning_rate": 2.9951725117137587e-06, + "loss": 0.3465, + "step": 9870 + }, + { + "epoch": 0.701293631217504, + "grad_norm": 2.753848553217651, + "learning_rate": 2.9880732642339914e-06, + "loss": 0.3579, + "step": 9880 + }, + { + "epoch": 0.702003442585133, + "grad_norm": 6.8770901385155465, + "learning_rate": 2.9809740167542245e-06, + "loss": 0.3644, + "step": 9890 + }, + { + "epoch": 0.7027132539527621, + "grad_norm": 8.050770443325867, + "learning_rate": 2.9738747692744576e-06, + "loss": 0.3534, + "step": 9900 + }, + { + "epoch": 0.7034230653203911, + "grad_norm": 6.2381173840397794, + "learning_rate": 2.96677552179469e-06, + "loss": 0.3799, + "step": 9910 + }, + { + "epoch": 0.7041328766880202, + "grad_norm": 2.527197221067041, + "learning_rate": 2.9596762743149225e-06, + "loss": 0.3702, + "step": 9920 + }, + { + "epoch": 0.7048426880556492, + "grad_norm": 3.365675129758323, + "learning_rate": 2.9525770268351556e-06, + "loss": 0.3618, + "step": 9930 + }, + { + "epoch": 0.7055524994232782, + "grad_norm": 3.7307831294643323, + "learning_rate": 2.9454777793553883e-06, + "loss": 0.3552, + "step": 9940 + }, + { + "epoch": 0.7062623107909073, + "grad_norm": 10.13055799757591, + "learning_rate": 2.9383785318756214e-06, + "loss": 0.369, + "step": 9950 + }, + { + "epoch": 0.7069721221585363, + "grad_norm": 3.79159989826404, + "learning_rate": 2.931279284395854e-06, + "loss": 0.3393, + "step": 9960 + }, + { + "epoch": 0.7076819335261654, + "grad_norm": 11.361319554472407, + "learning_rate": 2.924180036916087e-06, + "loss": 0.3726, + "step": 9970 + }, + { + "epoch": 0.7083917448937945, + "grad_norm": 2.2727709813242, + "learning_rate": 2.9170807894363203e-06, + "loss": 0.3558, + "step": 9980 + }, + { + "epoch": 0.7091015562614236, + "grad_norm": 13.54783288221351, + "learning_rate": 2.909981541956553e-06, + "loss": 0.3522, + "step": 9990 + }, + { + "epoch": 0.7098113676290526, + "grad_norm": 3.4738198913190037, + "learning_rate": 2.902882294476786e-06, + "loss": 0.3636, + "step": 10000 + }, + { + "epoch": 0.7105211789966817, + "grad_norm": 2.599196507580769, + "learning_rate": 2.8957830469970183e-06, + "loss": 0.373, + "step": 10010 + }, + { + "epoch": 0.7112309903643107, + "grad_norm": 4.846340487255633, + "learning_rate": 2.888683799517251e-06, + "loss": 0.364, + "step": 10020 + }, + { + "epoch": 0.7119408017319397, + "grad_norm": 4.14481835106229, + "learning_rate": 2.881584552037484e-06, + "loss": 0.3565, + "step": 10030 + }, + { + "epoch": 0.7126506130995688, + "grad_norm": 3.12959687042078, + "learning_rate": 2.8744853045577172e-06, + "loss": 0.3597, + "step": 10040 + }, + { + "epoch": 0.7133604244671978, + "grad_norm": 2.0499607045489157, + "learning_rate": 2.86738605707795e-06, + "loss": 0.3665, + "step": 10050 + }, + { + "epoch": 0.7140702358348269, + "grad_norm": 3.4345739303394964, + "learning_rate": 2.860286809598183e-06, + "loss": 0.3406, + "step": 10060 + }, + { + "epoch": 0.7147800472024559, + "grad_norm": 3.2507549549593677, + "learning_rate": 2.8531875621184157e-06, + "loss": 0.3691, + "step": 10070 + }, + { + "epoch": 0.715489858570085, + "grad_norm": 3.088999571380729, + "learning_rate": 2.8460883146386488e-06, + "loss": 0.3512, + "step": 10080 + }, + { + "epoch": 0.716199669937714, + "grad_norm": 3.992697102415428, + "learning_rate": 2.8389890671588815e-06, + "loss": 0.3584, + "step": 10090 + }, + { + "epoch": 0.7169094813053432, + "grad_norm": 8.327520697203159, + "learning_rate": 2.8318898196791146e-06, + "loss": 0.3604, + "step": 10100 + }, + { + "epoch": 0.7176192926729722, + "grad_norm": 4.600972082353797, + "learning_rate": 2.824790572199347e-06, + "loss": 0.3641, + "step": 10110 + }, + { + "epoch": 0.7183291040406012, + "grad_norm": 3.6403983429872384, + "learning_rate": 2.81769132471958e-06, + "loss": 0.3496, + "step": 10120 + }, + { + "epoch": 0.7190389154082303, + "grad_norm": 2.831902492470625, + "learning_rate": 2.8105920772398126e-06, + "loss": 0.3611, + "step": 10130 + }, + { + "epoch": 0.7197487267758593, + "grad_norm": 4.428260390842955, + "learning_rate": 2.8034928297600457e-06, + "loss": 0.3572, + "step": 10140 + }, + { + "epoch": 0.7204585381434884, + "grad_norm": 5.5528766539260825, + "learning_rate": 2.7963935822802784e-06, + "loss": 0.3605, + "step": 10150 + }, + { + "epoch": 0.7211683495111174, + "grad_norm": 3.3271150324051124, + "learning_rate": 2.7892943348005115e-06, + "loss": 0.3646, + "step": 10160 + }, + { + "epoch": 0.7218781608787465, + "grad_norm": 4.353636452465487, + "learning_rate": 2.782195087320744e-06, + "loss": 0.3745, + "step": 10170 + }, + { + "epoch": 0.7225879722463755, + "grad_norm": 4.938483709090633, + "learning_rate": 2.7750958398409773e-06, + "loss": 0.3586, + "step": 10180 + }, + { + "epoch": 0.7232977836140045, + "grad_norm": 4.667393928494558, + "learning_rate": 2.76799659236121e-06, + "loss": 0.3526, + "step": 10190 + }, + { + "epoch": 0.7240075949816336, + "grad_norm": 5.312814121573459, + "learning_rate": 2.760897344881443e-06, + "loss": 0.3539, + "step": 10200 + }, + { + "epoch": 0.7247174063492627, + "grad_norm": 3.102848391211554, + "learning_rate": 2.7537980974016757e-06, + "loss": 0.3453, + "step": 10210 + }, + { + "epoch": 0.7254272177168918, + "grad_norm": 3.036840145081599, + "learning_rate": 2.7466988499219084e-06, + "loss": 0.3627, + "step": 10220 + }, + { + "epoch": 0.7261370290845208, + "grad_norm": 5.647990352632265, + "learning_rate": 2.739599602442141e-06, + "loss": 0.3555, + "step": 10230 + }, + { + "epoch": 0.7268468404521499, + "grad_norm": 4.66342024342857, + "learning_rate": 2.732500354962374e-06, + "loss": 0.3722, + "step": 10240 + }, + { + "epoch": 0.7275566518197789, + "grad_norm": 3.168307885423117, + "learning_rate": 2.725401107482607e-06, + "loss": 0.3673, + "step": 10250 + }, + { + "epoch": 0.728266463187408, + "grad_norm": 4.968172759395676, + "learning_rate": 2.71830186000284e-06, + "loss": 0.3556, + "step": 10260 + }, + { + "epoch": 0.728976274555037, + "grad_norm": 3.5154935991341123, + "learning_rate": 2.7112026125230726e-06, + "loss": 0.3593, + "step": 10270 + }, + { + "epoch": 0.729686085922666, + "grad_norm": 5.0083468168620655, + "learning_rate": 2.7041033650433057e-06, + "loss": 0.3592, + "step": 10280 + }, + { + "epoch": 0.7303958972902951, + "grad_norm": 3.379094612224907, + "learning_rate": 2.697004117563539e-06, + "loss": 0.3643, + "step": 10290 + }, + { + "epoch": 0.7311057086579241, + "grad_norm": 4.180270451928424, + "learning_rate": 2.6899048700837715e-06, + "loss": 0.3574, + "step": 10300 + }, + { + "epoch": 0.7318155200255532, + "grad_norm": 4.640198570927561, + "learning_rate": 2.6828056226040046e-06, + "loss": 0.3578, + "step": 10310 + }, + { + "epoch": 0.7325253313931822, + "grad_norm": 10.365125402351024, + "learning_rate": 2.675706375124237e-06, + "loss": 0.3614, + "step": 10320 + }, + { + "epoch": 0.7332351427608114, + "grad_norm": 15.355341780635097, + "learning_rate": 2.6686071276444695e-06, + "loss": 0.3631, + "step": 10330 + }, + { + "epoch": 0.7339449541284404, + "grad_norm": 6.738981517513828, + "learning_rate": 2.6615078801647026e-06, + "loss": 0.3493, + "step": 10340 + }, + { + "epoch": 0.7346547654960695, + "grad_norm": 7.55570609393924, + "learning_rate": 2.6544086326849357e-06, + "loss": 0.371, + "step": 10350 + }, + { + "epoch": 0.7353645768636985, + "grad_norm": 2.6482961979611526, + "learning_rate": 2.6473093852051684e-06, + "loss": 0.3591, + "step": 10360 + }, + { + "epoch": 0.7360743882313275, + "grad_norm": 8.054548870993123, + "learning_rate": 2.6402101377254015e-06, + "loss": 0.3577, + "step": 10370 + }, + { + "epoch": 0.7367841995989566, + "grad_norm": 7.370207938746124, + "learning_rate": 2.633110890245634e-06, + "loss": 0.3509, + "step": 10380 + }, + { + "epoch": 0.7374940109665856, + "grad_norm": 8.915363239178143, + "learning_rate": 2.6260116427658673e-06, + "loss": 0.3595, + "step": 10390 + }, + { + "epoch": 0.7382038223342147, + "grad_norm": 6.453539668987391, + "learning_rate": 2.6189123952861e-06, + "loss": 0.3735, + "step": 10400 + }, + { + "epoch": 0.7389136337018437, + "grad_norm": 13.429374820990935, + "learning_rate": 2.611813147806333e-06, + "loss": 0.343, + "step": 10410 + }, + { + "epoch": 0.7396234450694728, + "grad_norm": 4.019465503184252, + "learning_rate": 2.6047139003265653e-06, + "loss": 0.3619, + "step": 10420 + }, + { + "epoch": 0.7403332564371018, + "grad_norm": 4.77728942914678, + "learning_rate": 2.5976146528467984e-06, + "loss": 0.3602, + "step": 10430 + }, + { + "epoch": 0.7410430678047308, + "grad_norm": 16.82021280745509, + "learning_rate": 2.590515405367031e-06, + "loss": 0.3765, + "step": 10440 + }, + { + "epoch": 0.74175287917236, + "grad_norm": 4.7659520678895735, + "learning_rate": 2.5834161578872642e-06, + "loss": 0.3557, + "step": 10450 + }, + { + "epoch": 0.742462690539989, + "grad_norm": 5.846901706253607, + "learning_rate": 2.576316910407497e-06, + "loss": 0.3574, + "step": 10460 + }, + { + "epoch": 0.7431725019076181, + "grad_norm": 5.00717365628058, + "learning_rate": 2.56921766292773e-06, + "loss": 0.371, + "step": 10470 + }, + { + "epoch": 0.7438823132752471, + "grad_norm": 12.812616706907704, + "learning_rate": 2.5621184154479627e-06, + "loss": 0.3612, + "step": 10480 + }, + { + "epoch": 0.7445921246428762, + "grad_norm": 2.7312101929568375, + "learning_rate": 2.5550191679681958e-06, + "loss": 0.3551, + "step": 10490 + }, + { + "epoch": 0.7453019360105052, + "grad_norm": 3.0759041075210782, + "learning_rate": 2.5479199204884285e-06, + "loss": 0.3574, + "step": 10500 + }, + { + "epoch": 0.7460117473781342, + "grad_norm": 7.165278043719281, + "learning_rate": 2.5408206730086616e-06, + "loss": 0.3605, + "step": 10510 + }, + { + "epoch": 0.7467215587457633, + "grad_norm": 4.908665990783306, + "learning_rate": 2.533721425528894e-06, + "loss": 0.3479, + "step": 10520 + }, + { + "epoch": 0.7474313701133923, + "grad_norm": 3.4583261557450227, + "learning_rate": 2.526622178049127e-06, + "loss": 0.3542, + "step": 10530 + }, + { + "epoch": 0.7481411814810214, + "grad_norm": 11.387458565670322, + "learning_rate": 2.5195229305693596e-06, + "loss": 0.3619, + "step": 10540 + }, + { + "epoch": 0.7488509928486504, + "grad_norm": 10.198798372329442, + "learning_rate": 2.5124236830895927e-06, + "loss": 0.3434, + "step": 10550 + }, + { + "epoch": 0.7495608042162796, + "grad_norm": 3.893599380410888, + "learning_rate": 2.5053244356098254e-06, + "loss": 0.362, + "step": 10560 + }, + { + "epoch": 0.7502706155839086, + "grad_norm": 5.107597028464082, + "learning_rate": 2.4982251881300585e-06, + "loss": 0.3688, + "step": 10570 + }, + { + "epoch": 0.7509804269515377, + "grad_norm": 4.219068583835792, + "learning_rate": 2.491125940650291e-06, + "loss": 0.3649, + "step": 10580 + }, + { + "epoch": 0.7516902383191667, + "grad_norm": 4.535592066198855, + "learning_rate": 2.4840266931705243e-06, + "loss": 0.37, + "step": 10590 + }, + { + "epoch": 0.7524000496867957, + "grad_norm": 3.541264339618074, + "learning_rate": 2.476927445690757e-06, + "loss": 0.3679, + "step": 10600 + }, + { + "epoch": 0.7531098610544248, + "grad_norm": 4.7884449114332845, + "learning_rate": 2.4698281982109896e-06, + "loss": 0.3472, + "step": 10610 + }, + { + "epoch": 0.7538196724220538, + "grad_norm": 8.667808097909838, + "learning_rate": 2.4627289507312227e-06, + "loss": 0.3704, + "step": 10620 + }, + { + "epoch": 0.7545294837896829, + "grad_norm": 4.925434074834849, + "learning_rate": 2.455629703251456e-06, + "loss": 0.3701, + "step": 10630 + }, + { + "epoch": 0.7552392951573119, + "grad_norm": 3.8594886335750807, + "learning_rate": 2.4485304557716885e-06, + "loss": 0.3662, + "step": 10640 + }, + { + "epoch": 0.755949106524941, + "grad_norm": 4.971536391123703, + "learning_rate": 2.441431208291921e-06, + "loss": 0.35, + "step": 10650 + }, + { + "epoch": 0.75665891789257, + "grad_norm": 15.055144352578429, + "learning_rate": 2.434331960812154e-06, + "loss": 0.3584, + "step": 10660 + }, + { + "epoch": 0.757368729260199, + "grad_norm": 14.432076661811932, + "learning_rate": 2.427232713332387e-06, + "loss": 0.3621, + "step": 10670 + }, + { + "epoch": 0.7580785406278282, + "grad_norm": 9.810669772230819, + "learning_rate": 2.42013346585262e-06, + "loss": 0.3588, + "step": 10680 + }, + { + "epoch": 0.7587883519954572, + "grad_norm": 5.765479927608821, + "learning_rate": 2.4130342183728527e-06, + "loss": 0.3549, + "step": 10690 + }, + { + "epoch": 0.7594981633630863, + "grad_norm": 13.617197754978974, + "learning_rate": 2.4059349708930854e-06, + "loss": 0.3759, + "step": 10700 + }, + { + "epoch": 0.7602079747307153, + "grad_norm": 5.614482278416453, + "learning_rate": 2.3988357234133185e-06, + "loss": 0.3376, + "step": 10710 + }, + { + "epoch": 0.7609177860983444, + "grad_norm": 17.701642596831444, + "learning_rate": 2.391736475933551e-06, + "loss": 0.3647, + "step": 10720 + }, + { + "epoch": 0.7616275974659734, + "grad_norm": 4.910333781437824, + "learning_rate": 2.3846372284537843e-06, + "loss": 0.3643, + "step": 10730 + }, + { + "epoch": 0.7623374088336025, + "grad_norm": 3.415309685272355, + "learning_rate": 2.377537980974017e-06, + "loss": 0.3488, + "step": 10740 + }, + { + "epoch": 0.7630472202012315, + "grad_norm": 4.350903829153794, + "learning_rate": 2.3704387334942497e-06, + "loss": 0.3577, + "step": 10750 + }, + { + "epoch": 0.7637570315688605, + "grad_norm": 3.9361079752185435, + "learning_rate": 2.3633394860144828e-06, + "loss": 0.3591, + "step": 10760 + }, + { + "epoch": 0.7644668429364896, + "grad_norm": 5.913083445040196, + "learning_rate": 2.3562402385347154e-06, + "loss": 0.3486, + "step": 10770 + }, + { + "epoch": 0.7651766543041186, + "grad_norm": 5.982161931863015, + "learning_rate": 2.3491409910549485e-06, + "loss": 0.3714, + "step": 10780 + }, + { + "epoch": 0.7658864656717478, + "grad_norm": 4.5231254195655906, + "learning_rate": 2.3420417435751812e-06, + "loss": 0.3534, + "step": 10790 + }, + { + "epoch": 0.7665962770393768, + "grad_norm": 5.099871081954513, + "learning_rate": 2.334942496095414e-06, + "loss": 0.3509, + "step": 10800 + }, + { + "epoch": 0.7673060884070059, + "grad_norm": 3.361247181502804, + "learning_rate": 2.327843248615647e-06, + "loss": 0.3692, + "step": 10810 + }, + { + "epoch": 0.7680158997746349, + "grad_norm": 6.553423618292367, + "learning_rate": 2.3207440011358797e-06, + "loss": 0.353, + "step": 10820 + }, + { + "epoch": 0.768725711142264, + "grad_norm": 2.985537513367268, + "learning_rate": 2.3136447536561128e-06, + "loss": 0.3498, + "step": 10830 + }, + { + "epoch": 0.769435522509893, + "grad_norm": 3.0266471519507427, + "learning_rate": 2.3065455061763455e-06, + "loss": 0.3563, + "step": 10840 + }, + { + "epoch": 0.770145333877522, + "grad_norm": 17.644165005698888, + "learning_rate": 2.299446258696578e-06, + "loss": 0.3662, + "step": 10850 + }, + { + "epoch": 0.7708551452451511, + "grad_norm": 3.1894412768611016, + "learning_rate": 2.2923470112168112e-06, + "loss": 0.3503, + "step": 10860 + }, + { + "epoch": 0.7715649566127801, + "grad_norm": 4.492544324422795, + "learning_rate": 2.285247763737044e-06, + "loss": 0.3436, + "step": 10870 + }, + { + "epoch": 0.7722747679804092, + "grad_norm": 4.173829674998731, + "learning_rate": 2.278148516257277e-06, + "loss": 0.363, + "step": 10880 + }, + { + "epoch": 0.7729845793480382, + "grad_norm": 3.114718418646357, + "learning_rate": 2.2710492687775097e-06, + "loss": 0.3368, + "step": 10890 + }, + { + "epoch": 0.7736943907156673, + "grad_norm": 2.6323429503484443, + "learning_rate": 2.2639500212977424e-06, + "loss": 0.3489, + "step": 10900 + }, + { + "epoch": 0.7744042020832964, + "grad_norm": 2.8865277064459223, + "learning_rate": 2.2568507738179755e-06, + "loss": 0.3571, + "step": 10910 + }, + { + "epoch": 0.7751140134509255, + "grad_norm": 8.888602826244627, + "learning_rate": 2.249751526338208e-06, + "loss": 0.3399, + "step": 10920 + }, + { + "epoch": 0.7758238248185545, + "grad_norm": 3.532724353902858, + "learning_rate": 2.2426522788584412e-06, + "loss": 0.3493, + "step": 10930 + }, + { + "epoch": 0.7765336361861835, + "grad_norm": 3.6781547439101883, + "learning_rate": 2.235553031378674e-06, + "loss": 0.3462, + "step": 10940 + }, + { + "epoch": 0.7772434475538126, + "grad_norm": 13.16004359433701, + "learning_rate": 2.2284537838989066e-06, + "loss": 0.3649, + "step": 10950 + }, + { + "epoch": 0.7779532589214416, + "grad_norm": 9.642968589987298, + "learning_rate": 2.2213545364191397e-06, + "loss": 0.3582, + "step": 10960 + }, + { + "epoch": 0.7786630702890707, + "grad_norm": 6.16050392324128, + "learning_rate": 2.2142552889393724e-06, + "loss": 0.3624, + "step": 10970 + }, + { + "epoch": 0.7793728816566997, + "grad_norm": 4.012346442724565, + "learning_rate": 2.2071560414596055e-06, + "loss": 0.3448, + "step": 10980 + }, + { + "epoch": 0.7800826930243288, + "grad_norm": 2.6066193255622956, + "learning_rate": 2.2000567939798386e-06, + "loss": 0.3644, + "step": 10990 + }, + { + "epoch": 0.7807925043919578, + "grad_norm": 7.331639609512875, + "learning_rate": 2.1929575465000713e-06, + "loss": 0.3515, + "step": 11000 + }, + { + "epoch": 0.7815023157595868, + "grad_norm": 2.990816174000455, + "learning_rate": 2.185858299020304e-06, + "loss": 0.3505, + "step": 11010 + }, + { + "epoch": 0.782212127127216, + "grad_norm": 3.6112792490950554, + "learning_rate": 2.178759051540537e-06, + "loss": 0.3548, + "step": 11020 + }, + { + "epoch": 0.782921938494845, + "grad_norm": 3.8221043132066286, + "learning_rate": 2.1716598040607697e-06, + "loss": 0.3571, + "step": 11030 + }, + { + "epoch": 0.7836317498624741, + "grad_norm": 7.476265982563856, + "learning_rate": 2.164560556581003e-06, + "loss": 0.3428, + "step": 11040 + }, + { + "epoch": 0.7843415612301031, + "grad_norm": 5.554911455235443, + "learning_rate": 2.1574613091012355e-06, + "loss": 0.354, + "step": 11050 + }, + { + "epoch": 0.7850513725977322, + "grad_norm": 2.9298081851011117, + "learning_rate": 2.150362061621468e-06, + "loss": 0.3597, + "step": 11060 + }, + { + "epoch": 0.7857611839653612, + "grad_norm": 5.325097733237352, + "learning_rate": 2.1432628141417013e-06, + "loss": 0.3486, + "step": 11070 + }, + { + "epoch": 0.7864709953329903, + "grad_norm": 3.5814394523109114, + "learning_rate": 2.136163566661934e-06, + "loss": 0.3544, + "step": 11080 + }, + { + "epoch": 0.7871808067006193, + "grad_norm": 3.6972554376986, + "learning_rate": 2.129064319182167e-06, + "loss": 0.3546, + "step": 11090 + }, + { + "epoch": 0.7878906180682483, + "grad_norm": 6.754098899246775, + "learning_rate": 2.1219650717023997e-06, + "loss": 0.3537, + "step": 11100 + }, + { + "epoch": 0.7886004294358774, + "grad_norm": 3.3122898855719876, + "learning_rate": 2.1148658242226324e-06, + "loss": 0.3645, + "step": 11110 + }, + { + "epoch": 0.7893102408035064, + "grad_norm": 2.8223728276754128, + "learning_rate": 2.1077665767428655e-06, + "loss": 0.3599, + "step": 11120 + }, + { + "epoch": 0.7900200521711355, + "grad_norm": 2.5012481292133937, + "learning_rate": 2.100667329263098e-06, + "loss": 0.3486, + "step": 11130 + }, + { + "epoch": 0.7907298635387646, + "grad_norm": 11.033197138630223, + "learning_rate": 2.0935680817833313e-06, + "loss": 0.3467, + "step": 11140 + }, + { + "epoch": 0.7914396749063937, + "grad_norm": 3.730389968284293, + "learning_rate": 2.086468834303564e-06, + "loss": 0.3544, + "step": 11150 + }, + { + "epoch": 0.7921494862740227, + "grad_norm": 5.898064410181565, + "learning_rate": 2.0793695868237967e-06, + "loss": 0.3477, + "step": 11160 + }, + { + "epoch": 0.7928592976416518, + "grad_norm": 4.55198088261442, + "learning_rate": 2.0722703393440298e-06, + "loss": 0.3527, + "step": 11170 + }, + { + "epoch": 0.7935691090092808, + "grad_norm": 5.318762071563834, + "learning_rate": 2.0651710918642624e-06, + "loss": 0.3478, + "step": 11180 + }, + { + "epoch": 0.7942789203769098, + "grad_norm": 6.161214607463883, + "learning_rate": 2.0580718443844955e-06, + "loss": 0.3546, + "step": 11190 + }, + { + "epoch": 0.7949887317445389, + "grad_norm": 3.1236830623318537, + "learning_rate": 2.0509725969047282e-06, + "loss": 0.3565, + "step": 11200 + }, + { + "epoch": 0.7956985431121679, + "grad_norm": 4.197839999078878, + "learning_rate": 2.043873349424961e-06, + "loss": 0.3496, + "step": 11210 + }, + { + "epoch": 0.796408354479797, + "grad_norm": 3.2762330861667515, + "learning_rate": 2.036774101945194e-06, + "loss": 0.348, + "step": 11220 + }, + { + "epoch": 0.797118165847426, + "grad_norm": 5.961140258537488, + "learning_rate": 2.0296748544654267e-06, + "loss": 0.3637, + "step": 11230 + }, + { + "epoch": 0.797827977215055, + "grad_norm": 2.0964322412177263, + "learning_rate": 2.0225756069856598e-06, + "loss": 0.341, + "step": 11240 + }, + { + "epoch": 0.7985377885826842, + "grad_norm": 11.078753928620895, + "learning_rate": 2.0154763595058925e-06, + "loss": 0.3582, + "step": 11250 + }, + { + "epoch": 0.7992475999503132, + "grad_norm": 11.615859636107096, + "learning_rate": 2.008377112026125e-06, + "loss": 0.3504, + "step": 11260 + }, + { + "epoch": 0.7999574113179423, + "grad_norm": 9.267486623233392, + "learning_rate": 2.0012778645463582e-06, + "loss": 0.3585, + "step": 11270 + }, + { + "epoch": 0.8006672226855713, + "grad_norm": 3.7638868565818613, + "learning_rate": 1.994178617066591e-06, + "loss": 0.3572, + "step": 11280 + }, + { + "epoch": 0.8013770340532004, + "grad_norm": 4.274096264509613, + "learning_rate": 1.987079369586824e-06, + "loss": 0.352, + "step": 11290 + }, + { + "epoch": 0.8020868454208294, + "grad_norm": 3.0651382288741824, + "learning_rate": 1.979980122107057e-06, + "loss": 0.3487, + "step": 11300 + }, + { + "epoch": 0.8027966567884585, + "grad_norm": 2.585139354778811, + "learning_rate": 1.9728808746272894e-06, + "loss": 0.3509, + "step": 11310 + }, + { + "epoch": 0.8035064681560875, + "grad_norm": 3.4507245702670013, + "learning_rate": 1.9657816271475225e-06, + "loss": 0.3605, + "step": 11320 + }, + { + "epoch": 0.8042162795237165, + "grad_norm": 2.168473869134373, + "learning_rate": 1.9586823796677556e-06, + "loss": 0.3473, + "step": 11330 + }, + { + "epoch": 0.8049260908913456, + "grad_norm": 3.3138804394827126, + "learning_rate": 1.9515831321879883e-06, + "loss": 0.3451, + "step": 11340 + }, + { + "epoch": 0.8056359022589746, + "grad_norm": 2.9967871033094284, + "learning_rate": 1.9444838847082214e-06, + "loss": 0.3586, + "step": 11350 + }, + { + "epoch": 0.8063457136266037, + "grad_norm": 2.218098420224771, + "learning_rate": 1.9373846372284536e-06, + "loss": 0.3629, + "step": 11360 + }, + { + "epoch": 0.8070555249942328, + "grad_norm": 4.124703498173868, + "learning_rate": 1.9302853897486867e-06, + "loss": 0.349, + "step": 11370 + }, + { + "epoch": 0.8077653363618619, + "grad_norm": 4.336301638014139, + "learning_rate": 1.92318614226892e-06, + "loss": 0.3474, + "step": 11380 + }, + { + "epoch": 0.8084751477294909, + "grad_norm": 5.67446885361532, + "learning_rate": 1.9160868947891525e-06, + "loss": 0.3577, + "step": 11390 + }, + { + "epoch": 0.80918495909712, + "grad_norm": 5.496735292829206, + "learning_rate": 1.9089876473093856e-06, + "loss": 0.3606, + "step": 11400 + }, + { + "epoch": 0.809894770464749, + "grad_norm": 2.3181036706188505, + "learning_rate": 1.901888399829618e-06, + "loss": 0.3573, + "step": 11410 + }, + { + "epoch": 0.810604581832378, + "grad_norm": 4.2823563842257695, + "learning_rate": 1.894789152349851e-06, + "loss": 0.3456, + "step": 11420 + }, + { + "epoch": 0.8113143932000071, + "grad_norm": 9.041186743139388, + "learning_rate": 1.8876899048700838e-06, + "loss": 0.3493, + "step": 11430 + }, + { + "epoch": 0.8120242045676361, + "grad_norm": 2.135565041402105, + "learning_rate": 1.8805906573903167e-06, + "loss": 0.3573, + "step": 11440 + }, + { + "epoch": 0.8127340159352652, + "grad_norm": 4.2654812969837295, + "learning_rate": 1.8734914099105498e-06, + "loss": 0.3462, + "step": 11450 + }, + { + "epoch": 0.8134438273028942, + "grad_norm": 3.0226693302416465, + "learning_rate": 1.8663921624307823e-06, + "loss": 0.3399, + "step": 11460 + }, + { + "epoch": 0.8141536386705233, + "grad_norm": 5.674429424631266, + "learning_rate": 1.8592929149510152e-06, + "loss": 0.3445, + "step": 11470 + }, + { + "epoch": 0.8148634500381524, + "grad_norm": 5.107735874370569, + "learning_rate": 1.852193667471248e-06, + "loss": 0.3498, + "step": 11480 + }, + { + "epoch": 0.8155732614057815, + "grad_norm": 4.211595369240753, + "learning_rate": 1.8450944199914812e-06, + "loss": 0.3509, + "step": 11490 + }, + { + "epoch": 0.8162830727734105, + "grad_norm": 3.2874196387814485, + "learning_rate": 1.837995172511714e-06, + "loss": 0.352, + "step": 11500 + }, + { + "epoch": 0.8169928841410395, + "grad_norm": 2.51051446421893, + "learning_rate": 1.8308959250319465e-06, + "loss": 0.3445, + "step": 11510 + }, + { + "epoch": 0.8177026955086686, + "grad_norm": 13.267874952448258, + "learning_rate": 1.8237966775521796e-06, + "loss": 0.354, + "step": 11520 + }, + { + "epoch": 0.8184125068762976, + "grad_norm": 4.900767095828628, + "learning_rate": 1.8166974300724125e-06, + "loss": 0.3594, + "step": 11530 + }, + { + "epoch": 0.8191223182439267, + "grad_norm": 8.3230418317363, + "learning_rate": 1.8095981825926454e-06, + "loss": 0.3471, + "step": 11540 + }, + { + "epoch": 0.8198321296115557, + "grad_norm": 2.8346340256917815, + "learning_rate": 1.8024989351128783e-06, + "loss": 0.3695, + "step": 11550 + }, + { + "epoch": 0.8205419409791848, + "grad_norm": 5.533189262204602, + "learning_rate": 1.795399687633111e-06, + "loss": 0.3728, + "step": 11560 + }, + { + "epoch": 0.8212517523468138, + "grad_norm": 3.187071233846852, + "learning_rate": 1.7883004401533439e-06, + "loss": 0.3464, + "step": 11570 + }, + { + "epoch": 0.8219615637144428, + "grad_norm": 3.9314257894883937, + "learning_rate": 1.7812011926735768e-06, + "loss": 0.3532, + "step": 11580 + }, + { + "epoch": 0.8226713750820719, + "grad_norm": 3.6730541227348277, + "learning_rate": 1.7741019451938097e-06, + "loss": 0.3565, + "step": 11590 + }, + { + "epoch": 0.823381186449701, + "grad_norm": 2.9136274666194306, + "learning_rate": 1.7670026977140426e-06, + "loss": 0.3603, + "step": 11600 + }, + { + "epoch": 0.8240909978173301, + "grad_norm": 6.106992201577366, + "learning_rate": 1.7599034502342754e-06, + "loss": 0.3484, + "step": 11610 + }, + { + "epoch": 0.8248008091849591, + "grad_norm": 4.230462903274037, + "learning_rate": 1.7528042027545081e-06, + "loss": 0.35, + "step": 11620 + }, + { + "epoch": 0.8255106205525882, + "grad_norm": 3.376064932155992, + "learning_rate": 1.745704955274741e-06, + "loss": 0.35, + "step": 11630 + }, + { + "epoch": 0.8262204319202172, + "grad_norm": 2.8424779046250612, + "learning_rate": 1.738605707794974e-06, + "loss": 0.3552, + "step": 11640 + }, + { + "epoch": 0.8269302432878463, + "grad_norm": 3.6044824322491347, + "learning_rate": 1.7315064603152068e-06, + "loss": 0.3633, + "step": 11650 + }, + { + "epoch": 0.8276400546554753, + "grad_norm": 3.3041226058016324, + "learning_rate": 1.7244072128354397e-06, + "loss": 0.3453, + "step": 11660 + }, + { + "epoch": 0.8283498660231043, + "grad_norm": 3.461976575510189, + "learning_rate": 1.7173079653556724e-06, + "loss": 0.3607, + "step": 11670 + }, + { + "epoch": 0.8290596773907334, + "grad_norm": 3.96624408516477, + "learning_rate": 1.7102087178759052e-06, + "loss": 0.3431, + "step": 11680 + }, + { + "epoch": 0.8297694887583624, + "grad_norm": 10.446490548963004, + "learning_rate": 1.7031094703961381e-06, + "loss": 0.3518, + "step": 11690 + }, + { + "epoch": 0.8304793001259915, + "grad_norm": 2.4894424633296888, + "learning_rate": 1.696010222916371e-06, + "loss": 0.3618, + "step": 11700 + }, + { + "epoch": 0.8311891114936206, + "grad_norm": 3.7097939930537494, + "learning_rate": 1.688910975436604e-06, + "loss": 0.3577, + "step": 11710 + }, + { + "epoch": 0.8318989228612497, + "grad_norm": 2.591589818986439, + "learning_rate": 1.6818117279568366e-06, + "loss": 0.3454, + "step": 11720 + }, + { + "epoch": 0.8326087342288787, + "grad_norm": 3.0415000039562816, + "learning_rate": 1.6747124804770695e-06, + "loss": 0.3514, + "step": 11730 + }, + { + "epoch": 0.8333185455965078, + "grad_norm": 3.185465708245909, + "learning_rate": 1.6676132329973024e-06, + "loss": 0.3437, + "step": 11740 + }, + { + "epoch": 0.8340283569641368, + "grad_norm": 8.153250864972724, + "learning_rate": 1.6605139855175353e-06, + "loss": 0.3418, + "step": 11750 + }, + { + "epoch": 0.8347381683317658, + "grad_norm": 17.15311701699765, + "learning_rate": 1.6534147380377682e-06, + "loss": 0.3533, + "step": 11760 + }, + { + "epoch": 0.8354479796993949, + "grad_norm": 2.956498750624732, + "learning_rate": 1.6463154905580008e-06, + "loss": 0.3539, + "step": 11770 + }, + { + "epoch": 0.8361577910670239, + "grad_norm": 5.182422880739596, + "learning_rate": 1.6392162430782337e-06, + "loss": 0.3543, + "step": 11780 + }, + { + "epoch": 0.836867602434653, + "grad_norm": 5.245759433932608, + "learning_rate": 1.6321169955984666e-06, + "loss": 0.3506, + "step": 11790 + }, + { + "epoch": 0.837577413802282, + "grad_norm": 2.8777113855306, + "learning_rate": 1.6250177481186997e-06, + "loss": 0.351, + "step": 11800 + }, + { + "epoch": 0.838287225169911, + "grad_norm": 3.317900354948997, + "learning_rate": 1.6179185006389326e-06, + "loss": 0.3426, + "step": 11810 + }, + { + "epoch": 0.8389970365375401, + "grad_norm": 2.7259998460321295, + "learning_rate": 1.610819253159165e-06, + "loss": 0.3416, + "step": 11820 + }, + { + "epoch": 0.8397068479051693, + "grad_norm": 7.203501395811214, + "learning_rate": 1.603720005679398e-06, + "loss": 0.346, + "step": 11830 + }, + { + "epoch": 0.8404166592727983, + "grad_norm": 3.5281319520469343, + "learning_rate": 1.596620758199631e-06, + "loss": 0.3415, + "step": 11840 + }, + { + "epoch": 0.8411264706404273, + "grad_norm": 2.8068995456792085, + "learning_rate": 1.589521510719864e-06, + "loss": 0.3506, + "step": 11850 + }, + { + "epoch": 0.8418362820080564, + "grad_norm": 5.8571413992691, + "learning_rate": 1.5824222632400968e-06, + "loss": 0.3492, + "step": 11860 + }, + { + "epoch": 0.8425460933756854, + "grad_norm": 2.8473277239745625, + "learning_rate": 1.5753230157603295e-06, + "loss": 0.3464, + "step": 11870 + }, + { + "epoch": 0.8432559047433145, + "grad_norm": 2.743001963303042, + "learning_rate": 1.5682237682805624e-06, + "loss": 0.3457, + "step": 11880 + }, + { + "epoch": 0.8439657161109435, + "grad_norm": 10.213481491528695, + "learning_rate": 1.5611245208007953e-06, + "loss": 0.3578, + "step": 11890 + }, + { + "epoch": 0.8446755274785726, + "grad_norm": 3.735755256117381, + "learning_rate": 1.5540252733210282e-06, + "loss": 0.3503, + "step": 11900 + }, + { + "epoch": 0.8453853388462016, + "grad_norm": 4.459890794830131, + "learning_rate": 1.546926025841261e-06, + "loss": 0.3409, + "step": 11910 + }, + { + "epoch": 0.8460951502138306, + "grad_norm": 4.8029617986261295, + "learning_rate": 1.5398267783614938e-06, + "loss": 0.3538, + "step": 11920 + }, + { + "epoch": 0.8468049615814597, + "grad_norm": 7.056776646894436, + "learning_rate": 1.5327275308817267e-06, + "loss": 0.346, + "step": 11930 + }, + { + "epoch": 0.8475147729490888, + "grad_norm": 7.364554673266408, + "learning_rate": 1.5256282834019595e-06, + "loss": 0.3478, + "step": 11940 + }, + { + "epoch": 0.8482245843167179, + "grad_norm": 3.605377806044163, + "learning_rate": 1.5185290359221924e-06, + "loss": 0.3499, + "step": 11950 + }, + { + "epoch": 0.8489343956843469, + "grad_norm": 2.452400869581193, + "learning_rate": 1.5114297884424253e-06, + "loss": 0.339, + "step": 11960 + }, + { + "epoch": 0.849644207051976, + "grad_norm": 2.870621078183671, + "learning_rate": 1.504330540962658e-06, + "loss": 0.3441, + "step": 11970 + }, + { + "epoch": 0.850354018419605, + "grad_norm": 4.473314694561015, + "learning_rate": 1.4972312934828909e-06, + "loss": 0.3559, + "step": 11980 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 5.114834992133615, + "learning_rate": 1.4901320460031238e-06, + "loss": 0.3541, + "step": 11990 + }, + { + "epoch": 0.8517736411548631, + "grad_norm": 12.083657543428806, + "learning_rate": 1.4830327985233567e-06, + "loss": 0.358, + "step": 12000 + }, + { + "epoch": 0.8524834525224921, + "grad_norm": 3.7361409384047923, + "learning_rate": 1.4759335510435896e-06, + "loss": 0.3395, + "step": 12010 + }, + { + "epoch": 0.8531932638901212, + "grad_norm": 3.4424635097779657, + "learning_rate": 1.4688343035638222e-06, + "loss": 0.3593, + "step": 12020 + }, + { + "epoch": 0.8539030752577502, + "grad_norm": 1.9645069008952134, + "learning_rate": 1.4617350560840551e-06, + "loss": 0.3508, + "step": 12030 + }, + { + "epoch": 0.8546128866253793, + "grad_norm": 4.627652849790996, + "learning_rate": 1.454635808604288e-06, + "loss": 0.3408, + "step": 12040 + }, + { + "epoch": 0.8553226979930083, + "grad_norm": 3.831924600437753, + "learning_rate": 1.447536561124521e-06, + "loss": 0.3487, + "step": 12050 + }, + { + "epoch": 0.8560325093606375, + "grad_norm": 4.570169273747359, + "learning_rate": 1.4404373136447538e-06, + "loss": 0.3415, + "step": 12060 + }, + { + "epoch": 0.8567423207282665, + "grad_norm": 4.6135182738223595, + "learning_rate": 1.4333380661649865e-06, + "loss": 0.3604, + "step": 12070 + }, + { + "epoch": 0.8574521320958955, + "grad_norm": 4.751574062951781, + "learning_rate": 1.4262388186852194e-06, + "loss": 0.3636, + "step": 12080 + }, + { + "epoch": 0.8581619434635246, + "grad_norm": 3.378379003665899, + "learning_rate": 1.4191395712054523e-06, + "loss": 0.3432, + "step": 12090 + }, + { + "epoch": 0.8588717548311536, + "grad_norm": 16.540688675093385, + "learning_rate": 1.4120403237256851e-06, + "loss": 0.3396, + "step": 12100 + }, + { + "epoch": 0.8595815661987827, + "grad_norm": 4.814104030359969, + "learning_rate": 1.404941076245918e-06, + "loss": 0.3461, + "step": 12110 + }, + { + "epoch": 0.8602913775664117, + "grad_norm": 10.051601410520883, + "learning_rate": 1.3978418287661507e-06, + "loss": 0.3447, + "step": 12120 + }, + { + "epoch": 0.8610011889340408, + "grad_norm": 2.642610961406552, + "learning_rate": 1.3907425812863836e-06, + "loss": 0.3361, + "step": 12130 + }, + { + "epoch": 0.8617110003016698, + "grad_norm": 4.614329866790318, + "learning_rate": 1.3836433338066165e-06, + "loss": 0.3528, + "step": 12140 + }, + { + "epoch": 0.8624208116692988, + "grad_norm": 5.744791519089807, + "learning_rate": 1.3765440863268496e-06, + "loss": 0.3607, + "step": 12150 + }, + { + "epoch": 0.8631306230369279, + "grad_norm": 3.9315757108747618, + "learning_rate": 1.3694448388470825e-06, + "loss": 0.3598, + "step": 12160 + }, + { + "epoch": 0.8638404344045569, + "grad_norm": 5.812032059514415, + "learning_rate": 1.3623455913673154e-06, + "loss": 0.3406, + "step": 12170 + }, + { + "epoch": 0.8645502457721861, + "grad_norm": 3.1863830261887784, + "learning_rate": 1.3552463438875478e-06, + "loss": 0.3435, + "step": 12180 + }, + { + "epoch": 0.8652600571398151, + "grad_norm": 3.164333810889643, + "learning_rate": 1.348147096407781e-06, + "loss": 0.3477, + "step": 12190 + }, + { + "epoch": 0.8659698685074442, + "grad_norm": 4.132090281780686, + "learning_rate": 1.3410478489280138e-06, + "loss": 0.3476, + "step": 12200 + }, + { + "epoch": 0.8666796798750732, + "grad_norm": 3.050674443165291, + "learning_rate": 1.3339486014482467e-06, + "loss": 0.3451, + "step": 12210 + }, + { + "epoch": 0.8673894912427023, + "grad_norm": 5.9765372634611476, + "learning_rate": 1.3268493539684796e-06, + "loss": 0.3516, + "step": 12220 + }, + { + "epoch": 0.8680993026103313, + "grad_norm": 10.801904177839997, + "learning_rate": 1.3197501064887123e-06, + "loss": 0.3525, + "step": 12230 + }, + { + "epoch": 0.8688091139779603, + "grad_norm": 10.795290079471496, + "learning_rate": 1.3126508590089452e-06, + "loss": 0.3458, + "step": 12240 + }, + { + "epoch": 0.8695189253455894, + "grad_norm": 5.185082480943749, + "learning_rate": 1.305551611529178e-06, + "loss": 0.3471, + "step": 12250 + }, + { + "epoch": 0.8702287367132184, + "grad_norm": 5.967453058115287, + "learning_rate": 1.298452364049411e-06, + "loss": 0.3593, + "step": 12260 + }, + { + "epoch": 0.8709385480808475, + "grad_norm": 2.9260514202439807, + "learning_rate": 1.2913531165696439e-06, + "loss": 0.3401, + "step": 12270 + }, + { + "epoch": 0.8716483594484765, + "grad_norm": 3.5904246593138924, + "learning_rate": 1.2842538690898765e-06, + "loss": 0.3407, + "step": 12280 + }, + { + "epoch": 0.8723581708161057, + "grad_norm": 5.983622275696177, + "learning_rate": 1.2771546216101094e-06, + "loss": 0.3453, + "step": 12290 + }, + { + "epoch": 0.8730679821837347, + "grad_norm": 4.330501853746522, + "learning_rate": 1.2700553741303423e-06, + "loss": 0.3494, + "step": 12300 + }, + { + "epoch": 0.8737777935513638, + "grad_norm": 3.642467957948953, + "learning_rate": 1.2629561266505752e-06, + "loss": 0.3458, + "step": 12310 + }, + { + "epoch": 0.8744876049189928, + "grad_norm": 5.610238111701037, + "learning_rate": 1.255856879170808e-06, + "loss": 0.3533, + "step": 12320 + }, + { + "epoch": 0.8751974162866218, + "grad_norm": 5.47126817738485, + "learning_rate": 1.248757631691041e-06, + "loss": 0.3685, + "step": 12330 + }, + { + "epoch": 0.8759072276542509, + "grad_norm": 2.9438005039273953, + "learning_rate": 1.2416583842112737e-06, + "loss": 0.3325, + "step": 12340 + }, + { + "epoch": 0.8766170390218799, + "grad_norm": 3.7896440417507415, + "learning_rate": 1.2345591367315065e-06, + "loss": 0.3445, + "step": 12350 + }, + { + "epoch": 0.877326850389509, + "grad_norm": 5.754468251004695, + "learning_rate": 1.2274598892517394e-06, + "loss": 0.3374, + "step": 12360 + }, + { + "epoch": 0.878036661757138, + "grad_norm": 4.267624406753751, + "learning_rate": 1.2203606417719723e-06, + "loss": 0.341, + "step": 12370 + }, + { + "epoch": 0.8787464731247671, + "grad_norm": 3.1963277785921993, + "learning_rate": 1.2132613942922052e-06, + "loss": 0.3381, + "step": 12380 + }, + { + "epoch": 0.8794562844923961, + "grad_norm": 6.653906616284059, + "learning_rate": 1.206162146812438e-06, + "loss": 0.3506, + "step": 12390 + }, + { + "epoch": 0.8801660958600251, + "grad_norm": 3.897977105597471, + "learning_rate": 1.1990628993326708e-06, + "loss": 0.3475, + "step": 12400 + }, + { + "epoch": 0.8808759072276543, + "grad_norm": 4.962651576299262, + "learning_rate": 1.1919636518529037e-06, + "loss": 0.349, + "step": 12410 + }, + { + "epoch": 0.8815857185952833, + "grad_norm": 5.136741390825168, + "learning_rate": 1.1848644043731366e-06, + "loss": 0.3465, + "step": 12420 + }, + { + "epoch": 0.8822955299629124, + "grad_norm": 4.445543310701251, + "learning_rate": 1.1777651568933695e-06, + "loss": 0.3548, + "step": 12430 + }, + { + "epoch": 0.8830053413305414, + "grad_norm": 20.40372637998409, + "learning_rate": 1.1706659094136021e-06, + "loss": 0.3583, + "step": 12440 + }, + { + "epoch": 0.8837151526981705, + "grad_norm": 3.982374880512643, + "learning_rate": 1.163566661933835e-06, + "loss": 0.3317, + "step": 12450 + }, + { + "epoch": 0.8844249640657995, + "grad_norm": 32.55413999411799, + "learning_rate": 1.156467414454068e-06, + "loss": 0.3514, + "step": 12460 + }, + { + "epoch": 0.8851347754334286, + "grad_norm": 5.420145750098025, + "learning_rate": 1.1493681669743008e-06, + "loss": 0.3318, + "step": 12470 + }, + { + "epoch": 0.8858445868010576, + "grad_norm": 3.685854173880656, + "learning_rate": 1.1422689194945337e-06, + "loss": 0.3429, + "step": 12480 + }, + { + "epoch": 0.8865543981686866, + "grad_norm": 4.6974765931702605, + "learning_rate": 1.1351696720147664e-06, + "loss": 0.357, + "step": 12490 + }, + { + "epoch": 0.8872642095363157, + "grad_norm": 6.795504660900696, + "learning_rate": 1.1280704245349995e-06, + "loss": 0.3531, + "step": 12500 + }, + { + "epoch": 0.8879740209039447, + "grad_norm": 4.927867549600845, + "learning_rate": 1.1209711770552324e-06, + "loss": 0.3647, + "step": 12510 + }, + { + "epoch": 0.8886838322715739, + "grad_norm": 70.3319920713418, + "learning_rate": 1.113871929575465e-06, + "loss": 0.3481, + "step": 12520 + }, + { + "epoch": 0.8893936436392029, + "grad_norm": 29.187269789239732, + "learning_rate": 1.106772682095698e-06, + "loss": 0.3487, + "step": 12530 + }, + { + "epoch": 0.890103455006832, + "grad_norm": 2.619165987059257, + "learning_rate": 1.0996734346159308e-06, + "loss": 0.3557, + "step": 12540 + }, + { + "epoch": 0.890813266374461, + "grad_norm": 5.724483375383932, + "learning_rate": 1.0925741871361637e-06, + "loss": 0.3587, + "step": 12550 + }, + { + "epoch": 0.89152307774209, + "grad_norm": 4.2668973076468, + "learning_rate": 1.0854749396563966e-06, + "loss": 0.3462, + "step": 12560 + }, + { + "epoch": 0.8922328891097191, + "grad_norm": 9.234745768295488, + "learning_rate": 1.0783756921766293e-06, + "loss": 0.3537, + "step": 12570 + }, + { + "epoch": 0.8929427004773481, + "grad_norm": 3.665665785771113, + "learning_rate": 1.0712764446968622e-06, + "loss": 0.3643, + "step": 12580 + }, + { + "epoch": 0.8936525118449772, + "grad_norm": 2.6258893539339656, + "learning_rate": 1.064177197217095e-06, + "loss": 0.3338, + "step": 12590 + }, + { + "epoch": 0.8943623232126062, + "grad_norm": 3.154491930622594, + "learning_rate": 1.057077949737328e-06, + "loss": 0.3444, + "step": 12600 + }, + { + "epoch": 0.8950721345802353, + "grad_norm": 7.836052713310002, + "learning_rate": 1.0499787022575608e-06, + "loss": 0.3628, + "step": 12610 + }, + { + "epoch": 0.8957819459478643, + "grad_norm": 3.8943175763479996, + "learning_rate": 1.0428794547777935e-06, + "loss": 0.3403, + "step": 12620 + }, + { + "epoch": 0.8964917573154934, + "grad_norm": 15.29553673398478, + "learning_rate": 1.0357802072980264e-06, + "loss": 0.3521, + "step": 12630 + }, + { + "epoch": 0.8972015686831225, + "grad_norm": 4.442650541355824, + "learning_rate": 1.0286809598182595e-06, + "loss": 0.3342, + "step": 12640 + }, + { + "epoch": 0.8979113800507516, + "grad_norm": 3.9047310665092247, + "learning_rate": 1.0215817123384922e-06, + "loss": 0.3427, + "step": 12650 + }, + { + "epoch": 0.8986211914183806, + "grad_norm": 2.1332446352398544, + "learning_rate": 1.014482464858725e-06, + "loss": 0.349, + "step": 12660 + }, + { + "epoch": 0.8993310027860096, + "grad_norm": 2.8714716164962923, + "learning_rate": 1.0073832173789578e-06, + "loss": 0.357, + "step": 12670 + }, + { + "epoch": 0.9000408141536387, + "grad_norm": 5.513019742153847, + "learning_rate": 1.0002839698991909e-06, + "loss": 0.3404, + "step": 12680 + }, + { + "epoch": 0.9007506255212677, + "grad_norm": 3.940129513886605, + "learning_rate": 9.931847224194237e-07, + "loss": 0.3637, + "step": 12690 + }, + { + "epoch": 0.9014604368888968, + "grad_norm": 3.9515535744587256, + "learning_rate": 9.860854749396564e-07, + "loss": 0.3498, + "step": 12700 + }, + { + "epoch": 0.9021702482565258, + "grad_norm": 3.0069372274862234, + "learning_rate": 9.789862274598893e-07, + "loss": 0.3398, + "step": 12710 + }, + { + "epoch": 0.9028800596241549, + "grad_norm": 3.5043049442535072, + "learning_rate": 9.718869799801222e-07, + "loss": 0.339, + "step": 12720 + }, + { + "epoch": 0.9035898709917839, + "grad_norm": 4.7818413498969825, + "learning_rate": 9.64787732500355e-07, + "loss": 0.3482, + "step": 12730 + }, + { + "epoch": 0.9042996823594129, + "grad_norm": 2.9143937043517485, + "learning_rate": 9.57688485020588e-07, + "loss": 0.3289, + "step": 12740 + }, + { + "epoch": 0.9050094937270421, + "grad_norm": 3.530470062388488, + "learning_rate": 9.505892375408208e-07, + "loss": 0.3406, + "step": 12750 + }, + { + "epoch": 0.9057193050946711, + "grad_norm": 3.6289940943514245, + "learning_rate": 9.434899900610537e-07, + "loss": 0.343, + "step": 12760 + }, + { + "epoch": 0.9064291164623002, + "grad_norm": 11.92232636233806, + "learning_rate": 9.363907425812864e-07, + "loss": 0.3538, + "step": 12770 + }, + { + "epoch": 0.9071389278299292, + "grad_norm": 3.3864038291963787, + "learning_rate": 9.292914951015193e-07, + "loss": 0.3361, + "step": 12780 + }, + { + "epoch": 0.9078487391975583, + "grad_norm": 4.345114007441839, + "learning_rate": 9.221922476217522e-07, + "loss": 0.3307, + "step": 12790 + }, + { + "epoch": 0.9085585505651873, + "grad_norm": 3.2046183568204687, + "learning_rate": 9.15093000141985e-07, + "loss": 0.3467, + "step": 12800 + }, + { + "epoch": 0.9092683619328163, + "grad_norm": 3.030859855481088, + "learning_rate": 9.079937526622179e-07, + "loss": 0.3467, + "step": 12810 + }, + { + "epoch": 0.9099781733004454, + "grad_norm": 4.579582289306875, + "learning_rate": 9.008945051824507e-07, + "loss": 0.3232, + "step": 12820 + }, + { + "epoch": 0.9106879846680744, + "grad_norm": 3.760749336756688, + "learning_rate": 8.937952577026836e-07, + "loss": 0.3467, + "step": 12830 + }, + { + "epoch": 0.9113977960357035, + "grad_norm": 3.179418594295822, + "learning_rate": 8.866960102229165e-07, + "loss": 0.3473, + "step": 12840 + }, + { + "epoch": 0.9121076074033325, + "grad_norm": 3.983021666456075, + "learning_rate": 8.795967627431492e-07, + "loss": 0.3587, + "step": 12850 + }, + { + "epoch": 0.9128174187709616, + "grad_norm": 2.6025747411648243, + "learning_rate": 8.724975152633821e-07, + "loss": 0.3462, + "step": 12860 + }, + { + "epoch": 0.9135272301385907, + "grad_norm": 4.3088037403974315, + "learning_rate": 8.65398267783615e-07, + "loss": 0.3428, + "step": 12870 + }, + { + "epoch": 0.9142370415062198, + "grad_norm": 3.7771085521562644, + "learning_rate": 8.582990203038478e-07, + "loss": 0.3398, + "step": 12880 + }, + { + "epoch": 0.9149468528738488, + "grad_norm": 2.5115102656996853, + "learning_rate": 8.511997728240808e-07, + "loss": 0.3419, + "step": 12890 + }, + { + "epoch": 0.9156566642414778, + "grad_norm": 2.646423568943871, + "learning_rate": 8.441005253443135e-07, + "loss": 0.3326, + "step": 12900 + }, + { + "epoch": 0.9163664756091069, + "grad_norm": 4.308215071259538, + "learning_rate": 8.370012778645465e-07, + "loss": 0.3383, + "step": 12910 + }, + { + "epoch": 0.9170762869767359, + "grad_norm": 7.273858221430791, + "learning_rate": 8.299020303847794e-07, + "loss": 0.3411, + "step": 12920 + }, + { + "epoch": 0.917786098344365, + "grad_norm": 3.1600055981634183, + "learning_rate": 8.228027829050122e-07, + "loss": 0.3577, + "step": 12930 + }, + { + "epoch": 0.918495909711994, + "grad_norm": 6.08255963796338, + "learning_rate": 8.15703535425245e-07, + "loss": 0.3589, + "step": 12940 + }, + { + "epoch": 0.9192057210796231, + "grad_norm": 4.397885394689723, + "learning_rate": 8.086042879454778e-07, + "loss": 0.3492, + "step": 12950 + }, + { + "epoch": 0.9199155324472521, + "grad_norm": 227.99760672787355, + "learning_rate": 8.015050404657107e-07, + "loss": 0.3346, + "step": 12960 + }, + { + "epoch": 0.9206253438148811, + "grad_norm": 2.2307237070418853, + "learning_rate": 7.944057929859436e-07, + "loss": 0.3441, + "step": 12970 + }, + { + "epoch": 0.9213351551825103, + "grad_norm": 5.180228064847272, + "learning_rate": 7.873065455061764e-07, + "loss": 0.3465, + "step": 12980 + }, + { + "epoch": 0.9220449665501393, + "grad_norm": 3.2003044967213836, + "learning_rate": 7.802072980264093e-07, + "loss": 0.3425, + "step": 12990 + }, + { + "epoch": 0.9227547779177684, + "grad_norm": 2.734492726273123, + "learning_rate": 7.731080505466421e-07, + "loss": 0.3403, + "step": 13000 + }, + { + "epoch": 0.9234645892853974, + "grad_norm": 2.825363146947483, + "learning_rate": 7.66008803066875e-07, + "loss": 0.3644, + "step": 13010 + }, + { + "epoch": 0.9241744006530265, + "grad_norm": 6.94935444401322, + "learning_rate": 7.589095555871078e-07, + "loss": 0.3498, + "step": 13020 + }, + { + "epoch": 0.9248842120206555, + "grad_norm": 2.8121909722558924, + "learning_rate": 7.518103081073406e-07, + "loss": 0.356, + "step": 13030 + }, + { + "epoch": 0.9255940233882846, + "grad_norm": 2.7024231170054946, + "learning_rate": 7.447110606275735e-07, + "loss": 0.3415, + "step": 13040 + }, + { + "epoch": 0.9263038347559136, + "grad_norm": 2.9617596087956195, + "learning_rate": 7.376118131478063e-07, + "loss": 0.3372, + "step": 13050 + }, + { + "epoch": 0.9270136461235426, + "grad_norm": 42.5976926609076, + "learning_rate": 7.305125656680392e-07, + "loss": 0.3541, + "step": 13060 + }, + { + "epoch": 0.9277234574911717, + "grad_norm": 3.769476187835692, + "learning_rate": 7.234133181882722e-07, + "loss": 0.3594, + "step": 13070 + }, + { + "epoch": 0.9284332688588007, + "grad_norm": 3.749361674379726, + "learning_rate": 7.163140707085049e-07, + "loss": 0.3348, + "step": 13080 + }, + { + "epoch": 0.9291430802264298, + "grad_norm": 2.5267280447133937, + "learning_rate": 7.092148232287379e-07, + "loss": 0.3579, + "step": 13090 + }, + { + "epoch": 0.9298528915940589, + "grad_norm": 3.0968195473762097, + "learning_rate": 7.021155757489707e-07, + "loss": 0.3392, + "step": 13100 + }, + { + "epoch": 0.930562702961688, + "grad_norm": 3.9129176862736674, + "learning_rate": 6.950163282692035e-07, + "loss": 0.3533, + "step": 13110 + }, + { + "epoch": 0.931272514329317, + "grad_norm": 2.7485456874581122, + "learning_rate": 6.879170807894364e-07, + "loss": 0.3399, + "step": 13120 + }, + { + "epoch": 0.9319823256969461, + "grad_norm": 4.769184944849367, + "learning_rate": 6.808178333096692e-07, + "loss": 0.3551, + "step": 13130 + }, + { + "epoch": 0.9326921370645751, + "grad_norm": 2.8275717207772098, + "learning_rate": 6.737185858299021e-07, + "loss": 0.348, + "step": 13140 + }, + { + "epoch": 0.9334019484322041, + "grad_norm": 2.1023857426151595, + "learning_rate": 6.66619338350135e-07, + "loss": 0.3381, + "step": 13150 + }, + { + "epoch": 0.9341117597998332, + "grad_norm": 2.8745163990655125, + "learning_rate": 6.595200908703678e-07, + "loss": 0.3488, + "step": 13160 + }, + { + "epoch": 0.9348215711674622, + "grad_norm": 3.97821451395574, + "learning_rate": 6.524208433906007e-07, + "loss": 0.349, + "step": 13170 + }, + { + "epoch": 0.9355313825350913, + "grad_norm": 7.304369226663597, + "learning_rate": 6.453215959108335e-07, + "loss": 0.352, + "step": 13180 + }, + { + "epoch": 0.9362411939027203, + "grad_norm": 4.654909122469299, + "learning_rate": 6.382223484310663e-07, + "loss": 0.3478, + "step": 13190 + }, + { + "epoch": 0.9369510052703494, + "grad_norm": 3.4074758383445296, + "learning_rate": 6.311231009512992e-07, + "loss": 0.3265, + "step": 13200 + }, + { + "epoch": 0.9376608166379785, + "grad_norm": 2.8891732151802687, + "learning_rate": 6.24023853471532e-07, + "loss": 0.342, + "step": 13210 + }, + { + "epoch": 0.9383706280056076, + "grad_norm": 4.315712149288758, + "learning_rate": 6.169246059917649e-07, + "loss": 0.3542, + "step": 13220 + }, + { + "epoch": 0.9390804393732366, + "grad_norm": 4.202849073092827, + "learning_rate": 6.098253585119978e-07, + "loss": 0.3464, + "step": 13230 + }, + { + "epoch": 0.9397902507408656, + "grad_norm": 4.402135376104271, + "learning_rate": 6.027261110322307e-07, + "loss": 0.3493, + "step": 13240 + }, + { + "epoch": 0.9405000621084947, + "grad_norm": 3.3375797449619804, + "learning_rate": 5.956268635524635e-07, + "loss": 0.3431, + "step": 13250 + }, + { + "epoch": 0.9412098734761237, + "grad_norm": 2.58448811647569, + "learning_rate": 5.885276160726964e-07, + "loss": 0.3516, + "step": 13260 + }, + { + "epoch": 0.9419196848437528, + "grad_norm": 3.1207357827554216, + "learning_rate": 5.814283685929293e-07, + "loss": 0.3469, + "step": 13270 + }, + { + "epoch": 0.9426294962113818, + "grad_norm": 5.535335579042853, + "learning_rate": 5.74329121113162e-07, + "loss": 0.3411, + "step": 13280 + }, + { + "epoch": 0.9433393075790109, + "grad_norm": 4.157192002051246, + "learning_rate": 5.672298736333949e-07, + "loss": 0.3357, + "step": 13290 + }, + { + "epoch": 0.9440491189466399, + "grad_norm": 4.609541473632524, + "learning_rate": 5.601306261536277e-07, + "loss": 0.3297, + "step": 13300 + }, + { + "epoch": 0.9447589303142689, + "grad_norm": 4.556290013887312, + "learning_rate": 5.530313786738606e-07, + "loss": 0.3268, + "step": 13310 + }, + { + "epoch": 0.945468741681898, + "grad_norm": 4.334131807132338, + "learning_rate": 5.459321311940935e-07, + "loss": 0.3582, + "step": 13320 + }, + { + "epoch": 0.9461785530495271, + "grad_norm": 4.733377355574472, + "learning_rate": 5.388328837143264e-07, + "loss": 0.3366, + "step": 13330 + }, + { + "epoch": 0.9468883644171562, + "grad_norm": 6.762724277887754, + "learning_rate": 5.317336362345592e-07, + "loss": 0.345, + "step": 13340 + }, + { + "epoch": 0.9475981757847852, + "grad_norm": 2.9705397730746634, + "learning_rate": 5.246343887547921e-07, + "loss": 0.3465, + "step": 13350 + }, + { + "epoch": 0.9483079871524143, + "grad_norm": 3.195893348669726, + "learning_rate": 5.175351412750249e-07, + "loss": 0.3348, + "step": 13360 + }, + { + "epoch": 0.9490177985200433, + "grad_norm": 7.323985518462735, + "learning_rate": 5.104358937952577e-07, + "loss": 0.3543, + "step": 13370 + }, + { + "epoch": 0.9497276098876724, + "grad_norm": 2.799618403745627, + "learning_rate": 5.033366463154906e-07, + "loss": 0.3431, + "step": 13380 + }, + { + "epoch": 0.9504374212553014, + "grad_norm": 2.7728876598155843, + "learning_rate": 4.962373988357234e-07, + "loss": 0.3249, + "step": 13390 + }, + { + "epoch": 0.9511472326229304, + "grad_norm": 5.195465798306655, + "learning_rate": 4.891381513559563e-07, + "loss": 0.3413, + "step": 13400 + }, + { + "epoch": 0.9518570439905595, + "grad_norm": 10.319650407110732, + "learning_rate": 4.820389038761892e-07, + "loss": 0.3289, + "step": 13410 + }, + { + "epoch": 0.9525668553581885, + "grad_norm": 3.639550539774894, + "learning_rate": 4.74939656396422e-07, + "loss": 0.358, + "step": 13420 + }, + { + "epoch": 0.9532766667258176, + "grad_norm": 3.005922518183922, + "learning_rate": 4.6784040891665486e-07, + "loss": 0.3483, + "step": 13430 + }, + { + "epoch": 0.9539864780934467, + "grad_norm": 3.658172908229024, + "learning_rate": 4.607411614368877e-07, + "loss": 0.3503, + "step": 13440 + }, + { + "epoch": 0.9546962894610758, + "grad_norm": 3.17836271977541, + "learning_rate": 4.5364191395712053e-07, + "loss": 0.32, + "step": 13450 + }, + { + "epoch": 0.9554061008287048, + "grad_norm": 2.6050315565816513, + "learning_rate": 4.465426664773535e-07, + "loss": 0.336, + "step": 13460 + }, + { + "epoch": 0.9561159121963339, + "grad_norm": 2.516963929561299, + "learning_rate": 4.394434189975863e-07, + "loss": 0.3461, + "step": 13470 + }, + { + "epoch": 0.9568257235639629, + "grad_norm": 5.182889994348168, + "learning_rate": 4.3234417151781915e-07, + "loss": 0.3453, + "step": 13480 + }, + { + "epoch": 0.9575355349315919, + "grad_norm": 2.2527308195923843, + "learning_rate": 4.25244924038052e-07, + "loss": 0.3394, + "step": 13490 + }, + { + "epoch": 0.958245346299221, + "grad_norm": 5.702042483324615, + "learning_rate": 4.181456765582848e-07, + "loss": 0.3464, + "step": 13500 + }, + { + "epoch": 0.95895515766685, + "grad_norm": 4.320082944510015, + "learning_rate": 4.110464290785177e-07, + "loss": 0.361, + "step": 13510 + }, + { + "epoch": 0.9596649690344791, + "grad_norm": 2.7057123674561683, + "learning_rate": 4.0394718159875055e-07, + "loss": 0.3451, + "step": 13520 + }, + { + "epoch": 0.9603747804021081, + "grad_norm": 6.179223629975322, + "learning_rate": 3.968479341189834e-07, + "loss": 0.3371, + "step": 13530 + }, + { + "epoch": 0.9610845917697372, + "grad_norm": 2.5395758819730267, + "learning_rate": 3.897486866392163e-07, + "loss": 0.3587, + "step": 13540 + }, + { + "epoch": 0.9617944031373662, + "grad_norm": 3.6526335466786835, + "learning_rate": 3.8264943915944917e-07, + "loss": 0.3439, + "step": 13550 + }, + { + "epoch": 0.9625042145049953, + "grad_norm": 6.134974420857256, + "learning_rate": 3.75550191679682e-07, + "loss": 0.3413, + "step": 13560 + }, + { + "epoch": 0.9632140258726244, + "grad_norm": 4.231152248304582, + "learning_rate": 3.6845094419991484e-07, + "loss": 0.3412, + "step": 13570 + }, + { + "epoch": 0.9639238372402534, + "grad_norm": 19.9166049671889, + "learning_rate": 3.613516967201477e-07, + "loss": 0.3457, + "step": 13580 + }, + { + "epoch": 0.9646336486078825, + "grad_norm": 3.0744926751867565, + "learning_rate": 3.542524492403805e-07, + "loss": 0.3501, + "step": 13590 + }, + { + "epoch": 0.9653434599755115, + "grad_norm": 4.316210901775538, + "learning_rate": 3.471532017606134e-07, + "loss": 0.3391, + "step": 13600 + }, + { + "epoch": 0.9660532713431406, + "grad_norm": 5.568442813862272, + "learning_rate": 3.400539542808463e-07, + "loss": 0.3571, + "step": 13610 + }, + { + "epoch": 0.9667630827107696, + "grad_norm": 2.464997647373043, + "learning_rate": 3.3295470680107913e-07, + "loss": 0.3403, + "step": 13620 + }, + { + "epoch": 0.9674728940783986, + "grad_norm": 9.203447351864554, + "learning_rate": 3.2585545932131197e-07, + "loss": 0.3372, + "step": 13630 + }, + { + "epoch": 0.9681827054460277, + "grad_norm": 4.083574237624433, + "learning_rate": 3.187562118415448e-07, + "loss": 0.3523, + "step": 13640 + }, + { + "epoch": 0.9688925168136567, + "grad_norm": 2.580899686505033, + "learning_rate": 3.1165696436177764e-07, + "loss": 0.3331, + "step": 13650 + }, + { + "epoch": 0.9696023281812858, + "grad_norm": 4.461792584369479, + "learning_rate": 3.0455771688201053e-07, + "loss": 0.3436, + "step": 13660 + }, + { + "epoch": 0.9703121395489148, + "grad_norm": 6.002729090963929, + "learning_rate": 2.9745846940224337e-07, + "loss": 0.3392, + "step": 13670 + }, + { + "epoch": 0.971021950916544, + "grad_norm": 15.908649085501459, + "learning_rate": 2.9035922192247626e-07, + "loss": 0.3401, + "step": 13680 + }, + { + "epoch": 0.971731762284173, + "grad_norm": 3.2548319133826875, + "learning_rate": 2.832599744427091e-07, + "loss": 0.3466, + "step": 13690 + }, + { + "epoch": 0.9724415736518021, + "grad_norm": 2.810860141109629, + "learning_rate": 2.76160726962942e-07, + "loss": 0.3445, + "step": 13700 + }, + { + "epoch": 0.9731513850194311, + "grad_norm": 5.404897398221347, + "learning_rate": 2.690614794831748e-07, + "loss": 0.3464, + "step": 13710 + }, + { + "epoch": 0.9738611963870601, + "grad_norm": 3.07947902781157, + "learning_rate": 2.6196223200340766e-07, + "loss": 0.3295, + "step": 13720 + }, + { + "epoch": 0.9745710077546892, + "grad_norm": 3.2905796500928814, + "learning_rate": 2.548629845236405e-07, + "loss": 0.3491, + "step": 13730 + }, + { + "epoch": 0.9752808191223182, + "grad_norm": 4.431073995020802, + "learning_rate": 2.4776373704387334e-07, + "loss": 0.3483, + "step": 13740 + }, + { + "epoch": 0.9759906304899473, + "grad_norm": 3.5179707782287166, + "learning_rate": 2.406644895641062e-07, + "loss": 0.3469, + "step": 13750 + }, + { + "epoch": 0.9767004418575763, + "grad_norm": 4.221356923748856, + "learning_rate": 2.3356524208433906e-07, + "loss": 0.3343, + "step": 13760 + }, + { + "epoch": 0.9774102532252054, + "grad_norm": 286.15418214313974, + "learning_rate": 2.2646599460457195e-07, + "loss": 0.3349, + "step": 13770 + }, + { + "epoch": 0.9781200645928344, + "grad_norm": 3.4922335144175576, + "learning_rate": 2.193667471248048e-07, + "loss": 0.3485, + "step": 13780 + }, + { + "epoch": 0.9788298759604636, + "grad_norm": 3.944308898398798, + "learning_rate": 2.1226749964503763e-07, + "loss": 0.3288, + "step": 13790 + }, + { + "epoch": 0.9795396873280926, + "grad_norm": 3.16447581060814, + "learning_rate": 2.0516825216527052e-07, + "loss": 0.3435, + "step": 13800 + }, + { + "epoch": 0.9802494986957216, + "grad_norm": 7.105988741131366, + "learning_rate": 1.9806900468550335e-07, + "loss": 0.342, + "step": 13810 + }, + { + "epoch": 0.9809593100633507, + "grad_norm": 3.311616450653751, + "learning_rate": 1.9096975720573622e-07, + "loss": 0.365, + "step": 13820 + }, + { + "epoch": 0.9816691214309797, + "grad_norm": 3.1283492138129128, + "learning_rate": 1.8387050972596905e-07, + "loss": 0.3497, + "step": 13830 + }, + { + "epoch": 0.9823789327986088, + "grad_norm": 4.720800332800002, + "learning_rate": 1.7677126224620194e-07, + "loss": 0.3356, + "step": 13840 + }, + { + "epoch": 0.9830887441662378, + "grad_norm": 5.755549723756511, + "learning_rate": 1.6967201476643478e-07, + "loss": 0.3534, + "step": 13850 + }, + { + "epoch": 0.9837985555338669, + "grad_norm": 12.413957162417217, + "learning_rate": 1.6257276728666762e-07, + "loss": 0.3514, + "step": 13860 + }, + { + "epoch": 0.9845083669014959, + "grad_norm": 3.7416649036415195, + "learning_rate": 1.5547351980690048e-07, + "loss": 0.3468, + "step": 13870 + }, + { + "epoch": 0.985218178269125, + "grad_norm": 5.096087166471907, + "learning_rate": 1.4837427232713335e-07, + "loss": 0.3478, + "step": 13880 + }, + { + "epoch": 0.985927989636754, + "grad_norm": 2.8643069595501847, + "learning_rate": 1.4127502484736618e-07, + "loss": 0.3307, + "step": 13890 + }, + { + "epoch": 0.986637801004383, + "grad_norm": 4.161106542911394, + "learning_rate": 1.3417577736759905e-07, + "loss": 0.3451, + "step": 13900 + }, + { + "epoch": 0.9873476123720122, + "grad_norm": 3.161705990477656, + "learning_rate": 1.270765298878319e-07, + "loss": 0.3389, + "step": 13910 + }, + { + "epoch": 0.9880574237396412, + "grad_norm": 3.2196566259908637, + "learning_rate": 1.1997728240806475e-07, + "loss": 0.3508, + "step": 13920 + }, + { + "epoch": 0.9887672351072703, + "grad_norm": 3.0061617959710403, + "learning_rate": 1.1287803492829761e-07, + "loss": 0.357, + "step": 13930 + }, + { + "epoch": 0.9894770464748993, + "grad_norm": 7.195163761877952, + "learning_rate": 1.0577878744853047e-07, + "loss": 0.3344, + "step": 13940 + }, + { + "epoch": 0.9901868578425284, + "grad_norm": 4.778295681909435, + "learning_rate": 9.867953996876332e-08, + "loss": 0.3404, + "step": 13950 + }, + { + "epoch": 0.9908966692101574, + "grad_norm": 3.6751893575330072, + "learning_rate": 9.158029248899617e-08, + "loss": 0.3222, + "step": 13960 + }, + { + "epoch": 0.9916064805777864, + "grad_norm": 6.066838850034421, + "learning_rate": 8.448104500922902e-08, + "loss": 0.3373, + "step": 13970 + }, + { + "epoch": 0.9923162919454155, + "grad_norm": 5.8640066255244525, + "learning_rate": 7.738179752946189e-08, + "loss": 0.35, + "step": 13980 + }, + { + "epoch": 0.9930261033130445, + "grad_norm": 4.063550481932921, + "learning_rate": 7.028255004969474e-08, + "loss": 0.3424, + "step": 13990 + }, + { + "epoch": 0.9937359146806736, + "grad_norm": 6.923421784576789, + "learning_rate": 6.31833025699276e-08, + "loss": 0.3584, + "step": 14000 + }, + { + "epoch": 0.9944457260483026, + "grad_norm": 4.621602275306591, + "learning_rate": 5.6084055090160446e-08, + "loss": 0.3381, + "step": 14010 + }, + { + "epoch": 0.9951555374159318, + "grad_norm": 5.495946912076004, + "learning_rate": 4.89848076103933e-08, + "loss": 0.3557, + "step": 14020 + }, + { + "epoch": 0.9958653487835608, + "grad_norm": 2.261874767912811, + "learning_rate": 4.188556013062616e-08, + "loss": 0.3346, + "step": 14030 + }, + { + "epoch": 0.9965751601511899, + "grad_norm": 3.528699506394003, + "learning_rate": 3.478631265085901e-08, + "loss": 0.3284, + "step": 14040 + }, + { + "epoch": 0.9972849715188189, + "grad_norm": 3.0483860239618314, + "learning_rate": 2.7687065171091867e-08, + "loss": 0.3341, + "step": 14050 + }, + { + "epoch": 0.9979947828864479, + "grad_norm": 4.681194219809911, + "learning_rate": 2.0587817691324724e-08, + "loss": 0.333, + "step": 14060 + }, + { + "epoch": 0.998704594254077, + "grad_norm": 5.802114485594721, + "learning_rate": 1.3488570211557575e-08, + "loss": 0.3457, + "step": 14070 + }, + { + "epoch": 0.999414405621706, + "grad_norm": 2.8616716300198775, + "learning_rate": 6.389322731790431e-09, + "loss": 0.3398, + "step": 14080 + } + ], + "logging_steps": 10, + "max_steps": 14088, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9975763395674112.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}