{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0520784509785541, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002603922548927705, "grad_norm": 0.8521247506141663, "learning_rate": 5.194805194805195e-06, "loss": 0.7412, "step": 10 }, { "epoch": 0.000520784509785541, "grad_norm": 0.6229312419891357, "learning_rate": 1.038961038961039e-05, "loss": 0.7138, "step": 20 }, { "epoch": 0.0007811767646783114, "grad_norm": 0.4566498100757599, "learning_rate": 1.5584415584415583e-05, "loss": 0.7079, "step": 30 }, { "epoch": 0.001041569019571082, "grad_norm": 0.4316692650318146, "learning_rate": 2.077922077922078e-05, "loss": 0.6988, "step": 40 }, { "epoch": 0.0013019612744638524, "grad_norm": 0.615436315536499, "learning_rate": 2.5974025974025972e-05, "loss": 0.6937, "step": 50 }, { "epoch": 0.0015623535293566228, "grad_norm": 0.48698583245277405, "learning_rate": 3.1168831168831166e-05, "loss": 0.7043, "step": 60 }, { "epoch": 0.0018227457842493933, "grad_norm": 0.3984021544456482, "learning_rate": 3.6363636363636364e-05, "loss": 0.6563, "step": 70 }, { "epoch": 0.002083138039142164, "grad_norm": 0.37576180696487427, "learning_rate": 4.155844155844156e-05, "loss": 0.6462, "step": 80 }, { "epoch": 0.0023435302940349343, "grad_norm": 0.35269680619239807, "learning_rate": 4.675324675324675e-05, "loss": 0.6656, "step": 90 }, { "epoch": 0.0026039225489277048, "grad_norm": 0.31541451811790466, "learning_rate": 5.1948051948051944e-05, "loss": 0.6547, "step": 100 }, { "epoch": 0.002864314803820475, "grad_norm": 0.3462330400943756, "learning_rate": 5.714285714285714e-05, "loss": 0.6621, "step": 110 }, { "epoch": 0.0031247070587132456, "grad_norm": 0.3465985953807831, "learning_rate": 6.233766233766233e-05, "loss": 0.6273, "step": 120 }, { "epoch": 0.003385099313606016, "grad_norm": 0.3297797441482544, "learning_rate": 6.753246753246754e-05, "loss": 0.6559, "step": 130 }, { "epoch": 0.0036454915684987865, "grad_norm": 0.3888818621635437, "learning_rate": 7.272727272727273e-05, "loss": 0.6756, "step": 140 }, { "epoch": 0.003905883823391557, "grad_norm": 0.3542368710041046, "learning_rate": 7.792207792207793e-05, "loss": 0.6506, "step": 150 }, { "epoch": 0.004166276078284328, "grad_norm": 0.37369370460510254, "learning_rate": 8.311688311688312e-05, "loss": 0.6645, "step": 160 }, { "epoch": 0.004426668333177098, "grad_norm": 0.3700549900531769, "learning_rate": 8.831168831168831e-05, "loss": 0.6727, "step": 170 }, { "epoch": 0.004687060588069869, "grad_norm": 0.32032889127731323, "learning_rate": 9.35064935064935e-05, "loss": 0.6529, "step": 180 }, { "epoch": 0.004947452842962639, "grad_norm": 0.3331650495529175, "learning_rate": 9.870129870129871e-05, "loss": 0.6627, "step": 190 }, { "epoch": 0.0052078450978554095, "grad_norm": 0.3300645351409912, "learning_rate": 0.00010389610389610389, "loss": 0.676, "step": 200 }, { "epoch": 0.0054682373527481795, "grad_norm": 0.350356787443161, "learning_rate": 0.00010909090909090909, "loss": 0.6564, "step": 210 }, { "epoch": 0.00572862960764095, "grad_norm": 0.382756769657135, "learning_rate": 0.00011428571428571428, "loss": 0.6243, "step": 220 }, { "epoch": 0.00598902186253372, "grad_norm": 0.34450188279151917, "learning_rate": 0.00011948051948051949, "loss": 0.6611, "step": 230 }, { "epoch": 0.006249414117426491, "grad_norm": 0.3705821633338928, "learning_rate": 0.00012467532467532467, "loss": 0.6384, "step": 240 }, { "epoch": 0.006509806372319262, "grad_norm": 0.36822304129600525, "learning_rate": 0.00012987012987012987, "loss": 0.6415, "step": 250 }, { "epoch": 0.006770198627212032, "grad_norm": 0.32358303666114807, "learning_rate": 0.00013506493506493507, "loss": 0.6584, "step": 260 }, { "epoch": 0.007030590882104803, "grad_norm": 0.33386844396591187, "learning_rate": 0.00014025974025974028, "loss": 0.6702, "step": 270 }, { "epoch": 0.007290983136997573, "grad_norm": 0.32447949051856995, "learning_rate": 0.00014545454545454546, "loss": 0.6519, "step": 280 }, { "epoch": 0.007551375391890344, "grad_norm": 0.3388073146343231, "learning_rate": 0.00015064935064935066, "loss": 0.6735, "step": 290 }, { "epoch": 0.007811767646783114, "grad_norm": 0.39655518531799316, "learning_rate": 0.00015584415584415587, "loss": 0.672, "step": 300 }, { "epoch": 0.008072159901675884, "grad_norm": 0.41258928179740906, "learning_rate": 0.00016103896103896104, "loss": 0.6626, "step": 310 }, { "epoch": 0.008332552156568656, "grad_norm": 0.3963010311126709, "learning_rate": 0.00016623376623376625, "loss": 0.6653, "step": 320 }, { "epoch": 0.008592944411461426, "grad_norm": 0.3641106188297272, "learning_rate": 0.00017142857142857143, "loss": 0.6389, "step": 330 }, { "epoch": 0.008853336666354196, "grad_norm": 0.38745763897895813, "learning_rate": 0.00017662337662337663, "loss": 0.6928, "step": 340 }, { "epoch": 0.009113728921246966, "grad_norm": 0.4573372006416321, "learning_rate": 0.00018181818181818183, "loss": 0.6679, "step": 350 }, { "epoch": 0.009374121176139737, "grad_norm": 0.45714282989501953, "learning_rate": 0.000187012987012987, "loss": 0.6453, "step": 360 }, { "epoch": 0.009634513431032507, "grad_norm": 0.37631818652153015, "learning_rate": 0.00019220779220779222, "loss": 0.6467, "step": 370 }, { "epoch": 0.009894905685925277, "grad_norm": 0.3658345639705658, "learning_rate": 0.00019740259740259742, "loss": 0.6631, "step": 380 }, { "epoch": 0.010155297940818049, "grad_norm": 0.3953540623188019, "learning_rate": 0.00019999996515752773, "loss": 0.6573, "step": 390 }, { "epoch": 0.010415690195710819, "grad_norm": 0.377763569355011, "learning_rate": 0.00019999968641789507, "loss": 0.6664, "step": 400 }, { "epoch": 0.010676082450603589, "grad_norm": 0.37128835916519165, "learning_rate": 0.0001999991289394067, "loss": 0.6342, "step": 410 }, { "epoch": 0.010936474705496359, "grad_norm": 0.33881694078445435, "learning_rate": 0.00019999829272361654, "loss": 0.6476, "step": 420 }, { "epoch": 0.01119686696038913, "grad_norm": 0.39774075150489807, "learning_rate": 0.00019999717777285545, "loss": 0.633, "step": 430 }, { "epoch": 0.0114572592152819, "grad_norm": 0.41350051760673523, "learning_rate": 0.00019999578409023126, "loss": 0.6541, "step": 440 }, { "epoch": 0.01171765147017467, "grad_norm": 0.47954171895980835, "learning_rate": 0.00019999411167962868, "loss": 0.6545, "step": 450 }, { "epoch": 0.01197804372506744, "grad_norm": 0.46860000491142273, "learning_rate": 0.00019999216054570942, "loss": 0.6512, "step": 460 }, { "epoch": 0.012238435979960213, "grad_norm": 0.4395809471607208, "learning_rate": 0.00019998993069391205, "loss": 0.6587, "step": 470 }, { "epoch": 0.012498828234852983, "grad_norm": 0.43222516775131226, "learning_rate": 0.00019998742213045206, "loss": 0.6292, "step": 480 }, { "epoch": 0.012759220489745753, "grad_norm": 0.39363613724708557, "learning_rate": 0.00019998463486232179, "loss": 0.6319, "step": 490 }, { "epoch": 0.013019612744638524, "grad_norm": 0.4984697699546814, "learning_rate": 0.0001999815688972905, "loss": 0.6488, "step": 500 }, { "epoch": 0.013280004999531294, "grad_norm": 0.4710462689399719, "learning_rate": 0.00019997822424390422, "loss": 0.6633, "step": 510 }, { "epoch": 0.013540397254424064, "grad_norm": 0.4141169786453247, "learning_rate": 0.00019997460091148586, "loss": 0.6471, "step": 520 }, { "epoch": 0.013800789509316834, "grad_norm": 0.39957430958747864, "learning_rate": 0.00019997069891013503, "loss": 0.6226, "step": 530 }, { "epoch": 0.014061181764209606, "grad_norm": 0.4508794844150543, "learning_rate": 0.00019996651825072826, "loss": 0.6559, "step": 540 }, { "epoch": 0.014321574019102376, "grad_norm": 0.4256739020347595, "learning_rate": 0.00019996205894491856, "loss": 0.6551, "step": 550 }, { "epoch": 0.014581966273995146, "grad_norm": 0.43204987049102783, "learning_rate": 0.00019995732100513592, "loss": 0.6254, "step": 560 }, { "epoch": 0.014842358528887916, "grad_norm": 0.37589946389198303, "learning_rate": 0.00019995230444458682, "loss": 0.6543, "step": 570 }, { "epoch": 0.015102750783780688, "grad_norm": 0.40850168466567993, "learning_rate": 0.0001999470092772544, "loss": 0.6474, "step": 580 }, { "epoch": 0.015363143038673458, "grad_norm": 0.3754895031452179, "learning_rate": 0.00019994143551789839, "loss": 0.6502, "step": 590 }, { "epoch": 0.015623535293566228, "grad_norm": 0.3857438266277313, "learning_rate": 0.00019993558318205507, "loss": 0.6544, "step": 600 }, { "epoch": 0.015883927548459, "grad_norm": 0.4063841998577118, "learning_rate": 0.00019992945228603724, "loss": 0.639, "step": 610 }, { "epoch": 0.016144319803351768, "grad_norm": 0.35183581709861755, "learning_rate": 0.0001999230428469341, "loss": 0.6442, "step": 620 }, { "epoch": 0.01640471205824454, "grad_norm": 0.4158167243003845, "learning_rate": 0.00019991635488261138, "loss": 0.6586, "step": 630 }, { "epoch": 0.01666510431313731, "grad_norm": 0.45118188858032227, "learning_rate": 0.00019990938841171104, "loss": 0.6581, "step": 640 }, { "epoch": 0.01692549656803008, "grad_norm": 0.39950400590896606, "learning_rate": 0.0001999021434536514, "loss": 0.6712, "step": 650 }, { "epoch": 0.01718588882292285, "grad_norm": 0.35208678245544434, "learning_rate": 0.00019989462002862704, "loss": 0.6398, "step": 660 }, { "epoch": 0.017446281077815623, "grad_norm": 0.38008975982666016, "learning_rate": 0.0001998868181576088, "loss": 0.6479, "step": 670 }, { "epoch": 0.01770667333270839, "grad_norm": 0.4314909875392914, "learning_rate": 0.00019987873786234348, "loss": 0.6358, "step": 680 }, { "epoch": 0.017967065587601163, "grad_norm": 0.3982577323913574, "learning_rate": 0.00019987037916535417, "loss": 0.6361, "step": 690 }, { "epoch": 0.01822745784249393, "grad_norm": 0.3529202342033386, "learning_rate": 0.0001998617420899398, "loss": 0.64, "step": 700 }, { "epoch": 0.018487850097386703, "grad_norm": 0.41149991750717163, "learning_rate": 0.0001998528266601754, "loss": 0.6684, "step": 710 }, { "epoch": 0.018748242352279475, "grad_norm": 0.42630311846733093, "learning_rate": 0.0001998436329009118, "loss": 0.6429, "step": 720 }, { "epoch": 0.019008634607172243, "grad_norm": 0.4028918147087097, "learning_rate": 0.00019983416083777563, "loss": 0.6573, "step": 730 }, { "epoch": 0.019269026862065015, "grad_norm": 0.3785901963710785, "learning_rate": 0.0001998244104971693, "loss": 0.6132, "step": 740 }, { "epoch": 0.019529419116957786, "grad_norm": 0.39018985629081726, "learning_rate": 0.0001998143819062709, "loss": 0.6287, "step": 750 }, { "epoch": 0.019789811371850555, "grad_norm": 0.4268128573894501, "learning_rate": 0.00019980407509303413, "loss": 0.6585, "step": 760 }, { "epoch": 0.020050203626743326, "grad_norm": 0.4293033480644226, "learning_rate": 0.00019979349008618808, "loss": 0.6843, "step": 770 }, { "epoch": 0.020310595881636098, "grad_norm": 0.38943207263946533, "learning_rate": 0.00019978262691523743, "loss": 0.6265, "step": 780 }, { "epoch": 0.020570988136528866, "grad_norm": 0.40528395771980286, "learning_rate": 0.00019977148561046217, "loss": 0.6392, "step": 790 }, { "epoch": 0.020831380391421638, "grad_norm": 0.4273380935192108, "learning_rate": 0.0001997600662029175, "loss": 0.6615, "step": 800 }, { "epoch": 0.021091772646314406, "grad_norm": 0.4269028306007385, "learning_rate": 0.00019974836872443388, "loss": 0.6412, "step": 810 }, { "epoch": 0.021352164901207178, "grad_norm": 0.3542031943798065, "learning_rate": 0.0001997363932076168, "loss": 0.6606, "step": 820 }, { "epoch": 0.02161255715609995, "grad_norm": 0.36826202273368835, "learning_rate": 0.00019972413968584682, "loss": 0.6387, "step": 830 }, { "epoch": 0.021872949410992718, "grad_norm": 0.4278506338596344, "learning_rate": 0.0001997116081932793, "loss": 0.6544, "step": 840 }, { "epoch": 0.02213334166588549, "grad_norm": 0.467886358499527, "learning_rate": 0.0001996987987648446, "loss": 0.6524, "step": 850 }, { "epoch": 0.02239373392077826, "grad_norm": 0.36823606491088867, "learning_rate": 0.0001996857114362476, "loss": 0.6553, "step": 860 }, { "epoch": 0.02265412617567103, "grad_norm": 0.42569059133529663, "learning_rate": 0.00019967234624396793, "loss": 0.6484, "step": 870 }, { "epoch": 0.0229145184305638, "grad_norm": 0.36995476484298706, "learning_rate": 0.00019965870322525965, "loss": 0.6626, "step": 880 }, { "epoch": 0.023174910685456573, "grad_norm": 0.4284444749355316, "learning_rate": 0.0001996447824181513, "loss": 0.6579, "step": 890 }, { "epoch": 0.02343530294034934, "grad_norm": 0.36263275146484375, "learning_rate": 0.0001996305838614457, "loss": 0.6466, "step": 900 }, { "epoch": 0.023695695195242113, "grad_norm": 0.43936702609062195, "learning_rate": 0.00019961610759471984, "loss": 0.6534, "step": 910 }, { "epoch": 0.02395608745013488, "grad_norm": 0.37757524847984314, "learning_rate": 0.00019960135365832486, "loss": 0.6344, "step": 920 }, { "epoch": 0.024216479705027653, "grad_norm": 0.40086570382118225, "learning_rate": 0.00019958632209338587, "loss": 0.6265, "step": 930 }, { "epoch": 0.024476871959920425, "grad_norm": 0.3435315489768982, "learning_rate": 0.00019957101294180174, "loss": 0.6479, "step": 940 }, { "epoch": 0.024737264214813193, "grad_norm": 0.34466204047203064, "learning_rate": 0.00019955542624624522, "loss": 0.641, "step": 950 }, { "epoch": 0.024997656469705965, "grad_norm": 0.46282994747161865, "learning_rate": 0.00019953956205016256, "loss": 0.6389, "step": 960 }, { "epoch": 0.025258048724598737, "grad_norm": 0.3815780580043793, "learning_rate": 0.00019952342039777362, "loss": 0.6472, "step": 970 }, { "epoch": 0.025518440979491505, "grad_norm": 0.43121904134750366, "learning_rate": 0.00019950700133407163, "loss": 0.6314, "step": 980 }, { "epoch": 0.025778833234384277, "grad_norm": 0.41635170578956604, "learning_rate": 0.00019949030490482296, "loss": 0.6483, "step": 990 }, { "epoch": 0.02603922548927705, "grad_norm": 0.3946804106235504, "learning_rate": 0.0001994733311565673, "loss": 0.6383, "step": 1000 }, { "epoch": 0.026299617744169817, "grad_norm": 0.48494285345077515, "learning_rate": 0.0001994560801366171, "loss": 0.6617, "step": 1010 }, { "epoch": 0.02656000999906259, "grad_norm": 0.4007907807826996, "learning_rate": 0.00019943855189305792, "loss": 0.6187, "step": 1020 }, { "epoch": 0.026820402253955357, "grad_norm": 0.4674074649810791, "learning_rate": 0.00019942074647474786, "loss": 0.6629, "step": 1030 }, { "epoch": 0.02708079450884813, "grad_norm": 0.3703964650630951, "learning_rate": 0.00019940266393131775, "loss": 0.6606, "step": 1040 }, { "epoch": 0.0273411867637409, "grad_norm": 0.4177350401878357, "learning_rate": 0.00019938430431317081, "loss": 0.6285, "step": 1050 }, { "epoch": 0.02760157901863367, "grad_norm": 0.391641765832901, "learning_rate": 0.00019936566767148257, "loss": 0.6448, "step": 1060 }, { "epoch": 0.02786197127352644, "grad_norm": 0.38827773928642273, "learning_rate": 0.00019934675405820077, "loss": 0.6272, "step": 1070 }, { "epoch": 0.028122363528419212, "grad_norm": 0.41332709789276123, "learning_rate": 0.00019932756352604515, "loss": 0.6316, "step": 1080 }, { "epoch": 0.02838275578331198, "grad_norm": 0.38579726219177246, "learning_rate": 0.00019930809612850735, "loss": 0.6357, "step": 1090 }, { "epoch": 0.028643148038204752, "grad_norm": 0.4541114568710327, "learning_rate": 0.00019928835191985076, "loss": 0.6546, "step": 1100 }, { "epoch": 0.028903540293097524, "grad_norm": 0.37385833263397217, "learning_rate": 0.0001992683309551103, "loss": 0.6378, "step": 1110 }, { "epoch": 0.029163932547990292, "grad_norm": 0.39442044496536255, "learning_rate": 0.00019924803329009243, "loss": 0.6549, "step": 1120 }, { "epoch": 0.029424324802883064, "grad_norm": 0.3960839509963989, "learning_rate": 0.00019922745898137473, "loss": 0.6304, "step": 1130 }, { "epoch": 0.029684717057775832, "grad_norm": 0.4159034192562103, "learning_rate": 0.00019920660808630598, "loss": 0.6503, "step": 1140 }, { "epoch": 0.029945109312668604, "grad_norm": 0.4242476522922516, "learning_rate": 0.00019918548066300592, "loss": 0.6305, "step": 1150 }, { "epoch": 0.030205501567561376, "grad_norm": 0.4142429530620575, "learning_rate": 0.0001991640767703651, "loss": 0.6246, "step": 1160 }, { "epoch": 0.030465893822454144, "grad_norm": 0.4049033522605896, "learning_rate": 0.00019914239646804462, "loss": 0.6315, "step": 1170 }, { "epoch": 0.030726286077346916, "grad_norm": 0.4325621426105499, "learning_rate": 0.00019912043981647616, "loss": 0.6467, "step": 1180 }, { "epoch": 0.030986678332239687, "grad_norm": 0.35380443930625916, "learning_rate": 0.00019909820687686157, "loss": 0.6416, "step": 1190 }, { "epoch": 0.031247070587132456, "grad_norm": 0.3798046410083771, "learning_rate": 0.0001990756977111729, "loss": 0.6367, "step": 1200 }, { "epoch": 0.03150746284202523, "grad_norm": 0.4257236123085022, "learning_rate": 0.0001990529123821522, "loss": 0.6414, "step": 1210 }, { "epoch": 0.031767855096918, "grad_norm": 0.4575822055339813, "learning_rate": 0.00019902985095331113, "loss": 0.6647, "step": 1220 }, { "epoch": 0.03202824735181077, "grad_norm": 0.34732112288475037, "learning_rate": 0.00019900651348893114, "loss": 0.6446, "step": 1230 }, { "epoch": 0.032288639606703536, "grad_norm": 0.4493260979652405, "learning_rate": 0.00019898290005406296, "loss": 0.6672, "step": 1240 }, { "epoch": 0.03254903186159631, "grad_norm": 0.39185160398483276, "learning_rate": 0.00019895901071452667, "loss": 0.6581, "step": 1250 }, { "epoch": 0.03280942411648908, "grad_norm": 0.37691834568977356, "learning_rate": 0.0001989348455369113, "loss": 0.644, "step": 1260 }, { "epoch": 0.03306981637138185, "grad_norm": 0.378093421459198, "learning_rate": 0.0001989104045885748, "loss": 0.6515, "step": 1270 }, { "epoch": 0.03333020862627462, "grad_norm": 0.37683796882629395, "learning_rate": 0.00019888568793764385, "loss": 0.6281, "step": 1280 }, { "epoch": 0.03359060088116739, "grad_norm": 0.37529483437538147, "learning_rate": 0.00019886069565301355, "loss": 0.6606, "step": 1290 }, { "epoch": 0.03385099313606016, "grad_norm": 0.3849285840988159, "learning_rate": 0.00019883542780434733, "loss": 0.6388, "step": 1300 }, { "epoch": 0.03411138539095293, "grad_norm": 0.3860384523868561, "learning_rate": 0.0001988098844620767, "loss": 0.6612, "step": 1310 }, { "epoch": 0.0343717776458457, "grad_norm": 0.4840448200702667, "learning_rate": 0.0001987840656974011, "loss": 0.6432, "step": 1320 }, { "epoch": 0.034632169900738474, "grad_norm": 0.3508262038230896, "learning_rate": 0.00019875797158228775, "loss": 0.6549, "step": 1330 }, { "epoch": 0.034892562155631246, "grad_norm": 0.4253254234790802, "learning_rate": 0.00019873160218947125, "loss": 0.6303, "step": 1340 }, { "epoch": 0.03515295441052401, "grad_norm": 0.37659895420074463, "learning_rate": 0.00019870495759245362, "loss": 0.6278, "step": 1350 }, { "epoch": 0.03541334666541678, "grad_norm": 0.36914440989494324, "learning_rate": 0.0001986780378655039, "loss": 0.6614, "step": 1360 }, { "epoch": 0.035673738920309554, "grad_norm": 0.40397894382476807, "learning_rate": 0.0001986508430836581, "loss": 0.6295, "step": 1370 }, { "epoch": 0.035934131175202326, "grad_norm": 0.3998821973800659, "learning_rate": 0.0001986233733227188, "loss": 0.6705, "step": 1380 }, { "epoch": 0.0361945234300951, "grad_norm": 0.37330886721611023, "learning_rate": 0.00019859562865925525, "loss": 0.6537, "step": 1390 }, { "epoch": 0.03645491568498786, "grad_norm": 0.3862515091896057, "learning_rate": 0.00019856760917060277, "loss": 0.6576, "step": 1400 }, { "epoch": 0.036715307939880634, "grad_norm": 0.39040204882621765, "learning_rate": 0.00019853931493486287, "loss": 0.6697, "step": 1410 }, { "epoch": 0.036975700194773406, "grad_norm": 0.3295992910861969, "learning_rate": 0.00019851074603090277, "loss": 0.6175, "step": 1420 }, { "epoch": 0.03723609244966618, "grad_norm": 0.33969369530677795, "learning_rate": 0.00019848190253835536, "loss": 0.6453, "step": 1430 }, { "epoch": 0.03749648470455895, "grad_norm": 0.456320196390152, "learning_rate": 0.00019845278453761896, "loss": 0.6392, "step": 1440 }, { "epoch": 0.03775687695945172, "grad_norm": 0.3699491024017334, "learning_rate": 0.00019842339210985696, "loss": 0.636, "step": 1450 }, { "epoch": 0.038017269214344486, "grad_norm": 0.41601112484931946, "learning_rate": 0.00019839372533699774, "loss": 0.6566, "step": 1460 }, { "epoch": 0.03827766146923726, "grad_norm": 0.39745938777923584, "learning_rate": 0.00019836378430173438, "loss": 0.6421, "step": 1470 }, { "epoch": 0.03853805372413003, "grad_norm": 0.38357457518577576, "learning_rate": 0.0001983335690875245, "loss": 0.6355, "step": 1480 }, { "epoch": 0.0387984459790228, "grad_norm": 0.3879673182964325, "learning_rate": 0.00019830307977858984, "loss": 0.6295, "step": 1490 }, { "epoch": 0.03905883823391557, "grad_norm": 0.42652568221092224, "learning_rate": 0.00019827231645991623, "loss": 0.6374, "step": 1500 }, { "epoch": 0.03931923048880834, "grad_norm": 0.3830074369907379, "learning_rate": 0.00019824127921725326, "loss": 0.6292, "step": 1510 }, { "epoch": 0.03957962274370111, "grad_norm": 0.39314061403274536, "learning_rate": 0.00019820996813711407, "loss": 0.6416, "step": 1520 }, { "epoch": 0.03984001499859388, "grad_norm": 0.3321419060230255, "learning_rate": 0.0001981783833067751, "loss": 0.6206, "step": 1530 }, { "epoch": 0.04010040725348665, "grad_norm": 0.41209813952445984, "learning_rate": 0.0001981465248142758, "loss": 0.6576, "step": 1540 }, { "epoch": 0.040360799508379425, "grad_norm": 0.4043482542037964, "learning_rate": 0.00019811439274841842, "loss": 0.6588, "step": 1550 }, { "epoch": 0.040621191763272196, "grad_norm": 0.4470541179180145, "learning_rate": 0.00019808198719876782, "loss": 0.6595, "step": 1560 }, { "epoch": 0.04088158401816496, "grad_norm": 0.3442763090133667, "learning_rate": 0.00019804930825565112, "loss": 0.6584, "step": 1570 }, { "epoch": 0.04114197627305773, "grad_norm": 0.4013935923576355, "learning_rate": 0.00019801635601015752, "loss": 0.6315, "step": 1580 }, { "epoch": 0.041402368527950505, "grad_norm": 0.36532357335090637, "learning_rate": 0.00019798313055413808, "loss": 0.6453, "step": 1590 }, { "epoch": 0.041662760782843276, "grad_norm": 0.4390687644481659, "learning_rate": 0.00019794963198020525, "loss": 0.6375, "step": 1600 }, { "epoch": 0.04192315303773605, "grad_norm": 0.3687056601047516, "learning_rate": 0.00019791586038173296, "loss": 0.637, "step": 1610 }, { "epoch": 0.04218354529262881, "grad_norm": 0.372841477394104, "learning_rate": 0.00019788181585285602, "loss": 0.6322, "step": 1620 }, { "epoch": 0.042443937547521585, "grad_norm": 0.3459762632846832, "learning_rate": 0.00019784749848847003, "loss": 0.62, "step": 1630 }, { "epoch": 0.042704329802414356, "grad_norm": 0.4031515121459961, "learning_rate": 0.0001978129083842312, "loss": 0.6438, "step": 1640 }, { "epoch": 0.04296472205730713, "grad_norm": 0.39984458684921265, "learning_rate": 0.00019777804563655583, "loss": 0.6224, "step": 1650 }, { "epoch": 0.0432251143121999, "grad_norm": 0.37194013595581055, "learning_rate": 0.00019774291034262026, "loss": 0.6258, "step": 1660 }, { "epoch": 0.04348550656709267, "grad_norm": 0.3989511728286743, "learning_rate": 0.00019770750260036054, "loss": 0.6385, "step": 1670 }, { "epoch": 0.043745898821985436, "grad_norm": 0.3801423907279968, "learning_rate": 0.00019767182250847207, "loss": 0.6234, "step": 1680 }, { "epoch": 0.04400629107687821, "grad_norm": 0.3838658034801483, "learning_rate": 0.00019763587016640948, "loss": 0.656, "step": 1690 }, { "epoch": 0.04426668333177098, "grad_norm": 0.5071051716804504, "learning_rate": 0.00019759964567438623, "loss": 0.6385, "step": 1700 }, { "epoch": 0.04452707558666375, "grad_norm": 0.3741011321544647, "learning_rate": 0.00019756314913337432, "loss": 0.6452, "step": 1710 }, { "epoch": 0.04478746784155652, "grad_norm": 0.41739609837532043, "learning_rate": 0.00019752638064510415, "loss": 0.627, "step": 1720 }, { "epoch": 0.04504786009644929, "grad_norm": 0.38942453265190125, "learning_rate": 0.00019748934031206414, "loss": 0.6486, "step": 1730 }, { "epoch": 0.04530825235134206, "grad_norm": 0.40764692425727844, "learning_rate": 0.00019745202823750034, "loss": 0.6311, "step": 1740 }, { "epoch": 0.04556864460623483, "grad_norm": 0.4089398682117462, "learning_rate": 0.0001974144445254164, "loss": 0.6262, "step": 1750 }, { "epoch": 0.0458290368611276, "grad_norm": 0.4223162531852722, "learning_rate": 0.00019737658928057302, "loss": 0.6633, "step": 1760 }, { "epoch": 0.046089429116020375, "grad_norm": 0.4696766436100006, "learning_rate": 0.00019733846260848776, "loss": 0.6448, "step": 1770 }, { "epoch": 0.04634982137091315, "grad_norm": 0.34561800956726074, "learning_rate": 0.0001973000646154349, "loss": 0.6629, "step": 1780 }, { "epoch": 0.04661021362580591, "grad_norm": 0.3809750974178314, "learning_rate": 0.00019726139540844484, "loss": 0.6261, "step": 1790 }, { "epoch": 0.04687060588069868, "grad_norm": 0.37188807129859924, "learning_rate": 0.00019722245509530401, "loss": 0.6392, "step": 1800 }, { "epoch": 0.047130998135591455, "grad_norm": 0.36847737431526184, "learning_rate": 0.00019718324378455458, "loss": 0.6238, "step": 1810 }, { "epoch": 0.04739139039048423, "grad_norm": 0.34314194321632385, "learning_rate": 0.00019714376158549404, "loss": 0.6512, "step": 1820 }, { "epoch": 0.047651782645377, "grad_norm": 0.3639289140701294, "learning_rate": 0.00019710400860817494, "loss": 0.6481, "step": 1830 }, { "epoch": 0.04791217490026976, "grad_norm": 0.34774431586265564, "learning_rate": 0.00019706398496340463, "loss": 0.6583, "step": 1840 }, { "epoch": 0.048172567155162535, "grad_norm": 0.37768319249153137, "learning_rate": 0.00019702369076274494, "loss": 0.6241, "step": 1850 }, { "epoch": 0.04843295941005531, "grad_norm": 0.3546730875968933, "learning_rate": 0.0001969831261185118, "loss": 0.6222, "step": 1860 }, { "epoch": 0.04869335166494808, "grad_norm": 0.3773512840270996, "learning_rate": 0.00019694229114377494, "loss": 0.6201, "step": 1870 }, { "epoch": 0.04895374391984085, "grad_norm": 0.3787965774536133, "learning_rate": 0.00019690118595235774, "loss": 0.6339, "step": 1880 }, { "epoch": 0.04921413617473362, "grad_norm": 0.3667986989021301, "learning_rate": 0.00019685981065883663, "loss": 0.6253, "step": 1890 }, { "epoch": 0.04947452842962639, "grad_norm": 0.39258262515068054, "learning_rate": 0.00019681816537854102, "loss": 0.6417, "step": 1900 }, { "epoch": 0.04973492068451916, "grad_norm": 0.3514678478240967, "learning_rate": 0.00019677625022755289, "loss": 0.6473, "step": 1910 }, { "epoch": 0.04999531293941193, "grad_norm": 0.38365432620048523, "learning_rate": 0.00019673406532270634, "loss": 0.6363, "step": 1920 }, { "epoch": 0.0502557051943047, "grad_norm": 0.34043630957603455, "learning_rate": 0.00019669161078158753, "loss": 0.6249, "step": 1930 }, { "epoch": 0.050516097449197474, "grad_norm": 0.41065657138824463, "learning_rate": 0.0001966488867225341, "loss": 0.6479, "step": 1940 }, { "epoch": 0.05077648970409024, "grad_norm": 0.3435451090335846, "learning_rate": 0.00019660589326463498, "loss": 0.6498, "step": 1950 }, { "epoch": 0.05103688195898301, "grad_norm": 0.3457126021385193, "learning_rate": 0.00019656263052773002, "loss": 0.6188, "step": 1960 }, { "epoch": 0.05129727421387578, "grad_norm": 0.34488430619239807, "learning_rate": 0.00019651909863240965, "loss": 0.6352, "step": 1970 }, { "epoch": 0.051557666468768554, "grad_norm": 0.34936293959617615, "learning_rate": 0.00019647529770001456, "loss": 0.6331, "step": 1980 }, { "epoch": 0.051818058723661325, "grad_norm": 0.34119752049446106, "learning_rate": 0.00019643122785263536, "loss": 0.6188, "step": 1990 }, { "epoch": 0.0520784509785541, "grad_norm": 0.35101839900016785, "learning_rate": 0.00019638688921311224, "loss": 0.6339, "step": 2000 } ], "logging_steps": 10, "max_steps": 19202, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.13160820359168e+18, "train_batch_size": 3, "trial_name": null, "trial_params": null }