{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.02603922548927705, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002603922548927705, "grad_norm": 0.8521247506141663, "learning_rate": 5.194805194805195e-06, "loss": 0.7412, "step": 10 }, { "epoch": 0.000520784509785541, "grad_norm": 0.6229312419891357, "learning_rate": 1.038961038961039e-05, "loss": 0.7138, "step": 20 }, { "epoch": 0.0007811767646783114, "grad_norm": 0.4566498100757599, "learning_rate": 1.5584415584415583e-05, "loss": 0.7079, "step": 30 }, { "epoch": 0.001041569019571082, "grad_norm": 0.4316692650318146, "learning_rate": 2.077922077922078e-05, "loss": 0.6988, "step": 40 }, { "epoch": 0.0013019612744638524, "grad_norm": 0.615436315536499, "learning_rate": 2.5974025974025972e-05, "loss": 0.6937, "step": 50 }, { "epoch": 0.0015623535293566228, "grad_norm": 0.48698583245277405, "learning_rate": 3.1168831168831166e-05, "loss": 0.7043, "step": 60 }, { "epoch": 0.0018227457842493933, "grad_norm": 0.3984021544456482, "learning_rate": 3.6363636363636364e-05, "loss": 0.6563, "step": 70 }, { "epoch": 0.002083138039142164, "grad_norm": 0.37576180696487427, "learning_rate": 4.155844155844156e-05, "loss": 0.6462, "step": 80 }, { "epoch": 0.0023435302940349343, "grad_norm": 0.35269680619239807, "learning_rate": 4.675324675324675e-05, "loss": 0.6656, "step": 90 }, { "epoch": 0.0026039225489277048, "grad_norm": 0.31541451811790466, "learning_rate": 5.1948051948051944e-05, "loss": 0.6547, "step": 100 }, { "epoch": 0.002864314803820475, "grad_norm": 0.3462330400943756, "learning_rate": 5.714285714285714e-05, "loss": 0.6621, "step": 110 }, { "epoch": 0.0031247070587132456, "grad_norm": 0.3465985953807831, "learning_rate": 6.233766233766233e-05, "loss": 0.6273, "step": 120 }, { "epoch": 0.003385099313606016, "grad_norm": 0.3297797441482544, "learning_rate": 6.753246753246754e-05, "loss": 0.6559, "step": 130 }, { "epoch": 0.0036454915684987865, "grad_norm": 0.3888818621635437, "learning_rate": 7.272727272727273e-05, "loss": 0.6756, "step": 140 }, { "epoch": 0.003905883823391557, "grad_norm": 0.3542368710041046, "learning_rate": 7.792207792207793e-05, "loss": 0.6506, "step": 150 }, { "epoch": 0.004166276078284328, "grad_norm": 0.37369370460510254, "learning_rate": 8.311688311688312e-05, "loss": 0.6645, "step": 160 }, { "epoch": 0.004426668333177098, "grad_norm": 0.3700549900531769, "learning_rate": 8.831168831168831e-05, "loss": 0.6727, "step": 170 }, { "epoch": 0.004687060588069869, "grad_norm": 0.32032889127731323, "learning_rate": 9.35064935064935e-05, "loss": 0.6529, "step": 180 }, { "epoch": 0.004947452842962639, "grad_norm": 0.3331650495529175, "learning_rate": 9.870129870129871e-05, "loss": 0.6627, "step": 190 }, { "epoch": 0.0052078450978554095, "grad_norm": 0.3300645351409912, "learning_rate": 0.00010389610389610389, "loss": 0.676, "step": 200 }, { "epoch": 0.0054682373527481795, "grad_norm": 0.350356787443161, "learning_rate": 0.00010909090909090909, "loss": 0.6564, "step": 210 }, { "epoch": 0.00572862960764095, "grad_norm": 0.382756769657135, "learning_rate": 0.00011428571428571428, "loss": 0.6243, "step": 220 }, { "epoch": 0.00598902186253372, "grad_norm": 0.34450188279151917, "learning_rate": 0.00011948051948051949, "loss": 0.6611, "step": 230 }, { "epoch": 0.006249414117426491, "grad_norm": 0.3705821633338928, "learning_rate": 0.00012467532467532467, "loss": 0.6384, "step": 240 }, { "epoch": 0.006509806372319262, "grad_norm": 0.36822304129600525, "learning_rate": 0.00012987012987012987, "loss": 0.6415, "step": 250 }, { "epoch": 0.006770198627212032, "grad_norm": 0.32358303666114807, "learning_rate": 0.00013506493506493507, "loss": 0.6584, "step": 260 }, { "epoch": 0.007030590882104803, "grad_norm": 0.33386844396591187, "learning_rate": 0.00014025974025974028, "loss": 0.6702, "step": 270 }, { "epoch": 0.007290983136997573, "grad_norm": 0.32447949051856995, "learning_rate": 0.00014545454545454546, "loss": 0.6519, "step": 280 }, { "epoch": 0.007551375391890344, "grad_norm": 0.3388073146343231, "learning_rate": 0.00015064935064935066, "loss": 0.6735, "step": 290 }, { "epoch": 0.007811767646783114, "grad_norm": 0.39655518531799316, "learning_rate": 0.00015584415584415587, "loss": 0.672, "step": 300 }, { "epoch": 0.008072159901675884, "grad_norm": 0.41258928179740906, "learning_rate": 0.00016103896103896104, "loss": 0.6626, "step": 310 }, { "epoch": 0.008332552156568656, "grad_norm": 0.3963010311126709, "learning_rate": 0.00016623376623376625, "loss": 0.6653, "step": 320 }, { "epoch": 0.008592944411461426, "grad_norm": 0.3641106188297272, "learning_rate": 0.00017142857142857143, "loss": 0.6389, "step": 330 }, { "epoch": 0.008853336666354196, "grad_norm": 0.38745763897895813, "learning_rate": 0.00017662337662337663, "loss": 0.6928, "step": 340 }, { "epoch": 0.009113728921246966, "grad_norm": 0.4573372006416321, "learning_rate": 0.00018181818181818183, "loss": 0.6679, "step": 350 }, { "epoch": 0.009374121176139737, "grad_norm": 0.45714282989501953, "learning_rate": 0.000187012987012987, "loss": 0.6453, "step": 360 }, { "epoch": 0.009634513431032507, "grad_norm": 0.37631818652153015, "learning_rate": 0.00019220779220779222, "loss": 0.6467, "step": 370 }, { "epoch": 0.009894905685925277, "grad_norm": 0.3658345639705658, "learning_rate": 0.00019740259740259742, "loss": 0.6631, "step": 380 }, { "epoch": 0.010155297940818049, "grad_norm": 0.3953540623188019, "learning_rate": 0.00019999996515752773, "loss": 0.6573, "step": 390 }, { "epoch": 0.010415690195710819, "grad_norm": 0.377763569355011, "learning_rate": 0.00019999968641789507, "loss": 0.6664, "step": 400 }, { "epoch": 0.010676082450603589, "grad_norm": 0.37128835916519165, "learning_rate": 0.0001999991289394067, "loss": 0.6342, "step": 410 }, { "epoch": 0.010936474705496359, "grad_norm": 0.33881694078445435, "learning_rate": 0.00019999829272361654, "loss": 0.6476, "step": 420 }, { "epoch": 0.01119686696038913, "grad_norm": 0.39774075150489807, "learning_rate": 0.00019999717777285545, "loss": 0.633, "step": 430 }, { "epoch": 0.0114572592152819, "grad_norm": 0.41350051760673523, "learning_rate": 0.00019999578409023126, "loss": 0.6541, "step": 440 }, { "epoch": 0.01171765147017467, "grad_norm": 0.47954171895980835, "learning_rate": 0.00019999411167962868, "loss": 0.6545, "step": 450 }, { "epoch": 0.01197804372506744, "grad_norm": 0.46860000491142273, "learning_rate": 0.00019999216054570942, "loss": 0.6512, "step": 460 }, { "epoch": 0.012238435979960213, "grad_norm": 0.4395809471607208, "learning_rate": 0.00019998993069391205, "loss": 0.6587, "step": 470 }, { "epoch": 0.012498828234852983, "grad_norm": 0.43222516775131226, "learning_rate": 0.00019998742213045206, "loss": 0.6292, "step": 480 }, { "epoch": 0.012759220489745753, "grad_norm": 0.39363613724708557, "learning_rate": 0.00019998463486232179, "loss": 0.6319, "step": 490 }, { "epoch": 0.013019612744638524, "grad_norm": 0.4984697699546814, "learning_rate": 0.0001999815688972905, "loss": 0.6488, "step": 500 }, { "epoch": 0.013280004999531294, "grad_norm": 0.4710462689399719, "learning_rate": 0.00019997822424390422, "loss": 0.6633, "step": 510 }, { "epoch": 0.013540397254424064, "grad_norm": 0.4141169786453247, "learning_rate": 0.00019997460091148586, "loss": 0.6471, "step": 520 }, { "epoch": 0.013800789509316834, "grad_norm": 0.39957430958747864, "learning_rate": 0.00019997069891013503, "loss": 0.6226, "step": 530 }, { "epoch": 0.014061181764209606, "grad_norm": 0.4508794844150543, "learning_rate": 0.00019996651825072826, "loss": 0.6559, "step": 540 }, { "epoch": 0.014321574019102376, "grad_norm": 0.4256739020347595, "learning_rate": 0.00019996205894491856, "loss": 0.6551, "step": 550 }, { "epoch": 0.014581966273995146, "grad_norm": 0.43204987049102783, "learning_rate": 0.00019995732100513592, "loss": 0.6254, "step": 560 }, { "epoch": 0.014842358528887916, "grad_norm": 0.37589946389198303, "learning_rate": 0.00019995230444458682, "loss": 0.6543, "step": 570 }, { "epoch": 0.015102750783780688, "grad_norm": 0.40850168466567993, "learning_rate": 0.0001999470092772544, "loss": 0.6474, "step": 580 }, { "epoch": 0.015363143038673458, "grad_norm": 0.3754895031452179, "learning_rate": 0.00019994143551789839, "loss": 0.6502, "step": 590 }, { "epoch": 0.015623535293566228, "grad_norm": 0.3857438266277313, "learning_rate": 0.00019993558318205507, "loss": 0.6544, "step": 600 }, { "epoch": 0.015883927548459, "grad_norm": 0.4063841998577118, "learning_rate": 0.00019992945228603724, "loss": 0.639, "step": 610 }, { "epoch": 0.016144319803351768, "grad_norm": 0.35183581709861755, "learning_rate": 0.0001999230428469341, "loss": 0.6442, "step": 620 }, { "epoch": 0.01640471205824454, "grad_norm": 0.4158167243003845, "learning_rate": 0.00019991635488261138, "loss": 0.6586, "step": 630 }, { "epoch": 0.01666510431313731, "grad_norm": 0.45118188858032227, "learning_rate": 0.00019990938841171104, "loss": 0.6581, "step": 640 }, { "epoch": 0.01692549656803008, "grad_norm": 0.39950400590896606, "learning_rate": 0.0001999021434536514, "loss": 0.6712, "step": 650 }, { "epoch": 0.01718588882292285, "grad_norm": 0.35208678245544434, "learning_rate": 0.00019989462002862704, "loss": 0.6398, "step": 660 }, { "epoch": 0.017446281077815623, "grad_norm": 0.38008975982666016, "learning_rate": 0.0001998868181576088, "loss": 0.6479, "step": 670 }, { "epoch": 0.01770667333270839, "grad_norm": 0.4314909875392914, "learning_rate": 0.00019987873786234348, "loss": 0.6358, "step": 680 }, { "epoch": 0.017967065587601163, "grad_norm": 0.3982577323913574, "learning_rate": 0.00019987037916535417, "loss": 0.6361, "step": 690 }, { "epoch": 0.01822745784249393, "grad_norm": 0.3529202342033386, "learning_rate": 0.0001998617420899398, "loss": 0.64, "step": 700 }, { "epoch": 0.018487850097386703, "grad_norm": 0.41149991750717163, "learning_rate": 0.0001998528266601754, "loss": 0.6684, "step": 710 }, { "epoch": 0.018748242352279475, "grad_norm": 0.42630311846733093, "learning_rate": 0.0001998436329009118, "loss": 0.6429, "step": 720 }, { "epoch": 0.019008634607172243, "grad_norm": 0.4028918147087097, "learning_rate": 0.00019983416083777563, "loss": 0.6573, "step": 730 }, { "epoch": 0.019269026862065015, "grad_norm": 0.3785901963710785, "learning_rate": 0.0001998244104971693, "loss": 0.6132, "step": 740 }, { "epoch": 0.019529419116957786, "grad_norm": 0.39018985629081726, "learning_rate": 0.0001998143819062709, "loss": 0.6287, "step": 750 }, { "epoch": 0.019789811371850555, "grad_norm": 0.4268128573894501, "learning_rate": 0.00019980407509303413, "loss": 0.6585, "step": 760 }, { "epoch": 0.020050203626743326, "grad_norm": 0.4293033480644226, "learning_rate": 0.00019979349008618808, "loss": 0.6843, "step": 770 }, { "epoch": 0.020310595881636098, "grad_norm": 0.38943207263946533, "learning_rate": 0.00019978262691523743, "loss": 0.6265, "step": 780 }, { "epoch": 0.020570988136528866, "grad_norm": 0.40528395771980286, "learning_rate": 0.00019977148561046217, "loss": 0.6392, "step": 790 }, { "epoch": 0.020831380391421638, "grad_norm": 0.4273380935192108, "learning_rate": 0.0001997600662029175, "loss": 0.6615, "step": 800 }, { "epoch": 0.021091772646314406, "grad_norm": 0.4269028306007385, "learning_rate": 0.00019974836872443388, "loss": 0.6412, "step": 810 }, { "epoch": 0.021352164901207178, "grad_norm": 0.3542031943798065, "learning_rate": 0.0001997363932076168, "loss": 0.6606, "step": 820 }, { "epoch": 0.02161255715609995, "grad_norm": 0.36826202273368835, "learning_rate": 0.00019972413968584682, "loss": 0.6387, "step": 830 }, { "epoch": 0.021872949410992718, "grad_norm": 0.4278506338596344, "learning_rate": 0.0001997116081932793, "loss": 0.6544, "step": 840 }, { "epoch": 0.02213334166588549, "grad_norm": 0.467886358499527, "learning_rate": 0.0001996987987648446, "loss": 0.6524, "step": 850 }, { "epoch": 0.02239373392077826, "grad_norm": 0.36823606491088867, "learning_rate": 0.0001996857114362476, "loss": 0.6553, "step": 860 }, { "epoch": 0.02265412617567103, "grad_norm": 0.42569059133529663, "learning_rate": 0.00019967234624396793, "loss": 0.6484, "step": 870 }, { "epoch": 0.0229145184305638, "grad_norm": 0.36995476484298706, "learning_rate": 0.00019965870322525965, "loss": 0.6626, "step": 880 }, { "epoch": 0.023174910685456573, "grad_norm": 0.4284444749355316, "learning_rate": 0.0001996447824181513, "loss": 0.6579, "step": 890 }, { "epoch": 0.02343530294034934, "grad_norm": 0.36263275146484375, "learning_rate": 0.0001996305838614457, "loss": 0.6466, "step": 900 }, { "epoch": 0.023695695195242113, "grad_norm": 0.43936702609062195, "learning_rate": 0.00019961610759471984, "loss": 0.6534, "step": 910 }, { "epoch": 0.02395608745013488, "grad_norm": 0.37757524847984314, "learning_rate": 0.00019960135365832486, "loss": 0.6344, "step": 920 }, { "epoch": 0.024216479705027653, "grad_norm": 0.40086570382118225, "learning_rate": 0.00019958632209338587, "loss": 0.6265, "step": 930 }, { "epoch": 0.024476871959920425, "grad_norm": 0.3435315489768982, "learning_rate": 0.00019957101294180174, "loss": 0.6479, "step": 940 }, { "epoch": 0.024737264214813193, "grad_norm": 0.34466204047203064, "learning_rate": 0.00019955542624624522, "loss": 0.641, "step": 950 }, { "epoch": 0.024997656469705965, "grad_norm": 0.46282994747161865, "learning_rate": 0.00019953956205016256, "loss": 0.6389, "step": 960 }, { "epoch": 0.025258048724598737, "grad_norm": 0.3815780580043793, "learning_rate": 0.00019952342039777362, "loss": 0.6472, "step": 970 }, { "epoch": 0.025518440979491505, "grad_norm": 0.43121904134750366, "learning_rate": 0.00019950700133407163, "loss": 0.6314, "step": 980 }, { "epoch": 0.025778833234384277, "grad_norm": 0.41635170578956604, "learning_rate": 0.00019949030490482296, "loss": 0.6483, "step": 990 }, { "epoch": 0.02603922548927705, "grad_norm": 0.3946804106235504, "learning_rate": 0.0001994733311565673, "loss": 0.6383, "step": 1000 } ], "logging_steps": 10, "max_steps": 19202, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.6580410179584e+17, "train_batch_size": 3, "trial_name": null, "trial_params": null }