{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.10981167298083787, "eval_steps": 250, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00010981167298083787, "grad_norm": 0.017355114221572876, "learning_rate": 1e-05, "loss": 10.3824, "step": 1 }, { "epoch": 0.00010981167298083787, "eval_loss": 10.37172794342041, "eval_runtime": 126.3947, "eval_samples_per_second": 11.765, "eval_steps_per_second": 5.886, "step": 1 }, { "epoch": 0.00021962334596167574, "grad_norm": 0.015282077714800835, "learning_rate": 2e-05, "loss": 10.3712, "step": 2 }, { "epoch": 0.00032943501894251357, "grad_norm": 0.0194566547870636, "learning_rate": 3e-05, "loss": 10.3723, "step": 3 }, { "epoch": 0.0004392466919233515, "grad_norm": 0.020783135667443275, "learning_rate": 4e-05, "loss": 10.3683, "step": 4 }, { "epoch": 0.0005490583649041893, "grad_norm": 0.014004701748490334, "learning_rate": 5e-05, "loss": 10.3673, "step": 5 }, { "epoch": 0.0006588700378850271, "grad_norm": 0.01998690329492092, "learning_rate": 6e-05, "loss": 10.3895, "step": 6 }, { "epoch": 0.0007686817108658651, "grad_norm": 0.020466268062591553, "learning_rate": 7e-05, "loss": 10.3723, "step": 7 }, { "epoch": 0.000878493383846703, "grad_norm": 0.017083710059523582, "learning_rate": 8e-05, "loss": 10.374, "step": 8 }, { "epoch": 0.0009883050568275408, "grad_norm": 0.018081573769450188, "learning_rate": 9e-05, "loss": 10.374, "step": 9 }, { "epoch": 0.0010981167298083786, "grad_norm": 0.018566979095339775, "learning_rate": 0.0001, "loss": 10.3827, "step": 10 }, { "epoch": 0.0012079284027892165, "grad_norm": 0.01672968454658985, "learning_rate": 9.999974825027756e-05, "loss": 10.369, "step": 11 }, { "epoch": 0.0013177400757700543, "grad_norm": 0.019540229812264442, "learning_rate": 9.999899300364532e-05, "loss": 10.3848, "step": 12 }, { "epoch": 0.0014275517487508922, "grad_norm": 0.01690570078790188, "learning_rate": 9.999773426770865e-05, "loss": 10.3752, "step": 13 }, { "epoch": 0.0015373634217317302, "grad_norm": 0.017440909519791603, "learning_rate": 9.999597205514297e-05, "loss": 10.3691, "step": 14 }, { "epoch": 0.001647175094712568, "grad_norm": 0.017702434211969376, "learning_rate": 9.999370638369377e-05, "loss": 10.3696, "step": 15 }, { "epoch": 0.001756986767693406, "grad_norm": 0.02032608538866043, "learning_rate": 9.99909372761763e-05, "loss": 10.3744, "step": 16 }, { "epoch": 0.0018667984406742436, "grad_norm": 0.01723252609372139, "learning_rate": 9.998766476047547e-05, "loss": 10.3688, "step": 17 }, { "epoch": 0.0019766101136550816, "grad_norm": 0.018756622448563576, "learning_rate": 9.998388886954547e-05, "loss": 10.375, "step": 18 }, { "epoch": 0.0020864217866359194, "grad_norm": 0.014659336768090725, "learning_rate": 9.997960964140947e-05, "loss": 10.3659, "step": 19 }, { "epoch": 0.002196233459616757, "grad_norm": 0.01682705245912075, "learning_rate": 9.997482711915927e-05, "loss": 10.3654, "step": 20 }, { "epoch": 0.0023060451325975953, "grad_norm": 0.01613915152847767, "learning_rate": 9.99695413509548e-05, "loss": 10.3732, "step": 21 }, { "epoch": 0.002415856805578433, "grad_norm": 0.017700903117656708, "learning_rate": 9.996375239002369e-05, "loss": 10.3603, "step": 22 }, { "epoch": 0.0025256684785592708, "grad_norm": 0.016667550429701805, "learning_rate": 9.995746029466071e-05, "loss": 10.3743, "step": 23 }, { "epoch": 0.0026354801515401085, "grad_norm": 0.0205207709223032, "learning_rate": 9.99506651282272e-05, "loss": 10.3717, "step": 24 }, { "epoch": 0.0027452918245209467, "grad_norm": 0.018369698897004128, "learning_rate": 9.99433669591504e-05, "loss": 10.3688, "step": 25 }, { "epoch": 0.0028551034975017845, "grad_norm": 0.018392475321888924, "learning_rate": 9.993556586092281e-05, "loss": 10.3672, "step": 26 }, { "epoch": 0.002964915170482622, "grad_norm": 0.019174449145793915, "learning_rate": 9.992726191210138e-05, "loss": 10.3728, "step": 27 }, { "epoch": 0.0030747268434634604, "grad_norm": 0.019878720864653587, "learning_rate": 9.991845519630678e-05, "loss": 10.3743, "step": 28 }, { "epoch": 0.003184538516444298, "grad_norm": 0.01893949694931507, "learning_rate": 9.990914580222257e-05, "loss": 10.3629, "step": 29 }, { "epoch": 0.003294350189425136, "grad_norm": 0.01767776906490326, "learning_rate": 9.989933382359422e-05, "loss": 10.3693, "step": 30 }, { "epoch": 0.0034041618624059736, "grad_norm": 0.017583226785063744, "learning_rate": 9.988901935922826e-05, "loss": 10.3733, "step": 31 }, { "epoch": 0.003513973535386812, "grad_norm": 0.018666569143533707, "learning_rate": 9.987820251299122e-05, "loss": 10.3736, "step": 32 }, { "epoch": 0.0036237852083676496, "grad_norm": 0.01870071142911911, "learning_rate": 9.986688339380862e-05, "loss": 10.362, "step": 33 }, { "epoch": 0.0037335968813484873, "grad_norm": 0.020093046128749847, "learning_rate": 9.985506211566388e-05, "loss": 10.365, "step": 34 }, { "epoch": 0.003843408554329325, "grad_norm": 0.02296426147222519, "learning_rate": 9.984273879759713e-05, "loss": 10.3642, "step": 35 }, { "epoch": 0.003953220227310163, "grad_norm": 0.017974786460399628, "learning_rate": 9.982991356370404e-05, "loss": 10.3682, "step": 36 }, { "epoch": 0.0040630319002910005, "grad_norm": 0.019896533340215683, "learning_rate": 9.981658654313457e-05, "loss": 10.3677, "step": 37 }, { "epoch": 0.004172843573271839, "grad_norm": 0.01675613969564438, "learning_rate": 9.98027578700917e-05, "loss": 10.3662, "step": 38 }, { "epoch": 0.004282655246252677, "grad_norm": 0.017869669944047928, "learning_rate": 9.978842768382998e-05, "loss": 10.3671, "step": 39 }, { "epoch": 0.004392466919233514, "grad_norm": 0.02277432754635811, "learning_rate": 9.977359612865423e-05, "loss": 10.3599, "step": 40 }, { "epoch": 0.004502278592214352, "grad_norm": 0.021998431533575058, "learning_rate": 9.975826335391808e-05, "loss": 10.3711, "step": 41 }, { "epoch": 0.004612090265195191, "grad_norm": 0.02354966290295124, "learning_rate": 9.974242951402235e-05, "loss": 10.3737, "step": 42 }, { "epoch": 0.004721901938176028, "grad_norm": 0.0228439774364233, "learning_rate": 9.972609476841367e-05, "loss": 10.3651, "step": 43 }, { "epoch": 0.004831713611156866, "grad_norm": 0.02064528316259384, "learning_rate": 9.970925928158274e-05, "loss": 10.3705, "step": 44 }, { "epoch": 0.004941525284137704, "grad_norm": 0.02003604546189308, "learning_rate": 9.969192322306271e-05, "loss": 10.3709, "step": 45 }, { "epoch": 0.0050513369571185416, "grad_norm": 0.023140212520956993, "learning_rate": 9.967408676742751e-05, "loss": 10.3732, "step": 46 }, { "epoch": 0.00516114863009938, "grad_norm": 0.01854497194290161, "learning_rate": 9.965575009429006e-05, "loss": 10.3679, "step": 47 }, { "epoch": 0.005270960303080217, "grad_norm": 0.024674147367477417, "learning_rate": 9.963691338830044e-05, "loss": 10.3719, "step": 48 }, { "epoch": 0.005380771976061055, "grad_norm": 0.021843615919351578, "learning_rate": 9.961757683914406e-05, "loss": 10.365, "step": 49 }, { "epoch": 0.005490583649041893, "grad_norm": 0.020922599360346794, "learning_rate": 9.959774064153977e-05, "loss": 10.3657, "step": 50 }, { "epoch": 0.005600395322022731, "grad_norm": 0.01967649534344673, "learning_rate": 9.957740499523787e-05, "loss": 10.3719, "step": 51 }, { "epoch": 0.005710206995003569, "grad_norm": 0.02338647097349167, "learning_rate": 9.955657010501806e-05, "loss": 10.3676, "step": 52 }, { "epoch": 0.005820018667984407, "grad_norm": 0.02005460113286972, "learning_rate": 9.953523618068749e-05, "loss": 10.3713, "step": 53 }, { "epoch": 0.005929830340965244, "grad_norm": 0.01933208853006363, "learning_rate": 9.951340343707852e-05, "loss": 10.3718, "step": 54 }, { "epoch": 0.006039642013946083, "grad_norm": 0.01974133774638176, "learning_rate": 9.949107209404665e-05, "loss": 10.3796, "step": 55 }, { "epoch": 0.006149453686926921, "grad_norm": 0.01728207990527153, "learning_rate": 9.946824237646824e-05, "loss": 10.3637, "step": 56 }, { "epoch": 0.006259265359907758, "grad_norm": 0.023813683539628983, "learning_rate": 9.944491451423828e-05, "loss": 10.3702, "step": 57 }, { "epoch": 0.006369077032888596, "grad_norm": 0.020871616899967194, "learning_rate": 9.942108874226811e-05, "loss": 10.3688, "step": 58 }, { "epoch": 0.006478888705869434, "grad_norm": 0.02382914163172245, "learning_rate": 9.939676530048301e-05, "loss": 10.3702, "step": 59 }, { "epoch": 0.006588700378850272, "grad_norm": 0.02201221138238907, "learning_rate": 9.937194443381972e-05, "loss": 10.3725, "step": 60 }, { "epoch": 0.00669851205183111, "grad_norm": 0.023925097659230232, "learning_rate": 9.934662639222412e-05, "loss": 10.3698, "step": 61 }, { "epoch": 0.006808323724811947, "grad_norm": 0.028722627088427544, "learning_rate": 9.93208114306486e-05, "loss": 10.3673, "step": 62 }, { "epoch": 0.006918135397792785, "grad_norm": 0.023019561544060707, "learning_rate": 9.929449980904952e-05, "loss": 10.3641, "step": 63 }, { "epoch": 0.007027947070773624, "grad_norm": 0.027020001783967018, "learning_rate": 9.926769179238466e-05, "loss": 10.3681, "step": 64 }, { "epoch": 0.007137758743754461, "grad_norm": 0.024322889745235443, "learning_rate": 9.924038765061042e-05, "loss": 10.37, "step": 65 }, { "epoch": 0.007247570416735299, "grad_norm": 0.021261010318994522, "learning_rate": 9.921258765867919e-05, "loss": 10.3704, "step": 66 }, { "epoch": 0.007357382089716136, "grad_norm": 0.023977644741535187, "learning_rate": 9.918429209653662e-05, "loss": 10.362, "step": 67 }, { "epoch": 0.007467193762696975, "grad_norm": 0.02730187587440014, "learning_rate": 9.915550124911866e-05, "loss": 10.3661, "step": 68 }, { "epoch": 0.007577005435677813, "grad_norm": 0.026274163275957108, "learning_rate": 9.912621540634887e-05, "loss": 10.3631, "step": 69 }, { "epoch": 0.00768681710865865, "grad_norm": 0.02593817003071308, "learning_rate": 9.909643486313533e-05, "loss": 10.3637, "step": 70 }, { "epoch": 0.007796628781639488, "grad_norm": 0.02696198970079422, "learning_rate": 9.90661599193678e-05, "loss": 10.3705, "step": 71 }, { "epoch": 0.007906440454620326, "grad_norm": 0.02566119097173214, "learning_rate": 9.903539087991462e-05, "loss": 10.372, "step": 72 }, { "epoch": 0.008016252127601164, "grad_norm": 0.024189887568354607, "learning_rate": 9.900412805461967e-05, "loss": 10.3658, "step": 73 }, { "epoch": 0.008126063800582001, "grad_norm": 0.029657792299985886, "learning_rate": 9.897237175829926e-05, "loss": 10.3746, "step": 74 }, { "epoch": 0.00823587547356284, "grad_norm": 0.02618011273443699, "learning_rate": 9.894012231073894e-05, "loss": 10.3615, "step": 75 }, { "epoch": 0.008345687146543677, "grad_norm": 0.028396811336278915, "learning_rate": 9.890738003669029e-05, "loss": 10.3704, "step": 76 }, { "epoch": 0.008455498819524515, "grad_norm": 0.02743990533053875, "learning_rate": 9.887414526586763e-05, "loss": 10.3631, "step": 77 }, { "epoch": 0.008565310492505354, "grad_norm": 0.02389010787010193, "learning_rate": 9.884041833294476e-05, "loss": 10.3641, "step": 78 }, { "epoch": 0.008675122165486191, "grad_norm": 0.027859805151820183, "learning_rate": 9.880619957755151e-05, "loss": 10.367, "step": 79 }, { "epoch": 0.008784933838467028, "grad_norm": 0.032361432909965515, "learning_rate": 9.877148934427037e-05, "loss": 10.358, "step": 80 }, { "epoch": 0.008894745511447867, "grad_norm": 0.029688427224755287, "learning_rate": 9.873628798263296e-05, "loss": 10.3648, "step": 81 }, { "epoch": 0.009004557184428705, "grad_norm": 0.025555763393640518, "learning_rate": 9.870059584711668e-05, "loss": 10.3598, "step": 82 }, { "epoch": 0.009114368857409542, "grad_norm": 0.031570229679346085, "learning_rate": 9.866441329714088e-05, "loss": 10.366, "step": 83 }, { "epoch": 0.009224180530390381, "grad_norm": 0.031101500615477562, "learning_rate": 9.862774069706346e-05, "loss": 10.3745, "step": 84 }, { "epoch": 0.009333992203371218, "grad_norm": 0.030765362083911896, "learning_rate": 9.859057841617709e-05, "loss": 10.3661, "step": 85 }, { "epoch": 0.009443803876352056, "grad_norm": 0.029136590659618378, "learning_rate": 9.855292682870551e-05, "loss": 10.3667, "step": 86 }, { "epoch": 0.009553615549332895, "grad_norm": 0.029173659160733223, "learning_rate": 9.851478631379982e-05, "loss": 10.3711, "step": 87 }, { "epoch": 0.009663427222313732, "grad_norm": 0.030497074127197266, "learning_rate": 9.847615725553456e-05, "loss": 10.3644, "step": 88 }, { "epoch": 0.00977323889529457, "grad_norm": 0.032321833074092865, "learning_rate": 9.843704004290392e-05, "loss": 10.3671, "step": 89 }, { "epoch": 0.009883050568275408, "grad_norm": 0.028986245393753052, "learning_rate": 9.839743506981782e-05, "loss": 10.3674, "step": 90 }, { "epoch": 0.009992862241256246, "grad_norm": 0.039418235421180725, "learning_rate": 9.835734273509786e-05, "loss": 10.3704, "step": 91 }, { "epoch": 0.010102673914237083, "grad_norm": 0.025938916951417923, "learning_rate": 9.831676344247342e-05, "loss": 10.3742, "step": 92 }, { "epoch": 0.01021248558721792, "grad_norm": 0.03675010800361633, "learning_rate": 9.827569760057755e-05, "loss": 10.3696, "step": 93 }, { "epoch": 0.01032229726019876, "grad_norm": 0.03282457962632179, "learning_rate": 9.82341456229428e-05, "loss": 10.3518, "step": 94 }, { "epoch": 0.010432108933179597, "grad_norm": 0.03477093204855919, "learning_rate": 9.819210792799712e-05, "loss": 10.3669, "step": 95 }, { "epoch": 0.010541920606160434, "grad_norm": 0.02538960985839367, "learning_rate": 9.814958493905963e-05, "loss": 10.3597, "step": 96 }, { "epoch": 0.010651732279141273, "grad_norm": 0.03886039927601814, "learning_rate": 9.810657708433637e-05, "loss": 10.3661, "step": 97 }, { "epoch": 0.01076154395212211, "grad_norm": 0.037356775254011154, "learning_rate": 9.806308479691595e-05, "loss": 10.3658, "step": 98 }, { "epoch": 0.010871355625102948, "grad_norm": 0.0339871346950531, "learning_rate": 9.801910851476523e-05, "loss": 10.3688, "step": 99 }, { "epoch": 0.010981167298083787, "grad_norm": 0.04446303844451904, "learning_rate": 9.797464868072488e-05, "loss": 10.3686, "step": 100 }, { "epoch": 0.011090978971064624, "grad_norm": 0.03416872024536133, "learning_rate": 9.792970574250493e-05, "loss": 10.3628, "step": 101 }, { "epoch": 0.011200790644045461, "grad_norm": 0.03962542116641998, "learning_rate": 9.788428015268027e-05, "loss": 10.364, "step": 102 }, { "epoch": 0.0113106023170263, "grad_norm": 0.03903932124376297, "learning_rate": 9.783837236868609e-05, "loss": 10.3619, "step": 103 }, { "epoch": 0.011420413990007138, "grad_norm": 0.040032122284173965, "learning_rate": 9.779198285281325e-05, "loss": 10.3616, "step": 104 }, { "epoch": 0.011530225662987975, "grad_norm": 0.03569488599896431, "learning_rate": 9.77451120722037e-05, "loss": 10.3639, "step": 105 }, { "epoch": 0.011640037335968814, "grad_norm": 0.036571722477674484, "learning_rate": 9.769776049884563e-05, "loss": 10.3628, "step": 106 }, { "epoch": 0.011749849008949651, "grad_norm": 0.04485912248492241, "learning_rate": 9.764992860956889e-05, "loss": 10.3629, "step": 107 }, { "epoch": 0.011859660681930489, "grad_norm": 0.039439521729946136, "learning_rate": 9.760161688604008e-05, "loss": 10.3586, "step": 108 }, { "epoch": 0.011969472354911328, "grad_norm": 0.036617379635572433, "learning_rate": 9.755282581475769e-05, "loss": 10.3619, "step": 109 }, { "epoch": 0.012079284027892165, "grad_norm": 0.03839511796832085, "learning_rate": 9.750355588704727e-05, "loss": 10.3568, "step": 110 }, { "epoch": 0.012189095700873002, "grad_norm": 0.03922630101442337, "learning_rate": 9.745380759905647e-05, "loss": 10.3603, "step": 111 }, { "epoch": 0.012298907373853842, "grad_norm": 0.04881632700562477, "learning_rate": 9.740358145174998e-05, "loss": 10.3594, "step": 112 }, { "epoch": 0.012408719046834679, "grad_norm": 0.05318975821137428, "learning_rate": 9.735287795090455e-05, "loss": 10.365, "step": 113 }, { "epoch": 0.012518530719815516, "grad_norm": 0.03702029585838318, "learning_rate": 9.730169760710386e-05, "loss": 10.3532, "step": 114 }, { "epoch": 0.012628342392796353, "grad_norm": 0.05414840579032898, "learning_rate": 9.725004093573342e-05, "loss": 10.3674, "step": 115 }, { "epoch": 0.012738154065777193, "grad_norm": 0.050753697752952576, "learning_rate": 9.719790845697533e-05, "loss": 10.3593, "step": 116 }, { "epoch": 0.01284796573875803, "grad_norm": 0.043248798698186874, "learning_rate": 9.714530069580309e-05, "loss": 10.3564, "step": 117 }, { "epoch": 0.012957777411738867, "grad_norm": 0.04024583473801613, "learning_rate": 9.709221818197624e-05, "loss": 10.3642, "step": 118 }, { "epoch": 0.013067589084719706, "grad_norm": 0.03656017780303955, "learning_rate": 9.703866145003511e-05, "loss": 10.3524, "step": 119 }, { "epoch": 0.013177400757700544, "grad_norm": 0.033831775188446045, "learning_rate": 9.698463103929542e-05, "loss": 10.3505, "step": 120 }, { "epoch": 0.01328721243068138, "grad_norm": 0.0484774112701416, "learning_rate": 9.693012749384279e-05, "loss": 10.3663, "step": 121 }, { "epoch": 0.01339702410366222, "grad_norm": 0.05139002948999405, "learning_rate": 9.687515136252731e-05, "loss": 10.3602, "step": 122 }, { "epoch": 0.013506835776643057, "grad_norm": 0.042695775628089905, "learning_rate": 9.681970319895803e-05, "loss": 10.3566, "step": 123 }, { "epoch": 0.013616647449623894, "grad_norm": 0.0395086295902729, "learning_rate": 9.676378356149734e-05, "loss": 10.361, "step": 124 }, { "epoch": 0.013726459122604734, "grad_norm": 0.04226267337799072, "learning_rate": 9.670739301325534e-05, "loss": 10.3521, "step": 125 }, { "epoch": 0.01383627079558557, "grad_norm": 0.045777130872011185, "learning_rate": 9.665053212208426e-05, "loss": 10.356, "step": 126 }, { "epoch": 0.013946082468566408, "grad_norm": 0.05063780024647713, "learning_rate": 9.659320146057262e-05, "loss": 10.3586, "step": 127 }, { "epoch": 0.014055894141547247, "grad_norm": 0.04973231628537178, "learning_rate": 9.653540160603956e-05, "loss": 10.3522, "step": 128 }, { "epoch": 0.014165705814528085, "grad_norm": 0.04170297831296921, "learning_rate": 9.647713314052896e-05, "loss": 10.3542, "step": 129 }, { "epoch": 0.014275517487508922, "grad_norm": 0.04281565919518471, "learning_rate": 9.641839665080363e-05, "loss": 10.3476, "step": 130 }, { "epoch": 0.014385329160489761, "grad_norm": 0.05829470232129097, "learning_rate": 9.635919272833938e-05, "loss": 10.3501, "step": 131 }, { "epoch": 0.014495140833470598, "grad_norm": 0.05022669956088066, "learning_rate": 9.629952196931901e-05, "loss": 10.3457, "step": 132 }, { "epoch": 0.014604952506451436, "grad_norm": 0.058340515941381454, "learning_rate": 9.623938497462646e-05, "loss": 10.3489, "step": 133 }, { "epoch": 0.014714764179432273, "grad_norm": 0.044063545763492584, "learning_rate": 9.617878234984055e-05, "loss": 10.3502, "step": 134 }, { "epoch": 0.014824575852413112, "grad_norm": 0.05639813840389252, "learning_rate": 9.611771470522908e-05, "loss": 10.3533, "step": 135 }, { "epoch": 0.01493438752539395, "grad_norm": 0.04481794685125351, "learning_rate": 9.60561826557425e-05, "loss": 10.3537, "step": 136 }, { "epoch": 0.015044199198374787, "grad_norm": 0.03814784437417984, "learning_rate": 9.599418682100793e-05, "loss": 10.3527, "step": 137 }, { "epoch": 0.015154010871355626, "grad_norm": 0.049631427973508835, "learning_rate": 9.593172782532268e-05, "loss": 10.3465, "step": 138 }, { "epoch": 0.015263822544336463, "grad_norm": 0.05086357891559601, "learning_rate": 9.586880629764817e-05, "loss": 10.354, "step": 139 }, { "epoch": 0.0153736342173173, "grad_norm": 0.039971865713596344, "learning_rate": 9.580542287160348e-05, "loss": 10.3547, "step": 140 }, { "epoch": 0.01548344589029814, "grad_norm": 0.04161727428436279, "learning_rate": 9.574157818545901e-05, "loss": 10.3496, "step": 141 }, { "epoch": 0.015593257563278977, "grad_norm": 0.050380293279886246, "learning_rate": 9.567727288213005e-05, "loss": 10.3504, "step": 142 }, { "epoch": 0.015703069236259814, "grad_norm": 0.04933194816112518, "learning_rate": 9.561250760917027e-05, "loss": 10.3542, "step": 143 }, { "epoch": 0.015812880909240653, "grad_norm": 0.04347715154290199, "learning_rate": 9.554728301876526e-05, "loss": 10.3458, "step": 144 }, { "epoch": 0.01592269258222149, "grad_norm": 0.04432293027639389, "learning_rate": 9.548159976772592e-05, "loss": 10.3506, "step": 145 }, { "epoch": 0.016032504255202328, "grad_norm": 0.05397890880703926, "learning_rate": 9.541545851748186e-05, "loss": 10.3564, "step": 146 }, { "epoch": 0.016142315928183167, "grad_norm": 0.04890001192688942, "learning_rate": 9.534885993407474e-05, "loss": 10.3493, "step": 147 }, { "epoch": 0.016252127601164002, "grad_norm": 0.05409630760550499, "learning_rate": 9.528180468815155e-05, "loss": 10.3466, "step": 148 }, { "epoch": 0.01636193927414484, "grad_norm": 0.03742580488324165, "learning_rate": 9.521429345495787e-05, "loss": 10.347, "step": 149 }, { "epoch": 0.01647175094712568, "grad_norm": 0.05012597516179085, "learning_rate": 9.514632691433107e-05, "loss": 10.3539, "step": 150 }, { "epoch": 0.016581562620106516, "grad_norm": 0.055696357041597366, "learning_rate": 9.507790575069347e-05, "loss": 10.3495, "step": 151 }, { "epoch": 0.016691374293087355, "grad_norm": 0.04766364023089409, "learning_rate": 9.50090306530454e-05, "loss": 10.3492, "step": 152 }, { "epoch": 0.016801185966068194, "grad_norm": 0.04760153219103813, "learning_rate": 9.493970231495835e-05, "loss": 10.3456, "step": 153 }, { "epoch": 0.01691099763904903, "grad_norm": 0.057927701622247696, "learning_rate": 9.486992143456792e-05, "loss": 10.3531, "step": 154 }, { "epoch": 0.01702080931202987, "grad_norm": 0.045849986374378204, "learning_rate": 9.479968871456679e-05, "loss": 10.3477, "step": 155 }, { "epoch": 0.017130620985010708, "grad_norm": 0.041446588933467865, "learning_rate": 9.472900486219769e-05, "loss": 10.3525, "step": 156 }, { "epoch": 0.017240432657991543, "grad_norm": 0.04088803008198738, "learning_rate": 9.46578705892462e-05, "loss": 10.3492, "step": 157 }, { "epoch": 0.017350244330972382, "grad_norm": 0.0481136329472065, "learning_rate": 9.458628661203367e-05, "loss": 10.3516, "step": 158 }, { "epoch": 0.01746005600395322, "grad_norm": 0.03966531530022621, "learning_rate": 9.451425365140996e-05, "loss": 10.3436, "step": 159 }, { "epoch": 0.017569867676934057, "grad_norm": 0.04862331971526146, "learning_rate": 9.444177243274618e-05, "loss": 10.353, "step": 160 }, { "epoch": 0.017679679349914896, "grad_norm": 0.039939992129802704, "learning_rate": 9.43688436859274e-05, "loss": 10.35, "step": 161 }, { "epoch": 0.017789491022895735, "grad_norm": 0.0484342984855175, "learning_rate": 9.429546814534529e-05, "loss": 10.3455, "step": 162 }, { "epoch": 0.01789930269587657, "grad_norm": 0.05140478536486626, "learning_rate": 9.422164654989072e-05, "loss": 10.3346, "step": 163 }, { "epoch": 0.01800911436885741, "grad_norm": 0.042091164737939835, "learning_rate": 9.414737964294636e-05, "loss": 10.3482, "step": 164 }, { "epoch": 0.01811892604183825, "grad_norm": 0.02937757596373558, "learning_rate": 9.407266817237911e-05, "loss": 10.3413, "step": 165 }, { "epoch": 0.018228737714819084, "grad_norm": 0.03831694275140762, "learning_rate": 9.399751289053267e-05, "loss": 10.3497, "step": 166 }, { "epoch": 0.018338549387799923, "grad_norm": 0.05200895294547081, "learning_rate": 9.392191455421988e-05, "loss": 10.3499, "step": 167 }, { "epoch": 0.018448361060780762, "grad_norm": 0.04790012910962105, "learning_rate": 9.384587392471515e-05, "loss": 10.3441, "step": 168 }, { "epoch": 0.018558172733761598, "grad_norm": 0.039311476051807404, "learning_rate": 9.376939176774679e-05, "loss": 10.3433, "step": 169 }, { "epoch": 0.018667984406742437, "grad_norm": 0.040564488619565964, "learning_rate": 9.369246885348926e-05, "loss": 10.3471, "step": 170 }, { "epoch": 0.018777796079723276, "grad_norm": 0.039450425654649734, "learning_rate": 9.361510595655545e-05, "loss": 10.3433, "step": 171 }, { "epoch": 0.01888760775270411, "grad_norm": 0.045643243938684464, "learning_rate": 9.353730385598887e-05, "loss": 10.3431, "step": 172 }, { "epoch": 0.01899741942568495, "grad_norm": 0.035616207867860794, "learning_rate": 9.345906333525581e-05, "loss": 10.3486, "step": 173 }, { "epoch": 0.01910723109866579, "grad_norm": 0.04152603819966316, "learning_rate": 9.338038518223747e-05, "loss": 10.3436, "step": 174 }, { "epoch": 0.019217042771646625, "grad_norm": 0.030790936201810837, "learning_rate": 9.330127018922194e-05, "loss": 10.3417, "step": 175 }, { "epoch": 0.019326854444627464, "grad_norm": 0.0589006170630455, "learning_rate": 9.322171915289635e-05, "loss": 10.3429, "step": 176 }, { "epoch": 0.019436666117608303, "grad_norm": 0.0449153333902359, "learning_rate": 9.314173287433873e-05, "loss": 10.3484, "step": 177 }, { "epoch": 0.01954647779058914, "grad_norm": 0.03249122574925423, "learning_rate": 9.306131215901003e-05, "loss": 10.3417, "step": 178 }, { "epoch": 0.019656289463569978, "grad_norm": 0.05547071993350983, "learning_rate": 9.298045781674596e-05, "loss": 10.3361, "step": 179 }, { "epoch": 0.019766101136550817, "grad_norm": 0.04753856733441353, "learning_rate": 9.289917066174886e-05, "loss": 10.3451, "step": 180 }, { "epoch": 0.019875912809531653, "grad_norm": 0.04023727774620056, "learning_rate": 9.281745151257946e-05, "loss": 10.3451, "step": 181 }, { "epoch": 0.01998572448251249, "grad_norm": 0.03852841258049011, "learning_rate": 9.273530119214868e-05, "loss": 10.3434, "step": 182 }, { "epoch": 0.02009553615549333, "grad_norm": 0.042827341705560684, "learning_rate": 9.265272052770936e-05, "loss": 10.3405, "step": 183 }, { "epoch": 0.020205347828474166, "grad_norm": 0.027394205331802368, "learning_rate": 9.256971035084785e-05, "loss": 10.342, "step": 184 }, { "epoch": 0.020315159501455005, "grad_norm": 0.04240558296442032, "learning_rate": 9.248627149747573e-05, "loss": 10.3435, "step": 185 }, { "epoch": 0.02042497117443584, "grad_norm": 0.03938102349638939, "learning_rate": 9.24024048078213e-05, "loss": 10.3397, "step": 186 }, { "epoch": 0.02053478284741668, "grad_norm": 0.03396698087453842, "learning_rate": 9.231811112642121e-05, "loss": 10.3474, "step": 187 }, { "epoch": 0.02064459452039752, "grad_norm": 0.03540763631463051, "learning_rate": 9.223339130211192e-05, "loss": 10.3349, "step": 188 }, { "epoch": 0.020754406193378355, "grad_norm": 0.03599977120757103, "learning_rate": 9.214824618802109e-05, "loss": 10.3481, "step": 189 }, { "epoch": 0.020864217866359194, "grad_norm": 0.04189132899045944, "learning_rate": 9.206267664155907e-05, "loss": 10.3417, "step": 190 }, { "epoch": 0.020974029539340033, "grad_norm": 0.03507464751601219, "learning_rate": 9.197668352441025e-05, "loss": 10.3468, "step": 191 }, { "epoch": 0.021083841212320868, "grad_norm": 0.03676028177142143, "learning_rate": 9.189026770252436e-05, "loss": 10.3394, "step": 192 }, { "epoch": 0.021193652885301707, "grad_norm": 0.030979083850979805, "learning_rate": 9.18034300461078e-05, "loss": 10.3518, "step": 193 }, { "epoch": 0.021303464558282546, "grad_norm": 0.030613142997026443, "learning_rate": 9.171617142961477e-05, "loss": 10.3456, "step": 194 }, { "epoch": 0.021413276231263382, "grad_norm": 0.03547399491071701, "learning_rate": 9.162849273173857e-05, "loss": 10.3469, "step": 195 }, { "epoch": 0.02152308790424422, "grad_norm": 0.0392518974840641, "learning_rate": 9.154039483540273e-05, "loss": 10.3432, "step": 196 }, { "epoch": 0.02163289957722506, "grad_norm": 0.03400423005223274, "learning_rate": 9.145187862775209e-05, "loss": 10.3462, "step": 197 }, { "epoch": 0.021742711250205896, "grad_norm": 0.03951896354556084, "learning_rate": 9.136294500014386e-05, "loss": 10.3428, "step": 198 }, { "epoch": 0.021852522923186735, "grad_norm": 0.045471709221601486, "learning_rate": 9.12735948481387e-05, "loss": 10.346, "step": 199 }, { "epoch": 0.021962334596167574, "grad_norm": 0.029996167868375778, "learning_rate": 9.118382907149165e-05, "loss": 10.3444, "step": 200 }, { "epoch": 0.02207214626914841, "grad_norm": 0.03877894580364227, "learning_rate": 9.109364857414306e-05, "loss": 10.3403, "step": 201 }, { "epoch": 0.02218195794212925, "grad_norm": 0.03905482217669487, "learning_rate": 9.100305426420956e-05, "loss": 10.3413, "step": 202 }, { "epoch": 0.022291769615110087, "grad_norm": 0.027946101501584053, "learning_rate": 9.091204705397484e-05, "loss": 10.3392, "step": 203 }, { "epoch": 0.022401581288090923, "grad_norm": 0.0383419394493103, "learning_rate": 9.082062785988049e-05, "loss": 10.344, "step": 204 }, { "epoch": 0.022511392961071762, "grad_norm": 0.033805303275585175, "learning_rate": 9.072879760251679e-05, "loss": 10.3472, "step": 205 }, { "epoch": 0.0226212046340526, "grad_norm": 0.0401395782828331, "learning_rate": 9.06365572066134e-05, "loss": 10.3446, "step": 206 }, { "epoch": 0.022731016307033437, "grad_norm": 0.025171702727675438, "learning_rate": 9.05439076010301e-05, "loss": 10.3431, "step": 207 }, { "epoch": 0.022840827980014276, "grad_norm": 0.029064837843179703, "learning_rate": 9.045084971874738e-05, "loss": 10.3377, "step": 208 }, { "epoch": 0.022950639652995115, "grad_norm": 0.04559755325317383, "learning_rate": 9.035738449685707e-05, "loss": 10.3407, "step": 209 }, { "epoch": 0.02306045132597595, "grad_norm": 0.023931635543704033, "learning_rate": 9.026351287655294e-05, "loss": 10.336, "step": 210 }, { "epoch": 0.02317026299895679, "grad_norm": 0.02436680532991886, "learning_rate": 9.016923580312113e-05, "loss": 10.3383, "step": 211 }, { "epoch": 0.02328007467193763, "grad_norm": 0.041574105620384216, "learning_rate": 9.007455422593077e-05, "loss": 10.3407, "step": 212 }, { "epoch": 0.023389886344918464, "grad_norm": 0.027527812868356705, "learning_rate": 8.997946909842425e-05, "loss": 10.3438, "step": 213 }, { "epoch": 0.023499698017899303, "grad_norm": 0.03408113494515419, "learning_rate": 8.988398137810777e-05, "loss": 10.3354, "step": 214 }, { "epoch": 0.023609509690880142, "grad_norm": 0.031411245465278625, "learning_rate": 8.978809202654162e-05, "loss": 10.3436, "step": 215 }, { "epoch": 0.023719321363860978, "grad_norm": 0.026551635935902596, "learning_rate": 8.969180200933047e-05, "loss": 10.3435, "step": 216 }, { "epoch": 0.023829133036841817, "grad_norm": 0.025988008826971054, "learning_rate": 8.959511229611376e-05, "loss": 10.3325, "step": 217 }, { "epoch": 0.023938944709822656, "grad_norm": 0.027901288121938705, "learning_rate": 8.949802386055581e-05, "loss": 10.336, "step": 218 }, { "epoch": 0.02404875638280349, "grad_norm": 0.04028617963194847, "learning_rate": 8.940053768033609e-05, "loss": 10.3403, "step": 219 }, { "epoch": 0.02415856805578433, "grad_norm": 0.036284368485212326, "learning_rate": 8.930265473713938e-05, "loss": 10.3337, "step": 220 }, { "epoch": 0.02426837972876517, "grad_norm": 0.03083152137696743, "learning_rate": 8.92043760166458e-05, "loss": 10.336, "step": 221 }, { "epoch": 0.024378191401746005, "grad_norm": 0.03540259972214699, "learning_rate": 8.910570250852097e-05, "loss": 10.3433, "step": 222 }, { "epoch": 0.024488003074726844, "grad_norm": 0.038421355187892914, "learning_rate": 8.900663520640604e-05, "loss": 10.3332, "step": 223 }, { "epoch": 0.024597814747707683, "grad_norm": 0.026076674461364746, "learning_rate": 8.890717510790763e-05, "loss": 10.3425, "step": 224 }, { "epoch": 0.02470762642068852, "grad_norm": 0.03401215746998787, "learning_rate": 8.880732321458784e-05, "loss": 10.3383, "step": 225 }, { "epoch": 0.024817438093669358, "grad_norm": 0.027312805876135826, "learning_rate": 8.870708053195413e-05, "loss": 10.3477, "step": 226 }, { "epoch": 0.024927249766650193, "grad_norm": 0.03216833621263504, "learning_rate": 8.860644806944918e-05, "loss": 10.3384, "step": 227 }, { "epoch": 0.025037061439631032, "grad_norm": 0.031216269358992577, "learning_rate": 8.850542684044078e-05, "loss": 10.3427, "step": 228 }, { "epoch": 0.02514687311261187, "grad_norm": 0.031537123024463654, "learning_rate": 8.840401786221159e-05, "loss": 10.3444, "step": 229 }, { "epoch": 0.025256684785592707, "grad_norm": 0.029525797814130783, "learning_rate": 8.83022221559489e-05, "loss": 10.3408, "step": 230 }, { "epoch": 0.025366496458573546, "grad_norm": 0.03302108868956566, "learning_rate": 8.820004074673433e-05, "loss": 10.3425, "step": 231 }, { "epoch": 0.025476308131554385, "grad_norm": 0.02777056396007538, "learning_rate": 8.809747466353356e-05, "loss": 10.3413, "step": 232 }, { "epoch": 0.02558611980453522, "grad_norm": 0.035754457116127014, "learning_rate": 8.799452493918585e-05, "loss": 10.3388, "step": 233 }, { "epoch": 0.02569593147751606, "grad_norm": 0.02970656380057335, "learning_rate": 8.789119261039385e-05, "loss": 10.3444, "step": 234 }, { "epoch": 0.0258057431504969, "grad_norm": 0.024922939017415047, "learning_rate": 8.778747871771292e-05, "loss": 10.3367, "step": 235 }, { "epoch": 0.025915554823477734, "grad_norm": 0.0323614627122879, "learning_rate": 8.768338430554082e-05, "loss": 10.3374, "step": 236 }, { "epoch": 0.026025366496458573, "grad_norm": 0.043238565325737, "learning_rate": 8.757891042210714e-05, "loss": 10.3333, "step": 237 }, { "epoch": 0.026135178169439412, "grad_norm": 0.03006700798869133, "learning_rate": 8.74740581194627e-05, "loss": 10.335, "step": 238 }, { "epoch": 0.026244989842420248, "grad_norm": 0.0294354148209095, "learning_rate": 8.736882845346906e-05, "loss": 10.3401, "step": 239 }, { "epoch": 0.026354801515401087, "grad_norm": 0.03272950276732445, "learning_rate": 8.726322248378775e-05, "loss": 10.345, "step": 240 }, { "epoch": 0.026464613188381926, "grad_norm": 0.02740362472832203, "learning_rate": 8.715724127386972e-05, "loss": 10.3334, "step": 241 }, { "epoch": 0.02657442486136276, "grad_norm": 0.044348545372486115, "learning_rate": 8.705088589094459e-05, "loss": 10.3402, "step": 242 }, { "epoch": 0.0266842365343436, "grad_norm": 0.026010507717728615, "learning_rate": 8.694415740600988e-05, "loss": 10.3361, "step": 243 }, { "epoch": 0.02679404820732444, "grad_norm": 0.029143190011382103, "learning_rate": 8.683705689382024e-05, "loss": 10.3319, "step": 244 }, { "epoch": 0.026903859880305275, "grad_norm": 0.035695601254701614, "learning_rate": 8.672958543287666e-05, "loss": 10.3398, "step": 245 }, { "epoch": 0.027013671553286114, "grad_norm": 0.0501851923763752, "learning_rate": 8.662174410541555e-05, "loss": 10.3404, "step": 246 }, { "epoch": 0.027123483226266953, "grad_norm": 0.03639901056885719, "learning_rate": 8.651353399739787e-05, "loss": 10.3342, "step": 247 }, { "epoch": 0.02723329489924779, "grad_norm": 0.035799819976091385, "learning_rate": 8.640495619849821e-05, "loss": 10.3404, "step": 248 }, { "epoch": 0.027343106572228628, "grad_norm": 0.027779866009950638, "learning_rate": 8.629601180209381e-05, "loss": 10.3388, "step": 249 }, { "epoch": 0.027452918245209467, "grad_norm": 0.0283342357724905, "learning_rate": 8.618670190525352e-05, "loss": 10.3322, "step": 250 }, { "epoch": 0.027452918245209467, "eval_loss": 10.337260246276855, "eval_runtime": 126.9198, "eval_samples_per_second": 11.716, "eval_steps_per_second": 5.862, "step": 250 }, { "epoch": 0.027562729918190303, "grad_norm": 0.04042504355311394, "learning_rate": 8.607702760872678e-05, "loss": 10.3368, "step": 251 }, { "epoch": 0.02767254159117114, "grad_norm": 0.03197220712900162, "learning_rate": 8.596699001693255e-05, "loss": 10.3404, "step": 252 }, { "epoch": 0.02778235326415198, "grad_norm": 0.04232124611735344, "learning_rate": 8.585659023794818e-05, "loss": 10.3363, "step": 253 }, { "epoch": 0.027892164937132816, "grad_norm": 0.021878182888031006, "learning_rate": 8.574582938349817e-05, "loss": 10.3399, "step": 254 }, { "epoch": 0.028001976610113655, "grad_norm": 0.032933253794908524, "learning_rate": 8.563470856894316e-05, "loss": 10.3329, "step": 255 }, { "epoch": 0.028111788283094494, "grad_norm": 0.03841102123260498, "learning_rate": 8.552322891326846e-05, "loss": 10.336, "step": 256 }, { "epoch": 0.02822159995607533, "grad_norm": 0.028103046119213104, "learning_rate": 8.541139153907296e-05, "loss": 10.3418, "step": 257 }, { "epoch": 0.02833141162905617, "grad_norm": 0.037000104784965515, "learning_rate": 8.529919757255783e-05, "loss": 10.3368, "step": 258 }, { "epoch": 0.028441223302037008, "grad_norm": 0.03226947411894798, "learning_rate": 8.518664814351502e-05, "loss": 10.3326, "step": 259 }, { "epoch": 0.028551034975017844, "grad_norm": 0.02781609445810318, "learning_rate": 8.507374438531607e-05, "loss": 10.3357, "step": 260 }, { "epoch": 0.028660846647998683, "grad_norm": 0.02789613977074623, "learning_rate": 8.496048743490053e-05, "loss": 10.3382, "step": 261 }, { "epoch": 0.028770658320979522, "grad_norm": 0.03186870366334915, "learning_rate": 8.484687843276469e-05, "loss": 10.3351, "step": 262 }, { "epoch": 0.028880469993960357, "grad_norm": 0.030995000153779984, "learning_rate": 8.473291852294987e-05, "loss": 10.3374, "step": 263 }, { "epoch": 0.028990281666941196, "grad_norm": 0.026344187557697296, "learning_rate": 8.461860885303114e-05, "loss": 10.3401, "step": 264 }, { "epoch": 0.029100093339922032, "grad_norm": 0.035475119948387146, "learning_rate": 8.450395057410561e-05, "loss": 10.3365, "step": 265 }, { "epoch": 0.02920990501290287, "grad_norm": 0.03263852000236511, "learning_rate": 8.438894484078086e-05, "loss": 10.3353, "step": 266 }, { "epoch": 0.02931971668588371, "grad_norm": 0.05197161063551903, "learning_rate": 8.427359281116334e-05, "loss": 10.3249, "step": 267 }, { "epoch": 0.029429528358864546, "grad_norm": 0.027927998453378677, "learning_rate": 8.415789564684673e-05, "loss": 10.3405, "step": 268 }, { "epoch": 0.029539340031845385, "grad_norm": 0.04254119098186493, "learning_rate": 8.404185451290018e-05, "loss": 10.3348, "step": 269 }, { "epoch": 0.029649151704826224, "grad_norm": 0.036639902740716934, "learning_rate": 8.392547057785661e-05, "loss": 10.3408, "step": 270 }, { "epoch": 0.02975896337780706, "grad_norm": 0.024165011942386627, "learning_rate": 8.380874501370097e-05, "loss": 10.3365, "step": 271 }, { "epoch": 0.0298687750507879, "grad_norm": 0.021736474707722664, "learning_rate": 8.369167899585841e-05, "loss": 10.3383, "step": 272 }, { "epoch": 0.029978586723768737, "grad_norm": 0.02925550378859043, "learning_rate": 8.357427370318239e-05, "loss": 10.3319, "step": 273 }, { "epoch": 0.030088398396749573, "grad_norm": 0.03253068029880524, "learning_rate": 8.345653031794292e-05, "loss": 10.3361, "step": 274 }, { "epoch": 0.030198210069730412, "grad_norm": 0.033936094492673874, "learning_rate": 8.333845002581458e-05, "loss": 10.3388, "step": 275 }, { "epoch": 0.03030802174271125, "grad_norm": 0.021694660186767578, "learning_rate": 8.322003401586462e-05, "loss": 10.3427, "step": 276 }, { "epoch": 0.030417833415692087, "grad_norm": 0.03910309821367264, "learning_rate": 8.310128348054094e-05, "loss": 10.3308, "step": 277 }, { "epoch": 0.030527645088672926, "grad_norm": 0.025291498750448227, "learning_rate": 8.298219961566009e-05, "loss": 10.3391, "step": 278 }, { "epoch": 0.030637456761653765, "grad_norm": 0.029412228614091873, "learning_rate": 8.286278362039528e-05, "loss": 10.3311, "step": 279 }, { "epoch": 0.0307472684346346, "grad_norm": 0.03143836930394173, "learning_rate": 8.274303669726426e-05, "loss": 10.3244, "step": 280 }, { "epoch": 0.03085708010761544, "grad_norm": 0.04798191785812378, "learning_rate": 8.262296005211721e-05, "loss": 10.3348, "step": 281 }, { "epoch": 0.03096689178059628, "grad_norm": 0.027467703446745872, "learning_rate": 8.250255489412463e-05, "loss": 10.3343, "step": 282 }, { "epoch": 0.031076703453577114, "grad_norm": 0.031064271926879883, "learning_rate": 8.238182243576512e-05, "loss": 10.3367, "step": 283 }, { "epoch": 0.031186515126557953, "grad_norm": 0.03701472282409668, "learning_rate": 8.226076389281316e-05, "loss": 10.3296, "step": 284 }, { "epoch": 0.03129632679953879, "grad_norm": 0.02463219314813614, "learning_rate": 8.213938048432697e-05, "loss": 10.3367, "step": 285 }, { "epoch": 0.03140613847251963, "grad_norm": 0.043679844588041306, "learning_rate": 8.201767343263612e-05, "loss": 10.3264, "step": 286 }, { "epoch": 0.03151595014550047, "grad_norm": 0.03478361666202545, "learning_rate": 8.189564396332928e-05, "loss": 10.3401, "step": 287 }, { "epoch": 0.031625761818481306, "grad_norm": 0.03689027205109596, "learning_rate": 8.177329330524182e-05, "loss": 10.3317, "step": 288 }, { "epoch": 0.031735573491462145, "grad_norm": 0.027582010254263878, "learning_rate": 8.165062269044353e-05, "loss": 10.3406, "step": 289 }, { "epoch": 0.03184538516444298, "grad_norm": 0.027584658935666084, "learning_rate": 8.152763335422613e-05, "loss": 10.3352, "step": 290 }, { "epoch": 0.031955196837423816, "grad_norm": 0.027357645332813263, "learning_rate": 8.140432653509089e-05, "loss": 10.3323, "step": 291 }, { "epoch": 0.032065008510404655, "grad_norm": 0.04812498390674591, "learning_rate": 8.128070347473609e-05, "loss": 10.3334, "step": 292 }, { "epoch": 0.032174820183385494, "grad_norm": 0.03677697852253914, "learning_rate": 8.115676541804456e-05, "loss": 10.3326, "step": 293 }, { "epoch": 0.03228463185636633, "grad_norm": 0.031057769432663918, "learning_rate": 8.103251361307119e-05, "loss": 10.333, "step": 294 }, { "epoch": 0.03239444352934717, "grad_norm": 0.037131551653146744, "learning_rate": 8.090794931103026e-05, "loss": 10.327, "step": 295 }, { "epoch": 0.032504255202328004, "grad_norm": 0.028850682079792023, "learning_rate": 8.07830737662829e-05, "loss": 10.3375, "step": 296 }, { "epoch": 0.03261406687530884, "grad_norm": 0.03361869603395462, "learning_rate": 8.065788823632451e-05, "loss": 10.3332, "step": 297 }, { "epoch": 0.03272387854828968, "grad_norm": 0.03031431883573532, "learning_rate": 8.053239398177191e-05, "loss": 10.3368, "step": 298 }, { "epoch": 0.03283369022127052, "grad_norm": 0.029790762811899185, "learning_rate": 8.04065922663509e-05, "loss": 10.3333, "step": 299 }, { "epoch": 0.03294350189425136, "grad_norm": 0.04281746223568916, "learning_rate": 8.028048435688333e-05, "loss": 10.3422, "step": 300 }, { "epoch": 0.0330533135672322, "grad_norm": 0.04399149492383003, "learning_rate": 8.015407152327448e-05, "loss": 10.3316, "step": 301 }, { "epoch": 0.03316312524021303, "grad_norm": 0.02453172393143177, "learning_rate": 8.002735503850016e-05, "loss": 10.332, "step": 302 }, { "epoch": 0.03327293691319387, "grad_norm": 0.04740273579955101, "learning_rate": 7.990033617859396e-05, "loss": 10.3332, "step": 303 }, { "epoch": 0.03338274858617471, "grad_norm": 0.03436451405286789, "learning_rate": 7.97730162226344e-05, "loss": 10.3357, "step": 304 }, { "epoch": 0.03349256025915555, "grad_norm": 0.026904281228780746, "learning_rate": 7.964539645273204e-05, "loss": 10.3396, "step": 305 }, { "epoch": 0.03360237193213639, "grad_norm": 0.03356492146849632, "learning_rate": 7.95174781540165e-05, "loss": 10.3312, "step": 306 }, { "epoch": 0.03371218360511723, "grad_norm": 0.03133975341916084, "learning_rate": 7.938926261462366e-05, "loss": 10.3385, "step": 307 }, { "epoch": 0.03382199527809806, "grad_norm": 0.032852113246917725, "learning_rate": 7.926075112568259e-05, "loss": 10.3316, "step": 308 }, { "epoch": 0.0339318069510789, "grad_norm": 0.035984572023153305, "learning_rate": 7.913194498130252e-05, "loss": 10.3326, "step": 309 }, { "epoch": 0.03404161862405974, "grad_norm": 0.032500043511390686, "learning_rate": 7.900284547855991e-05, "loss": 10.3387, "step": 310 }, { "epoch": 0.034151430297040576, "grad_norm": 0.03251442685723305, "learning_rate": 7.887345391748533e-05, "loss": 10.332, "step": 311 }, { "epoch": 0.034261241970021415, "grad_norm": 0.03657425194978714, "learning_rate": 7.874377160105036e-05, "loss": 10.3351, "step": 312 }, { "epoch": 0.034371053643002254, "grad_norm": 0.03712441399693489, "learning_rate": 7.861379983515449e-05, "loss": 10.3298, "step": 313 }, { "epoch": 0.034480865315983086, "grad_norm": 0.02978348359465599, "learning_rate": 7.848353992861195e-05, "loss": 10.3341, "step": 314 }, { "epoch": 0.034590676988963925, "grad_norm": 0.042129017412662506, "learning_rate": 7.835299319313853e-05, "loss": 10.3266, "step": 315 }, { "epoch": 0.034700488661944764, "grad_norm": 0.03364431485533714, "learning_rate": 7.822216094333847e-05, "loss": 10.3254, "step": 316 }, { "epoch": 0.034810300334925603, "grad_norm": 0.04211954399943352, "learning_rate": 7.809104449669101e-05, "loss": 10.3264, "step": 317 }, { "epoch": 0.03492011200790644, "grad_norm": 0.03056887537240982, "learning_rate": 7.795964517353735e-05, "loss": 10.3309, "step": 318 }, { "epoch": 0.03502992368088728, "grad_norm": 0.03751233592629433, "learning_rate": 7.78279642970672e-05, "loss": 10.3309, "step": 319 }, { "epoch": 0.035139735353868114, "grad_norm": 0.03249451890587807, "learning_rate": 7.769600319330552e-05, "loss": 10.3347, "step": 320 }, { "epoch": 0.03524954702684895, "grad_norm": 0.028485741466283798, "learning_rate": 7.756376319109917e-05, "loss": 10.3275, "step": 321 }, { "epoch": 0.03535935869982979, "grad_norm": 0.031526558101177216, "learning_rate": 7.74312456221035e-05, "loss": 10.3349, "step": 322 }, { "epoch": 0.03546917037281063, "grad_norm": 0.039903029799461365, "learning_rate": 7.729845182076895e-05, "loss": 10.3325, "step": 323 }, { "epoch": 0.03557898204579147, "grad_norm": 0.0326211117208004, "learning_rate": 7.716538312432766e-05, "loss": 10.3307, "step": 324 }, { "epoch": 0.03568879371877231, "grad_norm": 0.027076033875346184, "learning_rate": 7.703204087277988e-05, "loss": 10.3329, "step": 325 }, { "epoch": 0.03579860539175314, "grad_norm": 0.03494537994265556, "learning_rate": 7.689842640888063e-05, "loss": 10.3355, "step": 326 }, { "epoch": 0.03590841706473398, "grad_norm": 0.03274150937795639, "learning_rate": 7.676454107812607e-05, "loss": 10.3289, "step": 327 }, { "epoch": 0.03601822873771482, "grad_norm": 0.040688082575798035, "learning_rate": 7.663038622873999e-05, "loss": 10.3352, "step": 328 }, { "epoch": 0.03612804041069566, "grad_norm": 0.04242965579032898, "learning_rate": 7.649596321166024e-05, "loss": 10.3332, "step": 329 }, { "epoch": 0.0362378520836765, "grad_norm": 0.03959864377975464, "learning_rate": 7.636127338052512e-05, "loss": 10.3458, "step": 330 }, { "epoch": 0.03634766375665733, "grad_norm": 0.031401682645082474, "learning_rate": 7.622631809165973e-05, "loss": 10.3289, "step": 331 }, { "epoch": 0.03645747542963817, "grad_norm": 0.03373299911618233, "learning_rate": 7.60910987040623e-05, "loss": 10.3334, "step": 332 }, { "epoch": 0.03656728710261901, "grad_norm": 0.03774107247591019, "learning_rate": 7.595561657939061e-05, "loss": 10.3311, "step": 333 }, { "epoch": 0.036677098775599846, "grad_norm": 0.03841663897037506, "learning_rate": 7.58198730819481e-05, "loss": 10.3272, "step": 334 }, { "epoch": 0.036786910448580686, "grad_norm": 0.030634526163339615, "learning_rate": 7.568386957867033e-05, "loss": 10.3323, "step": 335 }, { "epoch": 0.036896722121561525, "grad_norm": 0.0379607118666172, "learning_rate": 7.554760743911103e-05, "loss": 10.3264, "step": 336 }, { "epoch": 0.03700653379454236, "grad_norm": 0.03495849668979645, "learning_rate": 7.541108803542846e-05, "loss": 10.3324, "step": 337 }, { "epoch": 0.037116345467523196, "grad_norm": 0.04587521031498909, "learning_rate": 7.52743127423715e-05, "loss": 10.3278, "step": 338 }, { "epoch": 0.037226157140504035, "grad_norm": 0.04321592301130295, "learning_rate": 7.51372829372658e-05, "loss": 10.3267, "step": 339 }, { "epoch": 0.037335968813484874, "grad_norm": 0.035660337656736374, "learning_rate": 7.500000000000001e-05, "loss": 10.3341, "step": 340 }, { "epoch": 0.03744578048646571, "grad_norm": 0.03724941238760948, "learning_rate": 7.486246531301177e-05, "loss": 10.3323, "step": 341 }, { "epoch": 0.03755559215944655, "grad_norm": 0.03373545780777931, "learning_rate": 7.472468026127385e-05, "loss": 10.332, "step": 342 }, { "epoch": 0.037665403832427384, "grad_norm": 0.04065243899822235, "learning_rate": 7.45866462322802e-05, "loss": 10.3321, "step": 343 }, { "epoch": 0.03777521550540822, "grad_norm": 0.03922753781080246, "learning_rate": 7.444836461603195e-05, "loss": 10.3263, "step": 344 }, { "epoch": 0.03788502717838906, "grad_norm": 0.03144107013940811, "learning_rate": 7.430983680502344e-05, "loss": 10.3253, "step": 345 }, { "epoch": 0.0379948388513699, "grad_norm": 0.035334512591362, "learning_rate": 7.417106419422819e-05, "loss": 10.3362, "step": 346 }, { "epoch": 0.03810465052435074, "grad_norm": 0.03799709677696228, "learning_rate": 7.403204818108487e-05, "loss": 10.3353, "step": 347 }, { "epoch": 0.03821446219733158, "grad_norm": 0.032984789460897446, "learning_rate": 7.389279016548316e-05, "loss": 10.3315, "step": 348 }, { "epoch": 0.03832427387031241, "grad_norm": 0.04229612648487091, "learning_rate": 7.375329154974975e-05, "loss": 10.3272, "step": 349 }, { "epoch": 0.03843408554329325, "grad_norm": 0.03587073087692261, "learning_rate": 7.361355373863414e-05, "loss": 10.3314, "step": 350 }, { "epoch": 0.03854389721627409, "grad_norm": 0.03785446286201477, "learning_rate": 7.347357813929454e-05, "loss": 10.3296, "step": 351 }, { "epoch": 0.03865370888925493, "grad_norm": 0.03587425872683525, "learning_rate": 7.333336616128369e-05, "loss": 10.3142, "step": 352 }, { "epoch": 0.03876352056223577, "grad_norm": 0.03285554423928261, "learning_rate": 7.319291921653464e-05, "loss": 10.327, "step": 353 }, { "epoch": 0.03887333223521661, "grad_norm": 0.03733392432332039, "learning_rate": 7.305223871934657e-05, "loss": 10.3342, "step": 354 }, { "epoch": 0.03898314390819744, "grad_norm": 0.03083522990345955, "learning_rate": 7.291132608637052e-05, "loss": 10.3335, "step": 355 }, { "epoch": 0.03909295558117828, "grad_norm": 0.02849193662405014, "learning_rate": 7.277018273659517e-05, "loss": 10.3335, "step": 356 }, { "epoch": 0.03920276725415912, "grad_norm": 0.03563259541988373, "learning_rate": 7.262881009133242e-05, "loss": 10.3325, "step": 357 }, { "epoch": 0.039312578927139956, "grad_norm": 0.039791807532310486, "learning_rate": 7.24872095742033e-05, "loss": 10.3242, "step": 358 }, { "epoch": 0.039422390600120795, "grad_norm": 0.03888785094022751, "learning_rate": 7.23453826111234e-05, "loss": 10.3187, "step": 359 }, { "epoch": 0.039532202273101634, "grad_norm": 0.03819039463996887, "learning_rate": 7.220333063028872e-05, "loss": 10.3228, "step": 360 }, { "epoch": 0.039642013946082466, "grad_norm": 0.03698824718594551, "learning_rate": 7.206105506216106e-05, "loss": 10.3289, "step": 361 }, { "epoch": 0.039751825619063305, "grad_norm": 0.044339802116155624, "learning_rate": 7.191855733945387e-05, "loss": 10.3127, "step": 362 }, { "epoch": 0.039861637292044144, "grad_norm": 0.03806031122803688, "learning_rate": 7.177583889711762e-05, "loss": 10.3325, "step": 363 }, { "epoch": 0.03997144896502498, "grad_norm": 0.02650645188987255, "learning_rate": 7.163290117232542e-05, "loss": 10.3307, "step": 364 }, { "epoch": 0.04008126063800582, "grad_norm": 0.03684021905064583, "learning_rate": 7.148974560445859e-05, "loss": 10.3256, "step": 365 }, { "epoch": 0.04019107231098666, "grad_norm": 0.02676587738096714, "learning_rate": 7.13463736350921e-05, "loss": 10.3333, "step": 366 }, { "epoch": 0.04030088398396749, "grad_norm": 0.02446620538830757, "learning_rate": 7.120278670798009e-05, "loss": 10.3314, "step": 367 }, { "epoch": 0.04041069565694833, "grad_norm": 0.03325483202934265, "learning_rate": 7.105898626904134e-05, "loss": 10.3293, "step": 368 }, { "epoch": 0.04052050732992917, "grad_norm": 0.03116060234606266, "learning_rate": 7.091497376634464e-05, "loss": 10.3233, "step": 369 }, { "epoch": 0.04063031900291001, "grad_norm": 0.02906019799411297, "learning_rate": 7.077075065009433e-05, "loss": 10.3276, "step": 370 }, { "epoch": 0.04074013067589085, "grad_norm": 0.02984323725104332, "learning_rate": 7.062631837261557e-05, "loss": 10.3289, "step": 371 }, { "epoch": 0.04084994234887168, "grad_norm": 0.03829892724752426, "learning_rate": 7.048167838833977e-05, "loss": 10.3313, "step": 372 }, { "epoch": 0.04095975402185252, "grad_norm": 0.03572454676032066, "learning_rate": 7.033683215379002e-05, "loss": 10.3271, "step": 373 }, { "epoch": 0.04106956569483336, "grad_norm": 0.03328411653637886, "learning_rate": 7.019178112756624e-05, "loss": 10.3338, "step": 374 }, { "epoch": 0.0411793773678142, "grad_norm": 0.04072040319442749, "learning_rate": 7.004652677033068e-05, "loss": 10.3246, "step": 375 }, { "epoch": 0.04128918904079504, "grad_norm": 0.03667178004980087, "learning_rate": 6.990107054479312e-05, "loss": 10.32, "step": 376 }, { "epoch": 0.04139900071377588, "grad_norm": 0.0344451405107975, "learning_rate": 6.97554139156961e-05, "loss": 10.3214, "step": 377 }, { "epoch": 0.04150881238675671, "grad_norm": 0.03454028069972992, "learning_rate": 6.960955834980028e-05, "loss": 10.3291, "step": 378 }, { "epoch": 0.04161862405973755, "grad_norm": 0.03503163531422615, "learning_rate": 6.946350531586959e-05, "loss": 10.3234, "step": 379 }, { "epoch": 0.04172843573271839, "grad_norm": 0.03392709419131279, "learning_rate": 6.931725628465643e-05, "loss": 10.3242, "step": 380 }, { "epoch": 0.041838247405699226, "grad_norm": 0.038634542375802994, "learning_rate": 6.917081272888697e-05, "loss": 10.3302, "step": 381 }, { "epoch": 0.041948059078680065, "grad_norm": 0.032439880073070526, "learning_rate": 6.902417612324615e-05, "loss": 10.3313, "step": 382 }, { "epoch": 0.042057870751660904, "grad_norm": 0.03669724985957146, "learning_rate": 6.8877347944363e-05, "loss": 10.3306, "step": 383 }, { "epoch": 0.042167682424641736, "grad_norm": 0.03968527540564537, "learning_rate": 6.873032967079561e-05, "loss": 10.3271, "step": 384 }, { "epoch": 0.042277494097622575, "grad_norm": 0.028942497447133064, "learning_rate": 6.858312278301637e-05, "loss": 10.329, "step": 385 }, { "epoch": 0.042387305770603415, "grad_norm": 0.045190274715423584, "learning_rate": 6.843572876339705e-05, "loss": 10.3262, "step": 386 }, { "epoch": 0.042497117443584254, "grad_norm": 0.04092245548963547, "learning_rate": 6.828814909619373e-05, "loss": 10.3284, "step": 387 }, { "epoch": 0.04260692911656509, "grad_norm": 0.03187980130314827, "learning_rate": 6.814038526753205e-05, "loss": 10.323, "step": 388 }, { "epoch": 0.04271674078954593, "grad_norm": 0.029777824878692627, "learning_rate": 6.799243876539212e-05, "loss": 10.3268, "step": 389 }, { "epoch": 0.042826552462526764, "grad_norm": 0.024051783606410027, "learning_rate": 6.784431107959359e-05, "loss": 10.3235, "step": 390 }, { "epoch": 0.0429363641355076, "grad_norm": 0.033297549933195114, "learning_rate": 6.769600370178059e-05, "loss": 10.3263, "step": 391 }, { "epoch": 0.04304617580848844, "grad_norm": 0.029994986951351166, "learning_rate": 6.754751812540679e-05, "loss": 10.3238, "step": 392 }, { "epoch": 0.04315598748146928, "grad_norm": 0.041234225034713745, "learning_rate": 6.739885584572026e-05, "loss": 10.327, "step": 393 }, { "epoch": 0.04326579915445012, "grad_norm": 0.026010913774371147, "learning_rate": 6.725001835974853e-05, "loss": 10.3229, "step": 394 }, { "epoch": 0.04337561082743096, "grad_norm": 0.031098006293177605, "learning_rate": 6.710100716628344e-05, "loss": 10.3285, "step": 395 }, { "epoch": 0.04348542250041179, "grad_norm": 0.035955894738435745, "learning_rate": 6.695182376586603e-05, "loss": 10.3236, "step": 396 }, { "epoch": 0.04359523417339263, "grad_norm": 0.039672642946243286, "learning_rate": 6.680246966077151e-05, "loss": 10.3293, "step": 397 }, { "epoch": 0.04370504584637347, "grad_norm": 0.04517965018749237, "learning_rate": 6.665294635499404e-05, "loss": 10.3235, "step": 398 }, { "epoch": 0.04381485751935431, "grad_norm": 0.037918124347925186, "learning_rate": 6.650325535423167e-05, "loss": 10.3183, "step": 399 }, { "epoch": 0.04392466919233515, "grad_norm": 0.04339971765875816, "learning_rate": 6.635339816587109e-05, "loss": 10.3207, "step": 400 }, { "epoch": 0.044034480865315986, "grad_norm": 0.04171037673950195, "learning_rate": 6.620337629897254e-05, "loss": 10.3291, "step": 401 }, { "epoch": 0.04414429253829682, "grad_norm": 0.03745467588305473, "learning_rate": 6.605319126425454e-05, "loss": 10.3295, "step": 402 }, { "epoch": 0.04425410421127766, "grad_norm": 0.0313887782394886, "learning_rate": 6.590284457407876e-05, "loss": 10.3293, "step": 403 }, { "epoch": 0.0443639158842585, "grad_norm": 0.0354151651263237, "learning_rate": 6.575233774243465e-05, "loss": 10.3277, "step": 404 }, { "epoch": 0.044473727557239336, "grad_norm": 0.04275154694914818, "learning_rate": 6.560167228492436e-05, "loss": 10.3295, "step": 405 }, { "epoch": 0.044583539230220175, "grad_norm": 0.030038248747587204, "learning_rate": 6.545084971874738e-05, "loss": 10.3292, "step": 406 }, { "epoch": 0.044693350903201014, "grad_norm": 0.02965056523680687, "learning_rate": 6.529987156268526e-05, "loss": 10.3298, "step": 407 }, { "epoch": 0.044803162576181846, "grad_norm": 0.036462992429733276, "learning_rate": 6.514873933708638e-05, "loss": 10.3242, "step": 408 }, { "epoch": 0.044912974249162685, "grad_norm": 0.039785418659448624, "learning_rate": 6.499745456385054e-05, "loss": 10.3308, "step": 409 }, { "epoch": 0.045022785922143524, "grad_norm": 0.029456205666065216, "learning_rate": 6.484601876641375e-05, "loss": 10.3215, "step": 410 }, { "epoch": 0.04513259759512436, "grad_norm": 0.0371948666870594, "learning_rate": 6.46944334697328e-05, "loss": 10.3205, "step": 411 }, { "epoch": 0.0452424092681052, "grad_norm": 0.03276629000902176, "learning_rate": 6.454270020026995e-05, "loss": 10.3267, "step": 412 }, { "epoch": 0.045352220941086034, "grad_norm": 0.03515666350722313, "learning_rate": 6.439082048597755e-05, "loss": 10.3245, "step": 413 }, { "epoch": 0.04546203261406687, "grad_norm": 0.03750680014491081, "learning_rate": 6.423879585628261e-05, "loss": 10.3271, "step": 414 }, { "epoch": 0.04557184428704771, "grad_norm": 0.03401637077331543, "learning_rate": 6.408662784207149e-05, "loss": 10.3203, "step": 415 }, { "epoch": 0.04568165596002855, "grad_norm": 0.03510723635554314, "learning_rate": 6.39343179756744e-05, "loss": 10.3218, "step": 416 }, { "epoch": 0.04579146763300939, "grad_norm": 0.04035051912069321, "learning_rate": 6.378186779084995e-05, "loss": 10.3218, "step": 417 }, { "epoch": 0.04590127930599023, "grad_norm": 0.026958249509334564, "learning_rate": 6.36292788227699e-05, "loss": 10.3322, "step": 418 }, { "epoch": 0.04601109097897106, "grad_norm": 0.03243269771337509, "learning_rate": 6.34765526080034e-05, "loss": 10.3211, "step": 419 }, { "epoch": 0.0461209026519519, "grad_norm": 0.03296666219830513, "learning_rate": 6.332369068450174e-05, "loss": 10.334, "step": 420 }, { "epoch": 0.04623071432493274, "grad_norm": 0.028153402730822563, "learning_rate": 6.317069459158284e-05, "loss": 10.3203, "step": 421 }, { "epoch": 0.04634052599791358, "grad_norm": 0.027847595512866974, "learning_rate": 6.30175658699156e-05, "loss": 10.3281, "step": 422 }, { "epoch": 0.04645033767089442, "grad_norm": 0.025902122259140015, "learning_rate": 6.286430606150459e-05, "loss": 10.3206, "step": 423 }, { "epoch": 0.04656014934387526, "grad_norm": 0.04112079739570618, "learning_rate": 6.271091670967436e-05, "loss": 10.3223, "step": 424 }, { "epoch": 0.04666996101685609, "grad_norm": 0.03882209584116936, "learning_rate": 6.255739935905396e-05, "loss": 10.3232, "step": 425 }, { "epoch": 0.04677977268983693, "grad_norm": 0.03721603751182556, "learning_rate": 6.240375555556145e-05, "loss": 10.3228, "step": 426 }, { "epoch": 0.04688958436281777, "grad_norm": 0.030858062207698822, "learning_rate": 6.22499868463882e-05, "loss": 10.3193, "step": 427 }, { "epoch": 0.046999396035798606, "grad_norm": 0.04132893308997154, "learning_rate": 6.209609477998338e-05, "loss": 10.3258, "step": 428 }, { "epoch": 0.047109207708779445, "grad_norm": 0.030822746455669403, "learning_rate": 6.194208090603844e-05, "loss": 10.3189, "step": 429 }, { "epoch": 0.047219019381760284, "grad_norm": 0.03813600167632103, "learning_rate": 6.178794677547137e-05, "loss": 10.3334, "step": 430 }, { "epoch": 0.047328831054741116, "grad_norm": 0.030673587694764137, "learning_rate": 6.163369394041111e-05, "loss": 10.3199, "step": 431 }, { "epoch": 0.047438642727721955, "grad_norm": 0.029215993359684944, "learning_rate": 6.147932395418205e-05, "loss": 10.3247, "step": 432 }, { "epoch": 0.047548454400702794, "grad_norm": 0.028485752642154694, "learning_rate": 6.132483837128823e-05, "loss": 10.3183, "step": 433 }, { "epoch": 0.04765826607368363, "grad_norm": 0.03938799723982811, "learning_rate": 6.117023874739772e-05, "loss": 10.3124, "step": 434 }, { "epoch": 0.04776807774666447, "grad_norm": 0.03493209183216095, "learning_rate": 6.1015526639327035e-05, "loss": 10.3192, "step": 435 }, { "epoch": 0.04787788941964531, "grad_norm": 0.04554183781147003, "learning_rate": 6.0860703605025395e-05, "loss": 10.3278, "step": 436 }, { "epoch": 0.047987701092626144, "grad_norm": 0.029267124831676483, "learning_rate": 6.0705771203559024e-05, "loss": 10.3284, "step": 437 }, { "epoch": 0.04809751276560698, "grad_norm": 0.03491409122943878, "learning_rate": 6.05507309950955e-05, "loss": 10.3195, "step": 438 }, { "epoch": 0.04820732443858782, "grad_norm": 0.0309711042791605, "learning_rate": 6.0395584540887963e-05, "loss": 10.3235, "step": 439 }, { "epoch": 0.04831713611156866, "grad_norm": 0.038257911801338196, "learning_rate": 6.024033340325954e-05, "loss": 10.3272, "step": 440 }, { "epoch": 0.0484269477845495, "grad_norm": 0.03563486412167549, "learning_rate": 6.008497914558744e-05, "loss": 10.329, "step": 441 }, { "epoch": 0.04853675945753034, "grad_norm": 0.026158859953284264, "learning_rate": 5.992952333228728e-05, "loss": 10.3265, "step": 442 }, { "epoch": 0.04864657113051117, "grad_norm": 0.02602277882397175, "learning_rate": 5.9773967528797414e-05, "loss": 10.323, "step": 443 }, { "epoch": 0.04875638280349201, "grad_norm": 0.03592957556247711, "learning_rate": 5.9618313301563055e-05, "loss": 10.3275, "step": 444 }, { "epoch": 0.04886619447647285, "grad_norm": 0.03412698209285736, "learning_rate": 5.946256221802051e-05, "loss": 10.325, "step": 445 }, { "epoch": 0.04897600614945369, "grad_norm": 0.04358633980154991, "learning_rate": 5.9306715846581506e-05, "loss": 10.3175, "step": 446 }, { "epoch": 0.04908581782243453, "grad_norm": 0.0327661894261837, "learning_rate": 5.915077575661723e-05, "loss": 10.327, "step": 447 }, { "epoch": 0.049195629495415366, "grad_norm": 0.03483322262763977, "learning_rate": 5.8994743518442694e-05, "loss": 10.3191, "step": 448 }, { "epoch": 0.0493054411683962, "grad_norm": 0.03744484484195709, "learning_rate": 5.8838620703300784e-05, "loss": 10.3209, "step": 449 }, { "epoch": 0.04941525284137704, "grad_norm": 0.02646210789680481, "learning_rate": 5.868240888334653e-05, "loss": 10.3271, "step": 450 }, { "epoch": 0.049525064514357876, "grad_norm": 0.035832397639751434, "learning_rate": 5.85261096316312e-05, "loss": 10.3113, "step": 451 }, { "epoch": 0.049634876187338715, "grad_norm": 0.025235386565327644, "learning_rate": 5.836972452208654e-05, "loss": 10.3237, "step": 452 }, { "epoch": 0.049744687860319554, "grad_norm": 0.03769892081618309, "learning_rate": 5.821325512950886e-05, "loss": 10.3233, "step": 453 }, { "epoch": 0.049854499533300387, "grad_norm": 0.048819780349731445, "learning_rate": 5.805670302954321e-05, "loss": 10.3302, "step": 454 }, { "epoch": 0.049964311206281226, "grad_norm": 0.02916429005563259, "learning_rate": 5.79000697986675e-05, "loss": 10.335, "step": 455 }, { "epoch": 0.050074122879262065, "grad_norm": 0.02625507116317749, "learning_rate": 5.7743357014176624e-05, "loss": 10.3283, "step": 456 }, { "epoch": 0.050183934552242904, "grad_norm": 0.02969398908317089, "learning_rate": 5.7586566254166583e-05, "loss": 10.3154, "step": 457 }, { "epoch": 0.05029374622522374, "grad_norm": 0.03696830943226814, "learning_rate": 5.7429699097518585e-05, "loss": 10.3261, "step": 458 }, { "epoch": 0.05040355789820458, "grad_norm": 0.029392560943961143, "learning_rate": 5.7272757123883184e-05, "loss": 10.3235, "step": 459 }, { "epoch": 0.050513369571185414, "grad_norm": 0.037895698100328445, "learning_rate": 5.7115741913664264e-05, "loss": 10.3163, "step": 460 }, { "epoch": 0.05062318124416625, "grad_norm": 0.03183186054229736, "learning_rate": 5.695865504800327e-05, "loss": 10.3184, "step": 461 }, { "epoch": 0.05073299291714709, "grad_norm": 0.028957149013876915, "learning_rate": 5.680149810876322e-05, "loss": 10.3216, "step": 462 }, { "epoch": 0.05084280459012793, "grad_norm": 0.035560492426157, "learning_rate": 5.664427267851271e-05, "loss": 10.3225, "step": 463 }, { "epoch": 0.05095261626310877, "grad_norm": 0.03232515975832939, "learning_rate": 5.6486980340510086e-05, "loss": 10.3239, "step": 464 }, { "epoch": 0.05106242793608961, "grad_norm": 0.034444257616996765, "learning_rate": 5.6329622678687463e-05, "loss": 10.3127, "step": 465 }, { "epoch": 0.05117223960907044, "grad_norm": 0.03910606727004051, "learning_rate": 5.617220127763474e-05, "loss": 10.3178, "step": 466 }, { "epoch": 0.05128205128205128, "grad_norm": 0.03780893608927727, "learning_rate": 5.601471772258368e-05, "loss": 10.3128, "step": 467 }, { "epoch": 0.05139186295503212, "grad_norm": 0.03370804339647293, "learning_rate": 5.585717359939192e-05, "loss": 10.3236, "step": 468 }, { "epoch": 0.05150167462801296, "grad_norm": 0.030947532504796982, "learning_rate": 5.569957049452703e-05, "loss": 10.3211, "step": 469 }, { "epoch": 0.0516114863009938, "grad_norm": 0.03483666852116585, "learning_rate": 5.5541909995050554e-05, "loss": 10.3193, "step": 470 }, { "epoch": 0.051721297973974636, "grad_norm": 0.03141488879919052, "learning_rate": 5.538419368860196e-05, "loss": 10.3217, "step": 471 }, { "epoch": 0.05183110964695547, "grad_norm": 0.04525888338685036, "learning_rate": 5.522642316338268e-05, "loss": 10.3247, "step": 472 }, { "epoch": 0.05194092131993631, "grad_norm": 0.036286257207393646, "learning_rate": 5.506860000814017e-05, "loss": 10.3292, "step": 473 }, { "epoch": 0.05205073299291715, "grad_norm": 0.03814885765314102, "learning_rate": 5.4910725812151864e-05, "loss": 10.3249, "step": 474 }, { "epoch": 0.052160544665897986, "grad_norm": 0.03615233674645424, "learning_rate": 5.475280216520913e-05, "loss": 10.3303, "step": 475 }, { "epoch": 0.052270356338878825, "grad_norm": 0.03104758821427822, "learning_rate": 5.4594830657601384e-05, "loss": 10.3242, "step": 476 }, { "epoch": 0.052380168011859664, "grad_norm": 0.0470515601336956, "learning_rate": 5.443681288009991e-05, "loss": 10.3244, "step": 477 }, { "epoch": 0.052489979684840496, "grad_norm": 0.03587877377867699, "learning_rate": 5.427875042394199e-05, "loss": 10.3178, "step": 478 }, { "epoch": 0.052599791357821335, "grad_norm": 0.03119618631899357, "learning_rate": 5.412064488081482e-05, "loss": 10.3217, "step": 479 }, { "epoch": 0.052709603030802174, "grad_norm": 0.03611556068062782, "learning_rate": 5.396249784283942e-05, "loss": 10.3264, "step": 480 }, { "epoch": 0.05281941470378301, "grad_norm": 0.03193291276693344, "learning_rate": 5.3804310902554754e-05, "loss": 10.3217, "step": 481 }, { "epoch": 0.05292922637676385, "grad_norm": 0.03180037438869476, "learning_rate": 5.364608565290155e-05, "loss": 10.312, "step": 482 }, { "epoch": 0.05303903804974469, "grad_norm": 0.03446637839078903, "learning_rate": 5.348782368720626e-05, "loss": 10.3161, "step": 483 }, { "epoch": 0.05314884972272552, "grad_norm": 0.030270066112279892, "learning_rate": 5.3329526599165204e-05, "loss": 10.3141, "step": 484 }, { "epoch": 0.05325866139570636, "grad_norm": 0.027826432138681412, "learning_rate": 5.317119598282823e-05, "loss": 10.3234, "step": 485 }, { "epoch": 0.0533684730686872, "grad_norm": 0.04317229986190796, "learning_rate": 5.301283343258293e-05, "loss": 10.3177, "step": 486 }, { "epoch": 0.05347828474166804, "grad_norm": 0.03048611991107464, "learning_rate": 5.2854440543138406e-05, "loss": 10.3108, "step": 487 }, { "epoch": 0.05358809641464888, "grad_norm": 0.040654148906469345, "learning_rate": 5.2696018909509306e-05, "loss": 10.3177, "step": 488 }, { "epoch": 0.05369790808762971, "grad_norm": 0.027290545403957367, "learning_rate": 5.253757012699972e-05, "loss": 10.3201, "step": 489 }, { "epoch": 0.05380771976061055, "grad_norm": 0.03929990530014038, "learning_rate": 5.2379095791187124e-05, "loss": 10.3214, "step": 490 }, { "epoch": 0.05391753143359139, "grad_norm": 0.03476382791996002, "learning_rate": 5.2220597497906307e-05, "loss": 10.3285, "step": 491 }, { "epoch": 0.05402734310657223, "grad_norm": 0.03593587875366211, "learning_rate": 5.2062076843233366e-05, "loss": 10.318, "step": 492 }, { "epoch": 0.05413715477955307, "grad_norm": 0.034918636083602905, "learning_rate": 5.1903535423469505e-05, "loss": 10.3265, "step": 493 }, { "epoch": 0.05424696645253391, "grad_norm": 0.03082980029284954, "learning_rate": 5.174497483512506e-05, "loss": 10.3138, "step": 494 }, { "epoch": 0.05435677812551474, "grad_norm": 0.04313899576663971, "learning_rate": 5.158639667490339e-05, "loss": 10.319, "step": 495 }, { "epoch": 0.05446658979849558, "grad_norm": 0.028895169496536255, "learning_rate": 5.142780253968481e-05, "loss": 10.3165, "step": 496 }, { "epoch": 0.05457640147147642, "grad_norm": 0.038492292165756226, "learning_rate": 5.126919402651052e-05, "loss": 10.317, "step": 497 }, { "epoch": 0.054686213144457256, "grad_norm": 0.0419144369661808, "learning_rate": 5.1110572732566475e-05, "loss": 10.3168, "step": 498 }, { "epoch": 0.054796024817438095, "grad_norm": 0.04041106998920441, "learning_rate": 5.095194025516733e-05, "loss": 10.3233, "step": 499 }, { "epoch": 0.054905836490418934, "grad_norm": 0.03487811237573624, "learning_rate": 5.0793298191740404e-05, "loss": 10.3198, "step": 500 }, { "epoch": 0.054905836490418934, "eval_loss": 10.318955421447754, "eval_runtime": 126.7187, "eval_samples_per_second": 11.735, "eval_steps_per_second": 5.871, "step": 500 }, { "epoch": 0.055015648163399766, "grad_norm": 0.03410876914858818, "learning_rate": 5.063464813980948e-05, "loss": 10.3254, "step": 501 }, { "epoch": 0.055125459836380605, "grad_norm": 0.04738672450184822, "learning_rate": 5.047599169697884e-05, "loss": 10.3248, "step": 502 }, { "epoch": 0.055235271509361444, "grad_norm": 0.03311553969979286, "learning_rate": 5.03173304609171e-05, "loss": 10.321, "step": 503 }, { "epoch": 0.05534508318234228, "grad_norm": 0.029791921377182007, "learning_rate": 5.015866602934112e-05, "loss": 10.3223, "step": 504 }, { "epoch": 0.05545489485532312, "grad_norm": 0.030381258577108383, "learning_rate": 5e-05, "loss": 10.3222, "step": 505 }, { "epoch": 0.05556470652830396, "grad_norm": 0.0378074012696743, "learning_rate": 4.984133397065889e-05, "loss": 10.3162, "step": 506 }, { "epoch": 0.055674518201284794, "grad_norm": 0.03906317427754402, "learning_rate": 4.968266953908292e-05, "loss": 10.314, "step": 507 }, { "epoch": 0.05578432987426563, "grad_norm": 0.033684585243463516, "learning_rate": 4.952400830302117e-05, "loss": 10.332, "step": 508 }, { "epoch": 0.05589414154724647, "grad_norm": 0.03505406528711319, "learning_rate": 4.9365351860190526e-05, "loss": 10.306, "step": 509 }, { "epoch": 0.05600395322022731, "grad_norm": 0.034469932317733765, "learning_rate": 4.92067018082596e-05, "loss": 10.3212, "step": 510 }, { "epoch": 0.05611376489320815, "grad_norm": 0.03993469104170799, "learning_rate": 4.9048059744832666e-05, "loss": 10.3174, "step": 511 }, { "epoch": 0.05622357656618899, "grad_norm": 0.03790479898452759, "learning_rate": 4.888942726743353e-05, "loss": 10.3236, "step": 512 }, { "epoch": 0.05633338823916982, "grad_norm": 0.025999998673796654, "learning_rate": 4.8730805973489476e-05, "loss": 10.3145, "step": 513 }, { "epoch": 0.05644319991215066, "grad_norm": 0.040338192135095596, "learning_rate": 4.85721974603152e-05, "loss": 10.3174, "step": 514 }, { "epoch": 0.0565530115851315, "grad_norm": 0.033280979841947556, "learning_rate": 4.841360332509663e-05, "loss": 10.3191, "step": 515 }, { "epoch": 0.05666282325811234, "grad_norm": 0.04504970461130142, "learning_rate": 4.825502516487497e-05, "loss": 10.3297, "step": 516 }, { "epoch": 0.05677263493109318, "grad_norm": 0.037833958864212036, "learning_rate": 4.8096464576530507e-05, "loss": 10.3151, "step": 517 }, { "epoch": 0.056882446604074016, "grad_norm": 0.028639158234000206, "learning_rate": 4.7937923156766646e-05, "loss": 10.3191, "step": 518 }, { "epoch": 0.05699225827705485, "grad_norm": 0.04431344196200371, "learning_rate": 4.77794025020937e-05, "loss": 10.3195, "step": 519 }, { "epoch": 0.05710206995003569, "grad_norm": 0.031215572729706764, "learning_rate": 4.762090420881289e-05, "loss": 10.3234, "step": 520 }, { "epoch": 0.057211881623016526, "grad_norm": 0.03430160507559776, "learning_rate": 4.7462429873000295e-05, "loss": 10.3208, "step": 521 }, { "epoch": 0.057321693295997365, "grad_norm": 0.04473254829645157, "learning_rate": 4.730398109049071e-05, "loss": 10.311, "step": 522 }, { "epoch": 0.057431504968978205, "grad_norm": 0.04297739267349243, "learning_rate": 4.71455594568616e-05, "loss": 10.3183, "step": 523 }, { "epoch": 0.057541316641959044, "grad_norm": 0.04153745248913765, "learning_rate": 4.698716656741708e-05, "loss": 10.3191, "step": 524 }, { "epoch": 0.057651128314939876, "grad_norm": 0.0311945341527462, "learning_rate": 4.6828804017171776e-05, "loss": 10.3177, "step": 525 }, { "epoch": 0.057760939987920715, "grad_norm": 0.03847968578338623, "learning_rate": 4.667047340083481e-05, "loss": 10.3191, "step": 526 }, { "epoch": 0.057870751660901554, "grad_norm": 0.039379045367240906, "learning_rate": 4.6512176312793736e-05, "loss": 10.3272, "step": 527 }, { "epoch": 0.05798056333388239, "grad_norm": 0.02749600075185299, "learning_rate": 4.635391434709847e-05, "loss": 10.3133, "step": 528 }, { "epoch": 0.05809037500686323, "grad_norm": 0.029182951897382736, "learning_rate": 4.619568909744524e-05, "loss": 10.3252, "step": 529 }, { "epoch": 0.058200186679844064, "grad_norm": 0.038201138377189636, "learning_rate": 4.603750215716057e-05, "loss": 10.3163, "step": 530 }, { "epoch": 0.0583099983528249, "grad_norm": 0.035310421139001846, "learning_rate": 4.587935511918521e-05, "loss": 10.3124, "step": 531 }, { "epoch": 0.05841981002580574, "grad_norm": 0.03495221585035324, "learning_rate": 4.5721249576058027e-05, "loss": 10.3175, "step": 532 }, { "epoch": 0.05852962169878658, "grad_norm": 0.032546330243349075, "learning_rate": 4.5563187119900104e-05, "loss": 10.3204, "step": 533 }, { "epoch": 0.05863943337176742, "grad_norm": 0.04046177119016647, "learning_rate": 4.5405169342398634e-05, "loss": 10.3216, "step": 534 }, { "epoch": 0.05874924504474826, "grad_norm": 0.035773757845163345, "learning_rate": 4.5247197834790876e-05, "loss": 10.3169, "step": 535 }, { "epoch": 0.05885905671772909, "grad_norm": 0.03923477232456207, "learning_rate": 4.508927418784815e-05, "loss": 10.3121, "step": 536 }, { "epoch": 0.05896886839070993, "grad_norm": 0.038092680275440216, "learning_rate": 4.493139999185983e-05, "loss": 10.3073, "step": 537 }, { "epoch": 0.05907868006369077, "grad_norm": 0.03591860458254814, "learning_rate": 4.477357683661734e-05, "loss": 10.3141, "step": 538 }, { "epoch": 0.05918849173667161, "grad_norm": 0.032019320875406265, "learning_rate": 4.461580631139805e-05, "loss": 10.3131, "step": 539 }, { "epoch": 0.05929830340965245, "grad_norm": 0.03614096716046333, "learning_rate": 4.445809000494946e-05, "loss": 10.3066, "step": 540 }, { "epoch": 0.05940811508263329, "grad_norm": 0.03614196926355362, "learning_rate": 4.4300429505472976e-05, "loss": 10.3307, "step": 541 }, { "epoch": 0.05951792675561412, "grad_norm": 0.03354780375957489, "learning_rate": 4.4142826400608086e-05, "loss": 10.3203, "step": 542 }, { "epoch": 0.05962773842859496, "grad_norm": 0.03033539280295372, "learning_rate": 4.398528227741633e-05, "loss": 10.3201, "step": 543 }, { "epoch": 0.0597375501015758, "grad_norm": 0.03290290758013725, "learning_rate": 4.3827798722365264e-05, "loss": 10.3181, "step": 544 }, { "epoch": 0.059847361774556636, "grad_norm": 0.05568011477589607, "learning_rate": 4.3670377321312535e-05, "loss": 10.3114, "step": 545 }, { "epoch": 0.059957173447537475, "grad_norm": 0.039659466594457626, "learning_rate": 4.351301965948991e-05, "loss": 10.3167, "step": 546 }, { "epoch": 0.060066985120518314, "grad_norm": 0.03856213763356209, "learning_rate": 4.33557273214873e-05, "loss": 10.3138, "step": 547 }, { "epoch": 0.060176796793499146, "grad_norm": 0.037546731531620026, "learning_rate": 4.3198501891236804e-05, "loss": 10.3267, "step": 548 }, { "epoch": 0.060286608466479985, "grad_norm": 0.037389788776636124, "learning_rate": 4.3041344951996746e-05, "loss": 10.3193, "step": 549 }, { "epoch": 0.060396420139460824, "grad_norm": 0.04678316041827202, "learning_rate": 4.288425808633575e-05, "loss": 10.3168, "step": 550 }, { "epoch": 0.06050623181244166, "grad_norm": 0.03719830885529518, "learning_rate": 4.272724287611684e-05, "loss": 10.3069, "step": 551 }, { "epoch": 0.0606160434854225, "grad_norm": 0.030142908915877342, "learning_rate": 4.2570300902481426e-05, "loss": 10.326, "step": 552 }, { "epoch": 0.06072585515840334, "grad_norm": 0.03186402842402458, "learning_rate": 4.241343374583343e-05, "loss": 10.3132, "step": 553 }, { "epoch": 0.06083566683138417, "grad_norm": 0.030028637498617172, "learning_rate": 4.2256642985823395e-05, "loss": 10.3255, "step": 554 }, { "epoch": 0.06094547850436501, "grad_norm": 0.03888958692550659, "learning_rate": 4.20999302013325e-05, "loss": 10.3231, "step": 555 }, { "epoch": 0.06105529017734585, "grad_norm": 0.03933922201395035, "learning_rate": 4.19432969704568e-05, "loss": 10.3206, "step": 556 }, { "epoch": 0.06116510185032669, "grad_norm": 0.030291898176074028, "learning_rate": 4.178674487049116e-05, "loss": 10.3205, "step": 557 }, { "epoch": 0.06127491352330753, "grad_norm": 0.04657311365008354, "learning_rate": 4.163027547791347e-05, "loss": 10.3188, "step": 558 }, { "epoch": 0.06138472519628837, "grad_norm": 0.03705879673361778, "learning_rate": 4.147389036836881e-05, "loss": 10.3169, "step": 559 }, { "epoch": 0.0614945368692692, "grad_norm": 0.0392281673848629, "learning_rate": 4.131759111665349e-05, "loss": 10.3157, "step": 560 }, { "epoch": 0.06160434854225004, "grad_norm": 0.03745996579527855, "learning_rate": 4.116137929669921e-05, "loss": 10.3125, "step": 561 }, { "epoch": 0.06171416021523088, "grad_norm": 0.02913571335375309, "learning_rate": 4.100525648155731e-05, "loss": 10.3165, "step": 562 }, { "epoch": 0.06182397188821172, "grad_norm": 0.03033732809126377, "learning_rate": 4.084922424338277e-05, "loss": 10.3298, "step": 563 }, { "epoch": 0.06193378356119256, "grad_norm": 0.03265180066227913, "learning_rate": 4.06932841534185e-05, "loss": 10.3178, "step": 564 }, { "epoch": 0.062043595234173396, "grad_norm": 0.036046724766492844, "learning_rate": 4.0537437781979506e-05, "loss": 10.3208, "step": 565 }, { "epoch": 0.06215340690715423, "grad_norm": 0.026036258786916733, "learning_rate": 4.038168669843697e-05, "loss": 10.3175, "step": 566 }, { "epoch": 0.06226321858013507, "grad_norm": 0.026027636602520943, "learning_rate": 4.0226032471202604e-05, "loss": 10.3206, "step": 567 }, { "epoch": 0.062373030253115906, "grad_norm": 0.03299787640571594, "learning_rate": 4.007047666771274e-05, "loss": 10.3177, "step": 568 }, { "epoch": 0.062482841926096745, "grad_norm": 0.03509662672877312, "learning_rate": 3.991502085441259e-05, "loss": 10.3257, "step": 569 }, { "epoch": 0.06259265359907758, "grad_norm": 0.03268042206764221, "learning_rate": 3.9759666596740476e-05, "loss": 10.3187, "step": 570 }, { "epoch": 0.06270246527205842, "grad_norm": 0.036411747336387634, "learning_rate": 3.960441545911204e-05, "loss": 10.3192, "step": 571 }, { "epoch": 0.06281227694503926, "grad_norm": 0.05730379745364189, "learning_rate": 3.944926900490452e-05, "loss": 10.3169, "step": 572 }, { "epoch": 0.0629220886180201, "grad_norm": 0.03837157413363457, "learning_rate": 3.929422879644099e-05, "loss": 10.3151, "step": 573 }, { "epoch": 0.06303190029100093, "grad_norm": 0.047361429780721664, "learning_rate": 3.913929639497462e-05, "loss": 10.3227, "step": 574 }, { "epoch": 0.06314171196398177, "grad_norm": 0.04197605699300766, "learning_rate": 3.898447336067297e-05, "loss": 10.3278, "step": 575 }, { "epoch": 0.06325152363696261, "grad_norm": 0.04603104665875435, "learning_rate": 3.882976125260229e-05, "loss": 10.3204, "step": 576 }, { "epoch": 0.06336133530994345, "grad_norm": 0.028033215552568436, "learning_rate": 3.8675161628711776e-05, "loss": 10.3153, "step": 577 }, { "epoch": 0.06347114698292429, "grad_norm": 0.03791102021932602, "learning_rate": 3.852067604581794e-05, "loss": 10.3199, "step": 578 }, { "epoch": 0.06358095865590513, "grad_norm": 0.04788900539278984, "learning_rate": 3.836630605958888e-05, "loss": 10.3185, "step": 579 }, { "epoch": 0.06369077032888595, "grad_norm": 0.030448194593191147, "learning_rate": 3.821205322452863e-05, "loss": 10.3183, "step": 580 }, { "epoch": 0.06380058200186679, "grad_norm": 0.03152129799127579, "learning_rate": 3.8057919093961553e-05, "loss": 10.325, "step": 581 }, { "epoch": 0.06391039367484763, "grad_norm": 0.035265106707811356, "learning_rate": 3.790390522001662e-05, "loss": 10.3314, "step": 582 }, { "epoch": 0.06402020534782847, "grad_norm": 0.040993936359882355, "learning_rate": 3.775001315361183e-05, "loss": 10.3221, "step": 583 }, { "epoch": 0.06413001702080931, "grad_norm": 0.05848237872123718, "learning_rate": 3.759624444443858e-05, "loss": 10.3188, "step": 584 }, { "epoch": 0.06423982869379015, "grad_norm": 0.042362719774246216, "learning_rate": 3.744260064094604e-05, "loss": 10.3205, "step": 585 }, { "epoch": 0.06434964036677099, "grad_norm": 0.031695008277893066, "learning_rate": 3.728908329032567e-05, "loss": 10.3269, "step": 586 }, { "epoch": 0.06445945203975183, "grad_norm": 0.0336785726249218, "learning_rate": 3.713569393849543e-05, "loss": 10.3177, "step": 587 }, { "epoch": 0.06456926371273267, "grad_norm": 0.035682737827301025, "learning_rate": 3.69824341300844e-05, "loss": 10.3147, "step": 588 }, { "epoch": 0.0646790753857135, "grad_norm": 0.03407788276672363, "learning_rate": 3.6829305408417166e-05, "loss": 10.328, "step": 589 }, { "epoch": 0.06478888705869434, "grad_norm": 0.039640314877033234, "learning_rate": 3.6676309315498256e-05, "loss": 10.324, "step": 590 }, { "epoch": 0.06489869873167518, "grad_norm": 0.047250282019376755, "learning_rate": 3.6523447391996614e-05, "loss": 10.3116, "step": 591 }, { "epoch": 0.06500851040465601, "grad_norm": 0.038214169442653656, "learning_rate": 3.6370721177230116e-05, "loss": 10.3118, "step": 592 }, { "epoch": 0.06511832207763685, "grad_norm": 0.03506159037351608, "learning_rate": 3.6218132209150045e-05, "loss": 10.3242, "step": 593 }, { "epoch": 0.06522813375061769, "grad_norm": 0.03700363263487816, "learning_rate": 3.606568202432562e-05, "loss": 10.3138, "step": 594 }, { "epoch": 0.06533794542359853, "grad_norm": 0.03475815802812576, "learning_rate": 3.591337215792852e-05, "loss": 10.3163, "step": 595 }, { "epoch": 0.06544775709657936, "grad_norm": 0.0341210775077343, "learning_rate": 3.5761204143717385e-05, "loss": 10.3262, "step": 596 }, { "epoch": 0.0655575687695602, "grad_norm": 0.04840511083602905, "learning_rate": 3.560917951402245e-05, "loss": 10.3188, "step": 597 }, { "epoch": 0.06566738044254104, "grad_norm": 0.03449239954352379, "learning_rate": 3.545729979973005e-05, "loss": 10.3226, "step": 598 }, { "epoch": 0.06577719211552188, "grad_norm": 0.03965664282441139, "learning_rate": 3.530556653026721e-05, "loss": 10.3202, "step": 599 }, { "epoch": 0.06588700378850272, "grad_norm": 0.029519766569137573, "learning_rate": 3.515398123358627e-05, "loss": 10.3089, "step": 600 }, { "epoch": 0.06599681546148356, "grad_norm": 0.0381334163248539, "learning_rate": 3.5002545436149474e-05, "loss": 10.3148, "step": 601 }, { "epoch": 0.0661066271344644, "grad_norm": 0.041288670152425766, "learning_rate": 3.485126066291364e-05, "loss": 10.3158, "step": 602 }, { "epoch": 0.06621643880744524, "grad_norm": 0.026856929063796997, "learning_rate": 3.470012843731476e-05, "loss": 10.3092, "step": 603 }, { "epoch": 0.06632625048042606, "grad_norm": 0.035839807242155075, "learning_rate": 3.4549150281252636e-05, "loss": 10.3276, "step": 604 }, { "epoch": 0.0664360621534069, "grad_norm": 0.04341225326061249, "learning_rate": 3.439832771507565e-05, "loss": 10.3243, "step": 605 }, { "epoch": 0.06654587382638774, "grad_norm": 0.03989730402827263, "learning_rate": 3.424766225756537e-05, "loss": 10.3178, "step": 606 }, { "epoch": 0.06665568549936858, "grad_norm": 0.031058041378855705, "learning_rate": 3.4097155425921254e-05, "loss": 10.3187, "step": 607 }, { "epoch": 0.06676549717234942, "grad_norm": 0.04681367799639702, "learning_rate": 3.394680873574546e-05, "loss": 10.3176, "step": 608 }, { "epoch": 0.06687530884533026, "grad_norm": 0.039223261177539825, "learning_rate": 3.3796623701027476e-05, "loss": 10.3141, "step": 609 }, { "epoch": 0.0669851205183111, "grad_norm": 0.037996046245098114, "learning_rate": 3.364660183412892e-05, "loss": 10.3196, "step": 610 }, { "epoch": 0.06709493219129194, "grad_norm": 0.04476340860128403, "learning_rate": 3.349674464576834e-05, "loss": 10.3166, "step": 611 }, { "epoch": 0.06720474386427278, "grad_norm": 0.038603756576776505, "learning_rate": 3.334705364500596e-05, "loss": 10.3207, "step": 612 }, { "epoch": 0.06731455553725361, "grad_norm": 0.05021185800433159, "learning_rate": 3.3197530339228487e-05, "loss": 10.3092, "step": 613 }, { "epoch": 0.06742436721023445, "grad_norm": 0.03497612476348877, "learning_rate": 3.304817623413397e-05, "loss": 10.315, "step": 614 }, { "epoch": 0.06753417888321528, "grad_norm": 0.025148971006274223, "learning_rate": 3.289899283371657e-05, "loss": 10.3156, "step": 615 }, { "epoch": 0.06764399055619612, "grad_norm": 0.043258484452962875, "learning_rate": 3.274998164025148e-05, "loss": 10.3091, "step": 616 }, { "epoch": 0.06775380222917696, "grad_norm": 0.039028916507959366, "learning_rate": 3.260114415427975e-05, "loss": 10.308, "step": 617 }, { "epoch": 0.0678636139021578, "grad_norm": 0.023269733414053917, "learning_rate": 3.2452481874593234e-05, "loss": 10.3169, "step": 618 }, { "epoch": 0.06797342557513864, "grad_norm": 0.032692115753889084, "learning_rate": 3.230399629821942e-05, "loss": 10.3187, "step": 619 }, { "epoch": 0.06808323724811947, "grad_norm": 0.03593998774886131, "learning_rate": 3.215568892040641e-05, "loss": 10.3258, "step": 620 }, { "epoch": 0.06819304892110031, "grad_norm": 0.048697151243686676, "learning_rate": 3.200756123460788e-05, "loss": 10.3091, "step": 621 }, { "epoch": 0.06830286059408115, "grad_norm": 0.049541596323251724, "learning_rate": 3.1859614732467954e-05, "loss": 10.3065, "step": 622 }, { "epoch": 0.06841267226706199, "grad_norm": 0.039022549986839294, "learning_rate": 3.171185090380628e-05, "loss": 10.3241, "step": 623 }, { "epoch": 0.06852248394004283, "grad_norm": 0.04879970848560333, "learning_rate": 3.156427123660297e-05, "loss": 10.3167, "step": 624 }, { "epoch": 0.06863229561302367, "grad_norm": 0.030641546472907066, "learning_rate": 3.141687721698363e-05, "loss": 10.3144, "step": 625 }, { "epoch": 0.06874210728600451, "grad_norm": 0.03837910294532776, "learning_rate": 3.12696703292044e-05, "loss": 10.3199, "step": 626 }, { "epoch": 0.06885191895898533, "grad_norm": 0.0463176965713501, "learning_rate": 3.1122652055637015e-05, "loss": 10.3281, "step": 627 }, { "epoch": 0.06896173063196617, "grad_norm": 0.02846435457468033, "learning_rate": 3.097582387675385e-05, "loss": 10.3113, "step": 628 }, { "epoch": 0.06907154230494701, "grad_norm": 0.038684189319610596, "learning_rate": 3.082918727111304e-05, "loss": 10.3169, "step": 629 }, { "epoch": 0.06918135397792785, "grad_norm": 0.034591104835271835, "learning_rate": 3.0682743715343564e-05, "loss": 10.3062, "step": 630 }, { "epoch": 0.06929116565090869, "grad_norm": 0.04411185905337334, "learning_rate": 3.053649468413043e-05, "loss": 10.3221, "step": 631 }, { "epoch": 0.06940097732388953, "grad_norm": 0.03224635869264603, "learning_rate": 3.0390441650199724e-05, "loss": 10.3193, "step": 632 }, { "epoch": 0.06951078899687037, "grad_norm": 0.03653174266219139, "learning_rate": 3.0244586084303905e-05, "loss": 10.3135, "step": 633 }, { "epoch": 0.06962060066985121, "grad_norm": 0.04843145236372948, "learning_rate": 3.0098929455206904e-05, "loss": 10.3105, "step": 634 }, { "epoch": 0.06973041234283205, "grad_norm": 0.03295741230249405, "learning_rate": 2.9953473229669328e-05, "loss": 10.323, "step": 635 }, { "epoch": 0.06984022401581289, "grad_norm": 0.037569474428892136, "learning_rate": 2.9808218872433767e-05, "loss": 10.3158, "step": 636 }, { "epoch": 0.06995003568879372, "grad_norm": 0.034265752881765366, "learning_rate": 2.9663167846209998e-05, "loss": 10.3198, "step": 637 }, { "epoch": 0.07005984736177456, "grad_norm": 0.03296723589301109, "learning_rate": 2.9518321611660237e-05, "loss": 10.3094, "step": 638 }, { "epoch": 0.07016965903475539, "grad_norm": 0.04063355177640915, "learning_rate": 2.9373681627384447e-05, "loss": 10.3247, "step": 639 }, { "epoch": 0.07027947070773623, "grad_norm": 0.029703807085752487, "learning_rate": 2.9229249349905684e-05, "loss": 10.3243, "step": 640 }, { "epoch": 0.07038928238071707, "grad_norm": 0.043013796210289, "learning_rate": 2.9085026233655365e-05, "loss": 10.32, "step": 641 }, { "epoch": 0.0704990940536979, "grad_norm": 0.046771373599767685, "learning_rate": 2.894101373095867e-05, "loss": 10.3242, "step": 642 }, { "epoch": 0.07060890572667874, "grad_norm": 0.05028558894991875, "learning_rate": 2.8797213292019926e-05, "loss": 10.3225, "step": 643 }, { "epoch": 0.07071871739965958, "grad_norm": 0.03822631761431694, "learning_rate": 2.8653626364907917e-05, "loss": 10.3087, "step": 644 }, { "epoch": 0.07082852907264042, "grad_norm": 0.03862608224153519, "learning_rate": 2.851025439554142e-05, "loss": 10.3213, "step": 645 }, { "epoch": 0.07093834074562126, "grad_norm": 0.041775893419981, "learning_rate": 2.8367098827674578e-05, "loss": 10.322, "step": 646 }, { "epoch": 0.0710481524186021, "grad_norm": 0.03678379952907562, "learning_rate": 2.8224161102882397e-05, "loss": 10.3153, "step": 647 }, { "epoch": 0.07115796409158294, "grad_norm": 0.055228762328624725, "learning_rate": 2.8081442660546125e-05, "loss": 10.3128, "step": 648 }, { "epoch": 0.07126777576456378, "grad_norm": 0.0325370691716671, "learning_rate": 2.7938944937838923e-05, "loss": 10.3156, "step": 649 }, { "epoch": 0.07137758743754462, "grad_norm": 0.038115836679935455, "learning_rate": 2.7796669369711294e-05, "loss": 10.3214, "step": 650 }, { "epoch": 0.07148739911052544, "grad_norm": 0.036208376288414, "learning_rate": 2.7654617388876615e-05, "loss": 10.3208, "step": 651 }, { "epoch": 0.07159721078350628, "grad_norm": 0.03381425887346268, "learning_rate": 2.7512790425796718e-05, "loss": 10.3184, "step": 652 }, { "epoch": 0.07170702245648712, "grad_norm": 0.04482642188668251, "learning_rate": 2.7371189908667604e-05, "loss": 10.3278, "step": 653 }, { "epoch": 0.07181683412946796, "grad_norm": 0.042877502739429474, "learning_rate": 2.7229817263404866e-05, "loss": 10.3253, "step": 654 }, { "epoch": 0.0719266458024488, "grad_norm": 0.04468453302979469, "learning_rate": 2.708867391362948e-05, "loss": 10.3093, "step": 655 }, { "epoch": 0.07203645747542964, "grad_norm": 0.03441416472196579, "learning_rate": 2.694776128065345e-05, "loss": 10.3097, "step": 656 }, { "epoch": 0.07214626914841048, "grad_norm": 0.025696834549307823, "learning_rate": 2.6807080783465376e-05, "loss": 10.3216, "step": 657 }, { "epoch": 0.07225608082139132, "grad_norm": 0.037037041038274765, "learning_rate": 2.6666633838716314e-05, "loss": 10.3137, "step": 658 }, { "epoch": 0.07236589249437216, "grad_norm": 0.04586685448884964, "learning_rate": 2.6526421860705473e-05, "loss": 10.315, "step": 659 }, { "epoch": 0.072475704167353, "grad_norm": 0.049783892929553986, "learning_rate": 2.638644626136587e-05, "loss": 10.3142, "step": 660 }, { "epoch": 0.07258551584033383, "grad_norm": 0.04380533844232559, "learning_rate": 2.6246708450250256e-05, "loss": 10.3082, "step": 661 }, { "epoch": 0.07269532751331466, "grad_norm": 0.03231223300099373, "learning_rate": 2.6107209834516854e-05, "loss": 10.3139, "step": 662 }, { "epoch": 0.0728051391862955, "grad_norm": 0.03345828503370285, "learning_rate": 2.596795181891514e-05, "loss": 10.3191, "step": 663 }, { "epoch": 0.07291495085927634, "grad_norm": 0.04067157581448555, "learning_rate": 2.5828935805771802e-05, "loss": 10.3255, "step": 664 }, { "epoch": 0.07302476253225718, "grad_norm": 0.0350385382771492, "learning_rate": 2.5690163194976575e-05, "loss": 10.3132, "step": 665 }, { "epoch": 0.07313457420523801, "grad_norm": 0.02556205540895462, "learning_rate": 2.5551635383968065e-05, "loss": 10.3139, "step": 666 }, { "epoch": 0.07324438587821885, "grad_norm": 0.036634381860494614, "learning_rate": 2.5413353767719805e-05, "loss": 10.3069, "step": 667 }, { "epoch": 0.07335419755119969, "grad_norm": 0.04193887859582901, "learning_rate": 2.5275319738726165e-05, "loss": 10.3103, "step": 668 }, { "epoch": 0.07346400922418053, "grad_norm": 0.04867958277463913, "learning_rate": 2.513753468698826e-05, "loss": 10.3253, "step": 669 }, { "epoch": 0.07357382089716137, "grad_norm": 0.03344857320189476, "learning_rate": 2.500000000000001e-05, "loss": 10.3175, "step": 670 }, { "epoch": 0.07368363257014221, "grad_norm": 0.04245550557971001, "learning_rate": 2.486271706273421e-05, "loss": 10.3247, "step": 671 }, { "epoch": 0.07379344424312305, "grad_norm": 0.03168286383152008, "learning_rate": 2.4725687257628534e-05, "loss": 10.3145, "step": 672 }, { "epoch": 0.07390325591610389, "grad_norm": 0.03433592617511749, "learning_rate": 2.4588911964571553e-05, "loss": 10.3111, "step": 673 }, { "epoch": 0.07401306758908471, "grad_norm": 0.027093639597296715, "learning_rate": 2.4452392560888976e-05, "loss": 10.3204, "step": 674 }, { "epoch": 0.07412287926206555, "grad_norm": 0.03639523312449455, "learning_rate": 2.4316130421329697e-05, "loss": 10.3152, "step": 675 }, { "epoch": 0.07423269093504639, "grad_norm": 0.03811733424663544, "learning_rate": 2.418012691805191e-05, "loss": 10.3224, "step": 676 }, { "epoch": 0.07434250260802723, "grad_norm": 0.033562976866960526, "learning_rate": 2.4044383420609406e-05, "loss": 10.3183, "step": 677 }, { "epoch": 0.07445231428100807, "grad_norm": 0.06074458360671997, "learning_rate": 2.3908901295937713e-05, "loss": 10.3242, "step": 678 }, { "epoch": 0.07456212595398891, "grad_norm": 0.03439025580883026, "learning_rate": 2.3773681908340284e-05, "loss": 10.3216, "step": 679 }, { "epoch": 0.07467193762696975, "grad_norm": 0.042043287307024, "learning_rate": 2.363872661947488e-05, "loss": 10.3206, "step": 680 }, { "epoch": 0.07478174929995059, "grad_norm": 0.02884497120976448, "learning_rate": 2.350403678833976e-05, "loss": 10.3124, "step": 681 }, { "epoch": 0.07489156097293143, "grad_norm": 0.03978228569030762, "learning_rate": 2.336961377126001e-05, "loss": 10.3218, "step": 682 }, { "epoch": 0.07500137264591226, "grad_norm": 0.030587781220674515, "learning_rate": 2.3235458921873925e-05, "loss": 10.3172, "step": 683 }, { "epoch": 0.0751111843188931, "grad_norm": 0.04183843359351158, "learning_rate": 2.310157359111938e-05, "loss": 10.3199, "step": 684 }, { "epoch": 0.07522099599187394, "grad_norm": 0.03554993122816086, "learning_rate": 2.296795912722014e-05, "loss": 10.3174, "step": 685 }, { "epoch": 0.07533080766485477, "grad_norm": 0.03490421175956726, "learning_rate": 2.283461687567236e-05, "loss": 10.3124, "step": 686 }, { "epoch": 0.07544061933783561, "grad_norm": 0.03740306943655014, "learning_rate": 2.2701548179231048e-05, "loss": 10.3259, "step": 687 }, { "epoch": 0.07555043101081645, "grad_norm": 0.02755308896303177, "learning_rate": 2.2568754377896516e-05, "loss": 10.3189, "step": 688 }, { "epoch": 0.07566024268379729, "grad_norm": 0.03202425315976143, "learning_rate": 2.2436236808900844e-05, "loss": 10.3191, "step": 689 }, { "epoch": 0.07577005435677812, "grad_norm": 0.034231580793857574, "learning_rate": 2.2303996806694488e-05, "loss": 10.319, "step": 690 }, { "epoch": 0.07587986602975896, "grad_norm": 0.03219657391309738, "learning_rate": 2.2172035702932825e-05, "loss": 10.3142, "step": 691 }, { "epoch": 0.0759896777027398, "grad_norm": 0.04530481621623039, "learning_rate": 2.2040354826462668e-05, "loss": 10.3175, "step": 692 }, { "epoch": 0.07609948937572064, "grad_norm": 0.04149880260229111, "learning_rate": 2.1908955503308993e-05, "loss": 10.3091, "step": 693 }, { "epoch": 0.07620930104870148, "grad_norm": 0.03388355299830437, "learning_rate": 2.1777839056661554e-05, "loss": 10.3195, "step": 694 }, { "epoch": 0.07631911272168232, "grad_norm": 0.051045697182416916, "learning_rate": 2.164700680686147e-05, "loss": 10.3205, "step": 695 }, { "epoch": 0.07642892439466316, "grad_norm": 0.055720798671245575, "learning_rate": 2.1516460071388062e-05, "loss": 10.3086, "step": 696 }, { "epoch": 0.07653873606764398, "grad_norm": 0.04313961789011955, "learning_rate": 2.1386200164845526e-05, "loss": 10.3159, "step": 697 }, { "epoch": 0.07664854774062482, "grad_norm": 0.05562193691730499, "learning_rate": 2.125622839894964e-05, "loss": 10.3138, "step": 698 }, { "epoch": 0.07675835941360566, "grad_norm": 0.03812890127301216, "learning_rate": 2.1126546082514664e-05, "loss": 10.3137, "step": 699 }, { "epoch": 0.0768681710865865, "grad_norm": 0.04385654255747795, "learning_rate": 2.09971545214401e-05, "loss": 10.3094, "step": 700 }, { "epoch": 0.07697798275956734, "grad_norm": 0.02911500446498394, "learning_rate": 2.086805501869749e-05, "loss": 10.321, "step": 701 }, { "epoch": 0.07708779443254818, "grad_norm": 0.048436541110277176, "learning_rate": 2.073924887431744e-05, "loss": 10.3085, "step": 702 }, { "epoch": 0.07719760610552902, "grad_norm": 0.028855659067630768, "learning_rate": 2.061073738537635e-05, "loss": 10.3155, "step": 703 }, { "epoch": 0.07730741777850986, "grad_norm": 0.03252778202295303, "learning_rate": 2.048252184598352e-05, "loss": 10.311, "step": 704 }, { "epoch": 0.0774172294514907, "grad_norm": 0.04148361086845398, "learning_rate": 2.0354603547267985e-05, "loss": 10.3228, "step": 705 }, { "epoch": 0.07752704112447154, "grad_norm": 0.038128580898046494, "learning_rate": 2.0226983777365604e-05, "loss": 10.3106, "step": 706 }, { "epoch": 0.07763685279745237, "grad_norm": 0.028530459851026535, "learning_rate": 2.0099663821406056e-05, "loss": 10.3058, "step": 707 }, { "epoch": 0.07774666447043321, "grad_norm": 0.04369957000017166, "learning_rate": 1.9972644961499854e-05, "loss": 10.3175, "step": 708 }, { "epoch": 0.07785647614341404, "grad_norm": 0.030100587755441666, "learning_rate": 1.9845928476725524e-05, "loss": 10.3142, "step": 709 }, { "epoch": 0.07796628781639488, "grad_norm": 0.04747960716485977, "learning_rate": 1.9719515643116674e-05, "loss": 10.3127, "step": 710 }, { "epoch": 0.07807609948937572, "grad_norm": 0.03726712614297867, "learning_rate": 1.959340773364911e-05, "loss": 10.3073, "step": 711 }, { "epoch": 0.07818591116235656, "grad_norm": 0.027646781876683235, "learning_rate": 1.946760601822809e-05, "loss": 10.3214, "step": 712 }, { "epoch": 0.0782957228353374, "grad_norm": 0.047327920794487, "learning_rate": 1.9342111763675512e-05, "loss": 10.3203, "step": 713 }, { "epoch": 0.07840553450831823, "grad_norm": 0.04683876410126686, "learning_rate": 1.9216926233717085e-05, "loss": 10.3167, "step": 714 }, { "epoch": 0.07851534618129907, "grad_norm": 0.03807590901851654, "learning_rate": 1.9092050688969738e-05, "loss": 10.326, "step": 715 }, { "epoch": 0.07862515785427991, "grad_norm": 0.03731735050678253, "learning_rate": 1.8967486386928817e-05, "loss": 10.3121, "step": 716 }, { "epoch": 0.07873496952726075, "grad_norm": 0.058313701301813126, "learning_rate": 1.8843234581955442e-05, "loss": 10.3094, "step": 717 }, { "epoch": 0.07884478120024159, "grad_norm": 0.042186010628938675, "learning_rate": 1.8719296525263922e-05, "loss": 10.3155, "step": 718 }, { "epoch": 0.07895459287322243, "grad_norm": 0.043809790164232254, "learning_rate": 1.859567346490913e-05, "loss": 10.3259, "step": 719 }, { "epoch": 0.07906440454620327, "grad_norm": 0.03344713896512985, "learning_rate": 1.847236664577389e-05, "loss": 10.3147, "step": 720 }, { "epoch": 0.0791742162191841, "grad_norm": 0.035155851393938065, "learning_rate": 1.8349377309556486e-05, "loss": 10.3162, "step": 721 }, { "epoch": 0.07928402789216493, "grad_norm": 0.060719750821590424, "learning_rate": 1.8226706694758195e-05, "loss": 10.3148, "step": 722 }, { "epoch": 0.07939383956514577, "grad_norm": 0.03652290999889374, "learning_rate": 1.810435603667075e-05, "loss": 10.311, "step": 723 }, { "epoch": 0.07950365123812661, "grad_norm": 0.041981663554906845, "learning_rate": 1.7982326567363888e-05, "loss": 10.3177, "step": 724 }, { "epoch": 0.07961346291110745, "grad_norm": 0.036146800965070724, "learning_rate": 1.7860619515673033e-05, "loss": 10.3255, "step": 725 }, { "epoch": 0.07972327458408829, "grad_norm": 0.038230050355196, "learning_rate": 1.773923610718686e-05, "loss": 10.3172, "step": 726 }, { "epoch": 0.07983308625706913, "grad_norm": 0.05691583827137947, "learning_rate": 1.7618177564234905e-05, "loss": 10.3222, "step": 727 }, { "epoch": 0.07994289793004997, "grad_norm": 0.03703266382217407, "learning_rate": 1.7497445105875377e-05, "loss": 10.3047, "step": 728 }, { "epoch": 0.0800527096030308, "grad_norm": 0.03908339887857437, "learning_rate": 1.73770399478828e-05, "loss": 10.3105, "step": 729 }, { "epoch": 0.08016252127601164, "grad_norm": 0.030511919409036636, "learning_rate": 1.725696330273575e-05, "loss": 10.3129, "step": 730 }, { "epoch": 0.08027233294899248, "grad_norm": 0.04107876121997833, "learning_rate": 1.7137216379604727e-05, "loss": 10.3126, "step": 731 }, { "epoch": 0.08038214462197332, "grad_norm": 0.04407944902777672, "learning_rate": 1.7017800384339928e-05, "loss": 10.3152, "step": 732 }, { "epoch": 0.08049195629495415, "grad_norm": 0.03675924614071846, "learning_rate": 1.6898716519459074e-05, "loss": 10.3152, "step": 733 }, { "epoch": 0.08060176796793499, "grad_norm": 0.04171142354607582, "learning_rate": 1.6779965984135377e-05, "loss": 10.3044, "step": 734 }, { "epoch": 0.08071157964091583, "grad_norm": 0.029391134157776833, "learning_rate": 1.6661549974185424e-05, "loss": 10.322, "step": 735 }, { "epoch": 0.08082139131389666, "grad_norm": 0.02885211445391178, "learning_rate": 1.6543469682057106e-05, "loss": 10.3165, "step": 736 }, { "epoch": 0.0809312029868775, "grad_norm": 0.0386587493121624, "learning_rate": 1.6425726296817633e-05, "loss": 10.3261, "step": 737 }, { "epoch": 0.08104101465985834, "grad_norm": 0.04038149490952492, "learning_rate": 1.6308321004141607e-05, "loss": 10.3022, "step": 738 }, { "epoch": 0.08115082633283918, "grad_norm": 0.04831194877624512, "learning_rate": 1.619125498629904e-05, "loss": 10.3086, "step": 739 }, { "epoch": 0.08126063800582002, "grad_norm": 0.03492288663983345, "learning_rate": 1.60745294221434e-05, "loss": 10.3102, "step": 740 }, { "epoch": 0.08137044967880086, "grad_norm": 0.054558295756578445, "learning_rate": 1.595814548709983e-05, "loss": 10.3038, "step": 741 }, { "epoch": 0.0814802613517817, "grad_norm": 0.040097616612911224, "learning_rate": 1.5842104353153287e-05, "loss": 10.3186, "step": 742 }, { "epoch": 0.08159007302476254, "grad_norm": 0.03179539740085602, "learning_rate": 1.5726407188836673e-05, "loss": 10.3188, "step": 743 }, { "epoch": 0.08169988469774336, "grad_norm": 0.04013778269290924, "learning_rate": 1.5611055159219152e-05, "loss": 10.3191, "step": 744 }, { "epoch": 0.0818096963707242, "grad_norm": 0.045401014387607574, "learning_rate": 1.549604942589441e-05, "loss": 10.3101, "step": 745 }, { "epoch": 0.08191950804370504, "grad_norm": 0.039099063724279404, "learning_rate": 1.5381391146968866e-05, "loss": 10.3063, "step": 746 }, { "epoch": 0.08202931971668588, "grad_norm": 0.04024317488074303, "learning_rate": 1.526708147705013e-05, "loss": 10.321, "step": 747 }, { "epoch": 0.08213913138966672, "grad_norm": 0.03901572898030281, "learning_rate": 1.5153121567235335e-05, "loss": 10.3149, "step": 748 }, { "epoch": 0.08224894306264756, "grad_norm": 0.03517254814505577, "learning_rate": 1.5039512565099467e-05, "loss": 10.3173, "step": 749 }, { "epoch": 0.0823587547356284, "grad_norm": 0.04018811881542206, "learning_rate": 1.4926255614683932e-05, "loss": 10.3144, "step": 750 }, { "epoch": 0.0823587547356284, "eval_loss": 10.313794136047363, "eval_runtime": 126.7622, "eval_samples_per_second": 11.731, "eval_steps_per_second": 5.869, "step": 750 }, { "epoch": 0.08246856640860924, "grad_norm": 0.030414637178182602, "learning_rate": 1.481335185648498e-05, "loss": 10.317, "step": 751 }, { "epoch": 0.08257837808159008, "grad_norm": 0.028846023604273796, "learning_rate": 1.4700802427442179e-05, "loss": 10.3138, "step": 752 }, { "epoch": 0.08268818975457091, "grad_norm": 0.03306645527482033, "learning_rate": 1.458860846092705e-05, "loss": 10.3088, "step": 753 }, { "epoch": 0.08279800142755175, "grad_norm": 0.038386497646570206, "learning_rate": 1.4476771086731567e-05, "loss": 10.3158, "step": 754 }, { "epoch": 0.08290781310053259, "grad_norm": 0.03763662651181221, "learning_rate": 1.4365291431056871e-05, "loss": 10.3225, "step": 755 }, { "epoch": 0.08301762477351342, "grad_norm": 0.033651165664196014, "learning_rate": 1.4254170616501827e-05, "loss": 10.3079, "step": 756 }, { "epoch": 0.08312743644649426, "grad_norm": 0.03197680786252022, "learning_rate": 1.414340976205183e-05, "loss": 10.3195, "step": 757 }, { "epoch": 0.0832372481194751, "grad_norm": 0.040615715086460114, "learning_rate": 1.4033009983067452e-05, "loss": 10.3074, "step": 758 }, { "epoch": 0.08334705979245594, "grad_norm": 0.04317568242549896, "learning_rate": 1.3922972391273226e-05, "loss": 10.3095, "step": 759 }, { "epoch": 0.08345687146543677, "grad_norm": 0.03475901857018471, "learning_rate": 1.3813298094746491e-05, "loss": 10.3156, "step": 760 }, { "epoch": 0.08356668313841761, "grad_norm": 0.044157762080430984, "learning_rate": 1.3703988197906209e-05, "loss": 10.3152, "step": 761 }, { "epoch": 0.08367649481139845, "grad_norm": 0.03343284875154495, "learning_rate": 1.3595043801501794e-05, "loss": 10.3174, "step": 762 }, { "epoch": 0.08378630648437929, "grad_norm": 0.029324904084205627, "learning_rate": 1.3486466002602133e-05, "loss": 10.3177, "step": 763 }, { "epoch": 0.08389611815736013, "grad_norm": 0.038032740354537964, "learning_rate": 1.3378255894584463e-05, "loss": 10.3177, "step": 764 }, { "epoch": 0.08400592983034097, "grad_norm": 0.049427881836891174, "learning_rate": 1.327041456712334e-05, "loss": 10.301, "step": 765 }, { "epoch": 0.08411574150332181, "grad_norm": 0.0315837599337101, "learning_rate": 1.3162943106179749e-05, "loss": 10.3037, "step": 766 }, { "epoch": 0.08422555317630265, "grad_norm": 0.040594566613435745, "learning_rate": 1.3055842593990131e-05, "loss": 10.306, "step": 767 }, { "epoch": 0.08433536484928347, "grad_norm": 0.04508155584335327, "learning_rate": 1.2949114109055415e-05, "loss": 10.3162, "step": 768 }, { "epoch": 0.08444517652226431, "grad_norm": 0.03993818908929825, "learning_rate": 1.2842758726130283e-05, "loss": 10.3142, "step": 769 }, { "epoch": 0.08455498819524515, "grad_norm": 0.04547726735472679, "learning_rate": 1.2736777516212266e-05, "loss": 10.3145, "step": 770 }, { "epoch": 0.08466479986822599, "grad_norm": 0.03807322680950165, "learning_rate": 1.2631171546530968e-05, "loss": 10.3149, "step": 771 }, { "epoch": 0.08477461154120683, "grad_norm": 0.048404186964035034, "learning_rate": 1.2525941880537307e-05, "loss": 10.3118, "step": 772 }, { "epoch": 0.08488442321418767, "grad_norm": 0.03061523661017418, "learning_rate": 1.2421089577892869e-05, "loss": 10.3091, "step": 773 }, { "epoch": 0.08499423488716851, "grad_norm": 0.04034106433391571, "learning_rate": 1.2316615694459189e-05, "loss": 10.3071, "step": 774 }, { "epoch": 0.08510404656014935, "grad_norm": 0.034991052001714706, "learning_rate": 1.2212521282287092e-05, "loss": 10.3175, "step": 775 }, { "epoch": 0.08521385823313019, "grad_norm": 0.033823732286691666, "learning_rate": 1.2108807389606158e-05, "loss": 10.3216, "step": 776 }, { "epoch": 0.08532366990611102, "grad_norm": 0.03538206219673157, "learning_rate": 1.2005475060814159e-05, "loss": 10.3111, "step": 777 }, { "epoch": 0.08543348157909186, "grad_norm": 0.03175722807645798, "learning_rate": 1.1902525336466464e-05, "loss": 10.3105, "step": 778 }, { "epoch": 0.08554329325207269, "grad_norm": 0.039146069437265396, "learning_rate": 1.1799959253265668e-05, "loss": 10.3121, "step": 779 }, { "epoch": 0.08565310492505353, "grad_norm": 0.04693768173456192, "learning_rate": 1.1697777844051105e-05, "loss": 10.3168, "step": 780 }, { "epoch": 0.08576291659803437, "grad_norm": 0.040665190666913986, "learning_rate": 1.1595982137788403e-05, "loss": 10.3135, "step": 781 }, { "epoch": 0.0858727282710152, "grad_norm": 0.04170459136366844, "learning_rate": 1.1494573159559213e-05, "loss": 10.3146, "step": 782 }, { "epoch": 0.08598253994399604, "grad_norm": 0.044261373579502106, "learning_rate": 1.1393551930550828e-05, "loss": 10.3217, "step": 783 }, { "epoch": 0.08609235161697688, "grad_norm": 0.037661824375391006, "learning_rate": 1.1292919468045877e-05, "loss": 10.3139, "step": 784 }, { "epoch": 0.08620216328995772, "grad_norm": 0.022352036088705063, "learning_rate": 1.1192676785412154e-05, "loss": 10.3142, "step": 785 }, { "epoch": 0.08631197496293856, "grad_norm": 0.032345980405807495, "learning_rate": 1.1092824892092373e-05, "loss": 10.319, "step": 786 }, { "epoch": 0.0864217866359194, "grad_norm": 0.05072391778230667, "learning_rate": 1.099336479359398e-05, "loss": 10.3104, "step": 787 }, { "epoch": 0.08653159830890024, "grad_norm": 0.03811797499656677, "learning_rate": 1.0894297491479045e-05, "loss": 10.3211, "step": 788 }, { "epoch": 0.08664140998188108, "grad_norm": 0.039311766624450684, "learning_rate": 1.0795623983354215e-05, "loss": 10.3168, "step": 789 }, { "epoch": 0.08675122165486192, "grad_norm": 0.044613540172576904, "learning_rate": 1.0697345262860636e-05, "loss": 10.3177, "step": 790 }, { "epoch": 0.08686103332784274, "grad_norm": 0.038676317781209946, "learning_rate": 1.0599462319663905e-05, "loss": 10.3117, "step": 791 }, { "epoch": 0.08697084500082358, "grad_norm": 0.035879697650671005, "learning_rate": 1.0501976139444191e-05, "loss": 10.3083, "step": 792 }, { "epoch": 0.08708065667380442, "grad_norm": 0.03627593815326691, "learning_rate": 1.0404887703886251e-05, "loss": 10.3153, "step": 793 }, { "epoch": 0.08719046834678526, "grad_norm": 0.032888129353523254, "learning_rate": 1.0308197990669538e-05, "loss": 10.3219, "step": 794 }, { "epoch": 0.0873002800197661, "grad_norm": 0.027540508657693863, "learning_rate": 1.021190797345839e-05, "loss": 10.314, "step": 795 }, { "epoch": 0.08741009169274694, "grad_norm": 0.032333459705114365, "learning_rate": 1.0116018621892237e-05, "loss": 10.3131, "step": 796 }, { "epoch": 0.08751990336572778, "grad_norm": 0.028225935995578766, "learning_rate": 1.0020530901575754e-05, "loss": 10.3154, "step": 797 }, { "epoch": 0.08762971503870862, "grad_norm": 0.03728807717561722, "learning_rate": 9.92544577406923e-06, "loss": 10.3162, "step": 798 }, { "epoch": 0.08773952671168946, "grad_norm": 0.03525736555457115, "learning_rate": 9.830764196878872e-06, "loss": 10.3099, "step": 799 }, { "epoch": 0.0878493383846703, "grad_norm": 0.04706577956676483, "learning_rate": 9.73648712344707e-06, "loss": 10.3132, "step": 800 }, { "epoch": 0.08795915005765113, "grad_norm": 0.028163529932498932, "learning_rate": 9.642615503142926e-06, "loss": 10.3124, "step": 801 }, { "epoch": 0.08806896173063197, "grad_norm": 0.04801159352064133, "learning_rate": 9.549150281252633e-06, "loss": 10.3221, "step": 802 }, { "epoch": 0.0881787734036128, "grad_norm": 0.04143408685922623, "learning_rate": 9.456092398969902e-06, "loss": 10.3191, "step": 803 }, { "epoch": 0.08828858507659364, "grad_norm": 0.04347795993089676, "learning_rate": 9.363442793386606e-06, "loss": 10.3035, "step": 804 }, { "epoch": 0.08839839674957448, "grad_norm": 0.0419791154563427, "learning_rate": 9.271202397483215e-06, "loss": 10.3162, "step": 805 }, { "epoch": 0.08850820842255532, "grad_norm": 0.04846560209989548, "learning_rate": 9.179372140119525e-06, "loss": 10.3174, "step": 806 }, { "epoch": 0.08861802009553615, "grad_norm": 0.031321022659540176, "learning_rate": 9.087952946025175e-06, "loss": 10.3094, "step": 807 }, { "epoch": 0.088727831768517, "grad_norm": 0.044255632907152176, "learning_rate": 8.996945735790447e-06, "loss": 10.3189, "step": 808 }, { "epoch": 0.08883764344149783, "grad_norm": 0.049325257539749146, "learning_rate": 8.906351425856952e-06, "loss": 10.3179, "step": 809 }, { "epoch": 0.08894745511447867, "grad_norm": 0.03295501694083214, "learning_rate": 8.816170928508365e-06, "loss": 10.3152, "step": 810 }, { "epoch": 0.08905726678745951, "grad_norm": 0.03943759202957153, "learning_rate": 8.7264051518613e-06, "loss": 10.3086, "step": 811 }, { "epoch": 0.08916707846044035, "grad_norm": 0.030172457918524742, "learning_rate": 8.637054999856148e-06, "loss": 10.3216, "step": 812 }, { "epoch": 0.08927689013342119, "grad_norm": 0.034907639026641846, "learning_rate": 8.548121372247918e-06, "loss": 10.3139, "step": 813 }, { "epoch": 0.08938670180640203, "grad_norm": 0.04455006122589111, "learning_rate": 8.459605164597267e-06, "loss": 10.3064, "step": 814 }, { "epoch": 0.08949651347938285, "grad_norm": 0.02922525629401207, "learning_rate": 8.371507268261437e-06, "loss": 10.3163, "step": 815 }, { "epoch": 0.08960632515236369, "grad_norm": 0.03390868008136749, "learning_rate": 8.283828570385238e-06, "loss": 10.3106, "step": 816 }, { "epoch": 0.08971613682534453, "grad_norm": 0.02741953358054161, "learning_rate": 8.196569953892202e-06, "loss": 10.3219, "step": 817 }, { "epoch": 0.08982594849832537, "grad_norm": 0.02589074708521366, "learning_rate": 8.109732297475635e-06, "loss": 10.323, "step": 818 }, { "epoch": 0.08993576017130621, "grad_norm": 0.04258984699845314, "learning_rate": 8.023316475589754e-06, "loss": 10.3244, "step": 819 }, { "epoch": 0.09004557184428705, "grad_norm": 0.04778209701180458, "learning_rate": 7.937323358440935e-06, "loss": 10.3081, "step": 820 }, { "epoch": 0.09015538351726789, "grad_norm": 0.043443091213703156, "learning_rate": 7.851753811978924e-06, "loss": 10.3217, "step": 821 }, { "epoch": 0.09026519519024873, "grad_norm": 0.041465628892183304, "learning_rate": 7.766608697888095e-06, "loss": 10.3095, "step": 822 }, { "epoch": 0.09037500686322957, "grad_norm": 0.03256349265575409, "learning_rate": 7.681888873578786e-06, "loss": 10.3116, "step": 823 }, { "epoch": 0.0904848185362104, "grad_norm": 0.04575566202402115, "learning_rate": 7.597595192178702e-06, "loss": 10.3091, "step": 824 }, { "epoch": 0.09059463020919124, "grad_norm": 0.028112446889281273, "learning_rate": 7.513728502524286e-06, "loss": 10.318, "step": 825 }, { "epoch": 0.09070444188217207, "grad_norm": 0.04239173233509064, "learning_rate": 7.430289649152156e-06, "loss": 10.3064, "step": 826 }, { "epoch": 0.09081425355515291, "grad_norm": 0.03836563974618912, "learning_rate": 7.347279472290647e-06, "loss": 10.3252, "step": 827 }, { "epoch": 0.09092406522813375, "grad_norm": 0.03923396021127701, "learning_rate": 7.264698807851328e-06, "loss": 10.3129, "step": 828 }, { "epoch": 0.09103387690111459, "grad_norm": 0.04261473938822746, "learning_rate": 7.182548487420554e-06, "loss": 10.3076, "step": 829 }, { "epoch": 0.09114368857409542, "grad_norm": 0.026719851419329643, "learning_rate": 7.100829338251147e-06, "loss": 10.3145, "step": 830 }, { "epoch": 0.09125350024707626, "grad_norm": 0.039296165108680725, "learning_rate": 7.019542183254046e-06, "loss": 10.3084, "step": 831 }, { "epoch": 0.0913633119200571, "grad_norm": 0.04548013210296631, "learning_rate": 6.9386878409899715e-06, "loss": 10.3146, "step": 832 }, { "epoch": 0.09147312359303794, "grad_norm": 0.036591462790966034, "learning_rate": 6.858267125661272e-06, "loss": 10.3044, "step": 833 }, { "epoch": 0.09158293526601878, "grad_norm": 0.04193099960684776, "learning_rate": 6.778280847103669e-06, "loss": 10.3068, "step": 834 }, { "epoch": 0.09169274693899962, "grad_norm": 0.036512311547994614, "learning_rate": 6.698729810778065e-06, "loss": 10.3261, "step": 835 }, { "epoch": 0.09180255861198046, "grad_norm": 0.04096691682934761, "learning_rate": 6.619614817762537e-06, "loss": 10.3102, "step": 836 }, { "epoch": 0.0919123702849613, "grad_norm": 0.026638969779014587, "learning_rate": 6.540936664744196e-06, "loss": 10.3144, "step": 837 }, { "epoch": 0.09202218195794212, "grad_norm": 0.030443793162703514, "learning_rate": 6.462696144011149e-06, "loss": 10.3162, "step": 838 }, { "epoch": 0.09213199363092296, "grad_norm": 0.05177822709083557, "learning_rate": 6.384894043444567e-06, "loss": 10.3126, "step": 839 }, { "epoch": 0.0922418053039038, "grad_norm": 0.030835647135972977, "learning_rate": 6.3075311465107535e-06, "loss": 10.3153, "step": 840 }, { "epoch": 0.09235161697688464, "grad_norm": 0.041804298758506775, "learning_rate": 6.230608232253227e-06, "loss": 10.311, "step": 841 }, { "epoch": 0.09246142864986548, "grad_norm": 0.035478003323078156, "learning_rate": 6.154126075284855e-06, "loss": 10.3128, "step": 842 }, { "epoch": 0.09257124032284632, "grad_norm": 0.05020277947187424, "learning_rate": 6.078085445780129e-06, "loss": 10.3162, "step": 843 }, { "epoch": 0.09268105199582716, "grad_norm": 0.051810123026371, "learning_rate": 6.002487109467347e-06, "loss": 10.3107, "step": 844 }, { "epoch": 0.092790863668808, "grad_norm": 0.039541371166706085, "learning_rate": 5.927331827620903e-06, "loss": 10.3042, "step": 845 }, { "epoch": 0.09290067534178884, "grad_norm": 0.04485049098730087, "learning_rate": 5.852620357053651e-06, "loss": 10.3146, "step": 846 }, { "epoch": 0.09301048701476967, "grad_norm": 0.03302866965532303, "learning_rate": 5.778353450109286e-06, "loss": 10.3181, "step": 847 }, { "epoch": 0.09312029868775051, "grad_norm": 0.028692902997136116, "learning_rate": 5.704531854654721e-06, "loss": 10.3134, "step": 848 }, { "epoch": 0.09323011036073135, "grad_norm": 0.054431699216365814, "learning_rate": 5.631156314072605e-06, "loss": 10.3177, "step": 849 }, { "epoch": 0.09333992203371218, "grad_norm": 0.036631833761930466, "learning_rate": 5.558227567253832e-06, "loss": 10.3251, "step": 850 }, { "epoch": 0.09344973370669302, "grad_norm": 0.03347776457667351, "learning_rate": 5.485746348590048e-06, "loss": 10.3152, "step": 851 }, { "epoch": 0.09355954537967386, "grad_norm": 0.04079202190041542, "learning_rate": 5.413713387966329e-06, "loss": 10.333, "step": 852 }, { "epoch": 0.0936693570526547, "grad_norm": 0.035952821373939514, "learning_rate": 5.34212941075381e-06, "loss": 10.3145, "step": 853 }, { "epoch": 0.09377916872563553, "grad_norm": 0.041086867451667786, "learning_rate": 5.270995137802315e-06, "loss": 10.3106, "step": 854 }, { "epoch": 0.09388898039861637, "grad_norm": 0.04070660099387169, "learning_rate": 5.200311285433213e-06, "loss": 10.3089, "step": 855 }, { "epoch": 0.09399879207159721, "grad_norm": 0.032837964594364166, "learning_rate": 5.13007856543209e-06, "loss": 10.3146, "step": 856 }, { "epoch": 0.09410860374457805, "grad_norm": 0.041533321142196655, "learning_rate": 5.060297685041659e-06, "loss": 10.3161, "step": 857 }, { "epoch": 0.09421841541755889, "grad_norm": 0.03643094375729561, "learning_rate": 4.99096934695461e-06, "loss": 10.3103, "step": 858 }, { "epoch": 0.09432822709053973, "grad_norm": 0.03696398064494133, "learning_rate": 4.922094249306558e-06, "loss": 10.3269, "step": 859 }, { "epoch": 0.09443803876352057, "grad_norm": 0.046098772436380386, "learning_rate": 4.853673085668947e-06, "loss": 10.3118, "step": 860 }, { "epoch": 0.0945478504365014, "grad_norm": 0.041556525975465775, "learning_rate": 4.78570654504214e-06, "loss": 10.323, "step": 861 }, { "epoch": 0.09465766210948223, "grad_norm": 0.04157395288348198, "learning_rate": 4.7181953118484556e-06, "loss": 10.3125, "step": 862 }, { "epoch": 0.09476747378246307, "grad_norm": 0.036072101444005966, "learning_rate": 4.651140065925269e-06, "loss": 10.3063, "step": 863 }, { "epoch": 0.09487728545544391, "grad_norm": 0.038894958794116974, "learning_rate": 4.58454148251814e-06, "loss": 10.3141, "step": 864 }, { "epoch": 0.09498709712842475, "grad_norm": 0.035335294902324677, "learning_rate": 4.5184002322740785e-06, "loss": 10.3173, "step": 865 }, { "epoch": 0.09509690880140559, "grad_norm": 0.040773481130599976, "learning_rate": 4.452716981234744e-06, "loss": 10.3155, "step": 866 }, { "epoch": 0.09520672047438643, "grad_norm": 0.03266516327857971, "learning_rate": 4.387492390829734e-06, "loss": 10.3169, "step": 867 }, { "epoch": 0.09531653214736727, "grad_norm": 0.056261539459228516, "learning_rate": 4.322727117869951e-06, "loss": 10.3119, "step": 868 }, { "epoch": 0.0954263438203481, "grad_norm": 0.04317648708820343, "learning_rate": 4.258421814540992e-06, "loss": 10.3124, "step": 869 }, { "epoch": 0.09553615549332894, "grad_norm": 0.033856358379125595, "learning_rate": 4.19457712839652e-06, "loss": 10.3216, "step": 870 }, { "epoch": 0.09564596716630978, "grad_norm": 0.036258719861507416, "learning_rate": 4.131193702351827e-06, "loss": 10.3018, "step": 871 }, { "epoch": 0.09575577883929062, "grad_norm": 0.03472882881760597, "learning_rate": 4.068272174677335e-06, "loss": 10.314, "step": 872 }, { "epoch": 0.09586559051227145, "grad_norm": 0.03793155401945114, "learning_rate": 4.005813178992091e-06, "loss": 10.3154, "step": 873 }, { "epoch": 0.09597540218525229, "grad_norm": 0.04314670339226723, "learning_rate": 3.9438173442575e-06, "loss": 10.3158, "step": 874 }, { "epoch": 0.09608521385823313, "grad_norm": 0.02972523681819439, "learning_rate": 3.8822852947709375e-06, "loss": 10.3107, "step": 875 }, { "epoch": 0.09619502553121397, "grad_norm": 0.035492341965436935, "learning_rate": 3.821217650159453e-06, "loss": 10.3126, "step": 876 }, { "epoch": 0.0963048372041948, "grad_norm": 0.03144872933626175, "learning_rate": 3.760615025373543e-06, "loss": 10.3118, "step": 877 }, { "epoch": 0.09641464887717564, "grad_norm": 0.04060778021812439, "learning_rate": 3.700478030680987e-06, "loss": 10.3134, "step": 878 }, { "epoch": 0.09652446055015648, "grad_norm": 0.04507026448845863, "learning_rate": 3.6408072716606346e-06, "loss": 10.3128, "step": 879 }, { "epoch": 0.09663427222313732, "grad_norm": 0.028524206951260567, "learning_rate": 3.581603349196372e-06, "loss": 10.3145, "step": 880 }, { "epoch": 0.09674408389611816, "grad_norm": 0.036721982061862946, "learning_rate": 3.522866859471047e-06, "loss": 10.3111, "step": 881 }, { "epoch": 0.096853895569099, "grad_norm": 0.04546421021223068, "learning_rate": 3.4645983939604496e-06, "loss": 10.3152, "step": 882 }, { "epoch": 0.09696370724207984, "grad_norm": 0.03185427933931351, "learning_rate": 3.406798539427386e-06, "loss": 10.3095, "step": 883 }, { "epoch": 0.09707351891506068, "grad_norm": 0.03573969006538391, "learning_rate": 3.349467877915746e-06, "loss": 10.3168, "step": 884 }, { "epoch": 0.0971833305880415, "grad_norm": 0.04182133823633194, "learning_rate": 3.2926069867446675e-06, "loss": 10.3345, "step": 885 }, { "epoch": 0.09729314226102234, "grad_norm": 0.04052354022860527, "learning_rate": 3.2362164385026706e-06, "loss": 10.3127, "step": 886 }, { "epoch": 0.09740295393400318, "grad_norm": 0.03702933341264725, "learning_rate": 3.180296801041971e-06, "loss": 10.3196, "step": 887 }, { "epoch": 0.09751276560698402, "grad_norm": 0.04575482755899429, "learning_rate": 3.1248486374726883e-06, "loss": 10.3089, "step": 888 }, { "epoch": 0.09762257727996486, "grad_norm": 0.03637000918388367, "learning_rate": 3.069872506157212e-06, "loss": 10.3069, "step": 889 }, { "epoch": 0.0977323889529457, "grad_norm": 0.03638225421309471, "learning_rate": 3.0153689607045845e-06, "loss": 10.3222, "step": 890 }, { "epoch": 0.09784220062592654, "grad_norm": 0.061212845146656036, "learning_rate": 2.961338549964893e-06, "loss": 10.3209, "step": 891 }, { "epoch": 0.09795201229890738, "grad_norm": 0.03995781019330025, "learning_rate": 2.9077818180237693e-06, "loss": 10.3103, "step": 892 }, { "epoch": 0.09806182397188822, "grad_norm": 0.0443757101893425, "learning_rate": 2.8546993041969173e-06, "loss": 10.3082, "step": 893 }, { "epoch": 0.09817163564486905, "grad_norm": 0.038907960057258606, "learning_rate": 2.802091543024671e-06, "loss": 10.323, "step": 894 }, { "epoch": 0.0982814473178499, "grad_norm": 0.03369826823472977, "learning_rate": 2.7499590642665774e-06, "loss": 10.3112, "step": 895 }, { "epoch": 0.09839125899083073, "grad_norm": 0.03591571003198624, "learning_rate": 2.6983023928961404e-06, "loss": 10.3122, "step": 896 }, { "epoch": 0.09850107066381156, "grad_norm": 0.04416611045598984, "learning_rate": 2.647122049095463e-06, "loss": 10.3143, "step": 897 }, { "epoch": 0.0986108823367924, "grad_norm": 0.041298989206552505, "learning_rate": 2.596418548250029e-06, "loss": 10.3093, "step": 898 }, { "epoch": 0.09872069400977324, "grad_norm": 0.03004969097673893, "learning_rate": 2.546192400943537e-06, "loss": 10.3169, "step": 899 }, { "epoch": 0.09883050568275407, "grad_norm": 0.04976905882358551, "learning_rate": 2.496444112952734e-06, "loss": 10.3227, "step": 900 }, { "epoch": 0.09894031735573491, "grad_norm": 0.03230028226971626, "learning_rate": 2.4471741852423237e-06, "loss": 10.3213, "step": 901 }, { "epoch": 0.09905012902871575, "grad_norm": 0.051530446857213974, "learning_rate": 2.3983831139599287e-06, "loss": 10.3224, "step": 902 }, { "epoch": 0.09915994070169659, "grad_norm": 0.0378115214407444, "learning_rate": 2.3500713904311024e-06, "loss": 10.3191, "step": 903 }, { "epoch": 0.09926975237467743, "grad_norm": 0.03399795666337013, "learning_rate": 2.3022395011543686e-06, "loss": 10.3116, "step": 904 }, { "epoch": 0.09937956404765827, "grad_norm": 0.05241768807172775, "learning_rate": 2.2548879277963064e-06, "loss": 10.3094, "step": 905 }, { "epoch": 0.09948937572063911, "grad_norm": 0.03988038748502731, "learning_rate": 2.208017147186736e-06, "loss": 10.3073, "step": 906 }, { "epoch": 0.09959918739361995, "grad_norm": 0.03977709636092186, "learning_rate": 2.161627631313923e-06, "loss": 10.3122, "step": 907 }, { "epoch": 0.09970899906660077, "grad_norm": 0.036957282572984695, "learning_rate": 2.1157198473197414e-06, "loss": 10.3245, "step": 908 }, { "epoch": 0.09981881073958161, "grad_norm": 0.02555002085864544, "learning_rate": 2.070294257495081e-06, "loss": 10.3177, "step": 909 }, { "epoch": 0.09992862241256245, "grad_norm": 0.03872646763920784, "learning_rate": 2.0253513192751373e-06, "loss": 10.3131, "step": 910 }, { "epoch": 0.10003843408554329, "grad_norm": 0.04097941890358925, "learning_rate": 1.9808914852347813e-06, "loss": 10.3127, "step": 911 }, { "epoch": 0.10014824575852413, "grad_norm": 0.04144563153386116, "learning_rate": 1.9369152030840556e-06, "loss": 10.3052, "step": 912 }, { "epoch": 0.10025805743150497, "grad_norm": 0.03275219723582268, "learning_rate": 1.8934229156636452e-06, "loss": 10.3125, "step": 913 }, { "epoch": 0.10036786910448581, "grad_norm": 0.04040839523077011, "learning_rate": 1.8504150609403858e-06, "loss": 10.313, "step": 914 }, { "epoch": 0.10047768077746665, "grad_norm": 0.03348499909043312, "learning_rate": 1.807892072002898e-06, "loss": 10.3179, "step": 915 }, { "epoch": 0.10058749245044749, "grad_norm": 0.04070667922496796, "learning_rate": 1.7658543770572189e-06, "loss": 10.3187, "step": 916 }, { "epoch": 0.10069730412342832, "grad_norm": 0.03918742388486862, "learning_rate": 1.724302399422456e-06, "loss": 10.3228, "step": 917 }, { "epoch": 0.10080711579640916, "grad_norm": 0.03554573655128479, "learning_rate": 1.6832365575265741e-06, "loss": 10.3079, "step": 918 }, { "epoch": 0.10091692746939, "grad_norm": 0.05401109531521797, "learning_rate": 1.6426572649021476e-06, "loss": 10.3102, "step": 919 }, { "epoch": 0.10102673914237083, "grad_norm": 0.04078887775540352, "learning_rate": 1.6025649301821876e-06, "loss": 10.3046, "step": 920 }, { "epoch": 0.10113655081535167, "grad_norm": 0.029533548280596733, "learning_rate": 1.5629599570960718e-06, "loss": 10.3108, "step": 921 }, { "epoch": 0.1012463624883325, "grad_norm": 0.04226626828312874, "learning_rate": 1.523842744465437e-06, "loss": 10.3127, "step": 922 }, { "epoch": 0.10135617416131334, "grad_norm": 0.025987090542912483, "learning_rate": 1.4852136862001764e-06, "loss": 10.3211, "step": 923 }, { "epoch": 0.10146598583429418, "grad_norm": 0.03992049768567085, "learning_rate": 1.4470731712944884e-06, "loss": 10.3015, "step": 924 }, { "epoch": 0.10157579750727502, "grad_norm": 0.04411383345723152, "learning_rate": 1.4094215838229176e-06, "loss": 10.3135, "step": 925 }, { "epoch": 0.10168560918025586, "grad_norm": 0.041523367166519165, "learning_rate": 1.372259302936546e-06, "loss": 10.3204, "step": 926 }, { "epoch": 0.1017954208532367, "grad_norm": 0.033936526626348495, "learning_rate": 1.3355867028591208e-06, "loss": 10.3115, "step": 927 }, { "epoch": 0.10190523252621754, "grad_norm": 0.03105180524289608, "learning_rate": 1.2994041528833266e-06, "loss": 10.3131, "step": 928 }, { "epoch": 0.10201504419919838, "grad_norm": 0.04320959001779556, "learning_rate": 1.2637120173670358e-06, "loss": 10.317, "step": 929 }, { "epoch": 0.10212485587217922, "grad_norm": 0.037952907383441925, "learning_rate": 1.2285106557296477e-06, "loss": 10.3224, "step": 930 }, { "epoch": 0.10223466754516006, "grad_norm": 0.046220093965530396, "learning_rate": 1.1938004224484988e-06, "loss": 10.3118, "step": 931 }, { "epoch": 0.10234447921814088, "grad_norm": 0.05611686035990715, "learning_rate": 1.1595816670552428e-06, "loss": 10.3142, "step": 932 }, { "epoch": 0.10245429089112172, "grad_norm": 0.03338133171200752, "learning_rate": 1.1258547341323699e-06, "loss": 10.3146, "step": 933 }, { "epoch": 0.10256410256410256, "grad_norm": 0.037339672446250916, "learning_rate": 1.0926199633097157e-06, "loss": 10.3126, "step": 934 }, { "epoch": 0.1026739142370834, "grad_norm": 0.03368791565299034, "learning_rate": 1.0598776892610685e-06, "loss": 10.3227, "step": 935 }, { "epoch": 0.10278372591006424, "grad_norm": 0.032310787588357925, "learning_rate": 1.02762824170074e-06, "loss": 10.3099, "step": 936 }, { "epoch": 0.10289353758304508, "grad_norm": 0.03587045520544052, "learning_rate": 9.958719453803278e-07, "loss": 10.315, "step": 937 }, { "epoch": 0.10300334925602592, "grad_norm": 0.03891875594854355, "learning_rate": 9.646091200853802e-07, "loss": 10.3223, "step": 938 }, { "epoch": 0.10311316092900676, "grad_norm": 0.031654711812734604, "learning_rate": 9.338400806321978e-07, "loss": 10.3094, "step": 939 }, { "epoch": 0.1032229726019876, "grad_norm": 0.03519720584154129, "learning_rate": 9.035651368646648e-07, "loss": 10.3187, "step": 940 }, { "epoch": 0.10333278427496843, "grad_norm": 0.042424995452165604, "learning_rate": 8.737845936511335e-07, "loss": 10.3054, "step": 941 }, { "epoch": 0.10344259594794927, "grad_norm": 0.03940219804644585, "learning_rate": 8.444987508813451e-07, "loss": 10.3128, "step": 942 }, { "epoch": 0.1035524076209301, "grad_norm": 0.0543336495757103, "learning_rate": 8.157079034633974e-07, "loss": 10.3178, "step": 943 }, { "epoch": 0.10366221929391094, "grad_norm": 0.03599490970373154, "learning_rate": 7.874123413208145e-07, "loss": 10.3213, "step": 944 }, { "epoch": 0.10377203096689178, "grad_norm": 0.04079907387495041, "learning_rate": 7.596123493895991e-07, "loss": 10.3136, "step": 945 }, { "epoch": 0.10388184263987262, "grad_norm": 0.03582724928855896, "learning_rate": 7.323082076153509e-07, "loss": 10.3198, "step": 946 }, { "epoch": 0.10399165431285345, "grad_norm": 0.03927293419837952, "learning_rate": 7.055001909504755e-07, "loss": 10.3103, "step": 947 }, { "epoch": 0.1041014659858343, "grad_norm": 0.05429311841726303, "learning_rate": 6.791885693514133e-07, "loss": 10.3139, "step": 948 }, { "epoch": 0.10421127765881513, "grad_norm": 0.034018926322460175, "learning_rate": 6.533736077758868e-07, "loss": 10.3105, "step": 949 }, { "epoch": 0.10432108933179597, "grad_norm": 0.03970513865351677, "learning_rate": 6.280555661802856e-07, "loss": 10.3111, "step": 950 }, { "epoch": 0.10443090100477681, "grad_norm": 0.05490114912390709, "learning_rate": 6.032346995169968e-07, "loss": 10.3033, "step": 951 }, { "epoch": 0.10454071267775765, "grad_norm": 0.04130084067583084, "learning_rate": 5.78911257731879e-07, "loss": 10.3252, "step": 952 }, { "epoch": 0.10465052435073849, "grad_norm": 0.041738107800483704, "learning_rate": 5.550854857617193e-07, "loss": 10.3104, "step": 953 }, { "epoch": 0.10476033602371933, "grad_norm": 0.03705933317542076, "learning_rate": 5.317576235317756e-07, "loss": 10.3114, "step": 954 }, { "epoch": 0.10487014769670015, "grad_norm": 0.04287085682153702, "learning_rate": 5.089279059533658e-07, "loss": 10.3118, "step": 955 }, { "epoch": 0.10497995936968099, "grad_norm": 0.036541227251291275, "learning_rate": 4.865965629214819e-07, "loss": 10.3191, "step": 956 }, { "epoch": 0.10508977104266183, "grad_norm": 0.03359196335077286, "learning_rate": 4.647638193125137e-07, "loss": 10.3126, "step": 957 }, { "epoch": 0.10519958271564267, "grad_norm": 0.029098298400640488, "learning_rate": 4.434298949819449e-07, "loss": 10.3174, "step": 958 }, { "epoch": 0.10530939438862351, "grad_norm": 0.0325823612511158, "learning_rate": 4.2259500476214407e-07, "loss": 10.3138, "step": 959 }, { "epoch": 0.10541920606160435, "grad_norm": 0.033848442137241364, "learning_rate": 4.02259358460233e-07, "loss": 10.3043, "step": 960 }, { "epoch": 0.10552901773458519, "grad_norm": 0.04012158140540123, "learning_rate": 3.824231608559492e-07, "loss": 10.3117, "step": 961 }, { "epoch": 0.10563882940756603, "grad_norm": 0.042495328933000565, "learning_rate": 3.630866116995757e-07, "loss": 10.3234, "step": 962 }, { "epoch": 0.10574864108054687, "grad_norm": 0.04280965402722359, "learning_rate": 3.4424990570994797e-07, "loss": 10.329, "step": 963 }, { "epoch": 0.1058584527535277, "grad_norm": 0.03574259579181671, "learning_rate": 3.2591323257248893e-07, "loss": 10.3192, "step": 964 }, { "epoch": 0.10596826442650854, "grad_norm": 0.03468066081404686, "learning_rate": 3.080767769372939e-07, "loss": 10.308, "step": 965 }, { "epoch": 0.10607807609948938, "grad_norm": 0.03749536722898483, "learning_rate": 2.907407184172706e-07, "loss": 10.3065, "step": 966 }, { "epoch": 0.10618788777247021, "grad_norm": 0.040267836302518845, "learning_rate": 2.7390523158633554e-07, "loss": 10.3101, "step": 967 }, { "epoch": 0.10629769944545105, "grad_norm": 0.04641765356063843, "learning_rate": 2.5757048597765396e-07, "loss": 10.3158, "step": 968 }, { "epoch": 0.10640751111843189, "grad_norm": 0.045349445194005966, "learning_rate": 2.4173664608193593e-07, "loss": 10.3114, "step": 969 }, { "epoch": 0.10651732279141272, "grad_norm": 0.033795084804296494, "learning_rate": 2.2640387134577058e-07, "loss": 10.3055, "step": 970 }, { "epoch": 0.10662713446439356, "grad_norm": 0.03448265418410301, "learning_rate": 2.1157231617002783e-07, "loss": 10.3157, "step": 971 }, { "epoch": 0.1067369461373744, "grad_norm": 0.032643064856529236, "learning_rate": 1.9724212990830938e-07, "loss": 10.3156, "step": 972 }, { "epoch": 0.10684675781035524, "grad_norm": 0.037873052060604095, "learning_rate": 1.8341345686543332e-07, "loss": 10.3146, "step": 973 }, { "epoch": 0.10695656948333608, "grad_norm": 0.0303360465914011, "learning_rate": 1.7008643629596866e-07, "loss": 10.3094, "step": 974 }, { "epoch": 0.10706638115631692, "grad_norm": 0.04015408083796501, "learning_rate": 1.5726120240288634e-07, "loss": 10.3039, "step": 975 }, { "epoch": 0.10717619282929776, "grad_norm": 0.03686315566301346, "learning_rate": 1.449378843361271e-07, "loss": 10.3094, "step": 976 }, { "epoch": 0.1072860045022786, "grad_norm": 0.04508250951766968, "learning_rate": 1.3311660619138578e-07, "loss": 10.309, "step": 977 }, { "epoch": 0.10739581617525942, "grad_norm": 0.040923111140728, "learning_rate": 1.2179748700879012e-07, "loss": 10.314, "step": 978 }, { "epoch": 0.10750562784824026, "grad_norm": 0.03168369084596634, "learning_rate": 1.109806407717462e-07, "loss": 10.3179, "step": 979 }, { "epoch": 0.1076154395212211, "grad_norm": 0.03265474736690521, "learning_rate": 1.0066617640578368e-07, "loss": 10.3111, "step": 980 }, { "epoch": 0.10772525119420194, "grad_norm": 0.03841938450932503, "learning_rate": 9.085419777743465e-08, "loss": 10.3139, "step": 981 }, { "epoch": 0.10783506286718278, "grad_norm": 0.04329368844628334, "learning_rate": 8.15448036932176e-08, "loss": 10.3161, "step": 982 }, { "epoch": 0.10794487454016362, "grad_norm": 0.040105242282152176, "learning_rate": 7.273808789862724e-08, "loss": 10.3131, "step": 983 }, { "epoch": 0.10805468621314446, "grad_norm": 0.03887563571333885, "learning_rate": 6.443413907720186e-08, "loss": 10.3237, "step": 984 }, { "epoch": 0.1081644978861253, "grad_norm": 0.04586820304393768, "learning_rate": 5.663304084960186e-08, "loss": 10.3174, "step": 985 }, { "epoch": 0.10827430955910614, "grad_norm": 0.046376392245292664, "learning_rate": 4.933487177280482e-08, "loss": 10.3119, "step": 986 }, { "epoch": 0.10838412123208697, "grad_norm": 0.04189673811197281, "learning_rate": 4.253970533929508e-08, "loss": 10.3188, "step": 987 }, { "epoch": 0.10849393290506781, "grad_norm": 0.040410179644823074, "learning_rate": 3.624760997631982e-08, "loss": 10.3182, "step": 988 }, { "epoch": 0.10860374457804865, "grad_norm": 0.04387160390615463, "learning_rate": 3.04586490452119e-08, "loss": 10.3111, "step": 989 }, { "epoch": 0.10871355625102948, "grad_norm": 0.03705020993947983, "learning_rate": 2.5172880840745873e-08, "loss": 10.3173, "step": 990 }, { "epoch": 0.10882336792401032, "grad_norm": 0.029674449935555458, "learning_rate": 2.0390358590538504e-08, "loss": 10.3057, "step": 991 }, { "epoch": 0.10893317959699116, "grad_norm": 0.03965083882212639, "learning_rate": 1.6111130454543598e-08, "loss": 10.3186, "step": 992 }, { "epoch": 0.109042991269972, "grad_norm": 0.02693052589893341, "learning_rate": 1.2335239524541299e-08, "loss": 10.3194, "step": 993 }, { "epoch": 0.10915280294295283, "grad_norm": 0.04057001695036888, "learning_rate": 9.06272382371065e-09, "loss": 10.3171, "step": 994 }, { "epoch": 0.10926261461593367, "grad_norm": 0.04335347190499306, "learning_rate": 6.293616306246586e-09, "loss": 10.3177, "step": 995 }, { "epoch": 0.10937242628891451, "grad_norm": 0.04777819290757179, "learning_rate": 4.0279448570323954e-09, "loss": 10.3211, "step": 996 }, { "epoch": 0.10948223796189535, "grad_norm": 0.034486740827560425, "learning_rate": 2.265732291356626e-09, "loss": 10.3233, "step": 997 }, { "epoch": 0.10959204963487619, "grad_norm": 0.03398129716515541, "learning_rate": 1.0069963546743832e-09, "loss": 10.3182, "step": 998 }, { "epoch": 0.10970186130785703, "grad_norm": 0.03073979541659355, "learning_rate": 2.5174972244634833e-10, "loss": 10.3134, "step": 999 }, { "epoch": 0.10981167298083787, "grad_norm": 0.030138185247778893, "learning_rate": 0.0, "loss": 10.314, "step": 1000 }, { "epoch": 0.10981167298083787, "eval_loss": 10.31318187713623, "eval_runtime": 126.9236, "eval_samples_per_second": 11.716, "eval_steps_per_second": 5.862, "step": 1000 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 213840297984000.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }