{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998030022457745, "eval_steps": 200, "global_step": 1586, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006303928135219258, "grad_norm": 0.025326624512672424, "learning_rate": 3.3333333333333335e-05, "loss": 0.6736, "step": 1 }, { "epoch": 0.0012607856270438517, "grad_norm": 0.025744665414094925, "learning_rate": 6.666666666666667e-05, "loss": 0.7173, "step": 2 }, { "epoch": 0.0018911784405657775, "grad_norm": 0.023411758244037628, "learning_rate": 0.0001, "loss": 0.7218, "step": 3 }, { "epoch": 0.0025215712540877034, "grad_norm": 0.02725936658680439, "learning_rate": 0.00013333333333333334, "loss": 0.6962, "step": 4 }, { "epoch": 0.0031519640676096294, "grad_norm": 0.03062792494893074, "learning_rate": 0.00016666666666666666, "loss": 0.6939, "step": 5 }, { "epoch": 0.003782356881131555, "grad_norm": 0.026029109954833984, "learning_rate": 0.0002, "loss": 0.65, "step": 6 }, { "epoch": 0.004412749694653481, "grad_norm": 0.032153379172086716, "learning_rate": 0.00023333333333333333, "loss": 0.6741, "step": 7 }, { "epoch": 0.005043142508175407, "grad_norm": 0.03682171180844307, "learning_rate": 0.0002666666666666667, "loss": 0.5635, "step": 8 }, { "epoch": 0.005673535321697332, "grad_norm": 0.03885592147707939, "learning_rate": 0.0003, "loss": 0.6713, "step": 9 }, { "epoch": 0.006303928135219259, "grad_norm": 0.02370770089328289, "learning_rate": 0.0003333333333333333, "loss": 0.6147, "step": 10 }, { "epoch": 0.0069343209487411845, "grad_norm": 0.016970451921224594, "learning_rate": 0.00036666666666666667, "loss": 0.6113, "step": 11 }, { "epoch": 0.00756471376226311, "grad_norm": 0.017335308715701103, "learning_rate": 0.0004, "loss": 0.5424, "step": 12 }, { "epoch": 0.008195106575785037, "grad_norm": 0.01663125492632389, "learning_rate": 0.00043333333333333337, "loss": 0.4799, "step": 13 }, { "epoch": 0.008825499389306962, "grad_norm": 0.016337264329195023, "learning_rate": 0.00046666666666666666, "loss": 0.5158, "step": 14 }, { "epoch": 0.009455892202828888, "grad_norm": 0.015455410815775394, "learning_rate": 0.0005, "loss": 0.5102, "step": 15 }, { "epoch": 0.010086285016350813, "grad_norm": 0.01774396188557148, "learning_rate": 0.0004996817313812858, "loss": 0.5002, "step": 16 }, { "epoch": 0.010716677829872739, "grad_norm": 0.016054702922701836, "learning_rate": 0.0004993634627625716, "loss": 0.439, "step": 17 }, { "epoch": 0.011347070643394665, "grad_norm": 0.0152912437915802, "learning_rate": 0.0004990451941438574, "loss": 0.5497, "step": 18 }, { "epoch": 0.01197746345691659, "grad_norm": 0.02227928303182125, "learning_rate": 0.0004987269255251432, "loss": 0.5518, "step": 19 }, { "epoch": 0.012607856270438518, "grad_norm": 0.0172741636633873, "learning_rate": 0.0004984086569064291, "loss": 0.5072, "step": 20 }, { "epoch": 0.013238249083960443, "grad_norm": 0.018295247107744217, "learning_rate": 0.0004980903882877149, "loss": 0.4582, "step": 21 }, { "epoch": 0.013868641897482369, "grad_norm": 0.01510521862655878, "learning_rate": 0.0004977721196690007, "loss": 0.4367, "step": 22 }, { "epoch": 0.014499034711004295, "grad_norm": 0.01511785015463829, "learning_rate": 0.0004974538510502865, "loss": 0.4817, "step": 23 }, { "epoch": 0.01512942752452622, "grad_norm": 0.014921161346137524, "learning_rate": 0.0004971355824315723, "loss": 0.4648, "step": 24 }, { "epoch": 0.015759820338048146, "grad_norm": 0.021133752539753914, "learning_rate": 0.0004968173138128581, "loss": 0.5615, "step": 25 }, { "epoch": 0.016390213151570073, "grad_norm": 0.015347599983215332, "learning_rate": 0.0004964990451941439, "loss": 0.4716, "step": 26 }, { "epoch": 0.017020605965091997, "grad_norm": 0.012884266674518585, "learning_rate": 0.0004961807765754296, "loss": 0.4996, "step": 27 }, { "epoch": 0.017650998778613924, "grad_norm": 0.013318617828190327, "learning_rate": 0.0004958625079567156, "loss": 0.3983, "step": 28 }, { "epoch": 0.01828139159213585, "grad_norm": 0.013513092882931232, "learning_rate": 0.0004955442393380013, "loss": 0.5372, "step": 29 }, { "epoch": 0.018911784405657776, "grad_norm": 0.0154775595292449, "learning_rate": 0.0004952259707192871, "loss": 0.5042, "step": 30 }, { "epoch": 0.019542177219179703, "grad_norm": 0.012689829804003239, "learning_rate": 0.0004949077021005729, "loss": 0.4029, "step": 31 }, { "epoch": 0.020172570032701627, "grad_norm": 0.012805973179638386, "learning_rate": 0.0004945894334818587, "loss": 0.4598, "step": 32 }, { "epoch": 0.020802962846223554, "grad_norm": 0.012445814907550812, "learning_rate": 0.0004942711648631445, "loss": 0.4056, "step": 33 }, { "epoch": 0.021433355659745478, "grad_norm": 0.013492134399712086, "learning_rate": 0.0004939528962444303, "loss": 0.4337, "step": 34 }, { "epoch": 0.022063748473267406, "grad_norm": 0.01400570385158062, "learning_rate": 0.0004936346276257161, "loss": 0.4748, "step": 35 }, { "epoch": 0.02269414128678933, "grad_norm": 0.012465992011129856, "learning_rate": 0.0004933163590070019, "loss": 0.4292, "step": 36 }, { "epoch": 0.023324534100311257, "grad_norm": 0.013714607805013657, "learning_rate": 0.0004929980903882877, "loss": 0.4607, "step": 37 }, { "epoch": 0.02395492691383318, "grad_norm": 0.01154986210167408, "learning_rate": 0.0004926798217695735, "loss": 0.3525, "step": 38 }, { "epoch": 0.024585319727355108, "grad_norm": 0.01176070049405098, "learning_rate": 0.0004923615531508593, "loss": 0.3955, "step": 39 }, { "epoch": 0.025215712540877035, "grad_norm": 0.013463042676448822, "learning_rate": 0.0004920432845321452, "loss": 0.437, "step": 40 }, { "epoch": 0.02584610535439896, "grad_norm": 0.011418250389397144, "learning_rate": 0.000491725015913431, "loss": 0.4422, "step": 41 }, { "epoch": 0.026476498167920887, "grad_norm": 0.011758891865611076, "learning_rate": 0.0004914067472947168, "loss": 0.3783, "step": 42 }, { "epoch": 0.02710689098144281, "grad_norm": 0.01337432861328125, "learning_rate": 0.0004910884786760026, "loss": 0.437, "step": 43 }, { "epoch": 0.027737283794964738, "grad_norm": 0.012143326923251152, "learning_rate": 0.0004907702100572884, "loss": 0.4015, "step": 44 }, { "epoch": 0.028367676608486662, "grad_norm": 0.01406758464872837, "learning_rate": 0.0004904519414385742, "loss": 0.4546, "step": 45 }, { "epoch": 0.02899806942200859, "grad_norm": 0.01546978484839201, "learning_rate": 0.00049013367281986, "loss": 0.501, "step": 46 }, { "epoch": 0.029628462235530516, "grad_norm": 0.015594643540680408, "learning_rate": 0.0004898154042011458, "loss": 0.4073, "step": 47 }, { "epoch": 0.03025885504905244, "grad_norm": 0.013020046055316925, "learning_rate": 0.0004894971355824315, "loss": 0.4476, "step": 48 }, { "epoch": 0.030889247862574368, "grad_norm": 0.01389628928154707, "learning_rate": 0.0004891788669637173, "loss": 0.405, "step": 49 }, { "epoch": 0.03151964067609629, "grad_norm": 0.015071937814354897, "learning_rate": 0.0004888605983450031, "loss": 0.4566, "step": 50 }, { "epoch": 0.032150033489618215, "grad_norm": 0.011399737559258938, "learning_rate": 0.0004885423297262889, "loss": 0.3752, "step": 51 }, { "epoch": 0.032780426303140146, "grad_norm": 0.010532829910516739, "learning_rate": 0.0004882240611075748, "loss": 0.3592, "step": 52 }, { "epoch": 0.03341081911666207, "grad_norm": 0.014051595702767372, "learning_rate": 0.0004879057924888606, "loss": 0.4044, "step": 53 }, { "epoch": 0.034041211930183994, "grad_norm": 0.01402432844042778, "learning_rate": 0.0004875875238701464, "loss": 0.4425, "step": 54 }, { "epoch": 0.034671604743705925, "grad_norm": 0.013888038694858551, "learning_rate": 0.0004872692552514322, "loss": 0.3876, "step": 55 }, { "epoch": 0.03530199755722785, "grad_norm": 0.011427722871303558, "learning_rate": 0.000486950986632718, "loss": 0.378, "step": 56 }, { "epoch": 0.03593239037074977, "grad_norm": 0.01229893695563078, "learning_rate": 0.0004866327180140038, "loss": 0.3892, "step": 57 }, { "epoch": 0.0365627831842717, "grad_norm": 0.01256764866411686, "learning_rate": 0.00048631444939528965, "loss": 0.4462, "step": 58 }, { "epoch": 0.03719317599779363, "grad_norm": 0.013883323408663273, "learning_rate": 0.00048599618077657544, "loss": 0.4948, "step": 59 }, { "epoch": 0.03782356881131555, "grad_norm": 0.011989254504442215, "learning_rate": 0.00048567791215786124, "loss": 0.4106, "step": 60 }, { "epoch": 0.038453961624837475, "grad_norm": 0.013135885819792747, "learning_rate": 0.0004853596435391471, "loss": 0.4188, "step": 61 }, { "epoch": 0.039084354438359406, "grad_norm": 0.021047130227088928, "learning_rate": 0.0004850413749204329, "loss": 0.4351, "step": 62 }, { "epoch": 0.03971474725188133, "grad_norm": 0.024305276572704315, "learning_rate": 0.0004847231063017187, "loss": 0.4738, "step": 63 }, { "epoch": 0.040345140065403254, "grad_norm": 0.01172337494790554, "learning_rate": 0.00048440483768300447, "loss": 0.3856, "step": 64 }, { "epoch": 0.04097553287892518, "grad_norm": 0.013637801632285118, "learning_rate": 0.00048408656906429026, "loss": 0.4586, "step": 65 }, { "epoch": 0.04160592569244711, "grad_norm": 0.013048408553004265, "learning_rate": 0.00048376830044557606, "loss": 0.3924, "step": 66 }, { "epoch": 0.04223631850596903, "grad_norm": 0.013575535267591476, "learning_rate": 0.00048345003182686185, "loss": 0.4177, "step": 67 }, { "epoch": 0.042866711319490956, "grad_norm": 0.012257833033800125, "learning_rate": 0.00048313176320814765, "loss": 0.3538, "step": 68 }, { "epoch": 0.04349710413301288, "grad_norm": 0.014019403606653214, "learning_rate": 0.00048281349458943344, "loss": 0.4397, "step": 69 }, { "epoch": 0.04412749694653481, "grad_norm": 0.012912232428789139, "learning_rate": 0.00048249522597071934, "loss": 0.4008, "step": 70 }, { "epoch": 0.044757889760056735, "grad_norm": 0.012819399125874043, "learning_rate": 0.00048217695735200514, "loss": 0.3649, "step": 71 }, { "epoch": 0.04538828257357866, "grad_norm": 0.013874593190848827, "learning_rate": 0.00048185868873329093, "loss": 0.4142, "step": 72 }, { "epoch": 0.04601867538710059, "grad_norm": 0.013702747412025928, "learning_rate": 0.0004815404201145767, "loss": 0.4637, "step": 73 }, { "epoch": 0.046649068200622514, "grad_norm": 0.23024669289588928, "learning_rate": 0.0004812221514958625, "loss": 0.3665, "step": 74 }, { "epoch": 0.04727946101414444, "grad_norm": 0.01643698662519455, "learning_rate": 0.0004809038828771483, "loss": 0.4121, "step": 75 }, { "epoch": 0.04790985382766636, "grad_norm": 0.01222292147576809, "learning_rate": 0.0004805856142584341, "loss": 0.3893, "step": 76 }, { "epoch": 0.04854024664118829, "grad_norm": 0.013304284773766994, "learning_rate": 0.0004802673456397199, "loss": 0.4134, "step": 77 }, { "epoch": 0.049170639454710216, "grad_norm": 0.01330029871314764, "learning_rate": 0.00047994907702100575, "loss": 0.3281, "step": 78 }, { "epoch": 0.04980103226823214, "grad_norm": 0.014089803211390972, "learning_rate": 0.00047963080840229155, "loss": 0.3572, "step": 79 }, { "epoch": 0.05043142508175407, "grad_norm": 0.014410941861569881, "learning_rate": 0.00047931253978357734, "loss": 0.4319, "step": 80 }, { "epoch": 0.051061817895275995, "grad_norm": 0.01325183641165495, "learning_rate": 0.00047899427116486313, "loss": 0.3824, "step": 81 }, { "epoch": 0.05169221070879792, "grad_norm": 0.011433329433202744, "learning_rate": 0.000478676002546149, "loss": 0.4037, "step": 82 }, { "epoch": 0.05232260352231984, "grad_norm": 0.01491348072886467, "learning_rate": 0.0004783577339274348, "loss": 0.3854, "step": 83 }, { "epoch": 0.05295299633584177, "grad_norm": 0.021524174138903618, "learning_rate": 0.00047803946530872057, "loss": 0.4322, "step": 84 }, { "epoch": 0.0535833891493637, "grad_norm": 0.015042196959257126, "learning_rate": 0.00047772119669000637, "loss": 0.418, "step": 85 }, { "epoch": 0.05421378196288562, "grad_norm": 0.01667785830795765, "learning_rate": 0.00047740292807129216, "loss": 0.41, "step": 86 }, { "epoch": 0.05484417477640755, "grad_norm": 0.037168629467487335, "learning_rate": 0.000477084659452578, "loss": 0.4742, "step": 87 }, { "epoch": 0.055474567589929476, "grad_norm": 0.021747788414359093, "learning_rate": 0.0004767663908338638, "loss": 0.4504, "step": 88 }, { "epoch": 0.0561049604034514, "grad_norm": 0.014987772330641747, "learning_rate": 0.0004764481222151496, "loss": 0.4319, "step": 89 }, { "epoch": 0.056735353216973324, "grad_norm": 0.014994597062468529, "learning_rate": 0.0004761298535964354, "loss": 0.4694, "step": 90 }, { "epoch": 0.057365746030495254, "grad_norm": 0.016926737502217293, "learning_rate": 0.0004758115849777212, "loss": 0.4038, "step": 91 }, { "epoch": 0.05799613884401718, "grad_norm": 0.01491682417690754, "learning_rate": 0.000475493316359007, "loss": 0.4149, "step": 92 }, { "epoch": 0.0586265316575391, "grad_norm": 0.015193617902696133, "learning_rate": 0.0004751750477402928, "loss": 0.3972, "step": 93 }, { "epoch": 0.05925692447106103, "grad_norm": 0.014669179916381836, "learning_rate": 0.0004748567791215786, "loss": 0.4146, "step": 94 }, { "epoch": 0.05988731728458296, "grad_norm": 0.01460240501910448, "learning_rate": 0.00047453851050286447, "loss": 0.4025, "step": 95 }, { "epoch": 0.06051771009810488, "grad_norm": 0.014518317766487598, "learning_rate": 0.00047422024188415027, "loss": 0.4396, "step": 96 }, { "epoch": 0.061148102911626805, "grad_norm": 0.012409773655235767, "learning_rate": 0.00047390197326543606, "loss": 0.4276, "step": 97 }, { "epoch": 0.061778495725148735, "grad_norm": 0.01293521374464035, "learning_rate": 0.00047358370464672185, "loss": 0.3985, "step": 98 }, { "epoch": 0.06240888853867066, "grad_norm": 0.013423739932477474, "learning_rate": 0.00047326543602800765, "loss": 0.3544, "step": 99 }, { "epoch": 0.06303928135219258, "grad_norm": 0.013836512342095375, "learning_rate": 0.00047294716740929344, "loss": 0.3954, "step": 100 }, { "epoch": 0.06366967416571451, "grad_norm": 0.017503945156931877, "learning_rate": 0.00047262889879057924, "loss": 0.372, "step": 101 }, { "epoch": 0.06430006697923643, "grad_norm": 0.016291653737425804, "learning_rate": 0.00047231063017186503, "loss": 0.3682, "step": 102 }, { "epoch": 0.06493045979275837, "grad_norm": 0.011662670411169529, "learning_rate": 0.0004719923615531509, "loss": 0.3486, "step": 103 }, { "epoch": 0.06556085260628029, "grad_norm": 0.013170573860406876, "learning_rate": 0.0004716740929344367, "loss": 0.3964, "step": 104 }, { "epoch": 0.06619124541980222, "grad_norm": 0.01151296403259039, "learning_rate": 0.00047135582431572247, "loss": 0.4038, "step": 105 }, { "epoch": 0.06682163823332414, "grad_norm": 0.013075540773570538, "learning_rate": 0.0004710375556970083, "loss": 0.438, "step": 106 }, { "epoch": 0.06745203104684606, "grad_norm": 0.012431737035512924, "learning_rate": 0.0004707192870782941, "loss": 0.3328, "step": 107 }, { "epoch": 0.06808242386036799, "grad_norm": 0.015350187197327614, "learning_rate": 0.0004704010184595799, "loss": 0.4361, "step": 108 }, { "epoch": 0.06871281667388991, "grad_norm": 0.02239173837006092, "learning_rate": 0.0004700827498408657, "loss": 0.376, "step": 109 }, { "epoch": 0.06934320948741185, "grad_norm": 0.013986581936478615, "learning_rate": 0.0004697644812221515, "loss": 0.3829, "step": 110 }, { "epoch": 0.06997360230093377, "grad_norm": 0.014001306146383286, "learning_rate": 0.0004694462126034373, "loss": 0.4085, "step": 111 }, { "epoch": 0.0706039951144557, "grad_norm": 0.01874650828540325, "learning_rate": 0.00046912794398472314, "loss": 0.4352, "step": 112 }, { "epoch": 0.07123438792797762, "grad_norm": 0.01388302631676197, "learning_rate": 0.00046880967536600893, "loss": 0.4043, "step": 113 }, { "epoch": 0.07186478074149955, "grad_norm": 0.013722342438995838, "learning_rate": 0.0004684914067472947, "loss": 0.3652, "step": 114 }, { "epoch": 0.07249517355502147, "grad_norm": 0.013007496483623981, "learning_rate": 0.0004681731381285805, "loss": 0.4175, "step": 115 }, { "epoch": 0.0731255663685434, "grad_norm": 0.013948211446404457, "learning_rate": 0.0004678548695098663, "loss": 0.3868, "step": 116 }, { "epoch": 0.07375595918206533, "grad_norm": 0.013613716699182987, "learning_rate": 0.0004675366008911521, "loss": 0.3615, "step": 117 }, { "epoch": 0.07438635199558725, "grad_norm": 0.014294012449681759, "learning_rate": 0.00046721833227243796, "loss": 0.3909, "step": 118 }, { "epoch": 0.07501674480910918, "grad_norm": 0.014494248665869236, "learning_rate": 0.00046690006365372375, "loss": 0.3957, "step": 119 }, { "epoch": 0.0756471376226311, "grad_norm": 0.012452061288058758, "learning_rate": 0.0004665817950350096, "loss": 0.3728, "step": 120 }, { "epoch": 0.07627753043615303, "grad_norm": 0.013415267691016197, "learning_rate": 0.0004662635264162954, "loss": 0.3728, "step": 121 }, { "epoch": 0.07690792324967495, "grad_norm": 0.013821875676512718, "learning_rate": 0.0004659452577975812, "loss": 0.3716, "step": 122 }, { "epoch": 0.07753831606319687, "grad_norm": 0.016661295667290688, "learning_rate": 0.000465626989178867, "loss": 0.4736, "step": 123 }, { "epoch": 0.07816870887671881, "grad_norm": 0.013106433674693108, "learning_rate": 0.0004653087205601528, "loss": 0.3575, "step": 124 }, { "epoch": 0.07879910169024074, "grad_norm": 0.0156813133507967, "learning_rate": 0.00046499045194143857, "loss": 0.4208, "step": 125 }, { "epoch": 0.07942949450376266, "grad_norm": 0.017488928511738777, "learning_rate": 0.00046467218332272436, "loss": 0.4367, "step": 126 }, { "epoch": 0.08005988731728458, "grad_norm": 0.014921813271939754, "learning_rate": 0.00046435391470401016, "loss": 0.4416, "step": 127 }, { "epoch": 0.08069028013080651, "grad_norm": 0.015090832486748695, "learning_rate": 0.000464035646085296, "loss": 0.4752, "step": 128 }, { "epoch": 0.08132067294432843, "grad_norm": 0.01292333286255598, "learning_rate": 0.0004637173774665818, "loss": 0.3859, "step": 129 }, { "epoch": 0.08195106575785036, "grad_norm": 0.014608800411224365, "learning_rate": 0.00046339910884786765, "loss": 0.4007, "step": 130 }, { "epoch": 0.0825814585713723, "grad_norm": 0.015404756180942059, "learning_rate": 0.00046308084022915344, "loss": 0.4202, "step": 131 }, { "epoch": 0.08321185138489422, "grad_norm": 0.01321891974657774, "learning_rate": 0.00046276257161043924, "loss": 0.3905, "step": 132 }, { "epoch": 0.08384224419841614, "grad_norm": 0.01633271761238575, "learning_rate": 0.00046244430299172503, "loss": 0.4113, "step": 133 }, { "epoch": 0.08447263701193806, "grad_norm": 0.015758583322167397, "learning_rate": 0.0004621260343730108, "loss": 0.4112, "step": 134 }, { "epoch": 0.08510302982545999, "grad_norm": 0.014834552071988583, "learning_rate": 0.0004618077657542966, "loss": 0.4268, "step": 135 }, { "epoch": 0.08573342263898191, "grad_norm": 0.01328643225133419, "learning_rate": 0.0004614894971355824, "loss": 0.4048, "step": 136 }, { "epoch": 0.08636381545250384, "grad_norm": 0.015494657680392265, "learning_rate": 0.00046117122851686826, "loss": 0.3835, "step": 137 }, { "epoch": 0.08699420826602576, "grad_norm": 0.011770982295274734, "learning_rate": 0.00046085295989815406, "loss": 0.3668, "step": 138 }, { "epoch": 0.0876246010795477, "grad_norm": 0.015426683239638805, "learning_rate": 0.00046053469127943985, "loss": 0.4193, "step": 139 }, { "epoch": 0.08825499389306962, "grad_norm": 0.0141952158883214, "learning_rate": 0.00046021642266072565, "loss": 0.3766, "step": 140 }, { "epoch": 0.08888538670659155, "grad_norm": 0.01560170017182827, "learning_rate": 0.00045989815404201144, "loss": 0.4024, "step": 141 }, { "epoch": 0.08951577952011347, "grad_norm": 0.01644952967762947, "learning_rate": 0.0004595798854232973, "loss": 0.4507, "step": 142 }, { "epoch": 0.0901461723336354, "grad_norm": 0.031552284955978394, "learning_rate": 0.0004592616168045831, "loss": 0.3472, "step": 143 }, { "epoch": 0.09077656514715732, "grad_norm": 0.013565769419074059, "learning_rate": 0.0004589433481858689, "loss": 0.3529, "step": 144 }, { "epoch": 0.09140695796067924, "grad_norm": 0.01607048697769642, "learning_rate": 0.0004586250795671547, "loss": 0.4221, "step": 145 }, { "epoch": 0.09203735077420118, "grad_norm": 0.014144987799227238, "learning_rate": 0.0004583068109484405, "loss": 0.4135, "step": 146 }, { "epoch": 0.0926677435877231, "grad_norm": 0.013666222803294659, "learning_rate": 0.0004579885423297263, "loss": 0.3856, "step": 147 }, { "epoch": 0.09329813640124503, "grad_norm": 0.014195356518030167, "learning_rate": 0.0004576702737110121, "loss": 0.4203, "step": 148 }, { "epoch": 0.09392852921476695, "grad_norm": 0.014037726446986198, "learning_rate": 0.0004573520050922979, "loss": 0.3851, "step": 149 }, { "epoch": 0.09455892202828887, "grad_norm": 0.013358448632061481, "learning_rate": 0.0004570337364735837, "loss": 0.2949, "step": 150 }, { "epoch": 0.0951893148418108, "grad_norm": 0.017415858805179596, "learning_rate": 0.0004567154678548695, "loss": 0.4462, "step": 151 }, { "epoch": 0.09581970765533272, "grad_norm": 0.013192846439778805, "learning_rate": 0.0004563971992361553, "loss": 0.4266, "step": 152 }, { "epoch": 0.09645010046885466, "grad_norm": 0.014750893227756023, "learning_rate": 0.00045607893061744113, "loss": 0.4399, "step": 153 }, { "epoch": 0.09708049328237658, "grad_norm": 0.013904851861298084, "learning_rate": 0.000455760661998727, "loss": 0.4354, "step": 154 }, { "epoch": 0.09771088609589851, "grad_norm": 0.01355521660298109, "learning_rate": 0.0004554423933800128, "loss": 0.4001, "step": 155 }, { "epoch": 0.09834127890942043, "grad_norm": 0.015457043424248695, "learning_rate": 0.00045512412476129857, "loss": 0.4142, "step": 156 }, { "epoch": 0.09897167172294236, "grad_norm": 0.014151825569570065, "learning_rate": 0.00045480585614258436, "loss": 0.4103, "step": 157 }, { "epoch": 0.09960206453646428, "grad_norm": 0.014811581000685692, "learning_rate": 0.00045448758752387016, "loss": 0.3921, "step": 158 }, { "epoch": 0.1002324573499862, "grad_norm": 0.01601674221456051, "learning_rate": 0.00045416931890515595, "loss": 0.4578, "step": 159 }, { "epoch": 0.10086285016350814, "grad_norm": 0.012152818031609058, "learning_rate": 0.00045385105028644175, "loss": 0.3809, "step": 160 }, { "epoch": 0.10149324297703007, "grad_norm": 0.012448348104953766, "learning_rate": 0.00045353278166772754, "loss": 0.4444, "step": 161 }, { "epoch": 0.10212363579055199, "grad_norm": 0.01306197326630354, "learning_rate": 0.0004532145130490134, "loss": 0.3605, "step": 162 }, { "epoch": 0.10275402860407391, "grad_norm": 0.014597426168620586, "learning_rate": 0.0004528962444302992, "loss": 0.4247, "step": 163 }, { "epoch": 0.10338442141759584, "grad_norm": 0.013518924824893475, "learning_rate": 0.000452577975811585, "loss": 0.4057, "step": 164 }, { "epoch": 0.10401481423111776, "grad_norm": 0.015075739473104477, "learning_rate": 0.00045225970719287077, "loss": 0.3776, "step": 165 }, { "epoch": 0.10464520704463968, "grad_norm": 0.014403780922293663, "learning_rate": 0.00045194143857415657, "loss": 0.4133, "step": 166 }, { "epoch": 0.10527559985816162, "grad_norm": 0.013943623751401901, "learning_rate": 0.0004516231699554424, "loss": 0.3494, "step": 167 }, { "epoch": 0.10590599267168355, "grad_norm": 0.012838203459978104, "learning_rate": 0.0004513049013367282, "loss": 0.3815, "step": 168 }, { "epoch": 0.10653638548520547, "grad_norm": 0.021162182092666626, "learning_rate": 0.000450986632718014, "loss": 0.3889, "step": 169 }, { "epoch": 0.1071667782987274, "grad_norm": 0.012704459950327873, "learning_rate": 0.00045066836409929985, "loss": 0.3436, "step": 170 }, { "epoch": 0.10779717111224932, "grad_norm": 0.015420299954712391, "learning_rate": 0.00045035009548058565, "loss": 0.4373, "step": 171 }, { "epoch": 0.10842756392577124, "grad_norm": 0.014454229734838009, "learning_rate": 0.00045003182686187144, "loss": 0.3496, "step": 172 }, { "epoch": 0.10905795673929317, "grad_norm": 0.013985652476549149, "learning_rate": 0.00044971355824315724, "loss": 0.4123, "step": 173 }, { "epoch": 0.1096883495528151, "grad_norm": 0.014107904396951199, "learning_rate": 0.00044939528962444303, "loss": 0.4552, "step": 174 }, { "epoch": 0.11031874236633703, "grad_norm": 0.015831153839826584, "learning_rate": 0.0004490770210057288, "loss": 0.3907, "step": 175 }, { "epoch": 0.11094913517985895, "grad_norm": 0.014836183749139309, "learning_rate": 0.0004487587523870146, "loss": 0.3582, "step": 176 }, { "epoch": 0.11157952799338088, "grad_norm": 0.013825944624841213, "learning_rate": 0.0004484404837683004, "loss": 0.3443, "step": 177 }, { "epoch": 0.1122099208069028, "grad_norm": 0.015528369694948196, "learning_rate": 0.0004481222151495863, "loss": 0.4169, "step": 178 }, { "epoch": 0.11284031362042472, "grad_norm": 0.0158685315400362, "learning_rate": 0.0004478039465308721, "loss": 0.4148, "step": 179 }, { "epoch": 0.11347070643394665, "grad_norm": 0.01517268642783165, "learning_rate": 0.0004474856779121579, "loss": 0.4375, "step": 180 }, { "epoch": 0.11410109924746858, "grad_norm": 0.01660778373479843, "learning_rate": 0.0004471674092934437, "loss": 0.3777, "step": 181 }, { "epoch": 0.11473149206099051, "grad_norm": 0.014059102162718773, "learning_rate": 0.0004468491406747295, "loss": 0.3683, "step": 182 }, { "epoch": 0.11536188487451243, "grad_norm": 0.01633947342634201, "learning_rate": 0.0004465308720560153, "loss": 0.4094, "step": 183 }, { "epoch": 0.11599227768803436, "grad_norm": 0.011877162382006645, "learning_rate": 0.0004462126034373011, "loss": 0.2996, "step": 184 }, { "epoch": 0.11662267050155628, "grad_norm": 0.012361064553260803, "learning_rate": 0.0004458943348185869, "loss": 0.3291, "step": 185 }, { "epoch": 0.1172530633150782, "grad_norm": 0.03489936888217926, "learning_rate": 0.00044557606619987267, "loss": 0.4093, "step": 186 }, { "epoch": 0.11788345612860013, "grad_norm": 0.015107312239706516, "learning_rate": 0.0004452577975811585, "loss": 0.4203, "step": 187 }, { "epoch": 0.11851384894212207, "grad_norm": 0.01571827568113804, "learning_rate": 0.0004449395289624443, "loss": 0.4133, "step": 188 }, { "epoch": 0.11914424175564399, "grad_norm": 0.016206607222557068, "learning_rate": 0.0004446212603437301, "loss": 0.3722, "step": 189 }, { "epoch": 0.11977463456916591, "grad_norm": 0.013261535204946995, "learning_rate": 0.0004443029917250159, "loss": 0.3516, "step": 190 }, { "epoch": 0.12040502738268784, "grad_norm": 0.012467121705412865, "learning_rate": 0.00044398472310630175, "loss": 0.3462, "step": 191 }, { "epoch": 0.12103542019620976, "grad_norm": 0.013101744465529919, "learning_rate": 0.00044366645448758754, "loss": 0.3397, "step": 192 }, { "epoch": 0.12166581300973169, "grad_norm": 0.014821134507656097, "learning_rate": 0.00044334818586887334, "loss": 0.3313, "step": 193 }, { "epoch": 0.12229620582325361, "grad_norm": 0.014528485015034676, "learning_rate": 0.00044302991725015913, "loss": 0.4186, "step": 194 }, { "epoch": 0.12292659863677555, "grad_norm": 0.014592241495847702, "learning_rate": 0.000442711648631445, "loss": 0.3473, "step": 195 }, { "epoch": 0.12355699145029747, "grad_norm": 0.013686472550034523, "learning_rate": 0.0004423933800127308, "loss": 0.3816, "step": 196 }, { "epoch": 0.1241873842638194, "grad_norm": 0.013242250308394432, "learning_rate": 0.00044207511139401657, "loss": 0.3491, "step": 197 }, { "epoch": 0.12481777707734132, "grad_norm": 0.013653567992150784, "learning_rate": 0.00044175684277530236, "loss": 0.4074, "step": 198 }, { "epoch": 0.12544816989086324, "grad_norm": 0.015144973993301392, "learning_rate": 0.00044143857415658816, "loss": 0.4756, "step": 199 }, { "epoch": 0.12607856270438517, "grad_norm": 0.015144536271691322, "learning_rate": 0.00044112030553787395, "loss": 0.3465, "step": 200 }, { "epoch": 0.12607856270438517, "eval_loss": 0.41307953000068665, "eval_runtime": 330.1069, "eval_samples_per_second": 3.029, "eval_steps_per_second": 3.029, "step": 200 }, { "epoch": 0.1267089555179071, "grad_norm": 0.015632444992661476, "learning_rate": 0.00044080203691915974, "loss": 0.4144, "step": 201 }, { "epoch": 0.12733934833142901, "grad_norm": 0.015394455753266811, "learning_rate": 0.00044048376830044554, "loss": 0.3735, "step": 202 }, { "epoch": 0.12796974114495094, "grad_norm": 0.0115371597930789, "learning_rate": 0.0004401654996817314, "loss": 0.3281, "step": 203 }, { "epoch": 0.12860013395847286, "grad_norm": 0.014215278439223766, "learning_rate": 0.00043984723106301724, "loss": 0.376, "step": 204 }, { "epoch": 0.12923052677199479, "grad_norm": 0.01447849627584219, "learning_rate": 0.00043952896244430303, "loss": 0.4058, "step": 205 }, { "epoch": 0.12986091958551674, "grad_norm": 0.01611350290477276, "learning_rate": 0.0004392106938255888, "loss": 0.4231, "step": 206 }, { "epoch": 0.13049131239903866, "grad_norm": 0.012788980267941952, "learning_rate": 0.0004388924252068746, "loss": 0.3983, "step": 207 }, { "epoch": 0.13112170521256059, "grad_norm": 0.013448570854961872, "learning_rate": 0.0004385741565881604, "loss": 0.3661, "step": 208 }, { "epoch": 0.1317520980260825, "grad_norm": 0.01911238394677639, "learning_rate": 0.0004382558879694462, "loss": 0.3771, "step": 209 }, { "epoch": 0.13238249083960443, "grad_norm": 0.015799477696418762, "learning_rate": 0.000437937619350732, "loss": 0.4074, "step": 210 }, { "epoch": 0.13301288365312636, "grad_norm": 0.012929645366966724, "learning_rate": 0.0004376193507320178, "loss": 0.3433, "step": 211 }, { "epoch": 0.13364327646664828, "grad_norm": 0.0152881545946002, "learning_rate": 0.00043730108211330364, "loss": 0.3711, "step": 212 }, { "epoch": 0.1342736692801702, "grad_norm": 0.014833944849669933, "learning_rate": 0.00043698281349458944, "loss": 0.3722, "step": 213 }, { "epoch": 0.13490406209369213, "grad_norm": 0.013883605599403381, "learning_rate": 0.00043666454487587523, "loss": 0.3473, "step": 214 }, { "epoch": 0.13553445490721405, "grad_norm": 0.014261720702052116, "learning_rate": 0.0004363462762571611, "loss": 0.4188, "step": 215 }, { "epoch": 0.13616484772073598, "grad_norm": 0.017078256234526634, "learning_rate": 0.0004360280076384469, "loss": 0.4365, "step": 216 }, { "epoch": 0.1367952405342579, "grad_norm": 0.011972522363066673, "learning_rate": 0.00043570973901973267, "loss": 0.3513, "step": 217 }, { "epoch": 0.13742563334777982, "grad_norm": 0.01542266272008419, "learning_rate": 0.00043539147040101846, "loss": 0.3978, "step": 218 }, { "epoch": 0.13805602616130175, "grad_norm": 0.014448809437453747, "learning_rate": 0.00043507320178230426, "loss": 0.3638, "step": 219 }, { "epoch": 0.1386864189748237, "grad_norm": 0.01470875646919012, "learning_rate": 0.0004347549331635901, "loss": 0.4013, "step": 220 }, { "epoch": 0.13931681178834562, "grad_norm": 0.0200630035251379, "learning_rate": 0.0004344366645448759, "loss": 0.434, "step": 221 }, { "epoch": 0.13994720460186755, "grad_norm": 0.014296353794634342, "learning_rate": 0.0004341183959261617, "loss": 0.4216, "step": 222 }, { "epoch": 0.14057759741538947, "grad_norm": 0.014910301193594933, "learning_rate": 0.0004338001273074475, "loss": 0.3848, "step": 223 }, { "epoch": 0.1412079902289114, "grad_norm": 0.0342477522790432, "learning_rate": 0.0004334818586887333, "loss": 0.4073, "step": 224 }, { "epoch": 0.14183838304243332, "grad_norm": 0.0111197829246521, "learning_rate": 0.0004331635900700191, "loss": 0.3446, "step": 225 }, { "epoch": 0.14246877585595524, "grad_norm": 0.010407987050712109, "learning_rate": 0.00043284532145130487, "loss": 0.3348, "step": 226 }, { "epoch": 0.14309916866947717, "grad_norm": 0.023615505546331406, "learning_rate": 0.0004325270528325907, "loss": 0.3777, "step": 227 }, { "epoch": 0.1437295614829991, "grad_norm": 0.013172394596040249, "learning_rate": 0.0004322087842138765, "loss": 0.4249, "step": 228 }, { "epoch": 0.14435995429652101, "grad_norm": 0.013465343974530697, "learning_rate": 0.00043189051559516236, "loss": 0.3973, "step": 229 }, { "epoch": 0.14499034711004294, "grad_norm": 0.013410159386694431, "learning_rate": 0.00043157224697644816, "loss": 0.3584, "step": 230 }, { "epoch": 0.14562073992356486, "grad_norm": 0.012107417918741703, "learning_rate": 0.00043125397835773395, "loss": 0.3689, "step": 231 }, { "epoch": 0.1462511327370868, "grad_norm": 0.01683788001537323, "learning_rate": 0.00043093570973901975, "loss": 0.4343, "step": 232 }, { "epoch": 0.1468815255506087, "grad_norm": 0.012769662775099277, "learning_rate": 0.00043061744112030554, "loss": 0.4102, "step": 233 }, { "epoch": 0.14751191836413066, "grad_norm": 0.014911843463778496, "learning_rate": 0.00043029917250159133, "loss": 0.3903, "step": 234 }, { "epoch": 0.14814231117765259, "grad_norm": 0.013946063816547394, "learning_rate": 0.00042998090388287713, "loss": 0.3761, "step": 235 }, { "epoch": 0.1487727039911745, "grad_norm": 0.016239028424024582, "learning_rate": 0.0004296626352641629, "loss": 0.4506, "step": 236 }, { "epoch": 0.14940309680469643, "grad_norm": 0.012469123117625713, "learning_rate": 0.00042934436664544877, "loss": 0.3719, "step": 237 }, { "epoch": 0.15003348961821836, "grad_norm": 0.014078752137720585, "learning_rate": 0.00042902609802673457, "loss": 0.3926, "step": 238 }, { "epoch": 0.15066388243174028, "grad_norm": 0.015152424573898315, "learning_rate": 0.0004287078294080204, "loss": 0.4045, "step": 239 }, { "epoch": 0.1512942752452622, "grad_norm": 0.013733173720538616, "learning_rate": 0.0004283895607893062, "loss": 0.379, "step": 240 }, { "epoch": 0.15192466805878413, "grad_norm": 0.014466444961726665, "learning_rate": 0.000428071292170592, "loss": 0.351, "step": 241 }, { "epoch": 0.15255506087230605, "grad_norm": 0.021267933771014214, "learning_rate": 0.0004277530235518778, "loss": 0.3572, "step": 242 }, { "epoch": 0.15318545368582798, "grad_norm": 0.01361842080950737, "learning_rate": 0.0004274347549331636, "loss": 0.4164, "step": 243 }, { "epoch": 0.1538158464993499, "grad_norm": 0.01527059730142355, "learning_rate": 0.0004271164863144494, "loss": 0.4427, "step": 244 }, { "epoch": 0.15444623931287182, "grad_norm": 0.015042467974126339, "learning_rate": 0.00042679821769573523, "loss": 0.4305, "step": 245 }, { "epoch": 0.15507663212639375, "grad_norm": 0.015271289274096489, "learning_rate": 0.00042647994907702103, "loss": 0.4366, "step": 246 }, { "epoch": 0.15570702493991567, "grad_norm": 0.013475016690790653, "learning_rate": 0.0004261616804583068, "loss": 0.3424, "step": 247 }, { "epoch": 0.15633741775343762, "grad_norm": 0.015119305811822414, "learning_rate": 0.0004258434118395926, "loss": 0.3536, "step": 248 }, { "epoch": 0.15696781056695955, "grad_norm": 0.013678413815796375, "learning_rate": 0.0004255251432208784, "loss": 0.3623, "step": 249 }, { "epoch": 0.15759820338048147, "grad_norm": 0.014214683324098587, "learning_rate": 0.0004252068746021642, "loss": 0.3584, "step": 250 }, { "epoch": 0.1582285961940034, "grad_norm": 0.03388328105211258, "learning_rate": 0.00042488860598345005, "loss": 0.4282, "step": 251 }, { "epoch": 0.15885898900752532, "grad_norm": 0.014387454837560654, "learning_rate": 0.00042457033736473585, "loss": 0.446, "step": 252 }, { "epoch": 0.15948938182104724, "grad_norm": 0.014452244155108929, "learning_rate": 0.00042425206874602164, "loss": 0.3854, "step": 253 }, { "epoch": 0.16011977463456917, "grad_norm": 0.02159762755036354, "learning_rate": 0.0004239338001273075, "loss": 0.4301, "step": 254 }, { "epoch": 0.1607501674480911, "grad_norm": 0.015262540429830551, "learning_rate": 0.0004236155315085933, "loss": 0.4473, "step": 255 }, { "epoch": 0.16138056026161302, "grad_norm": 0.01352153904736042, "learning_rate": 0.0004232972628898791, "loss": 0.3873, "step": 256 }, { "epoch": 0.16201095307513494, "grad_norm": 0.014292595908045769, "learning_rate": 0.00042297899427116487, "loss": 0.4366, "step": 257 }, { "epoch": 0.16264134588865686, "grad_norm": 0.013251558877527714, "learning_rate": 0.00042266072565245067, "loss": 0.4077, "step": 258 }, { "epoch": 0.1632717387021788, "grad_norm": 0.01339180301874876, "learning_rate": 0.00042234245703373646, "loss": 0.3991, "step": 259 }, { "epoch": 0.1639021315157007, "grad_norm": 0.014488326385617256, "learning_rate": 0.00042202418841502226, "loss": 0.379, "step": 260 }, { "epoch": 0.16453252432922263, "grad_norm": 0.01568019762635231, "learning_rate": 0.00042170591979630805, "loss": 0.4067, "step": 261 }, { "epoch": 0.1651629171427446, "grad_norm": 0.015658877789974213, "learning_rate": 0.0004213876511775939, "loss": 0.4342, "step": 262 }, { "epoch": 0.1657933099562665, "grad_norm": 0.014110025949776173, "learning_rate": 0.00042106938255887975, "loss": 0.4089, "step": 263 }, { "epoch": 0.16642370276978843, "grad_norm": 0.014314618892967701, "learning_rate": 0.00042075111394016554, "loss": 0.3328, "step": 264 }, { "epoch": 0.16705409558331036, "grad_norm": 0.014165842905640602, "learning_rate": 0.00042043284532145134, "loss": 0.3921, "step": 265 }, { "epoch": 0.16768448839683228, "grad_norm": 0.01382354460656643, "learning_rate": 0.00042011457670273713, "loss": 0.3568, "step": 266 }, { "epoch": 0.1683148812103542, "grad_norm": 0.017397115007042885, "learning_rate": 0.0004197963080840229, "loss": 0.4122, "step": 267 }, { "epoch": 0.16894527402387613, "grad_norm": 0.014652502723038197, "learning_rate": 0.0004194780394653087, "loss": 0.3855, "step": 268 }, { "epoch": 0.16957566683739805, "grad_norm": 0.01567262038588524, "learning_rate": 0.0004191597708465945, "loss": 0.4256, "step": 269 }, { "epoch": 0.17020605965091998, "grad_norm": 0.018886640667915344, "learning_rate": 0.00041884150222788036, "loss": 0.4843, "step": 270 }, { "epoch": 0.1708364524644419, "grad_norm": 0.013433393090963364, "learning_rate": 0.00041852323360916615, "loss": 0.386, "step": 271 }, { "epoch": 0.17146684527796383, "grad_norm": 0.016310589388012886, "learning_rate": 0.00041820496499045195, "loss": 0.4496, "step": 272 }, { "epoch": 0.17209723809148575, "grad_norm": 0.014000658877193928, "learning_rate": 0.00041788669637173774, "loss": 0.3786, "step": 273 }, { "epoch": 0.17272763090500767, "grad_norm": 0.017284424975514412, "learning_rate": 0.00041756842775302354, "loss": 0.4435, "step": 274 }, { "epoch": 0.1733580237185296, "grad_norm": 0.013865966349840164, "learning_rate": 0.0004172501591343094, "loss": 0.4335, "step": 275 }, { "epoch": 0.17398841653205152, "grad_norm": 0.014749307185411453, "learning_rate": 0.0004169318905155952, "loss": 0.4349, "step": 276 }, { "epoch": 0.17461880934557347, "grad_norm": 0.018386784940958023, "learning_rate": 0.000416613621896881, "loss": 0.4645, "step": 277 }, { "epoch": 0.1752492021590954, "grad_norm": 0.014194587245583534, "learning_rate": 0.00041629535327816677, "loss": 0.3696, "step": 278 }, { "epoch": 0.17587959497261732, "grad_norm": 0.013392159715294838, "learning_rate": 0.0004159770846594526, "loss": 0.2991, "step": 279 }, { "epoch": 0.17650998778613924, "grad_norm": 0.015150287188589573, "learning_rate": 0.0004156588160407384, "loss": 0.3924, "step": 280 }, { "epoch": 0.17714038059966117, "grad_norm": 0.016818981617689133, "learning_rate": 0.0004153405474220242, "loss": 0.4133, "step": 281 }, { "epoch": 0.1777707734131831, "grad_norm": 0.013639618642628193, "learning_rate": 0.00041502227880331, "loss": 0.3, "step": 282 }, { "epoch": 0.17840116622670502, "grad_norm": 0.01360489334911108, "learning_rate": 0.0004147040101845958, "loss": 0.3473, "step": 283 }, { "epoch": 0.17903155904022694, "grad_norm": 0.013170013204216957, "learning_rate": 0.0004143857415658816, "loss": 0.353, "step": 284 }, { "epoch": 0.17966195185374886, "grad_norm": 0.014665192924439907, "learning_rate": 0.0004140674729471674, "loss": 0.4581, "step": 285 }, { "epoch": 0.1802923446672708, "grad_norm": 0.016468839719891548, "learning_rate": 0.0004137492043284532, "loss": 0.4145, "step": 286 }, { "epoch": 0.1809227374807927, "grad_norm": 0.012054849416017532, "learning_rate": 0.0004134309357097391, "loss": 0.3887, "step": 287 }, { "epoch": 0.18155313029431464, "grad_norm": 0.016757695004343987, "learning_rate": 0.0004131126670910249, "loss": 0.391, "step": 288 }, { "epoch": 0.18218352310783656, "grad_norm": 0.013843930326402187, "learning_rate": 0.00041279439847231067, "loss": 0.3954, "step": 289 }, { "epoch": 0.18281391592135848, "grad_norm": 0.014795885421335697, "learning_rate": 0.00041247612985359646, "loss": 0.4613, "step": 290 }, { "epoch": 0.18344430873488043, "grad_norm": 0.014655854552984238, "learning_rate": 0.00041215786123488226, "loss": 0.4217, "step": 291 }, { "epoch": 0.18407470154840236, "grad_norm": 0.014011659659445286, "learning_rate": 0.00041183959261616805, "loss": 0.3924, "step": 292 }, { "epoch": 0.18470509436192428, "grad_norm": 0.013722460716962814, "learning_rate": 0.00041152132399745384, "loss": 0.409, "step": 293 }, { "epoch": 0.1853354871754462, "grad_norm": 0.011710972525179386, "learning_rate": 0.00041120305537873964, "loss": 0.4047, "step": 294 }, { "epoch": 0.18596587998896813, "grad_norm": 0.015348844230175018, "learning_rate": 0.0004108847867600255, "loss": 0.4378, "step": 295 }, { "epoch": 0.18659627280249005, "grad_norm": 0.013804617337882519, "learning_rate": 0.0004105665181413113, "loss": 0.3739, "step": 296 }, { "epoch": 0.18722666561601198, "grad_norm": 0.013841480948030949, "learning_rate": 0.0004102482495225971, "loss": 0.3578, "step": 297 }, { "epoch": 0.1878570584295339, "grad_norm": 0.013800901360809803, "learning_rate": 0.00040992998090388287, "loss": 0.3601, "step": 298 }, { "epoch": 0.18848745124305583, "grad_norm": 0.013804626651108265, "learning_rate": 0.0004096117122851687, "loss": 0.4295, "step": 299 }, { "epoch": 0.18911784405657775, "grad_norm": 0.014350210316479206, "learning_rate": 0.0004092934436664545, "loss": 0.4234, "step": 300 }, { "epoch": 0.18974823687009967, "grad_norm": 0.013308697380125523, "learning_rate": 0.0004089751750477403, "loss": 0.3542, "step": 301 }, { "epoch": 0.1903786296836216, "grad_norm": 0.012164046987891197, "learning_rate": 0.0004086569064290261, "loss": 0.3647, "step": 302 }, { "epoch": 0.19100902249714352, "grad_norm": 0.014454986900091171, "learning_rate": 0.0004083386378103119, "loss": 0.4005, "step": 303 }, { "epoch": 0.19163941531066545, "grad_norm": 0.016729174181818962, "learning_rate": 0.00040802036919159774, "loss": 0.4422, "step": 304 }, { "epoch": 0.1922698081241874, "grad_norm": 0.013259264640510082, "learning_rate": 0.00040770210057288354, "loss": 0.3565, "step": 305 }, { "epoch": 0.19290020093770932, "grad_norm": 0.014306127093732357, "learning_rate": 0.00040738383195416933, "loss": 0.342, "step": 306 }, { "epoch": 0.19353059375123124, "grad_norm": 0.01319255493581295, "learning_rate": 0.0004070655633354551, "loss": 0.3539, "step": 307 }, { "epoch": 0.19416098656475317, "grad_norm": 0.015078844502568245, "learning_rate": 0.0004067472947167409, "loss": 0.4289, "step": 308 }, { "epoch": 0.1947913793782751, "grad_norm": 0.014904836192727089, "learning_rate": 0.0004064290260980267, "loss": 0.4057, "step": 309 }, { "epoch": 0.19542177219179702, "grad_norm": 0.012999248690903187, "learning_rate": 0.0004061107574793125, "loss": 0.316, "step": 310 }, { "epoch": 0.19605216500531894, "grad_norm": 0.013548512943089008, "learning_rate": 0.0004057924888605983, "loss": 0.3334, "step": 311 }, { "epoch": 0.19668255781884086, "grad_norm": 0.014333848841488361, "learning_rate": 0.0004054742202418842, "loss": 0.3819, "step": 312 }, { "epoch": 0.1973129506323628, "grad_norm": 0.012575473636388779, "learning_rate": 0.00040515595162317, "loss": 0.356, "step": 313 }, { "epoch": 0.1979433434458847, "grad_norm": 0.01617979258298874, "learning_rate": 0.0004048376830044558, "loss": 0.4421, "step": 314 }, { "epoch": 0.19857373625940664, "grad_norm": 0.017318615689873695, "learning_rate": 0.0004045194143857416, "loss": 0.4701, "step": 315 }, { "epoch": 0.19920412907292856, "grad_norm": 0.014478931203484535, "learning_rate": 0.0004042011457670274, "loss": 0.3934, "step": 316 }, { "epoch": 0.19983452188645048, "grad_norm": 0.013659725897014141, "learning_rate": 0.0004038828771483132, "loss": 0.3872, "step": 317 }, { "epoch": 0.2004649146999724, "grad_norm": 0.015055459924042225, "learning_rate": 0.00040356460852959897, "loss": 0.3689, "step": 318 }, { "epoch": 0.20109530751349436, "grad_norm": 0.015268666669726372, "learning_rate": 0.00040324633991088477, "loss": 0.4071, "step": 319 }, { "epoch": 0.20172570032701628, "grad_norm": 0.015038284473121166, "learning_rate": 0.00040292807129217056, "loss": 0.4231, "step": 320 }, { "epoch": 0.2023560931405382, "grad_norm": 0.013972152024507523, "learning_rate": 0.0004026098026734564, "loss": 0.4024, "step": 321 }, { "epoch": 0.20298648595406013, "grad_norm": 0.014929687604308128, "learning_rate": 0.0004022915340547422, "loss": 0.4414, "step": 322 }, { "epoch": 0.20361687876758205, "grad_norm": 0.01580497995018959, "learning_rate": 0.000401973265436028, "loss": 0.4082, "step": 323 }, { "epoch": 0.20424727158110398, "grad_norm": 0.011186016723513603, "learning_rate": 0.00040165499681731385, "loss": 0.3307, "step": 324 }, { "epoch": 0.2048776643946259, "grad_norm": 0.014356723055243492, "learning_rate": 0.00040133672819859964, "loss": 0.3379, "step": 325 }, { "epoch": 0.20550805720814783, "grad_norm": 0.014592788182199001, "learning_rate": 0.00040101845957988543, "loss": 0.3889, "step": 326 }, { "epoch": 0.20613845002166975, "grad_norm": 0.015223313122987747, "learning_rate": 0.00040070019096117123, "loss": 0.3837, "step": 327 }, { "epoch": 0.20676884283519167, "grad_norm": 0.016272133216261864, "learning_rate": 0.000400381922342457, "loss": 0.437, "step": 328 }, { "epoch": 0.2073992356487136, "grad_norm": 0.01557878777384758, "learning_rate": 0.00040006365372374287, "loss": 0.3685, "step": 329 }, { "epoch": 0.20802962846223552, "grad_norm": 0.014276874251663685, "learning_rate": 0.00039974538510502867, "loss": 0.4079, "step": 330 }, { "epoch": 0.20866002127575745, "grad_norm": 0.017942672595381737, "learning_rate": 0.00039942711648631446, "loss": 0.37, "step": 331 }, { "epoch": 0.20929041408927937, "grad_norm": 0.011981052346527576, "learning_rate": 0.00039910884786760025, "loss": 0.3682, "step": 332 }, { "epoch": 0.20992080690280132, "grad_norm": 0.01741514913737774, "learning_rate": 0.00039879057924888605, "loss": 0.4268, "step": 333 }, { "epoch": 0.21055119971632325, "grad_norm": 0.015648547559976578, "learning_rate": 0.00039847231063017184, "loss": 0.3903, "step": 334 }, { "epoch": 0.21118159252984517, "grad_norm": 0.011143681593239307, "learning_rate": 0.00039815404201145764, "loss": 0.2943, "step": 335 }, { "epoch": 0.2118119853433671, "grad_norm": 0.011384455487132072, "learning_rate": 0.0003978357733927435, "loss": 0.2741, "step": 336 }, { "epoch": 0.21244237815688902, "grad_norm": 0.016504229977726936, "learning_rate": 0.00039751750477402933, "loss": 0.4272, "step": 337 }, { "epoch": 0.21307277097041094, "grad_norm": 0.015527388080954552, "learning_rate": 0.00039719923615531513, "loss": 0.4121, "step": 338 }, { "epoch": 0.21370316378393286, "grad_norm": 0.01418632548302412, "learning_rate": 0.0003968809675366009, "loss": 0.3996, "step": 339 }, { "epoch": 0.2143335565974548, "grad_norm": 0.013540912419557571, "learning_rate": 0.0003965626989178867, "loss": 0.4081, "step": 340 }, { "epoch": 0.2149639494109767, "grad_norm": 0.014578346163034439, "learning_rate": 0.0003962444302991725, "loss": 0.4144, "step": 341 }, { "epoch": 0.21559434222449864, "grad_norm": 0.015624952502548695, "learning_rate": 0.0003959261616804583, "loss": 0.3777, "step": 342 }, { "epoch": 0.21622473503802056, "grad_norm": 0.023671504110097885, "learning_rate": 0.0003956078930617441, "loss": 0.4658, "step": 343 }, { "epoch": 0.21685512785154248, "grad_norm": 0.012896531261503696, "learning_rate": 0.0003952896244430299, "loss": 0.3261, "step": 344 }, { "epoch": 0.2174855206650644, "grad_norm": 0.013401571661233902, "learning_rate": 0.0003949713558243157, "loss": 0.3401, "step": 345 }, { "epoch": 0.21811591347858633, "grad_norm": 0.019168630242347717, "learning_rate": 0.00039465308720560154, "loss": 0.3726, "step": 346 }, { "epoch": 0.21874630629210828, "grad_norm": 0.015938937664031982, "learning_rate": 0.00039433481858688733, "loss": 0.4274, "step": 347 }, { "epoch": 0.2193766991056302, "grad_norm": 0.012024111114442348, "learning_rate": 0.0003940165499681732, "loss": 0.3304, "step": 348 }, { "epoch": 0.22000709191915213, "grad_norm": 0.015909144654870033, "learning_rate": 0.00039369828134945897, "loss": 0.4388, "step": 349 }, { "epoch": 0.22063748473267406, "grad_norm": 0.014128969050943851, "learning_rate": 0.00039338001273074477, "loss": 0.3943, "step": 350 }, { "epoch": 0.22126787754619598, "grad_norm": 0.012780380435287952, "learning_rate": 0.00039306174411203056, "loss": 0.3639, "step": 351 }, { "epoch": 0.2218982703597179, "grad_norm": 0.016777213662862778, "learning_rate": 0.00039274347549331636, "loss": 0.4845, "step": 352 }, { "epoch": 0.22252866317323983, "grad_norm": 0.016099069267511368, "learning_rate": 0.00039242520687460215, "loss": 0.3806, "step": 353 }, { "epoch": 0.22315905598676175, "grad_norm": 0.012046086601912975, "learning_rate": 0.000392106938255888, "loss": 0.4237, "step": 354 }, { "epoch": 0.22378944880028367, "grad_norm": 0.012734375894069672, "learning_rate": 0.0003917886696371738, "loss": 0.3577, "step": 355 }, { "epoch": 0.2244198416138056, "grad_norm": 0.013315641321241856, "learning_rate": 0.0003914704010184596, "loss": 0.3836, "step": 356 }, { "epoch": 0.22505023442732752, "grad_norm": 0.013257570564746857, "learning_rate": 0.0003911521323997454, "loss": 0.3764, "step": 357 }, { "epoch": 0.22568062724084945, "grad_norm": 0.013702357187867165, "learning_rate": 0.0003908338637810312, "loss": 0.3769, "step": 358 }, { "epoch": 0.22631102005437137, "grad_norm": 0.012227476574480534, "learning_rate": 0.00039051559516231697, "loss": 0.3401, "step": 359 }, { "epoch": 0.2269414128678933, "grad_norm": 0.013636661693453789, "learning_rate": 0.0003901973265436028, "loss": 0.3367, "step": 360 }, { "epoch": 0.22757180568141525, "grad_norm": 0.013034637086093426, "learning_rate": 0.0003898790579248886, "loss": 0.3877, "step": 361 }, { "epoch": 0.22820219849493717, "grad_norm": 0.015421271324157715, "learning_rate": 0.00038956078930617446, "loss": 0.3976, "step": 362 }, { "epoch": 0.2288325913084591, "grad_norm": 0.013457038439810276, "learning_rate": 0.00038924252068746025, "loss": 0.4016, "step": 363 }, { "epoch": 0.22946298412198102, "grad_norm": 0.018328607082366943, "learning_rate": 0.00038892425206874605, "loss": 0.4509, "step": 364 }, { "epoch": 0.23009337693550294, "grad_norm": 0.016547221690416336, "learning_rate": 0.00038860598345003184, "loss": 0.3931, "step": 365 }, { "epoch": 0.23072376974902487, "grad_norm": 0.01578708365559578, "learning_rate": 0.00038828771483131764, "loss": 0.3588, "step": 366 }, { "epoch": 0.2313541625625468, "grad_norm": 0.01346549578011036, "learning_rate": 0.00038796944621260343, "loss": 0.3456, "step": 367 }, { "epoch": 0.2319845553760687, "grad_norm": 0.013822048902511597, "learning_rate": 0.0003876511775938892, "loss": 0.3355, "step": 368 }, { "epoch": 0.23261494818959064, "grad_norm": 0.015959620475769043, "learning_rate": 0.000387332908975175, "loss": 0.3684, "step": 369 }, { "epoch": 0.23324534100311256, "grad_norm": 0.013329599052667618, "learning_rate": 0.0003870146403564608, "loss": 0.3877, "step": 370 }, { "epoch": 0.23387573381663448, "grad_norm": 0.01492052711546421, "learning_rate": 0.00038669637173774666, "loss": 0.3328, "step": 371 }, { "epoch": 0.2345061266301564, "grad_norm": 0.018554024398326874, "learning_rate": 0.0003863781031190325, "loss": 0.4427, "step": 372 }, { "epoch": 0.23513651944367833, "grad_norm": 0.01493624597787857, "learning_rate": 0.0003860598345003183, "loss": 0.3627, "step": 373 }, { "epoch": 0.23576691225720026, "grad_norm": 0.012036562897264957, "learning_rate": 0.0003857415658816041, "loss": 0.3668, "step": 374 }, { "epoch": 0.23639730507072218, "grad_norm": 0.0130785396322608, "learning_rate": 0.0003854232972628899, "loss": 0.3817, "step": 375 }, { "epoch": 0.23702769788424413, "grad_norm": 0.012362291105091572, "learning_rate": 0.0003851050286441757, "loss": 0.3332, "step": 376 }, { "epoch": 0.23765809069776606, "grad_norm": 0.01651920937001705, "learning_rate": 0.0003847867600254615, "loss": 0.3827, "step": 377 }, { "epoch": 0.23828848351128798, "grad_norm": 0.016950994729995728, "learning_rate": 0.0003844684914067473, "loss": 0.3632, "step": 378 }, { "epoch": 0.2389188763248099, "grad_norm": 0.013989094644784927, "learning_rate": 0.0003841502227880331, "loss": 0.3697, "step": 379 }, { "epoch": 0.23954926913833183, "grad_norm": 0.013628944754600525, "learning_rate": 0.0003838319541693189, "loss": 0.3604, "step": 380 }, { "epoch": 0.24017966195185375, "grad_norm": 0.013745056465268135, "learning_rate": 0.0003835136855506047, "loss": 0.3398, "step": 381 }, { "epoch": 0.24081005476537568, "grad_norm": 0.0171502735465765, "learning_rate": 0.0003831954169318905, "loss": 0.3828, "step": 382 }, { "epoch": 0.2414404475788976, "grad_norm": 0.016452766954898834, "learning_rate": 0.0003828771483131763, "loss": 0.3639, "step": 383 }, { "epoch": 0.24207084039241952, "grad_norm": 0.014097441919147968, "learning_rate": 0.00038255887969446215, "loss": 0.3624, "step": 384 }, { "epoch": 0.24270123320594145, "grad_norm": 0.012694849632680416, "learning_rate": 0.00038224061107574794, "loss": 0.3567, "step": 385 }, { "epoch": 0.24333162601946337, "grad_norm": 0.013436023145914078, "learning_rate": 0.00038192234245703374, "loss": 0.3554, "step": 386 }, { "epoch": 0.2439620188329853, "grad_norm": 0.014878401532769203, "learning_rate": 0.0003816040738383196, "loss": 0.4008, "step": 387 }, { "epoch": 0.24459241164650722, "grad_norm": 0.016384709626436234, "learning_rate": 0.0003812858052196054, "loss": 0.4641, "step": 388 }, { "epoch": 0.24522280446002914, "grad_norm": 0.015883326530456543, "learning_rate": 0.0003809675366008912, "loss": 0.4222, "step": 389 }, { "epoch": 0.2458531972735511, "grad_norm": 0.013413279317319393, "learning_rate": 0.00038064926798217697, "loss": 0.3462, "step": 390 }, { "epoch": 0.24648359008707302, "grad_norm": 0.012388180941343307, "learning_rate": 0.00038033099936346276, "loss": 0.3138, "step": 391 }, { "epoch": 0.24711398290059494, "grad_norm": 0.013723078183829784, "learning_rate": 0.00038001273074474856, "loss": 0.3564, "step": 392 }, { "epoch": 0.24774437571411687, "grad_norm": 0.02148633450269699, "learning_rate": 0.00037969446212603435, "loss": 0.4955, "step": 393 }, { "epoch": 0.2483747685276388, "grad_norm": 0.014449981041252613, "learning_rate": 0.00037937619350732015, "loss": 0.3593, "step": 394 }, { "epoch": 0.2490051613411607, "grad_norm": 0.015688324347138405, "learning_rate": 0.00037905792488860594, "loss": 0.3738, "step": 395 }, { "epoch": 0.24963555415468264, "grad_norm": 0.015875380486249924, "learning_rate": 0.00037873965626989184, "loss": 0.3317, "step": 396 }, { "epoch": 0.2502659469682046, "grad_norm": 0.014019105583429337, "learning_rate": 0.00037842138765117764, "loss": 0.3545, "step": 397 }, { "epoch": 0.2508963397817265, "grad_norm": 0.01300369668751955, "learning_rate": 0.00037810311903246343, "loss": 0.3438, "step": 398 }, { "epoch": 0.25152673259524844, "grad_norm": 0.015232580713927746, "learning_rate": 0.0003777848504137492, "loss": 0.3112, "step": 399 }, { "epoch": 0.25215712540877033, "grad_norm": 0.01374281570315361, "learning_rate": 0.000377466581795035, "loss": 0.3294, "step": 400 }, { "epoch": 0.25215712540877033, "eval_loss": 0.40224653482437134, "eval_runtime": 327.6135, "eval_samples_per_second": 3.052, "eval_steps_per_second": 3.052, "step": 400 }, { "epoch": 0.2527875182222923, "grad_norm": 0.01640431396663189, "learning_rate": 0.0003771483131763208, "loss": 0.4272, "step": 401 }, { "epoch": 0.2534179110358142, "grad_norm": 0.012076716870069504, "learning_rate": 0.0003768300445576066, "loss": 0.3272, "step": 402 }, { "epoch": 0.25404830384933613, "grad_norm": 0.014165197499096394, "learning_rate": 0.0003765117759388924, "loss": 0.4131, "step": 403 }, { "epoch": 0.25467869666285803, "grad_norm": 0.1178860291838646, "learning_rate": 0.00037619350732017825, "loss": 0.3604, "step": 404 }, { "epoch": 0.25530908947638, "grad_norm": 0.014928080141544342, "learning_rate": 0.00037587523870146405, "loss": 0.4383, "step": 405 }, { "epoch": 0.2559394822899019, "grad_norm": 0.011646539904177189, "learning_rate": 0.00037555697008274984, "loss": 0.3274, "step": 406 }, { "epoch": 0.25656987510342383, "grad_norm": 0.012327929027378559, "learning_rate": 0.00037523870146403564, "loss": 0.313, "step": 407 }, { "epoch": 0.2572002679169457, "grad_norm": 0.015938373282551765, "learning_rate": 0.0003749204328453215, "loss": 0.3719, "step": 408 }, { "epoch": 0.2578306607304677, "grad_norm": 0.014185669831931591, "learning_rate": 0.0003746021642266073, "loss": 0.3749, "step": 409 }, { "epoch": 0.25846105354398957, "grad_norm": 0.014131504110991955, "learning_rate": 0.00037428389560789307, "loss": 0.387, "step": 410 }, { "epoch": 0.2590914463575115, "grad_norm": 0.015502029098570347, "learning_rate": 0.00037396562698917887, "loss": 0.3892, "step": 411 }, { "epoch": 0.2597218391710335, "grad_norm": 0.015715405344963074, "learning_rate": 0.0003736473583704647, "loss": 0.4097, "step": 412 }, { "epoch": 0.26035223198455537, "grad_norm": 0.014969248324632645, "learning_rate": 0.0003733290897517505, "loss": 0.3387, "step": 413 }, { "epoch": 0.2609826247980773, "grad_norm": 0.016265733167529106, "learning_rate": 0.0003730108211330363, "loss": 0.3704, "step": 414 }, { "epoch": 0.2616130176115992, "grad_norm": 0.0166942048817873, "learning_rate": 0.0003726925525143221, "loss": 0.3649, "step": 415 }, { "epoch": 0.26224341042512117, "grad_norm": 0.015606733039021492, "learning_rate": 0.0003723742838956079, "loss": 0.3688, "step": 416 }, { "epoch": 0.26287380323864307, "grad_norm": 0.015840278938412666, "learning_rate": 0.0003720560152768937, "loss": 0.402, "step": 417 }, { "epoch": 0.263504196052165, "grad_norm": 0.01386589277535677, "learning_rate": 0.0003717377466581795, "loss": 0.3324, "step": 418 }, { "epoch": 0.2641345888656869, "grad_norm": 0.016753733158111572, "learning_rate": 0.0003714194780394653, "loss": 0.4043, "step": 419 }, { "epoch": 0.26476498167920887, "grad_norm": 0.01746310293674469, "learning_rate": 0.0003711012094207511, "loss": 0.4119, "step": 420 }, { "epoch": 0.26539537449273076, "grad_norm": 0.019259415566921234, "learning_rate": 0.00037078294080203697, "loss": 0.3862, "step": 421 }, { "epoch": 0.2660257673062527, "grad_norm": 0.012872149236500263, "learning_rate": 0.00037046467218332277, "loss": 0.3526, "step": 422 }, { "epoch": 0.2666561601197746, "grad_norm": 0.013280967250466347, "learning_rate": 0.00037014640356460856, "loss": 0.3874, "step": 423 }, { "epoch": 0.26728655293329656, "grad_norm": 0.01718621514737606, "learning_rate": 0.00036982813494589435, "loss": 0.4035, "step": 424 }, { "epoch": 0.2679169457468185, "grad_norm": 0.01277583371847868, "learning_rate": 0.00036950986632718015, "loss": 0.3128, "step": 425 }, { "epoch": 0.2685473385603404, "grad_norm": 0.017392504960298538, "learning_rate": 0.00036919159770846594, "loss": 0.3483, "step": 426 }, { "epoch": 0.26917773137386236, "grad_norm": 0.014808950014412403, "learning_rate": 0.00036887332908975174, "loss": 0.3695, "step": 427 }, { "epoch": 0.26980812418738426, "grad_norm": 0.014155060052871704, "learning_rate": 0.00036855506047103753, "loss": 0.3959, "step": 428 }, { "epoch": 0.2704385170009062, "grad_norm": 0.016141362488269806, "learning_rate": 0.0003682367918523234, "loss": 0.4448, "step": 429 }, { "epoch": 0.2710689098144281, "grad_norm": 0.012797849252820015, "learning_rate": 0.0003679185232336092, "loss": 0.2993, "step": 430 }, { "epoch": 0.27169930262795006, "grad_norm": 0.016916556283831596, "learning_rate": 0.00036760025461489497, "loss": 0.4443, "step": 431 }, { "epoch": 0.27232969544147195, "grad_norm": 0.01740068756043911, "learning_rate": 0.0003672819859961808, "loss": 0.3954, "step": 432 }, { "epoch": 0.2729600882549939, "grad_norm": 0.015298452228307724, "learning_rate": 0.0003669637173774666, "loss": 0.4, "step": 433 }, { "epoch": 0.2735904810685158, "grad_norm": 0.016451654955744743, "learning_rate": 0.0003666454487587524, "loss": 0.3681, "step": 434 }, { "epoch": 0.27422087388203775, "grad_norm": 0.01578078418970108, "learning_rate": 0.0003663271801400382, "loss": 0.4012, "step": 435 }, { "epoch": 0.27485126669555965, "grad_norm": 0.01636829599738121, "learning_rate": 0.000366008911521324, "loss": 0.4104, "step": 436 }, { "epoch": 0.2754816595090816, "grad_norm": 0.015866460278630257, "learning_rate": 0.00036569064290260984, "loss": 0.406, "step": 437 }, { "epoch": 0.2761120523226035, "grad_norm": 0.017710495740175247, "learning_rate": 0.00036537237428389564, "loss": 0.4282, "step": 438 }, { "epoch": 0.27674244513612545, "grad_norm": 0.014686573296785355, "learning_rate": 0.00036505410566518143, "loss": 0.4058, "step": 439 }, { "epoch": 0.2773728379496474, "grad_norm": 0.01301991194486618, "learning_rate": 0.0003647358370464672, "loss": 0.3254, "step": 440 }, { "epoch": 0.2780032307631693, "grad_norm": 0.014194956049323082, "learning_rate": 0.000364417568427753, "loss": 0.3656, "step": 441 }, { "epoch": 0.27863362357669125, "grad_norm": 0.014603289775550365, "learning_rate": 0.0003640992998090388, "loss": 0.4253, "step": 442 }, { "epoch": 0.27926401639021314, "grad_norm": 0.01672372967004776, "learning_rate": 0.0003637810311903246, "loss": 0.3715, "step": 443 }, { "epoch": 0.2798944092037351, "grad_norm": 0.014991686679422855, "learning_rate": 0.0003634627625716104, "loss": 0.3256, "step": 444 }, { "epoch": 0.280524802017257, "grad_norm": 0.013302955776453018, "learning_rate": 0.00036314449395289625, "loss": 0.3292, "step": 445 }, { "epoch": 0.28115519483077894, "grad_norm": 0.014854427427053452, "learning_rate": 0.0003628262253341821, "loss": 0.38, "step": 446 }, { "epoch": 0.28178558764430084, "grad_norm": 0.014079076237976551, "learning_rate": 0.0003625079567154679, "loss": 0.3751, "step": 447 }, { "epoch": 0.2824159804578228, "grad_norm": 0.016096465289592743, "learning_rate": 0.0003621896880967537, "loss": 0.4078, "step": 448 }, { "epoch": 0.2830463732713447, "grad_norm": 0.012877153232693672, "learning_rate": 0.0003618714194780395, "loss": 0.3733, "step": 449 }, { "epoch": 0.28367676608486664, "grad_norm": 0.015885455533862114, "learning_rate": 0.0003615531508593253, "loss": 0.4038, "step": 450 }, { "epoch": 0.28430715889838853, "grad_norm": 0.014803246594965458, "learning_rate": 0.00036123488224061107, "loss": 0.393, "step": 451 }, { "epoch": 0.2849375517119105, "grad_norm": 0.013332175090909004, "learning_rate": 0.00036091661362189686, "loss": 0.3553, "step": 452 }, { "epoch": 0.28556794452543244, "grad_norm": 0.014596902765333652, "learning_rate": 0.00036059834500318266, "loss": 0.4522, "step": 453 }, { "epoch": 0.28619833733895433, "grad_norm": 0.018029138445854187, "learning_rate": 0.0003602800763844685, "loss": 0.408, "step": 454 }, { "epoch": 0.2868287301524763, "grad_norm": 0.015817707404494286, "learning_rate": 0.0003599618077657543, "loss": 0.3575, "step": 455 }, { "epoch": 0.2874591229659982, "grad_norm": 0.015697741881012917, "learning_rate": 0.0003596435391470401, "loss": 0.3877, "step": 456 }, { "epoch": 0.28808951577952013, "grad_norm": 0.01666218787431717, "learning_rate": 0.00035932527052832594, "loss": 0.4592, "step": 457 }, { "epoch": 0.28871990859304203, "grad_norm": 0.014196937903761864, "learning_rate": 0.00035900700190961174, "loss": 0.3792, "step": 458 }, { "epoch": 0.289350301406564, "grad_norm": 0.013346925377845764, "learning_rate": 0.00035868873329089753, "loss": 0.3824, "step": 459 }, { "epoch": 0.2899806942200859, "grad_norm": 0.012980618514120579, "learning_rate": 0.0003583704646721833, "loss": 0.3205, "step": 460 }, { "epoch": 0.29061108703360783, "grad_norm": 0.014710046350955963, "learning_rate": 0.0003580521960534691, "loss": 0.3948, "step": 461 }, { "epoch": 0.2912414798471297, "grad_norm": 0.012661518529057503, "learning_rate": 0.0003577339274347549, "loss": 0.3428, "step": 462 }, { "epoch": 0.2918718726606517, "grad_norm": 0.012466810643672943, "learning_rate": 0.00035741565881604076, "loss": 0.3238, "step": 463 }, { "epoch": 0.2925022654741736, "grad_norm": 0.01629483513534069, "learning_rate": 0.00035709739019732656, "loss": 0.4249, "step": 464 }, { "epoch": 0.2931326582876955, "grad_norm": 0.012584395706653595, "learning_rate": 0.00035677912157861235, "loss": 0.2885, "step": 465 }, { "epoch": 0.2937630511012174, "grad_norm": 0.014844930730760098, "learning_rate": 0.00035646085295989815, "loss": 0.3882, "step": 466 }, { "epoch": 0.29439344391473937, "grad_norm": 0.01584932580590248, "learning_rate": 0.00035614258434118394, "loss": 0.3608, "step": 467 }, { "epoch": 0.2950238367282613, "grad_norm": 0.022870469838380814, "learning_rate": 0.00035582431572246973, "loss": 0.3279, "step": 468 }, { "epoch": 0.2956542295417832, "grad_norm": 0.014993885532021523, "learning_rate": 0.0003555060471037556, "loss": 0.4063, "step": 469 }, { "epoch": 0.29628462235530517, "grad_norm": 0.0162053219974041, "learning_rate": 0.0003551877784850414, "loss": 0.4124, "step": 470 }, { "epoch": 0.29691501516882707, "grad_norm": 0.014932282269001007, "learning_rate": 0.0003548695098663272, "loss": 0.4027, "step": 471 }, { "epoch": 0.297545407982349, "grad_norm": 0.015680935233831406, "learning_rate": 0.000354551241247613, "loss": 0.3631, "step": 472 }, { "epoch": 0.2981758007958709, "grad_norm": 0.016389349475502968, "learning_rate": 0.0003542329726288988, "loss": 0.3359, "step": 473 }, { "epoch": 0.29880619360939287, "grad_norm": 0.01533939316868782, "learning_rate": 0.0003539147040101846, "loss": 0.3521, "step": 474 }, { "epoch": 0.29943658642291476, "grad_norm": 0.012865236960351467, "learning_rate": 0.0003535964353914704, "loss": 0.3744, "step": 475 }, { "epoch": 0.3000669792364367, "grad_norm": 0.015376793220639229, "learning_rate": 0.0003532781667727562, "loss": 0.3961, "step": 476 }, { "epoch": 0.3006973720499586, "grad_norm": 0.014606118202209473, "learning_rate": 0.000352959898154042, "loss": 0.3499, "step": 477 }, { "epoch": 0.30132776486348056, "grad_norm": 0.016589732840657234, "learning_rate": 0.0003526416295353278, "loss": 0.4044, "step": 478 }, { "epoch": 0.30195815767700246, "grad_norm": 0.012733055278658867, "learning_rate": 0.00035232336091661363, "loss": 0.3219, "step": 479 }, { "epoch": 0.3025885504905244, "grad_norm": 0.015043248422443867, "learning_rate": 0.00035200509229789943, "loss": 0.3419, "step": 480 }, { "epoch": 0.3032189433040463, "grad_norm": 0.01554787252098322, "learning_rate": 0.0003516868236791853, "loss": 0.3591, "step": 481 }, { "epoch": 0.30384933611756826, "grad_norm": 0.013924339786171913, "learning_rate": 0.00035136855506047107, "loss": 0.3076, "step": 482 }, { "epoch": 0.3044797289310902, "grad_norm": 0.015237972140312195, "learning_rate": 0.00035105028644175686, "loss": 0.3295, "step": 483 }, { "epoch": 0.3051101217446121, "grad_norm": 0.014525455422699451, "learning_rate": 0.00035073201782304266, "loss": 0.3559, "step": 484 }, { "epoch": 0.30574051455813406, "grad_norm": 0.015502218157052994, "learning_rate": 0.00035041374920432845, "loss": 0.3772, "step": 485 }, { "epoch": 0.30637090737165595, "grad_norm": 0.013454561121761799, "learning_rate": 0.00035009548058561425, "loss": 0.3468, "step": 486 }, { "epoch": 0.3070013001851779, "grad_norm": 0.01740995980799198, "learning_rate": 0.00034977721196690004, "loss": 0.4195, "step": 487 }, { "epoch": 0.3076316929986998, "grad_norm": 0.013777488842606544, "learning_rate": 0.0003494589433481859, "loss": 0.3861, "step": 488 }, { "epoch": 0.30826208581222175, "grad_norm": 0.01931867189705372, "learning_rate": 0.0003491406747294717, "loss": 0.4113, "step": 489 }, { "epoch": 0.30889247862574365, "grad_norm": 0.017279399558901787, "learning_rate": 0.0003488224061107575, "loss": 0.4245, "step": 490 }, { "epoch": 0.3095228714392656, "grad_norm": 0.016696816310286522, "learning_rate": 0.00034850413749204327, "loss": 0.3979, "step": 491 }, { "epoch": 0.3101532642527875, "grad_norm": 0.013617909513413906, "learning_rate": 0.00034818586887332907, "loss": 0.3646, "step": 492 }, { "epoch": 0.31078365706630945, "grad_norm": 0.014281963929533958, "learning_rate": 0.0003478676002546149, "loss": 0.3359, "step": 493 }, { "epoch": 0.31141404987983135, "grad_norm": 0.027315836399793625, "learning_rate": 0.0003475493316359007, "loss": 0.3674, "step": 494 }, { "epoch": 0.3120444426933533, "grad_norm": 0.01335853710770607, "learning_rate": 0.0003472310630171865, "loss": 0.3479, "step": 495 }, { "epoch": 0.31267483550687525, "grad_norm": 0.01632700301706791, "learning_rate": 0.00034691279439847235, "loss": 0.3724, "step": 496 }, { "epoch": 0.31330522832039714, "grad_norm": 0.012534298934042454, "learning_rate": 0.00034659452577975815, "loss": 0.3325, "step": 497 }, { "epoch": 0.3139356211339191, "grad_norm": 0.01293514296412468, "learning_rate": 0.00034627625716104394, "loss": 0.3353, "step": 498 }, { "epoch": 0.314566013947441, "grad_norm": 0.017714234068989754, "learning_rate": 0.00034595798854232974, "loss": 0.3835, "step": 499 }, { "epoch": 0.31519640676096294, "grad_norm": 0.016404278576374054, "learning_rate": 0.00034563971992361553, "loss": 0.4057, "step": 500 }, { "epoch": 0.31582679957448484, "grad_norm": 0.015461519360542297, "learning_rate": 0.0003453214513049013, "loss": 0.3715, "step": 501 }, { "epoch": 0.3164571923880068, "grad_norm": 0.011208601295948029, "learning_rate": 0.0003450031826861871, "loss": 0.299, "step": 502 }, { "epoch": 0.3170875852015287, "grad_norm": 0.016973957419395447, "learning_rate": 0.0003446849140674729, "loss": 0.3682, "step": 503 }, { "epoch": 0.31771797801505064, "grad_norm": 0.01692604087293148, "learning_rate": 0.00034436664544875876, "loss": 0.3605, "step": 504 }, { "epoch": 0.31834837082857254, "grad_norm": 0.017638131976127625, "learning_rate": 0.0003440483768300446, "loss": 0.4531, "step": 505 }, { "epoch": 0.3189787636420945, "grad_norm": 0.015443012118339539, "learning_rate": 0.0003437301082113304, "loss": 0.4052, "step": 506 }, { "epoch": 0.3196091564556164, "grad_norm": 0.014743168838322163, "learning_rate": 0.0003434118395926162, "loss": 0.371, "step": 507 }, { "epoch": 0.32023954926913833, "grad_norm": 0.015408329665660858, "learning_rate": 0.000343093570973902, "loss": 0.4008, "step": 508 }, { "epoch": 0.32086994208266023, "grad_norm": 0.017668986693024635, "learning_rate": 0.0003427753023551878, "loss": 0.4523, "step": 509 }, { "epoch": 0.3215003348961822, "grad_norm": 0.013522491790354252, "learning_rate": 0.0003424570337364736, "loss": 0.3349, "step": 510 }, { "epoch": 0.32213072770970413, "grad_norm": 0.018539030104875565, "learning_rate": 0.0003421387651177594, "loss": 0.3847, "step": 511 }, { "epoch": 0.32276112052322603, "grad_norm": 0.015222381800413132, "learning_rate": 0.00034182049649904517, "loss": 0.3655, "step": 512 }, { "epoch": 0.323391513336748, "grad_norm": 0.014356279745697975, "learning_rate": 0.000341502227880331, "loss": 0.3635, "step": 513 }, { "epoch": 0.3240219061502699, "grad_norm": 0.011098071932792664, "learning_rate": 0.0003411839592616168, "loss": 0.3449, "step": 514 }, { "epoch": 0.32465229896379183, "grad_norm": 0.01377238892018795, "learning_rate": 0.0003408656906429026, "loss": 0.3605, "step": 515 }, { "epoch": 0.3252826917773137, "grad_norm": 0.016116170212626457, "learning_rate": 0.0003405474220241884, "loss": 0.4066, "step": 516 }, { "epoch": 0.3259130845908357, "grad_norm": 0.0179687961935997, "learning_rate": 0.00034022915340547425, "loss": 0.4363, "step": 517 }, { "epoch": 0.3265434774043576, "grad_norm": 0.016104156151413918, "learning_rate": 0.00033991088478676004, "loss": 0.3955, "step": 518 }, { "epoch": 0.3271738702178795, "grad_norm": 0.013766656629741192, "learning_rate": 0.00033959261616804584, "loss": 0.3187, "step": 519 }, { "epoch": 0.3278042630314014, "grad_norm": 0.013646628707647324, "learning_rate": 0.00033927434754933163, "loss": 0.329, "step": 520 }, { "epoch": 0.3284346558449234, "grad_norm": 0.014179794117808342, "learning_rate": 0.0003389560789306175, "loss": 0.3786, "step": 521 }, { "epoch": 0.32906504865844527, "grad_norm": 0.013832984492182732, "learning_rate": 0.0003386378103119033, "loss": 0.3517, "step": 522 }, { "epoch": 0.3296954414719672, "grad_norm": 0.015406441874802113, "learning_rate": 0.00033831954169318907, "loss": 0.405, "step": 523 }, { "epoch": 0.3303258342854892, "grad_norm": 0.014820747077465057, "learning_rate": 0.00033800127307447486, "loss": 0.3963, "step": 524 }, { "epoch": 0.33095622709901107, "grad_norm": 0.014587847515940666, "learning_rate": 0.00033768300445576066, "loss": 0.3533, "step": 525 }, { "epoch": 0.331586619912533, "grad_norm": 0.013031021691858768, "learning_rate": 0.00033736473583704645, "loss": 0.3551, "step": 526 }, { "epoch": 0.3322170127260549, "grad_norm": 0.014004986733198166, "learning_rate": 0.00033704646721833224, "loss": 0.343, "step": 527 }, { "epoch": 0.33284740553957687, "grad_norm": 0.014507156796753407, "learning_rate": 0.00033672819859961804, "loss": 0.3449, "step": 528 }, { "epoch": 0.33347779835309876, "grad_norm": 0.015523980371654034, "learning_rate": 0.00033640992998090394, "loss": 0.4295, "step": 529 }, { "epoch": 0.3341081911666207, "grad_norm": 0.01435157936066389, "learning_rate": 0.00033609166136218974, "loss": 0.4317, "step": 530 }, { "epoch": 0.3347385839801426, "grad_norm": 0.014979582279920578, "learning_rate": 0.00033577339274347553, "loss": 0.3827, "step": 531 }, { "epoch": 0.33536897679366456, "grad_norm": 0.015369818545877934, "learning_rate": 0.0003354551241247613, "loss": 0.4105, "step": 532 }, { "epoch": 0.33599936960718646, "grad_norm": 0.018350455909967422, "learning_rate": 0.0003351368555060471, "loss": 0.3988, "step": 533 }, { "epoch": 0.3366297624207084, "grad_norm": 0.018849672749638557, "learning_rate": 0.0003348185868873329, "loss": 0.3667, "step": 534 }, { "epoch": 0.3372601552342303, "grad_norm": 0.014324454590678215, "learning_rate": 0.0003345003182686187, "loss": 0.3488, "step": 535 }, { "epoch": 0.33789054804775226, "grad_norm": 0.014050649479031563, "learning_rate": 0.0003341820496499045, "loss": 0.3935, "step": 536 }, { "epoch": 0.33852094086127416, "grad_norm": 0.014726082794368267, "learning_rate": 0.0003338637810311903, "loss": 0.4173, "step": 537 }, { "epoch": 0.3391513336747961, "grad_norm": 0.013571511022746563, "learning_rate": 0.00033354551241247614, "loss": 0.3542, "step": 538 }, { "epoch": 0.33978172648831806, "grad_norm": 0.014750882983207703, "learning_rate": 0.00033322724379376194, "loss": 0.3867, "step": 539 }, { "epoch": 0.34041211930183995, "grad_norm": 0.013621051795780659, "learning_rate": 0.00033290897517504773, "loss": 0.3372, "step": 540 }, { "epoch": 0.3410425121153619, "grad_norm": 0.01518064271658659, "learning_rate": 0.0003325907065563336, "loss": 0.3522, "step": 541 }, { "epoch": 0.3416729049288838, "grad_norm": 0.015099391341209412, "learning_rate": 0.0003322724379376194, "loss": 0.3944, "step": 542 }, { "epoch": 0.34230329774240575, "grad_norm": 0.014835887588560581, "learning_rate": 0.00033195416931890517, "loss": 0.376, "step": 543 }, { "epoch": 0.34293369055592765, "grad_norm": 0.015946384519338608, "learning_rate": 0.00033163590070019096, "loss": 0.4174, "step": 544 }, { "epoch": 0.3435640833694496, "grad_norm": 0.019949868321418762, "learning_rate": 0.00033131763208147676, "loss": 0.3957, "step": 545 }, { "epoch": 0.3441944761829715, "grad_norm": 0.019879557192325592, "learning_rate": 0.0003309993634627626, "loss": 0.337, "step": 546 }, { "epoch": 0.34482486899649345, "grad_norm": 0.013396283611655235, "learning_rate": 0.0003306810948440484, "loss": 0.3278, "step": 547 }, { "epoch": 0.34545526181001535, "grad_norm": 0.01450798287987709, "learning_rate": 0.0003303628262253342, "loss": 0.3436, "step": 548 }, { "epoch": 0.3460856546235373, "grad_norm": 0.02375684306025505, "learning_rate": 0.00033004455760662, "loss": 0.338, "step": 549 }, { "epoch": 0.3467160474370592, "grad_norm": 0.014005805365741253, "learning_rate": 0.0003297262889879058, "loss": 0.3729, "step": 550 }, { "epoch": 0.34734644025058115, "grad_norm": 0.015596228651702404, "learning_rate": 0.0003294080203691916, "loss": 0.3328, "step": 551 }, { "epoch": 0.34797683306410304, "grad_norm": 0.015341611579060555, "learning_rate": 0.00032908975175047737, "loss": 0.3917, "step": 552 }, { "epoch": 0.348607225877625, "grad_norm": 0.016774989664554596, "learning_rate": 0.0003287714831317632, "loss": 0.324, "step": 553 }, { "epoch": 0.34923761869114694, "grad_norm": 0.013204341754317284, "learning_rate": 0.00032845321451304907, "loss": 0.3604, "step": 554 }, { "epoch": 0.34986801150466884, "grad_norm": 0.015229560434818268, "learning_rate": 0.00032813494589433486, "loss": 0.3471, "step": 555 }, { "epoch": 0.3504984043181908, "grad_norm": 0.012395329773426056, "learning_rate": 0.00032781667727562066, "loss": 0.3345, "step": 556 }, { "epoch": 0.3511287971317127, "grad_norm": 0.014304972253739834, "learning_rate": 0.00032749840865690645, "loss": 0.3738, "step": 557 }, { "epoch": 0.35175918994523464, "grad_norm": 0.01417758408933878, "learning_rate": 0.00032718014003819225, "loss": 0.378, "step": 558 }, { "epoch": 0.35238958275875654, "grad_norm": 0.01500962395220995, "learning_rate": 0.00032686187141947804, "loss": 0.4059, "step": 559 }, { "epoch": 0.3530199755722785, "grad_norm": 0.014990303665399551, "learning_rate": 0.00032654360280076383, "loss": 0.3374, "step": 560 }, { "epoch": 0.3536503683858004, "grad_norm": 0.014926685020327568, "learning_rate": 0.00032622533418204963, "loss": 0.3693, "step": 561 }, { "epoch": 0.35428076119932234, "grad_norm": 0.013300604186952114, "learning_rate": 0.0003259070655633354, "loss": 0.3665, "step": 562 }, { "epoch": 0.35491115401284423, "grad_norm": 0.014208394102752209, "learning_rate": 0.00032558879694462127, "loss": 0.3481, "step": 563 }, { "epoch": 0.3555415468263662, "grad_norm": 0.012996066361665726, "learning_rate": 0.00032527052832590707, "loss": 0.3432, "step": 564 }, { "epoch": 0.3561719396398881, "grad_norm": 0.010608876124024391, "learning_rate": 0.0003249522597071929, "loss": 0.2871, "step": 565 }, { "epoch": 0.35680233245341003, "grad_norm": 0.012749803252518177, "learning_rate": 0.0003246339910884787, "loss": 0.3535, "step": 566 }, { "epoch": 0.357432725266932, "grad_norm": 0.012807090766727924, "learning_rate": 0.0003243157224697645, "loss": 0.4162, "step": 567 }, { "epoch": 0.3580631180804539, "grad_norm": 0.014453215524554253, "learning_rate": 0.0003239974538510503, "loss": 0.3957, "step": 568 }, { "epoch": 0.35869351089397583, "grad_norm": 0.017648903653025627, "learning_rate": 0.0003236791852323361, "loss": 0.3974, "step": 569 }, { "epoch": 0.3593239037074977, "grad_norm": 0.015802111476659775, "learning_rate": 0.0003233609166136219, "loss": 0.4198, "step": 570 }, { "epoch": 0.3599542965210197, "grad_norm": 0.01437598280608654, "learning_rate": 0.00032304264799490773, "loss": 0.3524, "step": 571 }, { "epoch": 0.3605846893345416, "grad_norm": 0.015620991587638855, "learning_rate": 0.00032272437937619353, "loss": 0.3971, "step": 572 }, { "epoch": 0.3612150821480635, "grad_norm": 0.013718714006245136, "learning_rate": 0.0003224061107574793, "loss": 0.3503, "step": 573 }, { "epoch": 0.3618454749615854, "grad_norm": 0.01291655283421278, "learning_rate": 0.0003220878421387651, "loss": 0.3382, "step": 574 }, { "epoch": 0.3624758677751074, "grad_norm": 0.012043423019349575, "learning_rate": 0.0003217695735200509, "loss": 0.3271, "step": 575 }, { "epoch": 0.36310626058862927, "grad_norm": 0.015100762248039246, "learning_rate": 0.0003214513049013367, "loss": 0.3289, "step": 576 }, { "epoch": 0.3637366534021512, "grad_norm": 0.01711404323577881, "learning_rate": 0.0003211330362826225, "loss": 0.3836, "step": 577 }, { "epoch": 0.3643670462156731, "grad_norm": 0.013861637562513351, "learning_rate": 0.00032081476766390835, "loss": 0.3266, "step": 578 }, { "epoch": 0.36499743902919507, "grad_norm": 0.014734717085957527, "learning_rate": 0.00032049649904519414, "loss": 0.3996, "step": 579 }, { "epoch": 0.36562783184271697, "grad_norm": 0.01765221729874611, "learning_rate": 0.00032017823042648, "loss": 0.3884, "step": 580 }, { "epoch": 0.3662582246562389, "grad_norm": 0.013283113949000835, "learning_rate": 0.0003198599618077658, "loss": 0.3082, "step": 581 }, { "epoch": 0.36688861746976087, "grad_norm": 0.01363943237811327, "learning_rate": 0.0003195416931890516, "loss": 0.3788, "step": 582 }, { "epoch": 0.36751901028328277, "grad_norm": 0.015286918729543686, "learning_rate": 0.00031922342457033737, "loss": 0.3673, "step": 583 }, { "epoch": 0.3681494030968047, "grad_norm": 0.020678527653217316, "learning_rate": 0.00031890515595162317, "loss": 0.352, "step": 584 }, { "epoch": 0.3687797959103266, "grad_norm": 0.014694144949316978, "learning_rate": 0.00031858688733290896, "loss": 0.401, "step": 585 }, { "epoch": 0.36941018872384856, "grad_norm": 0.013185822404921055, "learning_rate": 0.00031826861871419476, "loss": 0.4125, "step": 586 }, { "epoch": 0.37004058153737046, "grad_norm": 0.013719198293983936, "learning_rate": 0.00031795035009548055, "loss": 0.3597, "step": 587 }, { "epoch": 0.3706709743508924, "grad_norm": 0.016647811979055405, "learning_rate": 0.0003176320814767664, "loss": 0.3668, "step": 588 }, { "epoch": 0.3713013671644143, "grad_norm": 0.012551740743219852, "learning_rate": 0.00031731381285805225, "loss": 0.3306, "step": 589 }, { "epoch": 0.37193175997793626, "grad_norm": 0.017177043482661247, "learning_rate": 0.00031699554423933804, "loss": 0.4069, "step": 590 }, { "epoch": 0.37256215279145816, "grad_norm": 0.012840714305639267, "learning_rate": 0.00031667727562062384, "loss": 0.3467, "step": 591 }, { "epoch": 0.3731925456049801, "grad_norm": 0.013221084140241146, "learning_rate": 0.00031635900700190963, "loss": 0.3191, "step": 592 }, { "epoch": 0.373822938418502, "grad_norm": 0.015163405798375607, "learning_rate": 0.0003160407383831954, "loss": 0.3459, "step": 593 }, { "epoch": 0.37445333123202396, "grad_norm": 0.01614232361316681, "learning_rate": 0.0003157224697644812, "loss": 0.3563, "step": 594 }, { "epoch": 0.3750837240455459, "grad_norm": 0.013262378983199596, "learning_rate": 0.000315404201145767, "loss": 0.3183, "step": 595 }, { "epoch": 0.3757141168590678, "grad_norm": 0.016392970457673073, "learning_rate": 0.00031508593252705286, "loss": 0.4179, "step": 596 }, { "epoch": 0.37634450967258976, "grad_norm": 0.019099978730082512, "learning_rate": 0.00031476766390833865, "loss": 0.3802, "step": 597 }, { "epoch": 0.37697490248611165, "grad_norm": 0.015689987689256668, "learning_rate": 0.00031444939528962445, "loss": 0.3868, "step": 598 }, { "epoch": 0.3776052952996336, "grad_norm": 0.017538568004965782, "learning_rate": 0.00031413112667091024, "loss": 0.3877, "step": 599 }, { "epoch": 0.3782356881131555, "grad_norm": 0.015454876236617565, "learning_rate": 0.00031381285805219604, "loss": 0.388, "step": 600 }, { "epoch": 0.3782356881131555, "eval_loss": 0.3927299678325653, "eval_runtime": 327.8319, "eval_samples_per_second": 3.05, "eval_steps_per_second": 3.05, "step": 600 }, { "epoch": 0.37886608092667745, "grad_norm": 0.01557796262204647, "learning_rate": 0.00031349458943348183, "loss": 0.3728, "step": 601 }, { "epoch": 0.37949647374019935, "grad_norm": 0.01644926518201828, "learning_rate": 0.0003131763208147677, "loss": 0.4082, "step": 602 }, { "epoch": 0.3801268665537213, "grad_norm": 0.013426114805042744, "learning_rate": 0.0003128580521960535, "loss": 0.3809, "step": 603 }, { "epoch": 0.3807572593672432, "grad_norm": 0.01580638252198696, "learning_rate": 0.00031253978357733927, "loss": 0.4184, "step": 604 }, { "epoch": 0.38138765218076515, "grad_norm": 0.014225323684513569, "learning_rate": 0.0003122215149586251, "loss": 0.3503, "step": 605 }, { "epoch": 0.38201804499428704, "grad_norm": 0.016323791816830635, "learning_rate": 0.0003119032463399109, "loss": 0.3959, "step": 606 }, { "epoch": 0.382648437807809, "grad_norm": 0.014072321355342865, "learning_rate": 0.0003115849777211967, "loss": 0.3425, "step": 607 }, { "epoch": 0.3832788306213309, "grad_norm": 0.016040025278925896, "learning_rate": 0.0003112667091024825, "loss": 0.3173, "step": 608 }, { "epoch": 0.38390922343485284, "grad_norm": 0.015729017555713654, "learning_rate": 0.0003109484404837683, "loss": 0.3421, "step": 609 }, { "epoch": 0.3845396162483748, "grad_norm": 0.013647118583321571, "learning_rate": 0.0003106301718650541, "loss": 0.3704, "step": 610 }, { "epoch": 0.3851700090618967, "grad_norm": 0.013838117942214012, "learning_rate": 0.0003103119032463399, "loss": 0.3574, "step": 611 }, { "epoch": 0.38580040187541864, "grad_norm": 0.019317131489515305, "learning_rate": 0.0003099936346276257, "loss": 0.4552, "step": 612 }, { "epoch": 0.38643079468894054, "grad_norm": 0.044802162796258926, "learning_rate": 0.0003096753660089115, "loss": 0.3911, "step": 613 }, { "epoch": 0.3870611875024625, "grad_norm": 0.013460073620080948, "learning_rate": 0.0003093570973901974, "loss": 0.3773, "step": 614 }, { "epoch": 0.3876915803159844, "grad_norm": 0.013702953234314919, "learning_rate": 0.00030903882877148317, "loss": 0.302, "step": 615 }, { "epoch": 0.38832197312950634, "grad_norm": 0.016412384808063507, "learning_rate": 0.00030872056015276896, "loss": 0.4145, "step": 616 }, { "epoch": 0.38895236594302823, "grad_norm": 0.017136793583631516, "learning_rate": 0.00030840229153405476, "loss": 0.3799, "step": 617 }, { "epoch": 0.3895827587565502, "grad_norm": 0.013987540267407894, "learning_rate": 0.00030808402291534055, "loss": 0.3227, "step": 618 }, { "epoch": 0.3902131515700721, "grad_norm": 0.013590000569820404, "learning_rate": 0.00030776575429662634, "loss": 0.3473, "step": 619 }, { "epoch": 0.39084354438359403, "grad_norm": 0.015469753183424473, "learning_rate": 0.00030744748567791214, "loss": 0.3918, "step": 620 }, { "epoch": 0.39147393719711593, "grad_norm": 0.015418731607496738, "learning_rate": 0.000307129217059198, "loss": 0.4002, "step": 621 }, { "epoch": 0.3921043300106379, "grad_norm": 0.012901484034955502, "learning_rate": 0.0003068109484404838, "loss": 0.3438, "step": 622 }, { "epoch": 0.39273472282415983, "grad_norm": 0.017325155436992645, "learning_rate": 0.0003064926798217696, "loss": 0.3867, "step": 623 }, { "epoch": 0.39336511563768173, "grad_norm": 0.015628132969141006, "learning_rate": 0.00030617441120305537, "loss": 0.4046, "step": 624 }, { "epoch": 0.3939955084512037, "grad_norm": 0.01464697252959013, "learning_rate": 0.00030585614258434116, "loss": 0.3328, "step": 625 }, { "epoch": 0.3946259012647256, "grad_norm": 0.014577261172235012, "learning_rate": 0.000305537873965627, "loss": 0.3693, "step": 626 }, { "epoch": 0.3952562940782475, "grad_norm": 0.01646304875612259, "learning_rate": 0.0003052196053469128, "loss": 0.3684, "step": 627 }, { "epoch": 0.3958866868917694, "grad_norm": 0.012631191872060299, "learning_rate": 0.0003049013367281986, "loss": 0.3352, "step": 628 }, { "epoch": 0.3965170797052914, "grad_norm": 0.02333548478782177, "learning_rate": 0.0003045830681094844, "loss": 0.4287, "step": 629 }, { "epoch": 0.39714747251881327, "grad_norm": 0.01522962935268879, "learning_rate": 0.00030426479949077024, "loss": 0.384, "step": 630 }, { "epoch": 0.3977778653323352, "grad_norm": 0.014716900885105133, "learning_rate": 0.00030394653087205604, "loss": 0.3859, "step": 631 }, { "epoch": 0.3984082581458571, "grad_norm": 0.015523294918239117, "learning_rate": 0.00030362826225334183, "loss": 0.3385, "step": 632 }, { "epoch": 0.39903865095937907, "grad_norm": 0.015310057438910007, "learning_rate": 0.0003033099936346276, "loss": 0.395, "step": 633 }, { "epoch": 0.39966904377290097, "grad_norm": 0.014677232131361961, "learning_rate": 0.0003029917250159134, "loss": 0.3531, "step": 634 }, { "epoch": 0.4002994365864229, "grad_norm": 0.017107607796788216, "learning_rate": 0.0003026734563971992, "loss": 0.4329, "step": 635 }, { "epoch": 0.4009298293999448, "grad_norm": 0.018281778320670128, "learning_rate": 0.000302355187778485, "loss": 0.3648, "step": 636 }, { "epoch": 0.40156022221346677, "grad_norm": 0.01159526128321886, "learning_rate": 0.0003020369191597708, "loss": 0.3222, "step": 637 }, { "epoch": 0.4021906150269887, "grad_norm": 0.01421684492379427, "learning_rate": 0.0003017186505410567, "loss": 0.4209, "step": 638 }, { "epoch": 0.4028210078405106, "grad_norm": 0.015032793395221233, "learning_rate": 0.0003014003819223425, "loss": 0.3328, "step": 639 }, { "epoch": 0.40345140065403257, "grad_norm": 0.015594332478940487, "learning_rate": 0.0003010821133036283, "loss": 0.3477, "step": 640 }, { "epoch": 0.40408179346755446, "grad_norm": 0.013894080184400082, "learning_rate": 0.0003007638446849141, "loss": 0.3971, "step": 641 }, { "epoch": 0.4047121862810764, "grad_norm": 0.0161477942019701, "learning_rate": 0.0003004455760661999, "loss": 0.3482, "step": 642 }, { "epoch": 0.4053425790945983, "grad_norm": 0.015184820629656315, "learning_rate": 0.0003001273074474857, "loss": 0.3609, "step": 643 }, { "epoch": 0.40597297190812026, "grad_norm": 0.013787025585770607, "learning_rate": 0.00029980903882877147, "loss": 0.3692, "step": 644 }, { "epoch": 0.40660336472164216, "grad_norm": 0.016446765512228012, "learning_rate": 0.00029949077021005727, "loss": 0.387, "step": 645 }, { "epoch": 0.4072337575351641, "grad_norm": 0.015117369592189789, "learning_rate": 0.0002991725015913431, "loss": 0.3619, "step": 646 }, { "epoch": 0.407864150348686, "grad_norm": 0.014236542396247387, "learning_rate": 0.0002988542329726289, "loss": 0.3657, "step": 647 }, { "epoch": 0.40849454316220796, "grad_norm": 0.014021654613316059, "learning_rate": 0.0002985359643539147, "loss": 0.3358, "step": 648 }, { "epoch": 0.40912493597572985, "grad_norm": 0.01917107217013836, "learning_rate": 0.0002982176957352005, "loss": 0.3974, "step": 649 }, { "epoch": 0.4097553287892518, "grad_norm": 0.013322819024324417, "learning_rate": 0.00029789942711648635, "loss": 0.3163, "step": 650 }, { "epoch": 0.4103857216027737, "grad_norm": 0.01445494033396244, "learning_rate": 0.00029758115849777214, "loss": 0.3683, "step": 651 }, { "epoch": 0.41101611441629565, "grad_norm": 0.015121490694582462, "learning_rate": 0.00029726288987905793, "loss": 0.3525, "step": 652 }, { "epoch": 0.4116465072298176, "grad_norm": 0.013342364691197872, "learning_rate": 0.00029694462126034373, "loss": 0.3419, "step": 653 }, { "epoch": 0.4122769000433395, "grad_norm": 0.0156511552631855, "learning_rate": 0.0002966263526416295, "loss": 0.3705, "step": 654 }, { "epoch": 0.41290729285686145, "grad_norm": 0.019641265273094177, "learning_rate": 0.00029630808402291537, "loss": 0.4203, "step": 655 }, { "epoch": 0.41353768567038335, "grad_norm": 0.012090439908206463, "learning_rate": 0.00029598981540420117, "loss": 0.3155, "step": 656 }, { "epoch": 0.4141680784839053, "grad_norm": 0.014474401250481606, "learning_rate": 0.00029567154678548696, "loss": 0.3479, "step": 657 }, { "epoch": 0.4147984712974272, "grad_norm": 0.013321110047399998, "learning_rate": 0.00029535327816677275, "loss": 0.3641, "step": 658 }, { "epoch": 0.41542886411094915, "grad_norm": 0.014568335376679897, "learning_rate": 0.00029503500954805855, "loss": 0.3708, "step": 659 }, { "epoch": 0.41605925692447104, "grad_norm": 0.014420869760215282, "learning_rate": 0.00029471674092934434, "loss": 0.4256, "step": 660 }, { "epoch": 0.416689649737993, "grad_norm": 0.013579221442341805, "learning_rate": 0.00029439847231063014, "loss": 0.3881, "step": 661 }, { "epoch": 0.4173200425515149, "grad_norm": 0.015690671280026436, "learning_rate": 0.000294080203691916, "loss": 0.3892, "step": 662 }, { "epoch": 0.41795043536503684, "grad_norm": 0.015400114469230175, "learning_rate": 0.00029376193507320183, "loss": 0.3573, "step": 663 }, { "epoch": 0.41858082817855874, "grad_norm": 0.01533882599323988, "learning_rate": 0.00029344366645448763, "loss": 0.3647, "step": 664 }, { "epoch": 0.4192112209920807, "grad_norm": 0.01558590866625309, "learning_rate": 0.0002931253978357734, "loss": 0.4162, "step": 665 }, { "epoch": 0.41984161380560264, "grad_norm": 0.012559310533106327, "learning_rate": 0.0002928071292170592, "loss": 0.3319, "step": 666 }, { "epoch": 0.42047200661912454, "grad_norm": 0.0139588862657547, "learning_rate": 0.000292488860598345, "loss": 0.3309, "step": 667 }, { "epoch": 0.4211023994326465, "grad_norm": 0.014924910850822926, "learning_rate": 0.0002921705919796308, "loss": 0.4006, "step": 668 }, { "epoch": 0.4217327922461684, "grad_norm": 0.014100976288318634, "learning_rate": 0.0002918523233609166, "loss": 0.3323, "step": 669 }, { "epoch": 0.42236318505969034, "grad_norm": 0.019475214183330536, "learning_rate": 0.0002915340547422024, "loss": 0.3216, "step": 670 }, { "epoch": 0.42299357787321223, "grad_norm": 0.014488855376839638, "learning_rate": 0.00029121578612348824, "loss": 0.3311, "step": 671 }, { "epoch": 0.4236239706867342, "grad_norm": 0.013324184343218803, "learning_rate": 0.00029089751750477404, "loss": 0.3265, "step": 672 }, { "epoch": 0.4242543635002561, "grad_norm": 0.01362680271267891, "learning_rate": 0.00029057924888605983, "loss": 0.3688, "step": 673 }, { "epoch": 0.42488475631377803, "grad_norm": 0.018359068781137466, "learning_rate": 0.0002902609802673457, "loss": 0.4598, "step": 674 }, { "epoch": 0.42551514912729993, "grad_norm": 0.012123852036893368, "learning_rate": 0.00028994271164863147, "loss": 0.3214, "step": 675 }, { "epoch": 0.4261455419408219, "grad_norm": 0.013810473494231701, "learning_rate": 0.00028962444302991727, "loss": 0.3516, "step": 676 }, { "epoch": 0.4267759347543438, "grad_norm": 0.01565823145210743, "learning_rate": 0.00028930617441120306, "loss": 0.3779, "step": 677 }, { "epoch": 0.42740632756786573, "grad_norm": 0.01629094034433365, "learning_rate": 0.00028898790579248886, "loss": 0.4413, "step": 678 }, { "epoch": 0.4280367203813876, "grad_norm": 0.014053650200366974, "learning_rate": 0.00028866963717377465, "loss": 0.339, "step": 679 }, { "epoch": 0.4286671131949096, "grad_norm": 0.01334140170365572, "learning_rate": 0.0002883513685550605, "loss": 0.317, "step": 680 }, { "epoch": 0.42929750600843153, "grad_norm": 0.016112016513943672, "learning_rate": 0.0002880330999363463, "loss": 0.3682, "step": 681 }, { "epoch": 0.4299278988219534, "grad_norm": 0.01528241578489542, "learning_rate": 0.0002877148313176321, "loss": 0.3867, "step": 682 }, { "epoch": 0.4305582916354754, "grad_norm": 0.015661900863051414, "learning_rate": 0.0002873965626989179, "loss": 0.3531, "step": 683 }, { "epoch": 0.4311886844489973, "grad_norm": 0.013817746192216873, "learning_rate": 0.0002870782940802037, "loss": 0.4012, "step": 684 }, { "epoch": 0.4318190772625192, "grad_norm": 0.013254545629024506, "learning_rate": 0.00028676002546148947, "loss": 0.3604, "step": 685 }, { "epoch": 0.4324494700760411, "grad_norm": 0.01405141782015562, "learning_rate": 0.0002864417568427753, "loss": 0.4082, "step": 686 }, { "epoch": 0.43307986288956307, "grad_norm": 0.014481520280241966, "learning_rate": 0.0002861234882240611, "loss": 0.3697, "step": 687 }, { "epoch": 0.43371025570308497, "grad_norm": 0.014411790296435356, "learning_rate": 0.00028580521960534696, "loss": 0.3477, "step": 688 }, { "epoch": 0.4343406485166069, "grad_norm": 0.016432663425803185, "learning_rate": 0.00028548695098663275, "loss": 0.3726, "step": 689 }, { "epoch": 0.4349710413301288, "grad_norm": 0.017969200387597084, "learning_rate": 0.00028516868236791855, "loss": 0.3979, "step": 690 }, { "epoch": 0.43560143414365077, "grad_norm": 0.012793822214007378, "learning_rate": 0.00028485041374920434, "loss": 0.3485, "step": 691 }, { "epoch": 0.43623182695717266, "grad_norm": 0.013791053555905819, "learning_rate": 0.00028453214513049014, "loss": 0.3536, "step": 692 }, { "epoch": 0.4368622197706946, "grad_norm": 0.015151995234191418, "learning_rate": 0.00028421387651177593, "loss": 0.3888, "step": 693 }, { "epoch": 0.43749261258421657, "grad_norm": 0.013432036153972149, "learning_rate": 0.0002838956078930617, "loss": 0.3218, "step": 694 }, { "epoch": 0.43812300539773846, "grad_norm": 0.012717016041278839, "learning_rate": 0.0002835773392743475, "loss": 0.347, "step": 695 }, { "epoch": 0.4387533982112604, "grad_norm": 0.015013251453638077, "learning_rate": 0.0002832590706556333, "loss": 0.3597, "step": 696 }, { "epoch": 0.4393837910247823, "grad_norm": 0.017113640904426575, "learning_rate": 0.00028294080203691916, "loss": 0.4167, "step": 697 }, { "epoch": 0.44001418383830426, "grad_norm": 0.01724739372730255, "learning_rate": 0.000282622533418205, "loss": 0.3806, "step": 698 }, { "epoch": 0.44064457665182616, "grad_norm": 0.014721522107720375, "learning_rate": 0.0002823042647994908, "loss": 0.3063, "step": 699 }, { "epoch": 0.4412749694653481, "grad_norm": 0.014770137146115303, "learning_rate": 0.0002819859961807766, "loss": 0.3884, "step": 700 }, { "epoch": 0.44190536227887, "grad_norm": 0.015977727249264717, "learning_rate": 0.0002816677275620624, "loss": 0.3952, "step": 701 }, { "epoch": 0.44253575509239196, "grad_norm": 0.012070649303495884, "learning_rate": 0.0002813494589433482, "loss": 0.3579, "step": 702 }, { "epoch": 0.44316614790591385, "grad_norm": 0.012140912935137749, "learning_rate": 0.000281031190324634, "loss": 0.3033, "step": 703 }, { "epoch": 0.4437965407194358, "grad_norm": 0.01301062572747469, "learning_rate": 0.0002807129217059198, "loss": 0.3623, "step": 704 }, { "epoch": 0.4444269335329577, "grad_norm": 0.017587412148714066, "learning_rate": 0.0002803946530872056, "loss": 0.3408, "step": 705 }, { "epoch": 0.44505732634647965, "grad_norm": 0.15438830852508545, "learning_rate": 0.0002800763844684914, "loss": 0.3863, "step": 706 }, { "epoch": 0.44568771916000155, "grad_norm": 0.012901189737021923, "learning_rate": 0.0002797581158497772, "loss": 0.3583, "step": 707 }, { "epoch": 0.4463181119735235, "grad_norm": 0.012790758162736893, "learning_rate": 0.000279439847231063, "loss": 0.3343, "step": 708 }, { "epoch": 0.44694850478704545, "grad_norm": 0.014779281802475452, "learning_rate": 0.0002791215786123488, "loss": 0.3395, "step": 709 }, { "epoch": 0.44757889760056735, "grad_norm": 0.022882359102368355, "learning_rate": 0.00027880330999363465, "loss": 0.4358, "step": 710 }, { "epoch": 0.4482092904140893, "grad_norm": 0.015499277971684933, "learning_rate": 0.00027848504137492044, "loss": 0.3933, "step": 711 }, { "epoch": 0.4488396832276112, "grad_norm": 0.0174066424369812, "learning_rate": 0.00027816677275620624, "loss": 0.3721, "step": 712 }, { "epoch": 0.44947007604113315, "grad_norm": 0.01338980719447136, "learning_rate": 0.0002778485041374921, "loss": 0.3176, "step": 713 }, { "epoch": 0.45010046885465504, "grad_norm": 0.016566332429647446, "learning_rate": 0.0002775302355187779, "loss": 0.3704, "step": 714 }, { "epoch": 0.450730861668177, "grad_norm": 0.01236806996166706, "learning_rate": 0.0002772119669000637, "loss": 0.3253, "step": 715 }, { "epoch": 0.4513612544816989, "grad_norm": 0.015173107385635376, "learning_rate": 0.00027689369828134947, "loss": 0.3736, "step": 716 }, { "epoch": 0.45199164729522084, "grad_norm": 0.014302750118076801, "learning_rate": 0.00027657542966263526, "loss": 0.3122, "step": 717 }, { "epoch": 0.45262204010874274, "grad_norm": 0.01715126633644104, "learning_rate": 0.00027625716104392106, "loss": 0.416, "step": 718 }, { "epoch": 0.4532524329222647, "grad_norm": 0.016660649329423904, "learning_rate": 0.00027593889242520685, "loss": 0.3872, "step": 719 }, { "epoch": 0.4538828257357866, "grad_norm": 0.01652917079627514, "learning_rate": 0.00027562062380649265, "loss": 0.3406, "step": 720 }, { "epoch": 0.45451321854930854, "grad_norm": 0.013926257379353046, "learning_rate": 0.00027530235518777844, "loss": 0.3296, "step": 721 }, { "epoch": 0.4551436113628305, "grad_norm": 0.014681665226817131, "learning_rate": 0.00027498408656906434, "loss": 0.3842, "step": 722 }, { "epoch": 0.4557740041763524, "grad_norm": 0.01700069196522236, "learning_rate": 0.00027466581795035014, "loss": 0.3955, "step": 723 }, { "epoch": 0.45640439698987434, "grad_norm": 0.01633227802813053, "learning_rate": 0.00027434754933163593, "loss": 0.3976, "step": 724 }, { "epoch": 0.45703478980339624, "grad_norm": 0.01662380062043667, "learning_rate": 0.0002740292807129217, "loss": 0.3958, "step": 725 }, { "epoch": 0.4576651826169182, "grad_norm": 0.018056437373161316, "learning_rate": 0.0002737110120942075, "loss": 0.3167, "step": 726 }, { "epoch": 0.4582955754304401, "grad_norm": 0.014621833339333534, "learning_rate": 0.0002733927434754933, "loss": 0.3889, "step": 727 }, { "epoch": 0.45892596824396203, "grad_norm": 0.016618939116597176, "learning_rate": 0.0002730744748567791, "loss": 0.3818, "step": 728 }, { "epoch": 0.45955636105748393, "grad_norm": 0.025172855705022812, "learning_rate": 0.0002727562062380649, "loss": 0.3853, "step": 729 }, { "epoch": 0.4601867538710059, "grad_norm": 0.017150798812508583, "learning_rate": 0.00027243793761935075, "loss": 0.3513, "step": 730 }, { "epoch": 0.4608171466845278, "grad_norm": 0.017840418964624405, "learning_rate": 0.00027211966900063655, "loss": 0.3704, "step": 731 }, { "epoch": 0.46144753949804973, "grad_norm": 0.05274835228919983, "learning_rate": 0.00027180140038192234, "loss": 0.3469, "step": 732 }, { "epoch": 0.4620779323115716, "grad_norm": 0.014218789525330067, "learning_rate": 0.00027148313176320814, "loss": 0.3359, "step": 733 }, { "epoch": 0.4627083251250936, "grad_norm": 0.020126720890402794, "learning_rate": 0.00027116486314449393, "loss": 0.3907, "step": 734 }, { "epoch": 0.4633387179386155, "grad_norm": 0.016716623678803444, "learning_rate": 0.0002708465945257798, "loss": 0.3633, "step": 735 }, { "epoch": 0.4639691107521374, "grad_norm": 0.014319379813969135, "learning_rate": 0.00027052832590706557, "loss": 0.3325, "step": 736 }, { "epoch": 0.4645995035656594, "grad_norm": 0.016253383830189705, "learning_rate": 0.00027021005728835137, "loss": 0.3386, "step": 737 }, { "epoch": 0.4652298963791813, "grad_norm": 0.01554201915860176, "learning_rate": 0.0002698917886696372, "loss": 0.3772, "step": 738 }, { "epoch": 0.4658602891927032, "grad_norm": 0.015467418357729912, "learning_rate": 0.000269573520050923, "loss": 0.3601, "step": 739 }, { "epoch": 0.4664906820062251, "grad_norm": 0.018481513485312462, "learning_rate": 0.0002692552514322088, "loss": 0.4007, "step": 740 }, { "epoch": 0.4671210748197471, "grad_norm": 0.014622112736105919, "learning_rate": 0.0002689369828134946, "loss": 0.3725, "step": 741 }, { "epoch": 0.46775146763326897, "grad_norm": 0.015474259853363037, "learning_rate": 0.0002686187141947804, "loss": 0.4009, "step": 742 }, { "epoch": 0.4683818604467909, "grad_norm": 0.014881080947816372, "learning_rate": 0.0002683004455760662, "loss": 0.3586, "step": 743 }, { "epoch": 0.4690122532603128, "grad_norm": 0.01284766010940075, "learning_rate": 0.000267982176957352, "loss": 0.3218, "step": 744 }, { "epoch": 0.46964264607383477, "grad_norm": 0.013011449947953224, "learning_rate": 0.0002676639083386378, "loss": 0.2843, "step": 745 }, { "epoch": 0.47027303888735666, "grad_norm": 0.014213647693395615, "learning_rate": 0.00026734563971992357, "loss": 0.335, "step": 746 }, { "epoch": 0.4709034317008786, "grad_norm": 0.013598071411252022, "learning_rate": 0.00026702737110120947, "loss": 0.3543, "step": 747 }, { "epoch": 0.4715338245144005, "grad_norm": 0.014245964586734772, "learning_rate": 0.00026670910248249527, "loss": 0.405, "step": 748 }, { "epoch": 0.47216421732792246, "grad_norm": 0.014708565548062325, "learning_rate": 0.00026639083386378106, "loss": 0.3696, "step": 749 }, { "epoch": 0.47279461014144436, "grad_norm": 0.017546402290463448, "learning_rate": 0.00026607256524506685, "loss": 0.4109, "step": 750 }, { "epoch": 0.4734250029549663, "grad_norm": 0.016081346198916435, "learning_rate": 0.00026575429662635265, "loss": 0.3402, "step": 751 }, { "epoch": 0.47405539576848826, "grad_norm": 0.015674389898777008, "learning_rate": 0.00026543602800763844, "loss": 0.335, "step": 752 }, { "epoch": 0.47468578858201016, "grad_norm": 0.014885363169014454, "learning_rate": 0.00026511775938892424, "loss": 0.3441, "step": 753 }, { "epoch": 0.4753161813955321, "grad_norm": 0.014220920391380787, "learning_rate": 0.00026479949077021003, "loss": 0.3637, "step": 754 }, { "epoch": 0.475946574209054, "grad_norm": 0.013256360776722431, "learning_rate": 0.0002644812221514959, "loss": 0.2881, "step": 755 }, { "epoch": 0.47657696702257596, "grad_norm": 0.01440057810395956, "learning_rate": 0.0002641629535327817, "loss": 0.317, "step": 756 }, { "epoch": 0.47720735983609786, "grad_norm": 0.015414683148264885, "learning_rate": 0.00026384468491406747, "loss": 0.3332, "step": 757 }, { "epoch": 0.4778377526496198, "grad_norm": 0.014645903371274471, "learning_rate": 0.00026352641629535326, "loss": 0.3697, "step": 758 }, { "epoch": 0.4784681454631417, "grad_norm": 0.014017629437148571, "learning_rate": 0.0002632081476766391, "loss": 0.347, "step": 759 }, { "epoch": 0.47909853827666365, "grad_norm": 0.016517357900738716, "learning_rate": 0.0002628898790579249, "loss": 0.4075, "step": 760 }, { "epoch": 0.47972893109018555, "grad_norm": 0.017165319994091988, "learning_rate": 0.0002625716104392107, "loss": 0.393, "step": 761 }, { "epoch": 0.4803593239037075, "grad_norm": 0.01595127396285534, "learning_rate": 0.0002622533418204965, "loss": 0.34, "step": 762 }, { "epoch": 0.4809897167172294, "grad_norm": 0.013706330209970474, "learning_rate": 0.00026193507320178234, "loss": 0.3078, "step": 763 }, { "epoch": 0.48162010953075135, "grad_norm": 0.021000154316425323, "learning_rate": 0.00026161680458306814, "loss": 0.4465, "step": 764 }, { "epoch": 0.4822505023442733, "grad_norm": 0.016352282837033272, "learning_rate": 0.00026129853596435393, "loss": 0.3341, "step": 765 }, { "epoch": 0.4828808951577952, "grad_norm": 0.01306548435240984, "learning_rate": 0.0002609802673456397, "loss": 0.3848, "step": 766 }, { "epoch": 0.48351128797131715, "grad_norm": 0.013271639123558998, "learning_rate": 0.0002606619987269255, "loss": 0.331, "step": 767 }, { "epoch": 0.48414168078483905, "grad_norm": 0.016379786655306816, "learning_rate": 0.0002603437301082113, "loss": 0.3563, "step": 768 }, { "epoch": 0.484772073598361, "grad_norm": 0.014435634016990662, "learning_rate": 0.0002600254614894971, "loss": 0.3599, "step": 769 }, { "epoch": 0.4854024664118829, "grad_norm": 0.01450477447360754, "learning_rate": 0.0002597071928707829, "loss": 0.315, "step": 770 }, { "epoch": 0.48603285922540485, "grad_norm": 0.014938865788280964, "learning_rate": 0.00025938892425206875, "loss": 0.3614, "step": 771 }, { "epoch": 0.48666325203892674, "grad_norm": 0.015669316053390503, "learning_rate": 0.0002590706556333546, "loss": 0.3315, "step": 772 }, { "epoch": 0.4872936448524487, "grad_norm": 0.017165658995509148, "learning_rate": 0.0002587523870146404, "loss": 0.3505, "step": 773 }, { "epoch": 0.4879240376659706, "grad_norm": 0.017099546268582344, "learning_rate": 0.0002584341183959262, "loss": 0.3769, "step": 774 }, { "epoch": 0.48855443047949254, "grad_norm": 0.013970285654067993, "learning_rate": 0.000258115849777212, "loss": 0.385, "step": 775 }, { "epoch": 0.48918482329301444, "grad_norm": 0.01398810651153326, "learning_rate": 0.0002577975811584978, "loss": 0.3793, "step": 776 }, { "epoch": 0.4898152161065364, "grad_norm": 0.01589352637529373, "learning_rate": 0.00025747931253978357, "loss": 0.3145, "step": 777 }, { "epoch": 0.4904456089200583, "grad_norm": 0.017603708431124687, "learning_rate": 0.00025716104392106936, "loss": 0.3729, "step": 778 }, { "epoch": 0.49107600173358024, "grad_norm": 0.016556723043322563, "learning_rate": 0.00025684277530235516, "loss": 0.411, "step": 779 }, { "epoch": 0.4917063945471022, "grad_norm": 0.01712803915143013, "learning_rate": 0.000256524506683641, "loss": 0.3923, "step": 780 }, { "epoch": 0.4923367873606241, "grad_norm": 0.0149735938757658, "learning_rate": 0.0002562062380649268, "loss": 0.4245, "step": 781 }, { "epoch": 0.49296718017414604, "grad_norm": 0.017102012410759926, "learning_rate": 0.0002558879694462126, "loss": 0.3449, "step": 782 }, { "epoch": 0.49359757298766793, "grad_norm": 0.015632549300789833, "learning_rate": 0.00025556970082749844, "loss": 0.3433, "step": 783 }, { "epoch": 0.4942279658011899, "grad_norm": 0.015138093382120132, "learning_rate": 0.00025525143220878424, "loss": 0.3401, "step": 784 }, { "epoch": 0.4948583586147118, "grad_norm": 0.0173388309776783, "learning_rate": 0.00025493316359007003, "loss": 0.3519, "step": 785 }, { "epoch": 0.49548875142823373, "grad_norm": 0.012897705659270287, "learning_rate": 0.0002546148949713558, "loss": 0.3294, "step": 786 }, { "epoch": 0.4961191442417556, "grad_norm": 0.01930922083556652, "learning_rate": 0.0002542966263526416, "loss": 0.3582, "step": 787 }, { "epoch": 0.4967495370552776, "grad_norm": 0.016981162130832672, "learning_rate": 0.00025397835773392747, "loss": 0.3524, "step": 788 }, { "epoch": 0.4973799298687995, "grad_norm": 0.017955496907234192, "learning_rate": 0.00025366008911521326, "loss": 0.3892, "step": 789 }, { "epoch": 0.4980103226823214, "grad_norm": 0.017408637329936028, "learning_rate": 0.00025334182049649906, "loss": 0.386, "step": 790 }, { "epoch": 0.4986407154958433, "grad_norm": 0.018572593107819557, "learning_rate": 0.00025302355187778485, "loss": 0.4308, "step": 791 }, { "epoch": 0.4992711083093653, "grad_norm": 0.015350669622421265, "learning_rate": 0.00025270528325907065, "loss": 0.3302, "step": 792 }, { "epoch": 0.4999015011228872, "grad_norm": 0.01609470136463642, "learning_rate": 0.00025238701464035644, "loss": 0.3209, "step": 793 }, { "epoch": 0.5005318939364092, "grad_norm": 0.014169929549098015, "learning_rate": 0.00025206874602164223, "loss": 0.3734, "step": 794 }, { "epoch": 0.501162286749931, "grad_norm": 0.012853971682488918, "learning_rate": 0.0002517504774029281, "loss": 0.3627, "step": 795 }, { "epoch": 0.501792679563453, "grad_norm": 0.016874967142939568, "learning_rate": 0.0002514322087842139, "loss": 0.3676, "step": 796 }, { "epoch": 0.5024230723769749, "grad_norm": 0.016694629564881325, "learning_rate": 0.0002511139401654997, "loss": 0.3625, "step": 797 }, { "epoch": 0.5030534651904969, "grad_norm": 0.014560778625309467, "learning_rate": 0.0002507956715467855, "loss": 0.3319, "step": 798 }, { "epoch": 0.5036838580040187, "grad_norm": 0.01932406798005104, "learning_rate": 0.0002504774029280713, "loss": 0.3999, "step": 799 }, { "epoch": 0.5043142508175407, "grad_norm": 0.018284639343619347, "learning_rate": 0.0002501591343093571, "loss": 0.346, "step": 800 }, { "epoch": 0.5043142508175407, "eval_loss": 0.3877585828304291, "eval_runtime": 327.5475, "eval_samples_per_second": 3.053, "eval_steps_per_second": 3.053, "step": 800 }, { "epoch": 0.5049446436310626, "grad_norm": 0.014326919801533222, "learning_rate": 0.0002498408656906429, "loss": 0.357, "step": 801 }, { "epoch": 0.5055750364445846, "grad_norm": 0.01385468803346157, "learning_rate": 0.0002495225970719287, "loss": 0.3863, "step": 802 }, { "epoch": 0.5062054292581064, "grad_norm": 0.015757039189338684, "learning_rate": 0.00024920432845321455, "loss": 0.339, "step": 803 }, { "epoch": 0.5068358220716284, "grad_norm": 0.014080030843615532, "learning_rate": 0.00024888605983450034, "loss": 0.3457, "step": 804 }, { "epoch": 0.5074662148851503, "grad_norm": 0.01702369749546051, "learning_rate": 0.00024856779121578613, "loss": 0.3876, "step": 805 }, { "epoch": 0.5080966076986723, "grad_norm": 0.017006559297442436, "learning_rate": 0.00024824952259707193, "loss": 0.3557, "step": 806 }, { "epoch": 0.5087270005121942, "grad_norm": 0.019816676154732704, "learning_rate": 0.0002479312539783578, "loss": 0.3743, "step": 807 }, { "epoch": 0.5093573933257161, "grad_norm": 0.01247922983020544, "learning_rate": 0.00024761298535964357, "loss": 0.3105, "step": 808 }, { "epoch": 0.509987786139238, "grad_norm": 0.014722848311066628, "learning_rate": 0.00024729471674092936, "loss": 0.3961, "step": 809 }, { "epoch": 0.51061817895276, "grad_norm": 0.014283857308328152, "learning_rate": 0.00024697644812221516, "loss": 0.3453, "step": 810 }, { "epoch": 0.5112485717662819, "grad_norm": 0.015137928538024426, "learning_rate": 0.00024665817950350095, "loss": 0.3876, "step": 811 }, { "epoch": 0.5118789645798038, "grad_norm": 0.015896501019597054, "learning_rate": 0.00024633991088478675, "loss": 0.3935, "step": 812 }, { "epoch": 0.5125093573933257, "grad_norm": 0.014959828928112984, "learning_rate": 0.0002460216422660726, "loss": 0.4404, "step": 813 }, { "epoch": 0.5131397502068477, "grad_norm": 0.014572755433619022, "learning_rate": 0.0002457033736473584, "loss": 0.3634, "step": 814 }, { "epoch": 0.5137701430203696, "grad_norm": 0.013689921237528324, "learning_rate": 0.0002453851050286442, "loss": 0.326, "step": 815 }, { "epoch": 0.5144005358338914, "grad_norm": 0.01436834130436182, "learning_rate": 0.00024506683640993, "loss": 0.365, "step": 816 }, { "epoch": 0.5150309286474134, "grad_norm": 0.018159350380301476, "learning_rate": 0.00024474856779121577, "loss": 0.3933, "step": 817 }, { "epoch": 0.5156613214609354, "grad_norm": 0.013393767178058624, "learning_rate": 0.00024443029917250157, "loss": 0.333, "step": 818 }, { "epoch": 0.5162917142744573, "grad_norm": 0.014291583560407162, "learning_rate": 0.0002441120305537874, "loss": 0.3516, "step": 819 }, { "epoch": 0.5169221070879791, "grad_norm": 0.016608605161309242, "learning_rate": 0.0002437937619350732, "loss": 0.3385, "step": 820 }, { "epoch": 0.5175524999015011, "grad_norm": 0.012852616608142853, "learning_rate": 0.000243475493316359, "loss": 0.3451, "step": 821 }, { "epoch": 0.518182892715023, "grad_norm": 0.013805782422423363, "learning_rate": 0.00024315722469764483, "loss": 0.3624, "step": 822 }, { "epoch": 0.518813285528545, "grad_norm": 0.014693181961774826, "learning_rate": 0.00024283895607893062, "loss": 0.3157, "step": 823 }, { "epoch": 0.519443678342067, "grad_norm": 0.014225085265934467, "learning_rate": 0.00024252068746021644, "loss": 0.3397, "step": 824 }, { "epoch": 0.5200740711555888, "grad_norm": 0.019697504118084908, "learning_rate": 0.00024220241884150224, "loss": 0.4296, "step": 825 }, { "epoch": 0.5207044639691107, "grad_norm": 0.017022551968693733, "learning_rate": 0.00024188415022278803, "loss": 0.4123, "step": 826 }, { "epoch": 0.5213348567826327, "grad_norm": 0.016073765233159065, "learning_rate": 0.00024156588160407382, "loss": 0.3739, "step": 827 }, { "epoch": 0.5219652495961546, "grad_norm": 0.016565553843975067, "learning_rate": 0.00024124761298535967, "loss": 0.3692, "step": 828 }, { "epoch": 0.5225956424096765, "grad_norm": 0.015549513511359692, "learning_rate": 0.00024092934436664547, "loss": 0.3841, "step": 829 }, { "epoch": 0.5232260352231984, "grad_norm": 0.015681127086281776, "learning_rate": 0.00024061107574793126, "loss": 0.4059, "step": 830 }, { "epoch": 0.5238564280367204, "grad_norm": 0.016539370641112328, "learning_rate": 0.00024029280712921705, "loss": 0.4058, "step": 831 }, { "epoch": 0.5244868208502423, "grad_norm": 0.016183825209736824, "learning_rate": 0.00023997453851050288, "loss": 0.3739, "step": 832 }, { "epoch": 0.5251172136637642, "grad_norm": 0.013837708160281181, "learning_rate": 0.00023965626989178867, "loss": 0.2883, "step": 833 }, { "epoch": 0.5257476064772861, "grad_norm": 0.015588294714689255, "learning_rate": 0.0002393380012730745, "loss": 0.4081, "step": 834 }, { "epoch": 0.5263779992908081, "grad_norm": 0.01622762717306614, "learning_rate": 0.00023901973265436029, "loss": 0.3843, "step": 835 }, { "epoch": 0.52700839210433, "grad_norm": 0.01810673251748085, "learning_rate": 0.00023870146403564608, "loss": 0.3833, "step": 836 }, { "epoch": 0.527638784917852, "grad_norm": 0.019679656252264977, "learning_rate": 0.0002383831954169319, "loss": 0.4367, "step": 837 }, { "epoch": 0.5282691777313738, "grad_norm": 0.017767464742064476, "learning_rate": 0.0002380649267982177, "loss": 0.3267, "step": 838 }, { "epoch": 0.5288995705448958, "grad_norm": 0.015622109174728394, "learning_rate": 0.0002377466581795035, "loss": 0.3866, "step": 839 }, { "epoch": 0.5295299633584177, "grad_norm": 0.013189843855798244, "learning_rate": 0.0002374283895607893, "loss": 0.2954, "step": 840 }, { "epoch": 0.5301603561719397, "grad_norm": 0.019285282120108604, "learning_rate": 0.00023711012094207513, "loss": 0.3524, "step": 841 }, { "epoch": 0.5307907489854615, "grad_norm": 0.015225335024297237, "learning_rate": 0.00023679185232336093, "loss": 0.3456, "step": 842 }, { "epoch": 0.5314211417989835, "grad_norm": 0.020252123475074768, "learning_rate": 0.00023647358370464672, "loss": 0.4523, "step": 843 }, { "epoch": 0.5320515346125054, "grad_norm": 0.014538729563355446, "learning_rate": 0.00023615531508593252, "loss": 0.3597, "step": 844 }, { "epoch": 0.5326819274260274, "grad_norm": 0.014581016264855862, "learning_rate": 0.00023583704646721834, "loss": 0.3481, "step": 845 }, { "epoch": 0.5333123202395492, "grad_norm": 0.014762857928872108, "learning_rate": 0.00023551877784850416, "loss": 0.3721, "step": 846 }, { "epoch": 0.5339427130530712, "grad_norm": 0.016726767644286156, "learning_rate": 0.00023520050922978995, "loss": 0.4055, "step": 847 }, { "epoch": 0.5345731058665931, "grad_norm": 0.0158160999417305, "learning_rate": 0.00023488224061107575, "loss": 0.3274, "step": 848 }, { "epoch": 0.5352034986801151, "grad_norm": 0.013633741065859795, "learning_rate": 0.00023456397199236157, "loss": 0.366, "step": 849 }, { "epoch": 0.535833891493637, "grad_norm": 0.015466800890862942, "learning_rate": 0.00023424570337364736, "loss": 0.3817, "step": 850 }, { "epoch": 0.5364642843071589, "grad_norm": 0.014646458439528942, "learning_rate": 0.00023392743475493316, "loss": 0.369, "step": 851 }, { "epoch": 0.5370946771206808, "grad_norm": 0.013928486034274101, "learning_rate": 0.00023360916613621898, "loss": 0.3437, "step": 852 }, { "epoch": 0.5377250699342028, "grad_norm": 0.012963177636265755, "learning_rate": 0.0002332908975175048, "loss": 0.3478, "step": 853 }, { "epoch": 0.5383554627477247, "grad_norm": 0.014482310973107815, "learning_rate": 0.0002329726288987906, "loss": 0.3595, "step": 854 }, { "epoch": 0.5389858555612466, "grad_norm": 0.011819077655673027, "learning_rate": 0.0002326543602800764, "loss": 0.3527, "step": 855 }, { "epoch": 0.5396162483747685, "grad_norm": 0.01876092329621315, "learning_rate": 0.00023233609166136218, "loss": 0.4195, "step": 856 }, { "epoch": 0.5402466411882905, "grad_norm": 0.01506055798381567, "learning_rate": 0.000232017823042648, "loss": 0.3943, "step": 857 }, { "epoch": 0.5408770340018124, "grad_norm": 0.012046286836266518, "learning_rate": 0.00023169955442393382, "loss": 0.3066, "step": 858 }, { "epoch": 0.5415074268153343, "grad_norm": 0.018502840772271156, "learning_rate": 0.00023138128580521962, "loss": 0.3909, "step": 859 }, { "epoch": 0.5421378196288562, "grad_norm": 0.016555285081267357, "learning_rate": 0.0002310630171865054, "loss": 0.3894, "step": 860 }, { "epoch": 0.5427682124423782, "grad_norm": 0.013677193783223629, "learning_rate": 0.0002307447485677912, "loss": 0.3181, "step": 861 }, { "epoch": 0.5433986052559001, "grad_norm": 0.018048271536827087, "learning_rate": 0.00023042647994907703, "loss": 0.432, "step": 862 }, { "epoch": 0.544028998069422, "grad_norm": 0.028850600123405457, "learning_rate": 0.00023010821133036282, "loss": 0.3127, "step": 863 }, { "epoch": 0.5446593908829439, "grad_norm": 0.01547788456082344, "learning_rate": 0.00022978994271164864, "loss": 0.3398, "step": 864 }, { "epoch": 0.5452897836964659, "grad_norm": 0.02058050036430359, "learning_rate": 0.00022947167409293444, "loss": 0.3656, "step": 865 }, { "epoch": 0.5459201765099878, "grad_norm": 0.015212886966764927, "learning_rate": 0.00022915340547422026, "loss": 0.3958, "step": 866 }, { "epoch": 0.5465505693235098, "grad_norm": 0.015723006799817085, "learning_rate": 0.00022883513685550605, "loss": 0.3632, "step": 867 }, { "epoch": 0.5471809621370316, "grad_norm": 0.01525859534740448, "learning_rate": 0.00022851686823679185, "loss": 0.3888, "step": 868 }, { "epoch": 0.5478113549505536, "grad_norm": 0.01541211362928152, "learning_rate": 0.00022819859961807764, "loss": 0.3482, "step": 869 }, { "epoch": 0.5484417477640755, "grad_norm": 0.013938824646174908, "learning_rate": 0.0002278803309993635, "loss": 0.3747, "step": 870 }, { "epoch": 0.5490721405775975, "grad_norm": 0.013390318490564823, "learning_rate": 0.00022756206238064929, "loss": 0.3869, "step": 871 }, { "epoch": 0.5497025333911193, "grad_norm": 0.014436273835599422, "learning_rate": 0.00022724379376193508, "loss": 0.329, "step": 872 }, { "epoch": 0.5503329262046412, "grad_norm": 0.01456101331859827, "learning_rate": 0.00022692552514322087, "loss": 0.4038, "step": 873 }, { "epoch": 0.5509633190181632, "grad_norm": 0.01680777780711651, "learning_rate": 0.0002266072565245067, "loss": 0.3736, "step": 874 }, { "epoch": 0.5515937118316852, "grad_norm": 0.019318213686347008, "learning_rate": 0.0002262889879057925, "loss": 0.387, "step": 875 }, { "epoch": 0.552224104645207, "grad_norm": 0.016530238091945648, "learning_rate": 0.00022597071928707828, "loss": 0.3892, "step": 876 }, { "epoch": 0.5528544974587289, "grad_norm": 0.01574966311454773, "learning_rate": 0.0002256524506683641, "loss": 0.3818, "step": 877 }, { "epoch": 0.5534848902722509, "grad_norm": 0.013102530501782894, "learning_rate": 0.00022533418204964993, "loss": 0.2966, "step": 878 }, { "epoch": 0.5541152830857728, "grad_norm": 0.014598728157579899, "learning_rate": 0.00022501591343093572, "loss": 0.3295, "step": 879 }, { "epoch": 0.5547456758992948, "grad_norm": 0.016482273116707802, "learning_rate": 0.00022469764481222151, "loss": 0.3303, "step": 880 }, { "epoch": 0.5553760687128166, "grad_norm": 0.016923967748880386, "learning_rate": 0.0002243793761935073, "loss": 0.3548, "step": 881 }, { "epoch": 0.5560064615263386, "grad_norm": 0.015409741550683975, "learning_rate": 0.00022406110757479316, "loss": 0.3526, "step": 882 }, { "epoch": 0.5566368543398605, "grad_norm": 0.017666161060333252, "learning_rate": 0.00022374283895607895, "loss": 0.3851, "step": 883 }, { "epoch": 0.5572672471533825, "grad_norm": 0.012790391221642494, "learning_rate": 0.00022342457033736475, "loss": 0.3223, "step": 884 }, { "epoch": 0.5578976399669043, "grad_norm": 0.015927327796816826, "learning_rate": 0.00022310630171865054, "loss": 0.3694, "step": 885 }, { "epoch": 0.5585280327804263, "grad_norm": 0.014640790410339832, "learning_rate": 0.00022278803309993633, "loss": 0.3314, "step": 886 }, { "epoch": 0.5591584255939482, "grad_norm": 0.016205428168177605, "learning_rate": 0.00022246976448122216, "loss": 0.4152, "step": 887 }, { "epoch": 0.5597888184074702, "grad_norm": 0.016068758442997932, "learning_rate": 0.00022215149586250795, "loss": 0.3854, "step": 888 }, { "epoch": 0.560419211220992, "grad_norm": 0.017680400982499123, "learning_rate": 0.00022183322724379377, "loss": 0.3687, "step": 889 }, { "epoch": 0.561049604034514, "grad_norm": 0.013393621891736984, "learning_rate": 0.00022151495862507957, "loss": 0.3318, "step": 890 }, { "epoch": 0.5616799968480359, "grad_norm": 0.017189791426062584, "learning_rate": 0.0002211966900063654, "loss": 0.4156, "step": 891 }, { "epoch": 0.5623103896615579, "grad_norm": 0.017538657411932945, "learning_rate": 0.00022087842138765118, "loss": 0.4055, "step": 892 }, { "epoch": 0.5629407824750798, "grad_norm": 0.015574131160974503, "learning_rate": 0.00022056015276893698, "loss": 0.3537, "step": 893 }, { "epoch": 0.5635711752886017, "grad_norm": 0.0147732924669981, "learning_rate": 0.00022024188415022277, "loss": 0.3307, "step": 894 }, { "epoch": 0.5642015681021236, "grad_norm": 0.015737462788820267, "learning_rate": 0.00021992361553150862, "loss": 0.3925, "step": 895 }, { "epoch": 0.5648319609156456, "grad_norm": 0.013616523705422878, "learning_rate": 0.0002196053469127944, "loss": 0.3167, "step": 896 }, { "epoch": 0.5654623537291675, "grad_norm": 0.013405036181211472, "learning_rate": 0.0002192870782940802, "loss": 0.3036, "step": 897 }, { "epoch": 0.5660927465426894, "grad_norm": 0.012924853712320328, "learning_rate": 0.000218968809675366, "loss": 0.3413, "step": 898 }, { "epoch": 0.5667231393562113, "grad_norm": 0.015666674822568893, "learning_rate": 0.00021865054105665182, "loss": 0.2911, "step": 899 }, { "epoch": 0.5673535321697333, "grad_norm": 0.01583283580839634, "learning_rate": 0.00021833227243793762, "loss": 0.4137, "step": 900 }, { "epoch": 0.5679839249832552, "grad_norm": 0.015507655218243599, "learning_rate": 0.00021801400381922344, "loss": 0.3672, "step": 901 }, { "epoch": 0.5686143177967771, "grad_norm": 0.01216537319123745, "learning_rate": 0.00021769573520050923, "loss": 0.2869, "step": 902 }, { "epoch": 0.569244710610299, "grad_norm": 0.015423446893692017, "learning_rate": 0.00021737746658179505, "loss": 0.3857, "step": 903 }, { "epoch": 0.569875103423821, "grad_norm": 0.011813097633421421, "learning_rate": 0.00021705919796308085, "loss": 0.3298, "step": 904 }, { "epoch": 0.5705054962373429, "grad_norm": 0.020523948594927788, "learning_rate": 0.00021674092934436664, "loss": 0.4695, "step": 905 }, { "epoch": 0.5711358890508649, "grad_norm": 0.015451502054929733, "learning_rate": 0.00021642266072565244, "loss": 0.4236, "step": 906 }, { "epoch": 0.5717662818643867, "grad_norm": 0.014406215399503708, "learning_rate": 0.00021610439210693826, "loss": 0.3224, "step": 907 }, { "epoch": 0.5723966746779087, "grad_norm": 0.014332654885947704, "learning_rate": 0.00021578612348822408, "loss": 0.3765, "step": 908 }, { "epoch": 0.5730270674914306, "grad_norm": 0.0163109190762043, "learning_rate": 0.00021546785486950987, "loss": 0.3398, "step": 909 }, { "epoch": 0.5736574603049526, "grad_norm": 0.016083260998129845, "learning_rate": 0.00021514958625079567, "loss": 0.3301, "step": 910 }, { "epoch": 0.5742878531184744, "grad_norm": 0.01735375076532364, "learning_rate": 0.00021483131763208146, "loss": 0.3797, "step": 911 }, { "epoch": 0.5749182459319964, "grad_norm": 0.015985578298568726, "learning_rate": 0.00021451304901336728, "loss": 0.3715, "step": 912 }, { "epoch": 0.5755486387455183, "grad_norm": 0.020379502326250076, "learning_rate": 0.0002141947803946531, "loss": 0.3479, "step": 913 }, { "epoch": 0.5761790315590403, "grad_norm": 0.01490094792097807, "learning_rate": 0.0002138765117759389, "loss": 0.3344, "step": 914 }, { "epoch": 0.5768094243725621, "grad_norm": 0.015036589466035366, "learning_rate": 0.0002135582431572247, "loss": 0.365, "step": 915 }, { "epoch": 0.5774398171860841, "grad_norm": 0.015495358034968376, "learning_rate": 0.00021323997453851051, "loss": 0.3881, "step": 916 }, { "epoch": 0.578070209999606, "grad_norm": 0.015494070015847683, "learning_rate": 0.0002129217059197963, "loss": 0.348, "step": 917 }, { "epoch": 0.578700602813128, "grad_norm": 0.019342292100191116, "learning_rate": 0.0002126034373010821, "loss": 0.4298, "step": 918 }, { "epoch": 0.5793309956266498, "grad_norm": 0.01745907962322235, "learning_rate": 0.00021228516868236792, "loss": 0.4327, "step": 919 }, { "epoch": 0.5799613884401718, "grad_norm": 0.014646347612142563, "learning_rate": 0.00021196690006365375, "loss": 0.392, "step": 920 }, { "epoch": 0.5805917812536937, "grad_norm": 0.014264516532421112, "learning_rate": 0.00021164863144493954, "loss": 0.3714, "step": 921 }, { "epoch": 0.5812221740672157, "grad_norm": 0.015517518855631351, "learning_rate": 0.00021133036282622533, "loss": 0.3596, "step": 922 }, { "epoch": 0.5818525668807376, "grad_norm": 0.01458315271884203, "learning_rate": 0.00021101209420751113, "loss": 0.3837, "step": 923 }, { "epoch": 0.5824829596942595, "grad_norm": 0.014773000963032246, "learning_rate": 0.00021069382558879695, "loss": 0.3872, "step": 924 }, { "epoch": 0.5831133525077814, "grad_norm": 0.012169947847723961, "learning_rate": 0.00021037555697008277, "loss": 0.3044, "step": 925 }, { "epoch": 0.5837437453213034, "grad_norm": 0.01252297218888998, "learning_rate": 0.00021005728835136856, "loss": 0.3479, "step": 926 }, { "epoch": 0.5843741381348253, "grad_norm": 0.015428910031914711, "learning_rate": 0.00020973901973265436, "loss": 0.4072, "step": 927 }, { "epoch": 0.5850045309483471, "grad_norm": 0.01399853266775608, "learning_rate": 0.00020942075111394018, "loss": 0.3431, "step": 928 }, { "epoch": 0.5856349237618691, "grad_norm": 0.013901373371481895, "learning_rate": 0.00020910248249522597, "loss": 0.2927, "step": 929 }, { "epoch": 0.586265316575391, "grad_norm": 0.012254995293915272, "learning_rate": 0.00020878421387651177, "loss": 0.3191, "step": 930 }, { "epoch": 0.586895709388913, "grad_norm": 0.017550403252243996, "learning_rate": 0.0002084659452577976, "loss": 0.3213, "step": 931 }, { "epoch": 0.5875261022024348, "grad_norm": 0.015892772004008293, "learning_rate": 0.00020814767663908338, "loss": 0.3754, "step": 932 }, { "epoch": 0.5881564950159568, "grad_norm": 0.016435440629720688, "learning_rate": 0.0002078294080203692, "loss": 0.4193, "step": 933 }, { "epoch": 0.5887868878294787, "grad_norm": 0.013939352706074715, "learning_rate": 0.000207511139401655, "loss": 0.3665, "step": 934 }, { "epoch": 0.5894172806430007, "grad_norm": 0.01750248298048973, "learning_rate": 0.0002071928707829408, "loss": 0.3184, "step": 935 }, { "epoch": 0.5900476734565226, "grad_norm": 0.01843385398387909, "learning_rate": 0.0002068746021642266, "loss": 0.3932, "step": 936 }, { "epoch": 0.5906780662700445, "grad_norm": 0.014677590690553188, "learning_rate": 0.00020655633354551244, "loss": 0.3096, "step": 937 }, { "epoch": 0.5913084590835664, "grad_norm": 0.016284259036183357, "learning_rate": 0.00020623806492679823, "loss": 0.3772, "step": 938 }, { "epoch": 0.5919388518970884, "grad_norm": 0.012798065319657326, "learning_rate": 0.00020591979630808403, "loss": 0.2926, "step": 939 }, { "epoch": 0.5925692447106103, "grad_norm": 0.012696528807282448, "learning_rate": 0.00020560152768936982, "loss": 0.3449, "step": 940 }, { "epoch": 0.5931996375241322, "grad_norm": 0.01593698188662529, "learning_rate": 0.00020528325907065564, "loss": 0.4062, "step": 941 }, { "epoch": 0.5938300303376541, "grad_norm": 0.01461632177233696, "learning_rate": 0.00020496499045194144, "loss": 0.2686, "step": 942 }, { "epoch": 0.5944604231511761, "grad_norm": 0.0140744149684906, "learning_rate": 0.00020464672183322726, "loss": 0.3178, "step": 943 }, { "epoch": 0.595090815964698, "grad_norm": 0.019624970853328705, "learning_rate": 0.00020432845321451305, "loss": 0.4125, "step": 944 }, { "epoch": 0.5957212087782199, "grad_norm": 0.016739264130592346, "learning_rate": 0.00020401018459579887, "loss": 0.4202, "step": 945 }, { "epoch": 0.5963516015917418, "grad_norm": 0.01655811257660389, "learning_rate": 0.00020369191597708467, "loss": 0.3786, "step": 946 }, { "epoch": 0.5969819944052638, "grad_norm": 0.014386712573468685, "learning_rate": 0.00020337364735837046, "loss": 0.3329, "step": 947 }, { "epoch": 0.5976123872187857, "grad_norm": 0.01589634083211422, "learning_rate": 0.00020305537873965625, "loss": 0.3732, "step": 948 }, { "epoch": 0.5982427800323077, "grad_norm": 0.025000469759106636, "learning_rate": 0.0002027371101209421, "loss": 0.3471, "step": 949 }, { "epoch": 0.5988731728458295, "grad_norm": 0.016582254320383072, "learning_rate": 0.0002024188415022279, "loss": 0.3464, "step": 950 }, { "epoch": 0.5995035656593515, "grad_norm": 0.017135273665189743, "learning_rate": 0.0002021005728835137, "loss": 0.3679, "step": 951 }, { "epoch": 0.6001339584728734, "grad_norm": 0.014515586197376251, "learning_rate": 0.00020178230426479949, "loss": 0.3468, "step": 952 }, { "epoch": 0.6007643512863954, "grad_norm": 0.017611291259527206, "learning_rate": 0.00020146403564608528, "loss": 0.3622, "step": 953 }, { "epoch": 0.6013947440999172, "grad_norm": 0.013798318803310394, "learning_rate": 0.0002011457670273711, "loss": 0.3228, "step": 954 }, { "epoch": 0.6020251369134392, "grad_norm": 0.017348121851682663, "learning_rate": 0.00020082749840865692, "loss": 0.3621, "step": 955 }, { "epoch": 0.6026555297269611, "grad_norm": 0.015101751312613487, "learning_rate": 0.00020050922978994272, "loss": 0.3343, "step": 956 }, { "epoch": 0.6032859225404831, "grad_norm": 0.012345206923782825, "learning_rate": 0.0002001909611712285, "loss": 0.3044, "step": 957 }, { "epoch": 0.6039163153540049, "grad_norm": 0.014750892296433449, "learning_rate": 0.00019987269255251433, "loss": 0.3469, "step": 958 }, { "epoch": 0.6045467081675269, "grad_norm": 0.015019785612821579, "learning_rate": 0.00019955442393380013, "loss": 0.3471, "step": 959 }, { "epoch": 0.6051771009810488, "grad_norm": 0.01467643678188324, "learning_rate": 0.00019923615531508592, "loss": 0.3655, "step": 960 }, { "epoch": 0.6058074937945708, "grad_norm": 0.012713417410850525, "learning_rate": 0.00019891788669637174, "loss": 0.3617, "step": 961 }, { "epoch": 0.6064378866080926, "grad_norm": 0.013565322384238243, "learning_rate": 0.00019859961807765756, "loss": 0.3335, "step": 962 }, { "epoch": 0.6070682794216146, "grad_norm": 0.014555517584085464, "learning_rate": 0.00019828134945894336, "loss": 0.3831, "step": 963 }, { "epoch": 0.6076986722351365, "grad_norm": 0.014481969177722931, "learning_rate": 0.00019796308084022915, "loss": 0.3357, "step": 964 }, { "epoch": 0.6083290650486585, "grad_norm": 0.018213188275694847, "learning_rate": 0.00019764481222151495, "loss": 0.3743, "step": 965 }, { "epoch": 0.6089594578621804, "grad_norm": 0.01424343977123499, "learning_rate": 0.00019732654360280077, "loss": 0.3649, "step": 966 }, { "epoch": 0.6095898506757023, "grad_norm": 0.013987261801958084, "learning_rate": 0.0001970082749840866, "loss": 0.3578, "step": 967 }, { "epoch": 0.6102202434892242, "grad_norm": 0.017957430332899094, "learning_rate": 0.00019669000636537238, "loss": 0.4362, "step": 968 }, { "epoch": 0.6108506363027462, "grad_norm": 0.015558002516627312, "learning_rate": 0.00019637173774665818, "loss": 0.3649, "step": 969 }, { "epoch": 0.6114810291162681, "grad_norm": 0.014632182195782661, "learning_rate": 0.000196053469127944, "loss": 0.3144, "step": 970 }, { "epoch": 0.61211142192979, "grad_norm": 0.019948672503232956, "learning_rate": 0.0001957352005092298, "loss": 0.3611, "step": 971 }, { "epoch": 0.6127418147433119, "grad_norm": 0.013741201721131802, "learning_rate": 0.0001954169318905156, "loss": 0.3048, "step": 972 }, { "epoch": 0.6133722075568339, "grad_norm": 0.018888531252741814, "learning_rate": 0.0001950986632718014, "loss": 0.3443, "step": 973 }, { "epoch": 0.6140026003703558, "grad_norm": 0.015082293190062046, "learning_rate": 0.00019478039465308723, "loss": 0.3546, "step": 974 }, { "epoch": 0.6146329931838777, "grad_norm": 0.013504636473953724, "learning_rate": 0.00019446212603437302, "loss": 0.3382, "step": 975 }, { "epoch": 0.6152633859973996, "grad_norm": 0.015025708824396133, "learning_rate": 0.00019414385741565882, "loss": 0.3255, "step": 976 }, { "epoch": 0.6158937788109216, "grad_norm": 0.015464656054973602, "learning_rate": 0.0001938255887969446, "loss": 0.366, "step": 977 }, { "epoch": 0.6165241716244435, "grad_norm": 0.017639921978116035, "learning_rate": 0.0001935073201782304, "loss": 0.3599, "step": 978 }, { "epoch": 0.6171545644379655, "grad_norm": 0.012217794544994831, "learning_rate": 0.00019318905155951626, "loss": 0.3622, "step": 979 }, { "epoch": 0.6177849572514873, "grad_norm": 0.012964189983904362, "learning_rate": 0.00019287078294080205, "loss": 0.3223, "step": 980 }, { "epoch": 0.6184153500650093, "grad_norm": 0.011494590900838375, "learning_rate": 0.00019255251432208784, "loss": 0.3019, "step": 981 }, { "epoch": 0.6190457428785312, "grad_norm": 0.015708742663264275, "learning_rate": 0.00019223424570337364, "loss": 0.3409, "step": 982 }, { "epoch": 0.6196761356920532, "grad_norm": 0.015088909305632114, "learning_rate": 0.00019191597708465946, "loss": 0.298, "step": 983 }, { "epoch": 0.620306528505575, "grad_norm": 0.021463219076395035, "learning_rate": 0.00019159770846594525, "loss": 0.4337, "step": 984 }, { "epoch": 0.620936921319097, "grad_norm": 0.01627621427178383, "learning_rate": 0.00019127943984723108, "loss": 0.3315, "step": 985 }, { "epoch": 0.6215673141326189, "grad_norm": 0.018527982756495476, "learning_rate": 0.00019096117122851687, "loss": 0.4431, "step": 986 }, { "epoch": 0.6221977069461408, "grad_norm": 0.014923629350960255, "learning_rate": 0.0001906429026098027, "loss": 0.3033, "step": 987 }, { "epoch": 0.6228280997596627, "grad_norm": 0.013997436501085758, "learning_rate": 0.00019032463399108849, "loss": 0.3786, "step": 988 }, { "epoch": 0.6234584925731846, "grad_norm": 0.015426073223352432, "learning_rate": 0.00019000636537237428, "loss": 0.3181, "step": 989 }, { "epoch": 0.6240888853867066, "grad_norm": 0.015969309955835342, "learning_rate": 0.00018968809675366007, "loss": 0.3354, "step": 990 }, { "epoch": 0.6247192782002285, "grad_norm": 0.01509455218911171, "learning_rate": 0.00018936982813494592, "loss": 0.3677, "step": 991 }, { "epoch": 0.6253496710137505, "grad_norm": 0.013259136117994785, "learning_rate": 0.00018905155951623172, "loss": 0.3698, "step": 992 }, { "epoch": 0.6259800638272723, "grad_norm": 0.015168133191764355, "learning_rate": 0.0001887332908975175, "loss": 0.3508, "step": 993 }, { "epoch": 0.6266104566407943, "grad_norm": 0.015011481009423733, "learning_rate": 0.0001884150222788033, "loss": 0.3513, "step": 994 }, { "epoch": 0.6272408494543162, "grad_norm": 0.01507746521383524, "learning_rate": 0.00018809675366008913, "loss": 0.3458, "step": 995 }, { "epoch": 0.6278712422678382, "grad_norm": 0.014542695134878159, "learning_rate": 0.00018777848504137492, "loss": 0.3548, "step": 996 }, { "epoch": 0.62850163508136, "grad_norm": 0.015464426949620247, "learning_rate": 0.00018746021642266074, "loss": 0.373, "step": 997 }, { "epoch": 0.629132027894882, "grad_norm": 0.01392137911170721, "learning_rate": 0.00018714194780394654, "loss": 0.3251, "step": 998 }, { "epoch": 0.6297624207084039, "grad_norm": 0.018111450597643852, "learning_rate": 0.00018682367918523236, "loss": 0.4391, "step": 999 }, { "epoch": 0.6303928135219259, "grad_norm": 0.015064990147948265, "learning_rate": 0.00018650541056651815, "loss": 0.3284, "step": 1000 }, { "epoch": 0.6303928135219259, "eval_loss": 0.3819592595100403, "eval_runtime": 327.6303, "eval_samples_per_second": 3.052, "eval_steps_per_second": 3.052, "step": 1000 }, { "epoch": 0.6310232063354477, "grad_norm": 0.015068711712956429, "learning_rate": 0.00018618714194780395, "loss": 0.3085, "step": 1001 }, { "epoch": 0.6316535991489697, "grad_norm": 0.016730882227420807, "learning_rate": 0.00018586887332908974, "loss": 0.3563, "step": 1002 }, { "epoch": 0.6322839919624916, "grad_norm": 0.012358102947473526, "learning_rate": 0.00018555060471037556, "loss": 0.3397, "step": 1003 }, { "epoch": 0.6329143847760136, "grad_norm": 0.015266484580934048, "learning_rate": 0.00018523233609166138, "loss": 0.3498, "step": 1004 }, { "epoch": 0.6335447775895354, "grad_norm": 0.015215915627777576, "learning_rate": 0.00018491406747294718, "loss": 0.3403, "step": 1005 }, { "epoch": 0.6341751704030574, "grad_norm": 0.013244755566120148, "learning_rate": 0.00018459579885423297, "loss": 0.3265, "step": 1006 }, { "epoch": 0.6348055632165793, "grad_norm": 0.017123814672231674, "learning_rate": 0.00018427753023551877, "loss": 0.4079, "step": 1007 }, { "epoch": 0.6354359560301013, "grad_norm": 0.015092308633029461, "learning_rate": 0.0001839592616168046, "loss": 0.4049, "step": 1008 }, { "epoch": 0.6360663488436232, "grad_norm": 0.014831154607236385, "learning_rate": 0.0001836409929980904, "loss": 0.34, "step": 1009 }, { "epoch": 0.6366967416571451, "grad_norm": 0.014250526204705238, "learning_rate": 0.0001833227243793762, "loss": 0.3637, "step": 1010 }, { "epoch": 0.637327134470667, "grad_norm": 0.015048504807054996, "learning_rate": 0.000183004455760662, "loss": 0.3653, "step": 1011 }, { "epoch": 0.637957527284189, "grad_norm": 0.03703795373439789, "learning_rate": 0.00018268618714194782, "loss": 0.4197, "step": 1012 }, { "epoch": 0.6385879200977109, "grad_norm": 0.025124607607722282, "learning_rate": 0.0001823679185232336, "loss": 0.4073, "step": 1013 }, { "epoch": 0.6392183129112328, "grad_norm": 0.011759684421122074, "learning_rate": 0.0001820496499045194, "loss": 0.2912, "step": 1014 }, { "epoch": 0.6398487057247547, "grad_norm": 0.015065212734043598, "learning_rate": 0.0001817313812858052, "loss": 0.3772, "step": 1015 }, { "epoch": 0.6404790985382767, "grad_norm": 0.014481586404144764, "learning_rate": 0.00018141311266709105, "loss": 0.3922, "step": 1016 }, { "epoch": 0.6411094913517986, "grad_norm": 0.016161121428012848, "learning_rate": 0.00018109484404837684, "loss": 0.3518, "step": 1017 }, { "epoch": 0.6417398841653205, "grad_norm": 0.01265521626919508, "learning_rate": 0.00018077657542966264, "loss": 0.3314, "step": 1018 }, { "epoch": 0.6423702769788424, "grad_norm": 0.01835082471370697, "learning_rate": 0.00018045830681094843, "loss": 0.3887, "step": 1019 }, { "epoch": 0.6430006697923644, "grad_norm": 0.01568443700671196, "learning_rate": 0.00018014003819223425, "loss": 0.3971, "step": 1020 }, { "epoch": 0.6436310626058863, "grad_norm": 0.019493689760565758, "learning_rate": 0.00017982176957352005, "loss": 0.4661, "step": 1021 }, { "epoch": 0.6442614554194083, "grad_norm": 0.0195660088211298, "learning_rate": 0.00017950350095480587, "loss": 0.4193, "step": 1022 }, { "epoch": 0.6448918482329301, "grad_norm": 0.01241431012749672, "learning_rate": 0.00017918523233609166, "loss": 0.3244, "step": 1023 }, { "epoch": 0.6455222410464521, "grad_norm": 0.016457142308354378, "learning_rate": 0.00017886696371737746, "loss": 0.3626, "step": 1024 }, { "epoch": 0.646152633859974, "grad_norm": 0.01771700754761696, "learning_rate": 0.00017854869509866328, "loss": 0.3847, "step": 1025 }, { "epoch": 0.646783026673496, "grad_norm": 0.01582372933626175, "learning_rate": 0.00017823042647994907, "loss": 0.3403, "step": 1026 }, { "epoch": 0.6474134194870178, "grad_norm": 0.01579039730131626, "learning_rate": 0.00017791215786123487, "loss": 0.4222, "step": 1027 }, { "epoch": 0.6480438123005398, "grad_norm": 0.01896340399980545, "learning_rate": 0.0001775938892425207, "loss": 0.306, "step": 1028 }, { "epoch": 0.6486742051140617, "grad_norm": 0.014032997190952301, "learning_rate": 0.0001772756206238065, "loss": 0.3703, "step": 1029 }, { "epoch": 0.6493045979275837, "grad_norm": 0.014520079828798771, "learning_rate": 0.0001769573520050923, "loss": 0.3646, "step": 1030 }, { "epoch": 0.6499349907411055, "grad_norm": 0.01619674451649189, "learning_rate": 0.0001766390833863781, "loss": 0.3684, "step": 1031 }, { "epoch": 0.6505653835546275, "grad_norm": 0.01473263744264841, "learning_rate": 0.0001763208147676639, "loss": 0.3759, "step": 1032 }, { "epoch": 0.6511957763681494, "grad_norm": 0.014320430345833302, "learning_rate": 0.00017600254614894971, "loss": 0.3485, "step": 1033 }, { "epoch": 0.6518261691816714, "grad_norm": 0.013830669224262238, "learning_rate": 0.00017568427753023554, "loss": 0.3317, "step": 1034 }, { "epoch": 0.6524565619951933, "grad_norm": 0.021012544631958008, "learning_rate": 0.00017536600891152133, "loss": 0.3925, "step": 1035 }, { "epoch": 0.6530869548087151, "grad_norm": 0.012850931845605373, "learning_rate": 0.00017504774029280712, "loss": 0.3472, "step": 1036 }, { "epoch": 0.6537173476222371, "grad_norm": 0.018168209120631218, "learning_rate": 0.00017472947167409295, "loss": 0.3802, "step": 1037 }, { "epoch": 0.654347740435759, "grad_norm": 0.015929190441966057, "learning_rate": 0.00017441120305537874, "loss": 0.3169, "step": 1038 }, { "epoch": 0.654978133249281, "grad_norm": 0.01426453422755003, "learning_rate": 0.00017409293443666453, "loss": 0.3053, "step": 1039 }, { "epoch": 0.6556085260628028, "grad_norm": 0.014137396588921547, "learning_rate": 0.00017377466581795035, "loss": 0.4081, "step": 1040 }, { "epoch": 0.6562389188763248, "grad_norm": 0.014705068431794643, "learning_rate": 0.00017345639719923618, "loss": 0.4056, "step": 1041 }, { "epoch": 0.6568693116898467, "grad_norm": 0.017836054787039757, "learning_rate": 0.00017313812858052197, "loss": 0.3639, "step": 1042 }, { "epoch": 0.6574997045033687, "grad_norm": 0.015535780228674412, "learning_rate": 0.00017281985996180776, "loss": 0.3549, "step": 1043 }, { "epoch": 0.6581300973168905, "grad_norm": 0.020157353952527046, "learning_rate": 0.00017250159134309356, "loss": 0.3198, "step": 1044 }, { "epoch": 0.6587604901304125, "grad_norm": 0.01624133065342903, "learning_rate": 0.00017218332272437938, "loss": 0.3433, "step": 1045 }, { "epoch": 0.6593908829439344, "grad_norm": 0.015931300818920135, "learning_rate": 0.0001718650541056652, "loss": 0.334, "step": 1046 }, { "epoch": 0.6600212757574564, "grad_norm": 0.01300452183932066, "learning_rate": 0.000171546785486951, "loss": 0.3252, "step": 1047 }, { "epoch": 0.6606516685709783, "grad_norm": 0.01916547492146492, "learning_rate": 0.0001712285168682368, "loss": 0.4276, "step": 1048 }, { "epoch": 0.6612820613845002, "grad_norm": 0.015166277065873146, "learning_rate": 0.00017091024824952258, "loss": 0.3959, "step": 1049 }, { "epoch": 0.6619124541980221, "grad_norm": 0.013116384856402874, "learning_rate": 0.0001705919796308084, "loss": 0.358, "step": 1050 }, { "epoch": 0.6625428470115441, "grad_norm": 0.011849263682961464, "learning_rate": 0.0001702737110120942, "loss": 0.309, "step": 1051 }, { "epoch": 0.663173239825066, "grad_norm": 0.01596028544008732, "learning_rate": 0.00016995544239338002, "loss": 0.383, "step": 1052 }, { "epoch": 0.6638036326385879, "grad_norm": 0.014568407088518143, "learning_rate": 0.00016963717377466582, "loss": 0.381, "step": 1053 }, { "epoch": 0.6644340254521098, "grad_norm": 0.01490836776793003, "learning_rate": 0.00016931890515595164, "loss": 0.3668, "step": 1054 }, { "epoch": 0.6650644182656318, "grad_norm": 0.015523996204137802, "learning_rate": 0.00016900063653723743, "loss": 0.3195, "step": 1055 }, { "epoch": 0.6656948110791537, "grad_norm": 0.012916208244860172, "learning_rate": 0.00016868236791852323, "loss": 0.2891, "step": 1056 }, { "epoch": 0.6663252038926756, "grad_norm": 0.015148383565247059, "learning_rate": 0.00016836409929980902, "loss": 0.3782, "step": 1057 }, { "epoch": 0.6669555967061975, "grad_norm": 0.01462711114436388, "learning_rate": 0.00016804583068109487, "loss": 0.3424, "step": 1058 }, { "epoch": 0.6675859895197195, "grad_norm": 0.015564646571874619, "learning_rate": 0.00016772756206238066, "loss": 0.3586, "step": 1059 }, { "epoch": 0.6682163823332414, "grad_norm": 0.01695174165070057, "learning_rate": 0.00016740929344366646, "loss": 0.4015, "step": 1060 }, { "epoch": 0.6688467751467633, "grad_norm": 0.017118029296398163, "learning_rate": 0.00016709102482495225, "loss": 0.3368, "step": 1061 }, { "epoch": 0.6694771679602852, "grad_norm": 0.015382718294858932, "learning_rate": 0.00016677275620623807, "loss": 0.3256, "step": 1062 }, { "epoch": 0.6701075607738072, "grad_norm": 0.013990400359034538, "learning_rate": 0.00016645448758752387, "loss": 0.311, "step": 1063 }, { "epoch": 0.6707379535873291, "grad_norm": 0.014588209800422192, "learning_rate": 0.0001661362189688097, "loss": 0.4017, "step": 1064 }, { "epoch": 0.6713683464008511, "grad_norm": 0.014173957519233227, "learning_rate": 0.00016581795035009548, "loss": 0.3324, "step": 1065 }, { "epoch": 0.6719987392143729, "grad_norm": 0.01841246336698532, "learning_rate": 0.0001654996817313813, "loss": 0.3791, "step": 1066 }, { "epoch": 0.6726291320278949, "grad_norm": 0.016435330733656883, "learning_rate": 0.0001651814131126671, "loss": 0.3673, "step": 1067 }, { "epoch": 0.6732595248414168, "grad_norm": 0.016316816210746765, "learning_rate": 0.0001648631444939529, "loss": 0.4126, "step": 1068 }, { "epoch": 0.6738899176549388, "grad_norm": 0.01850137859582901, "learning_rate": 0.00016454487587523869, "loss": 0.4082, "step": 1069 }, { "epoch": 0.6745203104684606, "grad_norm": 0.015576261095702648, "learning_rate": 0.00016422660725652453, "loss": 0.404, "step": 1070 }, { "epoch": 0.6751507032819826, "grad_norm": 0.014396149665117264, "learning_rate": 0.00016390833863781033, "loss": 0.3596, "step": 1071 }, { "epoch": 0.6757810960955045, "grad_norm": 0.013356272131204605, "learning_rate": 0.00016359007001909612, "loss": 0.3257, "step": 1072 }, { "epoch": 0.6764114889090265, "grad_norm": 0.013586311601102352, "learning_rate": 0.00016327180140038192, "loss": 0.3796, "step": 1073 }, { "epoch": 0.6770418817225483, "grad_norm": 0.015921304002404213, "learning_rate": 0.0001629535327816677, "loss": 0.3872, "step": 1074 }, { "epoch": 0.6776722745360703, "grad_norm": 0.014110450632870197, "learning_rate": 0.00016263526416295353, "loss": 0.3377, "step": 1075 }, { "epoch": 0.6783026673495922, "grad_norm": 0.01424513477832079, "learning_rate": 0.00016231699554423935, "loss": 0.3338, "step": 1076 }, { "epoch": 0.6789330601631142, "grad_norm": 0.016025831922888756, "learning_rate": 0.00016199872692552515, "loss": 0.424, "step": 1077 }, { "epoch": 0.6795634529766361, "grad_norm": 0.015872562304139137, "learning_rate": 0.00016168045830681094, "loss": 0.3763, "step": 1078 }, { "epoch": 0.680193845790158, "grad_norm": 0.014147383160889149, "learning_rate": 0.00016136218968809676, "loss": 0.3801, "step": 1079 }, { "epoch": 0.6808242386036799, "grad_norm": 0.014619135297834873, "learning_rate": 0.00016104392106938256, "loss": 0.3101, "step": 1080 }, { "epoch": 0.6814546314172019, "grad_norm": 0.015309452079236507, "learning_rate": 0.00016072565245066835, "loss": 0.3374, "step": 1081 }, { "epoch": 0.6820850242307238, "grad_norm": 0.013624235987663269, "learning_rate": 0.00016040738383195417, "loss": 0.3307, "step": 1082 }, { "epoch": 0.6827154170442457, "grad_norm": 0.016418637707829475, "learning_rate": 0.00016008911521324, "loss": 0.3483, "step": 1083 }, { "epoch": 0.6833458098577676, "grad_norm": 0.015705889090895653, "learning_rate": 0.0001597708465945258, "loss": 0.3484, "step": 1084 }, { "epoch": 0.6839762026712896, "grad_norm": 0.014886502176523209, "learning_rate": 0.00015945257797581158, "loss": 0.3599, "step": 1085 }, { "epoch": 0.6846065954848115, "grad_norm": 0.01729169301688671, "learning_rate": 0.00015913430935709738, "loss": 0.3755, "step": 1086 }, { "epoch": 0.6852369882983333, "grad_norm": 0.012659845873713493, "learning_rate": 0.0001588160407383832, "loss": 0.3519, "step": 1087 }, { "epoch": 0.6858673811118553, "grad_norm": 0.01389190275222063, "learning_rate": 0.00015849777211966902, "loss": 0.4104, "step": 1088 }, { "epoch": 0.6864977739253773, "grad_norm": 0.015626948326826096, "learning_rate": 0.00015817950350095481, "loss": 0.3997, "step": 1089 }, { "epoch": 0.6871281667388992, "grad_norm": 0.016446728259325027, "learning_rate": 0.0001578612348822406, "loss": 0.3918, "step": 1090 }, { "epoch": 0.6877585595524212, "grad_norm": 0.01472299825400114, "learning_rate": 0.00015754296626352643, "loss": 0.3389, "step": 1091 }, { "epoch": 0.688388952365943, "grad_norm": 0.014689675532281399, "learning_rate": 0.00015722469764481222, "loss": 0.3206, "step": 1092 }, { "epoch": 0.689019345179465, "grad_norm": 0.014283616095781326, "learning_rate": 0.00015690642902609802, "loss": 0.3475, "step": 1093 }, { "epoch": 0.6896497379929869, "grad_norm": 0.017340607941150665, "learning_rate": 0.00015658816040738384, "loss": 0.405, "step": 1094 }, { "epoch": 0.6902801308065089, "grad_norm": 0.018278228119015694, "learning_rate": 0.00015626989178866963, "loss": 0.4156, "step": 1095 }, { "epoch": 0.6909105236200307, "grad_norm": 0.014322202652692795, "learning_rate": 0.00015595162316995546, "loss": 0.3463, "step": 1096 }, { "epoch": 0.6915409164335526, "grad_norm": 0.01600111462175846, "learning_rate": 0.00015563335455124125, "loss": 0.3531, "step": 1097 }, { "epoch": 0.6921713092470746, "grad_norm": 0.014019348658621311, "learning_rate": 0.00015531508593252704, "loss": 0.3346, "step": 1098 }, { "epoch": 0.6928017020605965, "grad_norm": 0.013077255338430405, "learning_rate": 0.00015499681731381284, "loss": 0.3374, "step": 1099 }, { "epoch": 0.6934320948741184, "grad_norm": 0.012031404301524162, "learning_rate": 0.0001546785486950987, "loss": 0.3334, "step": 1100 }, { "epoch": 0.6940624876876403, "grad_norm": 0.01499242428690195, "learning_rate": 0.00015436028007638448, "loss": 0.3226, "step": 1101 }, { "epoch": 0.6946928805011623, "grad_norm": 0.01444091647863388, "learning_rate": 0.00015404201145767028, "loss": 0.3173, "step": 1102 }, { "epoch": 0.6953232733146842, "grad_norm": 0.014048528857529163, "learning_rate": 0.00015372374283895607, "loss": 0.3171, "step": 1103 }, { "epoch": 0.6959536661282061, "grad_norm": 0.017323395237326622, "learning_rate": 0.0001534054742202419, "loss": 0.4283, "step": 1104 }, { "epoch": 0.696584058941728, "grad_norm": 0.015844693407416344, "learning_rate": 0.00015308720560152769, "loss": 0.3696, "step": 1105 }, { "epoch": 0.69721445175525, "grad_norm": 0.013314551673829556, "learning_rate": 0.0001527689369828135, "loss": 0.3057, "step": 1106 }, { "epoch": 0.6978448445687719, "grad_norm": 0.01205646526068449, "learning_rate": 0.0001524506683640993, "loss": 0.3034, "step": 1107 }, { "epoch": 0.6984752373822939, "grad_norm": 0.013907050713896751, "learning_rate": 0.00015213239974538512, "loss": 0.3492, "step": 1108 }, { "epoch": 0.6991056301958157, "grad_norm": 0.017586039379239082, "learning_rate": 0.00015181413112667092, "loss": 0.4372, "step": 1109 }, { "epoch": 0.6997360230093377, "grad_norm": 0.014180300757288933, "learning_rate": 0.0001514958625079567, "loss": 0.3168, "step": 1110 }, { "epoch": 0.7003664158228596, "grad_norm": 0.015157987363636494, "learning_rate": 0.0001511775938892425, "loss": 0.3685, "step": 1111 }, { "epoch": 0.7009968086363816, "grad_norm": 0.022615592926740646, "learning_rate": 0.00015085932527052835, "loss": 0.3559, "step": 1112 }, { "epoch": 0.7016272014499034, "grad_norm": 0.014078492298722267, "learning_rate": 0.00015054105665181415, "loss": 0.3295, "step": 1113 }, { "epoch": 0.7022575942634254, "grad_norm": 0.013817540369927883, "learning_rate": 0.00015022278803309994, "loss": 0.2954, "step": 1114 }, { "epoch": 0.7028879870769473, "grad_norm": 0.016275597736239433, "learning_rate": 0.00014990451941438574, "loss": 0.3453, "step": 1115 }, { "epoch": 0.7035183798904693, "grad_norm": 0.013090684078633785, "learning_rate": 0.00014958625079567156, "loss": 0.3259, "step": 1116 }, { "epoch": 0.7041487727039911, "grad_norm": 0.011116759851574898, "learning_rate": 0.00014926798217695735, "loss": 0.2933, "step": 1117 }, { "epoch": 0.7047791655175131, "grad_norm": 0.014334728941321373, "learning_rate": 0.00014894971355824317, "loss": 0.3377, "step": 1118 }, { "epoch": 0.705409558331035, "grad_norm": 0.01556230615824461, "learning_rate": 0.00014863144493952897, "loss": 0.3647, "step": 1119 }, { "epoch": 0.706039951144557, "grad_norm": 0.014519312418997288, "learning_rate": 0.00014831317632081476, "loss": 0.3107, "step": 1120 }, { "epoch": 0.7066703439580789, "grad_norm": 0.013213284313678741, "learning_rate": 0.00014799490770210058, "loss": 0.3477, "step": 1121 }, { "epoch": 0.7073007367716008, "grad_norm": 0.01849515363574028, "learning_rate": 0.00014767663908338638, "loss": 0.3708, "step": 1122 }, { "epoch": 0.7079311295851227, "grad_norm": 0.013741868548095226, "learning_rate": 0.00014735837046467217, "loss": 0.3521, "step": 1123 }, { "epoch": 0.7085615223986447, "grad_norm": 0.018180161714553833, "learning_rate": 0.000147040101845958, "loss": 0.4702, "step": 1124 }, { "epoch": 0.7091919152121666, "grad_norm": 0.01568315550684929, "learning_rate": 0.00014672183322724381, "loss": 0.3863, "step": 1125 }, { "epoch": 0.7098223080256885, "grad_norm": 0.014101453125476837, "learning_rate": 0.0001464035646085296, "loss": 0.3667, "step": 1126 }, { "epoch": 0.7104527008392104, "grad_norm": 0.016601860523223877, "learning_rate": 0.0001460852959898154, "loss": 0.384, "step": 1127 }, { "epoch": 0.7110830936527324, "grad_norm": 0.01644286699593067, "learning_rate": 0.0001457670273711012, "loss": 0.4105, "step": 1128 }, { "epoch": 0.7117134864662543, "grad_norm": 0.018860826268792152, "learning_rate": 0.00014544875875238702, "loss": 0.3278, "step": 1129 }, { "epoch": 0.7123438792797762, "grad_norm": 0.01249612309038639, "learning_rate": 0.00014513049013367284, "loss": 0.3136, "step": 1130 }, { "epoch": 0.7129742720932981, "grad_norm": 0.017647631466388702, "learning_rate": 0.00014481222151495863, "loss": 0.4117, "step": 1131 }, { "epoch": 0.7136046649068201, "grad_norm": 0.013655045069754124, "learning_rate": 0.00014449395289624443, "loss": 0.3546, "step": 1132 }, { "epoch": 0.714235057720342, "grad_norm": 0.014956158585846424, "learning_rate": 0.00014417568427753025, "loss": 0.3911, "step": 1133 }, { "epoch": 0.714865450533864, "grad_norm": 0.01396497618407011, "learning_rate": 0.00014385741565881604, "loss": 0.309, "step": 1134 }, { "epoch": 0.7154958433473858, "grad_norm": 0.014517375268042088, "learning_rate": 0.00014353914704010184, "loss": 0.3894, "step": 1135 }, { "epoch": 0.7161262361609078, "grad_norm": 0.01695895940065384, "learning_rate": 0.00014322087842138766, "loss": 0.3473, "step": 1136 }, { "epoch": 0.7167566289744297, "grad_norm": 0.013232816010713577, "learning_rate": 0.00014290260980267348, "loss": 0.3301, "step": 1137 }, { "epoch": 0.7173870217879517, "grad_norm": 0.01621880941092968, "learning_rate": 0.00014258434118395927, "loss": 0.4296, "step": 1138 }, { "epoch": 0.7180174146014735, "grad_norm": 0.01946069486439228, "learning_rate": 0.00014226607256524507, "loss": 0.4034, "step": 1139 }, { "epoch": 0.7186478074149955, "grad_norm": 0.020333269611001015, "learning_rate": 0.00014194780394653086, "loss": 0.3787, "step": 1140 }, { "epoch": 0.7192782002285174, "grad_norm": 0.013538191094994545, "learning_rate": 0.00014162953532781666, "loss": 0.3502, "step": 1141 }, { "epoch": 0.7199085930420394, "grad_norm": 0.01587570458650589, "learning_rate": 0.0001413112667091025, "loss": 0.3843, "step": 1142 }, { "epoch": 0.7205389858555612, "grad_norm": 0.01856716349720955, "learning_rate": 0.0001409929980903883, "loss": 0.3182, "step": 1143 }, { "epoch": 0.7211693786690831, "grad_norm": 0.02066134288907051, "learning_rate": 0.0001406747294716741, "loss": 0.4341, "step": 1144 }, { "epoch": 0.7217997714826051, "grad_norm": 0.01531100831925869, "learning_rate": 0.0001403564608529599, "loss": 0.3785, "step": 1145 }, { "epoch": 0.722430164296127, "grad_norm": 0.015404126606881618, "learning_rate": 0.0001400381922342457, "loss": 0.343, "step": 1146 }, { "epoch": 0.723060557109649, "grad_norm": 0.015861671417951584, "learning_rate": 0.0001397199236155315, "loss": 0.3459, "step": 1147 }, { "epoch": 0.7236909499231708, "grad_norm": 0.014947841875255108, "learning_rate": 0.00013940165499681733, "loss": 0.333, "step": 1148 }, { "epoch": 0.7243213427366928, "grad_norm": 0.014956996776163578, "learning_rate": 0.00013908338637810312, "loss": 0.3568, "step": 1149 }, { "epoch": 0.7249517355502147, "grad_norm": 0.013247400522232056, "learning_rate": 0.00013876511775938894, "loss": 0.3245, "step": 1150 }, { "epoch": 0.7255821283637367, "grad_norm": 0.01641037128865719, "learning_rate": 0.00013844684914067474, "loss": 0.335, "step": 1151 }, { "epoch": 0.7262125211772585, "grad_norm": 0.015756070613861084, "learning_rate": 0.00013812858052196053, "loss": 0.3848, "step": 1152 }, { "epoch": 0.7268429139907805, "grad_norm": 0.01808984763920307, "learning_rate": 0.00013781031190324632, "loss": 0.3853, "step": 1153 }, { "epoch": 0.7274733068043024, "grad_norm": 0.012837030924856663, "learning_rate": 0.00013749204328453217, "loss": 0.3614, "step": 1154 }, { "epoch": 0.7281036996178244, "grad_norm": 0.018607156351208687, "learning_rate": 0.00013717377466581797, "loss": 0.3672, "step": 1155 }, { "epoch": 0.7287340924313462, "grad_norm": 0.014206528663635254, "learning_rate": 0.00013685550604710376, "loss": 0.3768, "step": 1156 }, { "epoch": 0.7293644852448682, "grad_norm": 0.014016461558640003, "learning_rate": 0.00013653723742838955, "loss": 0.3979, "step": 1157 }, { "epoch": 0.7299948780583901, "grad_norm": 0.013709808699786663, "learning_rate": 0.00013621896880967538, "loss": 0.3487, "step": 1158 }, { "epoch": 0.7306252708719121, "grad_norm": 0.012709543108940125, "learning_rate": 0.00013590070019096117, "loss": 0.3298, "step": 1159 }, { "epoch": 0.7312556636854339, "grad_norm": 0.01708996668457985, "learning_rate": 0.00013558243157224696, "loss": 0.4008, "step": 1160 }, { "epoch": 0.7318860564989559, "grad_norm": 0.016853036358952522, "learning_rate": 0.00013526416295353279, "loss": 0.4034, "step": 1161 }, { "epoch": 0.7325164493124778, "grad_norm": 0.012666909024119377, "learning_rate": 0.0001349458943348186, "loss": 0.2993, "step": 1162 }, { "epoch": 0.7331468421259998, "grad_norm": 0.01367896143347025, "learning_rate": 0.0001346276257161044, "loss": 0.3442, "step": 1163 }, { "epoch": 0.7337772349395217, "grad_norm": 0.0142219802364707, "learning_rate": 0.0001343093570973902, "loss": 0.37, "step": 1164 }, { "epoch": 0.7344076277530436, "grad_norm": 0.017735477536916733, "learning_rate": 0.000133991088478676, "loss": 0.3902, "step": 1165 }, { "epoch": 0.7350380205665655, "grad_norm": 0.012031124904751778, "learning_rate": 0.00013367281985996178, "loss": 0.3369, "step": 1166 }, { "epoch": 0.7356684133800875, "grad_norm": 0.01426391676068306, "learning_rate": 0.00013335455124124763, "loss": 0.33, "step": 1167 }, { "epoch": 0.7362988061936094, "grad_norm": 0.017490075901150703, "learning_rate": 0.00013303628262253343, "loss": 0.3815, "step": 1168 }, { "epoch": 0.7369291990071313, "grad_norm": 0.01577986776828766, "learning_rate": 0.00013271801400381922, "loss": 0.3244, "step": 1169 }, { "epoch": 0.7375595918206532, "grad_norm": 0.015958884730935097, "learning_rate": 0.00013239974538510502, "loss": 0.3426, "step": 1170 }, { "epoch": 0.7381899846341752, "grad_norm": 0.015272104181349277, "learning_rate": 0.00013208147676639084, "loss": 0.3705, "step": 1171 }, { "epoch": 0.7388203774476971, "grad_norm": 0.01338798739016056, "learning_rate": 0.00013176320814767663, "loss": 0.3528, "step": 1172 }, { "epoch": 0.739450770261219, "grad_norm": 0.01420095469802618, "learning_rate": 0.00013144493952896245, "loss": 0.3424, "step": 1173 }, { "epoch": 0.7400811630747409, "grad_norm": 0.017742201685905457, "learning_rate": 0.00013112667091024825, "loss": 0.3733, "step": 1174 }, { "epoch": 0.7407115558882629, "grad_norm": 0.016413554549217224, "learning_rate": 0.00013080840229153407, "loss": 0.3364, "step": 1175 }, { "epoch": 0.7413419487017848, "grad_norm": 0.012264917604625225, "learning_rate": 0.00013049013367281986, "loss": 0.3482, "step": 1176 }, { "epoch": 0.7419723415153068, "grad_norm": 0.013301195576786995, "learning_rate": 0.00013017186505410566, "loss": 0.319, "step": 1177 }, { "epoch": 0.7426027343288286, "grad_norm": 0.014413679018616676, "learning_rate": 0.00012985359643539145, "loss": 0.3715, "step": 1178 }, { "epoch": 0.7432331271423506, "grad_norm": 0.016620101407170296, "learning_rate": 0.0001295353278166773, "loss": 0.3561, "step": 1179 }, { "epoch": 0.7438635199558725, "grad_norm": 0.014448422007262707, "learning_rate": 0.0001292170591979631, "loss": 0.3745, "step": 1180 }, { "epoch": 0.7444939127693945, "grad_norm": 0.015801845118403435, "learning_rate": 0.0001288987905792489, "loss": 0.3926, "step": 1181 }, { "epoch": 0.7451243055829163, "grad_norm": 0.013477195054292679, "learning_rate": 0.00012858052196053468, "loss": 0.3425, "step": 1182 }, { "epoch": 0.7457546983964383, "grad_norm": 0.01677156612277031, "learning_rate": 0.0001282622533418205, "loss": 0.3431, "step": 1183 }, { "epoch": 0.7463850912099602, "grad_norm": 0.013513016514480114, "learning_rate": 0.0001279439847231063, "loss": 0.3578, "step": 1184 }, { "epoch": 0.7470154840234822, "grad_norm": 0.01470949500799179, "learning_rate": 0.00012762571610439212, "loss": 0.3954, "step": 1185 }, { "epoch": 0.747645876837004, "grad_norm": 0.014376926235854626, "learning_rate": 0.0001273074474856779, "loss": 0.3087, "step": 1186 }, { "epoch": 0.748276269650526, "grad_norm": 0.015603973530232906, "learning_rate": 0.00012698917886696373, "loss": 0.296, "step": 1187 }, { "epoch": 0.7489066624640479, "grad_norm": 0.01415923610329628, "learning_rate": 0.00012667091024824953, "loss": 0.3577, "step": 1188 }, { "epoch": 0.7495370552775699, "grad_norm": 0.018781442195177078, "learning_rate": 0.00012635264162953532, "loss": 0.4558, "step": 1189 }, { "epoch": 0.7501674480910918, "grad_norm": 0.02049703523516655, "learning_rate": 0.00012603437301082112, "loss": 0.4884, "step": 1190 }, { "epoch": 0.7507978409046137, "grad_norm": 0.013096905313432217, "learning_rate": 0.00012571610439210694, "loss": 0.3432, "step": 1191 }, { "epoch": 0.7514282337181356, "grad_norm": 0.0150743518024683, "learning_rate": 0.00012539783577339276, "loss": 0.3501, "step": 1192 }, { "epoch": 0.7520586265316576, "grad_norm": 0.013073778711259365, "learning_rate": 0.00012507956715467855, "loss": 0.3336, "step": 1193 }, { "epoch": 0.7526890193451795, "grad_norm": 0.014403257519006729, "learning_rate": 0.00012476129853596435, "loss": 0.3676, "step": 1194 }, { "epoch": 0.7533194121587014, "grad_norm": 0.014295238070189953, "learning_rate": 0.00012444302991725017, "loss": 0.3485, "step": 1195 }, { "epoch": 0.7539498049722233, "grad_norm": 0.03308374807238579, "learning_rate": 0.00012412476129853596, "loss": 0.3285, "step": 1196 }, { "epoch": 0.7545801977857453, "grad_norm": 0.01713315024971962, "learning_rate": 0.00012380649267982179, "loss": 0.3479, "step": 1197 }, { "epoch": 0.7552105905992672, "grad_norm": 0.012911123223602772, "learning_rate": 0.00012348822406110758, "loss": 0.3248, "step": 1198 }, { "epoch": 0.755840983412789, "grad_norm": 0.015463579446077347, "learning_rate": 0.00012316995544239337, "loss": 0.3708, "step": 1199 }, { "epoch": 0.756471376226311, "grad_norm": 0.017389461398124695, "learning_rate": 0.0001228516868236792, "loss": 0.3771, "step": 1200 }, { "epoch": 0.756471376226311, "eval_loss": 0.3764493763446808, "eval_runtime": 329.1949, "eval_samples_per_second": 3.038, "eval_steps_per_second": 3.038, "step": 1200 }, { "epoch": 0.757101769039833, "grad_norm": 0.013893462717533112, "learning_rate": 0.000122533418204965, "loss": 0.3749, "step": 1201 }, { "epoch": 0.7577321618533549, "grad_norm": 0.01529255323112011, "learning_rate": 0.00012221514958625078, "loss": 0.3441, "step": 1202 }, { "epoch": 0.7583625546668767, "grad_norm": 0.01466412004083395, "learning_rate": 0.0001218968809675366, "loss": 0.3635, "step": 1203 }, { "epoch": 0.7589929474803987, "grad_norm": 0.01847771741449833, "learning_rate": 0.00012157861234882241, "loss": 0.3884, "step": 1204 }, { "epoch": 0.7596233402939206, "grad_norm": 0.01600956730544567, "learning_rate": 0.00012126034373010822, "loss": 0.3402, "step": 1205 }, { "epoch": 0.7602537331074426, "grad_norm": 0.01512977760285139, "learning_rate": 0.00012094207511139401, "loss": 0.3506, "step": 1206 }, { "epoch": 0.7608841259209645, "grad_norm": 0.014811835251748562, "learning_rate": 0.00012062380649267984, "loss": 0.3406, "step": 1207 }, { "epoch": 0.7615145187344864, "grad_norm": 0.012609414756298065, "learning_rate": 0.00012030553787396563, "loss": 0.3411, "step": 1208 }, { "epoch": 0.7621449115480083, "grad_norm": 0.016063913702964783, "learning_rate": 0.00011998726925525144, "loss": 0.3737, "step": 1209 }, { "epoch": 0.7627753043615303, "grad_norm": 0.013802681118249893, "learning_rate": 0.00011966900063653725, "loss": 0.3779, "step": 1210 }, { "epoch": 0.7634056971750522, "grad_norm": 0.01606634445488453, "learning_rate": 0.00011935073201782304, "loss": 0.3683, "step": 1211 }, { "epoch": 0.7640360899885741, "grad_norm": 0.014666946604847908, "learning_rate": 0.00011903246339910885, "loss": 0.348, "step": 1212 }, { "epoch": 0.764666482802096, "grad_norm": 0.01311336737126112, "learning_rate": 0.00011871419478039466, "loss": 0.3738, "step": 1213 }, { "epoch": 0.765296875615618, "grad_norm": 0.014285261742770672, "learning_rate": 0.00011839592616168046, "loss": 0.3137, "step": 1214 }, { "epoch": 0.7659272684291399, "grad_norm": 0.013496417552232742, "learning_rate": 0.00011807765754296626, "loss": 0.2985, "step": 1215 }, { "epoch": 0.7665576612426618, "grad_norm": 0.018807226791977882, "learning_rate": 0.00011775938892425208, "loss": 0.408, "step": 1216 }, { "epoch": 0.7671880540561837, "grad_norm": 0.01625092141330242, "learning_rate": 0.00011744112030553787, "loss": 0.4079, "step": 1217 }, { "epoch": 0.7678184468697057, "grad_norm": 0.01385047473013401, "learning_rate": 0.00011712285168682368, "loss": 0.3515, "step": 1218 }, { "epoch": 0.7684488396832276, "grad_norm": 0.01763092912733555, "learning_rate": 0.00011680458306810949, "loss": 0.4052, "step": 1219 }, { "epoch": 0.7690792324967496, "grad_norm": 0.029317645356059074, "learning_rate": 0.0001164863144493953, "loss": 0.3497, "step": 1220 }, { "epoch": 0.7697096253102714, "grad_norm": 0.01535740215331316, "learning_rate": 0.00011616804583068109, "loss": 0.3689, "step": 1221 }, { "epoch": 0.7703400181237934, "grad_norm": 0.014249589294195175, "learning_rate": 0.00011584977721196691, "loss": 0.3558, "step": 1222 }, { "epoch": 0.7709704109373153, "grad_norm": 0.01716270111501217, "learning_rate": 0.0001155315085932527, "loss": 0.3806, "step": 1223 }, { "epoch": 0.7716008037508373, "grad_norm": 0.01635221764445305, "learning_rate": 0.00011521323997453851, "loss": 0.3498, "step": 1224 }, { "epoch": 0.7722311965643591, "grad_norm": 0.018587900325655937, "learning_rate": 0.00011489497135582432, "loss": 0.3929, "step": 1225 }, { "epoch": 0.7728615893778811, "grad_norm": 0.013028173707425594, "learning_rate": 0.00011457670273711013, "loss": 0.3641, "step": 1226 }, { "epoch": 0.773491982191403, "grad_norm": 0.014693099074065685, "learning_rate": 0.00011425843411839592, "loss": 0.4048, "step": 1227 }, { "epoch": 0.774122375004925, "grad_norm": 0.014489540830254555, "learning_rate": 0.00011394016549968175, "loss": 0.2909, "step": 1228 }, { "epoch": 0.7747527678184468, "grad_norm": 0.015637323260307312, "learning_rate": 0.00011362189688096754, "loss": 0.3169, "step": 1229 }, { "epoch": 0.7753831606319688, "grad_norm": 0.013017135672271252, "learning_rate": 0.00011330362826225335, "loss": 0.3196, "step": 1230 }, { "epoch": 0.7760135534454907, "grad_norm": 0.01679660566151142, "learning_rate": 0.00011298535964353914, "loss": 0.3982, "step": 1231 }, { "epoch": 0.7766439462590127, "grad_norm": 0.016054868698120117, "learning_rate": 0.00011266709102482496, "loss": 0.3056, "step": 1232 }, { "epoch": 0.7772743390725346, "grad_norm": 0.015688486397266388, "learning_rate": 0.00011234882240611076, "loss": 0.2848, "step": 1233 }, { "epoch": 0.7779047318860565, "grad_norm": 0.015926331281661987, "learning_rate": 0.00011203055378739658, "loss": 0.4378, "step": 1234 }, { "epoch": 0.7785351246995784, "grad_norm": 0.014288809150457382, "learning_rate": 0.00011171228516868237, "loss": 0.3581, "step": 1235 }, { "epoch": 0.7791655175131004, "grad_norm": 0.01497898530215025, "learning_rate": 0.00011139401654996817, "loss": 0.3422, "step": 1236 }, { "epoch": 0.7797959103266223, "grad_norm": 0.014813642017543316, "learning_rate": 0.00011107574793125397, "loss": 0.3523, "step": 1237 }, { "epoch": 0.7804263031401442, "grad_norm": 0.014726947993040085, "learning_rate": 0.00011075747931253978, "loss": 0.3327, "step": 1238 }, { "epoch": 0.7810566959536661, "grad_norm": 0.016021117568016052, "learning_rate": 0.00011043921069382559, "loss": 0.4031, "step": 1239 }, { "epoch": 0.7816870887671881, "grad_norm": 0.016203582286834717, "learning_rate": 0.00011012094207511138, "loss": 0.3535, "step": 1240 }, { "epoch": 0.78231748158071, "grad_norm": 0.014114546589553356, "learning_rate": 0.0001098026734563972, "loss": 0.3332, "step": 1241 }, { "epoch": 0.7829478743942319, "grad_norm": 0.015791423618793488, "learning_rate": 0.000109484404837683, "loss": 0.3838, "step": 1242 }, { "epoch": 0.7835782672077538, "grad_norm": 0.017668122425675392, "learning_rate": 0.00010916613621896881, "loss": 0.3753, "step": 1243 }, { "epoch": 0.7842086600212758, "grad_norm": 0.012804379686713219, "learning_rate": 0.00010884786760025462, "loss": 0.3147, "step": 1244 }, { "epoch": 0.7848390528347977, "grad_norm": 0.01961948722600937, "learning_rate": 0.00010852959898154042, "loss": 0.4541, "step": 1245 }, { "epoch": 0.7854694456483197, "grad_norm": 0.014307526871562004, "learning_rate": 0.00010821133036282622, "loss": 0.3474, "step": 1246 }, { "epoch": 0.7860998384618415, "grad_norm": 0.013645196333527565, "learning_rate": 0.00010789306174411204, "loss": 0.3386, "step": 1247 }, { "epoch": 0.7867302312753635, "grad_norm": 0.01503558550029993, "learning_rate": 0.00010757479312539783, "loss": 0.3148, "step": 1248 }, { "epoch": 0.7873606240888854, "grad_norm": 0.013624979183077812, "learning_rate": 0.00010725652450668364, "loss": 0.3909, "step": 1249 }, { "epoch": 0.7879910169024074, "grad_norm": 0.013765980489552021, "learning_rate": 0.00010693825588796945, "loss": 0.3706, "step": 1250 }, { "epoch": 0.7886214097159292, "grad_norm": 0.016694730147719383, "learning_rate": 0.00010661998726925526, "loss": 0.4333, "step": 1251 }, { "epoch": 0.7892518025294512, "grad_norm": 0.014478876255452633, "learning_rate": 0.00010630171865054105, "loss": 0.3428, "step": 1252 }, { "epoch": 0.7898821953429731, "grad_norm": 0.013275336474180222, "learning_rate": 0.00010598345003182687, "loss": 0.3327, "step": 1253 }, { "epoch": 0.790512588156495, "grad_norm": 0.013734846375882626, "learning_rate": 0.00010566518141311267, "loss": 0.3083, "step": 1254 }, { "epoch": 0.7911429809700169, "grad_norm": 0.015706898644566536, "learning_rate": 0.00010534691279439847, "loss": 0.3538, "step": 1255 }, { "epoch": 0.7917733737835388, "grad_norm": 0.01340963039547205, "learning_rate": 0.00010502864417568428, "loss": 0.3156, "step": 1256 }, { "epoch": 0.7924037665970608, "grad_norm": 0.015748783946037292, "learning_rate": 0.00010471037555697009, "loss": 0.4102, "step": 1257 }, { "epoch": 0.7930341594105828, "grad_norm": 0.014184198342263699, "learning_rate": 0.00010439210693825588, "loss": 0.3412, "step": 1258 }, { "epoch": 0.7936645522241046, "grad_norm": 0.018744543194770813, "learning_rate": 0.00010407383831954169, "loss": 0.3488, "step": 1259 }, { "epoch": 0.7942949450376265, "grad_norm": 0.016622383147478104, "learning_rate": 0.0001037555697008275, "loss": 0.3665, "step": 1260 }, { "epoch": 0.7949253378511485, "grad_norm": 0.017016762867569923, "learning_rate": 0.0001034373010821133, "loss": 0.4162, "step": 1261 }, { "epoch": 0.7955557306646704, "grad_norm": 0.015281199477612972, "learning_rate": 0.00010311903246339912, "loss": 0.3518, "step": 1262 }, { "epoch": 0.7961861234781924, "grad_norm": 0.01393594779074192, "learning_rate": 0.00010280076384468491, "loss": 0.3006, "step": 1263 }, { "epoch": 0.7968165162917142, "grad_norm": 0.012513347901403904, "learning_rate": 0.00010248249522597072, "loss": 0.3323, "step": 1264 }, { "epoch": 0.7974469091052362, "grad_norm": 0.015494618564844131, "learning_rate": 0.00010216422660725653, "loss": 0.3655, "step": 1265 }, { "epoch": 0.7980773019187581, "grad_norm": 0.01529205683618784, "learning_rate": 0.00010184595798854233, "loss": 0.3333, "step": 1266 }, { "epoch": 0.7987076947322801, "grad_norm": 0.015652429312467575, "learning_rate": 0.00010152768936982813, "loss": 0.3583, "step": 1267 }, { "epoch": 0.7993380875458019, "grad_norm": 0.016451742500066757, "learning_rate": 0.00010120942075111395, "loss": 0.425, "step": 1268 }, { "epoch": 0.7999684803593239, "grad_norm": 0.012998268008232117, "learning_rate": 0.00010089115213239974, "loss": 0.3339, "step": 1269 }, { "epoch": 0.8005988731728458, "grad_norm": 0.01690104976296425, "learning_rate": 0.00010057288351368555, "loss": 0.4494, "step": 1270 }, { "epoch": 0.8012292659863678, "grad_norm": 0.02040201611816883, "learning_rate": 0.00010025461489497136, "loss": 0.3735, "step": 1271 }, { "epoch": 0.8018596587998896, "grad_norm": 0.01489595603197813, "learning_rate": 9.993634627625717e-05, "loss": 0.3878, "step": 1272 }, { "epoch": 0.8024900516134116, "grad_norm": 0.018899748101830482, "learning_rate": 9.961807765754296e-05, "loss": 0.389, "step": 1273 }, { "epoch": 0.8031204444269335, "grad_norm": 0.013340587727725506, "learning_rate": 9.929980903882878e-05, "loss": 0.3503, "step": 1274 }, { "epoch": 0.8037508372404555, "grad_norm": 0.014525451697409153, "learning_rate": 9.898154042011458e-05, "loss": 0.3412, "step": 1275 }, { "epoch": 0.8043812300539774, "grad_norm": 0.013309518806636333, "learning_rate": 9.866327180140038e-05, "loss": 0.3282, "step": 1276 }, { "epoch": 0.8050116228674993, "grad_norm": 0.013061481527984142, "learning_rate": 9.834500318268619e-05, "loss": 0.3265, "step": 1277 }, { "epoch": 0.8056420156810212, "grad_norm": 0.019597146660089493, "learning_rate": 9.8026734563972e-05, "loss": 0.3639, "step": 1278 }, { "epoch": 0.8062724084945432, "grad_norm": 0.0187253188341856, "learning_rate": 9.77084659452578e-05, "loss": 0.4057, "step": 1279 }, { "epoch": 0.8069028013080651, "grad_norm": 0.015955207869410515, "learning_rate": 9.739019732654362e-05, "loss": 0.3512, "step": 1280 }, { "epoch": 0.807533194121587, "grad_norm": 0.012332870624959469, "learning_rate": 9.707192870782941e-05, "loss": 0.3276, "step": 1281 }, { "epoch": 0.8081635869351089, "grad_norm": 0.016889231279492378, "learning_rate": 9.67536600891152e-05, "loss": 0.378, "step": 1282 }, { "epoch": 0.8087939797486309, "grad_norm": 0.01877409592270851, "learning_rate": 9.643539147040102e-05, "loss": 0.4204, "step": 1283 }, { "epoch": 0.8094243725621528, "grad_norm": 0.014773757196962833, "learning_rate": 9.611712285168682e-05, "loss": 0.3272, "step": 1284 }, { "epoch": 0.8100547653756747, "grad_norm": 0.01613541506230831, "learning_rate": 9.579885423297263e-05, "loss": 0.3115, "step": 1285 }, { "epoch": 0.8106851581891966, "grad_norm": 0.015467733144760132, "learning_rate": 9.548058561425843e-05, "loss": 0.3861, "step": 1286 }, { "epoch": 0.8113155510027186, "grad_norm": 0.013268228620290756, "learning_rate": 9.516231699554424e-05, "loss": 0.352, "step": 1287 }, { "epoch": 0.8119459438162405, "grad_norm": 0.015348583459854126, "learning_rate": 9.484404837683004e-05, "loss": 0.363, "step": 1288 }, { "epoch": 0.8125763366297625, "grad_norm": 0.014995148405432701, "learning_rate": 9.452577975811586e-05, "loss": 0.3537, "step": 1289 }, { "epoch": 0.8132067294432843, "grad_norm": 0.016880935057997704, "learning_rate": 9.420751113940165e-05, "loss": 0.419, "step": 1290 }, { "epoch": 0.8138371222568063, "grad_norm": 0.014265833422541618, "learning_rate": 9.388924252068746e-05, "loss": 0.3484, "step": 1291 }, { "epoch": 0.8144675150703282, "grad_norm": 0.01832139864563942, "learning_rate": 9.357097390197327e-05, "loss": 0.3991, "step": 1292 }, { "epoch": 0.8150979078838502, "grad_norm": 0.01853499561548233, "learning_rate": 9.325270528325908e-05, "loss": 0.383, "step": 1293 }, { "epoch": 0.815728300697372, "grad_norm": 0.012397758662700653, "learning_rate": 9.293443666454487e-05, "loss": 0.3088, "step": 1294 }, { "epoch": 0.816358693510894, "grad_norm": 0.014064849354326725, "learning_rate": 9.261616804583069e-05, "loss": 0.4194, "step": 1295 }, { "epoch": 0.8169890863244159, "grad_norm": 0.015221447683870792, "learning_rate": 9.229789942711649e-05, "loss": 0.3943, "step": 1296 }, { "epoch": 0.8176194791379379, "grad_norm": 0.014992970041930676, "learning_rate": 9.19796308084023e-05, "loss": 0.3224, "step": 1297 }, { "epoch": 0.8182498719514597, "grad_norm": 0.014988161623477936, "learning_rate": 9.16613621896881e-05, "loss": 0.3839, "step": 1298 }, { "epoch": 0.8188802647649817, "grad_norm": 0.013880716636776924, "learning_rate": 9.134309357097391e-05, "loss": 0.3451, "step": 1299 }, { "epoch": 0.8195106575785036, "grad_norm": 0.014946370385587215, "learning_rate": 9.10248249522597e-05, "loss": 0.3412, "step": 1300 }, { "epoch": 0.8201410503920256, "grad_norm": 0.012207560241222382, "learning_rate": 9.070655633354552e-05, "loss": 0.2898, "step": 1301 }, { "epoch": 0.8207714432055474, "grad_norm": 0.016438962891697884, "learning_rate": 9.038828771483132e-05, "loss": 0.3801, "step": 1302 }, { "epoch": 0.8214018360190694, "grad_norm": 0.015621104277670383, "learning_rate": 9.007001909611713e-05, "loss": 0.3985, "step": 1303 }, { "epoch": 0.8220322288325913, "grad_norm": 0.015430763363838196, "learning_rate": 8.975175047740293e-05, "loss": 0.4203, "step": 1304 }, { "epoch": 0.8226626216461133, "grad_norm": 0.014245027676224709, "learning_rate": 8.943348185868873e-05, "loss": 0.3689, "step": 1305 }, { "epoch": 0.8232930144596352, "grad_norm": 0.014247522689402103, "learning_rate": 8.911521323997454e-05, "loss": 0.3307, "step": 1306 }, { "epoch": 0.823923407273157, "grad_norm": 0.026172669604420662, "learning_rate": 8.879694462126034e-05, "loss": 0.3847, "step": 1307 }, { "epoch": 0.824553800086679, "grad_norm": 0.01854928955435753, "learning_rate": 8.847867600254615e-05, "loss": 0.3731, "step": 1308 }, { "epoch": 0.825184192900201, "grad_norm": 0.014989660121500492, "learning_rate": 8.816040738383195e-05, "loss": 0.3898, "step": 1309 }, { "epoch": 0.8258145857137229, "grad_norm": 0.014221888035535812, "learning_rate": 8.784213876511777e-05, "loss": 0.3426, "step": 1310 }, { "epoch": 0.8264449785272447, "grad_norm": 0.022886354476213455, "learning_rate": 8.752387014640356e-05, "loss": 0.3623, "step": 1311 }, { "epoch": 0.8270753713407667, "grad_norm": 0.013089706189930439, "learning_rate": 8.720560152768937e-05, "loss": 0.3393, "step": 1312 }, { "epoch": 0.8277057641542886, "grad_norm": 0.013300247490406036, "learning_rate": 8.688733290897518e-05, "loss": 0.3154, "step": 1313 }, { "epoch": 0.8283361569678106, "grad_norm": 0.01165758166462183, "learning_rate": 8.656906429026099e-05, "loss": 0.3144, "step": 1314 }, { "epoch": 0.8289665497813324, "grad_norm": 0.016146976500749588, "learning_rate": 8.625079567154678e-05, "loss": 0.3897, "step": 1315 }, { "epoch": 0.8295969425948544, "grad_norm": 0.01524006575345993, "learning_rate": 8.59325270528326e-05, "loss": 0.3379, "step": 1316 }, { "epoch": 0.8302273354083763, "grad_norm": 0.013549231924116611, "learning_rate": 8.56142584341184e-05, "loss": 0.3361, "step": 1317 }, { "epoch": 0.8308577282218983, "grad_norm": 0.013456017710268497, "learning_rate": 8.52959898154042e-05, "loss": 0.3244, "step": 1318 }, { "epoch": 0.8314881210354202, "grad_norm": 0.015488195233047009, "learning_rate": 8.497772119669001e-05, "loss": 0.3806, "step": 1319 }, { "epoch": 0.8321185138489421, "grad_norm": 0.015536332502961159, "learning_rate": 8.465945257797582e-05, "loss": 0.3552, "step": 1320 }, { "epoch": 0.832748906662464, "grad_norm": 0.015241379849612713, "learning_rate": 8.434118395926161e-05, "loss": 0.364, "step": 1321 }, { "epoch": 0.833379299475986, "grad_norm": 0.018507009372115135, "learning_rate": 8.402291534054743e-05, "loss": 0.3724, "step": 1322 }, { "epoch": 0.8340096922895079, "grad_norm": 0.016914106905460358, "learning_rate": 8.370464672183323e-05, "loss": 0.3824, "step": 1323 }, { "epoch": 0.8346400851030298, "grad_norm": 0.014568033628165722, "learning_rate": 8.338637810311904e-05, "loss": 0.3431, "step": 1324 }, { "epoch": 0.8352704779165517, "grad_norm": 0.01478442270308733, "learning_rate": 8.306810948440484e-05, "loss": 0.3262, "step": 1325 }, { "epoch": 0.8359008707300737, "grad_norm": 0.0161952693015337, "learning_rate": 8.274984086569065e-05, "loss": 0.2954, "step": 1326 }, { "epoch": 0.8365312635435956, "grad_norm": 0.013155090622603893, "learning_rate": 8.243157224697645e-05, "loss": 0.3454, "step": 1327 }, { "epoch": 0.8371616563571175, "grad_norm": 0.018341947346925735, "learning_rate": 8.211330362826227e-05, "loss": 0.3915, "step": 1328 }, { "epoch": 0.8377920491706394, "grad_norm": 0.0136695122346282, "learning_rate": 8.179503500954806e-05, "loss": 0.3514, "step": 1329 }, { "epoch": 0.8384224419841614, "grad_norm": 0.01584646850824356, "learning_rate": 8.147676639083386e-05, "loss": 0.3392, "step": 1330 }, { "epoch": 0.8390528347976833, "grad_norm": 0.011146236211061478, "learning_rate": 8.115849777211968e-05, "loss": 0.2936, "step": 1331 }, { "epoch": 0.8396832276112053, "grad_norm": 0.013834916055202484, "learning_rate": 8.084022915340547e-05, "loss": 0.2844, "step": 1332 }, { "epoch": 0.8403136204247271, "grad_norm": 0.016150744631886482, "learning_rate": 8.052196053469128e-05, "loss": 0.4015, "step": 1333 }, { "epoch": 0.8409440132382491, "grad_norm": 0.016211718320846558, "learning_rate": 8.020369191597709e-05, "loss": 0.3524, "step": 1334 }, { "epoch": 0.841574406051771, "grad_norm": 0.016436005011200905, "learning_rate": 7.98854232972629e-05, "loss": 0.3715, "step": 1335 }, { "epoch": 0.842204798865293, "grad_norm": 0.013710149563848972, "learning_rate": 7.956715467854869e-05, "loss": 0.3226, "step": 1336 }, { "epoch": 0.8428351916788148, "grad_norm": 0.014572346583008766, "learning_rate": 7.924888605983451e-05, "loss": 0.3346, "step": 1337 }, { "epoch": 0.8434655844923368, "grad_norm": 0.013981449417769909, "learning_rate": 7.89306174411203e-05, "loss": 0.3845, "step": 1338 }, { "epoch": 0.8440959773058587, "grad_norm": 0.012070405296981335, "learning_rate": 7.861234882240611e-05, "loss": 0.337, "step": 1339 }, { "epoch": 0.8447263701193807, "grad_norm": 0.017842991277575493, "learning_rate": 7.829408020369192e-05, "loss": 0.3678, "step": 1340 }, { "epoch": 0.8453567629329025, "grad_norm": 0.016572175547480583, "learning_rate": 7.797581158497773e-05, "loss": 0.3616, "step": 1341 }, { "epoch": 0.8459871557464245, "grad_norm": 0.016668444499373436, "learning_rate": 7.765754296626352e-05, "loss": 0.4127, "step": 1342 }, { "epoch": 0.8466175485599464, "grad_norm": 0.01426651980727911, "learning_rate": 7.733927434754934e-05, "loss": 0.3277, "step": 1343 }, { "epoch": 0.8472479413734684, "grad_norm": 0.016792453825473785, "learning_rate": 7.702100572883514e-05, "loss": 0.3446, "step": 1344 }, { "epoch": 0.8478783341869903, "grad_norm": 0.017911698669195175, "learning_rate": 7.670273711012095e-05, "loss": 0.3778, "step": 1345 }, { "epoch": 0.8485087270005122, "grad_norm": 0.012097560800611973, "learning_rate": 7.638446849140675e-05, "loss": 0.2953, "step": 1346 }, { "epoch": 0.8491391198140341, "grad_norm": 0.012164255604147911, "learning_rate": 7.606619987269256e-05, "loss": 0.3675, "step": 1347 }, { "epoch": 0.8497695126275561, "grad_norm": 0.01443692296743393, "learning_rate": 7.574793125397836e-05, "loss": 0.3786, "step": 1348 }, { "epoch": 0.850399905441078, "grad_norm": 0.015716979280114174, "learning_rate": 7.542966263526418e-05, "loss": 0.3593, "step": 1349 }, { "epoch": 0.8510302982545999, "grad_norm": 0.014889014884829521, "learning_rate": 7.511139401654997e-05, "loss": 0.387, "step": 1350 }, { "epoch": 0.8516606910681218, "grad_norm": 0.014393595047295094, "learning_rate": 7.479312539783578e-05, "loss": 0.345, "step": 1351 }, { "epoch": 0.8522910838816438, "grad_norm": 0.014208640903234482, "learning_rate": 7.447485677912159e-05, "loss": 0.3467, "step": 1352 }, { "epoch": 0.8529214766951657, "grad_norm": 0.013807917013764381, "learning_rate": 7.415658816040738e-05, "loss": 0.3575, "step": 1353 }, { "epoch": 0.8535518695086876, "grad_norm": 0.013681413605809212, "learning_rate": 7.383831954169319e-05, "loss": 0.3398, "step": 1354 }, { "epoch": 0.8541822623222095, "grad_norm": 0.012576180510222912, "learning_rate": 7.3520050922979e-05, "loss": 0.2905, "step": 1355 }, { "epoch": 0.8548126551357315, "grad_norm": 0.013641037978231907, "learning_rate": 7.32017823042648e-05, "loss": 0.3221, "step": 1356 }, { "epoch": 0.8554430479492534, "grad_norm": 0.013322602957487106, "learning_rate": 7.28835136855506e-05, "loss": 0.3278, "step": 1357 }, { "epoch": 0.8560734407627753, "grad_norm": 0.018067650496959686, "learning_rate": 7.256524506683642e-05, "loss": 0.3627, "step": 1358 }, { "epoch": 0.8567038335762972, "grad_norm": 0.012783601880073547, "learning_rate": 7.224697644812221e-05, "loss": 0.3115, "step": 1359 }, { "epoch": 0.8573342263898192, "grad_norm": 0.015951205044984818, "learning_rate": 7.192870782940802e-05, "loss": 0.3606, "step": 1360 }, { "epoch": 0.8579646192033411, "grad_norm": 0.014376024715602398, "learning_rate": 7.161043921069383e-05, "loss": 0.3426, "step": 1361 }, { "epoch": 0.8585950120168631, "grad_norm": 0.013964856043457985, "learning_rate": 7.129217059197964e-05, "loss": 0.374, "step": 1362 }, { "epoch": 0.8592254048303849, "grad_norm": 0.01806429959833622, "learning_rate": 7.097390197326543e-05, "loss": 0.383, "step": 1363 }, { "epoch": 0.8598557976439068, "grad_norm": 0.014006298966705799, "learning_rate": 7.065563335455125e-05, "loss": 0.3078, "step": 1364 }, { "epoch": 0.8604861904574288, "grad_norm": 0.01776600070297718, "learning_rate": 7.033736473583705e-05, "loss": 0.4038, "step": 1365 }, { "epoch": 0.8611165832709508, "grad_norm": 0.01270060520619154, "learning_rate": 7.001909611712285e-05, "loss": 0.2996, "step": 1366 }, { "epoch": 0.8617469760844726, "grad_norm": 0.016662361100316048, "learning_rate": 6.970082749840866e-05, "loss": 0.3906, "step": 1367 }, { "epoch": 0.8623773688979945, "grad_norm": 0.015086748637259007, "learning_rate": 6.938255887969447e-05, "loss": 0.3611, "step": 1368 }, { "epoch": 0.8630077617115165, "grad_norm": 0.018517209216952324, "learning_rate": 6.906429026098026e-05, "loss": 0.3977, "step": 1369 }, { "epoch": 0.8636381545250384, "grad_norm": 0.013170517049729824, "learning_rate": 6.874602164226609e-05, "loss": 0.3427, "step": 1370 }, { "epoch": 0.8642685473385603, "grad_norm": 0.015530762262642384, "learning_rate": 6.842775302355188e-05, "loss": 0.3466, "step": 1371 }, { "epoch": 0.8648989401520822, "grad_norm": 0.013266485184431076, "learning_rate": 6.810948440483769e-05, "loss": 0.3319, "step": 1372 }, { "epoch": 0.8655293329656042, "grad_norm": 0.019919713959097862, "learning_rate": 6.779121578612348e-05, "loss": 0.3965, "step": 1373 }, { "epoch": 0.8661597257791261, "grad_norm": 0.01324549037963152, "learning_rate": 6.74729471674093e-05, "loss": 0.3468, "step": 1374 }, { "epoch": 0.8667901185926481, "grad_norm": 0.014857304282486439, "learning_rate": 6.71546785486951e-05, "loss": 0.3362, "step": 1375 }, { "epoch": 0.8674205114061699, "grad_norm": 0.015873543918132782, "learning_rate": 6.683640992998089e-05, "loss": 0.4089, "step": 1376 }, { "epoch": 0.8680509042196919, "grad_norm": 0.01584213599562645, "learning_rate": 6.651814131126671e-05, "loss": 0.3622, "step": 1377 }, { "epoch": 0.8686812970332138, "grad_norm": 0.015694987028837204, "learning_rate": 6.619987269255251e-05, "loss": 0.3665, "step": 1378 }, { "epoch": 0.8693116898467358, "grad_norm": 0.01624428853392601, "learning_rate": 6.588160407383832e-05, "loss": 0.355, "step": 1379 }, { "epoch": 0.8699420826602576, "grad_norm": 0.015205349773168564, "learning_rate": 6.556333545512412e-05, "loss": 0.3378, "step": 1380 }, { "epoch": 0.8705724754737796, "grad_norm": 0.016568010672926903, "learning_rate": 6.524506683640993e-05, "loss": 0.3735, "step": 1381 }, { "epoch": 0.8712028682873015, "grad_norm": 0.016933640465140343, "learning_rate": 6.492679821769573e-05, "loss": 0.3504, "step": 1382 }, { "epoch": 0.8718332611008235, "grad_norm": 0.014327967539429665, "learning_rate": 6.460852959898155e-05, "loss": 0.3373, "step": 1383 }, { "epoch": 0.8724636539143453, "grad_norm": 0.015418641269207, "learning_rate": 6.429026098026734e-05, "loss": 0.4, "step": 1384 }, { "epoch": 0.8730940467278673, "grad_norm": 0.015355248935520649, "learning_rate": 6.397199236155315e-05, "loss": 0.3514, "step": 1385 }, { "epoch": 0.8737244395413892, "grad_norm": 0.017169740051031113, "learning_rate": 6.365372374283896e-05, "loss": 0.3702, "step": 1386 }, { "epoch": 0.8743548323549112, "grad_norm": 0.015446964651346207, "learning_rate": 6.333545512412476e-05, "loss": 0.3186, "step": 1387 }, { "epoch": 0.8749852251684331, "grad_norm": 0.013855344615876675, "learning_rate": 6.301718650541056e-05, "loss": 0.3728, "step": 1388 }, { "epoch": 0.875615617981955, "grad_norm": 0.012399091385304928, "learning_rate": 6.269891788669638e-05, "loss": 0.3255, "step": 1389 }, { "epoch": 0.8762460107954769, "grad_norm": 0.013493593782186508, "learning_rate": 6.238064926798217e-05, "loss": 0.3269, "step": 1390 }, { "epoch": 0.8768764036089989, "grad_norm": 0.015041259117424488, "learning_rate": 6.206238064926798e-05, "loss": 0.3953, "step": 1391 }, { "epoch": 0.8775067964225208, "grad_norm": 0.014262627810239792, "learning_rate": 6.174411203055379e-05, "loss": 0.3883, "step": 1392 }, { "epoch": 0.8781371892360427, "grad_norm": 0.012238999828696251, "learning_rate": 6.14258434118396e-05, "loss": 0.3258, "step": 1393 }, { "epoch": 0.8787675820495646, "grad_norm": 0.01342200580984354, "learning_rate": 6.110757479312539e-05, "loss": 0.3333, "step": 1394 }, { "epoch": 0.8793979748630866, "grad_norm": 0.013607119210064411, "learning_rate": 6.0789306174411206e-05, "loss": 0.3112, "step": 1395 }, { "epoch": 0.8800283676766085, "grad_norm": 0.01611727848649025, "learning_rate": 6.047103755569701e-05, "loss": 0.3589, "step": 1396 }, { "epoch": 0.8806587604901304, "grad_norm": 0.016114989295601845, "learning_rate": 6.0152768936982815e-05, "loss": 0.3634, "step": 1397 }, { "epoch": 0.8812891533036523, "grad_norm": 0.01715138368308544, "learning_rate": 5.983450031826862e-05, "loss": 0.4102, "step": 1398 }, { "epoch": 0.8819195461171743, "grad_norm": 0.014017704874277115, "learning_rate": 5.9516231699554424e-05, "loss": 0.3711, "step": 1399 }, { "epoch": 0.8825499389306962, "grad_norm": 0.014960573986172676, "learning_rate": 5.919796308084023e-05, "loss": 0.3688, "step": 1400 }, { "epoch": 0.8825499389306962, "eval_loss": 0.3723333179950714, "eval_runtime": 329.5478, "eval_samples_per_second": 3.034, "eval_steps_per_second": 3.034, "step": 1400 }, { "epoch": 0.8831803317442181, "grad_norm": 0.014268855564296246, "learning_rate": 5.887969446212604e-05, "loss": 0.3665, "step": 1401 }, { "epoch": 0.88381072455774, "grad_norm": 0.013557464815676212, "learning_rate": 5.856142584341184e-05, "loss": 0.3151, "step": 1402 }, { "epoch": 0.884441117371262, "grad_norm": 0.015205616131424904, "learning_rate": 5.824315722469765e-05, "loss": 0.3896, "step": 1403 }, { "epoch": 0.8850715101847839, "grad_norm": 0.016049759462475777, "learning_rate": 5.7924888605983456e-05, "loss": 0.3506, "step": 1404 }, { "epoch": 0.8857019029983059, "grad_norm": 0.014878795482218266, "learning_rate": 5.760661998726926e-05, "loss": 0.3349, "step": 1405 }, { "epoch": 0.8863322958118277, "grad_norm": 0.016634635627269745, "learning_rate": 5.7288351368555065e-05, "loss": 0.4009, "step": 1406 }, { "epoch": 0.8869626886253497, "grad_norm": 0.015360746532678604, "learning_rate": 5.697008274984087e-05, "loss": 0.3204, "step": 1407 }, { "epoch": 0.8875930814388716, "grad_norm": 0.015245744027197361, "learning_rate": 5.6651814131126674e-05, "loss": 0.347, "step": 1408 }, { "epoch": 0.8882234742523936, "grad_norm": 0.012498168274760246, "learning_rate": 5.633354551241248e-05, "loss": 0.3243, "step": 1409 }, { "epoch": 0.8888538670659154, "grad_norm": 0.013329975306987762, "learning_rate": 5.601527689369829e-05, "loss": 0.3285, "step": 1410 }, { "epoch": 0.8894842598794374, "grad_norm": 0.013589510694146156, "learning_rate": 5.5697008274984084e-05, "loss": 0.2839, "step": 1411 }, { "epoch": 0.8901146526929593, "grad_norm": 0.016013192012906075, "learning_rate": 5.537873965626989e-05, "loss": 0.3615, "step": 1412 }, { "epoch": 0.8907450455064813, "grad_norm": 0.013837985694408417, "learning_rate": 5.506047103755569e-05, "loss": 0.2932, "step": 1413 }, { "epoch": 0.8913754383200031, "grad_norm": 0.014447502791881561, "learning_rate": 5.47422024188415e-05, "loss": 0.4079, "step": 1414 }, { "epoch": 0.892005831133525, "grad_norm": 0.0160972997546196, "learning_rate": 5.442393380012731e-05, "loss": 0.3886, "step": 1415 }, { "epoch": 0.892636223947047, "grad_norm": 0.014950517565011978, "learning_rate": 5.410566518141311e-05, "loss": 0.3972, "step": 1416 }, { "epoch": 0.893266616760569, "grad_norm": 0.012849722988903522, "learning_rate": 5.378739656269892e-05, "loss": 0.3393, "step": 1417 }, { "epoch": 0.8938970095740909, "grad_norm": 0.01429782249033451, "learning_rate": 5.3469127943984725e-05, "loss": 0.3115, "step": 1418 }, { "epoch": 0.8945274023876127, "grad_norm": 0.017068268731236458, "learning_rate": 5.3150859325270526e-05, "loss": 0.3785, "step": 1419 }, { "epoch": 0.8951577952011347, "grad_norm": 0.01486631203442812, "learning_rate": 5.283259070655633e-05, "loss": 0.3302, "step": 1420 }, { "epoch": 0.8957881880146567, "grad_norm": 0.015406712889671326, "learning_rate": 5.251432208784214e-05, "loss": 0.3491, "step": 1421 }, { "epoch": 0.8964185808281786, "grad_norm": 0.015080674551427364, "learning_rate": 5.219605346912794e-05, "loss": 0.3235, "step": 1422 }, { "epoch": 0.8970489736417004, "grad_norm": 0.018638912588357925, "learning_rate": 5.187778485041375e-05, "loss": 0.3832, "step": 1423 }, { "epoch": 0.8976793664552224, "grad_norm": 0.017536459490656853, "learning_rate": 5.155951623169956e-05, "loss": 0.4131, "step": 1424 }, { "epoch": 0.8983097592687443, "grad_norm": 0.015971779823303223, "learning_rate": 5.124124761298536e-05, "loss": 0.3421, "step": 1425 }, { "epoch": 0.8989401520822663, "grad_norm": 0.015325568616390228, "learning_rate": 5.0922978994271167e-05, "loss": 0.3082, "step": 1426 }, { "epoch": 0.8995705448957881, "grad_norm": 0.01693599671125412, "learning_rate": 5.0604710375556974e-05, "loss": 0.3474, "step": 1427 }, { "epoch": 0.9002009377093101, "grad_norm": 0.01645667850971222, "learning_rate": 5.0286441756842775e-05, "loss": 0.3542, "step": 1428 }, { "epoch": 0.900831330522832, "grad_norm": 0.01388312503695488, "learning_rate": 4.996817313812858e-05, "loss": 0.314, "step": 1429 }, { "epoch": 0.901461723336354, "grad_norm": 0.02007160522043705, "learning_rate": 4.964990451941439e-05, "loss": 0.3802, "step": 1430 }, { "epoch": 0.9020921161498759, "grad_norm": 0.017025291919708252, "learning_rate": 4.933163590070019e-05, "loss": 0.3688, "step": 1431 }, { "epoch": 0.9027225089633978, "grad_norm": 0.01625664159655571, "learning_rate": 4.9013367281986e-05, "loss": 0.407, "step": 1432 }, { "epoch": 0.9033529017769197, "grad_norm": 0.01974036730825901, "learning_rate": 4.869509866327181e-05, "loss": 0.4274, "step": 1433 }, { "epoch": 0.9039832945904417, "grad_norm": 0.014942855574190617, "learning_rate": 4.83768300445576e-05, "loss": 0.3498, "step": 1434 }, { "epoch": 0.9046136874039636, "grad_norm": 0.015789102762937546, "learning_rate": 4.805856142584341e-05, "loss": 0.281, "step": 1435 }, { "epoch": 0.9052440802174855, "grad_norm": 0.014947917312383652, "learning_rate": 4.774029280712922e-05, "loss": 0.3946, "step": 1436 }, { "epoch": 0.9058744730310074, "grad_norm": 0.014621512033045292, "learning_rate": 4.742202418841502e-05, "loss": 0.318, "step": 1437 }, { "epoch": 0.9065048658445294, "grad_norm": 0.017983663827180862, "learning_rate": 4.7103755569700826e-05, "loss": 0.3132, "step": 1438 }, { "epoch": 0.9071352586580513, "grad_norm": 0.01268945261836052, "learning_rate": 4.6785486950986634e-05, "loss": 0.3256, "step": 1439 }, { "epoch": 0.9077656514715732, "grad_norm": 0.01479201577603817, "learning_rate": 4.6467218332272435e-05, "loss": 0.353, "step": 1440 }, { "epoch": 0.9083960442850951, "grad_norm": 0.01699303276836872, "learning_rate": 4.614894971355824e-05, "loss": 0.3433, "step": 1441 }, { "epoch": 0.9090264370986171, "grad_norm": 0.01907494105398655, "learning_rate": 4.583068109484405e-05, "loss": 0.4115, "step": 1442 }, { "epoch": 0.909656829912139, "grad_norm": 0.013479120098054409, "learning_rate": 4.551241247612985e-05, "loss": 0.2957, "step": 1443 }, { "epoch": 0.910287222725661, "grad_norm": 0.0173697080463171, "learning_rate": 4.519414385741566e-05, "loss": 0.4158, "step": 1444 }, { "epoch": 0.9109176155391828, "grad_norm": 0.013225399889051914, "learning_rate": 4.487587523870147e-05, "loss": 0.3463, "step": 1445 }, { "epoch": 0.9115480083527048, "grad_norm": 0.01624819077551365, "learning_rate": 4.455760661998727e-05, "loss": 0.4288, "step": 1446 }, { "epoch": 0.9121784011662267, "grad_norm": 0.014610115438699722, "learning_rate": 4.4239338001273076e-05, "loss": 0.341, "step": 1447 }, { "epoch": 0.9128087939797487, "grad_norm": 0.01420588418841362, "learning_rate": 4.3921069382558884e-05, "loss": 0.3314, "step": 1448 }, { "epoch": 0.9134391867932705, "grad_norm": 0.012280907481908798, "learning_rate": 4.3602800763844685e-05, "loss": 0.2862, "step": 1449 }, { "epoch": 0.9140695796067925, "grad_norm": 0.015962889418005943, "learning_rate": 4.328453214513049e-05, "loss": 0.3574, "step": 1450 }, { "epoch": 0.9146999724203144, "grad_norm": 0.015481521375477314, "learning_rate": 4.29662635264163e-05, "loss": 0.3275, "step": 1451 }, { "epoch": 0.9153303652338364, "grad_norm": 0.017152821645140648, "learning_rate": 4.26479949077021e-05, "loss": 0.3948, "step": 1452 }, { "epoch": 0.9159607580473582, "grad_norm": 0.0119781494140625, "learning_rate": 4.232972628898791e-05, "loss": 0.301, "step": 1453 }, { "epoch": 0.9165911508608802, "grad_norm": 0.015397579409182072, "learning_rate": 4.201145767027372e-05, "loss": 0.3689, "step": 1454 }, { "epoch": 0.9172215436744021, "grad_norm": 0.012618192471563816, "learning_rate": 4.169318905155952e-05, "loss": 0.3376, "step": 1455 }, { "epoch": 0.9178519364879241, "grad_norm": 0.015482811257243156, "learning_rate": 4.1374920432845326e-05, "loss": 0.2962, "step": 1456 }, { "epoch": 0.9184823293014459, "grad_norm": 0.014015165157616138, "learning_rate": 4.1056651814131134e-05, "loss": 0.3163, "step": 1457 }, { "epoch": 0.9191127221149679, "grad_norm": 0.01316810492426157, "learning_rate": 4.073838319541693e-05, "loss": 0.371, "step": 1458 }, { "epoch": 0.9197431149284898, "grad_norm": 0.016929320991039276, "learning_rate": 4.0420114576702736e-05, "loss": 0.3478, "step": 1459 }, { "epoch": 0.9203735077420118, "grad_norm": 0.012490914203226566, "learning_rate": 4.0101845957988543e-05, "loss": 0.3039, "step": 1460 }, { "epoch": 0.9210039005555337, "grad_norm": 0.014584866352379322, "learning_rate": 3.9783577339274344e-05, "loss": 0.3784, "step": 1461 }, { "epoch": 0.9216342933690556, "grad_norm": 0.01257404126226902, "learning_rate": 3.946530872056015e-05, "loss": 0.3221, "step": 1462 }, { "epoch": 0.9222646861825775, "grad_norm": 0.015374168753623962, "learning_rate": 3.914704010184596e-05, "loss": 0.3859, "step": 1463 }, { "epoch": 0.9228950789960995, "grad_norm": 0.015385660342872143, "learning_rate": 3.882877148313176e-05, "loss": 0.3673, "step": 1464 }, { "epoch": 0.9235254718096214, "grad_norm": 0.01580210216343403, "learning_rate": 3.851050286441757e-05, "loss": 0.3793, "step": 1465 }, { "epoch": 0.9241558646231433, "grad_norm": 0.012919224798679352, "learning_rate": 3.819223424570338e-05, "loss": 0.3371, "step": 1466 }, { "epoch": 0.9247862574366652, "grad_norm": 0.014880827628076077, "learning_rate": 3.787396562698918e-05, "loss": 0.3297, "step": 1467 }, { "epoch": 0.9254166502501872, "grad_norm": 0.01484295167028904, "learning_rate": 3.7555697008274985e-05, "loss": 0.3159, "step": 1468 }, { "epoch": 0.9260470430637091, "grad_norm": 0.016505662351846695, "learning_rate": 3.723742838956079e-05, "loss": 0.3364, "step": 1469 }, { "epoch": 0.926677435877231, "grad_norm": 0.018482755869627, "learning_rate": 3.6919159770846594e-05, "loss": 0.4, "step": 1470 }, { "epoch": 0.9273078286907529, "grad_norm": 0.018572820350527763, "learning_rate": 3.66008911521324e-05, "loss": 0.3519, "step": 1471 }, { "epoch": 0.9279382215042749, "grad_norm": 0.018973981961607933, "learning_rate": 3.628262253341821e-05, "loss": 0.375, "step": 1472 }, { "epoch": 0.9285686143177968, "grad_norm": 0.013879822567105293, "learning_rate": 3.596435391470401e-05, "loss": 0.3193, "step": 1473 }, { "epoch": 0.9291990071313188, "grad_norm": 0.014330939389765263, "learning_rate": 3.564608529598982e-05, "loss": 0.3435, "step": 1474 }, { "epoch": 0.9298293999448406, "grad_norm": 0.017358053475618362, "learning_rate": 3.5327816677275626e-05, "loss": 0.3239, "step": 1475 }, { "epoch": 0.9304597927583625, "grad_norm": 0.016250282526016235, "learning_rate": 3.500954805856143e-05, "loss": 0.3974, "step": 1476 }, { "epoch": 0.9310901855718845, "grad_norm": 0.01653210073709488, "learning_rate": 3.4691279439847235e-05, "loss": 0.3817, "step": 1477 }, { "epoch": 0.9317205783854065, "grad_norm": 0.012023596093058586, "learning_rate": 3.437301082113304e-05, "loss": 0.2976, "step": 1478 }, { "epoch": 0.9323509711989283, "grad_norm": 0.014155558310449123, "learning_rate": 3.4054742202418844e-05, "loss": 0.3269, "step": 1479 }, { "epoch": 0.9329813640124502, "grad_norm": 0.014643456786870956, "learning_rate": 3.373647358370465e-05, "loss": 0.3436, "step": 1480 }, { "epoch": 0.9336117568259722, "grad_norm": 0.01392677053809166, "learning_rate": 3.3418204964990446e-05, "loss": 0.3078, "step": 1481 }, { "epoch": 0.9342421496394941, "grad_norm": 0.012554450891911983, "learning_rate": 3.3099936346276254e-05, "loss": 0.3252, "step": 1482 }, { "epoch": 0.934872542453016, "grad_norm": 0.013463757000863552, "learning_rate": 3.278166772756206e-05, "loss": 0.3203, "step": 1483 }, { "epoch": 0.9355029352665379, "grad_norm": 0.013602751307189465, "learning_rate": 3.246339910884786e-05, "loss": 0.3423, "step": 1484 }, { "epoch": 0.9361333280800599, "grad_norm": 0.013283999636769295, "learning_rate": 3.214513049013367e-05, "loss": 0.3144, "step": 1485 }, { "epoch": 0.9367637208935818, "grad_norm": 0.015157591551542282, "learning_rate": 3.182686187141948e-05, "loss": 0.3603, "step": 1486 }, { "epoch": 0.9373941137071038, "grad_norm": 0.014818355441093445, "learning_rate": 3.150859325270528e-05, "loss": 0.3729, "step": 1487 }, { "epoch": 0.9380245065206256, "grad_norm": 0.014894101768732071, "learning_rate": 3.119032463399109e-05, "loss": 0.3687, "step": 1488 }, { "epoch": 0.9386548993341476, "grad_norm": 0.012819494120776653, "learning_rate": 3.0872056015276895e-05, "loss": 0.3529, "step": 1489 }, { "epoch": 0.9392852921476695, "grad_norm": 0.01608632132411003, "learning_rate": 3.0553787396562696e-05, "loss": 0.3942, "step": 1490 }, { "epoch": 0.9399156849611915, "grad_norm": 0.018345532938838005, "learning_rate": 3.0235518777848504e-05, "loss": 0.3952, "step": 1491 }, { "epoch": 0.9405460777747133, "grad_norm": 0.01725699007511139, "learning_rate": 2.991725015913431e-05, "loss": 0.3992, "step": 1492 }, { "epoch": 0.9411764705882353, "grad_norm": 0.015145394951105118, "learning_rate": 2.9598981540420116e-05, "loss": 0.3483, "step": 1493 }, { "epoch": 0.9418068634017572, "grad_norm": 0.015857869759202003, "learning_rate": 2.928071292170592e-05, "loss": 0.4171, "step": 1494 }, { "epoch": 0.9424372562152792, "grad_norm": 0.01388424914330244, "learning_rate": 2.8962444302991728e-05, "loss": 0.3395, "step": 1495 }, { "epoch": 0.943067649028801, "grad_norm": 0.0157376229763031, "learning_rate": 2.8644175684277532e-05, "loss": 0.3772, "step": 1496 }, { "epoch": 0.943698041842323, "grad_norm": 0.020996086299419403, "learning_rate": 2.8325907065563337e-05, "loss": 0.326, "step": 1497 }, { "epoch": 0.9443284346558449, "grad_norm": 0.01764010451734066, "learning_rate": 2.8007638446849145e-05, "loss": 0.3762, "step": 1498 }, { "epoch": 0.9449588274693669, "grad_norm": 0.013302912004292011, "learning_rate": 2.7689369828134946e-05, "loss": 0.3122, "step": 1499 }, { "epoch": 0.9455892202828887, "grad_norm": 0.014994272030889988, "learning_rate": 2.737110120942075e-05, "loss": 0.4058, "step": 1500 }, { "epoch": 0.9462196130964107, "grad_norm": 0.01617559976875782, "learning_rate": 2.7052832590706554e-05, "loss": 0.3802, "step": 1501 }, { "epoch": 0.9468500059099326, "grad_norm": 0.014939346350729465, "learning_rate": 2.6734563971992362e-05, "loss": 0.2976, "step": 1502 }, { "epoch": 0.9474803987234546, "grad_norm": 0.013673944398760796, "learning_rate": 2.6416295353278167e-05, "loss": 0.2987, "step": 1503 }, { "epoch": 0.9481107915369765, "grad_norm": 0.013986543752253056, "learning_rate": 2.609802673456397e-05, "loss": 0.3656, "step": 1504 }, { "epoch": 0.9487411843504984, "grad_norm": 0.013159860856831074, "learning_rate": 2.577975811584978e-05, "loss": 0.3157, "step": 1505 }, { "epoch": 0.9493715771640203, "grad_norm": 0.01675577275454998, "learning_rate": 2.5461489497135583e-05, "loss": 0.4184, "step": 1506 }, { "epoch": 0.9500019699775423, "grad_norm": 0.012845986522734165, "learning_rate": 2.5143220878421388e-05, "loss": 0.3355, "step": 1507 }, { "epoch": 0.9506323627910642, "grad_norm": 0.015393807552754879, "learning_rate": 2.4824952259707195e-05, "loss": 0.3746, "step": 1508 }, { "epoch": 0.9512627556045861, "grad_norm": 0.01672324351966381, "learning_rate": 2.4506683640993e-05, "loss": 0.3741, "step": 1509 }, { "epoch": 0.951893148418108, "grad_norm": 0.015271857380867004, "learning_rate": 2.41884150222788e-05, "loss": 0.3247, "step": 1510 }, { "epoch": 0.95252354123163, "grad_norm": 0.014894278720021248, "learning_rate": 2.387014640356461e-05, "loss": 0.3195, "step": 1511 }, { "epoch": 0.9531539340451519, "grad_norm": 0.014724337495863438, "learning_rate": 2.3551877784850413e-05, "loss": 0.3759, "step": 1512 }, { "epoch": 0.9537843268586738, "grad_norm": 0.015147864818572998, "learning_rate": 2.3233609166136218e-05, "loss": 0.3285, "step": 1513 }, { "epoch": 0.9544147196721957, "grad_norm": 0.011833397671580315, "learning_rate": 2.2915340547422025e-05, "loss": 0.2632, "step": 1514 }, { "epoch": 0.9550451124857177, "grad_norm": 0.013354217633605003, "learning_rate": 2.259707192870783e-05, "loss": 0.3091, "step": 1515 }, { "epoch": 0.9556755052992396, "grad_norm": 0.020135240629315376, "learning_rate": 2.2278803309993634e-05, "loss": 0.3891, "step": 1516 }, { "epoch": 0.9563058981127616, "grad_norm": 0.013294386677443981, "learning_rate": 2.1960534691279442e-05, "loss": 0.2759, "step": 1517 }, { "epoch": 0.9569362909262834, "grad_norm": 0.01891096495091915, "learning_rate": 2.1642266072565246e-05, "loss": 0.3822, "step": 1518 }, { "epoch": 0.9575666837398054, "grad_norm": 0.013109316118061543, "learning_rate": 2.132399745385105e-05, "loss": 0.3191, "step": 1519 }, { "epoch": 0.9581970765533273, "grad_norm": 0.018839385360479355, "learning_rate": 2.100572883513686e-05, "loss": 0.38, "step": 1520 }, { "epoch": 0.9588274693668493, "grad_norm": 0.015989821404218674, "learning_rate": 2.0687460216422663e-05, "loss": 0.3713, "step": 1521 }, { "epoch": 0.9594578621803711, "grad_norm": 0.012762744911015034, "learning_rate": 2.0369191597708464e-05, "loss": 0.3285, "step": 1522 }, { "epoch": 0.960088254993893, "grad_norm": 0.01947489008307457, "learning_rate": 2.0050922978994272e-05, "loss": 0.3341, "step": 1523 }, { "epoch": 0.960718647807415, "grad_norm": 0.01438645925372839, "learning_rate": 1.9732654360280076e-05, "loss": 0.3389, "step": 1524 }, { "epoch": 0.961349040620937, "grad_norm": 0.014690527692437172, "learning_rate": 1.941438574156588e-05, "loss": 0.3376, "step": 1525 }, { "epoch": 0.9619794334344588, "grad_norm": 0.014909609220921993, "learning_rate": 1.909611712285169e-05, "loss": 0.3495, "step": 1526 }, { "epoch": 0.9626098262479807, "grad_norm": 0.011698509566485882, "learning_rate": 1.8777848504137493e-05, "loss": 0.2981, "step": 1527 }, { "epoch": 0.9632402190615027, "grad_norm": 0.013244902715086937, "learning_rate": 1.8459579885423297e-05, "loss": 0.3309, "step": 1528 }, { "epoch": 0.9638706118750247, "grad_norm": 0.015424840152263641, "learning_rate": 1.8141311266709105e-05, "loss": 0.3561, "step": 1529 }, { "epoch": 0.9645010046885466, "grad_norm": 0.03275065869092941, "learning_rate": 1.782304264799491e-05, "loss": 0.3251, "step": 1530 }, { "epoch": 0.9651313975020684, "grad_norm": 0.01490817405283451, "learning_rate": 1.7504774029280714e-05, "loss": 0.3025, "step": 1531 }, { "epoch": 0.9657617903155904, "grad_norm": 0.013759489171206951, "learning_rate": 1.718650541056652e-05, "loss": 0.33, "step": 1532 }, { "epoch": 0.9663921831291123, "grad_norm": 0.019936665892601013, "learning_rate": 1.6868236791852326e-05, "loss": 0.3928, "step": 1533 }, { "epoch": 0.9670225759426343, "grad_norm": 0.015010660514235497, "learning_rate": 1.6549968173138127e-05, "loss": 0.3437, "step": 1534 }, { "epoch": 0.9676529687561561, "grad_norm": 0.015437234193086624, "learning_rate": 1.623169955442393e-05, "loss": 0.3388, "step": 1535 }, { "epoch": 0.9682833615696781, "grad_norm": 0.015262236818671227, "learning_rate": 1.591343093570974e-05, "loss": 0.3515, "step": 1536 }, { "epoch": 0.9689137543832, "grad_norm": 0.011415708810091019, "learning_rate": 1.5595162316995544e-05, "loss": 0.2952, "step": 1537 }, { "epoch": 0.969544147196722, "grad_norm": 0.020619437098503113, "learning_rate": 1.5276893698281348e-05, "loss": 0.3859, "step": 1538 }, { "epoch": 0.9701745400102438, "grad_norm": 0.01314727496355772, "learning_rate": 1.4958625079567156e-05, "loss": 0.3367, "step": 1539 }, { "epoch": 0.9708049328237658, "grad_norm": 0.0131844999268651, "learning_rate": 1.464035646085296e-05, "loss": 0.3724, "step": 1540 }, { "epoch": 0.9714353256372877, "grad_norm": 0.012725952081382275, "learning_rate": 1.4322087842138766e-05, "loss": 0.319, "step": 1541 }, { "epoch": 0.9720657184508097, "grad_norm": 0.01910482905805111, "learning_rate": 1.4003819223424572e-05, "loss": 0.3875, "step": 1542 }, { "epoch": 0.9726961112643316, "grad_norm": 0.014654729515314102, "learning_rate": 1.3685550604710375e-05, "loss": 0.3684, "step": 1543 }, { "epoch": 0.9733265040778535, "grad_norm": 0.013977615162730217, "learning_rate": 1.3367281985996181e-05, "loss": 0.3388, "step": 1544 }, { "epoch": 0.9739568968913754, "grad_norm": 0.013796146027743816, "learning_rate": 1.3049013367281986e-05, "loss": 0.3134, "step": 1545 }, { "epoch": 0.9745872897048974, "grad_norm": 0.016695033758878708, "learning_rate": 1.2730744748567792e-05, "loss": 0.3447, "step": 1546 }, { "epoch": 0.9752176825184193, "grad_norm": 0.014742291532456875, "learning_rate": 1.2412476129853598e-05, "loss": 0.3702, "step": 1547 }, { "epoch": 0.9758480753319412, "grad_norm": 0.0165182426571846, "learning_rate": 1.20942075111394e-05, "loss": 0.3339, "step": 1548 }, { "epoch": 0.9764784681454631, "grad_norm": 0.03674916550517082, "learning_rate": 1.1775938892425207e-05, "loss": 0.3982, "step": 1549 }, { "epoch": 0.9771088609589851, "grad_norm": 0.01274392195045948, "learning_rate": 1.1457670273711013e-05, "loss": 0.3297, "step": 1550 }, { "epoch": 0.977739253772507, "grad_norm": 0.014611528255045414, "learning_rate": 1.1139401654996817e-05, "loss": 0.3251, "step": 1551 }, { "epoch": 0.9783696465860289, "grad_norm": 0.01448411587625742, "learning_rate": 1.0821133036282623e-05, "loss": 0.3583, "step": 1552 }, { "epoch": 0.9790000393995508, "grad_norm": 0.015368525870144367, "learning_rate": 1.050286441756843e-05, "loss": 0.3772, "step": 1553 }, { "epoch": 0.9796304322130728, "grad_norm": 0.026646168902516365, "learning_rate": 1.0184595798854232e-05, "loss": 0.411, "step": 1554 }, { "epoch": 0.9802608250265947, "grad_norm": 0.017233865335583687, "learning_rate": 9.866327180140038e-06, "loss": 0.3779, "step": 1555 }, { "epoch": 0.9808912178401166, "grad_norm": 0.0188888031989336, "learning_rate": 9.548058561425844e-06, "loss": 0.4604, "step": 1556 }, { "epoch": 0.9815216106536385, "grad_norm": 0.013145769014954567, "learning_rate": 9.229789942711649e-06, "loss": 0.3335, "step": 1557 }, { "epoch": 0.9821520034671605, "grad_norm": 0.017094289883971214, "learning_rate": 8.911521323997455e-06, "loss": 0.3441, "step": 1558 }, { "epoch": 0.9827823962806824, "grad_norm": 0.013726276345551014, "learning_rate": 8.59325270528326e-06, "loss": 0.2964, "step": 1559 }, { "epoch": 0.9834127890942044, "grad_norm": 0.0161766167730093, "learning_rate": 8.274984086569063e-06, "loss": 0.3371, "step": 1560 }, { "epoch": 0.9840431819077262, "grad_norm": 0.014361623674631119, "learning_rate": 7.95671546785487e-06, "loss": 0.3073, "step": 1561 }, { "epoch": 0.9846735747212482, "grad_norm": 0.01615068130195141, "learning_rate": 7.638446849140674e-06, "loss": 0.3645, "step": 1562 }, { "epoch": 0.9853039675347701, "grad_norm": 0.01305047981441021, "learning_rate": 7.32017823042648e-06, "loss": 0.3194, "step": 1563 }, { "epoch": 0.9859343603482921, "grad_norm": 0.013356083072721958, "learning_rate": 7.001909611712286e-06, "loss": 0.2987, "step": 1564 }, { "epoch": 0.9865647531618139, "grad_norm": 0.017284424975514412, "learning_rate": 6.6836409929980906e-06, "loss": 0.3356, "step": 1565 }, { "epoch": 0.9871951459753359, "grad_norm": 0.021249329671263695, "learning_rate": 6.365372374283896e-06, "loss": 0.3407, "step": 1566 }, { "epoch": 0.9878255387888578, "grad_norm": 0.01298053003847599, "learning_rate": 6.0471037555697e-06, "loss": 0.3289, "step": 1567 }, { "epoch": 0.9884559316023798, "grad_norm": 0.018960773944854736, "learning_rate": 5.728835136855506e-06, "loss": 0.4266, "step": 1568 }, { "epoch": 0.9890863244159016, "grad_norm": 0.021056795492768288, "learning_rate": 5.4105665181413116e-06, "loss": 0.3305, "step": 1569 }, { "epoch": 0.9897167172294236, "grad_norm": 0.016253411769866943, "learning_rate": 5.092297899427116e-06, "loss": 0.3638, "step": 1570 }, { "epoch": 0.9903471100429455, "grad_norm": 0.015744000673294067, "learning_rate": 4.774029280712922e-06, "loss": 0.3776, "step": 1571 }, { "epoch": 0.9909775028564675, "grad_norm": 0.015269494615495205, "learning_rate": 4.455760661998727e-06, "loss": 0.3783, "step": 1572 }, { "epoch": 0.9916078956699894, "grad_norm": 0.014406811445951462, "learning_rate": 4.137492043284532e-06, "loss": 0.3592, "step": 1573 }, { "epoch": 0.9922382884835113, "grad_norm": 0.012684257701039314, "learning_rate": 3.819223424570337e-06, "loss": 0.3005, "step": 1574 }, { "epoch": 0.9928686812970332, "grad_norm": 0.017118453979492188, "learning_rate": 3.500954805856143e-06, "loss": 0.3633, "step": 1575 }, { "epoch": 0.9934990741105552, "grad_norm": 0.01412774994969368, "learning_rate": 3.182686187141948e-06, "loss": 0.3662, "step": 1576 }, { "epoch": 0.9941294669240771, "grad_norm": 0.01718618907034397, "learning_rate": 2.864417568427753e-06, "loss": 0.3859, "step": 1577 }, { "epoch": 0.994759859737599, "grad_norm": 0.012873218394815922, "learning_rate": 2.546148949713558e-06, "loss": 0.3415, "step": 1578 }, { "epoch": 0.9953902525511209, "grad_norm": 0.014852388761937618, "learning_rate": 2.2278803309993637e-06, "loss": 0.3715, "step": 1579 }, { "epoch": 0.9960206453646429, "grad_norm": 0.014308740384876728, "learning_rate": 1.9096117122851685e-06, "loss": 0.3317, "step": 1580 }, { "epoch": 0.9966510381781648, "grad_norm": 0.017207155004143715, "learning_rate": 1.591343093570974e-06, "loss": 0.3732, "step": 1581 }, { "epoch": 0.9972814309916866, "grad_norm": 0.013973016291856766, "learning_rate": 1.273074474856779e-06, "loss": 0.351, "step": 1582 }, { "epoch": 0.9979118238052086, "grad_norm": 0.01588715799152851, "learning_rate": 9.548058561425842e-07, "loss": 0.3734, "step": 1583 }, { "epoch": 0.9985422166187305, "grad_norm": 0.03298623859882355, "learning_rate": 6.365372374283895e-07, "loss": 0.3839, "step": 1584 }, { "epoch": 0.9991726094322525, "grad_norm": 0.014190657995641232, "learning_rate": 3.1826861871419475e-07, "loss": 0.3485, "step": 1585 }, { "epoch": 0.9998030022457745, "grad_norm": 0.014439358375966549, "learning_rate": 0.0, "loss": 0.3353, "step": 1586 } ], "logging_steps": 1, "max_steps": 1586, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.228128976858153e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }