{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996828417380272, "eval_steps": 500, "global_step": 1576, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006343165239454488, "grad_norm": 4.072216572643748, "learning_rate": 6.329113924050633e-06, "loss": 3.2618, "step": 1 }, { "epoch": 0.003171582619727244, "grad_norm": 3.6569951400637444, "learning_rate": 3.1645569620253167e-05, "loss": 3.3026, "step": 5 }, { "epoch": 0.006343165239454488, "grad_norm": 2.7101736291338634, "learning_rate": 6.329113924050633e-05, "loss": 3.1879, "step": 10 }, { "epoch": 0.009514747859181731, "grad_norm": 0.8292602783930105, "learning_rate": 9.49367088607595e-05, "loss": 2.9431, "step": 15 }, { "epoch": 0.012686330478908976, "grad_norm": 0.4014812685185947, "learning_rate": 0.00012658227848101267, "loss": 2.7946, "step": 20 }, { "epoch": 0.01585791309863622, "grad_norm": 0.3318705203128404, "learning_rate": 0.00015822784810126583, "loss": 2.6345, "step": 25 }, { "epoch": 0.019029495718363463, "grad_norm": 0.4987722149702972, "learning_rate": 0.000189873417721519, "loss": 2.547, "step": 30 }, { "epoch": 0.022201078338090707, "grad_norm": 0.3868479554032603, "learning_rate": 0.00022151898734177215, "loss": 2.462, "step": 35 }, { "epoch": 0.02537266095781795, "grad_norm": 0.3440357488186946, "learning_rate": 0.00025316455696202533, "loss": 2.4119, "step": 40 }, { "epoch": 0.028544243577545196, "grad_norm": 0.2542091158949379, "learning_rate": 0.0002848101265822785, "loss": 2.3337, "step": 45 }, { "epoch": 0.03171582619727244, "grad_norm": 0.5028112177822042, "learning_rate": 0.00031645569620253165, "loss": 2.3079, "step": 50 }, { "epoch": 0.034887408816999685, "grad_norm": 0.3647071858886548, "learning_rate": 0.00034810126582278487, "loss": 2.2772, "step": 55 }, { "epoch": 0.038058991436726926, "grad_norm": 0.3365279113242041, "learning_rate": 0.000379746835443038, "loss": 2.2643, "step": 60 }, { "epoch": 0.041230574056454174, "grad_norm": 0.3437920074360608, "learning_rate": 0.0004113924050632912, "loss": 2.2467, "step": 65 }, { "epoch": 0.044402156676181415, "grad_norm": 0.2180989181512006, "learning_rate": 0.0004430379746835443, "loss": 2.2192, "step": 70 }, { "epoch": 0.047573739295908656, "grad_norm": 0.3740832044447792, "learning_rate": 0.00047468354430379745, "loss": 2.2202, "step": 75 }, { "epoch": 0.0507453219156359, "grad_norm": 0.4042428788012064, "learning_rate": 0.0005063291139240507, "loss": 2.206, "step": 80 }, { "epoch": 0.053916904535363144, "grad_norm": 0.5502810429404877, "learning_rate": 0.0005379746835443038, "loss": 2.1792, "step": 85 }, { "epoch": 0.05708848715509039, "grad_norm": 0.9725610364599878, "learning_rate": 0.000569620253164557, "loss": 2.1717, "step": 90 }, { "epoch": 0.06026006977481763, "grad_norm": 0.4008236462318082, "learning_rate": 0.0006012658227848101, "loss": 2.1581, "step": 95 }, { "epoch": 0.06343165239454487, "grad_norm": 1.1094243654374898, "learning_rate": 0.0006329113924050633, "loss": 2.1497, "step": 100 }, { "epoch": 0.06660323501427212, "grad_norm": 0.257873202758346, "learning_rate": 0.0006645569620253165, "loss": 2.1357, "step": 105 }, { "epoch": 0.06977481763399937, "grad_norm": 0.4530053303085577, "learning_rate": 0.0006962025316455697, "loss": 2.1319, "step": 110 }, { "epoch": 0.0729464002537266, "grad_norm": 0.237617906265262, "learning_rate": 0.0007278481012658228, "loss": 2.114, "step": 115 }, { "epoch": 0.07611798287345385, "grad_norm": 1.309988153168323, "learning_rate": 0.000759493670886076, "loss": 2.101, "step": 120 }, { "epoch": 0.0792895654931811, "grad_norm": 0.33147972360135136, "learning_rate": 0.0007911392405063291, "loss": 2.0983, "step": 125 }, { "epoch": 0.08246114811290835, "grad_norm": 0.6923953247791184, "learning_rate": 0.0008227848101265824, "loss": 2.0775, "step": 130 }, { "epoch": 0.08563273073263558, "grad_norm": 0.6108397955993198, "learning_rate": 0.0008544303797468354, "loss": 2.076, "step": 135 }, { "epoch": 0.08880431335236283, "grad_norm": 0.25451367249745316, "learning_rate": 0.0008860759493670886, "loss": 2.0613, "step": 140 }, { "epoch": 0.09197589597209008, "grad_norm": 0.5696237036165042, "learning_rate": 0.0009177215189873418, "loss": 2.0571, "step": 145 }, { "epoch": 0.09514747859181731, "grad_norm": 0.47600183429795934, "learning_rate": 0.0009493670886075949, "loss": 2.0597, "step": 150 }, { "epoch": 0.09831906121154456, "grad_norm": 0.49360558045014563, "learning_rate": 0.0009810126582278482, "loss": 2.0276, "step": 155 }, { "epoch": 0.1014906438312718, "grad_norm": 0.2619769061768294, "learning_rate": 0.0009999950915251159, "loss": 2.049, "step": 160 }, { "epoch": 0.10466222645099905, "grad_norm": 0.25933552218187667, "learning_rate": 0.0009999398722894419, "loss": 2.0304, "step": 165 }, { "epoch": 0.10783380907072629, "grad_norm": 0.334897031747453, "learning_rate": 0.0009998233050230736, "loss": 2.0144, "step": 170 }, { "epoch": 0.11100539169045354, "grad_norm": 0.31275160877263003, "learning_rate": 0.0009996454040300758, "loss": 1.9773, "step": 175 }, { "epoch": 0.11417697431018078, "grad_norm": 0.5307747748580993, "learning_rate": 0.0009994061911408245, "loss": 1.9863, "step": 180 }, { "epoch": 0.11734855692990802, "grad_norm": 0.4066591897491239, "learning_rate": 0.0009991056957093295, "loss": 1.9812, "step": 185 }, { "epoch": 0.12052013954963527, "grad_norm": 0.40898061678871916, "learning_rate": 0.0009987439546096308, "loss": 1.9983, "step": 190 }, { "epoch": 0.12369172216936251, "grad_norm": 0.6281156992876311, "learning_rate": 0.0009983210122312745, "loss": 1.9663, "step": 195 }, { "epoch": 0.12686330478908975, "grad_norm": 0.418957878268899, "learning_rate": 0.000997836920473866, "loss": 1.9443, "step": 200 }, { "epoch": 0.130034887408817, "grad_norm": 0.2967784254205414, "learning_rate": 0.000997291738740701, "loss": 1.9496, "step": 205 }, { "epoch": 0.13320647002854424, "grad_norm": 0.32659030796363414, "learning_rate": 0.0009966855339314756, "loss": 1.9394, "step": 210 }, { "epoch": 0.1363780526482715, "grad_norm": 0.44336971944165454, "learning_rate": 0.0009960183804340781, "loss": 1.9274, "step": 215 }, { "epoch": 0.13954963526799874, "grad_norm": 0.7513841493295751, "learning_rate": 0.0009952903601154596, "loss": 1.937, "step": 220 }, { "epoch": 0.142721217887726, "grad_norm": 0.8840919710762163, "learning_rate": 0.0009945015623115897, "loss": 1.9222, "step": 225 }, { "epoch": 0.1458928005074532, "grad_norm": 0.3088103069995084, "learning_rate": 0.000993652083816491, "loss": 1.9272, "step": 230 }, { "epoch": 0.14906438312718046, "grad_norm": 0.3205509173648325, "learning_rate": 0.0009927420288703658, "loss": 1.9282, "step": 235 }, { "epoch": 0.1522359657469077, "grad_norm": 0.5725328480987164, "learning_rate": 0.0009917715091467998, "loss": 1.9092, "step": 240 }, { "epoch": 0.15540754836663495, "grad_norm": 0.6881811737021382, "learning_rate": 0.000990740643739063, "loss": 1.9257, "step": 245 }, { "epoch": 0.1585791309863622, "grad_norm": 0.3928160747689014, "learning_rate": 0.000989649559145493, "loss": 1.9075, "step": 250 }, { "epoch": 0.16175071360608945, "grad_norm": 0.3457762887590973, "learning_rate": 0.000988498389253972, "loss": 1.8954, "step": 255 }, { "epoch": 0.1649222962258167, "grad_norm": 0.6130106535523941, "learning_rate": 0.0009872872753254995, "loss": 1.8869, "step": 260 }, { "epoch": 0.16809387884554391, "grad_norm": 0.2006935276789736, "learning_rate": 0.0009860163659768566, "loss": 1.8764, "step": 265 }, { "epoch": 0.17126546146527116, "grad_norm": 0.2519600708036085, "learning_rate": 0.0009846858171623687, "loss": 1.8592, "step": 270 }, { "epoch": 0.1744370440849984, "grad_norm": 0.21430434382675823, "learning_rate": 0.0009832957921547696, "loss": 1.8588, "step": 275 }, { "epoch": 0.17760862670472566, "grad_norm": 0.7316176065198735, "learning_rate": 0.000981846461525165, "loss": 1.8442, "step": 280 }, { "epoch": 0.1807802093244529, "grad_norm": 0.5438158046657656, "learning_rate": 0.0009803380031221018, "loss": 1.8681, "step": 285 }, { "epoch": 0.18395179194418015, "grad_norm": 0.22290789589006946, "learning_rate": 0.000978770602049745, "loss": 1.8342, "step": 290 }, { "epoch": 0.1871233745639074, "grad_norm": 0.2561355818352734, "learning_rate": 0.0009771444506451621, "loss": 1.8408, "step": 295 }, { "epoch": 0.19029495718363462, "grad_norm": 0.3381776052738623, "learning_rate": 0.0009754597484547223, "loss": 1.829, "step": 300 }, { "epoch": 0.19346653980336187, "grad_norm": 0.2267569653346989, "learning_rate": 0.0009737167022096094, "loss": 1.8283, "step": 305 }, { "epoch": 0.19663812242308912, "grad_norm": 0.23165580428938548, "learning_rate": 0.0009719155258004541, "loss": 1.8071, "step": 310 }, { "epoch": 0.19980970504281637, "grad_norm": 0.25586494282771377, "learning_rate": 0.0009700564402510871, "loss": 1.8145, "step": 315 }, { "epoch": 0.2029812876625436, "grad_norm": 0.2540371949506308, "learning_rate": 0.0009681396736914168, "loss": 1.8015, "step": 320 }, { "epoch": 0.20615287028227086, "grad_norm": 0.6388348558478815, "learning_rate": 0.0009661654613294355, "loss": 1.8127, "step": 325 }, { "epoch": 0.2093244529019981, "grad_norm": 0.3864015903258655, "learning_rate": 0.0009641340454223575, "loss": 1.7935, "step": 330 }, { "epoch": 0.21249603552172533, "grad_norm": 0.2751489116810319, "learning_rate": 0.0009620456752468903, "loss": 1.8058, "step": 335 }, { "epoch": 0.21566761814145258, "grad_norm": 0.7422837235361598, "learning_rate": 0.0009599006070686467, "loss": 1.7927, "step": 340 }, { "epoch": 0.21883920076117983, "grad_norm": 0.4534291854575538, "learning_rate": 0.0009576991041106973, "loss": 1.7927, "step": 345 }, { "epoch": 0.22201078338090707, "grad_norm": 0.3583139746684532, "learning_rate": 0.0009554414365212709, "loss": 1.7883, "step": 350 }, { "epoch": 0.22518236600063432, "grad_norm": 0.20634161577455162, "learning_rate": 0.0009531278813406046, "loss": 1.7637, "step": 355 }, { "epoch": 0.22835394862036157, "grad_norm": 0.5462716024749192, "learning_rate": 0.000950758722466947, "loss": 1.7823, "step": 360 }, { "epoch": 0.23152553124008882, "grad_norm": 0.20847302955466993, "learning_rate": 0.0009483342506217214, "loss": 1.7736, "step": 365 }, { "epoch": 0.23469711385981604, "grad_norm": 0.21809684764751344, "learning_rate": 0.0009458547633138515, "loss": 1.7636, "step": 370 }, { "epoch": 0.23786869647954328, "grad_norm": 0.19220401784144317, "learning_rate": 0.0009433205648032528, "loss": 1.7509, "step": 375 }, { "epoch": 0.24104027909927053, "grad_norm": 0.273271874095809, "learning_rate": 0.0009407319660634979, "loss": 1.7488, "step": 380 }, { "epoch": 0.24421186171899778, "grad_norm": 0.31458786826276625, "learning_rate": 0.0009380892847436555, "loss": 1.7342, "step": 385 }, { "epoch": 0.24738344433872503, "grad_norm": 0.19789392284188642, "learning_rate": 0.0009353928451293121, "loss": 1.743, "step": 390 }, { "epoch": 0.2505550269584523, "grad_norm": 0.24499888411428472, "learning_rate": 0.0009326429781027789, "loss": 1.7193, "step": 395 }, { "epoch": 0.2537266095781795, "grad_norm": 0.33173879702411524, "learning_rate": 0.0009298400211024877, "loss": 1.729, "step": 400 }, { "epoch": 0.25689819219790677, "grad_norm": 0.34879621136110506, "learning_rate": 0.0009269843180815853, "loss": 1.7241, "step": 405 }, { "epoch": 0.260069774817634, "grad_norm": 0.20572899222991778, "learning_rate": 0.0009240762194657253, "loss": 1.7229, "step": 410 }, { "epoch": 0.26324135743736127, "grad_norm": 0.21561079654661294, "learning_rate": 0.0009211160821100679, "loss": 1.7155, "step": 415 }, { "epoch": 0.2664129400570885, "grad_norm": 0.443393553505543, "learning_rate": 0.0009181042692554893, "loss": 1.7111, "step": 420 }, { "epoch": 0.2695845226768157, "grad_norm": 0.2266683853424827, "learning_rate": 0.0009150411504840086, "loss": 1.7009, "step": 425 }, { "epoch": 0.272756105296543, "grad_norm": 0.3582735361142464, "learning_rate": 0.000911927101673436, "loss": 1.7016, "step": 430 }, { "epoch": 0.2759276879162702, "grad_norm": 0.43342116834945776, "learning_rate": 0.0009087625049512488, "loss": 1.7037, "step": 435 }, { "epoch": 0.2790992705359975, "grad_norm": 0.3295875571024751, "learning_rate": 0.0009055477486476991, "loss": 1.682, "step": 440 }, { "epoch": 0.2822708531557247, "grad_norm": 0.1891978276034803, "learning_rate": 0.0009022832272481627, "loss": 1.6899, "step": 445 }, { "epoch": 0.285442435775452, "grad_norm": 0.26615608448970285, "learning_rate": 0.000898969341344731, "loss": 1.6909, "step": 450 }, { "epoch": 0.2886140183951792, "grad_norm": 0.26554406802462666, "learning_rate": 0.0008956064975870544, "loss": 1.6764, "step": 455 }, { "epoch": 0.2917856010149064, "grad_norm": 0.20008546513645153, "learning_rate": 0.0008921951086324411, "loss": 1.6571, "step": 460 }, { "epoch": 0.2949571836346337, "grad_norm": 0.25575390463894654, "learning_rate": 0.0008887355930952202, "loss": 1.6636, "step": 465 }, { "epoch": 0.2981287662543609, "grad_norm": 0.3501161922386378, "learning_rate": 0.0008852283754953732, "loss": 1.657, "step": 470 }, { "epoch": 0.3013003488740882, "grad_norm": 0.20707308621635875, "learning_rate": 0.0008816738862064412, "loss": 1.6659, "step": 475 }, { "epoch": 0.3044719314938154, "grad_norm": 0.2572060719794171, "learning_rate": 0.0008780725614027123, "loss": 1.6521, "step": 480 }, { "epoch": 0.3076435141135427, "grad_norm": 0.2773641851176988, "learning_rate": 0.000874424843005699, "loss": 1.6545, "step": 485 }, { "epoch": 0.3108150967332699, "grad_norm": 0.5151669199508683, "learning_rate": 0.0008707311786299099, "loss": 1.6512, "step": 490 }, { "epoch": 0.3139866793529971, "grad_norm": 0.35976330322294225, "learning_rate": 0.0008669920215279222, "loss": 1.6489, "step": 495 }, { "epoch": 0.3171582619727244, "grad_norm": 0.18626964833018503, "learning_rate": 0.0008632078305347623, "loss": 1.6292, "step": 500 }, { "epoch": 0.3203298445924516, "grad_norm": 0.26834931489718644, "learning_rate": 0.0008593790700116029, "loss": 1.6244, "step": 505 }, { "epoch": 0.3235014272121789, "grad_norm": 0.24263398553664595, "learning_rate": 0.0008555062097887796, "loss": 1.6173, "step": 510 }, { "epoch": 0.3266730098319061, "grad_norm": 0.20406088273168801, "learning_rate": 0.0008515897251081384, "loss": 1.6273, "step": 515 }, { "epoch": 0.3298445924516334, "grad_norm": 0.18611145873310309, "learning_rate": 0.0008476300965647186, "loss": 1.5954, "step": 520 }, { "epoch": 0.3330161750713606, "grad_norm": 0.2455194958653317, "learning_rate": 0.0008436278100477775, "loss": 1.6284, "step": 525 }, { "epoch": 0.33618775769108783, "grad_norm": 0.3089123130431731, "learning_rate": 0.0008395833566811676, "loss": 1.6043, "step": 530 }, { "epoch": 0.3393593403108151, "grad_norm": 0.278226103442385, "learning_rate": 0.0008354972327630705, "loss": 1.5991, "step": 535 }, { "epoch": 0.3425309229305423, "grad_norm": 0.40283564646452896, "learning_rate": 0.000831369939705094, "loss": 1.5942, "step": 540 }, { "epoch": 0.3457025055502696, "grad_norm": 0.667879925495927, "learning_rate": 0.0008272019839707461, "loss": 1.5968, "step": 545 }, { "epoch": 0.3488740881699968, "grad_norm": 0.34787949832129605, "learning_rate": 0.0008229938770132843, "loss": 1.5815, "step": 550 }, { "epoch": 0.3520456707897241, "grad_norm": 0.2584482438633063, "learning_rate": 0.0008187461352129555, "loss": 1.5884, "step": 555 }, { "epoch": 0.3552172534094513, "grad_norm": 0.22742176340374293, "learning_rate": 0.0008144592798136309, "loss": 1.5919, "step": 560 }, { "epoch": 0.35838883602917854, "grad_norm": 0.42255520990843093, "learning_rate": 0.0008101338368588436, "loss": 1.5913, "step": 565 }, { "epoch": 0.3615604186489058, "grad_norm": 0.3824113474293145, "learning_rate": 0.0008057703371272366, "loss": 1.5611, "step": 570 }, { "epoch": 0.36473200126863303, "grad_norm": 0.2091017723989841, "learning_rate": 0.0008013693160674316, "loss": 1.5626, "step": 575 }, { "epoch": 0.3679035838883603, "grad_norm": 0.23814734243563393, "learning_rate": 0.0007969313137323229, "loss": 1.5656, "step": 580 }, { "epoch": 0.37107516650808753, "grad_norm": 0.2597004458168679, "learning_rate": 0.0007924568747128076, "loss": 1.5624, "step": 585 }, { "epoch": 0.3742467491278148, "grad_norm": 0.2949069544402481, "learning_rate": 0.0007879465480709576, "loss": 1.5516, "step": 590 }, { "epoch": 0.377418331747542, "grad_norm": 0.21263382790898516, "learning_rate": 0.0007834008872726453, "loss": 1.5409, "step": 595 }, { "epoch": 0.38058991436726924, "grad_norm": 0.27681275720229476, "learning_rate": 0.0007788204501196254, "loss": 1.5507, "step": 600 }, { "epoch": 0.3837614969869965, "grad_norm": 0.5196324383707882, "learning_rate": 0.000774205798681088, "loss": 1.5435, "step": 605 }, { "epoch": 0.38693307960672374, "grad_norm": 0.3397151418636398, "learning_rate": 0.000769557499224686, "loss": 1.5292, "step": 610 }, { "epoch": 0.390104662226451, "grad_norm": 0.21757261564984298, "learning_rate": 0.0007648761221470481, "loss": 1.5342, "step": 615 }, { "epoch": 0.39327624484617824, "grad_norm": 0.23799713493080946, "learning_rate": 0.000760162241903785, "loss": 1.5314, "step": 620 }, { "epoch": 0.3964478274659055, "grad_norm": 0.20955913102047505, "learning_rate": 0.0007554164369389975, "loss": 1.5149, "step": 625 }, { "epoch": 0.39961941008563273, "grad_norm": 0.19465193626198848, "learning_rate": 0.0007506392896142951, "loss": 1.514, "step": 630 }, { "epoch": 0.40279099270535995, "grad_norm": 0.37370015455345407, "learning_rate": 0.0007458313861373336, "loss": 1.5138, "step": 635 }, { "epoch": 0.4059625753250872, "grad_norm": 0.2112845859224254, "learning_rate": 0.0007409933164898818, "loss": 1.5024, "step": 640 }, { "epoch": 0.40913415794481445, "grad_norm": 0.24626397881644146, "learning_rate": 0.0007361256743554241, "loss": 1.519, "step": 645 }, { "epoch": 0.4123057405645417, "grad_norm": 0.3216374157044185, "learning_rate": 0.0007312290570463083, "loss": 1.5039, "step": 650 }, { "epoch": 0.41547732318426894, "grad_norm": 0.22302629969432056, "learning_rate": 0.0007263040654304502, "loss": 1.494, "step": 655 }, { "epoch": 0.4186489058039962, "grad_norm": 0.2675317557830398, "learning_rate": 0.0007213513038575998, "loss": 1.4884, "step": 660 }, { "epoch": 0.42182048842372344, "grad_norm": 0.2905992631967741, "learning_rate": 0.0007163713800851811, "loss": 1.4851, "step": 665 }, { "epoch": 0.42499207104345066, "grad_norm": 0.20033257450058217, "learning_rate": 0.0007113649052037139, "loss": 1.475, "step": 670 }, { "epoch": 0.42816365366317793, "grad_norm": 0.24204591478150614, "learning_rate": 0.0007063324935618264, "loss": 1.4854, "step": 675 }, { "epoch": 0.43133523628290515, "grad_norm": 0.2121430223132248, "learning_rate": 0.0007012747626908679, "loss": 1.4867, "step": 680 }, { "epoch": 0.43450681890263243, "grad_norm": 0.22539040426730952, "learning_rate": 0.0006961923332291309, "loss": 1.467, "step": 685 }, { "epoch": 0.43767840152235965, "grad_norm": 0.22695514383581045, "learning_rate": 0.0006910858288456921, "loss": 1.4657, "step": 690 }, { "epoch": 0.4408499841420869, "grad_norm": 0.22184474318285244, "learning_rate": 0.0006859558761638819, "loss": 1.4423, "step": 695 }, { "epoch": 0.44402156676181415, "grad_norm": 0.29575466467956124, "learning_rate": 0.0006808031046843901, "loss": 1.4485, "step": 700 }, { "epoch": 0.44719314938154137, "grad_norm": 0.21488007270980458, "learning_rate": 0.0006756281467080205, "loss": 1.4508, "step": 705 }, { "epoch": 0.45036473200126864, "grad_norm": 0.38667469007287536, "learning_rate": 0.0006704316372580989, "loss": 1.4459, "step": 710 }, { "epoch": 0.45353631462099586, "grad_norm": 0.5234661249173684, "learning_rate": 0.0006652142140025517, "loss": 1.435, "step": 715 }, { "epoch": 0.45670789724072314, "grad_norm": 0.37462414488518325, "learning_rate": 0.0006599765171756538, "loss": 1.4379, "step": 720 }, { "epoch": 0.45987947986045036, "grad_norm": 0.3040640640559466, "learning_rate": 0.0006547191894994679, "loss": 1.4341, "step": 725 }, { "epoch": 0.46305106248017763, "grad_norm": 0.28687145037107376, "learning_rate": 0.0006494428761049736, "loss": 1.4297, "step": 730 }, { "epoch": 0.46622264509990485, "grad_norm": 0.20728566940658133, "learning_rate": 0.0006441482244529037, "loss": 1.4124, "step": 735 }, { "epoch": 0.4693942277196321, "grad_norm": 0.21956352378213645, "learning_rate": 0.0006388358842542938, "loss": 1.4162, "step": 740 }, { "epoch": 0.47256581033935935, "grad_norm": 0.20961137895482168, "learning_rate": 0.0006335065073907551, "loss": 1.4055, "step": 745 }, { "epoch": 0.47573739295908657, "grad_norm": 0.21979161995613117, "learning_rate": 0.0006281607478344823, "loss": 1.4112, "step": 750 }, { "epoch": 0.47890897557881384, "grad_norm": 0.2707420648256881, "learning_rate": 0.0006227992615680033, "loss": 1.4127, "step": 755 }, { "epoch": 0.48208055819854106, "grad_norm": 0.2829526420993808, "learning_rate": 0.000617422706503684, "loss": 1.3905, "step": 760 }, { "epoch": 0.48525214081826834, "grad_norm": 0.2988909342739172, "learning_rate": 0.0006120317424029943, "loss": 1.3941, "step": 765 }, { "epoch": 0.48842372343799556, "grad_norm": 0.2787477024270155, "learning_rate": 0.0006066270307955492, "loss": 1.404, "step": 770 }, { "epoch": 0.4915953060577228, "grad_norm": 0.22142539860110755, "learning_rate": 0.000601209234897931, "loss": 1.3886, "step": 775 }, { "epoch": 0.49476688867745006, "grad_norm": 0.2777507592841434, "learning_rate": 0.0005957790195323064, "loss": 1.3896, "step": 780 }, { "epoch": 0.4979384712971773, "grad_norm": 0.2552429702914411, "learning_rate": 0.0005903370510448447, "loss": 1.3779, "step": 785 }, { "epoch": 0.5011100539169046, "grad_norm": 0.268362490404369, "learning_rate": 0.0005848839972239511, "loss": 1.3732, "step": 790 }, { "epoch": 0.5042816365366318, "grad_norm": 0.24326700144489616, "learning_rate": 0.0005794205272183205, "loss": 1.3748, "step": 795 }, { "epoch": 0.507453219156359, "grad_norm": 0.36345823855975784, "learning_rate": 0.0005739473114548266, "loss": 1.3755, "step": 800 }, { "epoch": 0.5106248017760863, "grad_norm": 0.4843512449452226, "learning_rate": 0.000568465021556253, "loss": 1.3638, "step": 805 }, { "epoch": 0.5137963843958135, "grad_norm": 0.28627979209509896, "learning_rate": 0.0005629743302588779, "loss": 1.3514, "step": 810 }, { "epoch": 0.5169679670155407, "grad_norm": 0.25100043970170133, "learning_rate": 0.0005574759113299217, "loss": 1.341, "step": 815 }, { "epoch": 0.520139549635268, "grad_norm": 0.294501664896367, "learning_rate": 0.0005519704394848692, "loss": 1.3323, "step": 820 }, { "epoch": 0.5233111322549953, "grad_norm": 0.2382026692611784, "learning_rate": 0.0005464585903046744, "loss": 1.3483, "step": 825 }, { "epoch": 0.5264827148747225, "grad_norm": 0.3233277006590301, "learning_rate": 0.0005409410401528587, "loss": 1.3275, "step": 830 }, { "epoch": 0.5296542974944497, "grad_norm": 0.27343633383605254, "learning_rate": 0.0005354184660925148, "loss": 1.3379, "step": 835 }, { "epoch": 0.532825880114177, "grad_norm": 0.23005655087591242, "learning_rate": 0.0005298915458032233, "loss": 1.3213, "step": 840 }, { "epoch": 0.5359974627339043, "grad_norm": 0.22247227894179622, "learning_rate": 0.0005243609574978941, "loss": 1.3295, "step": 845 }, { "epoch": 0.5391690453536314, "grad_norm": 0.30014284045451645, "learning_rate": 0.0005188273798395424, "loss": 1.3214, "step": 850 }, { "epoch": 0.5423406279733587, "grad_norm": 0.3132385853038301, "learning_rate": 0.0005132914918580093, "loss": 1.3172, "step": 855 }, { "epoch": 0.545512210593086, "grad_norm": 0.33728113255378034, "learning_rate": 0.0005077539728666374, "loss": 1.3218, "step": 860 }, { "epoch": 0.5486837932128132, "grad_norm": 0.25874007616270794, "learning_rate": 0.0005022155023789121, "loss": 1.2957, "step": 865 }, { "epoch": 0.5518553758325404, "grad_norm": 0.24203527103405384, "learning_rate": 0.0004966767600250775, "loss": 1.3035, "step": 870 }, { "epoch": 0.5550269584522677, "grad_norm": 0.21381303917505012, "learning_rate": 0.0004911384254687388, "loss": 1.2995, "step": 875 }, { "epoch": 0.558198541071995, "grad_norm": 0.24304896972645837, "learning_rate": 0.00048560117832345984, "loss": 1.2824, "step": 880 }, { "epoch": 0.5613701236917221, "grad_norm": 0.3080366389357483, "learning_rate": 0.0004800656980693674, "loss": 1.2898, "step": 885 }, { "epoch": 0.5645417063114494, "grad_norm": 0.262557897437375, "learning_rate": 0.00047453266396977174, "loss": 1.2779, "step": 890 }, { "epoch": 0.5677132889311767, "grad_norm": 0.31083455423034645, "learning_rate": 0.00046900275498781347, "loss": 1.2806, "step": 895 }, { "epoch": 0.570884871550904, "grad_norm": 0.21597926838814396, "learning_rate": 0.00046347664970314723, "loss": 1.274, "step": 900 }, { "epoch": 0.5740564541706311, "grad_norm": 0.22596235970597578, "learning_rate": 0.0004579550262286731, "loss": 1.2666, "step": 905 }, { "epoch": 0.5772280367903584, "grad_norm": 0.22827094422158484, "learning_rate": 0.0004524385621273246, "loss": 1.2583, "step": 910 }, { "epoch": 0.5803996194100857, "grad_norm": 0.24853325436526866, "learning_rate": 0.00044692793432892387, "loss": 1.2693, "step": 915 }, { "epoch": 0.5835712020298128, "grad_norm": 0.2765479869012326, "learning_rate": 0.00044142381904711624, "loss": 1.26, "step": 920 }, { "epoch": 0.5867427846495401, "grad_norm": 0.27285996236330706, "learning_rate": 0.00043592689169639034, "loss": 1.246, "step": 925 }, { "epoch": 0.5899143672692674, "grad_norm": 0.28781941328826144, "learning_rate": 0.0004304378268091982, "loss": 1.249, "step": 930 }, { "epoch": 0.5930859498889947, "grad_norm": 0.240504157977766, "learning_rate": 0.0004249572979531822, "loss": 1.2534, "step": 935 }, { "epoch": 0.5962575325087218, "grad_norm": 0.341483100362183, "learning_rate": 0.0004194859776485216, "loss": 1.2376, "step": 940 }, { "epoch": 0.5994291151284491, "grad_norm": 0.27130765824409686, "learning_rate": 0.0004140245372854065, "loss": 1.2426, "step": 945 }, { "epoch": 0.6026006977481764, "grad_norm": 0.28496801994375115, "learning_rate": 0.0004085736470416516, "loss": 1.2347, "step": 950 }, { "epoch": 0.6057722803679035, "grad_norm": 0.33820479660283104, "learning_rate": 0.00040313397580045765, "loss": 1.2397, "step": 955 }, { "epoch": 0.6089438629876308, "grad_norm": 0.2537502852561033, "learning_rate": 0.0003977061910683325, "loss": 1.2319, "step": 960 }, { "epoch": 0.6121154456073581, "grad_norm": 0.2543562572422921, "learning_rate": 0.0003922909588931808, "loss": 1.2221, "step": 965 }, { "epoch": 0.6152870282270854, "grad_norm": 0.28194628415561285, "learning_rate": 0.0003868889437825724, "loss": 1.2213, "step": 970 }, { "epoch": 0.6184586108468125, "grad_norm": 0.26751445743912233, "learning_rate": 0.0003815008086222007, "loss": 1.211, "step": 975 }, { "epoch": 0.6216301934665398, "grad_norm": 0.22966413613029274, "learning_rate": 0.0003761272145945388, "loss": 1.2058, "step": 980 }, { "epoch": 0.6248017760862671, "grad_norm": 0.24668142278345986, "learning_rate": 0.0003707688210977055, "loss": 1.2223, "step": 985 }, { "epoch": 0.6279733587059942, "grad_norm": 0.23811743937781157, "learning_rate": 0.00036542628566455025, "loss": 1.2024, "step": 990 }, { "epoch": 0.6311449413257215, "grad_norm": 0.2901121774163334, "learning_rate": 0.0003601002638819665, "loss": 1.2036, "step": 995 }, { "epoch": 0.6343165239454488, "grad_norm": 0.2600410825499236, "learning_rate": 0.0003547914093104439, "loss": 1.2012, "step": 1000 }, { "epoch": 0.6374881065651761, "grad_norm": 0.352563938838776, "learning_rate": 0.0003495003734038697, "loss": 1.1751, "step": 1005 }, { "epoch": 0.6406596891849032, "grad_norm": 0.26125000772161344, "learning_rate": 0.00034422780542958827, "loss": 1.1919, "step": 1010 }, { "epoch": 0.6438312718046305, "grad_norm": 0.2640437019043301, "learning_rate": 0.00033897435238872874, "loss": 1.1781, "step": 1015 }, { "epoch": 0.6470028544243578, "grad_norm": 0.2782272386225361, "learning_rate": 0.00033374065893681127, "loss": 1.1821, "step": 1020 }, { "epoch": 0.650174437044085, "grad_norm": 0.24555576527657738, "learning_rate": 0.0003285273673046409, "loss": 1.1721, "step": 1025 }, { "epoch": 0.6533460196638122, "grad_norm": 0.40556770599075914, "learning_rate": 0.00032333511721949817, "loss": 1.1679, "step": 1030 }, { "epoch": 0.6565176022835395, "grad_norm": 0.25906084663363754, "learning_rate": 0.00031816454582663856, "loss": 1.1567, "step": 1035 }, { "epoch": 0.6596891849032668, "grad_norm": 0.27183164743159954, "learning_rate": 0.0003130162876111074, "loss": 1.1596, "step": 1040 }, { "epoch": 0.6628607675229939, "grad_norm": 0.24394077297020256, "learning_rate": 0.0003078909743198817, "loss": 1.1487, "step": 1045 }, { "epoch": 0.6660323501427212, "grad_norm": 0.23339037702881532, "learning_rate": 0.000302789234884348, "loss": 1.1636, "step": 1050 }, { "epoch": 0.6692039327624485, "grad_norm": 0.2651227300122355, "learning_rate": 0.00029771169534312583, "loss": 1.1475, "step": 1055 }, { "epoch": 0.6723755153821757, "grad_norm": 0.23719809453094406, "learning_rate": 0.000292658978765246, "loss": 1.1496, "step": 1060 }, { "epoch": 0.6755470980019029, "grad_norm": 0.31466276172538943, "learning_rate": 0.000287631705173693, "loss": 1.1404, "step": 1065 }, { "epoch": 0.6787186806216302, "grad_norm": 0.2657362830496313, "learning_rate": 0.00028263049146932153, "loss": 1.156, "step": 1070 }, { "epoch": 0.6818902632413575, "grad_norm": 0.2353420135393821, "learning_rate": 0.00027765595135515673, "loss": 1.1382, "step": 1075 }, { "epoch": 0.6850618458610847, "grad_norm": 0.29180017450918116, "learning_rate": 0.00027270869526108506, "loss": 1.1403, "step": 1080 }, { "epoch": 0.6882334284808119, "grad_norm": 0.28381426741820764, "learning_rate": 0.000267789330268949, "loss": 1.1351, "step": 1085 }, { "epoch": 0.6914050111005392, "grad_norm": 0.2368326399732858, "learning_rate": 0.00026289846003805075, "loss": 1.1264, "step": 1090 }, { "epoch": 0.6945765937202664, "grad_norm": 0.24260754741892487, "learning_rate": 0.0002580366847310774, "loss": 1.1318, "step": 1095 }, { "epoch": 0.6977481763399936, "grad_norm": 0.33032504483698477, "learning_rate": 0.0002532046009404537, "loss": 1.123, "step": 1100 }, { "epoch": 0.7009197589597209, "grad_norm": 0.2626626593890248, "learning_rate": 0.00024840280161513446, "loss": 1.1147, "step": 1105 }, { "epoch": 0.7040913415794482, "grad_norm": 0.24734490639888912, "learning_rate": 0.0002436318759878432, "loss": 1.1141, "step": 1110 }, { "epoch": 0.7072629241991754, "grad_norm": 0.25777344330608626, "learning_rate": 0.00023889240950276602, "loss": 1.1069, "step": 1115 }, { "epoch": 0.7104345068189026, "grad_norm": 0.24965316567346824, "learning_rate": 0.00023418498374371268, "loss": 1.0961, "step": 1120 }, { "epoch": 0.7136060894386299, "grad_norm": 0.2588175173420704, "learning_rate": 0.0002295101763627483, "loss": 1.1062, "step": 1125 }, { "epoch": 0.7167776720583571, "grad_norm": 0.2617691894820057, "learning_rate": 0.00022486856100931146, "loss": 1.0949, "step": 1130 }, { "epoch": 0.7199492546780843, "grad_norm": 0.24640261475787326, "learning_rate": 0.00022026070725981867, "loss": 1.1024, "step": 1135 }, { "epoch": 0.7231208372978116, "grad_norm": 0.25512789636052857, "learning_rate": 0.0002156871805477732, "loss": 1.0981, "step": 1140 }, { "epoch": 0.7262924199175389, "grad_norm": 0.2380744076277497, "learning_rate": 0.00021114854209437889, "loss": 1.0803, "step": 1145 }, { "epoch": 0.7294640025372661, "grad_norm": 0.26280691701304987, "learning_rate": 0.00020664534883967311, "loss": 1.0851, "step": 1150 }, { "epoch": 0.7326355851569933, "grad_norm": 0.2565430586115982, "learning_rate": 0.00020217815337418427, "loss": 1.076, "step": 1155 }, { "epoch": 0.7358071677767206, "grad_norm": 0.2533752282987412, "learning_rate": 0.00019774750387112174, "loss": 1.0826, "step": 1160 }, { "epoch": 0.7389787503964478, "grad_norm": 0.28126577459530283, "learning_rate": 0.00019335394401911082, "loss": 1.0719, "step": 1165 }, { "epoch": 0.7421503330161751, "grad_norm": 0.2650849332335025, "learning_rate": 0.00018899801295547476, "loss": 1.0742, "step": 1170 }, { "epoch": 0.7453219156359023, "grad_norm": 0.2603829852111257, "learning_rate": 0.00018468024520007764, "loss": 1.0772, "step": 1175 }, { "epoch": 0.7484934982556296, "grad_norm": 0.2527087543783394, "learning_rate": 0.00018040117058973316, "loss": 1.0595, "step": 1180 }, { "epoch": 0.7516650808753568, "grad_norm": 0.24678722431639855, "learning_rate": 0.0001761613142131867, "loss": 1.0469, "step": 1185 }, { "epoch": 0.754836663495084, "grad_norm": 0.25910814410326344, "learning_rate": 0.00017196119634668293, "loss": 1.0627, "step": 1190 }, { "epoch": 0.7580082461148113, "grad_norm": 0.26173306347429054, "learning_rate": 0.00016780133239012075, "loss": 1.0607, "step": 1195 }, { "epoch": 0.7611798287345385, "grad_norm": 0.24651016867032868, "learning_rate": 0.0001636822328038095, "loss": 1.0546, "step": 1200 }, { "epoch": 0.7643514113542658, "grad_norm": 0.28020957707447064, "learning_rate": 0.00015960440304582858, "loss": 1.0579, "step": 1205 }, { "epoch": 0.767522993973993, "grad_norm": 0.26371476098524943, "learning_rate": 0.00015556834351000354, "loss": 1.0537, "step": 1210 }, { "epoch": 0.7706945765937203, "grad_norm": 0.24623457163199874, "learning_rate": 0.0001515745494645019, "loss": 1.045, "step": 1215 }, { "epoch": 0.7738661592134475, "grad_norm": 0.3155558400199454, "learning_rate": 0.0001476235109910576, "loss": 1.0405, "step": 1220 }, { "epoch": 0.7770377418331748, "grad_norm": 0.2789622570986826, "learning_rate": 0.00014371571292483393, "loss": 1.0381, "step": 1225 }, { "epoch": 0.780209324452902, "grad_norm": 0.2409114498230053, "learning_rate": 0.0001398516347949284, "loss": 1.0394, "step": 1230 }, { "epoch": 0.7833809070726292, "grad_norm": 0.267235707838601, "learning_rate": 0.0001360317507655293, "loss": 1.0278, "step": 1235 }, { "epoch": 0.7865524896923565, "grad_norm": 0.28458374786381546, "learning_rate": 0.00013225652957773044, "loss": 1.0326, "step": 1240 }, { "epoch": 0.7897240723120837, "grad_norm": 0.25695712786415686, "learning_rate": 0.00012852643449201212, "loss": 1.023, "step": 1245 }, { "epoch": 0.792895654931811, "grad_norm": 0.2590457354954553, "learning_rate": 0.0001248419232313938, "loss": 1.0232, "step": 1250 }, { "epoch": 0.7960672375515382, "grad_norm": 0.2715843775728456, "learning_rate": 0.000121203447925266, "loss": 1.0287, "step": 1255 }, { "epoch": 0.7992388201712655, "grad_norm": 0.2398511137856279, "learning_rate": 0.00011761145505391024, "loss": 1.0186, "step": 1260 }, { "epoch": 0.8024104027909927, "grad_norm": 0.27281371167245233, "learning_rate": 0.00011406638539370979, "loss": 1.0224, "step": 1265 }, { "epoch": 0.8055819854107199, "grad_norm": 0.3188098317762095, "learning_rate": 0.00011056867396306292, "loss": 1.0092, "step": 1270 }, { "epoch": 0.8087535680304472, "grad_norm": 0.3265540754130617, "learning_rate": 0.00010711874996900023, "loss": 1.0104, "step": 1275 }, { "epoch": 0.8119251506501745, "grad_norm": 0.2607401452606644, "learning_rate": 0.00010371703675451733, "loss": 1.0114, "step": 1280 }, { "epoch": 0.8150967332699017, "grad_norm": 0.2883090335939891, "learning_rate": 0.0001003639517466256, "loss": 1.0093, "step": 1285 }, { "epoch": 0.8182683158896289, "grad_norm": 0.25562259108760305, "learning_rate": 9.705990640512907e-05, "loss": 0.9938, "step": 1290 }, { "epoch": 0.8214398985093562, "grad_norm": 0.2753573095600564, "learning_rate": 9.380530617213456e-05, "loss": 1.0114, "step": 1295 }, { "epoch": 0.8246114811290834, "grad_norm": 0.23998112779723507, "learning_rate": 9.060055042229881e-05, "loss": 1.0089, "step": 1300 }, { "epoch": 0.8277830637488106, "grad_norm": 0.2524204007518801, "learning_rate": 8.74460324138216e-05, "loss": 1.007, "step": 1305 }, { "epoch": 0.8309546463685379, "grad_norm": 0.2526736715480949, "learning_rate": 8.434213924018835e-05, "loss": 1.0, "step": 1310 }, { "epoch": 0.8341262289882652, "grad_norm": 0.2503643121035892, "learning_rate": 8.128925178266927e-05, "loss": 0.9965, "step": 1315 }, { "epoch": 0.8372978116079924, "grad_norm": 0.23244697445873563, "learning_rate": 7.828774466358179e-05, "loss": 0.9988, "step": 1320 }, { "epoch": 0.8404693942277196, "grad_norm": 0.2560633353876119, "learning_rate": 7.53379862003195e-05, "loss": 1.0048, "step": 1325 }, { "epoch": 0.8436409768474469, "grad_norm": 0.2421969390657872, "learning_rate": 7.244033836015695e-05, "loss": 0.9844, "step": 1330 }, { "epoch": 0.8468125594671742, "grad_norm": 0.2607793031238756, "learning_rate": 6.95951567158305e-05, "loss": 0.9778, "step": 1335 }, { "epoch": 0.8499841420869013, "grad_norm": 0.2698408656759126, "learning_rate": 6.680279040190746e-05, "loss": 0.9828, "step": 1340 }, { "epoch": 0.8531557247066286, "grad_norm": 0.23841421968692497, "learning_rate": 6.406358207194224e-05, "loss": 0.9991, "step": 1345 }, { "epoch": 0.8563273073263559, "grad_norm": 0.28084531889088754, "learning_rate": 6.137786785642985e-05, "loss": 0.9855, "step": 1350 }, { "epoch": 0.8594988899460831, "grad_norm": 0.24806562901660065, "learning_rate": 5.8745977321558786e-05, "loss": 0.9747, "step": 1355 }, { "epoch": 0.8626704725658103, "grad_norm": 0.2511271472454086, "learning_rate": 5.616823342876931e-05, "loss": 0.9758, "step": 1360 }, { "epoch": 0.8658420551855376, "grad_norm": 0.24241834259427655, "learning_rate": 5.364495249512336e-05, "loss": 0.9765, "step": 1365 }, { "epoch": 0.8690136378052649, "grad_norm": 0.23883983545121304, "learning_rate": 5.11764441544883e-05, "loss": 0.9808, "step": 1370 }, { "epoch": 0.872185220424992, "grad_norm": 0.24435079120648112, "learning_rate": 4.8763011319542025e-05, "loss": 0.9777, "step": 1375 }, { "epoch": 0.8753568030447193, "grad_norm": 0.2629272614210174, "learning_rate": 4.6404950144602e-05, "loss": 0.9819, "step": 1380 }, { "epoch": 0.8785283856644466, "grad_norm": 0.2663926565440222, "learning_rate": 4.4102549989283756e-05, "loss": 0.9675, "step": 1385 }, { "epoch": 0.8816999682841739, "grad_norm": 0.2431315966754426, "learning_rate": 4.1856093382994067e-05, "loss": 0.9617, "step": 1390 }, { "epoch": 0.884871550903901, "grad_norm": 0.2615985135082817, "learning_rate": 3.966585599026051e-05, "loss": 0.9705, "step": 1395 }, { "epoch": 0.8880431335236283, "grad_norm": 0.2437571495377507, "learning_rate": 3.753210657690537e-05, "loss": 0.9637, "step": 1400 }, { "epoch": 0.8912147161433556, "grad_norm": 0.2867683320851404, "learning_rate": 3.5455106977064555e-05, "loss": 0.9813, "step": 1405 }, { "epoch": 0.8943862987630827, "grad_norm": 0.23596873266775822, "learning_rate": 3.343511206105804e-05, "loss": 0.9654, "step": 1410 }, { "epoch": 0.89755788138281, "grad_norm": 0.2691994741352151, "learning_rate": 3.147236970411449e-05, "loss": 0.9559, "step": 1415 }, { "epoch": 0.9007294640025373, "grad_norm": 0.25848494724563204, "learning_rate": 2.9567120755953858e-05, "loss": 0.9631, "step": 1420 }, { "epoch": 0.9039010466222646, "grad_norm": 0.2329147999513356, "learning_rate": 2.7719599011233333e-05, "loss": 0.9654, "step": 1425 }, { "epoch": 0.9070726292419917, "grad_norm": 0.24897956262619586, "learning_rate": 2.593003118085746e-05, "loss": 0.9686, "step": 1430 }, { "epoch": 0.910244211861719, "grad_norm": 0.24140747398754628, "learning_rate": 2.4198636864158684e-05, "loss": 0.9709, "step": 1435 }, { "epoch": 0.9134157944814463, "grad_norm": 0.23859470642150962, "learning_rate": 2.2525628521949837e-05, "loss": 0.9723, "step": 1440 }, { "epoch": 0.9165873771011734, "grad_norm": 0.23614657553272203, "learning_rate": 2.091121145045327e-05, "loss": 0.96, "step": 1445 }, { "epoch": 0.9197589597209007, "grad_norm": 0.2305582486421353, "learning_rate": 1.9355583756108407e-05, "loss": 0.9622, "step": 1450 }, { "epoch": 0.922930542340628, "grad_norm": 0.23818404259591164, "learning_rate": 1.7858936331262122e-05, "loss": 0.9612, "step": 1455 }, { "epoch": 0.9261021249603553, "grad_norm": 0.2378653986328734, "learning_rate": 1.6421452830744365e-05, "loss": 0.9716, "step": 1460 }, { "epoch": 0.9292737075800824, "grad_norm": 0.2329586634857264, "learning_rate": 1.5043309649331205e-05, "loss": 0.9558, "step": 1465 }, { "epoch": 0.9324452901998097, "grad_norm": 0.4063284757979328, "learning_rate": 1.3724675900099959e-05, "loss": 0.9654, "step": 1470 }, { "epoch": 0.935616872819537, "grad_norm": 0.24883789905797735, "learning_rate": 1.246571339367658e-05, "loss": 0.9603, "step": 1475 }, { "epoch": 0.9387884554392641, "grad_norm": 0.2484198570399255, "learning_rate": 1.1266576618380097e-05, "loss": 0.9579, "step": 1480 }, { "epoch": 0.9419600380589914, "grad_norm": 0.24729636213521072, "learning_rate": 1.0127412721265218e-05, "loss": 0.9675, "step": 1485 }, { "epoch": 0.9451316206787187, "grad_norm": 0.2505398676306425, "learning_rate": 9.048361490065548e-06, "loss": 0.9526, "step": 1490 }, { "epoch": 0.948303203298446, "grad_norm": 0.26466516239326676, "learning_rate": 8.029555336040383e-06, "loss": 0.9661, "step": 1495 }, { "epoch": 0.9514747859181731, "grad_norm": 0.24164934386910944, "learning_rate": 7.071119277726301e-06, "loss": 0.9577, "step": 1500 }, { "epoch": 0.9546463685379004, "grad_norm": 0.23732953651039135, "learning_rate": 6.17317092559605e-06, "loss": 0.9562, "step": 1505 }, { "epoch": 0.9578179511576277, "grad_norm": 0.23388339171880254, "learning_rate": 5.335820467626485e-06, "loss": 0.973, "step": 1510 }, { "epoch": 0.9609895337773549, "grad_norm": 0.2318159543809297, "learning_rate": 4.559170655777267e-06, "loss": 0.9478, "step": 1515 }, { "epoch": 0.9641611163970821, "grad_norm": 0.24710746850544488, "learning_rate": 3.843316793382123e-06, "loss": 0.9707, "step": 1520 }, { "epoch": 0.9673326990168094, "grad_norm": 0.2862292656525582, "learning_rate": 3.188346723454083e-06, "loss": 0.9643, "step": 1525 }, { "epoch": 0.9705042816365367, "grad_norm": 0.2380821089662019, "learning_rate": 2.594340817906271e-06, "loss": 0.9624, "step": 1530 }, { "epoch": 0.9736758642562638, "grad_norm": 0.23933390384190942, "learning_rate": 2.0613719676891853e-06, "loss": 0.9599, "step": 1535 }, { "epoch": 0.9768474468759911, "grad_norm": 0.24854365220185978, "learning_rate": 1.5895055738465169e-06, "loss": 0.9592, "step": 1540 }, { "epoch": 0.9800190294957184, "grad_norm": 0.24336558825284596, "learning_rate": 1.1787995394893502e-06, "loss": 0.962, "step": 1545 }, { "epoch": 0.9831906121154456, "grad_norm": 0.22749162744166387, "learning_rate": 8.293042626912328e-07, "loss": 0.9573, "step": 1550 }, { "epoch": 0.9863621947351728, "grad_norm": 0.2390082283281969, "learning_rate": 5.410626303034017e-07, "loss": 0.9556, "step": 1555 }, { "epoch": 0.9895337773549001, "grad_norm": 0.23360184137608606, "learning_rate": 3.141100126923813e-07, "loss": 0.9571, "step": 1560 }, { "epoch": 0.9927053599746274, "grad_norm": 0.2367998742996814, "learning_rate": 1.4847425939956693e-07, "loss": 0.9495, "step": 1565 }, { "epoch": 0.9958769425943546, "grad_norm": 0.2269345672235601, "learning_rate": 4.417569572368052e-08, "loss": 0.9499, "step": 1570 }, { "epoch": 0.9990485252140818, "grad_norm": 0.2484799432231265, "learning_rate": 1.2271202268210324e-09, "loss": 0.956, "step": 1575 }, { "epoch": 0.9996828417380272, "eval_loss": 2.245941638946533, "eval_runtime": 8.442, "eval_samples_per_second": 46.316, "eval_steps_per_second": 5.804, "step": 1576 }, { "epoch": 0.9996828417380272, "step": 1576, "total_flos": 38663670988800.0, "train_loss": 1.4483577938854393, "train_runtime": 3409.163, "train_samples_per_second": 14.797, "train_steps_per_second": 0.462 } ], "logging_steps": 5, "max_steps": 1576, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 38663670988800.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }