{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.99981234753237, "eval_steps": 500, "global_step": 999, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010008131606930632, "grad_norm": 0.09912109375, "learning_rate": 2.0000000000000003e-06, "loss": 1.7364, "step": 1 }, { "epoch": 0.005004065803465315, "grad_norm": 0.09228515625, "learning_rate": 1e-05, "loss": 1.7255, "step": 5 }, { "epoch": 0.01000813160693063, "grad_norm": 0.10009765625, "learning_rate": 2e-05, "loss": 1.693, "step": 10 }, { "epoch": 0.015012197410395946, "grad_norm": 0.10546875, "learning_rate": 3e-05, "loss": 1.7131, "step": 15 }, { "epoch": 0.02001626321386126, "grad_norm": 0.1298828125, "learning_rate": 4e-05, "loss": 1.6944, "step": 20 }, { "epoch": 0.02502032901732658, "grad_norm": 0.162109375, "learning_rate": 5e-05, "loss": 1.6902, "step": 25 }, { "epoch": 0.030024394820791892, "grad_norm": 0.1982421875, "learning_rate": 6e-05, "loss": 1.6689, "step": 30 }, { "epoch": 0.035028460624257206, "grad_norm": 0.173828125, "learning_rate": 7e-05, "loss": 1.6255, "step": 35 }, { "epoch": 0.04003252642772252, "grad_norm": 0.1611328125, "learning_rate": 8e-05, "loss": 1.5785, "step": 40 }, { "epoch": 0.04503659223118784, "grad_norm": 0.1357421875, "learning_rate": 9e-05, "loss": 1.5011, "step": 45 }, { "epoch": 0.05004065803465316, "grad_norm": 0.1376953125, "learning_rate": 0.0001, "loss": 1.4936, "step": 50 }, { "epoch": 0.055044723838118474, "grad_norm": 0.08740234375, "learning_rate": 0.00011000000000000002, "loss": 1.48, "step": 55 }, { "epoch": 0.060048789641583784, "grad_norm": 0.06591796875, "learning_rate": 0.00012, "loss": 1.4597, "step": 60 }, { "epoch": 0.0650528554450491, "grad_norm": 0.04345703125, "learning_rate": 0.00013000000000000002, "loss": 1.4573, "step": 65 }, { "epoch": 0.07005692124851441, "grad_norm": 0.0322265625, "learning_rate": 0.00014, "loss": 1.4094, "step": 70 }, { "epoch": 0.07506098705197974, "grad_norm": 0.037353515625, "learning_rate": 0.00015000000000000001, "loss": 1.4073, "step": 75 }, { "epoch": 0.08006505285544505, "grad_norm": 0.0277099609375, "learning_rate": 0.00016, "loss": 1.4222, "step": 80 }, { "epoch": 0.08506911865891037, "grad_norm": 0.0230712890625, "learning_rate": 0.00017, "loss": 1.4125, "step": 85 }, { "epoch": 0.09007318446237568, "grad_norm": 0.021728515625, "learning_rate": 0.00018, "loss": 1.3956, "step": 90 }, { "epoch": 0.09507725026584099, "grad_norm": 0.0211181640625, "learning_rate": 0.00019, "loss": 1.4091, "step": 95 }, { "epoch": 0.10008131606930631, "grad_norm": 0.0198974609375, "learning_rate": 0.0002, "loss": 1.368, "step": 100 }, { "epoch": 0.10508538187277162, "grad_norm": 0.0196533203125, "learning_rate": 0.00019998473561448797, "loss": 1.381, "step": 105 }, { "epoch": 0.11008944767623695, "grad_norm": 0.01806640625, "learning_rate": 0.0001999389471179811, "loss": 1.3798, "step": 110 }, { "epoch": 0.11509351347970226, "grad_norm": 0.017333984375, "learning_rate": 0.00019986264848914474, "loss": 1.3918, "step": 115 }, { "epoch": 0.12009757928316757, "grad_norm": 0.017578125, "learning_rate": 0.00019975586302101248, "loss": 1.3851, "step": 120 }, { "epoch": 0.12510164508663288, "grad_norm": 0.017578125, "learning_rate": 0.00019961862331387543, "loss": 1.3727, "step": 125 }, { "epoch": 0.1301057108900982, "grad_norm": 0.0169677734375, "learning_rate": 0.00019945097126532955, "loss": 1.3798, "step": 130 }, { "epoch": 0.13510977669356353, "grad_norm": 0.016357421875, "learning_rate": 0.0001992529580574848, "loss": 1.3553, "step": 135 }, { "epoch": 0.14011384249702882, "grad_norm": 0.01953125, "learning_rate": 0.00019902464414134005, "loss": 1.3801, "step": 140 }, { "epoch": 0.14511790830049415, "grad_norm": 0.0203857421875, "learning_rate": 0.000198766099218328, "loss": 1.354, "step": 145 }, { "epoch": 0.15012197410395947, "grad_norm": 0.01806640625, "learning_rate": 0.0001984774022190361, "loss": 1.3874, "step": 150 }, { "epoch": 0.1551260399074248, "grad_norm": 0.0166015625, "learning_rate": 0.0001981586412791103, "loss": 1.3565, "step": 155 }, { "epoch": 0.1601301057108901, "grad_norm": 0.0177001953125, "learning_rate": 0.00019780991371234801, "loss": 1.3525, "step": 160 }, { "epoch": 0.16513417151435542, "grad_norm": 0.01806640625, "learning_rate": 0.00019743132598098963, "loss": 1.3537, "step": 165 }, { "epoch": 0.17013823731782074, "grad_norm": 0.02734375, "learning_rate": 0.00019702299366321677, "loss": 1.3681, "step": 170 }, { "epoch": 0.17514230312128604, "grad_norm": 0.0201416015625, "learning_rate": 0.00019658504141786774, "loss": 1.3453, "step": 175 }, { "epoch": 0.18014636892475136, "grad_norm": 0.0184326171875, "learning_rate": 0.0001961176029463807, "loss": 1.3701, "step": 180 }, { "epoch": 0.18515043472821668, "grad_norm": 0.021240234375, "learning_rate": 0.00019562082095197632, "loss": 1.3525, "step": 185 }, { "epoch": 0.19015450053168198, "grad_norm": 0.02001953125, "learning_rate": 0.00019509484709609215, "loss": 1.3427, "step": 190 }, { "epoch": 0.1951585663351473, "grad_norm": 0.02001953125, "learning_rate": 0.0001945398419520823, "loss": 1.3469, "step": 195 }, { "epoch": 0.20016263213861263, "grad_norm": 0.021484375, "learning_rate": 0.00019395597495619634, "loss": 1.3413, "step": 200 }, { "epoch": 0.20516669794207792, "grad_norm": 0.0230712890625, "learning_rate": 0.00019334342435585255, "loss": 1.3604, "step": 205 }, { "epoch": 0.21017076374554325, "grad_norm": 0.0211181640625, "learning_rate": 0.00019270237715522112, "loss": 1.3695, "step": 210 }, { "epoch": 0.21517482954900857, "grad_norm": 0.02197265625, "learning_rate": 0.00019203302905813406, "loss": 1.3443, "step": 215 }, { "epoch": 0.2201788953524739, "grad_norm": 0.0211181640625, "learning_rate": 0.00019133558440833926, "loss": 1.336, "step": 220 }, { "epoch": 0.2251829611559392, "grad_norm": 0.0234375, "learning_rate": 0.00019061025612711699, "loss": 1.373, "step": 225 }, { "epoch": 0.23018702695940452, "grad_norm": 0.021728515625, "learning_rate": 0.00018985726564827728, "loss": 1.3571, "step": 230 }, { "epoch": 0.23519109276286984, "grad_norm": 0.0218505859375, "learning_rate": 0.0001890768428505593, "loss": 1.3279, "step": 235 }, { "epoch": 0.24019515856633514, "grad_norm": 0.023193359375, "learning_rate": 0.00018826922598745197, "loss": 1.358, "step": 240 }, { "epoch": 0.24519922436980046, "grad_norm": 0.0250244140625, "learning_rate": 0.00018743466161445823, "loss": 1.3361, "step": 245 }, { "epoch": 0.25020329017326576, "grad_norm": 0.02197265625, "learning_rate": 0.00018657340451382447, "loss": 1.3324, "step": 250 }, { "epoch": 0.2552073559767311, "grad_norm": 0.0213623046875, "learning_rate": 0.00018568571761675893, "loss": 1.3304, "step": 255 }, { "epoch": 0.2602114217801964, "grad_norm": 0.0225830078125, "learning_rate": 0.00018477187192316184, "loss": 1.3585, "step": 260 }, { "epoch": 0.2652154875836617, "grad_norm": 0.0250244140625, "learning_rate": 0.00018383214641889243, "loss": 1.3295, "step": 265 }, { "epoch": 0.27021955338712705, "grad_norm": 0.024658203125, "learning_rate": 0.00018286682799059824, "loss": 1.3565, "step": 270 }, { "epoch": 0.27522361919059235, "grad_norm": 0.0223388671875, "learning_rate": 0.00018187621133813187, "loss": 1.3314, "step": 275 }, { "epoch": 0.28022768499405765, "grad_norm": 0.0228271484375, "learning_rate": 0.00018086059888458288, "loss": 1.3395, "step": 280 }, { "epoch": 0.285231750797523, "grad_norm": 0.0218505859375, "learning_rate": 0.0001798203006839517, "loss": 1.3486, "step": 285 }, { "epoch": 0.2902358166009883, "grad_norm": 0.022216796875, "learning_rate": 0.00017875563432649392, "loss": 1.3341, "step": 290 }, { "epoch": 0.29523988240445365, "grad_norm": 0.0245361328125, "learning_rate": 0.0001776669248417641, "loss": 1.3409, "step": 295 }, { "epoch": 0.30024394820791894, "grad_norm": 0.0235595703125, "learning_rate": 0.00017655450459938784, "loss": 1.3506, "step": 300 }, { "epoch": 0.30524801401138424, "grad_norm": 0.0242919921875, "learning_rate": 0.00017541871320759378, "loss": 1.3523, "step": 305 }, { "epoch": 0.3102520798148496, "grad_norm": 0.0230712890625, "learning_rate": 0.0001742598974095353, "loss": 1.3328, "step": 310 }, { "epoch": 0.3152561456183149, "grad_norm": 0.0220947265625, "learning_rate": 0.0001730784109774339, "loss": 1.3394, "step": 315 }, { "epoch": 0.3202602114217802, "grad_norm": 0.0224609375, "learning_rate": 0.00017187461460457717, "loss": 1.3379, "step": 320 }, { "epoch": 0.32526427722524553, "grad_norm": 0.0233154296875, "learning_rate": 0.00017064887579520334, "loss": 1.317, "step": 325 }, { "epoch": 0.33026834302871083, "grad_norm": 0.02197265625, "learning_rate": 0.00016940156875230687, "loss": 1.3257, "step": 330 }, { "epoch": 0.3352724088321761, "grad_norm": 0.0220947265625, "learning_rate": 0.00016813307426339892, "loss": 1.3265, "step": 335 }, { "epoch": 0.3402764746356415, "grad_norm": 0.0235595703125, "learning_rate": 0.0001668437795842574, "loss": 1.3372, "step": 340 }, { "epoch": 0.3452805404391068, "grad_norm": 0.024658203125, "learning_rate": 0.00016553407832070277, "loss": 1.3559, "step": 345 }, { "epoch": 0.35028460624257207, "grad_norm": 0.02490234375, "learning_rate": 0.00016420437030843484, "loss": 1.3396, "step": 350 }, { "epoch": 0.3552886720460374, "grad_norm": 0.025634765625, "learning_rate": 0.00016285506149096794, "loss": 1.3437, "step": 355 }, { "epoch": 0.3602927378495027, "grad_norm": 0.0269775390625, "learning_rate": 0.00016148656379570144, "loss": 1.3409, "step": 360 }, { "epoch": 0.365296803652968, "grad_norm": 0.029052734375, "learning_rate": 0.0001600992950081632, "loss": 1.3445, "step": 365 }, { "epoch": 0.37030086945643337, "grad_norm": 0.026123046875, "learning_rate": 0.0001586936786444648, "loss": 1.3533, "step": 370 }, { "epoch": 0.37530493525989866, "grad_norm": 0.0234375, "learning_rate": 0.0001572701438220074, "loss": 1.343, "step": 375 }, { "epoch": 0.38030900106336396, "grad_norm": 0.026611328125, "learning_rate": 0.0001558291251284774, "loss": 1.3617, "step": 380 }, { "epoch": 0.3853130668668293, "grad_norm": 0.0274658203125, "learning_rate": 0.00015437106248917217, "loss": 1.323, "step": 385 }, { "epoch": 0.3903171326702946, "grad_norm": 0.0235595703125, "learning_rate": 0.00015289640103269625, "loss": 1.3186, "step": 390 }, { "epoch": 0.3953211984737599, "grad_norm": 0.0262451171875, "learning_rate": 0.00015140559095506908, "loss": 1.3228, "step": 395 }, { "epoch": 0.40032526427722526, "grad_norm": 0.02685546875, "learning_rate": 0.00014989908738228567, "loss": 1.3341, "step": 400 }, { "epoch": 0.40532933008069055, "grad_norm": 0.024658203125, "learning_rate": 0.00014837735023137216, "loss": 1.3479, "step": 405 }, { "epoch": 0.41033339588415585, "grad_norm": 0.0267333984375, "learning_rate": 0.00014684084406997903, "loss": 1.3272, "step": 410 }, { "epoch": 0.4153374616876212, "grad_norm": 0.0262451171875, "learning_rate": 0.00014529003797455402, "loss": 1.338, "step": 415 }, { "epoch": 0.4203415274910865, "grad_norm": 0.024658203125, "learning_rate": 0.000143725405387139, "loss": 1.3344, "step": 420 }, { "epoch": 0.42534559329455185, "grad_norm": 0.023681640625, "learning_rate": 0.000142147423970834, "loss": 1.3262, "step": 425 }, { "epoch": 0.43034965909801715, "grad_norm": 0.024658203125, "learning_rate": 0.0001405565754639724, "loss": 1.3282, "step": 430 }, { "epoch": 0.43535372490148244, "grad_norm": 0.0238037109375, "learning_rate": 0.00013895334553305216, "loss": 1.3363, "step": 435 }, { "epoch": 0.4403577907049478, "grad_norm": 0.0281982421875, "learning_rate": 0.0001373382236244679, "loss": 1.3365, "step": 440 }, { "epoch": 0.4453618565084131, "grad_norm": 0.0242919921875, "learning_rate": 0.0001357117028150889, "loss": 1.338, "step": 445 }, { "epoch": 0.4503659223118784, "grad_norm": 0.0240478515625, "learning_rate": 0.00013407427966172865, "loss": 1.3344, "step": 450 }, { "epoch": 0.45536998811534374, "grad_norm": 0.0257568359375, "learning_rate": 0.00013242645404955237, "loss": 1.3576, "step": 455 }, { "epoch": 0.46037405391880903, "grad_norm": 0.02734375, "learning_rate": 0.00013076872903946806, "loss": 1.3258, "step": 460 }, { "epoch": 0.46537811972227433, "grad_norm": 0.02490234375, "learning_rate": 0.0001291016107145483, "loss": 1.3441, "step": 465 }, { "epoch": 0.4703821855257397, "grad_norm": 0.02490234375, "learning_rate": 0.00012742560802552912, "loss": 1.3255, "step": 470 }, { "epoch": 0.475386251329205, "grad_norm": 0.0272216796875, "learning_rate": 0.00012574123263543388, "loss": 1.3483, "step": 475 }, { "epoch": 0.4803903171326703, "grad_norm": 0.025146484375, "learning_rate": 0.0001240489987633686, "loss": 1.3465, "step": 480 }, { "epoch": 0.4853943829361356, "grad_norm": 0.0247802734375, "learning_rate": 0.0001223494230275372, "loss": 1.3419, "step": 485 }, { "epoch": 0.4903984487396009, "grad_norm": 0.0244140625, "learning_rate": 0.0001206430242875246, "loss": 1.3168, "step": 490 }, { "epoch": 0.4954025145430662, "grad_norm": 0.0247802734375, "learning_rate": 0.00011893032348589478, "loss": 1.3379, "step": 495 }, { "epoch": 0.5004065803465315, "grad_norm": 0.025390625, "learning_rate": 0.00011721184348915384, "loss": 1.3195, "step": 500 }, { "epoch": 0.5054106461499969, "grad_norm": 0.024169921875, "learning_rate": 0.00011548810892812505, "loss": 1.3169, "step": 505 }, { "epoch": 0.5104147119534622, "grad_norm": 0.0245361328125, "learning_rate": 0.00011375964603778561, "loss": 1.3208, "step": 510 }, { "epoch": 0.5154187777569275, "grad_norm": 0.025390625, "learning_rate": 0.00011202698249661364, "loss": 1.3139, "step": 515 }, { "epoch": 0.5204228435603928, "grad_norm": 0.0238037109375, "learning_rate": 0.00011029064726549412, "loss": 1.3217, "step": 520 }, { "epoch": 0.5254269093638582, "grad_norm": 0.025634765625, "learning_rate": 0.000108551170426234, "loss": 1.3115, "step": 525 }, { "epoch": 0.5304309751673234, "grad_norm": 0.0272216796875, "learning_rate": 0.0001068090830197346, "loss": 1.3709, "step": 530 }, { "epoch": 0.5354350409707888, "grad_norm": 0.0257568359375, "learning_rate": 0.00010506491688387127, "loss": 1.3332, "step": 535 }, { "epoch": 0.5404391067742541, "grad_norm": 0.025634765625, "learning_rate": 0.00010331920449112991, "loss": 1.3496, "step": 540 }, { "epoch": 0.5454431725777193, "grad_norm": 0.0260009765625, "learning_rate": 0.00010157247878604961, "loss": 1.3611, "step": 545 }, { "epoch": 0.5504472383811847, "grad_norm": 0.024658203125, "learning_rate": 9.982527302252135e-05, "loss": 1.3502, "step": 550 }, { "epoch": 0.55545130418465, "grad_norm": 0.0235595703125, "learning_rate": 9.807812060099191e-05, "loss": 1.3278, "step": 555 }, { "epoch": 0.5604553699881153, "grad_norm": 0.0244140625, "learning_rate": 9.633155490562358e-05, "loss": 1.3605, "step": 560 }, { "epoch": 0.5654594357915806, "grad_norm": 0.0235595703125, "learning_rate": 9.458610914145826e-05, "loss": 1.3379, "step": 565 }, { "epoch": 0.570463501595046, "grad_norm": 0.0257568359375, "learning_rate": 9.284231617163666e-05, "loss": 1.3085, "step": 570 }, { "epoch": 0.5754675673985112, "grad_norm": 0.0252685546875, "learning_rate": 9.11007083547216e-05, "loss": 1.3415, "step": 575 }, { "epoch": 0.5804716332019766, "grad_norm": 0.0252685546875, "learning_rate": 8.936181738217571e-05, "loss": 1.3251, "step": 580 }, { "epoch": 0.5854756990054419, "grad_norm": 0.0262451171875, "learning_rate": 8.762617411604235e-05, "loss": 1.3519, "step": 585 }, { "epoch": 0.5904797648089073, "grad_norm": 0.0252685546875, "learning_rate": 8.589430842688001e-05, "loss": 1.3148, "step": 590 }, { "epoch": 0.5954838306123725, "grad_norm": 0.02685546875, "learning_rate": 8.41667490319994e-05, "loss": 1.3169, "step": 595 }, { "epoch": 0.6004878964158379, "grad_norm": 0.024658203125, "learning_rate": 8.244402333405252e-05, "loss": 1.3561, "step": 600 }, { "epoch": 0.6054919622193032, "grad_norm": 0.025146484375, "learning_rate": 8.0726657260023e-05, "loss": 1.3059, "step": 605 }, { "epoch": 0.6104960280227685, "grad_norm": 0.0245361328125, "learning_rate": 7.901517510066724e-05, "loss": 1.329, "step": 610 }, { "epoch": 0.6155000938262338, "grad_norm": 0.0250244140625, "learning_rate": 7.73100993504548e-05, "loss": 1.3412, "step": 615 }, { "epoch": 0.6205041596296992, "grad_norm": 0.025146484375, "learning_rate": 7.561195054805729e-05, "loss": 1.3447, "step": 620 }, { "epoch": 0.6255082254331644, "grad_norm": 0.02392578125, "learning_rate": 7.392124711743422e-05, "loss": 1.3445, "step": 625 }, { "epoch": 0.6305122912366298, "grad_norm": 0.025634765625, "learning_rate": 7.223850520956457e-05, "loss": 1.3078, "step": 630 }, { "epoch": 0.6355163570400951, "grad_norm": 0.026123046875, "learning_rate": 7.056423854487236e-05, "loss": 1.3427, "step": 635 }, { "epoch": 0.6405204228435604, "grad_norm": 0.0244140625, "learning_rate": 6.889895825639401e-05, "loss": 1.3364, "step": 640 }, { "epoch": 0.6455244886470257, "grad_norm": 0.0263671875, "learning_rate": 6.724317273373563e-05, "loss": 1.3555, "step": 645 }, { "epoch": 0.6505285544504911, "grad_norm": 0.02587890625, "learning_rate": 6.55973874678682e-05, "loss": 1.3542, "step": 650 }, { "epoch": 0.6555326202539563, "grad_norm": 0.025390625, "learning_rate": 6.396210489680699e-05, "loss": 1.3421, "step": 655 }, { "epoch": 0.6605366860574217, "grad_norm": 0.0240478515625, "learning_rate": 6.23378242522237e-05, "loss": 1.3395, "step": 660 }, { "epoch": 0.665540751860887, "grad_norm": 0.02490234375, "learning_rate": 6.072504140703714e-05, "loss": 1.3291, "step": 665 }, { "epoch": 0.6705448176643523, "grad_norm": 0.0255126953125, "learning_rate": 5.912424872402927e-05, "loss": 1.33, "step": 670 }, { "epoch": 0.6755488834678176, "grad_norm": 0.0257568359375, "learning_rate": 5.7535934905532816e-05, "loss": 1.3547, "step": 675 }, { "epoch": 0.680552949271283, "grad_norm": 0.02490234375, "learning_rate": 5.596058484423656e-05, "loss": 1.344, "step": 680 }, { "epoch": 0.6855570150747482, "grad_norm": 0.025390625, "learning_rate": 5.43986794751536e-05, "loss": 1.3617, "step": 685 }, { "epoch": 0.6905610808782136, "grad_norm": 0.023193359375, "learning_rate": 5.285069562879758e-05, "loss": 1.3275, "step": 690 }, { "epoch": 0.6955651466816789, "grad_norm": 0.0245361328125, "learning_rate": 5.1317105885612524e-05, "loss": 1.3459, "step": 695 }, { "epoch": 0.7005692124851441, "grad_norm": 0.026123046875, "learning_rate": 4.9798378431699585e-05, "loss": 1.3345, "step": 700 }, { "epoch": 0.7055732782886095, "grad_norm": 0.0260009765625, "learning_rate": 4.829497691588557e-05, "loss": 1.3208, "step": 705 }, { "epoch": 0.7105773440920748, "grad_norm": 0.02587890625, "learning_rate": 4.680736030817687e-05, "loss": 1.3546, "step": 710 }, { "epoch": 0.7155814098955401, "grad_norm": 0.0240478515625, "learning_rate": 4.533598275964139e-05, "loss": 1.326, "step": 715 }, { "epoch": 0.7205854756990054, "grad_norm": 0.0250244140625, "learning_rate": 4.388129346376178e-05, "loss": 1.3337, "step": 720 }, { "epoch": 0.7255895415024708, "grad_norm": 0.025390625, "learning_rate": 4.2443736519302314e-05, "loss": 1.3264, "step": 725 }, { "epoch": 0.730593607305936, "grad_norm": 0.0250244140625, "learning_rate": 4.102375079473087e-05, "loss": 1.3214, "step": 730 }, { "epoch": 0.7355976731094014, "grad_norm": 0.0255126953125, "learning_rate": 3.9621769794237894e-05, "loss": 1.3318, "step": 735 }, { "epoch": 0.7406017389128667, "grad_norm": 0.0247802734375, "learning_rate": 3.823822152539286e-05, "loss": 1.3327, "step": 740 }, { "epoch": 0.745605804716332, "grad_norm": 0.025146484375, "learning_rate": 3.687352836847874e-05, "loss": 1.3486, "step": 745 }, { "epoch": 0.7506098705197973, "grad_norm": 0.0252685546875, "learning_rate": 3.552810694754463e-05, "loss": 1.329, "step": 750 }, { "epoch": 0.7556139363232627, "grad_norm": 0.025634765625, "learning_rate": 3.42023680032154e-05, "loss": 1.3553, "step": 755 }, { "epoch": 0.7606180021267279, "grad_norm": 0.0240478515625, "learning_rate": 3.289671626729772e-05, "loss": 1.3087, "step": 760 }, { "epoch": 0.7656220679301933, "grad_norm": 0.0244140625, "learning_rate": 3.161155033922045e-05, "loss": 1.3299, "step": 765 }, { "epoch": 0.7706261337336586, "grad_norm": 0.02392578125, "learning_rate": 3.0347262564347057e-05, "loss": 1.3156, "step": 770 }, { "epoch": 0.7756301995371239, "grad_norm": 0.02587890625, "learning_rate": 2.9104238914197445e-05, "loss": 1.3171, "step": 775 }, { "epoch": 0.7806342653405892, "grad_norm": 0.02392578125, "learning_rate": 2.7882858868615467e-05, "loss": 1.3415, "step": 780 }, { "epoch": 0.7856383311440546, "grad_norm": 0.023193359375, "learning_rate": 2.6683495299918648e-05, "loss": 1.3576, "step": 785 }, { "epoch": 0.7906423969475198, "grad_norm": 0.025146484375, "learning_rate": 2.550651435906456e-05, "loss": 1.309, "step": 790 }, { "epoch": 0.7956464627509852, "grad_norm": 0.0272216796875, "learning_rate": 2.435227536386967e-05, "loss": 1.3299, "step": 795 }, { "epoch": 0.8006505285544505, "grad_norm": 0.0240478515625, "learning_rate": 2.3221130689313907e-05, "loss": 1.3354, "step": 800 }, { "epoch": 0.8056545943579158, "grad_norm": 0.023681640625, "learning_rate": 2.211342565996487e-05, "loss": 1.3262, "step": 805 }, { "epoch": 0.8106586601613811, "grad_norm": 0.025390625, "learning_rate": 2.1029498444554618e-05, "loss": 1.3339, "step": 810 }, { "epoch": 0.8156627259648465, "grad_norm": 0.02490234375, "learning_rate": 1.9969679952740805e-05, "loss": 1.3516, "step": 815 }, { "epoch": 0.8206667917683117, "grad_norm": 0.0234375, "learning_rate": 1.893429373408411e-05, "loss": 1.3399, "step": 820 }, { "epoch": 0.825670857571777, "grad_norm": 0.0242919921875, "learning_rate": 1.7923655879272393e-05, "loss": 1.3149, "step": 825 }, { "epoch": 0.8306749233752424, "grad_norm": 0.0242919921875, "learning_rate": 1.6938074923622227e-05, "loss": 1.3292, "step": 830 }, { "epoch": 0.8356789891787078, "grad_norm": 0.0242919921875, "learning_rate": 1.597785175288683e-05, "loss": 1.3325, "step": 835 }, { "epoch": 0.840683054982173, "grad_norm": 0.0245361328125, "learning_rate": 1.5043279511399333e-05, "loss": 1.3544, "step": 840 }, { "epoch": 0.8456871207856383, "grad_norm": 0.0245361328125, "learning_rate": 1.4134643512579382e-05, "loss": 1.3601, "step": 845 }, { "epoch": 0.8506911865891037, "grad_norm": 0.0238037109375, "learning_rate": 1.3252221151830513e-05, "loss": 1.3447, "step": 850 }, { "epoch": 0.8556952523925689, "grad_norm": 0.0238037109375, "learning_rate": 1.2396281821854683e-05, "loss": 1.3296, "step": 855 }, { "epoch": 0.8606993181960343, "grad_norm": 0.024169921875, "learning_rate": 1.156708683041008e-05, "loss": 1.3187, "step": 860 }, { "epoch": 0.8657033839994996, "grad_norm": 0.024169921875, "learning_rate": 1.0764889320536931e-05, "loss": 1.3136, "step": 865 }, { "epoch": 0.8707074498029649, "grad_norm": 0.023193359375, "learning_rate": 9.989934193276219e-06, "loss": 1.323, "step": 870 }, { "epoch": 0.8757115156064302, "grad_norm": 0.024658203125, "learning_rate": 9.242458032904311e-06, "loss": 1.3221, "step": 875 }, { "epoch": 0.8807155814098956, "grad_norm": 0.02587890625, "learning_rate": 8.52268903470661e-06, "loss": 1.3413, "step": 880 }, { "epoch": 0.8857196472133608, "grad_norm": 0.0242919921875, "learning_rate": 7.830846935312509e-06, "loss": 1.3342, "step": 885 }, { "epoch": 0.8907237130168262, "grad_norm": 0.0245361328125, "learning_rate": 7.167142945612393e-06, "loss": 1.3265, "step": 890 }, { "epoch": 0.8957277788202915, "grad_norm": 0.0238037109375, "learning_rate": 6.531779686277528e-06, "loss": 1.3387, "step": 895 }, { "epoch": 0.9007318446237568, "grad_norm": 0.02392578125, "learning_rate": 5.924951125902545e-06, "loss": 1.3167, "step": 900 }, { "epoch": 0.9057359104272221, "grad_norm": 0.0235595703125, "learning_rate": 5.346842521789141e-06, "loss": 1.3465, "step": 905 }, { "epoch": 0.9107399762306875, "grad_norm": 0.0244140625, "learning_rate": 4.7976303633893384e-06, "loss": 1.3396, "step": 910 }, { "epoch": 0.9157440420341527, "grad_norm": 0.0250244140625, "learning_rate": 4.277482318425408e-06, "loss": 1.3569, "step": 915 }, { "epoch": 0.9207481078376181, "grad_norm": 0.025390625, "learning_rate": 3.7865571817029877e-06, "loss": 1.3265, "step": 920 }, { "epoch": 0.9257521736410834, "grad_norm": 0.025390625, "learning_rate": 3.3250048266329825e-06, "loss": 1.3392, "step": 925 }, { "epoch": 0.9307562394445487, "grad_norm": 0.024658203125, "learning_rate": 2.8929661594770174e-06, "loss": 1.3597, "step": 930 }, { "epoch": 0.935760305248014, "grad_norm": 0.0257568359375, "learning_rate": 2.4905730763305047e-06, "loss": 1.3329, "step": 935 }, { "epoch": 0.9407643710514794, "grad_norm": 0.025146484375, "learning_rate": 2.1179484228564305e-06, "loss": 1.3497, "step": 940 }, { "epoch": 0.9457684368549446, "grad_norm": 0.02392578125, "learning_rate": 1.7752059567820333e-06, "loss": 1.3367, "step": 945 }, { "epoch": 0.95077250265841, "grad_norm": 0.0245361328125, "learning_rate": 1.4624503131699828e-06, "loss": 1.3459, "step": 950 }, { "epoch": 0.9557765684618753, "grad_norm": 0.02490234375, "learning_rate": 1.1797769724745888e-06, "loss": 1.3326, "step": 955 }, { "epoch": 0.9607806342653405, "grad_norm": 0.024169921875, "learning_rate": 9.272722313927617e-07, "loss": 1.3487, "step": 960 }, { "epoch": 0.9657847000688059, "grad_norm": 0.0247802734375, "learning_rate": 7.05013176518754e-07, "loss": 1.3647, "step": 965 }, { "epoch": 0.9707887658722713, "grad_norm": 0.0233154296875, "learning_rate": 5.130676608104845e-07, "loss": 1.3482, "step": 970 }, { "epoch": 0.9757928316757365, "grad_norm": 0.0247802734375, "learning_rate": 3.5149428287495343e-07, "loss": 1.3423, "step": 975 }, { "epoch": 0.9807968974792018, "grad_norm": 0.02587890625, "learning_rate": 2.2034236907874094e-07, "loss": 1.3267, "step": 980 }, { "epoch": 0.9858009632826672, "grad_norm": 0.0238037109375, "learning_rate": 1.1965195848929745e-07, "loss": 1.3185, "step": 985 }, { "epoch": 0.9908050290861324, "grad_norm": 0.02392578125, "learning_rate": 4.945379065152134e-08, "loss": 1.3129, "step": 990 }, { "epoch": 0.9958090948895978, "grad_norm": 0.024658203125, "learning_rate": 9.769296203332446e-09, "loss": 1.3423, "step": 995 }, { "epoch": 0.99981234753237, "eval_loss": 1.343445897102356, "eval_runtime": 1960.9798, "eval_samples_per_second": 7.216, "eval_steps_per_second": 7.216, "step": 999 }, { "epoch": 0.99981234753237, "step": 999, "total_flos": 2.0711686751461048e+18, "train_loss": 0.052138939037456644, "train_runtime": 4093.6241, "train_samples_per_second": 31.243, "train_steps_per_second": 0.244 } ], "logging_steps": 5, "max_steps": 999, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 2.0711686751461048e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }