{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.954337899543379, "eval_steps": 500, "global_step": 1090, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0091324200913242, "grad_norm": 3.1570751667022705, "learning_rate": 1.8348623853211011e-06, "loss": 2.2349, "step": 1 }, { "epoch": 0.045662100456621, "grad_norm": 3.327242374420166, "learning_rate": 9.174311926605506e-06, "loss": 2.2947, "step": 5 }, { "epoch": 0.091324200913242, "grad_norm": 3.6639091968536377, "learning_rate": 1.834862385321101e-05, "loss": 2.1754, "step": 10 }, { "epoch": 0.136986301369863, "grad_norm": 6.006739616394043, "learning_rate": 2.7522935779816515e-05, "loss": 1.8388, "step": 15 }, { "epoch": 0.182648401826484, "grad_norm": 1.5798718929290771, "learning_rate": 3.669724770642202e-05, "loss": 1.4696, "step": 20 }, { "epoch": 0.228310502283105, "grad_norm": 1.038404107093811, "learning_rate": 4.587155963302753e-05, "loss": 1.2583, "step": 25 }, { "epoch": 0.273972602739726, "grad_norm": 0.5968286395072937, "learning_rate": 5.504587155963303e-05, "loss": 1.1721, "step": 30 }, { "epoch": 0.319634703196347, "grad_norm": 0.42477676272392273, "learning_rate": 6.422018348623854e-05, "loss": 1.0966, "step": 35 }, { "epoch": 0.365296803652968, "grad_norm": 0.46538373827934265, "learning_rate": 7.339449541284404e-05, "loss": 1.0378, "step": 40 }, { "epoch": 0.410958904109589, "grad_norm": 0.32725027203559875, "learning_rate": 8.256880733944955e-05, "loss": 0.996, "step": 45 }, { "epoch": 0.45662100456621, "grad_norm": 0.27102047204971313, "learning_rate": 9.174311926605506e-05, "loss": 0.9549, "step": 50 }, { "epoch": 0.502283105022831, "grad_norm": 0.20055657625198364, "learning_rate": 0.00010091743119266055, "loss": 0.9426, "step": 55 }, { "epoch": 0.547945205479452, "grad_norm": 0.24150966107845306, "learning_rate": 0.00011009174311926606, "loss": 0.918, "step": 60 }, { "epoch": 0.593607305936073, "grad_norm": 0.20545370876789093, "learning_rate": 0.00011926605504587157, "loss": 0.8995, "step": 65 }, { "epoch": 0.639269406392694, "grad_norm": 0.24165408313274384, "learning_rate": 0.00012844036697247707, "loss": 0.8851, "step": 70 }, { "epoch": 0.684931506849315, "grad_norm": 0.22626952826976776, "learning_rate": 0.00013761467889908258, "loss": 0.8729, "step": 75 }, { "epoch": 0.730593607305936, "grad_norm": 0.2153974175453186, "learning_rate": 0.0001467889908256881, "loss": 0.8588, "step": 80 }, { "epoch": 0.776255707762557, "grad_norm": 0.22552534937858582, "learning_rate": 0.0001559633027522936, "loss": 0.8453, "step": 85 }, { "epoch": 0.821917808219178, "grad_norm": 0.2299458086490631, "learning_rate": 0.0001651376146788991, "loss": 0.8491, "step": 90 }, { "epoch": 0.867579908675799, "grad_norm": 0.38727596402168274, "learning_rate": 0.00017431192660550458, "loss": 0.839, "step": 95 }, { "epoch": 0.91324200913242, "grad_norm": 0.5172630548477173, "learning_rate": 0.00018348623853211012, "loss": 0.8359, "step": 100 }, { "epoch": 0.958904109589041, "grad_norm": 0.3504508137702942, "learning_rate": 0.0001926605504587156, "loss": 0.8176, "step": 105 }, { "epoch": 0.9954337899543378, "eval_loss": 2.11497163772583, "eval_runtime": 0.7274, "eval_samples_per_second": 13.748, "eval_steps_per_second": 1.375, "step": 109 }, { "epoch": 1.004566210045662, "grad_norm": 0.21483348309993744, "learning_rate": 0.00019999948721966259, "loss": 0.812, "step": 110 }, { "epoch": 1.0502283105022832, "grad_norm": 0.25326865911483765, "learning_rate": 0.00019998154046002822, "loss": 0.8028, "step": 115 }, { "epoch": 1.095890410958904, "grad_norm": 0.23645517230033875, "learning_rate": 0.0001999379599421534, "loss": 0.8003, "step": 120 }, { "epoch": 1.1415525114155252, "grad_norm": 0.22504329681396484, "learning_rate": 0.00019986875683942535, "loss": 0.7939, "step": 125 }, { "epoch": 1.187214611872146, "grad_norm": 0.2312326580286026, "learning_rate": 0.00019977394889447524, "loss": 0.7847, "step": 130 }, { "epoch": 1.2328767123287672, "grad_norm": 0.21880114078521729, "learning_rate": 0.00019965356041462955, "loss": 0.7856, "step": 135 }, { "epoch": 1.278538812785388, "grad_norm": 0.19915428757667542, "learning_rate": 0.00019950762226567781, "loss": 0.7772, "step": 140 }, { "epoch": 1.3242009132420092, "grad_norm": 0.21661226451396942, "learning_rate": 0.00019933617186395917, "loss": 0.7698, "step": 145 }, { "epoch": 1.36986301369863, "grad_norm": 0.23418781161308289, "learning_rate": 0.00019913925316676945, "loss": 0.771, "step": 150 }, { "epoch": 1.4155251141552512, "grad_norm": 0.2712844908237457, "learning_rate": 0.00019891691666109113, "loss": 0.7754, "step": 155 }, { "epoch": 1.461187214611872, "grad_norm": 0.20658765733242035, "learning_rate": 0.00019866921935064906, "loss": 0.772, "step": 160 }, { "epoch": 1.5068493150684932, "grad_norm": 0.2858370542526245, "learning_rate": 0.00019839622474129596, "loss": 0.7648, "step": 165 }, { "epoch": 1.5525114155251143, "grad_norm": 0.2993509769439697, "learning_rate": 0.00019809800282473013, "loss": 0.7553, "step": 170 }, { "epoch": 1.5981735159817352, "grad_norm": 0.19971656799316406, "learning_rate": 0.0001977746300605507, "loss": 0.7591, "step": 175 }, { "epoch": 1.643835616438356, "grad_norm": 0.2134828269481659, "learning_rate": 0.00019742618935665476, "loss": 0.7549, "step": 180 }, { "epoch": 1.6894977168949772, "grad_norm": 0.18528909981250763, "learning_rate": 0.00019705277004798073, "loss": 0.7511, "step": 185 }, { "epoch": 1.7351598173515983, "grad_norm": 0.22641536593437195, "learning_rate": 0.0001966544678736044, "loss": 0.754, "step": 190 }, { "epoch": 1.7808219178082192, "grad_norm": 0.23213805258274078, "learning_rate": 0.00019623138495219292, "loss": 0.7476, "step": 195 }, { "epoch": 1.82648401826484, "grad_norm": 0.28454649448394775, "learning_rate": 0.00019578362975582292, "loss": 0.7535, "step": 200 }, { "epoch": 1.8721461187214612, "grad_norm": 0.3238324224948883, "learning_rate": 0.00019531131708217005, "loss": 0.7489, "step": 205 }, { "epoch": 1.9178082191780823, "grad_norm": 0.23858831822872162, "learning_rate": 0.0001948145680250766, "loss": 0.7514, "step": 210 }, { "epoch": 1.9634703196347032, "grad_norm": 0.21854574978351593, "learning_rate": 0.00019429350994350483, "loss": 0.7464, "step": 215 }, { "epoch": 2.0, "eval_loss": 2.1313483715057373, "eval_runtime": 0.7223, "eval_samples_per_second": 13.845, "eval_steps_per_second": 1.384, "step": 219 }, { "epoch": 2.009132420091324, "grad_norm": 0.24195519089698792, "learning_rate": 0.00019374827642888398, "loss": 0.7367, "step": 220 }, { "epoch": 2.0547945205479454, "grad_norm": 0.2783088684082031, "learning_rate": 0.0001931790072708596, "loss": 0.7283, "step": 225 }, { "epoch": 2.1004566210045663, "grad_norm": 0.25543445348739624, "learning_rate": 0.00019258584842145343, "loss": 0.7298, "step": 230 }, { "epoch": 2.146118721461187, "grad_norm": 0.23951423168182373, "learning_rate": 0.00019196895195764362, "loss": 0.7296, "step": 235 }, { "epoch": 2.191780821917808, "grad_norm": 0.2517695426940918, "learning_rate": 0.0001913284760423745, "loss": 0.719, "step": 240 }, { "epoch": 2.237442922374429, "grad_norm": 0.22162802517414093, "learning_rate": 0.00019066458488400584, "loss": 0.7252, "step": 245 }, { "epoch": 2.2831050228310503, "grad_norm": 0.2749063968658447, "learning_rate": 0.00018997744869421246, "loss": 0.7253, "step": 250 }, { "epoch": 2.328767123287671, "grad_norm": 0.2638462483882904, "learning_rate": 0.00018926724364434446, "loss": 0.718, "step": 255 }, { "epoch": 2.374429223744292, "grad_norm": 0.23074951767921448, "learning_rate": 0.0001885341518202595, "loss": 0.7203, "step": 260 }, { "epoch": 2.4200913242009134, "grad_norm": 0.18681403994560242, "learning_rate": 0.00018777836117563892, "loss": 0.7253, "step": 265 }, { "epoch": 2.4657534246575343, "grad_norm": 0.19247153401374817, "learning_rate": 0.00018700006548379898, "loss": 0.7175, "step": 270 }, { "epoch": 2.5114155251141552, "grad_norm": 0.18738825619220734, "learning_rate": 0.0001861994642880105, "loss": 0.7206, "step": 275 }, { "epoch": 2.557077625570776, "grad_norm": 0.2461051046848297, "learning_rate": 0.00018537676285033887, "loss": 0.7134, "step": 280 }, { "epoch": 2.602739726027397, "grad_norm": 0.31902387738227844, "learning_rate": 0.0001845321720990181, "loss": 0.7135, "step": 285 }, { "epoch": 2.6484018264840183, "grad_norm": 0.19439548254013062, "learning_rate": 0.00018366590857437184, "loss": 0.7194, "step": 290 }, { "epoch": 2.6940639269406392, "grad_norm": 0.18397387862205505, "learning_rate": 0.00018277819437329576, "loss": 0.7172, "step": 295 }, { "epoch": 2.73972602739726, "grad_norm": 0.1935635656118393, "learning_rate": 0.00018186925709231532, "loss": 0.7188, "step": 300 }, { "epoch": 2.7853881278538815, "grad_norm": 0.20888376235961914, "learning_rate": 0.0001809393297692334, "loss": 0.71, "step": 305 }, { "epoch": 2.8310502283105023, "grad_norm": 0.20157787203788757, "learning_rate": 0.0001799886508233829, "loss": 0.7152, "step": 310 }, { "epoch": 2.8767123287671232, "grad_norm": 0.201010599732399, "learning_rate": 0.0001790174639944997, "loss": 0.7147, "step": 315 }, { "epoch": 2.922374429223744, "grad_norm": 0.20877932012081146, "learning_rate": 0.00017802601828023138, "loss": 0.7101, "step": 320 }, { "epoch": 2.968036529680365, "grad_norm": 0.19995956122875214, "learning_rate": 0.00017701456787229804, "loss": 0.7128, "step": 325 }, { "epoch": 2.9954337899543377, "eval_loss": 2.144402027130127, "eval_runtime": 0.7303, "eval_samples_per_second": 13.694, "eval_steps_per_second": 1.369, "step": 328 }, { "epoch": 3.0136986301369864, "grad_norm": 0.18146736919879913, "learning_rate": 0.0001759833720913214, "loss": 0.7071, "step": 330 }, { "epoch": 3.0593607305936072, "grad_norm": 0.20878320932388306, "learning_rate": 0.00017493269532033883, "loss": 0.6943, "step": 335 }, { "epoch": 3.105022831050228, "grad_norm": 0.24599188566207886, "learning_rate": 0.0001738628069370195, "loss": 0.6968, "step": 340 }, { "epoch": 3.1506849315068495, "grad_norm": 0.22413116693496704, "learning_rate": 0.00017277398124460023, "loss": 0.6949, "step": 345 }, { "epoch": 3.1963470319634704, "grad_norm": 0.22311843931674957, "learning_rate": 0.000171666497401558, "loss": 0.6988, "step": 350 }, { "epoch": 3.2420091324200913, "grad_norm": 0.19590473175048828, "learning_rate": 0.0001705406393500381, "loss": 0.692, "step": 355 }, { "epoch": 3.287671232876712, "grad_norm": 0.18890812993049622, "learning_rate": 0.00016939669574305566, "loss": 0.6923, "step": 360 }, { "epoch": 3.3333333333333335, "grad_norm": 0.19763173162937164, "learning_rate": 0.0001682349598704892, "loss": 0.6899, "step": 365 }, { "epoch": 3.3789954337899544, "grad_norm": 0.21059083938598633, "learning_rate": 0.00016705572958388576, "loss": 0.689, "step": 370 }, { "epoch": 3.4246575342465753, "grad_norm": 0.2090144008398056, "learning_rate": 0.00016585930722009601, "loss": 0.6909, "step": 375 }, { "epoch": 3.470319634703196, "grad_norm": 0.1937251091003418, "learning_rate": 0.00016464599952375998, "loss": 0.6943, "step": 380 }, { "epoch": 3.5159817351598175, "grad_norm": 0.19087150692939758, "learning_rate": 0.000163416117568662, "loss": 0.6936, "step": 385 }, { "epoch": 3.5616438356164384, "grad_norm": 0.21907366812229156, "learning_rate": 0.0001621699766779763, "loss": 0.6928, "step": 390 }, { "epoch": 3.6073059360730593, "grad_norm": 0.1987326741218567, "learning_rate": 0.00016090789634342278, "loss": 0.6913, "step": 395 }, { "epoch": 3.65296803652968, "grad_norm": 0.19414757192134857, "learning_rate": 0.00015963020014335438, "loss": 0.6873, "step": 400 }, { "epoch": 3.6986301369863015, "grad_norm": 0.19000251591205597, "learning_rate": 0.0001583372156597961, "loss": 0.6895, "step": 405 }, { "epoch": 3.7442922374429224, "grad_norm": 0.17563720047473907, "learning_rate": 0.00015702927439445826, "loss": 0.6905, "step": 410 }, { "epoch": 3.7899543378995433, "grad_norm": 0.22648802399635315, "learning_rate": 0.00015570671168374438, "loss": 0.685, "step": 415 }, { "epoch": 3.8356164383561646, "grad_norm": 0.1822802722454071, "learning_rate": 0.00015436986661277577, "loss": 0.6897, "step": 420 }, { "epoch": 3.8812785388127855, "grad_norm": 0.18406091630458832, "learning_rate": 0.0001530190819284555, "loss": 0.6849, "step": 425 }, { "epoch": 3.9269406392694064, "grad_norm": 0.1968332827091217, "learning_rate": 0.00015165470395159313, "loss": 0.6841, "step": 430 }, { "epoch": 3.9726027397260273, "grad_norm": 0.2520104646682739, "learning_rate": 0.0001502770824881133, "loss": 0.6924, "step": 435 }, { "epoch": 4.0, "eval_loss": 2.1630699634552, "eval_runtime": 0.7164, "eval_samples_per_second": 13.958, "eval_steps_per_second": 1.396, "step": 438 }, { "epoch": 4.018264840182648, "grad_norm": 0.20841118693351746, "learning_rate": 0.00014888657073937076, "loss": 0.6786, "step": 440 }, { "epoch": 4.063926940639269, "grad_norm": 0.2335735559463501, "learning_rate": 0.00014748352521159493, "loss": 0.6687, "step": 445 }, { "epoch": 4.109589041095891, "grad_norm": 0.23169922828674316, "learning_rate": 0.0001460683056244869, "loss": 0.6701, "step": 450 }, { "epoch": 4.155251141552512, "grad_norm": 0.20742633938789368, "learning_rate": 0.00014464127481899312, "loss": 0.6678, "step": 455 }, { "epoch": 4.200913242009133, "grad_norm": 0.233662411570549, "learning_rate": 0.00014320279866427796, "loss": 0.669, "step": 460 }, { "epoch": 4.2465753424657535, "grad_norm": 0.22496378421783447, "learning_rate": 0.00014175324596392075, "loss": 0.6695, "step": 465 }, { "epoch": 4.292237442922374, "grad_norm": 0.1961798071861267, "learning_rate": 0.00014029298836135988, "loss": 0.6694, "step": 470 }, { "epoch": 4.337899543378995, "grad_norm": 0.21112968027591705, "learning_rate": 0.00013882240024460927, "loss": 0.6758, "step": 475 }, { "epoch": 4.383561643835616, "grad_norm": 0.21480615437030792, "learning_rate": 0.0001373418586502706, "loss": 0.6685, "step": 480 }, { "epoch": 4.429223744292237, "grad_norm": 0.24116870760917664, "learning_rate": 0.0001358517431668672, "loss": 0.6686, "step": 485 }, { "epoch": 4.474885844748858, "grad_norm": 0.20268571376800537, "learning_rate": 0.00013435243583752294, "loss": 0.6642, "step": 490 }, { "epoch": 4.52054794520548, "grad_norm": 0.19283969700336456, "learning_rate": 0.00013284432106201233, "loss": 0.6729, "step": 495 }, { "epoch": 4.566210045662101, "grad_norm": 0.18919415771961212, "learning_rate": 0.00013132778549820618, "loss": 0.6713, "step": 500 }, { "epoch": 4.6118721461187215, "grad_norm": 0.19388696551322937, "learning_rate": 0.00012980321796293836, "loss": 0.6698, "step": 505 }, { "epoch": 4.657534246575342, "grad_norm": 0.19509977102279663, "learning_rate": 0.00012827100933231905, "loss": 0.6732, "step": 510 }, { "epoch": 4.703196347031963, "grad_norm": 0.19472207129001617, "learning_rate": 0.00012673155244151985, "loss": 0.6689, "step": 515 }, { "epoch": 4.748858447488584, "grad_norm": 0.1974049210548401, "learning_rate": 0.000125185241984057, "loss": 0.6661, "step": 520 }, { "epoch": 4.794520547945205, "grad_norm": 0.18362760543823242, "learning_rate": 0.00012363247441059776, "loss": 0.6705, "step": 525 }, { "epoch": 4.840182648401827, "grad_norm": 0.1863856315612793, "learning_rate": 0.00012207364782731655, "loss": 0.6663, "step": 530 }, { "epoch": 4.885844748858448, "grad_norm": 0.19237390160560608, "learning_rate": 0.00012050916189382646, "loss": 0.6701, "step": 535 }, { "epoch": 4.931506849315069, "grad_norm": 0.1915617734193802, "learning_rate": 0.00011893941772071249, "loss": 0.6781, "step": 540 }, { "epoch": 4.9771689497716896, "grad_norm": 0.18157874047756195, "learning_rate": 0.00011736481776669306, "loss": 0.6777, "step": 545 }, { "epoch": 4.995433789954338, "eval_loss": 2.182265520095825, "eval_runtime": 0.7269, "eval_samples_per_second": 13.757, "eval_steps_per_second": 1.376, "step": 547 }, { "epoch": 5.0228310502283104, "grad_norm": 0.19536516070365906, "learning_rate": 0.0001157857657354354, "loss": 0.6657, "step": 550 }, { "epoch": 5.068493150684931, "grad_norm": 0.22191356122493744, "learning_rate": 0.00011420266647205231, "loss": 0.6547, "step": 555 }, { "epoch": 5.114155251141552, "grad_norm": 0.2473655790090561, "learning_rate": 0.00011261592585930576, "loss": 0.649, "step": 560 }, { "epoch": 5.159817351598173, "grad_norm": 0.21013621985912323, "learning_rate": 0.00011102595071354472, "loss": 0.6493, "step": 565 }, { "epoch": 5.205479452054795, "grad_norm": 0.2081068456172943, "learning_rate": 0.00010943314868040364, "loss": 0.647, "step": 570 }, { "epoch": 5.251141552511416, "grad_norm": 0.23985308408737183, "learning_rate": 0.00010783792813028827, "loss": 0.6505, "step": 575 }, { "epoch": 5.296803652968037, "grad_norm": 0.19800134003162384, "learning_rate": 0.00010624069805367559, "loss": 0.6537, "step": 580 }, { "epoch": 5.342465753424658, "grad_norm": 0.21391679346561432, "learning_rate": 0.00010464186795625482, "loss": 0.6558, "step": 585 }, { "epoch": 5.3881278538812785, "grad_norm": 0.205993190407753, "learning_rate": 0.00010304184775393642, "loss": 0.6578, "step": 590 }, { "epoch": 5.433789954337899, "grad_norm": 0.20510774850845337, "learning_rate": 0.00010144104766775572, "loss": 0.6565, "step": 595 }, { "epoch": 5.47945205479452, "grad_norm": 0.1970670372247696, "learning_rate": 9.983987811869862e-05, "loss": 0.6515, "step": 600 }, { "epoch": 5.525114155251142, "grad_norm": 0.18836742639541626, "learning_rate": 9.823874962247564e-05, "loss": 0.6557, "step": 605 }, { "epoch": 5.570776255707763, "grad_norm": 0.20027703046798706, "learning_rate": 9.663807268427198e-05, "loss": 0.6522, "step": 610 }, { "epoch": 5.616438356164384, "grad_norm": 0.2243603765964508, "learning_rate": 9.503825769350017e-05, "loss": 0.6496, "step": 615 }, { "epoch": 5.662100456621005, "grad_norm": 0.18932883441448212, "learning_rate": 9.343971481858246e-05, "loss": 0.6532, "step": 620 }, { "epoch": 5.707762557077626, "grad_norm": 0.19952723383903503, "learning_rate": 9.184285390178978e-05, "loss": 0.6511, "step": 625 }, { "epoch": 5.7534246575342465, "grad_norm": 0.20302745699882507, "learning_rate": 9.024808435416434e-05, "loss": 0.6524, "step": 630 }, { "epoch": 5.799086757990867, "grad_norm": 0.1958286166191101, "learning_rate": 8.865581505055291e-05, "loss": 0.6543, "step": 635 }, { "epoch": 5.844748858447488, "grad_norm": 0.189736470580101, "learning_rate": 8.706645422477739e-05, "loss": 0.6529, "step": 640 }, { "epoch": 5.890410958904109, "grad_norm": 0.2069847285747528, "learning_rate": 8.548040936496989e-05, "loss": 0.6542, "step": 645 }, { "epoch": 5.936073059360731, "grad_norm": 0.2031773030757904, "learning_rate": 8.389808710909881e-05, "loss": 0.6515, "step": 650 }, { "epoch": 5.981735159817352, "grad_norm": 0.20669801533222198, "learning_rate": 8.231989314071317e-05, "loss": 0.6526, "step": 655 }, { "epoch": 6.0, "eval_loss": 2.207833766937256, "eval_runtime": 0.7173, "eval_samples_per_second": 13.941, "eval_steps_per_second": 1.394, "step": 657 }, { "epoch": 6.027397260273973, "grad_norm": 0.2104310542345047, "learning_rate": 8.07462320849313e-05, "loss": 0.6444, "step": 660 }, { "epoch": 6.073059360730594, "grad_norm": 0.18349581956863403, "learning_rate": 7.917750740470117e-05, "loss": 0.6397, "step": 665 }, { "epoch": 6.1187214611872145, "grad_norm": 0.20204049348831177, "learning_rate": 7.761412129735852e-05, "loss": 0.6439, "step": 670 }, { "epoch": 6.164383561643835, "grad_norm": 0.20316743850708008, "learning_rate": 7.605647459150961e-05, "loss": 0.6392, "step": 675 }, { "epoch": 6.210045662100456, "grad_norm": 0.2150379866361618, "learning_rate": 7.450496664426477e-05, "loss": 0.634, "step": 680 }, { "epoch": 6.255707762557078, "grad_norm": 0.19887958467006683, "learning_rate": 7.295999523884921e-05, "loss": 0.6442, "step": 685 }, { "epoch": 6.301369863013699, "grad_norm": 0.1974460333585739, "learning_rate": 7.142195648261747e-05, "loss": 0.6396, "step": 690 }, { "epoch": 6.34703196347032, "grad_norm": 0.21475858986377716, "learning_rate": 6.989124470549745e-05, "loss": 0.6354, "step": 695 }, { "epoch": 6.392694063926941, "grad_norm": 0.19873230159282684, "learning_rate": 6.83682523588902e-05, "loss": 0.64, "step": 700 }, { "epoch": 6.438356164383562, "grad_norm": 0.20658674836158752, "learning_rate": 6.685336991505122e-05, "loss": 0.6367, "step": 705 }, { "epoch": 6.4840182648401825, "grad_norm": 0.2072938233613968, "learning_rate": 6.534698576697939e-05, "loss": 0.6388, "step": 710 }, { "epoch": 6.529680365296803, "grad_norm": 0.20482133328914642, "learning_rate": 6.384948612883873e-05, "loss": 0.6397, "step": 715 }, { "epoch": 6.575342465753424, "grad_norm": 0.20856335759162903, "learning_rate": 6.2361254936939e-05, "loss": 0.6407, "step": 720 }, { "epoch": 6.621004566210045, "grad_norm": 0.21489538252353668, "learning_rate": 6.088267375130023e-05, "loss": 0.6414, "step": 725 }, { "epoch": 6.666666666666667, "grad_norm": 0.19792255759239197, "learning_rate": 5.941412165782645e-05, "loss": 0.634, "step": 730 }, { "epoch": 6.712328767123288, "grad_norm": 0.19678843021392822, "learning_rate": 5.79559751711138e-05, "loss": 0.6407, "step": 735 }, { "epoch": 6.757990867579909, "grad_norm": 0.18998436629772186, "learning_rate": 5.650860813791785e-05, "loss": 0.6346, "step": 740 }, { "epoch": 6.80365296803653, "grad_norm": 0.19847027957439423, "learning_rate": 5.507239164130501e-05, "loss": 0.6409, "step": 745 }, { "epoch": 6.8493150684931505, "grad_norm": 0.205197274684906, "learning_rate": 5.364769390551225e-05, "loss": 0.6404, "step": 750 }, { "epoch": 6.894977168949771, "grad_norm": 0.19521455466747284, "learning_rate": 5.2234880201540284e-05, "loss": 0.6386, "step": 755 }, { "epoch": 6.940639269406392, "grad_norm": 0.1955854445695877, "learning_rate": 5.0834312753503124e-05, "loss": 0.6349, "step": 760 }, { "epoch": 6.986301369863014, "grad_norm": 0.1964850276708603, "learning_rate": 4.9446350645759885e-05, "loss": 0.6326, "step": 765 }, { "epoch": 6.995433789954338, "eval_loss": 2.229551315307617, "eval_runtime": 0.7351, "eval_samples_per_second": 13.604, "eval_steps_per_second": 1.36, "step": 766 }, { "epoch": 7.031963470319635, "grad_norm": 0.1950986385345459, "learning_rate": 4.807134973085036e-05, "loss": 0.626, "step": 770 }, { "epoch": 7.077625570776256, "grad_norm": 0.20411163568496704, "learning_rate": 4.6709662538260267e-05, "loss": 0.6293, "step": 775 }, { "epoch": 7.123287671232877, "grad_norm": 0.21672751009464264, "learning_rate": 4.53616381840377e-05, "loss": 0.6255, "step": 780 }, { "epoch": 7.168949771689498, "grad_norm": 0.20474384725093842, "learning_rate": 4.402762228128531e-05, "loss": 0.6317, "step": 785 }, { "epoch": 7.2146118721461185, "grad_norm": 0.20330122113227844, "learning_rate": 4.2707956851550016e-05, "loss": 0.6297, "step": 790 }, { "epoch": 7.260273972602739, "grad_norm": 0.21286101639270782, "learning_rate": 4.140298023713416e-05, "loss": 0.6278, "step": 795 }, { "epoch": 7.30593607305936, "grad_norm": 0.20241223275661469, "learning_rate": 4.011302701434937e-05, "loss": 0.6223, "step": 800 }, { "epoch": 7.351598173515982, "grad_norm": 0.20820164680480957, "learning_rate": 3.8838427907736476e-05, "loss": 0.631, "step": 805 }, { "epoch": 7.397260273972603, "grad_norm": 0.2071073353290558, "learning_rate": 3.757950970527249e-05, "loss": 0.627, "step": 810 }, { "epoch": 7.442922374429224, "grad_norm": 0.20754040777683258, "learning_rate": 3.633659517458736e-05, "loss": 0.6284, "step": 815 }, { "epoch": 7.488584474885845, "grad_norm": 0.204667329788208, "learning_rate": 3.5110002980210975e-05, "loss": 0.6214, "step": 820 }, { "epoch": 7.534246575342466, "grad_norm": 0.204596146941185, "learning_rate": 3.3900047601872596e-05, "loss": 0.6241, "step": 825 }, { "epoch": 7.579908675799087, "grad_norm": 0.2120332568883896, "learning_rate": 3.270703925387279e-05, "loss": 0.6295, "step": 830 }, { "epoch": 7.6255707762557075, "grad_norm": 0.20006106793880463, "learning_rate": 3.153128380554941e-05, "loss": 0.6259, "step": 835 }, { "epoch": 7.671232876712329, "grad_norm": 0.19608016312122345, "learning_rate": 3.037308270285709e-05, "loss": 0.6234, "step": 840 }, { "epoch": 7.71689497716895, "grad_norm": 0.20698235929012299, "learning_rate": 2.923273289108115e-05, "loss": 0.6312, "step": 845 }, { "epoch": 7.762557077625571, "grad_norm": 0.20038080215454102, "learning_rate": 2.8110526738705344e-05, "loss": 0.6266, "step": 850 }, { "epoch": 7.808219178082192, "grad_norm": 0.2022303193807602, "learning_rate": 2.7006751962452882e-05, "loss": 0.631, "step": 855 }, { "epoch": 7.853881278538813, "grad_norm": 0.20605237782001495, "learning_rate": 2.592169155352031e-05, "loss": 0.6331, "step": 860 }, { "epoch": 7.899543378995434, "grad_norm": 0.2080409824848175, "learning_rate": 2.485562370502279e-05, "loss": 0.6313, "step": 865 }, { "epoch": 7.945205479452055, "grad_norm": 0.2000366896390915, "learning_rate": 2.3808821740669606e-05, "loss": 0.6285, "step": 870 }, { "epoch": 7.9908675799086755, "grad_norm": 0.20418736338615417, "learning_rate": 2.2781554044688015e-05, "loss": 0.6311, "step": 875 }, { "epoch": 8.0, "eval_loss": 2.2484524250030518, "eval_runtime": 0.7257, "eval_samples_per_second": 13.779, "eval_steps_per_second": 1.378, "step": 876 }, { "epoch": 8.036529680365296, "grad_norm": 0.2047678381204605, "learning_rate": 2.1774083993013718e-05, "loss": 0.6168, "step": 880 }, { "epoch": 8.082191780821917, "grad_norm": 0.20055288076400757, "learning_rate": 2.078666988576504e-05, "loss": 0.6163, "step": 885 }, { "epoch": 8.127853881278538, "grad_norm": 0.2002391219139099, "learning_rate": 1.9819564881018983e-05, "loss": 0.6216, "step": 890 }, { "epoch": 8.173515981735159, "grad_norm": 0.19824841618537903, "learning_rate": 1.887301692990494e-05, "loss": 0.6215, "step": 895 }, { "epoch": 8.219178082191782, "grad_norm": 0.20471996068954468, "learning_rate": 1.7947268713034127e-05, "loss": 0.6168, "step": 900 }, { "epoch": 8.264840182648403, "grad_norm": 0.20544229447841644, "learning_rate": 1.7042557578279626e-05, "loss": 0.6188, "step": 905 }, { "epoch": 8.310502283105023, "grad_norm": 0.208919957280159, "learning_rate": 1.6159115479924257e-05, "loss": 0.6221, "step": 910 }, { "epoch": 8.356164383561644, "grad_norm": 0.20433735847473145, "learning_rate": 1.529716891919074e-05, "loss": 0.6187, "step": 915 }, { "epoch": 8.401826484018265, "grad_norm": 0.20233039557933807, "learning_rate": 1.4456938886170412e-05, "loss": 0.6224, "step": 920 }, { "epoch": 8.447488584474886, "grad_norm": 0.2014380395412445, "learning_rate": 1.3638640803164516e-05, "loss": 0.6186, "step": 925 }, { "epoch": 8.493150684931507, "grad_norm": 0.20106054842472076, "learning_rate": 1.2842484469453365e-05, "loss": 0.6229, "step": 930 }, { "epoch": 8.538812785388128, "grad_norm": 0.20424200594425201, "learning_rate": 1.2068674007506786e-05, "loss": 0.6225, "step": 935 }, { "epoch": 8.584474885844749, "grad_norm": 0.19919848442077637, "learning_rate": 1.1317407810650372e-05, "loss": 0.621, "step": 940 }, { "epoch": 8.63013698630137, "grad_norm": 0.21022921800613403, "learning_rate": 1.058887849220026e-05, "loss": 0.6208, "step": 945 }, { "epoch": 8.67579908675799, "grad_norm": 0.2036212980747223, "learning_rate": 9.883272836080116e-06, "loss": 0.6184, "step": 950 }, { "epoch": 8.721461187214611, "grad_norm": 0.19858643412590027, "learning_rate": 9.200771748932513e-06, "loss": 0.6224, "step": 955 }, { "epoch": 8.767123287671232, "grad_norm": 0.2013389766216278, "learning_rate": 8.541550213737171e-06, "loss": 0.6175, "step": 960 }, { "epoch": 8.812785388127853, "grad_norm": 0.20145344734191895, "learning_rate": 7.905777244947954e-06, "loss": 0.6204, "step": 965 }, { "epoch": 8.858447488584474, "grad_norm": 0.20189128816127777, "learning_rate": 7.293615845160196e-06, "loss": 0.6225, "step": 970 }, { "epoch": 8.904109589041095, "grad_norm": 0.2002427577972412, "learning_rate": 6.705222963319191e-06, "loss": 0.6197, "step": 975 }, { "epoch": 8.949771689497716, "grad_norm": 0.21808940172195435, "learning_rate": 6.140749454480932e-06, "loss": 0.6228, "step": 980 }, { "epoch": 8.995433789954339, "grad_norm": 0.20092381536960602, "learning_rate": 5.6003400411351325e-06, "loss": 0.6233, "step": 985 }, { "epoch": 8.995433789954339, "eval_loss": 2.258711099624634, "eval_runtime": 0.7286, "eval_samples_per_second": 13.726, "eval_steps_per_second": 1.373, "step": 985 }, { "epoch": 9.04109589041096, "grad_norm": 0.19878605008125305, "learning_rate": 5.0841332761005e-06, "loss": 0.6193, "step": 990 }, { "epoch": 9.08675799086758, "grad_norm": 0.20306305587291718, "learning_rate": 4.592261507001993e-06, "loss": 0.6177, "step": 995 }, { "epoch": 9.132420091324201, "grad_norm": 0.2019263654947281, "learning_rate": 4.124850842338779e-06, "loss": 0.6133, "step": 1000 }, { "epoch": 9.178082191780822, "grad_norm": 0.2005142867565155, "learning_rate": 3.6820211191520125e-06, "loss": 0.6141, "step": 1005 }, { "epoch": 9.223744292237443, "grad_norm": 0.20048119127750397, "learning_rate": 3.263885872300343e-06, "loss": 0.6224, "step": 1010 }, { "epoch": 9.269406392694064, "grad_norm": 0.20118780434131622, "learning_rate": 2.8705523053513816e-06, "loss": 0.6158, "step": 1015 }, { "epoch": 9.315068493150685, "grad_norm": 0.19994989037513733, "learning_rate": 2.502121263096224e-06, "loss": 0.6133, "step": 1020 }, { "epoch": 9.360730593607306, "grad_norm": 0.19647841155529022, "learning_rate": 2.1586872056944428e-06, "loss": 0.6172, "step": 1025 }, { "epoch": 9.406392694063927, "grad_norm": 0.20165683329105377, "learning_rate": 1.840338184455881e-06, "loss": 0.6193, "step": 1030 }, { "epoch": 9.452054794520548, "grad_norm": 0.20188592374324799, "learning_rate": 1.5471558192656777e-06, "loss": 0.6216, "step": 1035 }, { "epoch": 9.497716894977168, "grad_norm": 0.20157510042190552, "learning_rate": 1.2792152776580968e-06, "loss": 0.616, "step": 1040 }, { "epoch": 9.54337899543379, "grad_norm": 0.20351751148700714, "learning_rate": 1.036585255544764e-06, "loss": 0.6149, "step": 1045 }, { "epoch": 9.58904109589041, "grad_norm": 0.1994985044002533, "learning_rate": 8.193279596020121e-07, "loss": 0.6184, "step": 1050 }, { "epoch": 9.634703196347033, "grad_norm": 0.2011667788028717, "learning_rate": 6.274990913221035e-07, "loss": 0.6149, "step": 1055 }, { "epoch": 9.680365296803654, "grad_norm": 0.2020387500524521, "learning_rate": 4.6114783273213393e-07, "loss": 0.619, "step": 1060 }, { "epoch": 9.726027397260275, "grad_norm": 0.19661925733089447, "learning_rate": 3.203168337845508e-07, "loss": 0.6193, "step": 1065 }, { "epoch": 9.771689497716896, "grad_norm": 0.20111018419265747, "learning_rate": 2.05042201422323e-07, "loss": 0.615, "step": 1070 }, { "epoch": 9.817351598173516, "grad_norm": 0.20135089755058289, "learning_rate": 1.1535349032167908e-07, "loss": 0.6158, "step": 1075 }, { "epoch": 9.863013698630137, "grad_norm": 0.20196588337421417, "learning_rate": 5.127369531473525e-08, "loss": 0.614, "step": 1080 }, { "epoch": 9.908675799086758, "grad_norm": 0.19875915348529816, "learning_rate": 1.2819245493955744e-08, "loss": 0.6174, "step": 1085 }, { "epoch": 9.954337899543379, "grad_norm": 0.19983670115470886, "learning_rate": 0.0, "loss": 0.6194, "step": 1090 }, { "epoch": 9.954337899543379, "eval_loss": 2.260620594024658, "eval_runtime": 0.6934, "eval_samples_per_second": 14.422, "eval_steps_per_second": 1.442, "step": 1090 }, { "epoch": 9.954337899543379, "step": 1090, "total_flos": 6.456679991336763e+18, "train_loss": 0.7130365929472338, "train_runtime": 9668.9725, "train_samples_per_second": 14.452, "train_steps_per_second": 0.113 } ], "logging_steps": 5, "max_steps": 1090, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.456679991336763e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }