{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9976735168670028, "eval_steps": 500, "global_step": 1288, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015509887553315239, "grad_norm": 3.1287975311279297, "learning_rate": 3.608247422680412e-06, "loss": 2.828, "step": 10 }, { "epoch": 0.031019775106630478, "grad_norm": 2.065422296524048, "learning_rate": 8.762886597938144e-06, "loss": 2.3927, "step": 20 }, { "epoch": 0.046529662659945716, "grad_norm": 2.040574312210083, "learning_rate": 1.3917525773195878e-05, "loss": 2.1847, "step": 30 }, { "epoch": 0.062039550213260956, "grad_norm": 2.27348256111145, "learning_rate": 1.9072164948453608e-05, "loss": 2.0933, "step": 40 }, { "epoch": 0.07754943776657619, "grad_norm": 2.1425204277038574, "learning_rate": 2.422680412371134e-05, "loss": 1.8182, "step": 50 }, { "epoch": 0.09305932531989143, "grad_norm": 1.7984707355499268, "learning_rate": 2.9381443298969075e-05, "loss": 1.9989, "step": 60 }, { "epoch": 0.10856921287320667, "grad_norm": 2.2766242027282715, "learning_rate": 3.4536082474226805e-05, "loss": 1.8708, "step": 70 }, { "epoch": 0.12407910042652191, "grad_norm": 2.106276750564575, "learning_rate": 3.9690721649484535e-05, "loss": 1.893, "step": 80 }, { "epoch": 0.13958898797983715, "grad_norm": 2.5897228717803955, "learning_rate": 4.484536082474227e-05, "loss": 1.889, "step": 90 }, { "epoch": 0.15509887553315238, "grad_norm": 2.0310323238372803, "learning_rate": 5e-05, "loss": 1.847, "step": 100 }, { "epoch": 0.17060876308646764, "grad_norm": 2.198716640472412, "learning_rate": 5.515463917525774e-05, "loss": 1.5618, "step": 110 }, { "epoch": 0.18611865063978286, "grad_norm": 1.959328055381775, "learning_rate": 6.030927835051546e-05, "loss": 1.6953, "step": 120 }, { "epoch": 0.2016285381930981, "grad_norm": 2.2912769317626953, "learning_rate": 6.546391752577319e-05, "loss": 1.5452, "step": 130 }, { "epoch": 0.21713842574641334, "grad_norm": 2.818457841873169, "learning_rate": 7.061855670103093e-05, "loss": 1.6685, "step": 140 }, { "epoch": 0.23264831329972857, "grad_norm": 2.236240863800049, "learning_rate": 7.577319587628867e-05, "loss": 1.5547, "step": 150 }, { "epoch": 0.24815820085304383, "grad_norm": 2.780366897583008, "learning_rate": 8.092783505154639e-05, "loss": 1.655, "step": 160 }, { "epoch": 0.2636680884063591, "grad_norm": 3.003800630569458, "learning_rate": 8.608247422680413e-05, "loss": 1.6785, "step": 170 }, { "epoch": 0.2791779759596743, "grad_norm": 2.680654287338257, "learning_rate": 9.123711340206186e-05, "loss": 1.5605, "step": 180 }, { "epoch": 0.29468786351298953, "grad_norm": 3.134833574295044, "learning_rate": 9.639175257731959e-05, "loss": 1.5149, "step": 190 }, { "epoch": 0.31019775106630476, "grad_norm": 3.5344340801239014, "learning_rate": 9.999814456846558e-05, "loss": 1.55, "step": 200 }, { "epoch": 0.32570763861962, "grad_norm": 2.9158310890197754, "learning_rate": 9.996516294963207e-05, "loss": 1.4199, "step": 210 }, { "epoch": 0.34121752617293527, "grad_norm": 3.0592031478881836, "learning_rate": 9.989098082340299e-05, "loss": 1.5944, "step": 220 }, { "epoch": 0.3567274137262505, "grad_norm": 2.9872217178344727, "learning_rate": 9.977565935922244e-05, "loss": 1.4701, "step": 230 }, { "epoch": 0.3722373012795657, "grad_norm": 3.328583002090454, "learning_rate": 9.96192936494019e-05, "loss": 1.5171, "step": 240 }, { "epoch": 0.38774718883288095, "grad_norm": 3.8056564331054688, "learning_rate": 9.942201263070854e-05, "loss": 1.525, "step": 250 }, { "epoch": 0.4032570763861962, "grad_norm": 3.7598862648010254, "learning_rate": 9.918397897804607e-05, "loss": 1.5645, "step": 260 }, { "epoch": 0.41876696393951146, "grad_norm": 3.966884136199951, "learning_rate": 9.890538897031536e-05, "loss": 1.5055, "step": 270 }, { "epoch": 0.4342768514928267, "grad_norm": 3.147392511367798, "learning_rate": 9.8586472328566e-05, "loss": 1.3782, "step": 280 }, { "epoch": 0.4497867390461419, "grad_norm": 4.427823543548584, "learning_rate": 9.822749202657169e-05, "loss": 1.4728, "step": 290 }, { "epoch": 0.46529662659945714, "grad_norm": 3.6006522178649902, "learning_rate": 9.782874407398626e-05, "loss": 1.4457, "step": 300 }, { "epoch": 0.48080651415277237, "grad_norm": 3.9323055744171143, "learning_rate": 9.739055727225856e-05, "loss": 1.3709, "step": 310 }, { "epoch": 0.49631640170608765, "grad_norm": 4.171195983886719, "learning_rate": 9.691329294350784e-05, "loss": 1.3101, "step": 320 }, { "epoch": 0.5118262892594029, "grad_norm": 4.27497673034668, "learning_rate": 9.639734463258303e-05, "loss": 1.3937, "step": 330 }, { "epoch": 0.5273361768127182, "grad_norm": 3.4899075031280518, "learning_rate": 9.584313778255179e-05, "loss": 1.376, "step": 340 }, { "epoch": 0.5428460643660333, "grad_norm": 3.729485511779785, "learning_rate": 9.525112938388658e-05, "loss": 1.407, "step": 350 }, { "epoch": 0.5583559519193486, "grad_norm": 3.903895378112793, "learning_rate": 9.462180759763737e-05, "loss": 1.3552, "step": 360 }, { "epoch": 0.5738658394726638, "grad_norm": 4.612530708312988, "learning_rate": 9.39556913529015e-05, "loss": 1.3075, "step": 370 }, { "epoch": 0.5893757270259791, "grad_norm": 4.5943193435668945, "learning_rate": 9.325332991892273e-05, "loss": 1.3499, "step": 380 }, { "epoch": 0.6048856145792944, "grad_norm": 3.5239744186401367, "learning_rate": 9.251530245217225e-05, "loss": 1.319, "step": 390 }, { "epoch": 0.6203955021326095, "grad_norm": 3.609306573867798, "learning_rate": 9.174221751878505e-05, "loss": 1.2705, "step": 400 }, { "epoch": 0.6359053896859248, "grad_norm": 3.8195927143096924, "learning_rate": 9.093471259274572e-05, "loss": 1.2472, "step": 410 }, { "epoch": 0.65141527723924, "grad_norm": 5.582481861114502, "learning_rate": 9.009345353023711e-05, "loss": 1.2423, "step": 420 }, { "epoch": 0.6669251647925553, "grad_norm": 3.6613643169403076, "learning_rate": 8.930803335557602e-05, "loss": 1.2564, "step": 430 }, { "epoch": 0.6824350523458705, "grad_norm": 3.8977560997009277, "learning_rate": 8.840457509138307e-05, "loss": 1.1387, "step": 440 }, { "epoch": 0.6979449398991857, "grad_norm": 4.135696887969971, "learning_rate": 8.746944900331711e-05, "loss": 1.246, "step": 450 }, { "epoch": 0.713454827452501, "grad_norm": 4.384019374847412, "learning_rate": 8.650342618201475e-05, "loss": 1.1653, "step": 460 }, { "epoch": 0.7289647150058162, "grad_norm": 4.108353614807129, "learning_rate": 8.550730319508516e-05, "loss": 1.1597, "step": 470 }, { "epoch": 0.7444746025591314, "grad_norm": 4.743010997772217, "learning_rate": 8.448190143027269e-05, "loss": 1.2637, "step": 480 }, { "epoch": 0.7599844901124467, "grad_norm": 4.176271915435791, "learning_rate": 8.342806641815304e-05, "loss": 1.1836, "step": 490 }, { "epoch": 0.7754943776657619, "grad_norm": 5.721745491027832, "learning_rate": 8.234666713492178e-05, "loss": 1.1127, "step": 500 }, { "epoch": 0.7910042652190772, "grad_norm": 4.143364906311035, "learning_rate": 8.123859528584985e-05, "loss": 1.2644, "step": 510 }, { "epoch": 0.8065141527723924, "grad_norm": 3.5532310009002686, "learning_rate": 8.010476456999712e-05, "loss": 1.116, "step": 520 }, { "epoch": 0.8220240403257076, "grad_norm": 5.060513496398926, "learning_rate": 7.894610992679008e-05, "loss": 1.1528, "step": 530 }, { "epoch": 0.8375339278790229, "grad_norm": 4.231233596801758, "learning_rate": 7.776358676508522e-05, "loss": 1.107, "step": 540 }, { "epoch": 0.8530438154323381, "grad_norm": 5.624173641204834, "learning_rate": 7.655817017535339e-05, "loss": 1.1705, "step": 550 }, { "epoch": 0.8685537029856534, "grad_norm": 4.362249851226807, "learning_rate": 7.533085412563534e-05, "loss": 1.08, "step": 560 }, { "epoch": 0.8840635905389685, "grad_norm": 5.387088298797607, "learning_rate": 7.408265064193071e-05, "loss": 1.1468, "step": 570 }, { "epoch": 0.8995734780922838, "grad_norm": 5.324846267700195, "learning_rate": 7.281458897369707e-05, "loss": 1.0551, "step": 580 }, { "epoch": 0.9150833656455991, "grad_norm": 5.092646598815918, "learning_rate": 7.152771474514642e-05, "loss": 1.0841, "step": 590 }, { "epoch": 0.9305932531989143, "grad_norm": 4.7164082527160645, "learning_rate": 7.022308909303974e-05, "loss": 1.0842, "step": 600 }, { "epoch": 0.9461031407522296, "grad_norm": 5.104768753051758, "learning_rate": 6.890178779168963e-05, "loss": 1.0545, "step": 610 }, { "epoch": 0.9616130283055447, "grad_norm": 4.294586181640625, "learning_rate": 6.756490036589346e-05, "loss": 1.0422, "step": 620 }, { "epoch": 0.97712291585886, "grad_norm": 5.337200164794922, "learning_rate": 6.621352919252788e-05, "loss": 1.0441, "step": 630 }, { "epoch": 0.9926328034121753, "grad_norm": 3.790553092956543, "learning_rate": 6.484878859154576e-05, "loss": 0.9895, "step": 640 }, { "epoch": 1.0081426909654905, "grad_norm": 3.337839126586914, "learning_rate": 6.347180390712497e-05, "loss": 0.8061, "step": 650 }, { "epoch": 1.0236525785188058, "grad_norm": 4.857980728149414, "learning_rate": 6.208371057972694e-05, "loss": 0.7235, "step": 660 }, { "epoch": 1.039162466072121, "grad_norm": 5.063539505004883, "learning_rate": 6.068565320982982e-05, "loss": 0.6637, "step": 670 }, { "epoch": 1.0546723536254363, "grad_norm": 4.6024274826049805, "learning_rate": 5.9278784614108375e-05, "loss": 0.6375, "step": 680 }, { "epoch": 1.0701822411787514, "grad_norm": 4.301967620849609, "learning_rate": 5.7864264874839144e-05, "loss": 0.6602, "step": 690 }, { "epoch": 1.0856921287320667, "grad_norm": 4.590878963470459, "learning_rate": 5.644326038331439e-05, "loss": 0.6464, "step": 700 }, { "epoch": 1.101202016285382, "grad_norm": 4.38985013961792, "learning_rate": 5.501694287805361e-05, "loss": 0.5979, "step": 710 }, { "epoch": 1.1167119038386972, "grad_norm": 4.648299217224121, "learning_rate": 5.358648847860599e-05, "loss": 0.6146, "step": 720 }, { "epoch": 1.1322217913920123, "grad_norm": 3.4942831993103027, "learning_rate": 5.215307671574027e-05, "loss": 0.5525, "step": 730 }, { "epoch": 1.1477316789453276, "grad_norm": 4.376279354095459, "learning_rate": 5.071788955882171e-05, "loss": 0.5887, "step": 740 }, { "epoch": 1.1632415664986429, "grad_norm": 4.812126636505127, "learning_rate": 4.92821104411783e-05, "loss": 0.5673, "step": 750 }, { "epoch": 1.1787514540519581, "grad_norm": 4.856875419616699, "learning_rate": 4.784692328425974e-05, "loss": 0.5525, "step": 760 }, { "epoch": 1.1942613416052734, "grad_norm": 3.832489490509033, "learning_rate": 4.6413511521394026e-05, "loss": 0.5717, "step": 770 }, { "epoch": 1.2097712291585885, "grad_norm": 3.861131191253662, "learning_rate": 4.4983057121946414e-05, "loss": 0.5778, "step": 780 }, { "epoch": 1.2252811167119038, "grad_norm": 3.7560808658599854, "learning_rate": 4.355673961668561e-05, "loss": 0.6202, "step": 790 }, { "epoch": 1.240791004265219, "grad_norm": 4.375748634338379, "learning_rate": 4.213573512516086e-05, "loss": 0.5646, "step": 800 }, { "epoch": 1.2563008918185343, "grad_norm": 4.177556991577148, "learning_rate": 4.072121538589164e-05, "loss": 0.5251, "step": 810 }, { "epoch": 1.2718107793718496, "grad_norm": 3.94201922416687, "learning_rate": 3.931434679017019e-05, "loss": 0.5271, "step": 820 }, { "epoch": 1.2873206669251647, "grad_norm": 3.7559092044830322, "learning_rate": 3.791628942027307e-05, "loss": 0.5479, "step": 830 }, { "epoch": 1.30283055447848, "grad_norm": 4.03005838394165, "learning_rate": 3.6528196092875044e-05, "loss": 0.5279, "step": 840 }, { "epoch": 1.3183404420317952, "grad_norm": 4.147780895233154, "learning_rate": 3.5151211408454276e-05, "loss": 0.4981, "step": 850 }, { "epoch": 1.3338503295851105, "grad_norm": 4.283544540405273, "learning_rate": 3.378647080747213e-05, "loss": 0.5398, "step": 860 }, { "epoch": 1.3493602171384258, "grad_norm": 3.875366449356079, "learning_rate": 3.2435099634106545e-05, "loss": 0.4504, "step": 870 }, { "epoch": 1.3648701046917409, "grad_norm": 3.7917864322662354, "learning_rate": 3.1098212208310385e-05, "loss": 0.4393, "step": 880 }, { "epoch": 1.3803799922450561, "grad_norm": 4.397382736206055, "learning_rate": 2.977691090696027e-05, "loss": 0.4437, "step": 890 }, { "epoch": 1.3958898797983714, "grad_norm": 3.607226848602295, "learning_rate": 2.8472285254853593e-05, "loss": 0.4576, "step": 900 }, { "epoch": 1.4113997673516867, "grad_norm": 4.605495452880859, "learning_rate": 2.7185411026302964e-05, "loss": 0.4662, "step": 910 }, { "epoch": 1.426909654905002, "grad_norm": 3.19085431098938, "learning_rate": 2.591734935806929e-05, "loss": 0.4229, "step": 920 }, { "epoch": 1.442419542458317, "grad_norm": 4.335093975067139, "learning_rate": 2.4669145874364658e-05, "loss": 0.4643, "step": 930 }, { "epoch": 1.4579294300116323, "grad_norm": 3.3597371578216553, "learning_rate": 2.3441829824646604e-05, "loss": 0.4029, "step": 940 }, { "epoch": 1.4734393175649476, "grad_norm": 4.720088958740234, "learning_rate": 2.2236413234914805e-05, "loss": 0.431, "step": 950 }, { "epoch": 1.488949205118263, "grad_norm": 3.779006242752075, "learning_rate": 2.105389007320992e-05, "loss": 0.4296, "step": 960 }, { "epoch": 1.5044590926715782, "grad_norm": 3.9406046867370605, "learning_rate": 1.9895235430002894e-05, "loss": 0.3959, "step": 970 }, { "epoch": 1.5199689802248932, "grad_norm": 3.5670626163482666, "learning_rate": 1.876140471415016e-05, "loss": 0.3907, "step": 980 }, { "epoch": 1.5354788677782087, "grad_norm": 5.355038166046143, "learning_rate": 1.7653332865078242e-05, "loss": 0.421, "step": 990 }, { "epoch": 1.5509887553315238, "grad_norm": 3.822100877761841, "learning_rate": 1.6571933581846965e-05, "loss": 0.3363, "step": 1000 }, { "epoch": 1.566498642884839, "grad_norm": 4.373218059539795, "learning_rate": 1.55180985697273e-05, "loss": 0.3965, "step": 1010 }, { "epoch": 1.5820085304381544, "grad_norm": 4.487693786621094, "learning_rate": 1.449269680491484e-05, "loss": 0.3237, "step": 1020 }, { "epoch": 1.5975184179914694, "grad_norm": 4.757607936859131, "learning_rate": 1.3496573817985264e-05, "loss": 0.3727, "step": 1030 }, { "epoch": 1.613028305544785, "grad_norm": 4.6655426025390625, "learning_rate": 1.2530550996682905e-05, "loss": 0.3301, "step": 1040 }, { "epoch": 1.6285381930981, "grad_norm": 4.697271347045898, "learning_rate": 1.1595424908616931e-05, "loss": 0.3866, "step": 1050 }, { "epoch": 1.6440480806514153, "grad_norm": 3.7209982872009277, "learning_rate": 1.0691966644423985e-05, "loss": 0.3376, "step": 1060 }, { "epoch": 1.6595579682047306, "grad_norm": 3.772613048553467, "learning_rate": 9.820921181938547e-06, "loss": 0.3684, "step": 1070 }, { "epoch": 1.6750678557580456, "grad_norm": 4.735914707183838, "learning_rate": 8.983006771895763e-06, "loss": 0.3535, "step": 1080 }, { "epoch": 1.6905777433113611, "grad_norm": 3.6291725635528564, "learning_rate": 8.1789143456728e-06, "loss": 0.324, "step": 1090 }, { "epoch": 1.7060876308646762, "grad_norm": 4.227074146270752, "learning_rate": 7.409306945557487e-06, "loss": 0.3367, "step": 1100 }, { "epoch": 1.7215975184179915, "grad_norm": 3.435671329498291, "learning_rate": 6.674819178013769e-06, "loss": 0.3172, "step": 1110 }, { "epoch": 1.7371074059713068, "grad_norm": 2.9583516120910645, "learning_rate": 5.97605669039496e-06, "loss": 0.3338, "step": 1120 }, { "epoch": 1.7526172935246218, "grad_norm": 4.426352024078369, "learning_rate": 5.3135956715362205e-06, "loss": 0.3108, "step": 1130 }, { "epoch": 1.7681271810779373, "grad_norm": 3.8417787551879883, "learning_rate": 4.687982376638101e-06, "loss": 0.3314, "step": 1140 }, { "epoch": 1.7836370686312524, "grad_norm": 4.444839954376221, "learning_rate": 4.099732676832818e-06, "loss": 0.2836, "step": 1150 }, { "epoch": 1.7991469561845677, "grad_norm": 3.3175086975097656, "learning_rate": 3.5493316338049086e-06, "loss": 0.3429, "step": 1160 }, { "epoch": 1.814656843737883, "grad_norm": 4.2491068840026855, "learning_rate": 3.037233099816705e-06, "loss": 0.3089, "step": 1170 }, { "epoch": 1.830166731291198, "grad_norm": 4.635960578918457, "learning_rate": 2.563859343468822e-06, "loss": 0.3364, "step": 1180 }, { "epoch": 1.8456766188445135, "grad_norm": 4.215371608734131, "learning_rate": 2.1296007015038366e-06, "loss": 0.3102, "step": 1190 }, { "epoch": 1.8611865063978286, "grad_norm": 4.521674156188965, "learning_rate": 1.734815256940675e-06, "loss": 0.3228, "step": 1200 }, { "epoch": 1.8766963939511438, "grad_norm": 4.14400053024292, "learning_rate": 1.379828543804812e-06, "loss": 0.3075, "step": 1210 }, { "epoch": 1.8922062815044591, "grad_norm": 3.0325677394866943, "learning_rate": 1.064933278697905e-06, "loss": 0.3188, "step": 1220 }, { "epoch": 1.9077161690577742, "grad_norm": 3.0454375743865967, "learning_rate": 7.903891194281754e-07, "loss": 0.281, "step": 1230 }, { "epoch": 1.9232260566110897, "grad_norm": 3.7709105014801025, "learning_rate": 5.564224509005566e-07, "loss": 0.343, "step": 1240 }, { "epoch": 1.9387359441644048, "grad_norm": 3.867096185684204, "learning_rate": 3.6322619844317286e-07, "loss": 0.3157, "step": 1250 }, { "epoch": 1.95424583171772, "grad_norm": 3.638516664505005, "learning_rate": 2.1095966872407557e-07, "loss": 0.3046, "step": 1260 }, { "epoch": 1.9697557192710353, "grad_norm": 3.1184732913970947, "learning_rate": 9.974841838941151e-08, "loss": 0.3263, "step": 1270 }, { "epoch": 1.9852656068243504, "grad_norm": 4.19941520690918, "learning_rate": 2.9684150531317233e-08, "loss": 0.3, "step": 1280 } ], "logging_steps": 10, "max_steps": 1288, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.506259383331062e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }