{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997846452029719, "eval_steps": 500, "global_step": 3714, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00026919349628512973, "learning_rate": 5e-09, "loss": 9.3094, "step": 1 }, { "epoch": 0.005383869925702595, "learning_rate": 1e-07, "loss": 9.7938, "step": 20 }, { "epoch": 0.01076773985140519, "learning_rate": 2e-07, "loss": 9.4725, "step": 40 }, { "epoch": 0.016151609777107785, "learning_rate": 3e-07, "loss": 8.3855, "step": 60 }, { "epoch": 0.02153547970281038, "learning_rate": 4e-07, "loss": 7.0557, "step": 80 }, { "epoch": 0.026919349628512976, "learning_rate": 5e-07, "loss": 6.2559, "step": 100 }, { "epoch": 0.03230321955421557, "learning_rate": 6e-07, "loss": 5.941, "step": 120 }, { "epoch": 0.03768708947991817, "learning_rate": 7e-07, "loss": 5.657, "step": 140 }, { "epoch": 0.04307095940562076, "learning_rate": 8e-07, "loss": 5.3831, "step": 160 }, { "epoch": 0.048454829331323356, "learning_rate": 9e-07, "loss": 5.476, "step": 180 }, { "epoch": 0.05383869925702595, "learning_rate": 1e-06, "loss": 5.081, "step": 200 }, { "epoch": 0.05922256918272854, "learning_rate": 9.99696069161151e-07, "loss": 4.9985, "step": 220 }, { "epoch": 0.06460643910843114, "learning_rate": 9.993921383223017e-07, "loss": 5.0411, "step": 240 }, { "epoch": 0.06999030903413374, "learning_rate": 9.990882074834525e-07, "loss": 5.0192, "step": 260 }, { "epoch": 0.07537417895983634, "learning_rate": 9.987842766446035e-07, "loss": 5.0827, "step": 280 }, { "epoch": 0.08075804888553892, "learning_rate": 9.984803458057545e-07, "loss": 4.9614, "step": 300 }, { "epoch": 0.08614191881124152, "learning_rate": 9.981764149669053e-07, "loss": 5.0168, "step": 320 }, { "epoch": 0.09152578873694411, "learning_rate": 9.97872484128056e-07, "loss": 5.0651, "step": 340 }, { "epoch": 0.09690965866264671, "learning_rate": 9.97568553289207e-07, "loss": 5.0639, "step": 360 }, { "epoch": 0.10229352858834931, "learning_rate": 9.972646224503578e-07, "loss": 4.8337, "step": 380 }, { "epoch": 0.1076773985140519, "learning_rate": 9.969606916115088e-07, "loss": 4.7206, "step": 400 }, { "epoch": 0.11306126843975449, "learning_rate": 9.966567607726596e-07, "loss": 4.8773, "step": 420 }, { "epoch": 0.11844513836545709, "learning_rate": 9.963528299338106e-07, "loss": 4.7987, "step": 440 }, { "epoch": 0.12382900829115968, "learning_rate": 9.960488990949614e-07, "loss": 4.7167, "step": 460 }, { "epoch": 0.12921287821686228, "learning_rate": 9.957449682561124e-07, "loss": 4.6926, "step": 480 }, { "epoch": 0.13459674814256486, "learning_rate": 9.954410374172632e-07, "loss": 4.8212, "step": 500 }, { "epoch": 0.13998061806826748, "learning_rate": 9.95137106578414e-07, "loss": 4.7605, "step": 520 }, { "epoch": 0.14536448799397006, "learning_rate": 9.94833175739565e-07, "loss": 4.8473, "step": 540 }, { "epoch": 0.15074835791967267, "learning_rate": 9.94529244900716e-07, "loss": 4.735, "step": 560 }, { "epoch": 0.15613222784537525, "learning_rate": 9.942253140618667e-07, "loss": 4.7372, "step": 580 }, { "epoch": 0.16151609777107784, "learning_rate": 9.939213832230175e-07, "loss": 4.8833, "step": 600 }, { "epoch": 0.16689996769678045, "learning_rate": 9.936174523841685e-07, "loss": 4.742, "step": 620 }, { "epoch": 0.17228383762248303, "learning_rate": 9.933135215453195e-07, "loss": 4.5842, "step": 640 }, { "epoch": 0.17766770754818564, "learning_rate": 9.930095907064703e-07, "loss": 4.7701, "step": 660 }, { "epoch": 0.18305157747388823, "learning_rate": 9.92705659867621e-07, "loss": 4.7634, "step": 680 }, { "epoch": 0.18843544739959084, "learning_rate": 9.92401729028772e-07, "loss": 4.6502, "step": 700 }, { "epoch": 0.19381931732529342, "learning_rate": 9.92097798189923e-07, "loss": 4.6672, "step": 720 }, { "epoch": 0.199203187250996, "learning_rate": 9.917938673510738e-07, "loss": 4.75, "step": 740 }, { "epoch": 0.20458705717669862, "learning_rate": 9.914899365122246e-07, "loss": 4.5932, "step": 760 }, { "epoch": 0.2099709271024012, "learning_rate": 9.911860056733756e-07, "loss": 4.6468, "step": 780 }, { "epoch": 0.2153547970281038, "learning_rate": 9.908820748345266e-07, "loss": 4.5592, "step": 800 }, { "epoch": 0.2207386669538064, "learning_rate": 9.905781439956774e-07, "loss": 4.6977, "step": 820 }, { "epoch": 0.22612253687950898, "learning_rate": 9.902742131568282e-07, "loss": 4.6447, "step": 840 }, { "epoch": 0.2315064068052116, "learning_rate": 9.899702823179792e-07, "loss": 4.6937, "step": 860 }, { "epoch": 0.23689027673091417, "learning_rate": 9.8966635147913e-07, "loss": 4.7408, "step": 880 }, { "epoch": 0.24227414665661678, "learning_rate": 9.89362420640281e-07, "loss": 4.6969, "step": 900 }, { "epoch": 0.24765801658231937, "learning_rate": 9.890584898014317e-07, "loss": 4.6467, "step": 920 }, { "epoch": 0.25304188650802195, "learning_rate": 9.887545589625827e-07, "loss": 4.6131, "step": 940 }, { "epoch": 0.25842575643372456, "learning_rate": 9.884506281237335e-07, "loss": 4.6146, "step": 960 }, { "epoch": 0.2638096263594272, "learning_rate": 9.881466972848845e-07, "loss": 4.6659, "step": 980 }, { "epoch": 0.26919349628512973, "learning_rate": 9.878427664460353e-07, "loss": 4.6548, "step": 1000 }, { "epoch": 0.27457736621083234, "learning_rate": 9.87538835607186e-07, "loss": 4.5025, "step": 1020 }, { "epoch": 0.27996123613653495, "learning_rate": 9.87234904768337e-07, "loss": 4.5433, "step": 1040 }, { "epoch": 0.28534510606223756, "learning_rate": 9.86930973929488e-07, "loss": 4.6703, "step": 1060 }, { "epoch": 0.2907289759879401, "learning_rate": 9.866270430906388e-07, "loss": 4.4159, "step": 1080 }, { "epoch": 0.29611284591364273, "learning_rate": 9.863231122517896e-07, "loss": 4.5891, "step": 1100 }, { "epoch": 0.30149671583934534, "learning_rate": 9.860191814129406e-07, "loss": 4.5233, "step": 1120 }, { "epoch": 0.3068805857650479, "learning_rate": 9.857152505740916e-07, "loss": 4.48, "step": 1140 }, { "epoch": 0.3122644556907505, "learning_rate": 9.854113197352424e-07, "loss": 4.651, "step": 1160 }, { "epoch": 0.3176483256164531, "learning_rate": 9.851073888963932e-07, "loss": 4.508, "step": 1180 }, { "epoch": 0.3230321955421557, "learning_rate": 9.848034580575442e-07, "loss": 4.361, "step": 1200 }, { "epoch": 0.3284160654678583, "learning_rate": 9.844995272186952e-07, "loss": 4.3891, "step": 1220 }, { "epoch": 0.3337999353935609, "learning_rate": 9.84195596379846e-07, "loss": 4.3769, "step": 1240 }, { "epoch": 0.3391838053192635, "learning_rate": 9.838916655409967e-07, "loss": 4.4891, "step": 1260 }, { "epoch": 0.34456767524496607, "learning_rate": 9.835877347021477e-07, "loss": 4.4237, "step": 1280 }, { "epoch": 0.3499515451706687, "learning_rate": 9.832838038632987e-07, "loss": 4.3194, "step": 1300 }, { "epoch": 0.3553354150963713, "learning_rate": 9.829798730244495e-07, "loss": 4.4627, "step": 1320 }, { "epoch": 0.36071928502207384, "learning_rate": 9.826759421856003e-07, "loss": 4.3827, "step": 1340 }, { "epoch": 0.36610315494777645, "learning_rate": 9.823720113467513e-07, "loss": 4.3972, "step": 1360 }, { "epoch": 0.37148702487347907, "learning_rate": 9.820680805079023e-07, "loss": 4.3642, "step": 1380 }, { "epoch": 0.3768708947991817, "learning_rate": 9.81764149669053e-07, "loss": 4.3781, "step": 1400 }, { "epoch": 0.38225476472488423, "learning_rate": 9.814602188302038e-07, "loss": 4.5385, "step": 1420 }, { "epoch": 0.38763863465058684, "learning_rate": 9.811562879913546e-07, "loss": 4.3083, "step": 1440 }, { "epoch": 0.39302250457628946, "learning_rate": 9.808523571525058e-07, "loss": 4.3722, "step": 1460 }, { "epoch": 0.398406374501992, "learning_rate": 9.805484263136566e-07, "loss": 4.3383, "step": 1480 }, { "epoch": 0.4037902444276946, "learning_rate": 9.802444954748074e-07, "loss": 4.3492, "step": 1500 }, { "epoch": 0.40917411435339723, "learning_rate": 9.799405646359582e-07, "loss": 4.3363, "step": 1520 }, { "epoch": 0.4145579842790998, "learning_rate": 9.796366337971094e-07, "loss": 4.365, "step": 1540 }, { "epoch": 0.4199418542048024, "learning_rate": 9.793327029582602e-07, "loss": 4.3037, "step": 1560 }, { "epoch": 0.425325724130505, "learning_rate": 9.79028772119411e-07, "loss": 4.4482, "step": 1580 }, { "epoch": 0.4307095940562076, "learning_rate": 9.787248412805617e-07, "loss": 4.3282, "step": 1600 }, { "epoch": 0.4360934639819102, "learning_rate": 9.78420910441713e-07, "loss": 4.1645, "step": 1620 }, { "epoch": 0.4414773339076128, "learning_rate": 9.781169796028637e-07, "loss": 4.2391, "step": 1640 }, { "epoch": 0.4468612038333154, "learning_rate": 9.778130487640145e-07, "loss": 4.3259, "step": 1660 }, { "epoch": 0.45224507375901796, "learning_rate": 9.775091179251653e-07, "loss": 4.3063, "step": 1680 }, { "epoch": 0.45762894368472057, "learning_rate": 9.772051870863165e-07, "loss": 4.1056, "step": 1700 }, { "epoch": 0.4630128136104232, "learning_rate": 9.769012562474673e-07, "loss": 4.3614, "step": 1720 }, { "epoch": 0.4683966835361258, "learning_rate": 9.76597325408618e-07, "loss": 4.2459, "step": 1740 }, { "epoch": 0.47378055346182835, "learning_rate": 9.762933945697688e-07, "loss": 4.3372, "step": 1760 }, { "epoch": 0.47916442338753096, "learning_rate": 9.7598946373092e-07, "loss": 4.2036, "step": 1780 }, { "epoch": 0.48454829331323357, "learning_rate": 9.756855328920708e-07, "loss": 4.2001, "step": 1800 }, { "epoch": 0.4899321632389361, "learning_rate": 9.753816020532216e-07, "loss": 4.331, "step": 1820 }, { "epoch": 0.49531603316463874, "learning_rate": 9.750776712143724e-07, "loss": 4.2512, "step": 1840 }, { "epoch": 0.5006999030903413, "learning_rate": 9.747737403755234e-07, "loss": 4.1834, "step": 1860 }, { "epoch": 0.5060837730160439, "learning_rate": 9.744698095366744e-07, "loss": 4.3188, "step": 1880 }, { "epoch": 0.5114676429417465, "learning_rate": 9.741658786978252e-07, "loss": 4.4222, "step": 1900 }, { "epoch": 0.5168515128674491, "learning_rate": 9.73861947858976e-07, "loss": 4.0525, "step": 1920 }, { "epoch": 0.5222353827931517, "learning_rate": 9.73558017020127e-07, "loss": 4.137, "step": 1940 }, { "epoch": 0.5276192527188543, "learning_rate": 9.73254086181278e-07, "loss": 4.1998, "step": 1960 }, { "epoch": 0.533003122644557, "learning_rate": 9.729501553424287e-07, "loss": 4.0201, "step": 1980 }, { "epoch": 0.5383869925702595, "learning_rate": 9.726462245035795e-07, "loss": 4.1648, "step": 2000 }, { "epoch": 0.5437708624959621, "learning_rate": 9.723422936647305e-07, "loss": 4.0576, "step": 2020 }, { "epoch": 0.5491547324216647, "learning_rate": 9.720383628258815e-07, "loss": 4.1884, "step": 2040 }, { "epoch": 0.5545386023473673, "learning_rate": 9.717344319870323e-07, "loss": 4.0947, "step": 2060 }, { "epoch": 0.5599224722730699, "learning_rate": 9.71430501148183e-07, "loss": 4.1478, "step": 2080 }, { "epoch": 0.5653063421987725, "learning_rate": 9.71126570309334e-07, "loss": 4.2073, "step": 2100 }, { "epoch": 0.5706902121244751, "learning_rate": 9.70822639470485e-07, "loss": 4.2448, "step": 2120 }, { "epoch": 0.5760740820501776, "learning_rate": 9.705187086316358e-07, "loss": 4.1576, "step": 2140 }, { "epoch": 0.5814579519758802, "learning_rate": 9.702147777927866e-07, "loss": 3.9818, "step": 2160 }, { "epoch": 0.5868418219015828, "learning_rate": 9.699108469539376e-07, "loss": 4.1782, "step": 2180 }, { "epoch": 0.5922256918272855, "learning_rate": 9.696069161150886e-07, "loss": 4.1193, "step": 2200 }, { "epoch": 0.5976095617529881, "learning_rate": 9.693029852762394e-07, "loss": 3.8744, "step": 2220 }, { "epoch": 0.6029934316786907, "learning_rate": 9.689990544373901e-07, "loss": 3.9768, "step": 2240 }, { "epoch": 0.6083773016043932, "learning_rate": 9.686951235985411e-07, "loss": 4.0686, "step": 2260 }, { "epoch": 0.6137611715300958, "learning_rate": 9.68391192759692e-07, "loss": 4.1936, "step": 2280 }, { "epoch": 0.6191450414557984, "learning_rate": 9.68087261920843e-07, "loss": 4.0601, "step": 2300 }, { "epoch": 0.624528911381501, "learning_rate": 9.677833310819937e-07, "loss": 4.1562, "step": 2320 }, { "epoch": 0.6299127813072036, "learning_rate": 9.674794002431447e-07, "loss": 4.1972, "step": 2340 }, { "epoch": 0.6352966512329062, "learning_rate": 9.671754694042955e-07, "loss": 4.1596, "step": 2360 }, { "epoch": 0.6406805211586089, "learning_rate": 9.668715385654465e-07, "loss": 4.1037, "step": 2380 }, { "epoch": 0.6460643910843114, "learning_rate": 9.665676077265973e-07, "loss": 4.1364, "step": 2400 }, { "epoch": 0.651448261010014, "learning_rate": 9.662636768877483e-07, "loss": 3.9431, "step": 2420 }, { "epoch": 0.6568321309357166, "learning_rate": 9.65959746048899e-07, "loss": 4.0679, "step": 2440 }, { "epoch": 0.6622160008614192, "learning_rate": 9.6565581521005e-07, "loss": 4.0304, "step": 2460 }, { "epoch": 0.6675998707871218, "learning_rate": 9.653518843712008e-07, "loss": 4.1289, "step": 2480 }, { "epoch": 0.6729837407128244, "learning_rate": 9.650479535323516e-07, "loss": 4.1471, "step": 2500 }, { "epoch": 0.678367610638527, "learning_rate": 9.647440226935026e-07, "loss": 4.0465, "step": 2520 }, { "epoch": 0.6837514805642295, "learning_rate": 9.644400918546536e-07, "loss": 4.0011, "step": 2540 }, { "epoch": 0.6891353504899321, "learning_rate": 9.641361610158044e-07, "loss": 4.0198, "step": 2560 }, { "epoch": 0.6945192204156347, "learning_rate": 9.638322301769551e-07, "loss": 4.1341, "step": 2580 }, { "epoch": 0.6999030903413374, "learning_rate": 9.635282993381061e-07, "loss": 3.9473, "step": 2600 }, { "epoch": 0.70528696026704, "learning_rate": 9.632243684992571e-07, "loss": 3.9478, "step": 2620 }, { "epoch": 0.7106708301927426, "learning_rate": 9.62920437660408e-07, "loss": 4.1796, "step": 2640 }, { "epoch": 0.7160547001184452, "learning_rate": 9.626165068215587e-07, "loss": 4.0625, "step": 2660 }, { "epoch": 0.7214385700441477, "learning_rate": 9.623125759827097e-07, "loss": 3.9337, "step": 2680 }, { "epoch": 0.7268224399698503, "learning_rate": 9.620086451438607e-07, "loss": 4.0254, "step": 2700 }, { "epoch": 0.7322063098955529, "learning_rate": 9.617047143050115e-07, "loss": 4.0872, "step": 2720 }, { "epoch": 0.7375901798212555, "learning_rate": 9.614007834661623e-07, "loss": 4.0988, "step": 2740 }, { "epoch": 0.7429740497469581, "learning_rate": 9.610968526273132e-07, "loss": 3.9557, "step": 2760 }, { "epoch": 0.7483579196726607, "learning_rate": 9.60792921788464e-07, "loss": 4.2914, "step": 2780 }, { "epoch": 0.7537417895983634, "learning_rate": 9.60488990949615e-07, "loss": 4.1434, "step": 2800 }, { "epoch": 0.7591256595240659, "learning_rate": 9.601850601107658e-07, "loss": 3.8402, "step": 2820 }, { "epoch": 0.7645095294497685, "learning_rate": 9.598811292719168e-07, "loss": 3.9003, "step": 2840 }, { "epoch": 0.7698933993754711, "learning_rate": 9.595771984330676e-07, "loss": 3.8761, "step": 2860 }, { "epoch": 0.7752772693011737, "learning_rate": 9.592732675942186e-07, "loss": 3.9404, "step": 2880 }, { "epoch": 0.7806611392268763, "learning_rate": 9.589693367553694e-07, "loss": 3.859, "step": 2900 }, { "epoch": 0.7860450091525789, "learning_rate": 9.586654059165201e-07, "loss": 3.9817, "step": 2920 }, { "epoch": 0.7914288790782815, "learning_rate": 9.583614750776711e-07, "loss": 4.0039, "step": 2940 }, { "epoch": 0.796812749003984, "learning_rate": 9.580575442388221e-07, "loss": 4.0198, "step": 2960 }, { "epoch": 0.8021966189296866, "learning_rate": 9.57753613399973e-07, "loss": 3.945, "step": 2980 }, { "epoch": 0.8075804888553892, "learning_rate": 9.574496825611237e-07, "loss": 3.9635, "step": 3000 }, { "epoch": 0.8129643587810919, "learning_rate": 9.571457517222747e-07, "loss": 3.8891, "step": 3020 }, { "epoch": 0.8183482287067945, "learning_rate": 9.568418208834257e-07, "loss": 3.9844, "step": 3040 }, { "epoch": 0.8237320986324971, "learning_rate": 9.565378900445765e-07, "loss": 4.0569, "step": 3060 }, { "epoch": 0.8291159685581996, "learning_rate": 9.562339592057273e-07, "loss": 3.8665, "step": 3080 }, { "epoch": 0.8344998384839022, "learning_rate": 9.559300283668782e-07, "loss": 3.9799, "step": 3100 }, { "epoch": 0.8398837084096048, "learning_rate": 9.556260975280292e-07, "loss": 4.0793, "step": 3120 }, { "epoch": 0.8452675783353074, "learning_rate": 9.5532216668918e-07, "loss": 3.941, "step": 3140 }, { "epoch": 0.85065144826101, "learning_rate": 9.550182358503308e-07, "loss": 3.9981, "step": 3160 }, { "epoch": 0.8560353181867126, "learning_rate": 9.547143050114818e-07, "loss": 3.9694, "step": 3180 }, { "epoch": 0.8614191881124152, "learning_rate": 9.544103741726328e-07, "loss": 3.9918, "step": 3200 }, { "epoch": 0.8668030580381177, "learning_rate": 9.541064433337836e-07, "loss": 3.8542, "step": 3220 }, { "epoch": 0.8721869279638204, "learning_rate": 9.538025124949344e-07, "loss": 3.7751, "step": 3240 }, { "epoch": 0.877570797889523, "learning_rate": 9.534985816560854e-07, "loss": 3.9786, "step": 3260 }, { "epoch": 0.8829546678152256, "learning_rate": 9.531946508172361e-07, "loss": 3.9525, "step": 3280 }, { "epoch": 0.8883385377409282, "learning_rate": 9.528907199783871e-07, "loss": 3.8379, "step": 3300 }, { "epoch": 0.8937224076666308, "learning_rate": 9.52586789139538e-07, "loss": 3.9449, "step": 3320 }, { "epoch": 0.8991062775923334, "learning_rate": 9.522828583006888e-07, "loss": 3.891, "step": 3340 }, { "epoch": 0.9044901475180359, "learning_rate": 9.519789274618397e-07, "loss": 3.8922, "step": 3360 }, { "epoch": 0.9098740174437385, "learning_rate": 9.516749966229907e-07, "loss": 3.9657, "step": 3380 }, { "epoch": 0.9152578873694411, "learning_rate": 9.513710657841416e-07, "loss": 3.9795, "step": 3400 }, { "epoch": 0.9206417572951437, "learning_rate": 9.510671349452924e-07, "loss": 3.9513, "step": 3420 }, { "epoch": 0.9260256272208464, "learning_rate": 9.507632041064432e-07, "loss": 4.0202, "step": 3440 }, { "epoch": 0.931409497146549, "learning_rate": 9.504592732675942e-07, "loss": 3.9291, "step": 3460 }, { "epoch": 0.9367933670722516, "learning_rate": 9.501553424287451e-07, "loss": 4.1216, "step": 3480 }, { "epoch": 0.9421772369979541, "learning_rate": 9.498514115898959e-07, "loss": 3.9733, "step": 3500 }, { "epoch": 0.9475611069236567, "learning_rate": 9.495474807510468e-07, "loss": 3.8097, "step": 3520 }, { "epoch": 0.9529449768493593, "learning_rate": 9.492435499121978e-07, "loss": 3.8426, "step": 3540 }, { "epoch": 0.9583288467750619, "learning_rate": 9.489396190733485e-07, "loss": 3.9829, "step": 3560 }, { "epoch": 0.9637127167007645, "learning_rate": 9.486356882344995e-07, "loss": 3.646, "step": 3580 }, { "epoch": 0.9690965866264671, "learning_rate": 9.483317573956504e-07, "loss": 3.7726, "step": 3600 }, { "epoch": 0.9744804565521697, "learning_rate": 9.480278265568012e-07, "loss": 3.7979, "step": 3620 }, { "epoch": 0.9798643264778722, "learning_rate": 9.477238957179521e-07, "loss": 3.8609, "step": 3640 }, { "epoch": 0.9852481964035749, "learning_rate": 9.47419964879103e-07, "loss": 3.7803, "step": 3660 }, { "epoch": 0.9906320663292775, "learning_rate": 9.471160340402539e-07, "loss": 3.9751, "step": 3680 }, { "epoch": 0.9960159362549801, "learning_rate": 9.468121032014048e-07, "loss": 3.7751, "step": 3700 } ], "logging_steps": 20, "max_steps": 59424, "num_input_tokens_seen": 0, "num_train_epochs": 16, "save_steps": 10000.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.454634472726528e+16, "train_batch_size": 5, "trial_name": null, "trial_params": null }