{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1548, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01937984496124031, "grad_norm": 3.1349940241372636, "learning_rate": 2e-06, "loss": 0.7225, "step": 10 }, { "epoch": 0.03875968992248062, "grad_norm": 1.478554813389481, "learning_rate": 2e-06, "loss": 0.6518, "step": 20 }, { "epoch": 0.05813953488372093, "grad_norm": 1.40892259720196, "learning_rate": 2e-06, "loss": 0.6267, "step": 30 }, { "epoch": 0.07751937984496124, "grad_norm": 1.6253951402557405, "learning_rate": 2e-06, "loss": 0.6193, "step": 40 }, { "epoch": 0.09689922480620156, "grad_norm": 1.474392955252126, "learning_rate": 2e-06, "loss": 0.6139, "step": 50 }, { "epoch": 0.11627906976744186, "grad_norm": 3.0751907911279477, "learning_rate": 2e-06, "loss": 0.6069, "step": 60 }, { "epoch": 0.13565891472868216, "grad_norm": 2.9223888412970873, "learning_rate": 2e-06, "loss": 0.6024, "step": 70 }, { "epoch": 0.15503875968992248, "grad_norm": 2.357095375310306, "learning_rate": 2e-06, "loss": 0.6052, "step": 80 }, { "epoch": 0.1744186046511628, "grad_norm": 1.3933025656894271, "learning_rate": 2e-06, "loss": 0.5906, "step": 90 }, { "epoch": 0.1937984496124031, "grad_norm": 1.820645926185548, "learning_rate": 2e-06, "loss": 0.5929, "step": 100 }, { "epoch": 0.2131782945736434, "grad_norm": 1.7924399526821087, "learning_rate": 2e-06, "loss": 0.5958, "step": 110 }, { "epoch": 0.23255813953488372, "grad_norm": 1.7736991192509453, "learning_rate": 2e-06, "loss": 0.587, "step": 120 }, { "epoch": 0.25193798449612403, "grad_norm": 1.9201526538063434, "learning_rate": 2e-06, "loss": 0.5972, "step": 130 }, { "epoch": 0.2713178294573643, "grad_norm": 1.9904730096438479, "learning_rate": 2e-06, "loss": 0.5837, "step": 140 }, { "epoch": 0.29069767441860467, "grad_norm": 1.706489075036192, "learning_rate": 2e-06, "loss": 0.5837, "step": 150 }, { "epoch": 0.31007751937984496, "grad_norm": 1.6008690300286739, "learning_rate": 2e-06, "loss": 0.5824, "step": 160 }, { "epoch": 0.32945736434108525, "grad_norm": 1.7784490780851796, "learning_rate": 2e-06, "loss": 0.5776, "step": 170 }, { "epoch": 0.3488372093023256, "grad_norm": 1.4254130054140608, "learning_rate": 2e-06, "loss": 0.5817, "step": 180 }, { "epoch": 0.3682170542635659, "grad_norm": 1.4517537061348775, "learning_rate": 2e-06, "loss": 0.5745, "step": 190 }, { "epoch": 0.3875968992248062, "grad_norm": 1.421098096153975, "learning_rate": 2e-06, "loss": 0.5822, "step": 200 }, { "epoch": 0.4069767441860465, "grad_norm": 1.4340247949454419, "learning_rate": 2e-06, "loss": 0.5803, "step": 210 }, { "epoch": 0.4263565891472868, "grad_norm": 1.2607995600168784, "learning_rate": 2e-06, "loss": 0.5683, "step": 220 }, { "epoch": 0.44573643410852715, "grad_norm": 1.4088258448752224, "learning_rate": 2e-06, "loss": 0.5669, "step": 230 }, { "epoch": 0.46511627906976744, "grad_norm": 1.4547922751942315, "learning_rate": 2e-06, "loss": 0.5711, "step": 240 }, { "epoch": 0.4844961240310077, "grad_norm": 1.3166116102995473, "learning_rate": 2e-06, "loss": 0.5715, "step": 250 }, { "epoch": 0.5038759689922481, "grad_norm": 1.2865076146935452, "learning_rate": 2e-06, "loss": 0.5706, "step": 260 }, { "epoch": 0.5232558139534884, "grad_norm": 1.9514850118526776, "learning_rate": 2e-06, "loss": 0.5711, "step": 270 }, { "epoch": 0.5426356589147286, "grad_norm": 1.3656201312041214, "learning_rate": 2e-06, "loss": 0.5665, "step": 280 }, { "epoch": 0.562015503875969, "grad_norm": 1.2117280702790532, "learning_rate": 2e-06, "loss": 0.5685, "step": 290 }, { "epoch": 0.5813953488372093, "grad_norm": 1.5936964457861214, "learning_rate": 2e-06, "loss": 0.5767, "step": 300 }, { "epoch": 0.6007751937984496, "grad_norm": 1.262836540024248, "learning_rate": 2e-06, "loss": 0.5639, "step": 310 }, { "epoch": 0.6201550387596899, "grad_norm": 1.5253983364542758, "learning_rate": 2e-06, "loss": 0.5613, "step": 320 }, { "epoch": 0.6395348837209303, "grad_norm": 1.695119554349292, "learning_rate": 2e-06, "loss": 0.5629, "step": 330 }, { "epoch": 0.6589147286821705, "grad_norm": 1.2779875033648296, "learning_rate": 2e-06, "loss": 0.5591, "step": 340 }, { "epoch": 0.6782945736434108, "grad_norm": 1.6166948739589506, "learning_rate": 2e-06, "loss": 0.5561, "step": 350 }, { "epoch": 0.6976744186046512, "grad_norm": 1.3469984241923647, "learning_rate": 2e-06, "loss": 0.561, "step": 360 }, { "epoch": 0.7170542635658915, "grad_norm": 1.2982439077151307, "learning_rate": 2e-06, "loss": 0.5662, "step": 370 }, { "epoch": 0.7364341085271318, "grad_norm": 1.3681013360427425, "learning_rate": 2e-06, "loss": 0.5671, "step": 380 }, { "epoch": 0.7558139534883721, "grad_norm": 1.2116770625468472, "learning_rate": 2e-06, "loss": 0.5568, "step": 390 }, { "epoch": 0.7751937984496124, "grad_norm": 1.433614262268133, "learning_rate": 2e-06, "loss": 0.5581, "step": 400 }, { "epoch": 0.7945736434108527, "grad_norm": 1.1848821794193687, "learning_rate": 2e-06, "loss": 0.5637, "step": 410 }, { "epoch": 0.813953488372093, "grad_norm": 1.3158386135055107, "learning_rate": 2e-06, "loss": 0.5582, "step": 420 }, { "epoch": 0.8333333333333334, "grad_norm": 2.3927036985893504, "learning_rate": 2e-06, "loss": 0.5612, "step": 430 }, { "epoch": 0.8527131782945736, "grad_norm": 1.1806090504426345, "learning_rate": 2e-06, "loss": 0.5588, "step": 440 }, { "epoch": 0.872093023255814, "grad_norm": 1.2223208556627352, "learning_rate": 2e-06, "loss": 0.5621, "step": 450 }, { "epoch": 0.8914728682170543, "grad_norm": 1.1512657163820463, "learning_rate": 2e-06, "loss": 0.5513, "step": 460 }, { "epoch": 0.9108527131782945, "grad_norm": 1.1826527683444463, "learning_rate": 2e-06, "loss": 0.5577, "step": 470 }, { "epoch": 0.9302325581395349, "grad_norm": 1.578741070129489, "learning_rate": 2e-06, "loss": 0.5584, "step": 480 }, { "epoch": 0.9496124031007752, "grad_norm": 1.364010417258464, "learning_rate": 2e-06, "loss": 0.5517, "step": 490 }, { "epoch": 0.9689922480620154, "grad_norm": 1.564561079833014, "learning_rate": 2e-06, "loss": 0.5528, "step": 500 }, { "epoch": 0.9883720930232558, "grad_norm": 1.1934813597924194, "learning_rate": 2e-06, "loss": 0.5573, "step": 510 }, { "epoch": 1.0077519379844961, "grad_norm": 2.2183985963794735, "learning_rate": 2e-06, "loss": 0.5359, "step": 520 }, { "epoch": 1.0271317829457365, "grad_norm": 1.444929607797708, "learning_rate": 2e-06, "loss": 0.5096, "step": 530 }, { "epoch": 1.0465116279069768, "grad_norm": 1.3333146357476189, "learning_rate": 2e-06, "loss": 0.5072, "step": 540 }, { "epoch": 1.0658914728682172, "grad_norm": 1.1259644665340547, "learning_rate": 2e-06, "loss": 0.5052, "step": 550 }, { "epoch": 1.0852713178294573, "grad_norm": 1.4403172490091567, "learning_rate": 2e-06, "loss": 0.5169, "step": 560 }, { "epoch": 1.1046511627906976, "grad_norm": 1.157835174296272, "learning_rate": 2e-06, "loss": 0.5102, "step": 570 }, { "epoch": 1.124031007751938, "grad_norm": 1.1960407146223624, "learning_rate": 2e-06, "loss": 0.5038, "step": 580 }, { "epoch": 1.1434108527131783, "grad_norm": 1.253507090792846, "learning_rate": 2e-06, "loss": 0.5034, "step": 590 }, { "epoch": 1.1627906976744187, "grad_norm": 1.2979047842512628, "learning_rate": 2e-06, "loss": 0.5038, "step": 600 }, { "epoch": 1.1821705426356588, "grad_norm": 1.2576281990523717, "learning_rate": 2e-06, "loss": 0.5137, "step": 610 }, { "epoch": 1.2015503875968991, "grad_norm": 1.2748802075166439, "learning_rate": 2e-06, "loss": 0.5076, "step": 620 }, { "epoch": 1.2209302325581395, "grad_norm": 1.5223812042187916, "learning_rate": 2e-06, "loss": 0.5077, "step": 630 }, { "epoch": 1.2403100775193798, "grad_norm": 1.3632540604107077, "learning_rate": 2e-06, "loss": 0.5083, "step": 640 }, { "epoch": 1.2596899224806202, "grad_norm": 1.2321831811826818, "learning_rate": 2e-06, "loss": 0.5096, "step": 650 }, { "epoch": 1.2790697674418605, "grad_norm": 1.3323190632353188, "learning_rate": 2e-06, "loss": 0.5063, "step": 660 }, { "epoch": 1.2984496124031009, "grad_norm": 1.2066604050023704, "learning_rate": 2e-06, "loss": 0.5122, "step": 670 }, { "epoch": 1.3178294573643412, "grad_norm": 1.186727847270962, "learning_rate": 2e-06, "loss": 0.5121, "step": 680 }, { "epoch": 1.3372093023255813, "grad_norm": 1.397263671569467, "learning_rate": 2e-06, "loss": 0.5089, "step": 690 }, { "epoch": 1.3565891472868217, "grad_norm": 1.573664700028339, "learning_rate": 2e-06, "loss": 0.5099, "step": 700 }, { "epoch": 1.375968992248062, "grad_norm": 1.4467914138897073, "learning_rate": 2e-06, "loss": 0.5134, "step": 710 }, { "epoch": 1.3953488372093024, "grad_norm": 1.3706902755992394, "learning_rate": 2e-06, "loss": 0.5123, "step": 720 }, { "epoch": 1.4147286821705427, "grad_norm": 1.2411084475779852, "learning_rate": 2e-06, "loss": 0.5078, "step": 730 }, { "epoch": 1.4341085271317828, "grad_norm": 1.5902510539142722, "learning_rate": 2e-06, "loss": 0.5137, "step": 740 }, { "epoch": 1.4534883720930232, "grad_norm": 1.7787034613442634, "learning_rate": 2e-06, "loss": 0.5094, "step": 750 }, { "epoch": 1.4728682170542635, "grad_norm": 1.2787619752439543, "learning_rate": 2e-06, "loss": 0.5063, "step": 760 }, { "epoch": 1.4922480620155039, "grad_norm": 1.2898677794347344, "learning_rate": 2e-06, "loss": 0.5097, "step": 770 }, { "epoch": 1.5116279069767442, "grad_norm": 1.2320340010541546, "learning_rate": 2e-06, "loss": 0.5127, "step": 780 }, { "epoch": 1.5310077519379846, "grad_norm": 1.1998558863263413, "learning_rate": 2e-06, "loss": 0.5097, "step": 790 }, { "epoch": 1.550387596899225, "grad_norm": 1.2491228211939762, "learning_rate": 2e-06, "loss": 0.507, "step": 800 }, { "epoch": 1.5697674418604652, "grad_norm": 1.2235970378609549, "learning_rate": 2e-06, "loss": 0.5052, "step": 810 }, { "epoch": 1.5891472868217056, "grad_norm": 1.4176990973616905, "learning_rate": 2e-06, "loss": 0.5039, "step": 820 }, { "epoch": 1.6085271317829457, "grad_norm": 1.3149977315253063, "learning_rate": 2e-06, "loss": 0.512, "step": 830 }, { "epoch": 1.627906976744186, "grad_norm": 1.2333272112594988, "learning_rate": 2e-06, "loss": 0.5066, "step": 840 }, { "epoch": 1.6472868217054264, "grad_norm": 1.1574106550887124, "learning_rate": 2e-06, "loss": 0.5113, "step": 850 }, { "epoch": 1.6666666666666665, "grad_norm": 1.418576038122765, "learning_rate": 2e-06, "loss": 0.5137, "step": 860 }, { "epoch": 1.6860465116279069, "grad_norm": 1.187391106828372, "learning_rate": 2e-06, "loss": 0.506, "step": 870 }, { "epoch": 1.7054263565891472, "grad_norm": 1.170649147011855, "learning_rate": 2e-06, "loss": 0.5107, "step": 880 }, { "epoch": 1.7248062015503876, "grad_norm": 1.3798145976951228, "learning_rate": 2e-06, "loss": 0.5099, "step": 890 }, { "epoch": 1.744186046511628, "grad_norm": 1.725987629268818, "learning_rate": 2e-06, "loss": 0.5139, "step": 900 }, { "epoch": 1.7635658914728682, "grad_norm": 1.8813904948358928, "learning_rate": 2e-06, "loss": 0.5059, "step": 910 }, { "epoch": 1.7829457364341086, "grad_norm": 1.8516754910417244, "learning_rate": 2e-06, "loss": 0.5088, "step": 920 }, { "epoch": 1.802325581395349, "grad_norm": 1.2095051182172416, "learning_rate": 2e-06, "loss": 0.5096, "step": 930 }, { "epoch": 1.8217054263565893, "grad_norm": 1.2435069711568396, "learning_rate": 2e-06, "loss": 0.5077, "step": 940 }, { "epoch": 1.8410852713178296, "grad_norm": 1.3586556706802664, "learning_rate": 2e-06, "loss": 0.5102, "step": 950 }, { "epoch": 1.8604651162790697, "grad_norm": 1.1979327298252027, "learning_rate": 2e-06, "loss": 0.512, "step": 960 }, { "epoch": 1.87984496124031, "grad_norm": 1.7904993765105046, "learning_rate": 2e-06, "loss": 0.5096, "step": 970 }, { "epoch": 1.8992248062015504, "grad_norm": 1.377601184927356, "learning_rate": 2e-06, "loss": 0.5057, "step": 980 }, { "epoch": 1.9186046511627906, "grad_norm": 1.1816932854326225, "learning_rate": 2e-06, "loss": 0.5118, "step": 990 }, { "epoch": 1.937984496124031, "grad_norm": 1.235638000745403, "learning_rate": 2e-06, "loss": 0.5092, "step": 1000 }, { "epoch": 1.9573643410852712, "grad_norm": 1.2730535171048605, "learning_rate": 2e-06, "loss": 0.5104, "step": 1010 }, { "epoch": 1.9767441860465116, "grad_norm": 1.2382473568182877, "learning_rate": 2e-06, "loss": 0.5059, "step": 1020 }, { "epoch": 1.996124031007752, "grad_norm": 1.2856863827108878, "learning_rate": 2e-06, "loss": 0.51, "step": 1030 }, { "epoch": 2.0155038759689923, "grad_norm": 1.6836529854231972, "learning_rate": 2e-06, "loss": 0.4603, "step": 1040 }, { "epoch": 2.0348837209302326, "grad_norm": 1.4922089047807388, "learning_rate": 2e-06, "loss": 0.4585, "step": 1050 }, { "epoch": 2.054263565891473, "grad_norm": 1.3990249605527378, "learning_rate": 2e-06, "loss": 0.4571, "step": 1060 }, { "epoch": 2.0736434108527133, "grad_norm": 1.5500783972286114, "learning_rate": 2e-06, "loss": 0.4591, "step": 1070 }, { "epoch": 2.0930232558139537, "grad_norm": 1.4063766317234851, "learning_rate": 2e-06, "loss": 0.4537, "step": 1080 }, { "epoch": 2.112403100775194, "grad_norm": 1.4498154054826256, "learning_rate": 2e-06, "loss": 0.4567, "step": 1090 }, { "epoch": 2.1317829457364343, "grad_norm": 1.4659238804284036, "learning_rate": 2e-06, "loss": 0.4564, "step": 1100 }, { "epoch": 2.1511627906976742, "grad_norm": 1.4331071288445956, "learning_rate": 2e-06, "loss": 0.4597, "step": 1110 }, { "epoch": 2.1705426356589146, "grad_norm": 1.2265580968692957, "learning_rate": 2e-06, "loss": 0.4613, "step": 1120 }, { "epoch": 2.189922480620155, "grad_norm": 1.430430093258902, "learning_rate": 2e-06, "loss": 0.4582, "step": 1130 }, { "epoch": 2.2093023255813953, "grad_norm": 1.3191685780949374, "learning_rate": 2e-06, "loss": 0.4567, "step": 1140 }, { "epoch": 2.2286821705426356, "grad_norm": 1.4143557981830728, "learning_rate": 2e-06, "loss": 0.4593, "step": 1150 }, { "epoch": 2.248062015503876, "grad_norm": 1.2038927572265354, "learning_rate": 2e-06, "loss": 0.4571, "step": 1160 }, { "epoch": 2.2674418604651163, "grad_norm": 1.5101700084835745, "learning_rate": 2e-06, "loss": 0.4576, "step": 1170 }, { "epoch": 2.2868217054263567, "grad_norm": 1.3238249446175274, "learning_rate": 2e-06, "loss": 0.4648, "step": 1180 }, { "epoch": 2.306201550387597, "grad_norm": 1.4453440150118313, "learning_rate": 2e-06, "loss": 0.46, "step": 1190 }, { "epoch": 2.3255813953488373, "grad_norm": 1.291302766406732, "learning_rate": 2e-06, "loss": 0.4564, "step": 1200 }, { "epoch": 2.3449612403100777, "grad_norm": 1.3371668345858843, "learning_rate": 2e-06, "loss": 0.4605, "step": 1210 }, { "epoch": 2.3643410852713176, "grad_norm": 1.4583613955952517, "learning_rate": 2e-06, "loss": 0.4639, "step": 1220 }, { "epoch": 2.383720930232558, "grad_norm": 1.3603243092911215, "learning_rate": 2e-06, "loss": 0.4591, "step": 1230 }, { "epoch": 2.4031007751937983, "grad_norm": 1.4381588797038276, "learning_rate": 2e-06, "loss": 0.4654, "step": 1240 }, { "epoch": 2.4224806201550386, "grad_norm": 1.2897514442264095, "learning_rate": 2e-06, "loss": 0.46, "step": 1250 }, { "epoch": 2.441860465116279, "grad_norm": 1.4149740934308317, "learning_rate": 2e-06, "loss": 0.462, "step": 1260 }, { "epoch": 2.4612403100775193, "grad_norm": 1.2334708007117, "learning_rate": 2e-06, "loss": 0.4636, "step": 1270 }, { "epoch": 2.4806201550387597, "grad_norm": 1.3277525646372448, "learning_rate": 2e-06, "loss": 0.4637, "step": 1280 }, { "epoch": 2.5, "grad_norm": 1.341129908785728, "learning_rate": 2e-06, "loss": 0.4572, "step": 1290 }, { "epoch": 2.5193798449612403, "grad_norm": 1.2028868010871578, "learning_rate": 2e-06, "loss": 0.4643, "step": 1300 }, { "epoch": 2.5387596899224807, "grad_norm": 1.2788628805207698, "learning_rate": 2e-06, "loss": 0.458, "step": 1310 }, { "epoch": 2.558139534883721, "grad_norm": 1.1964555955603347, "learning_rate": 2e-06, "loss": 0.4702, "step": 1320 }, { "epoch": 2.5775193798449614, "grad_norm": 1.2630513178641603, "learning_rate": 2e-06, "loss": 0.4622, "step": 1330 }, { "epoch": 2.5968992248062017, "grad_norm": 1.4987867600057845, "learning_rate": 2e-06, "loss": 0.4708, "step": 1340 }, { "epoch": 2.616279069767442, "grad_norm": 1.328908367081974, "learning_rate": 2e-06, "loss": 0.4622, "step": 1350 }, { "epoch": 2.6356589147286824, "grad_norm": 1.572958887182858, "learning_rate": 2e-06, "loss": 0.459, "step": 1360 }, { "epoch": 2.6550387596899228, "grad_norm": 1.2305373948782317, "learning_rate": 2e-06, "loss": 0.4597, "step": 1370 }, { "epoch": 2.6744186046511627, "grad_norm": 1.231294042248163, "learning_rate": 2e-06, "loss": 0.468, "step": 1380 }, { "epoch": 2.693798449612403, "grad_norm": 1.2696874963913714, "learning_rate": 2e-06, "loss": 0.4598, "step": 1390 }, { "epoch": 2.7131782945736433, "grad_norm": 1.2272319783021322, "learning_rate": 2e-06, "loss": 0.4678, "step": 1400 }, { "epoch": 2.7325581395348837, "grad_norm": 1.3352293934792803, "learning_rate": 2e-06, "loss": 0.4594, "step": 1410 }, { "epoch": 2.751937984496124, "grad_norm": 1.306891032131746, "learning_rate": 2e-06, "loss": 0.4613, "step": 1420 }, { "epoch": 2.7713178294573644, "grad_norm": 1.2665315216624808, "learning_rate": 2e-06, "loss": 0.4667, "step": 1430 }, { "epoch": 2.7906976744186047, "grad_norm": 1.3240308251203166, "learning_rate": 2e-06, "loss": 0.4656, "step": 1440 }, { "epoch": 2.810077519379845, "grad_norm": 1.4793024854316217, "learning_rate": 2e-06, "loss": 0.463, "step": 1450 }, { "epoch": 2.8294573643410854, "grad_norm": 1.4217782757634918, "learning_rate": 2e-06, "loss": 0.4703, "step": 1460 }, { "epoch": 2.8488372093023253, "grad_norm": 1.2290713413876615, "learning_rate": 2e-06, "loss": 0.4661, "step": 1470 }, { "epoch": 2.8682170542635657, "grad_norm": 1.4408298205910421, "learning_rate": 2e-06, "loss": 0.4667, "step": 1480 }, { "epoch": 2.887596899224806, "grad_norm": 1.4463509582697884, "learning_rate": 2e-06, "loss": 0.4692, "step": 1490 }, { "epoch": 2.9069767441860463, "grad_norm": 1.3757313989698736, "learning_rate": 2e-06, "loss": 0.4585, "step": 1500 }, { "epoch": 2.9263565891472867, "grad_norm": 1.3950705796217753, "learning_rate": 2e-06, "loss": 0.4627, "step": 1510 }, { "epoch": 2.945736434108527, "grad_norm": 1.269632815277089, "learning_rate": 2e-06, "loss": 0.4679, "step": 1520 }, { "epoch": 2.9651162790697674, "grad_norm": 1.3792098795510677, "learning_rate": 2e-06, "loss": 0.463, "step": 1530 }, { "epoch": 2.9844961240310077, "grad_norm": 1.2723166198917764, "learning_rate": 2e-06, "loss": 0.468, "step": 1540 }, { "epoch": 3.0, "step": 1548, "total_flos": 2591282618695680.0, "train_loss": 0.1541580106552873, "train_runtime": 7454.5076, "train_samples_per_second": 106.232, "train_steps_per_second": 0.208 } ], "logging_steps": 10, "max_steps": 1548, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2591282618695680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }