{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.993050193050193,
  "eval_steps": 500,
  "global_step": 969,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03088803088803089,
      "grad_norm": 109.2129922948738,
      "learning_rate": 5e-06,
      "loss": 0.8218,
      "step": 10
    },
    {
      "epoch": 0.06177606177606178,
      "grad_norm": 1.5509789288186628,
      "learning_rate": 5e-06,
      "loss": 0.7505,
      "step": 20
    },
    {
      "epoch": 0.09266409266409266,
      "grad_norm": 1.0237535334293637,
      "learning_rate": 5e-06,
      "loss": 0.7022,
      "step": 30
    },
    {
      "epoch": 0.12355212355212356,
      "grad_norm": 0.9555198347715455,
      "learning_rate": 5e-06,
      "loss": 0.6862,
      "step": 40
    },
    {
      "epoch": 0.15444015444015444,
      "grad_norm": 0.9962768534382307,
      "learning_rate": 5e-06,
      "loss": 0.6721,
      "step": 50
    },
    {
      "epoch": 0.18532818532818532,
      "grad_norm": 1.7530907194148484,
      "learning_rate": 5e-06,
      "loss": 0.6509,
      "step": 60
    },
    {
      "epoch": 0.21621621621621623,
      "grad_norm": 0.7959383217983834,
      "learning_rate": 5e-06,
      "loss": 0.6472,
      "step": 70
    },
    {
      "epoch": 0.2471042471042471,
      "grad_norm": 0.7944327039168986,
      "learning_rate": 5e-06,
      "loss": 0.65,
      "step": 80
    },
    {
      "epoch": 0.277992277992278,
      "grad_norm": 0.6922868471685667,
      "learning_rate": 5e-06,
      "loss": 0.639,
      "step": 90
    },
    {
      "epoch": 0.3088803088803089,
      "grad_norm": 0.6117044147831933,
      "learning_rate": 5e-06,
      "loss": 0.6398,
      "step": 100
    },
    {
      "epoch": 0.33976833976833976,
      "grad_norm": 0.6349747322397297,
      "learning_rate": 5e-06,
      "loss": 0.6335,
      "step": 110
    },
    {
      "epoch": 0.37065637065637064,
      "grad_norm": 0.6236670890266053,
      "learning_rate": 5e-06,
      "loss": 0.6337,
      "step": 120
    },
    {
      "epoch": 0.4015444015444015,
      "grad_norm": 0.6724570938621872,
      "learning_rate": 5e-06,
      "loss": 0.6255,
      "step": 130
    },
    {
      "epoch": 0.43243243243243246,
      "grad_norm": 0.8157933757633341,
      "learning_rate": 5e-06,
      "loss": 0.6283,
      "step": 140
    },
    {
      "epoch": 0.46332046332046334,
      "grad_norm": 0.5839495663310666,
      "learning_rate": 5e-06,
      "loss": 0.6214,
      "step": 150
    },
    {
      "epoch": 0.4942084942084942,
      "grad_norm": 0.5048909710838132,
      "learning_rate": 5e-06,
      "loss": 0.6243,
      "step": 160
    },
    {
      "epoch": 0.525096525096525,
      "grad_norm": 0.8095059876811945,
      "learning_rate": 5e-06,
      "loss": 0.6251,
      "step": 170
    },
    {
      "epoch": 0.555984555984556,
      "grad_norm": 0.5136815801780633,
      "learning_rate": 5e-06,
      "loss": 0.6145,
      "step": 180
    },
    {
      "epoch": 0.5868725868725869,
      "grad_norm": 0.6364035663779027,
      "learning_rate": 5e-06,
      "loss": 0.6298,
      "step": 190
    },
    {
      "epoch": 0.6177606177606177,
      "grad_norm": 0.6844817393850995,
      "learning_rate": 5e-06,
      "loss": 0.6197,
      "step": 200
    },
    {
      "epoch": 0.6486486486486487,
      "grad_norm": 0.7483950060039181,
      "learning_rate": 5e-06,
      "loss": 0.6269,
      "step": 210
    },
    {
      "epoch": 0.6795366795366795,
      "grad_norm": 0.7350511698635498,
      "learning_rate": 5e-06,
      "loss": 0.6175,
      "step": 220
    },
    {
      "epoch": 0.7104247104247104,
      "grad_norm": 0.6246522556489467,
      "learning_rate": 5e-06,
      "loss": 0.6172,
      "step": 230
    },
    {
      "epoch": 0.7413127413127413,
      "grad_norm": 0.5845884993818068,
      "learning_rate": 5e-06,
      "loss": 0.6119,
      "step": 240
    },
    {
      "epoch": 0.7722007722007722,
      "grad_norm": 0.6246906999878916,
      "learning_rate": 5e-06,
      "loss": 0.6191,
      "step": 250
    },
    {
      "epoch": 0.803088803088803,
      "grad_norm": 0.5725768617907299,
      "learning_rate": 5e-06,
      "loss": 0.6111,
      "step": 260
    },
    {
      "epoch": 0.833976833976834,
      "grad_norm": 0.7575125169415168,
      "learning_rate": 5e-06,
      "loss": 0.6078,
      "step": 270
    },
    {
      "epoch": 0.8648648648648649,
      "grad_norm": 0.6292537869493213,
      "learning_rate": 5e-06,
      "loss": 0.6119,
      "step": 280
    },
    {
      "epoch": 0.8957528957528957,
      "grad_norm": 0.6613666650568315,
      "learning_rate": 5e-06,
      "loss": 0.6219,
      "step": 290
    },
    {
      "epoch": 0.9266409266409267,
      "grad_norm": 0.482392770012809,
      "learning_rate": 5e-06,
      "loss": 0.6032,
      "step": 300
    },
    {
      "epoch": 0.9575289575289575,
      "grad_norm": 0.5691595473530817,
      "learning_rate": 5e-06,
      "loss": 0.6137,
      "step": 310
    },
    {
      "epoch": 0.9884169884169884,
      "grad_norm": 0.5605696956503327,
      "learning_rate": 5e-06,
      "loss": 0.6059,
      "step": 320
    },
    {
      "epoch": 0.9976833976833976,
      "eval_loss": 0.611504316329956,
      "eval_runtime": 174.6046,
      "eval_samples_per_second": 49.941,
      "eval_steps_per_second": 0.395,
      "step": 323
    },
    {
      "epoch": 1.0193050193050193,
      "grad_norm": 0.7653552774499555,
      "learning_rate": 5e-06,
      "loss": 0.577,
      "step": 330
    },
    {
      "epoch": 1.05019305019305,
      "grad_norm": 0.6717959962039358,
      "learning_rate": 5e-06,
      "loss": 0.5516,
      "step": 340
    },
    {
      "epoch": 1.0810810810810811,
      "grad_norm": 0.7810736772266839,
      "learning_rate": 5e-06,
      "loss": 0.5527,
      "step": 350
    },
    {
      "epoch": 1.111969111969112,
      "grad_norm": 0.5222987434687074,
      "learning_rate": 5e-06,
      "loss": 0.5672,
      "step": 360
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.6086528109996991,
      "learning_rate": 5e-06,
      "loss": 0.559,
      "step": 370
    },
    {
      "epoch": 1.1737451737451738,
      "grad_norm": 0.5603581613184462,
      "learning_rate": 5e-06,
      "loss": 0.5557,
      "step": 380
    },
    {
      "epoch": 1.2046332046332047,
      "grad_norm": 0.5733679682136087,
      "learning_rate": 5e-06,
      "loss": 0.5523,
      "step": 390
    },
    {
      "epoch": 1.2355212355212355,
      "grad_norm": 0.5892263714928079,
      "learning_rate": 5e-06,
      "loss": 0.5581,
      "step": 400
    },
    {
      "epoch": 1.2664092664092665,
      "grad_norm": 0.5383912423961117,
      "learning_rate": 5e-06,
      "loss": 0.5587,
      "step": 410
    },
    {
      "epoch": 1.2972972972972974,
      "grad_norm": 0.5266690487444351,
      "learning_rate": 5e-06,
      "loss": 0.5538,
      "step": 420
    },
    {
      "epoch": 1.3281853281853282,
      "grad_norm": 0.6329004292969694,
      "learning_rate": 5e-06,
      "loss": 0.5621,
      "step": 430
    },
    {
      "epoch": 1.359073359073359,
      "grad_norm": 0.7361457930766753,
      "learning_rate": 5e-06,
      "loss": 0.5507,
      "step": 440
    },
    {
      "epoch": 1.3899613899613898,
      "grad_norm": 0.5514849953088692,
      "learning_rate": 5e-06,
      "loss": 0.5458,
      "step": 450
    },
    {
      "epoch": 1.420849420849421,
      "grad_norm": 0.6157873717956057,
      "learning_rate": 5e-06,
      "loss": 0.5576,
      "step": 460
    },
    {
      "epoch": 1.4517374517374517,
      "grad_norm": 0.6133558219166387,
      "learning_rate": 5e-06,
      "loss": 0.5549,
      "step": 470
    },
    {
      "epoch": 1.4826254826254825,
      "grad_norm": 0.8739126560777261,
      "learning_rate": 5e-06,
      "loss": 0.5544,
      "step": 480
    },
    {
      "epoch": 1.5135135135135136,
      "grad_norm": 0.6361501246308614,
      "learning_rate": 5e-06,
      "loss": 0.5593,
      "step": 490
    },
    {
      "epoch": 1.5444015444015444,
      "grad_norm": 0.5962865704485671,
      "learning_rate": 5e-06,
      "loss": 0.5536,
      "step": 500
    },
    {
      "epoch": 1.5752895752895753,
      "grad_norm": 0.5717477466618819,
      "learning_rate": 5e-06,
      "loss": 0.5613,
      "step": 510
    },
    {
      "epoch": 1.606177606177606,
      "grad_norm": 0.66303577521287,
      "learning_rate": 5e-06,
      "loss": 0.5515,
      "step": 520
    },
    {
      "epoch": 1.637065637065637,
      "grad_norm": 0.567568871578932,
      "learning_rate": 5e-06,
      "loss": 0.552,
      "step": 530
    },
    {
      "epoch": 1.667953667953668,
      "grad_norm": 0.6260947334671597,
      "learning_rate": 5e-06,
      "loss": 0.5523,
      "step": 540
    },
    {
      "epoch": 1.698841698841699,
      "grad_norm": 0.5443908156986677,
      "learning_rate": 5e-06,
      "loss": 0.5597,
      "step": 550
    },
    {
      "epoch": 1.7297297297297298,
      "grad_norm": 0.5875127617844319,
      "learning_rate": 5e-06,
      "loss": 0.5577,
      "step": 560
    },
    {
      "epoch": 1.7606177606177607,
      "grad_norm": 0.5839768971343271,
      "learning_rate": 5e-06,
      "loss": 0.5556,
      "step": 570
    },
    {
      "epoch": 1.7915057915057915,
      "grad_norm": 0.5415996097268374,
      "learning_rate": 5e-06,
      "loss": 0.5541,
      "step": 580
    },
    {
      "epoch": 1.8223938223938223,
      "grad_norm": 0.5047758992113026,
      "learning_rate": 5e-06,
      "loss": 0.5498,
      "step": 590
    },
    {
      "epoch": 1.8532818532818531,
      "grad_norm": 0.6095158836516334,
      "learning_rate": 5e-06,
      "loss": 0.5577,
      "step": 600
    },
    {
      "epoch": 1.8841698841698842,
      "grad_norm": 0.6179499758866378,
      "learning_rate": 5e-06,
      "loss": 0.5561,
      "step": 610
    },
    {
      "epoch": 1.915057915057915,
      "grad_norm": 0.6168326581011625,
      "learning_rate": 5e-06,
      "loss": 0.5623,
      "step": 620
    },
    {
      "epoch": 1.945945945945946,
      "grad_norm": 0.5503294354981652,
      "learning_rate": 5e-06,
      "loss": 0.5513,
      "step": 630
    },
    {
      "epoch": 1.9768339768339769,
      "grad_norm": 0.5514628691685661,
      "learning_rate": 5e-06,
      "loss": 0.5543,
      "step": 640
    },
    {
      "epoch": 1.9984555984555985,
      "eval_loss": 0.6036180853843689,
      "eval_runtime": 175.7287,
      "eval_samples_per_second": 49.622,
      "eval_steps_per_second": 0.393,
      "step": 647
    },
    {
      "epoch": 2.0077220077220077,
      "grad_norm": 1.1966714312564959,
      "learning_rate": 5e-06,
      "loss": 0.5432,
      "step": 650
    },
    {
      "epoch": 2.0386100386100385,
      "grad_norm": 0.6689403789809841,
      "learning_rate": 5e-06,
      "loss": 0.5094,
      "step": 660
    },
    {
      "epoch": 2.0694980694980694,
      "grad_norm": 0.61180285844113,
      "learning_rate": 5e-06,
      "loss": 0.5053,
      "step": 670
    },
    {
      "epoch": 2.1003861003861,
      "grad_norm": 0.6066089470745547,
      "learning_rate": 5e-06,
      "loss": 0.5038,
      "step": 680
    },
    {
      "epoch": 2.1312741312741315,
      "grad_norm": 0.7339359470891843,
      "learning_rate": 5e-06,
      "loss": 0.4866,
      "step": 690
    },
    {
      "epoch": 2.1621621621621623,
      "grad_norm": 0.6280327917691826,
      "learning_rate": 5e-06,
      "loss": 0.4925,
      "step": 700
    },
    {
      "epoch": 2.193050193050193,
      "grad_norm": 0.6228681526915294,
      "learning_rate": 5e-06,
      "loss": 0.4964,
      "step": 710
    },
    {
      "epoch": 2.223938223938224,
      "grad_norm": 0.7409827598226171,
      "learning_rate": 5e-06,
      "loss": 0.4991,
      "step": 720
    },
    {
      "epoch": 2.2548262548262548,
      "grad_norm": 0.5789864024622833,
      "learning_rate": 5e-06,
      "loss": 0.5047,
      "step": 730
    },
    {
      "epoch": 2.2857142857142856,
      "grad_norm": 0.6184054511523953,
      "learning_rate": 5e-06,
      "loss": 0.4983,
      "step": 740
    },
    {
      "epoch": 2.3166023166023164,
      "grad_norm": 0.7218033338604144,
      "learning_rate": 5e-06,
      "loss": 0.5006,
      "step": 750
    },
    {
      "epoch": 2.3474903474903477,
      "grad_norm": 0.5488090893995611,
      "learning_rate": 5e-06,
      "loss": 0.4982,
      "step": 760
    },
    {
      "epoch": 2.3783783783783785,
      "grad_norm": 0.7369531886408642,
      "learning_rate": 5e-06,
      "loss": 0.4967,
      "step": 770
    },
    {
      "epoch": 2.4092664092664093,
      "grad_norm": 0.5519469047632208,
      "learning_rate": 5e-06,
      "loss": 0.4965,
      "step": 780
    },
    {
      "epoch": 2.44015444015444,
      "grad_norm": 0.6584775748338472,
      "learning_rate": 5e-06,
      "loss": 0.501,
      "step": 790
    },
    {
      "epoch": 2.471042471042471,
      "grad_norm": 0.6589409942831427,
      "learning_rate": 5e-06,
      "loss": 0.5006,
      "step": 800
    },
    {
      "epoch": 2.501930501930502,
      "grad_norm": 0.6035656966090169,
      "learning_rate": 5e-06,
      "loss": 0.504,
      "step": 810
    },
    {
      "epoch": 2.532818532818533,
      "grad_norm": 0.5939046793612662,
      "learning_rate": 5e-06,
      "loss": 0.4962,
      "step": 820
    },
    {
      "epoch": 2.563706563706564,
      "grad_norm": 0.6332349300931522,
      "learning_rate": 5e-06,
      "loss": 0.5184,
      "step": 830
    },
    {
      "epoch": 2.5945945945945947,
      "grad_norm": 0.5980197287456133,
      "learning_rate": 5e-06,
      "loss": 0.4989,
      "step": 840
    },
    {
      "epoch": 2.6254826254826256,
      "grad_norm": 0.5239673559847541,
      "learning_rate": 5e-06,
      "loss": 0.5013,
      "step": 850
    },
    {
      "epoch": 2.6563706563706564,
      "grad_norm": 0.6161550811061723,
      "learning_rate": 5e-06,
      "loss": 0.4979,
      "step": 860
    },
    {
      "epoch": 2.687258687258687,
      "grad_norm": 0.5658973017035395,
      "learning_rate": 5e-06,
      "loss": 0.5069,
      "step": 870
    },
    {
      "epoch": 2.718146718146718,
      "grad_norm": 0.5709137558646814,
      "learning_rate": 5e-06,
      "loss": 0.5065,
      "step": 880
    },
    {
      "epoch": 2.749034749034749,
      "grad_norm": 0.6389113703934911,
      "learning_rate": 5e-06,
      "loss": 0.5037,
      "step": 890
    },
    {
      "epoch": 2.7799227799227797,
      "grad_norm": 0.6445028760804165,
      "learning_rate": 5e-06,
      "loss": 0.5064,
      "step": 900
    },
    {
      "epoch": 2.810810810810811,
      "grad_norm": 0.6013274832800054,
      "learning_rate": 5e-06,
      "loss": 0.5032,
      "step": 910
    },
    {
      "epoch": 2.841698841698842,
      "grad_norm": 0.5664869441381479,
      "learning_rate": 5e-06,
      "loss": 0.5068,
      "step": 920
    },
    {
      "epoch": 2.8725868725868726,
      "grad_norm": 0.6735218011480151,
      "learning_rate": 5e-06,
      "loss": 0.5076,
      "step": 930
    },
    {
      "epoch": 2.9034749034749034,
      "grad_norm": 1.6394857976813213,
      "learning_rate": 5e-06,
      "loss": 0.5032,
      "step": 940
    },
    {
      "epoch": 2.9343629343629343,
      "grad_norm": 0.5576873331550094,
      "learning_rate": 5e-06,
      "loss": 0.504,
      "step": 950
    },
    {
      "epoch": 2.965250965250965,
      "grad_norm": 0.5637202117054834,
      "learning_rate": 5e-06,
      "loss": 0.5012,
      "step": 960
    },
    {
      "epoch": 2.993050193050193,
      "eval_loss": 0.611685574054718,
      "eval_runtime": 175.5767,
      "eval_samples_per_second": 49.665,
      "eval_steps_per_second": 0.393,
      "step": 969
    },
    {
      "epoch": 2.993050193050193,
      "step": 969,
      "total_flos": 1622692331520000.0,
      "train_loss": 0.5660572312810719,
      "train_runtime": 29124.935,
      "train_samples_per_second": 17.065,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 10,
  "max_steps": 969,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1622692331520000.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}