{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.993050193050193, "eval_steps": 500, "global_step": 969, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03088803088803089, "grad_norm": 109.2129922948738, "learning_rate": 5e-06, "loss": 0.8218, "step": 10 }, { "epoch": 0.06177606177606178, "grad_norm": 1.5509789288186628, "learning_rate": 5e-06, "loss": 0.7505, "step": 20 }, { "epoch": 0.09266409266409266, "grad_norm": 1.0237535334293637, "learning_rate": 5e-06, "loss": 0.7022, "step": 30 }, { "epoch": 0.12355212355212356, "grad_norm": 0.9555198347715455, "learning_rate": 5e-06, "loss": 0.6862, "step": 40 }, { "epoch": 0.15444015444015444, "grad_norm": 0.9962768534382307, "learning_rate": 5e-06, "loss": 0.6721, "step": 50 }, { "epoch": 0.18532818532818532, "grad_norm": 1.7530907194148484, "learning_rate": 5e-06, "loss": 0.6509, "step": 60 }, { "epoch": 0.21621621621621623, "grad_norm": 0.7959383217983834, "learning_rate": 5e-06, "loss": 0.6472, "step": 70 }, { "epoch": 0.2471042471042471, "grad_norm": 0.7944327039168986, "learning_rate": 5e-06, "loss": 0.65, "step": 80 }, { "epoch": 0.277992277992278, "grad_norm": 0.6922868471685667, "learning_rate": 5e-06, "loss": 0.639, "step": 90 }, { "epoch": 0.3088803088803089, "grad_norm": 0.6117044147831933, "learning_rate": 5e-06, "loss": 0.6398, "step": 100 }, { "epoch": 0.33976833976833976, "grad_norm": 0.6349747322397297, "learning_rate": 5e-06, "loss": 0.6335, "step": 110 }, { "epoch": 0.37065637065637064, "grad_norm": 0.6236670890266053, "learning_rate": 5e-06, "loss": 0.6337, "step": 120 }, { "epoch": 0.4015444015444015, "grad_norm": 0.6724570938621872, "learning_rate": 5e-06, "loss": 0.6255, "step": 130 }, { "epoch": 0.43243243243243246, "grad_norm": 0.8157933757633341, "learning_rate": 5e-06, "loss": 0.6283, "step": 140 }, { "epoch": 0.46332046332046334, "grad_norm": 0.5839495663310666, "learning_rate": 5e-06, "loss": 0.6214, "step": 150 }, { "epoch": 0.4942084942084942, "grad_norm": 0.5048909710838132, "learning_rate": 5e-06, "loss": 0.6243, "step": 160 }, { "epoch": 0.525096525096525, "grad_norm": 0.8095059876811945, "learning_rate": 5e-06, "loss": 0.6251, "step": 170 }, { "epoch": 0.555984555984556, "grad_norm": 0.5136815801780633, "learning_rate": 5e-06, "loss": 0.6145, "step": 180 }, { "epoch": 0.5868725868725869, "grad_norm": 0.6364035663779027, "learning_rate": 5e-06, "loss": 0.6298, "step": 190 }, { "epoch": 0.6177606177606177, "grad_norm": 0.6844817393850995, "learning_rate": 5e-06, "loss": 0.6197, "step": 200 }, { "epoch": 0.6486486486486487, "grad_norm": 0.7483950060039181, "learning_rate": 5e-06, "loss": 0.6269, "step": 210 }, { "epoch": 0.6795366795366795, "grad_norm": 0.7350511698635498, "learning_rate": 5e-06, "loss": 0.6175, "step": 220 }, { "epoch": 0.7104247104247104, "grad_norm": 0.6246522556489467, "learning_rate": 5e-06, "loss": 0.6172, "step": 230 }, { "epoch": 0.7413127413127413, "grad_norm": 0.5845884993818068, "learning_rate": 5e-06, "loss": 0.6119, "step": 240 }, { "epoch": 0.7722007722007722, "grad_norm": 0.6246906999878916, "learning_rate": 5e-06, "loss": 0.6191, "step": 250 }, { "epoch": 0.803088803088803, "grad_norm": 0.5725768617907299, "learning_rate": 5e-06, "loss": 0.6111, "step": 260 }, { "epoch": 0.833976833976834, "grad_norm": 0.7575125169415168, "learning_rate": 5e-06, "loss": 0.6078, "step": 270 }, { "epoch": 0.8648648648648649, "grad_norm": 0.6292537869493213, "learning_rate": 5e-06, "loss": 0.6119, "step": 280 }, { "epoch": 0.8957528957528957, "grad_norm": 0.6613666650568315, "learning_rate": 5e-06, "loss": 0.6219, "step": 290 }, { "epoch": 0.9266409266409267, "grad_norm": 0.482392770012809, "learning_rate": 5e-06, "loss": 0.6032, "step": 300 }, { "epoch": 0.9575289575289575, "grad_norm": 0.5691595473530817, "learning_rate": 5e-06, "loss": 0.6137, "step": 310 }, { "epoch": 0.9884169884169884, "grad_norm": 0.5605696956503327, "learning_rate": 5e-06, "loss": 0.6059, "step": 320 }, { "epoch": 0.9976833976833976, "eval_loss": 0.611504316329956, "eval_runtime": 174.6046, "eval_samples_per_second": 49.941, "eval_steps_per_second": 0.395, "step": 323 }, { "epoch": 1.0193050193050193, "grad_norm": 0.7653552774499555, "learning_rate": 5e-06, "loss": 0.577, "step": 330 }, { "epoch": 1.05019305019305, "grad_norm": 0.6717959962039358, "learning_rate": 5e-06, "loss": 0.5516, "step": 340 }, { "epoch": 1.0810810810810811, "grad_norm": 0.7810736772266839, "learning_rate": 5e-06, "loss": 0.5527, "step": 350 }, { "epoch": 1.111969111969112, "grad_norm": 0.5222987434687074, "learning_rate": 5e-06, "loss": 0.5672, "step": 360 }, { "epoch": 1.1428571428571428, "grad_norm": 0.6086528109996991, "learning_rate": 5e-06, "loss": 0.559, "step": 370 }, { "epoch": 1.1737451737451738, "grad_norm": 0.5603581613184462, "learning_rate": 5e-06, "loss": 0.5557, "step": 380 }, { "epoch": 1.2046332046332047, "grad_norm": 0.5733679682136087, "learning_rate": 5e-06, "loss": 0.5523, "step": 390 }, { "epoch": 1.2355212355212355, "grad_norm": 0.5892263714928079, "learning_rate": 5e-06, "loss": 0.5581, "step": 400 }, { "epoch": 1.2664092664092665, "grad_norm": 0.5383912423961117, "learning_rate": 5e-06, "loss": 0.5587, "step": 410 }, { "epoch": 1.2972972972972974, "grad_norm": 0.5266690487444351, "learning_rate": 5e-06, "loss": 0.5538, "step": 420 }, { "epoch": 1.3281853281853282, "grad_norm": 0.6329004292969694, "learning_rate": 5e-06, "loss": 0.5621, "step": 430 }, { "epoch": 1.359073359073359, "grad_norm": 0.7361457930766753, "learning_rate": 5e-06, "loss": 0.5507, "step": 440 }, { "epoch": 1.3899613899613898, "grad_norm": 0.5514849953088692, "learning_rate": 5e-06, "loss": 0.5458, "step": 450 }, { "epoch": 1.420849420849421, "grad_norm": 0.6157873717956057, "learning_rate": 5e-06, "loss": 0.5576, "step": 460 }, { "epoch": 1.4517374517374517, "grad_norm": 0.6133558219166387, "learning_rate": 5e-06, "loss": 0.5549, "step": 470 }, { "epoch": 1.4826254826254825, "grad_norm": 0.8739126560777261, "learning_rate": 5e-06, "loss": 0.5544, "step": 480 }, { "epoch": 1.5135135135135136, "grad_norm": 0.6361501246308614, "learning_rate": 5e-06, "loss": 0.5593, "step": 490 }, { "epoch": 1.5444015444015444, "grad_norm": 0.5962865704485671, "learning_rate": 5e-06, "loss": 0.5536, "step": 500 }, { "epoch": 1.5752895752895753, "grad_norm": 0.5717477466618819, "learning_rate": 5e-06, "loss": 0.5613, "step": 510 }, { "epoch": 1.606177606177606, "grad_norm": 0.66303577521287, "learning_rate": 5e-06, "loss": 0.5515, "step": 520 }, { "epoch": 1.637065637065637, "grad_norm": 0.567568871578932, "learning_rate": 5e-06, "loss": 0.552, "step": 530 }, { "epoch": 1.667953667953668, "grad_norm": 0.6260947334671597, "learning_rate": 5e-06, "loss": 0.5523, "step": 540 }, { "epoch": 1.698841698841699, "grad_norm": 0.5443908156986677, "learning_rate": 5e-06, "loss": 0.5597, "step": 550 }, { "epoch": 1.7297297297297298, "grad_norm": 0.5875127617844319, "learning_rate": 5e-06, "loss": 0.5577, "step": 560 }, { "epoch": 1.7606177606177607, "grad_norm": 0.5839768971343271, "learning_rate": 5e-06, "loss": 0.5556, "step": 570 }, { "epoch": 1.7915057915057915, "grad_norm": 0.5415996097268374, "learning_rate": 5e-06, "loss": 0.5541, "step": 580 }, { "epoch": 1.8223938223938223, "grad_norm": 0.5047758992113026, "learning_rate": 5e-06, "loss": 0.5498, "step": 590 }, { "epoch": 1.8532818532818531, "grad_norm": 0.6095158836516334, "learning_rate": 5e-06, "loss": 0.5577, "step": 600 }, { "epoch": 1.8841698841698842, "grad_norm": 0.6179499758866378, "learning_rate": 5e-06, "loss": 0.5561, "step": 610 }, { "epoch": 1.915057915057915, "grad_norm": 0.6168326581011625, "learning_rate": 5e-06, "loss": 0.5623, "step": 620 }, { "epoch": 1.945945945945946, "grad_norm": 0.5503294354981652, "learning_rate": 5e-06, "loss": 0.5513, "step": 630 }, { "epoch": 1.9768339768339769, "grad_norm": 0.5514628691685661, "learning_rate": 5e-06, "loss": 0.5543, "step": 640 }, { "epoch": 1.9984555984555985, "eval_loss": 0.6036180853843689, "eval_runtime": 175.7287, "eval_samples_per_second": 49.622, "eval_steps_per_second": 0.393, "step": 647 }, { "epoch": 2.0077220077220077, "grad_norm": 1.1966714312564959, "learning_rate": 5e-06, "loss": 0.5432, "step": 650 }, { "epoch": 2.0386100386100385, "grad_norm": 0.6689403789809841, "learning_rate": 5e-06, "loss": 0.5094, "step": 660 }, { "epoch": 2.0694980694980694, "grad_norm": 0.61180285844113, "learning_rate": 5e-06, "loss": 0.5053, "step": 670 }, { "epoch": 2.1003861003861, "grad_norm": 0.6066089470745547, "learning_rate": 5e-06, "loss": 0.5038, "step": 680 }, { "epoch": 2.1312741312741315, "grad_norm": 0.7339359470891843, "learning_rate": 5e-06, "loss": 0.4866, "step": 690 }, { "epoch": 2.1621621621621623, "grad_norm": 0.6280327917691826, "learning_rate": 5e-06, "loss": 0.4925, "step": 700 }, { "epoch": 2.193050193050193, "grad_norm": 0.6228681526915294, "learning_rate": 5e-06, "loss": 0.4964, "step": 710 }, { "epoch": 2.223938223938224, "grad_norm": 0.7409827598226171, "learning_rate": 5e-06, "loss": 0.4991, "step": 720 }, { "epoch": 2.2548262548262548, "grad_norm": 0.5789864024622833, "learning_rate": 5e-06, "loss": 0.5047, "step": 730 }, { "epoch": 2.2857142857142856, "grad_norm": 0.6184054511523953, "learning_rate": 5e-06, "loss": 0.4983, "step": 740 }, { "epoch": 2.3166023166023164, "grad_norm": 0.7218033338604144, "learning_rate": 5e-06, "loss": 0.5006, "step": 750 }, { "epoch": 2.3474903474903477, "grad_norm": 0.5488090893995611, "learning_rate": 5e-06, "loss": 0.4982, "step": 760 }, { "epoch": 2.3783783783783785, "grad_norm": 0.7369531886408642, "learning_rate": 5e-06, "loss": 0.4967, "step": 770 }, { "epoch": 2.4092664092664093, "grad_norm": 0.5519469047632208, "learning_rate": 5e-06, "loss": 0.4965, "step": 780 }, { "epoch": 2.44015444015444, "grad_norm": 0.6584775748338472, "learning_rate": 5e-06, "loss": 0.501, "step": 790 }, { "epoch": 2.471042471042471, "grad_norm": 0.6589409942831427, "learning_rate": 5e-06, "loss": 0.5006, "step": 800 }, { "epoch": 2.501930501930502, "grad_norm": 0.6035656966090169, "learning_rate": 5e-06, "loss": 0.504, "step": 810 }, { "epoch": 2.532818532818533, "grad_norm": 0.5939046793612662, "learning_rate": 5e-06, "loss": 0.4962, "step": 820 }, { "epoch": 2.563706563706564, "grad_norm": 0.6332349300931522, "learning_rate": 5e-06, "loss": 0.5184, "step": 830 }, { "epoch": 2.5945945945945947, "grad_norm": 0.5980197287456133, "learning_rate": 5e-06, "loss": 0.4989, "step": 840 }, { "epoch": 2.6254826254826256, "grad_norm": 0.5239673559847541, "learning_rate": 5e-06, "loss": 0.5013, "step": 850 }, { "epoch": 2.6563706563706564, "grad_norm": 0.6161550811061723, "learning_rate": 5e-06, "loss": 0.4979, "step": 860 }, { "epoch": 2.687258687258687, "grad_norm": 0.5658973017035395, "learning_rate": 5e-06, "loss": 0.5069, "step": 870 }, { "epoch": 2.718146718146718, "grad_norm": 0.5709137558646814, "learning_rate": 5e-06, "loss": 0.5065, "step": 880 }, { "epoch": 2.749034749034749, "grad_norm": 0.6389113703934911, "learning_rate": 5e-06, "loss": 0.5037, "step": 890 }, { "epoch": 2.7799227799227797, "grad_norm": 0.6445028760804165, "learning_rate": 5e-06, "loss": 0.5064, "step": 900 }, { "epoch": 2.810810810810811, "grad_norm": 0.6013274832800054, "learning_rate": 5e-06, "loss": 0.5032, "step": 910 }, { "epoch": 2.841698841698842, "grad_norm": 0.5664869441381479, "learning_rate": 5e-06, "loss": 0.5068, "step": 920 }, { "epoch": 2.8725868725868726, "grad_norm": 0.6735218011480151, "learning_rate": 5e-06, "loss": 0.5076, "step": 930 }, { "epoch": 2.9034749034749034, "grad_norm": 1.6394857976813213, "learning_rate": 5e-06, "loss": 0.5032, "step": 940 }, { "epoch": 2.9343629343629343, "grad_norm": 0.5576873331550094, "learning_rate": 5e-06, "loss": 0.504, "step": 950 }, { "epoch": 2.965250965250965, "grad_norm": 0.5637202117054834, "learning_rate": 5e-06, "loss": 0.5012, "step": 960 }, { "epoch": 2.993050193050193, "eval_loss": 0.611685574054718, "eval_runtime": 175.5767, "eval_samples_per_second": 49.665, "eval_steps_per_second": 0.393, "step": 969 }, { "epoch": 2.993050193050193, "step": 969, "total_flos": 1622692331520000.0, "train_loss": 0.5660572312810719, "train_runtime": 29124.935, "train_samples_per_second": 17.065, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 969, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1622692331520000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }