{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9945750452079567, "eval_steps": 500, "global_step": 1242, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024110910186859555, "grad_norm": 6.533456606849755, "learning_rate": 5e-06, "loss": 0.8842, "step": 10 }, { "epoch": 0.04822182037371911, "grad_norm": 1.7431134609583179, "learning_rate": 5e-06, "loss": 0.7716, "step": 20 }, { "epoch": 0.07233273056057866, "grad_norm": 1.3259257434449123, "learning_rate": 5e-06, "loss": 0.7528, "step": 30 }, { "epoch": 0.09644364074743822, "grad_norm": 0.7933729448787181, "learning_rate": 5e-06, "loss": 0.7272, "step": 40 }, { "epoch": 0.12055455093429777, "grad_norm": 0.9489596846030721, "learning_rate": 5e-06, "loss": 0.7249, "step": 50 }, { "epoch": 0.14466546112115733, "grad_norm": 0.9021467855462361, "learning_rate": 5e-06, "loss": 0.7048, "step": 60 }, { "epoch": 0.16877637130801687, "grad_norm": 0.9161730770097936, "learning_rate": 5e-06, "loss": 0.7005, "step": 70 }, { "epoch": 0.19288728149487644, "grad_norm": 0.7938637800361754, "learning_rate": 5e-06, "loss": 0.6932, "step": 80 }, { "epoch": 0.21699819168173598, "grad_norm": 0.5753057373815283, "learning_rate": 5e-06, "loss": 0.6974, "step": 90 }, { "epoch": 0.24110910186859555, "grad_norm": 0.9667031835337068, "learning_rate": 5e-06, "loss": 0.6757, "step": 100 }, { "epoch": 0.2652200120554551, "grad_norm": 0.5028322572120897, "learning_rate": 5e-06, "loss": 0.6834, "step": 110 }, { "epoch": 0.28933092224231466, "grad_norm": 0.6797693061745307, "learning_rate": 5e-06, "loss": 0.674, "step": 120 }, { "epoch": 0.3134418324291742, "grad_norm": 1.1680360190006298, "learning_rate": 5e-06, "loss": 0.6722, "step": 130 }, { "epoch": 0.33755274261603374, "grad_norm": 0.544648561957048, "learning_rate": 5e-06, "loss": 0.677, "step": 140 }, { "epoch": 0.3616636528028933, "grad_norm": 0.7257586557706087, "learning_rate": 5e-06, "loss": 0.6747, "step": 150 }, { "epoch": 0.3857745629897529, "grad_norm": 0.4119617826643094, "learning_rate": 5e-06, "loss": 0.6693, "step": 160 }, { "epoch": 0.4098854731766124, "grad_norm": 0.5605900141967505, "learning_rate": 5e-06, "loss": 0.6688, "step": 170 }, { "epoch": 0.43399638336347196, "grad_norm": 0.6385688604944322, "learning_rate": 5e-06, "loss": 0.6643, "step": 180 }, { "epoch": 0.45810729355033153, "grad_norm": 1.0125290764918353, "learning_rate": 5e-06, "loss": 0.6641, "step": 190 }, { "epoch": 0.4822182037371911, "grad_norm": 0.6186627046406172, "learning_rate": 5e-06, "loss": 0.658, "step": 200 }, { "epoch": 0.5063291139240507, "grad_norm": 0.8414739576765752, "learning_rate": 5e-06, "loss": 0.6671, "step": 210 }, { "epoch": 0.5304400241109102, "grad_norm": 0.6025383396507406, "learning_rate": 5e-06, "loss": 0.6651, "step": 220 }, { "epoch": 0.5545509342977697, "grad_norm": 0.5410823595468066, "learning_rate": 5e-06, "loss": 0.6554, "step": 230 }, { "epoch": 0.5786618444846293, "grad_norm": 0.6181513401688427, "learning_rate": 5e-06, "loss": 0.6641, "step": 240 }, { "epoch": 0.6027727546714888, "grad_norm": 0.4896208841371711, "learning_rate": 5e-06, "loss": 0.657, "step": 250 }, { "epoch": 0.6268836648583485, "grad_norm": 0.544546111477725, "learning_rate": 5e-06, "loss": 0.6638, "step": 260 }, { "epoch": 0.650994575045208, "grad_norm": 0.5356265326461168, "learning_rate": 5e-06, "loss": 0.6602, "step": 270 }, { "epoch": 0.6751054852320675, "grad_norm": 0.5475932069244179, "learning_rate": 5e-06, "loss": 0.6538, "step": 280 }, { "epoch": 0.6992163954189271, "grad_norm": 0.8345098978281534, "learning_rate": 5e-06, "loss": 0.6523, "step": 290 }, { "epoch": 0.7233273056057866, "grad_norm": 0.8160477568039888, "learning_rate": 5e-06, "loss": 0.6604, "step": 300 }, { "epoch": 0.7474382157926461, "grad_norm": 0.5563594159462366, "learning_rate": 5e-06, "loss": 0.6581, "step": 310 }, { "epoch": 0.7715491259795058, "grad_norm": 0.6104670026137493, "learning_rate": 5e-06, "loss": 0.6543, "step": 320 }, { "epoch": 0.7956600361663653, "grad_norm": 0.5818143425968119, "learning_rate": 5e-06, "loss": 0.6508, "step": 330 }, { "epoch": 0.8197709463532248, "grad_norm": 0.39317653113678785, "learning_rate": 5e-06, "loss": 0.6459, "step": 340 }, { "epoch": 0.8438818565400844, "grad_norm": 0.4869964807571895, "learning_rate": 5e-06, "loss": 0.6476, "step": 350 }, { "epoch": 0.8679927667269439, "grad_norm": 0.9839633535279524, "learning_rate": 5e-06, "loss": 0.6517, "step": 360 }, { "epoch": 0.8921036769138035, "grad_norm": 0.4947132075136725, "learning_rate": 5e-06, "loss": 0.6554, "step": 370 }, { "epoch": 0.9162145871006631, "grad_norm": 0.42196728270115014, "learning_rate": 5e-06, "loss": 0.647, "step": 380 }, { "epoch": 0.9403254972875226, "grad_norm": 0.7036293961206416, "learning_rate": 5e-06, "loss": 0.6437, "step": 390 }, { "epoch": 0.9644364074743822, "grad_norm": 0.4303638291795801, "learning_rate": 5e-06, "loss": 0.647, "step": 400 }, { "epoch": 0.9885473176612417, "grad_norm": 0.4993150805880552, "learning_rate": 5e-06, "loss": 0.6408, "step": 410 }, { "epoch": 0.9981916817359855, "eval_loss": 0.6481794714927673, "eval_runtime": 221.8793, "eval_samples_per_second": 50.37, "eval_steps_per_second": 0.397, "step": 414 }, { "epoch": 1.0126582278481013, "grad_norm": 0.5861902646980864, "learning_rate": 5e-06, "loss": 0.6219, "step": 420 }, { "epoch": 1.0367691380349608, "grad_norm": 0.4608104690581376, "learning_rate": 5e-06, "loss": 0.6109, "step": 430 }, { "epoch": 1.0608800482218204, "grad_norm": 0.7019806195266277, "learning_rate": 5e-06, "loss": 0.5999, "step": 440 }, { "epoch": 1.0849909584086799, "grad_norm": 0.4666118598227287, "learning_rate": 5e-06, "loss": 0.6071, "step": 450 }, { "epoch": 1.1091018685955394, "grad_norm": 0.49273088471001014, "learning_rate": 5e-06, "loss": 0.6079, "step": 460 }, { "epoch": 1.1332127787823991, "grad_norm": 0.5608412041594104, "learning_rate": 5e-06, "loss": 0.6093, "step": 470 }, { "epoch": 1.1573236889692586, "grad_norm": 0.5133766270512516, "learning_rate": 5e-06, "loss": 0.6023, "step": 480 }, { "epoch": 1.1814345991561181, "grad_norm": 0.4639503656965253, "learning_rate": 5e-06, "loss": 0.6067, "step": 490 }, { "epoch": 1.2055455093429777, "grad_norm": 0.4941484591532595, "learning_rate": 5e-06, "loss": 0.6034, "step": 500 }, { "epoch": 1.2296564195298372, "grad_norm": 0.532046568060987, "learning_rate": 5e-06, "loss": 0.6021, "step": 510 }, { "epoch": 1.253767329716697, "grad_norm": 0.6313451414543506, "learning_rate": 5e-06, "loss": 0.6072, "step": 520 }, { "epoch": 1.2778782399035564, "grad_norm": 0.48840150221258893, "learning_rate": 5e-06, "loss": 0.6008, "step": 530 }, { "epoch": 1.301989150090416, "grad_norm": 0.4346073819877919, "learning_rate": 5e-06, "loss": 0.6084, "step": 540 }, { "epoch": 1.3261000602772754, "grad_norm": 0.5696969325867375, "learning_rate": 5e-06, "loss": 0.6073, "step": 550 }, { "epoch": 1.350210970464135, "grad_norm": 0.6029521082479712, "learning_rate": 5e-06, "loss": 0.6045, "step": 560 }, { "epoch": 1.3743218806509945, "grad_norm": 0.5359000000861764, "learning_rate": 5e-06, "loss": 0.601, "step": 570 }, { "epoch": 1.3984327908378542, "grad_norm": 0.4280776424781654, "learning_rate": 5e-06, "loss": 0.6068, "step": 580 }, { "epoch": 1.4225437010247137, "grad_norm": 0.42975173635641, "learning_rate": 5e-06, "loss": 0.6058, "step": 590 }, { "epoch": 1.4466546112115732, "grad_norm": 0.4148935722421534, "learning_rate": 5e-06, "loss": 0.6084, "step": 600 }, { "epoch": 1.4707655213984328, "grad_norm": 0.4346895040838288, "learning_rate": 5e-06, "loss": 0.6083, "step": 610 }, { "epoch": 1.4948764315852923, "grad_norm": 0.456872099031643, "learning_rate": 5e-06, "loss": 0.6101, "step": 620 }, { "epoch": 1.518987341772152, "grad_norm": 0.518636393965265, "learning_rate": 5e-06, "loss": 0.6107, "step": 630 }, { "epoch": 1.5430982519590115, "grad_norm": 0.4976317739138397, "learning_rate": 5e-06, "loss": 0.5995, "step": 640 }, { "epoch": 1.567209162145871, "grad_norm": 0.5121056663367101, "learning_rate": 5e-06, "loss": 0.6147, "step": 650 }, { "epoch": 1.5913200723327305, "grad_norm": 0.49181051844188867, "learning_rate": 5e-06, "loss": 0.6046, "step": 660 }, { "epoch": 1.61543098251959, "grad_norm": 0.4913489094366748, "learning_rate": 5e-06, "loss": 0.605, "step": 670 }, { "epoch": 1.6395418927064496, "grad_norm": 0.4360413141924259, "learning_rate": 5e-06, "loss": 0.603, "step": 680 }, { "epoch": 1.663652802893309, "grad_norm": 0.5553873036504335, "learning_rate": 5e-06, "loss": 0.6037, "step": 690 }, { "epoch": 1.6877637130801688, "grad_norm": 0.439159626571011, "learning_rate": 5e-06, "loss": 0.6037, "step": 700 }, { "epoch": 1.7118746232670283, "grad_norm": 0.5009323338564864, "learning_rate": 5e-06, "loss": 0.6046, "step": 710 }, { "epoch": 1.7359855334538878, "grad_norm": 0.49820787215486934, "learning_rate": 5e-06, "loss": 0.6057, "step": 720 }, { "epoch": 1.7600964436407476, "grad_norm": 0.553637472752945, "learning_rate": 5e-06, "loss": 0.6066, "step": 730 }, { "epoch": 1.784207353827607, "grad_norm": 0.44541140483577896, "learning_rate": 5e-06, "loss": 0.6, "step": 740 }, { "epoch": 1.8083182640144666, "grad_norm": 0.5310706794248644, "learning_rate": 5e-06, "loss": 0.6098, "step": 750 }, { "epoch": 1.8324291742013261, "grad_norm": 0.6630764624549126, "learning_rate": 5e-06, "loss": 0.6054, "step": 760 }, { "epoch": 1.8565400843881856, "grad_norm": 0.553711920694149, "learning_rate": 5e-06, "loss": 0.6037, "step": 770 }, { "epoch": 1.8806509945750451, "grad_norm": 0.566305473833487, "learning_rate": 5e-06, "loss": 0.6019, "step": 780 }, { "epoch": 1.9047619047619047, "grad_norm": 0.582333160680419, "learning_rate": 5e-06, "loss": 0.6051, "step": 790 }, { "epoch": 1.9288728149487642, "grad_norm": 0.509141986707748, "learning_rate": 5e-06, "loss": 0.6052, "step": 800 }, { "epoch": 1.952983725135624, "grad_norm": 0.4543923308424651, "learning_rate": 5e-06, "loss": 0.598, "step": 810 }, { "epoch": 1.9770946353224834, "grad_norm": 0.45958164108182104, "learning_rate": 5e-06, "loss": 0.6026, "step": 820 }, { "epoch": 1.998794454490657, "eval_loss": 0.6378007531166077, "eval_runtime": 222.8588, "eval_samples_per_second": 50.148, "eval_steps_per_second": 0.395, "step": 829 }, { "epoch": 2.001205545509343, "grad_norm": 0.7867156441363433, "learning_rate": 5e-06, "loss": 0.6043, "step": 830 }, { "epoch": 2.0253164556962027, "grad_norm": 0.5284619907031279, "learning_rate": 5e-06, "loss": 0.5707, "step": 840 }, { "epoch": 2.049427365883062, "grad_norm": 0.5795012320295118, "learning_rate": 5e-06, "loss": 0.5606, "step": 850 }, { "epoch": 2.0735382760699217, "grad_norm": 0.5627294692682645, "learning_rate": 5e-06, "loss": 0.563, "step": 860 }, { "epoch": 2.097649186256781, "grad_norm": 0.5052179539566712, "learning_rate": 5e-06, "loss": 0.5631, "step": 870 }, { "epoch": 2.1217600964436407, "grad_norm": 0.4428407773542258, "learning_rate": 5e-06, "loss": 0.5595, "step": 880 }, { "epoch": 2.1458710066305002, "grad_norm": 0.5267499633401915, "learning_rate": 5e-06, "loss": 0.5601, "step": 890 }, { "epoch": 2.1699819168173597, "grad_norm": 0.4655374512529405, "learning_rate": 5e-06, "loss": 0.5607, "step": 900 }, { "epoch": 2.1940928270042193, "grad_norm": 0.48398838396056276, "learning_rate": 5e-06, "loss": 0.5595, "step": 910 }, { "epoch": 2.2182037371910788, "grad_norm": 0.48096941817619093, "learning_rate": 5e-06, "loss": 0.5677, "step": 920 }, { "epoch": 2.2423146473779383, "grad_norm": 0.5154141010470734, "learning_rate": 5e-06, "loss": 0.5594, "step": 930 }, { "epoch": 2.2664255575647982, "grad_norm": 0.4799488446079912, "learning_rate": 5e-06, "loss": 0.5621, "step": 940 }, { "epoch": 2.2905364677516578, "grad_norm": 0.5540016498502853, "learning_rate": 5e-06, "loss": 0.5638, "step": 950 }, { "epoch": 2.3146473779385173, "grad_norm": 0.6082357481189948, "learning_rate": 5e-06, "loss": 0.564, "step": 960 }, { "epoch": 2.338758288125377, "grad_norm": 0.5420853183530063, "learning_rate": 5e-06, "loss": 0.5643, "step": 970 }, { "epoch": 2.3628691983122363, "grad_norm": 0.42570128293415416, "learning_rate": 5e-06, "loss": 0.5642, "step": 980 }, { "epoch": 2.386980108499096, "grad_norm": 0.5255517048498499, "learning_rate": 5e-06, "loss": 0.5672, "step": 990 }, { "epoch": 2.4110910186859553, "grad_norm": 0.5353694927594205, "learning_rate": 5e-06, "loss": 0.5565, "step": 1000 }, { "epoch": 2.435201928872815, "grad_norm": 0.4617633168683323, "learning_rate": 5e-06, "loss": 0.5657, "step": 1010 }, { "epoch": 2.4593128390596743, "grad_norm": 0.449869806649973, "learning_rate": 5e-06, "loss": 0.5586, "step": 1020 }, { "epoch": 2.483423749246534, "grad_norm": 0.5115337318725849, "learning_rate": 5e-06, "loss": 0.5576, "step": 1030 }, { "epoch": 2.507534659433394, "grad_norm": 0.6907411145245406, "learning_rate": 5e-06, "loss": 0.5681, "step": 1040 }, { "epoch": 2.5316455696202533, "grad_norm": 0.5238948140915647, "learning_rate": 5e-06, "loss": 0.5659, "step": 1050 }, { "epoch": 2.555756479807113, "grad_norm": 0.6589003211840228, "learning_rate": 5e-06, "loss": 0.5667, "step": 1060 }, { "epoch": 2.5798673899939724, "grad_norm": 0.4764556136945032, "learning_rate": 5e-06, "loss": 0.5629, "step": 1070 }, { "epoch": 2.603978300180832, "grad_norm": 0.44468254080490577, "learning_rate": 5e-06, "loss": 0.5687, "step": 1080 }, { "epoch": 2.6280892103676914, "grad_norm": 0.5124860341949249, "learning_rate": 5e-06, "loss": 0.5692, "step": 1090 }, { "epoch": 2.652200120554551, "grad_norm": 0.5228826110878407, "learning_rate": 5e-06, "loss": 0.5667, "step": 1100 }, { "epoch": 2.6763110307414104, "grad_norm": 0.5458373344595544, "learning_rate": 5e-06, "loss": 0.5617, "step": 1110 }, { "epoch": 2.70042194092827, "grad_norm": 0.43248189186264496, "learning_rate": 5e-06, "loss": 0.5589, "step": 1120 }, { "epoch": 2.7245328511151294, "grad_norm": 0.44951413853647815, "learning_rate": 5e-06, "loss": 0.5696, "step": 1130 }, { "epoch": 2.748643761301989, "grad_norm": 0.5059427152996532, "learning_rate": 5e-06, "loss": 0.5645, "step": 1140 }, { "epoch": 2.7727546714888485, "grad_norm": 0.4713166756254001, "learning_rate": 5e-06, "loss": 0.5659, "step": 1150 }, { "epoch": 2.7968655816757084, "grad_norm": 0.4662277376061737, "learning_rate": 5e-06, "loss": 0.5638, "step": 1160 }, { "epoch": 2.820976491862568, "grad_norm": 0.5055943494520574, "learning_rate": 5e-06, "loss": 0.5651, "step": 1170 }, { "epoch": 2.8450874020494274, "grad_norm": 0.49826856850045714, "learning_rate": 5e-06, "loss": 0.5664, "step": 1180 }, { "epoch": 2.869198312236287, "grad_norm": 0.46906591997365343, "learning_rate": 5e-06, "loss": 0.5708, "step": 1190 }, { "epoch": 2.8933092224231465, "grad_norm": 0.5743140790459712, "learning_rate": 5e-06, "loss": 0.5713, "step": 1200 }, { "epoch": 2.917420132610006, "grad_norm": 0.5413293244789124, "learning_rate": 5e-06, "loss": 0.566, "step": 1210 }, { "epoch": 2.9415310427968655, "grad_norm": 0.4769984493754597, "learning_rate": 5e-06, "loss": 0.5653, "step": 1220 }, { "epoch": 2.965641952983725, "grad_norm": 0.4784113431133355, "learning_rate": 5e-06, "loss": 0.5663, "step": 1230 }, { "epoch": 2.9897528631705845, "grad_norm": 0.46857130335535624, "learning_rate": 5e-06, "loss": 0.569, "step": 1240 }, { "epoch": 2.9945750452079567, "eval_loss": 0.6389562487602234, "eval_runtime": 223.5725, "eval_samples_per_second": 49.988, "eval_steps_per_second": 0.394, "step": 1242 }, { "epoch": 2.9945750452079567, "step": 1242, "total_flos": 2079977499525120.0, "train_loss": 0.615725677754376, "train_runtime": 37262.9121, "train_samples_per_second": 17.094, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 1242, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2079977499525120.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }