{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9945750452079567, "eval_steps": 500, "global_step": 1242, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024110910186859555, "grad_norm": 2.685936547429143, "learning_rate": 5e-06, "loss": 0.9133, "step": 10 }, { "epoch": 0.04822182037371911, "grad_norm": 2.8983680167236696, "learning_rate": 5e-06, "loss": 0.7794, "step": 20 }, { "epoch": 0.07233273056057866, "grad_norm": 1.4715031186017837, "learning_rate": 5e-06, "loss": 0.759, "step": 30 }, { "epoch": 0.09644364074743822, "grad_norm": 0.8377570160614484, "learning_rate": 5e-06, "loss": 0.7317, "step": 40 }, { "epoch": 0.12055455093429777, "grad_norm": 1.0172390996140888, "learning_rate": 5e-06, "loss": 0.728, "step": 50 }, { "epoch": 0.14466546112115733, "grad_norm": 2.076736903100202, "learning_rate": 5e-06, "loss": 0.7065, "step": 60 }, { "epoch": 0.16877637130801687, "grad_norm": 0.9324814149418421, "learning_rate": 5e-06, "loss": 0.7025, "step": 70 }, { "epoch": 0.19288728149487644, "grad_norm": 0.8961533346444337, "learning_rate": 5e-06, "loss": 0.6948, "step": 80 }, { "epoch": 0.21699819168173598, "grad_norm": 0.9833679322557037, "learning_rate": 5e-06, "loss": 0.6997, "step": 90 }, { "epoch": 0.24110910186859555, "grad_norm": 0.6320425718923794, "learning_rate": 5e-06, "loss": 0.6781, "step": 100 }, { "epoch": 0.2652200120554551, "grad_norm": 0.8047932595929222, "learning_rate": 5e-06, "loss": 0.6857, "step": 110 }, { "epoch": 0.28933092224231466, "grad_norm": 0.6873832051216665, "learning_rate": 5e-06, "loss": 0.6762, "step": 120 }, { "epoch": 0.3134418324291742, "grad_norm": 0.8515251273118922, "learning_rate": 5e-06, "loss": 0.6741, "step": 130 }, { "epoch": 0.33755274261603374, "grad_norm": 0.6815399625732373, "learning_rate": 5e-06, "loss": 0.6786, "step": 140 }, { "epoch": 0.3616636528028933, "grad_norm": 0.6231823130658575, "learning_rate": 5e-06, "loss": 0.6763, "step": 150 }, { "epoch": 0.3857745629897529, "grad_norm": 0.552827319574485, "learning_rate": 5e-06, "loss": 0.6711, "step": 160 }, { "epoch": 0.4098854731766124, "grad_norm": 0.6826986498299203, "learning_rate": 5e-06, "loss": 0.6706, "step": 170 }, { "epoch": 0.43399638336347196, "grad_norm": 0.5574310360062503, "learning_rate": 5e-06, "loss": 0.6659, "step": 180 }, { "epoch": 0.45810729355033153, "grad_norm": 0.7613567669157012, "learning_rate": 5e-06, "loss": 0.6658, "step": 190 }, { "epoch": 0.4822182037371911, "grad_norm": 0.5609659476480818, "learning_rate": 5e-06, "loss": 0.6598, "step": 200 }, { "epoch": 0.5063291139240507, "grad_norm": 1.078834895881199, "learning_rate": 5e-06, "loss": 0.6687, "step": 210 }, { "epoch": 0.5304400241109102, "grad_norm": 0.6016551358752319, "learning_rate": 5e-06, "loss": 0.6667, "step": 220 }, { "epoch": 0.5545509342977697, "grad_norm": 0.5329067498961892, "learning_rate": 5e-06, "loss": 0.6568, "step": 230 }, { "epoch": 0.5786618444846293, "grad_norm": 0.5844269800148942, "learning_rate": 5e-06, "loss": 0.6656, "step": 240 }, { "epoch": 0.6027727546714888, "grad_norm": 0.713015217035973, "learning_rate": 5e-06, "loss": 0.6584, "step": 250 }, { "epoch": 0.6268836648583485, "grad_norm": 0.7063878216983879, "learning_rate": 5e-06, "loss": 0.665, "step": 260 }, { "epoch": 0.650994575045208, "grad_norm": 0.50774960805631, "learning_rate": 5e-06, "loss": 0.6615, "step": 270 }, { "epoch": 0.6751054852320675, "grad_norm": 0.6111313528033431, "learning_rate": 5e-06, "loss": 0.6551, "step": 280 }, { "epoch": 0.6992163954189271, "grad_norm": 0.6458858962308502, "learning_rate": 5e-06, "loss": 0.6535, "step": 290 }, { "epoch": 0.7233273056057866, "grad_norm": 0.6797329430329018, "learning_rate": 5e-06, "loss": 0.6616, "step": 300 }, { "epoch": 0.7474382157926461, "grad_norm": 1.0271382997748104, "learning_rate": 5e-06, "loss": 0.6593, "step": 310 }, { "epoch": 0.7715491259795058, "grad_norm": 0.5821025343959978, "learning_rate": 5e-06, "loss": 0.6556, "step": 320 }, { "epoch": 0.7956600361663653, "grad_norm": 0.575144218324774, "learning_rate": 5e-06, "loss": 0.6522, "step": 330 }, { "epoch": 0.8197709463532248, "grad_norm": 0.4992177743591918, "learning_rate": 5e-06, "loss": 0.6472, "step": 340 }, { "epoch": 0.8438818565400844, "grad_norm": 0.5518799725500897, "learning_rate": 5e-06, "loss": 0.6486, "step": 350 }, { "epoch": 0.8679927667269439, "grad_norm": 0.6827706978670125, "learning_rate": 5e-06, "loss": 0.6527, "step": 360 }, { "epoch": 0.8921036769138035, "grad_norm": 0.5370276906753118, "learning_rate": 5e-06, "loss": 0.6564, "step": 370 }, { "epoch": 0.9162145871006631, "grad_norm": 0.5011748190469159, "learning_rate": 5e-06, "loss": 0.648, "step": 380 }, { "epoch": 0.9403254972875226, "grad_norm": 0.7289445343800255, "learning_rate": 5e-06, "loss": 0.645, "step": 390 }, { "epoch": 0.9644364074743822, "grad_norm": 0.5223137931656774, "learning_rate": 5e-06, "loss": 0.6481, "step": 400 }, { "epoch": 0.9885473176612417, "grad_norm": 0.5702001612329072, "learning_rate": 5e-06, "loss": 0.6417, "step": 410 }, { "epoch": 0.9981916817359855, "eval_loss": 0.6491908431053162, "eval_runtime": 223.3579, "eval_samples_per_second": 50.036, "eval_steps_per_second": 0.394, "step": 414 }, { "epoch": 1.0126582278481013, "grad_norm": 0.6141037997267318, "learning_rate": 5e-06, "loss": 0.6207, "step": 420 }, { "epoch": 1.0367691380349608, "grad_norm": 0.5738222179228437, "learning_rate": 5e-06, "loss": 0.6073, "step": 430 }, { "epoch": 1.0608800482218204, "grad_norm": 0.6152321417799828, "learning_rate": 5e-06, "loss": 0.5963, "step": 440 }, { "epoch": 1.0849909584086799, "grad_norm": 0.6230797448075694, "learning_rate": 5e-06, "loss": 0.6035, "step": 450 }, { "epoch": 1.1091018685955394, "grad_norm": 0.5547485435536735, "learning_rate": 5e-06, "loss": 0.6043, "step": 460 }, { "epoch": 1.1332127787823991, "grad_norm": 0.6897788261093171, "learning_rate": 5e-06, "loss": 0.6059, "step": 470 }, { "epoch": 1.1573236889692586, "grad_norm": 0.5319379437293987, "learning_rate": 5e-06, "loss": 0.5991, "step": 480 }, { "epoch": 1.1814345991561181, "grad_norm": 0.5927433636509655, "learning_rate": 5e-06, "loss": 0.6033, "step": 490 }, { "epoch": 1.2055455093429777, "grad_norm": 0.6178241976987927, "learning_rate": 5e-06, "loss": 0.6, "step": 500 }, { "epoch": 1.2296564195298372, "grad_norm": 0.5009847922110348, "learning_rate": 5e-06, "loss": 0.5987, "step": 510 }, { "epoch": 1.253767329716697, "grad_norm": 0.6865827636690425, "learning_rate": 5e-06, "loss": 0.6039, "step": 520 }, { "epoch": 1.2778782399035564, "grad_norm": 0.6419339118636196, "learning_rate": 5e-06, "loss": 0.5977, "step": 530 }, { "epoch": 1.301989150090416, "grad_norm": 0.5403820568820131, "learning_rate": 5e-06, "loss": 0.6053, "step": 540 }, { "epoch": 1.3261000602772754, "grad_norm": 0.496944344094317, "learning_rate": 5e-06, "loss": 0.6043, "step": 550 }, { "epoch": 1.350210970464135, "grad_norm": 0.6835364259470225, "learning_rate": 5e-06, "loss": 0.6015, "step": 560 }, { "epoch": 1.3743218806509945, "grad_norm": 0.5433357613957998, "learning_rate": 5e-06, "loss": 0.5979, "step": 570 }, { "epoch": 1.3984327908378542, "grad_norm": 0.48293592352088544, "learning_rate": 5e-06, "loss": 0.6039, "step": 580 }, { "epoch": 1.4225437010247137, "grad_norm": 0.5167692584382013, "learning_rate": 5e-06, "loss": 0.6029, "step": 590 }, { "epoch": 1.4466546112115732, "grad_norm": 0.5467014681458703, "learning_rate": 5e-06, "loss": 0.6056, "step": 600 }, { "epoch": 1.4707655213984328, "grad_norm": 0.48669984975762765, "learning_rate": 5e-06, "loss": 0.6053, "step": 610 }, { "epoch": 1.4948764315852923, "grad_norm": 0.5052139384494145, "learning_rate": 5e-06, "loss": 0.607, "step": 620 }, { "epoch": 1.518987341772152, "grad_norm": 0.5189039466272587, "learning_rate": 5e-06, "loss": 0.6079, "step": 630 }, { "epoch": 1.5430982519590115, "grad_norm": 0.5340411087467901, "learning_rate": 5e-06, "loss": 0.5966, "step": 640 }, { "epoch": 1.567209162145871, "grad_norm": 0.6320951914134804, "learning_rate": 5e-06, "loss": 0.6119, "step": 650 }, { "epoch": 1.5913200723327305, "grad_norm": 0.5402636477743581, "learning_rate": 5e-06, "loss": 0.6018, "step": 660 }, { "epoch": 1.61543098251959, "grad_norm": 0.6023321834042192, "learning_rate": 5e-06, "loss": 0.6023, "step": 670 }, { "epoch": 1.6395418927064496, "grad_norm": 0.49282224066247415, "learning_rate": 5e-06, "loss": 0.6002, "step": 680 }, { "epoch": 1.663652802893309, "grad_norm": 0.6838051107799483, "learning_rate": 5e-06, "loss": 0.601, "step": 690 }, { "epoch": 1.6877637130801688, "grad_norm": 0.4809683173497573, "learning_rate": 5e-06, "loss": 0.6012, "step": 700 }, { "epoch": 1.7118746232670283, "grad_norm": 0.5130004764470846, "learning_rate": 5e-06, "loss": 0.6019, "step": 710 }, { "epoch": 1.7359855334538878, "grad_norm": 0.5222089493788711, "learning_rate": 5e-06, "loss": 0.6029, "step": 720 }, { "epoch": 1.7600964436407476, "grad_norm": 0.5537154673186192, "learning_rate": 5e-06, "loss": 0.6039, "step": 730 }, { "epoch": 1.784207353827607, "grad_norm": 0.5081950888314039, "learning_rate": 5e-06, "loss": 0.5973, "step": 740 }, { "epoch": 1.8083182640144666, "grad_norm": 0.5806567422134803, "learning_rate": 5e-06, "loss": 0.6072, "step": 750 }, { "epoch": 1.8324291742013261, "grad_norm": 0.5192410257029635, "learning_rate": 5e-06, "loss": 0.6026, "step": 760 }, { "epoch": 1.8565400843881856, "grad_norm": 0.5487344170749389, "learning_rate": 5e-06, "loss": 0.6009, "step": 770 }, { "epoch": 1.8806509945750451, "grad_norm": 0.5324805374861366, "learning_rate": 5e-06, "loss": 0.5994, "step": 780 }, { "epoch": 1.9047619047619047, "grad_norm": 0.6058321884008855, "learning_rate": 5e-06, "loss": 0.6025, "step": 790 }, { "epoch": 1.9288728149487642, "grad_norm": 0.57365525151735, "learning_rate": 5e-06, "loss": 0.6026, "step": 800 }, { "epoch": 1.952983725135624, "grad_norm": 0.5436955562661013, "learning_rate": 5e-06, "loss": 0.5953, "step": 810 }, { "epoch": 1.9770946353224834, "grad_norm": 0.6042343773815075, "learning_rate": 5e-06, "loss": 0.6, "step": 820 }, { "epoch": 1.998794454490657, "eval_loss": 0.6393378973007202, "eval_runtime": 225.3367, "eval_samples_per_second": 49.597, "eval_steps_per_second": 0.391, "step": 829 }, { "epoch": 2.001205545509343, "grad_norm": 0.9670296692720087, "learning_rate": 5e-06, "loss": 0.6016, "step": 830 }, { "epoch": 2.0253164556962027, "grad_norm": 0.6837527713124405, "learning_rate": 5e-06, "loss": 0.5631, "step": 840 }, { "epoch": 2.049427365883062, "grad_norm": 0.5935688974373606, "learning_rate": 5e-06, "loss": 0.5531, "step": 850 }, { "epoch": 2.0735382760699217, "grad_norm": 0.5980530217682797, "learning_rate": 5e-06, "loss": 0.5554, "step": 860 }, { "epoch": 2.097649186256781, "grad_norm": 0.5752374885434699, "learning_rate": 5e-06, "loss": 0.5557, "step": 870 }, { "epoch": 2.1217600964436407, "grad_norm": 0.5042143345935887, "learning_rate": 5e-06, "loss": 0.5522, "step": 880 }, { "epoch": 2.1458710066305002, "grad_norm": 0.5980920545311946, "learning_rate": 5e-06, "loss": 0.553, "step": 890 }, { "epoch": 2.1699819168173597, "grad_norm": 0.5290062022586566, "learning_rate": 5e-06, "loss": 0.5541, "step": 900 }, { "epoch": 2.1940928270042193, "grad_norm": 0.6029389321066391, "learning_rate": 5e-06, "loss": 0.5527, "step": 910 }, { "epoch": 2.2182037371910788, "grad_norm": 0.5761620842575014, "learning_rate": 5e-06, "loss": 0.561, "step": 920 }, { "epoch": 2.2423146473779383, "grad_norm": 0.5382086109948551, "learning_rate": 5e-06, "loss": 0.5528, "step": 930 }, { "epoch": 2.2664255575647982, "grad_norm": 0.5536204411197307, "learning_rate": 5e-06, "loss": 0.5552, "step": 940 }, { "epoch": 2.2905364677516578, "grad_norm": 0.7414422036930762, "learning_rate": 5e-06, "loss": 0.557, "step": 950 }, { "epoch": 2.3146473779385173, "grad_norm": 0.6072913873182035, "learning_rate": 5e-06, "loss": 0.5573, "step": 960 }, { "epoch": 2.338758288125377, "grad_norm": 0.5786725716853928, "learning_rate": 5e-06, "loss": 0.5577, "step": 970 }, { "epoch": 2.3628691983122363, "grad_norm": 0.5958758621711483, "learning_rate": 5e-06, "loss": 0.5573, "step": 980 }, { "epoch": 2.386980108499096, "grad_norm": 0.5427800525323759, "learning_rate": 5e-06, "loss": 0.5605, "step": 990 }, { "epoch": 2.4110910186859553, "grad_norm": 0.5008520202035274, "learning_rate": 5e-06, "loss": 0.55, "step": 1000 }, { "epoch": 2.435201928872815, "grad_norm": 0.5438627458062395, "learning_rate": 5e-06, "loss": 0.5591, "step": 1010 }, { "epoch": 2.4593128390596743, "grad_norm": 0.523458598668171, "learning_rate": 5e-06, "loss": 0.5523, "step": 1020 }, { "epoch": 2.483423749246534, "grad_norm": 0.562845339140823, "learning_rate": 5e-06, "loss": 0.5513, "step": 1030 }, { "epoch": 2.507534659433394, "grad_norm": 0.71192454951128, "learning_rate": 5e-06, "loss": 0.5617, "step": 1040 }, { "epoch": 2.5316455696202533, "grad_norm": 0.5488684911452221, "learning_rate": 5e-06, "loss": 0.5594, "step": 1050 }, { "epoch": 2.555756479807113, "grad_norm": 0.6322721667592042, "learning_rate": 5e-06, "loss": 0.5603, "step": 1060 }, { "epoch": 2.5798673899939724, "grad_norm": 0.5208011078844106, "learning_rate": 5e-06, "loss": 0.5564, "step": 1070 }, { "epoch": 2.603978300180832, "grad_norm": 0.5150689754075237, "learning_rate": 5e-06, "loss": 0.5624, "step": 1080 }, { "epoch": 2.6280892103676914, "grad_norm": 0.5338754237375813, "learning_rate": 5e-06, "loss": 0.5628, "step": 1090 }, { "epoch": 2.652200120554551, "grad_norm": 0.5072044155960452, "learning_rate": 5e-06, "loss": 0.5606, "step": 1100 }, { "epoch": 2.6763110307414104, "grad_norm": 0.7238515722776927, "learning_rate": 5e-06, "loss": 0.5557, "step": 1110 }, { "epoch": 2.70042194092827, "grad_norm": 0.5147434745712806, "learning_rate": 5e-06, "loss": 0.553, "step": 1120 }, { "epoch": 2.7245328511151294, "grad_norm": 0.5564967074947503, "learning_rate": 5e-06, "loss": 0.5635, "step": 1130 }, { "epoch": 2.748643761301989, "grad_norm": 0.5501220049253929, "learning_rate": 5e-06, "loss": 0.5583, "step": 1140 }, { "epoch": 2.7727546714888485, "grad_norm": 0.5103459117518057, "learning_rate": 5e-06, "loss": 0.5597, "step": 1150 }, { "epoch": 2.7968655816757084, "grad_norm": 0.5479118611862815, "learning_rate": 5e-06, "loss": 0.5579, "step": 1160 }, { "epoch": 2.820976491862568, "grad_norm": 0.5471001762934908, "learning_rate": 5e-06, "loss": 0.5591, "step": 1170 }, { "epoch": 2.8450874020494274, "grad_norm": 0.6232136492982399, "learning_rate": 5e-06, "loss": 0.5606, "step": 1180 }, { "epoch": 2.869198312236287, "grad_norm": 0.5669388319949817, "learning_rate": 5e-06, "loss": 0.5649, "step": 1190 }, { "epoch": 2.8933092224231465, "grad_norm": 0.6969387028585086, "learning_rate": 5e-06, "loss": 0.5651, "step": 1200 }, { "epoch": 2.917420132610006, "grad_norm": 0.6374387529410114, "learning_rate": 5e-06, "loss": 0.56, "step": 1210 }, { "epoch": 2.9415310427968655, "grad_norm": 0.560816628841587, "learning_rate": 5e-06, "loss": 0.5594, "step": 1220 }, { "epoch": 2.965641952983725, "grad_norm": 0.6033572013760955, "learning_rate": 5e-06, "loss": 0.5604, "step": 1230 }, { "epoch": 2.9897528631705845, "grad_norm": 0.5557325437050415, "learning_rate": 5e-06, "loss": 0.5631, "step": 1240 }, { "epoch": 2.9945750452079567, "eval_loss": 0.6422178745269775, "eval_runtime": 225.1456, "eval_samples_per_second": 49.639, "eval_steps_per_second": 0.391, "step": 1242 }, { "epoch": 2.9945750452079567, "step": 1242, "total_flos": 2079977499525120.0, "train_loss": 0.6134321775029439, "train_runtime": 37323.5248, "train_samples_per_second": 17.067, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 1242, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2079977499525120.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }