{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.996415770609319, "eval_steps": 500, "global_step": 1254, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023894862604540025, "grad_norm": 4.236361518666415, "learning_rate": 5e-06, "loss": 0.881, "step": 10 }, { "epoch": 0.04778972520908005, "grad_norm": 5.741199257309677, "learning_rate": 5e-06, "loss": 0.7803, "step": 20 }, { "epoch": 0.07168458781362007, "grad_norm": 19.255957544263854, "learning_rate": 5e-06, "loss": 0.7636, "step": 30 }, { "epoch": 0.0955794504181601, "grad_norm": 1.4389740219584857, "learning_rate": 5e-06, "loss": 0.7601, "step": 40 }, { "epoch": 0.11947431302270012, "grad_norm": 2.052033228969374, "learning_rate": 5e-06, "loss": 0.7307, "step": 50 }, { "epoch": 0.14336917562724014, "grad_norm": 1.4883942502737415, "learning_rate": 5e-06, "loss": 0.7194, "step": 60 }, { "epoch": 0.16726403823178015, "grad_norm": 0.7721229562083435, "learning_rate": 5e-06, "loss": 0.7186, "step": 70 }, { "epoch": 0.1911589008363202, "grad_norm": 0.5683129469939435, "learning_rate": 5e-06, "loss": 0.6965, "step": 80 }, { "epoch": 0.21505376344086022, "grad_norm": 0.527733611127623, "learning_rate": 5e-06, "loss": 0.6922, "step": 90 }, { "epoch": 0.23894862604540024, "grad_norm": 0.5540046789983225, "learning_rate": 5e-06, "loss": 0.693, "step": 100 }, { "epoch": 0.2628434886499403, "grad_norm": 0.5451390307514128, "learning_rate": 5e-06, "loss": 0.6802, "step": 110 }, { "epoch": 0.2867383512544803, "grad_norm": 0.5143838898116624, "learning_rate": 5e-06, "loss": 0.688, "step": 120 }, { "epoch": 0.3106332138590203, "grad_norm": 0.6502984472755421, "learning_rate": 5e-06, "loss": 0.6838, "step": 130 }, { "epoch": 0.3345280764635603, "grad_norm": 0.5635569077666838, "learning_rate": 5e-06, "loss": 0.6733, "step": 140 }, { "epoch": 0.35842293906810035, "grad_norm": 0.6029469287016763, "learning_rate": 5e-06, "loss": 0.6776, "step": 150 }, { "epoch": 0.3823178016726404, "grad_norm": 0.486292600711864, "learning_rate": 5e-06, "loss": 0.6661, "step": 160 }, { "epoch": 0.4062126642771804, "grad_norm": 0.6615883711779132, "learning_rate": 5e-06, "loss": 0.6652, "step": 170 }, { "epoch": 0.43010752688172044, "grad_norm": 0.4717863479299739, "learning_rate": 5e-06, "loss": 0.6655, "step": 180 }, { "epoch": 0.4540023894862604, "grad_norm": 0.4888275284899482, "learning_rate": 5e-06, "loss": 0.662, "step": 190 }, { "epoch": 0.4778972520908005, "grad_norm": 0.5394213188181476, "learning_rate": 5e-06, "loss": 0.6687, "step": 200 }, { "epoch": 0.5017921146953405, "grad_norm": 0.45576158948311507, "learning_rate": 5e-06, "loss": 0.6661, "step": 210 }, { "epoch": 0.5256869772998806, "grad_norm": 0.45151984287636476, "learning_rate": 5e-06, "loss": 0.6597, "step": 220 }, { "epoch": 0.5495818399044206, "grad_norm": 0.4799647706900106, "learning_rate": 5e-06, "loss": 0.6602, "step": 230 }, { "epoch": 0.5734767025089605, "grad_norm": 0.47076133511342133, "learning_rate": 5e-06, "loss": 0.6571, "step": 240 }, { "epoch": 0.5973715651135006, "grad_norm": 0.5919116297131423, "learning_rate": 5e-06, "loss": 0.6615, "step": 250 }, { "epoch": 0.6212664277180406, "grad_norm": 0.5500231129527917, "learning_rate": 5e-06, "loss": 0.6585, "step": 260 }, { "epoch": 0.6451612903225806, "grad_norm": 0.5242998976170237, "learning_rate": 5e-06, "loss": 0.6633, "step": 270 }, { "epoch": 0.6690561529271206, "grad_norm": 0.44132900428051, "learning_rate": 5e-06, "loss": 0.6588, "step": 280 }, { "epoch": 0.6929510155316607, "grad_norm": 0.6925054556015406, "learning_rate": 5e-06, "loss": 0.6553, "step": 290 }, { "epoch": 0.7168458781362007, "grad_norm": 0.4625241785333385, "learning_rate": 5e-06, "loss": 0.6574, "step": 300 }, { "epoch": 0.7407407407407407, "grad_norm": 0.4229402308269957, "learning_rate": 5e-06, "loss": 0.6527, "step": 310 }, { "epoch": 0.7646356033452808, "grad_norm": 0.5130609463277542, "learning_rate": 5e-06, "loss": 0.6561, "step": 320 }, { "epoch": 0.7885304659498208, "grad_norm": 0.6838274381409521, "learning_rate": 5e-06, "loss": 0.6555, "step": 330 }, { "epoch": 0.8124253285543608, "grad_norm": 0.4426103821343896, "learning_rate": 5e-06, "loss": 0.6528, "step": 340 }, { "epoch": 0.8363201911589009, "grad_norm": 0.4768048776745041, "learning_rate": 5e-06, "loss": 0.6526, "step": 350 }, { "epoch": 0.8602150537634409, "grad_norm": 0.47979657505843953, "learning_rate": 5e-06, "loss": 0.6507, "step": 360 }, { "epoch": 0.8841099163679809, "grad_norm": 0.43210991398577236, "learning_rate": 5e-06, "loss": 0.6545, "step": 370 }, { "epoch": 0.9080047789725209, "grad_norm": 0.4219482631866451, "learning_rate": 5e-06, "loss": 0.6561, "step": 380 }, { "epoch": 0.931899641577061, "grad_norm": 0.4889263682317913, "learning_rate": 5e-06, "loss": 0.6415, "step": 390 }, { "epoch": 0.955794504181601, "grad_norm": 0.4994356501839893, "learning_rate": 5e-06, "loss": 0.6434, "step": 400 }, { "epoch": 0.9796893667861409, "grad_norm": 0.5756138907013993, "learning_rate": 5e-06, "loss": 0.6535, "step": 410 }, { "epoch": 0.998805256869773, "eval_loss": 0.6466529965400696, "eval_runtime": 225.9354, "eval_samples_per_second": 49.895, "eval_steps_per_second": 0.394, "step": 418 }, { "epoch": 1.003584229390681, "grad_norm": 0.7256416169536216, "learning_rate": 5e-06, "loss": 0.6438, "step": 420 }, { "epoch": 1.027479091995221, "grad_norm": 0.6564158902335233, "learning_rate": 5e-06, "loss": 0.6108, "step": 430 }, { "epoch": 1.0513739545997611, "grad_norm": 0.4999679801637927, "learning_rate": 5e-06, "loss": 0.6106, "step": 440 }, { "epoch": 1.075268817204301, "grad_norm": 0.5241048691611577, "learning_rate": 5e-06, "loss": 0.6124, "step": 450 }, { "epoch": 1.099163679808841, "grad_norm": 0.5456228664692746, "learning_rate": 5e-06, "loss": 0.6042, "step": 460 }, { "epoch": 1.1230585424133812, "grad_norm": 0.5456744152195628, "learning_rate": 5e-06, "loss": 0.6028, "step": 470 }, { "epoch": 1.146953405017921, "grad_norm": 0.4664933079979728, "learning_rate": 5e-06, "loss": 0.6095, "step": 480 }, { "epoch": 1.1708482676224612, "grad_norm": 0.4894583019401931, "learning_rate": 5e-06, "loss": 0.6019, "step": 490 }, { "epoch": 1.194743130227001, "grad_norm": 0.4942642519947347, "learning_rate": 5e-06, "loss": 0.6114, "step": 500 }, { "epoch": 1.2186379928315412, "grad_norm": 0.46554339302452813, "learning_rate": 5e-06, "loss": 0.6099, "step": 510 }, { "epoch": 1.2425328554360813, "grad_norm": 0.5215764597896382, "learning_rate": 5e-06, "loss": 0.6073, "step": 520 }, { "epoch": 1.2664277180406214, "grad_norm": 0.5142341654295087, "learning_rate": 5e-06, "loss": 0.6105, "step": 530 }, { "epoch": 1.2903225806451613, "grad_norm": 0.4429903840954624, "learning_rate": 5e-06, "loss": 0.61, "step": 540 }, { "epoch": 1.3142174432497014, "grad_norm": 0.4244756990330428, "learning_rate": 5e-06, "loss": 0.6113, "step": 550 }, { "epoch": 1.3381123058542412, "grad_norm": 0.4664930270424248, "learning_rate": 5e-06, "loss": 0.6057, "step": 560 }, { "epoch": 1.3620071684587813, "grad_norm": 0.6747787167132405, "learning_rate": 5e-06, "loss": 0.6084, "step": 570 }, { "epoch": 1.3859020310633214, "grad_norm": 0.8515989236641928, "learning_rate": 5e-06, "loss": 0.6058, "step": 580 }, { "epoch": 1.4097968936678615, "grad_norm": 0.634857639704424, "learning_rate": 5e-06, "loss": 0.6058, "step": 590 }, { "epoch": 1.4336917562724014, "grad_norm": 0.5282115500074044, "learning_rate": 5e-06, "loss": 0.6096, "step": 600 }, { "epoch": 1.4575866188769415, "grad_norm": 0.5576953727126037, "learning_rate": 5e-06, "loss": 0.616, "step": 610 }, { "epoch": 1.4814814814814814, "grad_norm": 0.45965397939992636, "learning_rate": 5e-06, "loss": 0.6082, "step": 620 }, { "epoch": 1.5053763440860215, "grad_norm": 0.5729607655893968, "learning_rate": 5e-06, "loss": 0.6037, "step": 630 }, { "epoch": 1.5292712066905616, "grad_norm": 0.4420855639504453, "learning_rate": 5e-06, "loss": 0.608, "step": 640 }, { "epoch": 1.5531660692951017, "grad_norm": 0.4815965030552482, "learning_rate": 5e-06, "loss": 0.6053, "step": 650 }, { "epoch": 1.5770609318996416, "grad_norm": 0.5446732967871324, "learning_rate": 5e-06, "loss": 0.6076, "step": 660 }, { "epoch": 1.6009557945041815, "grad_norm": 0.5773921107864519, "learning_rate": 5e-06, "loss": 0.6058, "step": 670 }, { "epoch": 1.6248506571087216, "grad_norm": 0.44904612161350127, "learning_rate": 5e-06, "loss": 0.5988, "step": 680 }, { "epoch": 1.6487455197132617, "grad_norm": 0.4659803956684399, "learning_rate": 5e-06, "loss": 0.598, "step": 690 }, { "epoch": 1.6726403823178018, "grad_norm": 0.4361474003132107, "learning_rate": 5e-06, "loss": 0.6081, "step": 700 }, { "epoch": 1.6965352449223416, "grad_norm": 0.4702827100539838, "learning_rate": 5e-06, "loss": 0.5997, "step": 710 }, { "epoch": 1.7204301075268817, "grad_norm": 0.46962735672309736, "learning_rate": 5e-06, "loss": 0.6135, "step": 720 }, { "epoch": 1.7443249701314216, "grad_norm": 0.5064462322593579, "learning_rate": 5e-06, "loss": 0.6034, "step": 730 }, { "epoch": 1.7682198327359617, "grad_norm": 0.6442892941899157, "learning_rate": 5e-06, "loss": 0.6044, "step": 740 }, { "epoch": 1.7921146953405018, "grad_norm": 0.449859458258856, "learning_rate": 5e-06, "loss": 0.6062, "step": 750 }, { "epoch": 1.816009557945042, "grad_norm": 0.47467567108778363, "learning_rate": 5e-06, "loss": 0.6035, "step": 760 }, { "epoch": 1.8399044205495818, "grad_norm": 0.43550415026449085, "learning_rate": 5e-06, "loss": 0.5987, "step": 770 }, { "epoch": 1.863799283154122, "grad_norm": 0.48913780227876247, "learning_rate": 5e-06, "loss": 0.6031, "step": 780 }, { "epoch": 1.8876941457586618, "grad_norm": 0.5594004132295759, "learning_rate": 5e-06, "loss": 0.5995, "step": 790 }, { "epoch": 1.911589008363202, "grad_norm": 0.4971730954697683, "learning_rate": 5e-06, "loss": 0.6056, "step": 800 }, { "epoch": 1.935483870967742, "grad_norm": 0.46186692571258725, "learning_rate": 5e-06, "loss": 0.6004, "step": 810 }, { "epoch": 1.959378733572282, "grad_norm": 0.4508830943663248, "learning_rate": 5e-06, "loss": 0.604, "step": 820 }, { "epoch": 1.983273596176822, "grad_norm": 0.501207898912081, "learning_rate": 5e-06, "loss": 0.608, "step": 830 }, { "epoch": 2.0, "eval_loss": 0.6365451812744141, "eval_runtime": 226.5918, "eval_samples_per_second": 49.75, "eval_steps_per_second": 0.393, "step": 837 }, { "epoch": 2.007168458781362, "grad_norm": 0.85054207213812, "learning_rate": 5e-06, "loss": 0.592, "step": 840 }, { "epoch": 2.031063321385902, "grad_norm": 0.5201507086108782, "learning_rate": 5e-06, "loss": 0.5608, "step": 850 }, { "epoch": 2.054958183990442, "grad_norm": 0.7188535226812537, "learning_rate": 5e-06, "loss": 0.5647, "step": 860 }, { "epoch": 2.078853046594982, "grad_norm": 0.4763195641282365, "learning_rate": 5e-06, "loss": 0.5644, "step": 870 }, { "epoch": 2.1027479091995223, "grad_norm": 0.5081415859208832, "learning_rate": 5e-06, "loss": 0.5622, "step": 880 }, { "epoch": 2.126642771804062, "grad_norm": 0.5931792000293172, "learning_rate": 5e-06, "loss": 0.5563, "step": 890 }, { "epoch": 2.150537634408602, "grad_norm": 0.49851033855755, "learning_rate": 5e-06, "loss": 0.5611, "step": 900 }, { "epoch": 2.174432497013142, "grad_norm": 0.5379278365329638, "learning_rate": 5e-06, "loss": 0.5573, "step": 910 }, { "epoch": 2.198327359617682, "grad_norm": 0.5350268044233742, "learning_rate": 5e-06, "loss": 0.5678, "step": 920 }, { "epoch": 2.2222222222222223, "grad_norm": 0.5689123372030673, "learning_rate": 5e-06, "loss": 0.5686, "step": 930 }, { "epoch": 2.2461170848267624, "grad_norm": 0.6671996296787344, "learning_rate": 5e-06, "loss": 0.5622, "step": 940 }, { "epoch": 2.270011947431302, "grad_norm": 0.4506810352733908, "learning_rate": 5e-06, "loss": 0.5642, "step": 950 }, { "epoch": 2.293906810035842, "grad_norm": 0.5358151280205125, "learning_rate": 5e-06, "loss": 0.5628, "step": 960 }, { "epoch": 2.3178016726403823, "grad_norm": 0.5566771627404731, "learning_rate": 5e-06, "loss": 0.5634, "step": 970 }, { "epoch": 2.3416965352449224, "grad_norm": 0.49963936030628325, "learning_rate": 5e-06, "loss": 0.5632, "step": 980 }, { "epoch": 2.3655913978494625, "grad_norm": 0.48679480824629434, "learning_rate": 5e-06, "loss": 0.5583, "step": 990 }, { "epoch": 2.389486260454002, "grad_norm": 0.5074816823498985, "learning_rate": 5e-06, "loss": 0.5636, "step": 1000 }, { "epoch": 2.4133811230585422, "grad_norm": 0.5739148795335686, "learning_rate": 5e-06, "loss": 0.5671, "step": 1010 }, { "epoch": 2.4372759856630823, "grad_norm": 0.6501742104516552, "learning_rate": 5e-06, "loss": 0.5666, "step": 1020 }, { "epoch": 2.4611708482676224, "grad_norm": 0.43406800220014613, "learning_rate": 5e-06, "loss": 0.5645, "step": 1030 }, { "epoch": 2.4850657108721625, "grad_norm": 0.47946981158627366, "learning_rate": 5e-06, "loss": 0.5612, "step": 1040 }, { "epoch": 2.5089605734767026, "grad_norm": 0.5508677225984592, "learning_rate": 5e-06, "loss": 0.5658, "step": 1050 }, { "epoch": 2.5328554360812428, "grad_norm": 0.6172108213167418, "learning_rate": 5e-06, "loss": 0.5656, "step": 1060 }, { "epoch": 2.5567502986857824, "grad_norm": 0.6149816712572169, "learning_rate": 5e-06, "loss": 0.5637, "step": 1070 }, { "epoch": 2.5806451612903225, "grad_norm": 0.5494076230620691, "learning_rate": 5e-06, "loss": 0.5748, "step": 1080 }, { "epoch": 2.6045400238948626, "grad_norm": 0.5098015036653776, "learning_rate": 5e-06, "loss": 0.5665, "step": 1090 }, { "epoch": 2.6284348864994027, "grad_norm": 0.4763003977246298, "learning_rate": 5e-06, "loss": 0.5659, "step": 1100 }, { "epoch": 2.652329749103943, "grad_norm": 0.45015059391064355, "learning_rate": 5e-06, "loss": 0.5613, "step": 1110 }, { "epoch": 2.6762246117084825, "grad_norm": 0.5195016081388676, "learning_rate": 5e-06, "loss": 0.5661, "step": 1120 }, { "epoch": 2.7001194743130226, "grad_norm": 0.461979850463992, "learning_rate": 5e-06, "loss": 0.5703, "step": 1130 }, { "epoch": 2.7240143369175627, "grad_norm": 0.4611698536891998, "learning_rate": 5e-06, "loss": 0.5628, "step": 1140 }, { "epoch": 2.7479091995221028, "grad_norm": 0.5474996121575114, "learning_rate": 5e-06, "loss": 0.5692, "step": 1150 }, { "epoch": 2.771804062126643, "grad_norm": 0.477411852958178, "learning_rate": 5e-06, "loss": 0.5745, "step": 1160 }, { "epoch": 2.795698924731183, "grad_norm": 0.48004817339516165, "learning_rate": 5e-06, "loss": 0.5625, "step": 1170 }, { "epoch": 2.819593787335723, "grad_norm": 0.5043226922994581, "learning_rate": 5e-06, "loss": 0.5664, "step": 1180 }, { "epoch": 2.8434886499402627, "grad_norm": 0.4988305698181874, "learning_rate": 5e-06, "loss": 0.5649, "step": 1190 }, { "epoch": 2.867383512544803, "grad_norm": 0.4569103859069353, "learning_rate": 5e-06, "loss": 0.5647, "step": 1200 }, { "epoch": 2.891278375149343, "grad_norm": 0.46286445346886024, "learning_rate": 5e-06, "loss": 0.5621, "step": 1210 }, { "epoch": 2.915173237753883, "grad_norm": 0.5296890930558641, "learning_rate": 5e-06, "loss": 0.5668, "step": 1220 }, { "epoch": 2.9390681003584227, "grad_norm": 0.5546209266748766, "learning_rate": 5e-06, "loss": 0.5688, "step": 1230 }, { "epoch": 2.962962962962963, "grad_norm": 0.5910470543653078, "learning_rate": 5e-06, "loss": 0.5668, "step": 1240 }, { "epoch": 2.986857825567503, "grad_norm": 0.5524701632367459, "learning_rate": 5e-06, "loss": 0.5714, "step": 1250 }, { "epoch": 2.996415770609319, "eval_loss": 0.6373269557952881, "eval_runtime": 227.0283, "eval_samples_per_second": 49.655, "eval_steps_per_second": 0.392, "step": 1254 }, { "epoch": 2.996415770609319, "step": 1254, "total_flos": 2100077946470400.0, "train_loss": 0.6173035094612523, "train_runtime": 37751.1113, "train_samples_per_second": 17.021, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 1254, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2100077946470400.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }