{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.4676125848241828, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.030845157310302282, "grad_norm": 2.8206074237823486, "learning_rate": 1.0277492291880782e-05, "loss": 1.8082, "step": 50 }, { "epoch": 0.061690314620604564, "grad_norm": 3.4183013439178467, "learning_rate": 2.0554984583761563e-05, "loss": 0.6538, "step": 100 }, { "epoch": 0.09253547193090685, "grad_norm": 2.170591354370117, "learning_rate": 3.083247687564235e-05, "loss": 0.4563, "step": 150 }, { "epoch": 0.12338062924120913, "grad_norm": 1.4687080383300781, "learning_rate": 4.110996916752313e-05, "loss": 0.4263, "step": 200 }, { "epoch": 0.15422578655151142, "grad_norm": 1.836676836013794, "learning_rate": 5.1387461459403907e-05, "loss": 0.3994, "step": 250 }, { "epoch": 0.1850709438618137, "grad_norm": 1.2718663215637207, "learning_rate": 6.16649537512847e-05, "loss": 0.3665, "step": 300 }, { "epoch": 0.215916101172116, "grad_norm": 1.6945191621780396, "learning_rate": 7.194244604316547e-05, "loss": 0.3577, "step": 350 }, { "epoch": 0.24676125848241826, "grad_norm": 1.2829898595809937, "learning_rate": 8.221993833504625e-05, "loss": 0.347, "step": 400 }, { "epoch": 0.27760641579272055, "grad_norm": 1.01521635055542, "learning_rate": 9.249743062692704e-05, "loss": 0.3288, "step": 450 }, { "epoch": 0.30845157310302285, "grad_norm": 1.522111415863037, "learning_rate": 0.00010277492291880781, "loss": 0.3267, "step": 500 }, { "epoch": 0.3392967304133251, "grad_norm": 0.9678927659988403, "learning_rate": 0.00011305241521068859, "loss": 0.3198, "step": 550 }, { "epoch": 0.3701418877236274, "grad_norm": 1.2144405841827393, "learning_rate": 0.0001233299075025694, "loss": 0.3099, "step": 600 }, { "epoch": 0.4009870450339297, "grad_norm": 1.3122639656066895, "learning_rate": 0.00013360739979445017, "loss": 0.2929, "step": 650 }, { "epoch": 0.431832202344232, "grad_norm": 1.0934101343154907, "learning_rate": 0.00014388489208633093, "loss": 0.3003, "step": 700 }, { "epoch": 0.4626773596545342, "grad_norm": 0.7938969731330872, "learning_rate": 0.00015416238437821172, "loss": 0.2956, "step": 750 }, { "epoch": 0.4935225169648365, "grad_norm": 0.6571168303489685, "learning_rate": 0.0001644398766700925, "loss": 0.2736, "step": 800 }, { "epoch": 0.5243676742751388, "grad_norm": 1.0073938369750977, "learning_rate": 0.0001747173689619733, "loss": 0.2892, "step": 850 }, { "epoch": 0.5552128315854411, "grad_norm": 0.9874083399772644, "learning_rate": 0.00018499486125385408, "loss": 0.2723, "step": 900 }, { "epoch": 0.5860579888957433, "grad_norm": 1.1770968437194824, "learning_rate": 0.00019527235354573487, "loss": 0.2855, "step": 950 }, { "epoch": 0.6169031462060457, "grad_norm": 1.00326669216156, "learning_rate": 0.00019997622717095418, "loss": 0.2587, "step": 1000 }, { "epoch": 0.6477483035163479, "grad_norm": 1.0380828380584717, "learning_rate": 0.0001998067088192682, "loss": 0.2764, "step": 1050 }, { "epoch": 0.6785934608266502, "grad_norm": 1.430301547050476, "learning_rate": 0.00019947447034120033, "loss": 0.2565, "step": 1100 }, { "epoch": 0.7094386181369525, "grad_norm": 1.0970648527145386, "learning_rate": 0.00019898005340261433, "loss": 0.2685, "step": 1150 }, { "epoch": 0.7402837754472548, "grad_norm": 0.9110261797904968, "learning_rate": 0.0001983242640774473, "loss": 0.2489, "step": 1200 }, { "epoch": 0.7711289327575571, "grad_norm": 1.0265332460403442, 
"learning_rate": 0.00019750817153352506, "loss": 0.2425, "step": 1250 }, { "epoch": 0.8019740900678594, "grad_norm": 0.8820884823799133, "learning_rate": 0.00019653310628944164, "loss": 0.2402, "step": 1300 }, { "epoch": 0.8328192473781616, "grad_norm": 0.933083713054657, "learning_rate": 0.00019540065804534467, "loss": 0.2444, "step": 1350 }, { "epoch": 0.863664404688464, "grad_norm": 0.8229042887687683, "learning_rate": 0.00019411267309116375, "loss": 0.2172, "step": 1400 }, { "epoch": 0.8945095619987662, "grad_norm": 0.9641085267066956, "learning_rate": 0.00019267125129650688, "loss": 0.2501, "step": 1450 }, { "epoch": 0.9253547193090684, "grad_norm": 0.6993410587310791, "learning_rate": 0.00019107874268713254, "loss": 0.2246, "step": 1500 }, { "epoch": 0.9561998766193708, "grad_norm": 1.0574674606323242, "learning_rate": 0.00018933774361357917, "loss": 0.2265, "step": 1550 }, { "epoch": 0.987045033929673, "grad_norm": 1.0701500177383423, "learning_rate": 0.0001874510925181983, "loss": 0.2116, "step": 1600 }, { "epoch": 1.0178901912399754, "grad_norm": 0.44411325454711914, "learning_rate": 0.0001854218653074927, "loss": 0.1978, "step": 1650 }, { "epoch": 1.0487353485502775, "grad_norm": 0.8408161401748657, "learning_rate": 0.0001832533703373043, "loss": 0.1701, "step": 1700 }, { "epoch": 1.0795805058605799, "grad_norm": 0.7417210936546326, "learning_rate": 0.0001809491430190276, "loss": 0.1873, "step": 1750 }, { "epoch": 1.1104256631708822, "grad_norm": 0.859341025352478, "learning_rate": 0.00017851294005564254, "loss": 0.1717, "step": 1800 }, { "epoch": 1.1412708204811843, "grad_norm": 0.6794934272766113, "learning_rate": 0.0001759487333169642, "loss": 0.1732, "step": 1850 }, { "epoch": 1.1721159777914867, "grad_norm": 0.5129622220993042, "learning_rate": 0.00017326070336409427, "loss": 0.172, "step": 1900 }, { "epoch": 1.202961135101789, "grad_norm": 0.5867941379547119, "learning_rate": 0.00017045323263363272, "loss": 0.1724, "step": 1950 }, { "epoch": 1.2338062924120914, "grad_norm": 0.887195348739624, "learning_rate": 0.0001675308982927608, "loss": 0.1655, "step": 2000 }, { "epoch": 1.2646514497223935, "grad_norm": 0.4721340835094452, "learning_rate": 0.0001644984647768447, "loss": 0.1642, "step": 2050 }, { "epoch": 1.2954966070326959, "grad_norm": 0.551078200340271, "learning_rate": 0.00016136087602172582, "loss": 0.1678, "step": 2100 }, { "epoch": 1.3263417643429982, "grad_norm": 0.97613525390625, "learning_rate": 0.00015812324740336248, "loss": 0.159, "step": 2150 }, { "epoch": 1.3571869216533003, "grad_norm": 0.7279055714607239, "learning_rate": 0.00015479085739796328, "loss": 0.1612, "step": 2200 }, { "epoch": 1.3880320789636027, "grad_norm": 0.9861264228820801, "learning_rate": 0.0001513691389762097, "loss": 0.1578, "step": 2250 }, { "epoch": 1.418877236273905, "grad_norm": 0.7998865246772766, "learning_rate": 0.00014786367074559828, "loss": 0.1569, "step": 2300 }, { "epoch": 1.4497223935842074, "grad_norm": 0.851041853427887, "learning_rate": 0.0001442801678553436, "loss": 0.1486, "step": 2350 }, { "epoch": 1.4805675508945095, "grad_norm": 0.42780816555023193, "learning_rate": 0.00014062447267866986, "loss": 0.1486, "step": 2400 }, { "epoch": 1.5114127082048119, "grad_norm": 0.7399270534515381, "learning_rate": 0.00013690254528768225, "loss": 0.1364, "step": 2450 }, { "epoch": 1.542257865515114, "grad_norm": 1.0372874736785889, "learning_rate": 0.0001331204537363485, "loss": 0.1393, "step": 2500 }, { "epoch": 1.5731030228254164, "grad_norm": 1.0774208307266235, 
"learning_rate": 0.00012928436416743098, "loss": 0.1468, "step": 2550 }, { "epoch": 1.6039481801357187, "grad_norm": 0.849371075630188, "learning_rate": 0.00012540053075949987, "loss": 0.1357, "step": 2600 }, { "epoch": 1.634793337446021, "grad_norm": 0.9047374725341797, "learning_rate": 0.00012147528553041718, "loss": 0.1292, "step": 2650 }, { "epoch": 1.6656384947563234, "grad_norm": 0.46791982650756836, "learning_rate": 0.00011751502801391479, "loss": 0.1308, "step": 2700 }, { "epoch": 1.6964836520666255, "grad_norm": 0.8372901082038879, "learning_rate": 0.00011352621482609807, "loss": 0.1401, "step": 2750 }, { "epoch": 1.7273288093769277, "grad_norm": 0.7648535966873169, "learning_rate": 0.00010951534913888515, "loss": 0.1245, "step": 2800 }, { "epoch": 1.75817396668723, "grad_norm": 0.4626932442188263, "learning_rate": 0.00010548897007754374, "loss": 0.1288, "step": 2850 }, { "epoch": 1.7890191239975324, "grad_norm": 0.5342707633972168, "learning_rate": 0.00010145364205961125, "loss": 0.1237, "step": 2900 }, { "epoch": 1.8198642813078347, "grad_norm": 0.7241202592849731, "learning_rate": 9.74159440925796e-05, "loss": 0.1168, "step": 2950 }, { "epoch": 1.850709438618137, "grad_norm": 0.48138442635536194, "learning_rate": 9.338245904779345e-05, "loss": 0.1149, "step": 3000 }, { "epoch": 1.8815545959284392, "grad_norm": 0.5214439630508423, "learning_rate": 8.93597629280487e-05, "loss": 0.1175, "step": 3050 }, { "epoch": 1.9123997532387416, "grad_norm": 0.5231944918632507, "learning_rate": 8.535441414638937e-05, "loss": 0.1168, "step": 3100 }, { "epoch": 1.9432449105490437, "grad_norm": 0.7800565958023071, "learning_rate": 8.13729428335819e-05, "loss": 0.1081, "step": 3150 }, { "epoch": 1.974090067859346, "grad_norm": 0.592022716999054, "learning_rate": 7.742184019169945e-05, "loss": 0.115, "step": 3200 }, { "epoch": 2.0049352251696484, "grad_norm": 0.4546678960323334, "learning_rate": 7.350754791117384e-05, "loss": 0.1002, "step": 3250 }, { "epoch": 2.0357803824799507, "grad_norm": 0.4683123826980591, "learning_rate": 6.963644766856894e-05, "loss": 0.0696, "step": 3300 }, { "epoch": 2.066625539790253, "grad_norm": 0.4901474416255951, "learning_rate": 6.581485072219755e-05, "loss": 0.0696, "step": 3350 }, { "epoch": 2.097470697100555, "grad_norm": 0.7581807971000671, "learning_rate": 6.204898762254524e-05, "loss": 0.0705, "step": 3400 }, { "epoch": 2.1283158544108574, "grad_norm": 0.5819096565246582, "learning_rate": 5.8344998054276115e-05, "loss": 0.0695, "step": 3450 }, { "epoch": 2.1591610117211597, "grad_norm": 0.7029954791069031, "learning_rate": 5.4708920826382035e-05, "loss": 0.0683, "step": 3500 }, { "epoch": 2.190006169031462, "grad_norm": 0.6168348789215088, "learning_rate": 5.114668402679472e-05, "loss": 0.0675, "step": 3550 }, { "epoch": 2.2208513263417644, "grad_norm": 0.5696262717247009, "learning_rate": 4.766409535751225e-05, "loss": 0.0678, "step": 3600 }, { "epoch": 2.2516964836520668, "grad_norm": 0.7154285907745361, "learning_rate": 4.426683266599702e-05, "loss": 0.0655, "step": 3650 }, { "epoch": 2.2825416409623687, "grad_norm": 0.6194272041320801, "learning_rate": 4.0960434688282515e-05, "loss": 0.0623, "step": 3700 }, { "epoch": 2.313386798272671, "grad_norm": 0.4507332444190979, "learning_rate": 3.775029201888051e-05, "loss": 0.0645, "step": 3750 }, { "epoch": 2.3442319555829734, "grad_norm": 0.42285481095314026, "learning_rate": 3.4641638322211456e-05, "loss": 0.0623, "step": 3800 }, { "epoch": 2.3750771128932757, "grad_norm": 0.4432896077632904, "learning_rate": 
3.1639541799886083e-05, "loss": 0.0625, "step": 3850 }, { "epoch": 2.405922270203578, "grad_norm": 0.4708622097969055, "learning_rate": 2.874889692774978e-05, "loss": 0.063, "step": 3900 }, { "epoch": 2.4367674275138804, "grad_norm": 0.30538269877433777, "learning_rate": 2.5974416476161167e-05, "loss": 0.06, "step": 3950 }, { "epoch": 2.4676125848241828, "grad_norm": 0.7072311639785767, "learning_rate": 2.3320623826514897e-05, "loss": 0.0623, "step": 4000 } ], "logging_steps": 50, "max_steps": 4863, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.8423096729862144e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }
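The state above follows the layout of a Hugging Face Trainer checkpoint state, where each `log_history` entry records `epoch`, `grad_norm`, `learning_rate`, `loss`, and `step`. As a minimal sketch of how such a file can be consumed, the snippet below loads it and plots the loss and learning-rate schedule against the global step; the filename `trainer_state.json`, the output name `training_curve.png`, and the use of matplotlib are assumptions for illustration, not part of the original run.

```python
# Hypothetical post-hoc analysis of the trainer state shown above.
# Assumes the JSON is saved as "trainer_state.json" (the usual Trainer filename)
# and that matplotlib is available; adjust paths as needed.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only entries that carry a training loss (eval entries, if any, lack it).
logs = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]
lrs = [entry["learning_rate"] for entry in logs]

fig, loss_ax = plt.subplots()
loss_ax.plot(steps, losses, label="training loss")
loss_ax.set_xlabel("global step")
loss_ax.set_ylabel("loss")

# Second y-axis for the learning-rate schedule (warmup followed by decay).
lr_ax = loss_ax.twinx()
lr_ax.plot(steps, lrs, color="tab:orange", label="learning rate")
lr_ax.set_ylabel("learning rate")

fig.legend(loc="upper right")
loss_ax.set_title(f"Run at step {state['global_step']} of {state['max_steps']}")
fig.savefig("training_curve.png", dpi=150)
```

Run against this state, the plot would show the loss falling from about 1.8 at step 50 to roughly 0.06 by step 4000, with the learning rate warming up to about 2e-4 near step 1000 and decaying afterwards.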