|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.850709438618137, |
|
"eval_steps": 500, |
|
"global_step": 3000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.030845157310302282, |
|
"grad_norm": 2.8206074237823486, |
|
"learning_rate": 1.0277492291880782e-05, |
|
"loss": 1.8082, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.061690314620604564, |
|
"grad_norm": 3.4183013439178467, |
|
"learning_rate": 2.0554984583761563e-05, |
|
"loss": 0.6538, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.09253547193090685, |
|
"grad_norm": 2.170591354370117, |
|
"learning_rate": 3.083247687564235e-05, |
|
"loss": 0.4563, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.12338062924120913, |
|
"grad_norm": 1.4687080383300781, |
|
"learning_rate": 4.110996916752313e-05, |
|
"loss": 0.4263, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.15422578655151142, |
|
"grad_norm": 1.836676836013794, |
|
"learning_rate": 5.1387461459403907e-05, |
|
"loss": 0.3994, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1850709438618137, |
|
"grad_norm": 1.2718663215637207, |
|
"learning_rate": 6.16649537512847e-05, |
|
"loss": 0.3665, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.215916101172116, |
|
"grad_norm": 1.6945191621780396, |
|
"learning_rate": 7.194244604316547e-05, |
|
"loss": 0.3577, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.24676125848241826, |
|
"grad_norm": 1.2829898595809937, |
|
"learning_rate": 8.221993833504625e-05, |
|
"loss": 0.347, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.27760641579272055, |
|
"grad_norm": 1.01521635055542, |
|
"learning_rate": 9.249743062692704e-05, |
|
"loss": 0.3288, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.30845157310302285, |
|
"grad_norm": 1.522111415863037, |
|
"learning_rate": 0.00010277492291880781, |
|
"loss": 0.3267, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3392967304133251, |
|
"grad_norm": 0.9678927659988403, |
|
"learning_rate": 0.00011305241521068859, |
|
"loss": 0.3198, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.3701418877236274, |
|
"grad_norm": 1.2144405841827393, |
|
"learning_rate": 0.0001233299075025694, |
|
"loss": 0.3099, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.4009870450339297, |
|
"grad_norm": 1.3122639656066895, |
|
"learning_rate": 0.00013360739979445017, |
|
"loss": 0.2929, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.431832202344232, |
|
"grad_norm": 1.0934101343154907, |
|
"learning_rate": 0.00014388489208633093, |
|
"loss": 0.3003, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.4626773596545342, |
|
"grad_norm": 0.7938969731330872, |
|
"learning_rate": 0.00015416238437821172, |
|
"loss": 0.2956, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.4935225169648365, |
|
"grad_norm": 0.6571168303489685, |
|
"learning_rate": 0.0001644398766700925, |
|
"loss": 0.2736, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5243676742751388, |
|
"grad_norm": 1.0073938369750977, |
|
"learning_rate": 0.0001747173689619733, |
|
"loss": 0.2892, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.5552128315854411, |
|
"grad_norm": 0.9874083399772644, |
|
"learning_rate": 0.00018499486125385408, |
|
"loss": 0.2723, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.5860579888957433, |
|
"grad_norm": 1.1770968437194824, |
|
"learning_rate": 0.00019527235354573487, |
|
"loss": 0.2855, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.6169031462060457, |
|
"grad_norm": 1.00326669216156, |
|
"learning_rate": 0.00019997622717095418, |
|
"loss": 0.2587, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6477483035163479, |
|
"grad_norm": 1.0380828380584717, |
|
"learning_rate": 0.0001998067088192682, |
|
"loss": 0.2764, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.6785934608266502, |
|
"grad_norm": 1.430301547050476, |
|
"learning_rate": 0.00019947447034120033, |
|
"loss": 0.2565, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.7094386181369525, |
|
"grad_norm": 1.0970648527145386, |
|
"learning_rate": 0.00019898005340261433, |
|
"loss": 0.2685, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.7402837754472548, |
|
"grad_norm": 0.9110261797904968, |
|
"learning_rate": 0.0001983242640774473, |
|
"loss": 0.2489, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.7711289327575571, |
|
"grad_norm": 1.0265332460403442, |
|
"learning_rate": 0.00019750817153352506, |
|
"loss": 0.2425, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.8019740900678594, |
|
"grad_norm": 0.8820884823799133, |
|
"learning_rate": 0.00019653310628944164, |
|
"loss": 0.2402, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.8328192473781616, |
|
"grad_norm": 0.933083713054657, |
|
"learning_rate": 0.00019540065804534467, |
|
"loss": 0.2444, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.863664404688464, |
|
"grad_norm": 0.8229042887687683, |
|
"learning_rate": 0.00019411267309116375, |
|
"loss": 0.2172, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.8945095619987662, |
|
"grad_norm": 0.9641085267066956, |
|
"learning_rate": 0.00019267125129650688, |
|
"loss": 0.2501, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.9253547193090684, |
|
"grad_norm": 0.6993410587310791, |
|
"learning_rate": 0.00019107874268713254, |
|
"loss": 0.2246, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.9561998766193708, |
|
"grad_norm": 1.0574674606323242, |
|
"learning_rate": 0.00018933774361357917, |
|
"loss": 0.2265, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.987045033929673, |
|
"grad_norm": 1.0701500177383423, |
|
"learning_rate": 0.0001874510925181983, |
|
"loss": 0.2116, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.0178901912399754, |
|
"grad_norm": 0.44411325454711914, |
|
"learning_rate": 0.0001854218653074927, |
|
"loss": 0.1978, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.0487353485502775, |
|
"grad_norm": 0.8408161401748657, |
|
"learning_rate": 0.0001832533703373043, |
|
"loss": 0.1701, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.0795805058605799, |
|
"grad_norm": 0.7417210936546326, |
|
"learning_rate": 0.0001809491430190276, |
|
"loss": 0.1873, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.1104256631708822, |
|
"grad_norm": 0.859341025352478, |
|
"learning_rate": 0.00017851294005564254, |
|
"loss": 0.1717, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.1412708204811843, |
|
"grad_norm": 0.6794934272766113, |
|
"learning_rate": 0.0001759487333169642, |
|
"loss": 0.1732, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.1721159777914867, |
|
"grad_norm": 0.5129622220993042, |
|
"learning_rate": 0.00017326070336409427, |
|
"loss": 0.172, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.202961135101789, |
|
"grad_norm": 0.5867941379547119, |
|
"learning_rate": 0.00017045323263363272, |
|
"loss": 0.1724, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.2338062924120914, |
|
"grad_norm": 0.887195348739624, |
|
"learning_rate": 0.0001675308982927608, |
|
"loss": 0.1655, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.2646514497223935, |
|
"grad_norm": 0.4721340835094452, |
|
"learning_rate": 0.0001644984647768447, |
|
"loss": 0.1642, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.2954966070326959, |
|
"grad_norm": 0.551078200340271, |
|
"learning_rate": 0.00016136087602172582, |
|
"loss": 0.1678, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.3263417643429982, |
|
"grad_norm": 0.97613525390625, |
|
"learning_rate": 0.00015812324740336248, |
|
"loss": 0.159, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.3571869216533003, |
|
"grad_norm": 0.7279055714607239, |
|
"learning_rate": 0.00015479085739796328, |
|
"loss": 0.1612, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.3880320789636027, |
|
"grad_norm": 0.9861264228820801, |
|
"learning_rate": 0.0001513691389762097, |
|
"loss": 0.1578, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.418877236273905, |
|
"grad_norm": 0.7998865246772766, |
|
"learning_rate": 0.00014786367074559828, |
|
"loss": 0.1569, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.4497223935842074, |
|
"grad_norm": 0.851041853427887, |
|
"learning_rate": 0.0001442801678553436, |
|
"loss": 0.1486, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.4805675508945095, |
|
"grad_norm": 0.42780816555023193, |
|
"learning_rate": 0.00014062447267866986, |
|
"loss": 0.1486, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.5114127082048119, |
|
"grad_norm": 0.7399270534515381, |
|
"learning_rate": 0.00013690254528768225, |
|
"loss": 0.1364, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.542257865515114, |
|
"grad_norm": 1.0372874736785889, |
|
"learning_rate": 0.0001331204537363485, |
|
"loss": 0.1393, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.5731030228254164, |
|
"grad_norm": 1.0774208307266235, |
|
"learning_rate": 0.00012928436416743098, |
|
"loss": 0.1468, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.6039481801357187, |
|
"grad_norm": 0.849371075630188, |
|
"learning_rate": 0.00012540053075949987, |
|
"loss": 0.1357, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.634793337446021, |
|
"grad_norm": 0.9047374725341797, |
|
"learning_rate": 0.00012147528553041718, |
|
"loss": 0.1292, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.6656384947563234, |
|
"grad_norm": 0.46791982650756836, |
|
"learning_rate": 0.00011751502801391479, |
|
"loss": 0.1308, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.6964836520666255, |
|
"grad_norm": 0.8372901082038879, |
|
"learning_rate": 0.00011352621482609807, |
|
"loss": 0.1401, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.7273288093769277, |
|
"grad_norm": 0.7648535966873169, |
|
"learning_rate": 0.00010951534913888515, |
|
"loss": 0.1245, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.75817396668723, |
|
"grad_norm": 0.4626932442188263, |
|
"learning_rate": 0.00010548897007754374, |
|
"loss": 0.1288, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.7890191239975324, |
|
"grad_norm": 0.5342707633972168, |
|
"learning_rate": 0.00010145364205961125, |
|
"loss": 0.1237, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.8198642813078347, |
|
"grad_norm": 0.7241202592849731, |
|
"learning_rate": 9.74159440925796e-05, |
|
"loss": 0.1168, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.850709438618137, |
|
"grad_norm": 0.48138442635536194, |
|
"learning_rate": 9.338245904779345e-05, |
|
"loss": 0.1149, |
|
"step": 3000 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 4863, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.1332533710946304e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|