{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 22079,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.022645953168168847,
      "grad_norm": 6.0433220863342285,
      "learning_rate": 1.9547080936636626e-05,
      "loss": 1.9021,
      "step": 500
    },
    {
      "epoch": 0.045291906336337694,
      "grad_norm": 5.967625617980957,
      "learning_rate": 1.9094161873273246e-05,
      "loss": 1.312,
      "step": 1000
    },
    {
      "epoch": 0.06793785950450655,
      "grad_norm": 6.04085636138916,
      "learning_rate": 1.864124280990987e-05,
      "loss": 1.079,
      "step": 1500
    },
    {
      "epoch": 0.09058381267267539,
      "grad_norm": 4.7192912101745605,
      "learning_rate": 1.8188323746546494e-05,
      "loss": 0.919,
      "step": 2000
    },
    {
      "epoch": 0.11322976584084424,
      "grad_norm": 5.109660625457764,
      "learning_rate": 1.7735404683183118e-05,
      "loss": 0.8596,
      "step": 2500
    },
    {
      "epoch": 0.1358757190090131,
      "grad_norm": 4.687028884887695,
      "learning_rate": 1.728248561981974e-05,
      "loss": 0.7987,
      "step": 3000
    },
    {
      "epoch": 0.15852167217718194,
      "grad_norm": 3.7766010761260986,
      "learning_rate": 1.6829566556456363e-05,
      "loss": 0.7281,
      "step": 3500
    },
    {
      "epoch": 0.18116762534535077,
      "grad_norm": 4.304531574249268,
      "learning_rate": 1.6376647493092987e-05,
      "loss": 0.6821,
      "step": 4000
    },
    {
      "epoch": 0.20381357851351964,
      "grad_norm": 5.7325968742370605,
      "learning_rate": 1.5923728429729607e-05,
      "loss": 0.6425,
      "step": 4500
    },
    {
      "epoch": 0.22645953168168848,
      "grad_norm": 5.178163051605225,
      "learning_rate": 1.547080936636623e-05,
      "loss": 0.6295,
      "step": 5000
    },
    {
      "epoch": 0.24910548484985734,
      "grad_norm": 3.056478500366211,
      "learning_rate": 1.5017890303002855e-05,
      "loss": 0.6005,
      "step": 5500
    },
    {
      "epoch": 0.2717514380180262,
      "grad_norm": 4.0474467277526855,
      "learning_rate": 1.4564971239639478e-05,
      "loss": 0.5868,
      "step": 6000
    },
    {
      "epoch": 0.294397391186195,
      "grad_norm": 3.2683613300323486,
      "learning_rate": 1.4112052176276102e-05,
      "loss": 0.5647,
      "step": 6500
    },
    {
      "epoch": 0.3170433443543639,
      "grad_norm": 3.586979627609253,
      "learning_rate": 1.3659133112912724e-05,
      "loss": 0.5415,
      "step": 7000
    },
    {
      "epoch": 0.33968929752253274,
      "grad_norm": 4.297875881195068,
      "learning_rate": 1.3206214049549346e-05,
      "loss": 0.5294,
      "step": 7500
    },
    {
      "epoch": 0.36233525069070155,
      "grad_norm": 3.461622714996338,
      "learning_rate": 1.2753294986185968e-05,
      "loss": 0.5211,
      "step": 8000
    },
    {
      "epoch": 0.3849812038588704,
      "grad_norm": 3.42155385017395,
      "learning_rate": 1.2300375922822592e-05,
      "loss": 0.505,
      "step": 8500
    },
    {
      "epoch": 0.4076271570270393,
      "grad_norm": 3.385619878768921,
      "learning_rate": 1.1847456859459216e-05,
      "loss": 0.4997,
      "step": 9000
    },
    {
      "epoch": 0.43027311019520814,
      "grad_norm": 3.422764778137207,
      "learning_rate": 1.1394537796095839e-05,
      "loss": 0.484,
      "step": 9500
    },
    {
      "epoch": 0.45291906336337695,
      "grad_norm": 2.883808135986328,
      "learning_rate": 1.0941618732732463e-05,
      "loss": 0.4775,
      "step": 10000
    },
    {
      "epoch": 0.4755650165315458,
      "grad_norm": 3.6908833980560303,
      "learning_rate": 1.0488699669369085e-05,
      "loss": 0.4739,
      "step": 10500
    },
    {
      "epoch": 0.4982109696997147,
      "grad_norm": 3.540358066558838,
      "learning_rate": 1.0035780606005707e-05,
      "loss": 0.4584,
      "step": 11000
    },
    {
      "epoch": 0.5208569228678835,
      "grad_norm": 3.336552858352661,
      "learning_rate": 9.582861542642331e-06,
      "loss": 0.4445,
      "step": 11500
    },
    {
      "epoch": 0.5435028760360524,
      "grad_norm": 2.803347110748291,
      "learning_rate": 9.129942479278953e-06,
      "loss": 0.4435,
      "step": 12000
    },
    {
      "epoch": 0.5661488292042212,
      "grad_norm": 3.0676352977752686,
      "learning_rate": 8.677023415915576e-06,
      "loss": 0.4312,
      "step": 12500
    },
    {
      "epoch": 0.58879478237239,
      "grad_norm": 3.1657562255859375,
      "learning_rate": 8.2241043525522e-06,
      "loss": 0.4349,
      "step": 13000
    },
    {
      "epoch": 0.6114407355405589,
      "grad_norm": 3.551663398742676,
      "learning_rate": 7.771185289188824e-06,
      "loss": 0.4255,
      "step": 13500
    },
    {
      "epoch": 0.6340866887087278,
      "grad_norm": 2.3778510093688965,
      "learning_rate": 7.318266225825446e-06,
      "loss": 0.4148,
      "step": 14000
    },
    {
      "epoch": 0.6567326418768966,
      "grad_norm": 3.684070587158203,
      "learning_rate": 6.865347162462068e-06,
      "loss": 0.4151,
      "step": 14500
    },
    {
      "epoch": 0.6793785950450655,
      "grad_norm": 3.859379529953003,
      "learning_rate": 6.412428099098692e-06,
      "loss": 0.4113,
      "step": 15000
    },
    {
      "epoch": 0.7020245482132343,
      "grad_norm": 3.526158094406128,
      "learning_rate": 5.9595090357353145e-06,
      "loss": 0.4124,
      "step": 15500
    },
    {
      "epoch": 0.7246705013814031,
      "grad_norm": 3.6779990196228027,
      "learning_rate": 5.506589972371938e-06,
      "loss": 0.4032,
      "step": 16000
    },
    {
      "epoch": 0.747316454549572,
      "grad_norm": 6.240302085876465,
      "learning_rate": 5.05367090900856e-06,
      "loss": 0.4019,
      "step": 16500
    },
    {
      "epoch": 0.7699624077177408,
      "grad_norm": 3.5423877239227295,
      "learning_rate": 4.600751845645184e-06,
      "loss": 0.3845,
      "step": 17000
    },
    {
      "epoch": 0.7926083608859097,
      "grad_norm": 3.668982982635498,
      "learning_rate": 4.147832782281806e-06,
      "loss": 0.3948,
      "step": 17500
    },
    {
      "epoch": 0.8152543140540786,
      "grad_norm": 4.242081642150879,
      "learning_rate": 3.6949137189184298e-06,
      "loss": 0.3826,
      "step": 18000
    },
    {
      "epoch": 0.8379002672222474,
      "grad_norm": 3.223057270050049,
      "learning_rate": 3.2419946555550525e-06,
      "loss": 0.3867,
      "step": 18500
    },
    {
      "epoch": 0.8605462203904163,
      "grad_norm": 2.9004533290863037,
      "learning_rate": 2.7890755921916756e-06,
      "loss": 0.3894,
      "step": 19000
    },
    {
      "epoch": 0.883192173558585,
      "grad_norm": 3.2891807556152344,
      "learning_rate": 2.3361565288282987e-06,
      "loss": 0.3793,
      "step": 19500
    },
    {
      "epoch": 0.9058381267267539,
      "grad_norm": 2.6867599487304688,
      "learning_rate": 1.8832374654649217e-06,
      "loss": 0.3732,
      "step": 20000
    },
    {
      "epoch": 0.9284840798949228,
      "grad_norm": 2.811148166656494,
      "learning_rate": 1.4303184021015446e-06,
      "loss": 0.3794,
      "step": 20500
    },
    {
      "epoch": 0.9511300330630916,
      "grad_norm": 3.6918985843658447,
      "learning_rate": 9.773993387381675e-07,
      "loss": 0.3767,
      "step": 21000
    },
    {
      "epoch": 0.9737759862312605,
      "grad_norm": 2.762467622756958,
      "learning_rate": 5.244802753747906e-07,
      "loss": 0.3741,
      "step": 21500
    },
    {
      "epoch": 0.9964219393994294,
      "grad_norm": 2.7618136405944824,
      "learning_rate": 7.156121201141356e-08,
      "loss": 0.3751,
      "step": 22000
    }
  ],
  "logging_steps": 500,
  "max_steps": 22079,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.147503784984576e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}