{
  "best_metric": 0.3561370372772217,
  "best_model_checkpoint": "results/checkpoint-8500",
  "epoch": 0.8119209093514185,
  "eval_steps": 500,
  "global_step": 8500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04776005349125991,
      "grad_norm": 6.31744909286499,
      "learning_rate": 4.8805998662718505e-05,
      "loss": 0.4525,
      "step": 500
    },
    {
      "epoch": 0.04776005349125991,
      "eval_loss": 0.38868293166160583,
      "eval_runtime": 105.2805,
      "eval_samples_per_second": 248.393,
      "eval_steps_per_second": 3.885,
      "step": 500
    },
    {
      "epoch": 0.09552010698251982,
      "grad_norm": 8.537174224853516,
      "learning_rate": 4.761199732543701e-05,
      "loss": 0.4516,
      "step": 1000
    },
    {
      "epoch": 0.09552010698251982,
      "eval_loss": 0.3857233226299286,
      "eval_runtime": 110.5179,
      "eval_samples_per_second": 236.622,
      "eval_steps_per_second": 3.701,
      "step": 1000
    },
    {
      "epoch": 0.14328016047377973,
      "grad_norm": 3.55123233795166,
      "learning_rate": 4.641799598815551e-05,
      "loss": 0.4412,
      "step": 1500
    },
    {
      "epoch": 0.14328016047377973,
      "eval_loss": 0.3843745291233063,
      "eval_runtime": 108.5384,
      "eval_samples_per_second": 240.938,
      "eval_steps_per_second": 3.768,
      "step": 1500
    },
    {
      "epoch": 0.19104021396503965,
      "grad_norm": 4.456860065460205,
      "learning_rate": 4.522399465087401e-05,
      "loss": 0.4411,
      "step": 2000
    },
    {
      "epoch": 0.19104021396503965,
      "eval_loss": 0.3724469542503357,
      "eval_runtime": 104.9479,
      "eval_samples_per_second": 249.181,
      "eval_steps_per_second": 3.897,
      "step": 2000
    },
    {
      "epoch": 0.23880026745629956,
      "grad_norm": 8.174205780029297,
      "learning_rate": 4.402999331359251e-05,
      "loss": 0.4269,
      "step": 2500
    },
    {
      "epoch": 0.23880026745629956,
      "eval_loss": 0.3822018504142761,
      "eval_runtime": 109.3926,
      "eval_samples_per_second": 239.056,
      "eval_steps_per_second": 3.739,
      "step": 2500
    },
    {
      "epoch": 0.28656032094755945,
      "grad_norm": 8.617950439453125,
      "learning_rate": 4.2835991976311015e-05,
      "loss": 0.4417,
      "step": 3000
    },
    {
      "epoch": 0.28656032094755945,
      "eval_loss": 0.37791815400123596,
      "eval_runtime": 110.9874,
      "eval_samples_per_second": 235.621,
      "eval_steps_per_second": 3.685,
      "step": 3000
    },
    {
      "epoch": 0.33432037443881935,
      "grad_norm": 4.857789993286133,
      "learning_rate": 4.164199063902952e-05,
      "loss": 0.4324,
      "step": 3500
    },
    {
      "epoch": 0.33432037443881935,
      "eval_loss": 0.37730905413627625,
      "eval_runtime": 110.4008,
      "eval_samples_per_second": 236.873,
      "eval_steps_per_second": 3.705,
      "step": 3500
    },
    {
      "epoch": 0.3820804279300793,
      "grad_norm": 4.581517219543457,
      "learning_rate": 4.044798930174802e-05,
      "loss": 0.4184,
      "step": 4000
    },
    {
      "epoch": 0.3820804279300793,
      "eval_loss": 0.3715578615665436,
      "eval_runtime": 111.0466,
      "eval_samples_per_second": 235.496,
      "eval_steps_per_second": 3.683,
      "step": 4000
    },
    {
      "epoch": 0.4298404814213392,
      "grad_norm": 7.005139350891113,
      "learning_rate": 3.925398796446652e-05,
      "loss": 0.4135,
      "step": 4500
    },
    {
      "epoch": 0.4298404814213392,
      "eval_loss": 0.3852500319480896,
      "eval_runtime": 110.8005,
      "eval_samples_per_second": 236.019,
      "eval_steps_per_second": 3.691,
      "step": 4500
    },
    {
      "epoch": 0.47760053491259913,
      "grad_norm": 7.208944797515869,
      "learning_rate": 3.8059986627185024e-05,
      "loss": 0.4172,
      "step": 5000
    },
    {
      "epoch": 0.47760053491259913,
      "eval_loss": 0.3664040267467499,
      "eval_runtime": 110.6088,
      "eval_samples_per_second": 236.428,
      "eval_steps_per_second": 3.698,
      "step": 5000
    },
    {
      "epoch": 0.525360588403859,
      "grad_norm": 4.011592864990234,
      "learning_rate": 3.6865985289903526e-05,
      "loss": 0.4164,
      "step": 5500
    },
    {
      "epoch": 0.525360588403859,
      "eval_loss": 0.36664679646492004,
      "eval_runtime": 108.0591,
      "eval_samples_per_second": 242.006,
      "eval_steps_per_second": 3.785,
      "step": 5500
    },
    {
      "epoch": 0.5731206418951189,
      "grad_norm": 3.7559821605682373,
      "learning_rate": 3.567198395262203e-05,
      "loss": 0.4124,
      "step": 6000
    },
    {
      "epoch": 0.5731206418951189,
      "eval_loss": 0.36337631940841675,
      "eval_runtime": 105.397,
      "eval_samples_per_second": 248.119,
      "eval_steps_per_second": 3.881,
      "step": 6000
    },
    {
      "epoch": 0.6208806953863788,
      "grad_norm": 4.904799938201904,
      "learning_rate": 3.447798261534053e-05,
      "loss": 0.4133,
      "step": 6500
    },
    {
      "epoch": 0.6208806953863788,
      "eval_loss": 0.3614007234573364,
      "eval_runtime": 110.9753,
      "eval_samples_per_second": 235.647,
      "eval_steps_per_second": 3.686,
      "step": 6500
    },
    {
      "epoch": 0.6686407488776387,
      "grad_norm": 8.097270011901855,
      "learning_rate": 3.328398127805903e-05,
      "loss": 0.4132,
      "step": 7000
    },
    {
      "epoch": 0.6686407488776387,
      "eval_loss": 0.360387921333313,
      "eval_runtime": 110.5513,
      "eval_samples_per_second": 236.551,
      "eval_steps_per_second": 3.7,
      "step": 7000
    },
    {
      "epoch": 0.7164008023688987,
      "grad_norm": 5.6349968910217285,
      "learning_rate": 3.2089979940777535e-05,
      "loss": 0.3994,
      "step": 7500
    },
    {
      "epoch": 0.7164008023688987,
      "eval_loss": 0.3582770824432373,
      "eval_runtime": 109.3558,
      "eval_samples_per_second": 239.137,
      "eval_steps_per_second": 3.74,
      "step": 7500
    },
    {
      "epoch": 0.7641608558601586,
      "grad_norm": 5.345180988311768,
      "learning_rate": 3.089597860349604e-05,
      "loss": 0.4032,
      "step": 8000
    },
    {
      "epoch": 0.7641608558601586,
      "eval_loss": 0.3689400851726532,
      "eval_runtime": 110.4919,
      "eval_samples_per_second": 236.678,
      "eval_steps_per_second": 3.702,
      "step": 8000
    },
    {
      "epoch": 0.8119209093514185,
      "grad_norm": 3.314952850341797,
      "learning_rate": 2.9701977266214542e-05,
      "loss": 0.4101,
      "step": 8500
    },
    {
      "epoch": 0.8119209093514185,
      "eval_loss": 0.3561370372772217,
      "eval_runtime": 112.2203,
      "eval_samples_per_second": 233.033,
      "eval_steps_per_second": 3.645,
      "step": 8500
    }
  ],
  "logging_steps": 500,
  "max_steps": 20938,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.43163257192448e+17,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}