{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.046207497820401,
  "eval_steps": 500,
  "global_step": 1200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08718395815170009,
      "grad_norm": 492.52783203125,
      "learning_rate": 5.833333333333333e-06,
      "loss": 21.8384,
      "step": 100
    },
    {
      "epoch": 0.17436791630340018,
      "grad_norm": 143.71047973632812,
      "learning_rate": 6.481481481481481e-06,
      "loss": 19.3308,
      "step": 200
    },
    {
      "epoch": 0.26155187445510025,
      "grad_norm": 805.2228393554688,
      "learning_rate": 5.833333333333333e-06,
      "loss": 21.3734,
      "step": 300
    },
    {
      "epoch": 0.34873583260680036,
      "grad_norm": 405.0468444824219,
      "learning_rate": 5.185185185185185e-06,
      "loss": 19.1667,
      "step": 400
    },
    {
      "epoch": 0.43591979075850046,
      "grad_norm": 632.6600952148438,
      "learning_rate": 4.537037037037037e-06,
      "loss": 17.4984,
      "step": 500
    },
    {
      "epoch": 0.5231037489102005,
      "grad_norm": 890.8648071289062,
      "learning_rate": 3.888888888888889e-06,
      "loss": 16.9422,
      "step": 600
    },
    {
      "epoch": 0.6102877070619006,
      "grad_norm": 243.8746337890625,
      "learning_rate": 3.2407407407407406e-06,
      "loss": 19.9537,
      "step": 700
    },
    {
      "epoch": 0.6974716652136007,
      "grad_norm": 408.9120178222656,
      "learning_rate": 2.5925925925925925e-06,
      "loss": 18.6301,
      "step": 800
    },
    {
      "epoch": 0.7846556233653008,
      "grad_norm": 171.10406494140625,
      "learning_rate": 1.9444444444444444e-06,
      "loss": 15.2552,
      "step": 900
    },
    {
      "epoch": 0.8718395815170009,
      "grad_norm": 236.40130615234375,
      "learning_rate": 1.2962962962962962e-06,
      "loss": 14.5902,
      "step": 1000
    },
    {
      "epoch": 0.9590235396687009,
      "grad_norm": 386.51702880859375,
      "learning_rate": 6.481481481481481e-07,
      "loss": 12.5311,
      "step": 1100
    },
    {
      "epoch": 1.0,
      "eval_loss": 508.4683532714844,
      "eval_runtime": 15.2151,
      "eval_samples_per_second": 67.038,
      "eval_steps_per_second": 8.413,
      "step": 1147
    },
    {
      "epoch": 1.046207497820401,
      "grad_norm": 72.9695053100586,
      "learning_rate": 0.0,
      "loss": 13.603,
      "step": 1200
    }
  ],
  "logging_steps": 100,
  "max_steps": 1200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 300,
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}