|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 204.0, |
|
"learning_rate": 2e-05, |
|
"loss": 35.5858, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 83.0, |
|
"learning_rate": 0.0001, |
|
"loss": 33.1551, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 19.0, |
|
"learning_rate": 0.0002, |
|
"loss": 21.1609, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 12.363748550415039, |
|
"eval_runtime": 0.2492, |
|
"eval_samples_per_second": 40.136, |
|
"eval_steps_per_second": 4.014, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 0.00019848077530122083, |
|
"loss": 16.2359, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 0.00019396926207859084, |
|
"loss": 14.2403, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 7.762616157531738, |
|
"eval_runtime": 0.234, |
|
"eval_samples_per_second": 42.735, |
|
"eval_steps_per_second": 4.274, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 13.444, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 0.0001766044443118978, |
|
"loss": 12.4548, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 6.874077796936035, |
|
"eval_runtime": 0.2344, |
|
"eval_samples_per_second": 42.661, |
|
"eval_steps_per_second": 4.266, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 11.875, |
|
"learning_rate": 0.00016427876096865394, |
|
"loss": 10.8198, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 17.0, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 8.6478, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 6.141203880310059, |
|
"eval_runtime": 0.2347, |
|
"eval_samples_per_second": 42.609, |
|
"eval_steps_per_second": 4.261, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 18.0, |
|
"learning_rate": 0.00013420201433256689, |
|
"loss": 5.648, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 3.1923, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 4.4401326179504395, |
|
"eval_runtime": 0.236, |
|
"eval_samples_per_second": 42.379, |
|
"eval_steps_per_second": 4.238, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 0.0001, |
|
"loss": 2.2412, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 8.263518223330697e-05, |
|
"loss": 1.9614, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 3.3291573524475098, |
|
"eval_runtime": 0.2367, |
|
"eval_samples_per_second": 42.255, |
|
"eval_steps_per_second": 4.225, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 6.579798566743314e-05, |
|
"loss": 1.7794, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 1.692, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 3.127150297164917, |
|
"eval_runtime": 0.2355, |
|
"eval_samples_per_second": 42.466, |
|
"eval_steps_per_second": 4.247, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.625, |
|
"learning_rate": 3.5721239031346066e-05, |
|
"loss": 1.6184, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 2.339555568810221e-05, |
|
"loss": 1.5661, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 3.0726068019866943, |
|
"eval_runtime": 0.2359, |
|
"eval_samples_per_second": 42.395, |
|
"eval_steps_per_second": 4.24, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 1.339745962155613e-05, |
|
"loss": 1.5358, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 6.030737921409169e-06, |
|
"loss": 1.5417, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 3.0536386966705322, |
|
"eval_runtime": 0.2358, |
|
"eval_samples_per_second": 42.414, |
|
"eval_steps_per_second": 4.241, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 1.5192246987791981e-06, |
|
"loss": 1.5267, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0, |
|
"loss": 1.5287, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 3.052680015563965, |
|
"eval_runtime": 0.234, |
|
"eval_samples_per_second": 42.735, |
|
"eval_steps_per_second": 4.274, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 100, |
|
"total_flos": 3.0491778760677786e+17, |
|
"train_loss": 7.823821220397949, |
|
"train_runtime": 245.6292, |
|
"train_samples_per_second": 25.038, |
|
"train_steps_per_second": 0.407 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"total_flos": 3.0491778760677786e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|