|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 116.27799227799228, |
|
"global_step": 15000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 3.87, |
|
"learning_rate": 1e-05, |
|
"learning_rate_embeddings": 1e-05, |
|
"loss": 8.5314, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"learning_rate": 2e-05, |
|
"learning_rate_embeddings": 2e-05, |
|
"loss": 5.6865, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 11.63, |
|
"learning_rate": 3e-05, |
|
"learning_rate_embeddings": 3e-05, |
|
"loss": 4.756, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 15.5, |
|
"learning_rate": 4e-05, |
|
"learning_rate_embeddings": 4e-05, |
|
"loss": 4.3111, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 19.38, |
|
"learning_rate": 5e-05, |
|
"learning_rate_embeddings": 5e-05, |
|
"loss": 4.021, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 19.38, |
|
"eval_loss": 3.8212602138519287, |
|
"eval_runtime": 23.2233, |
|
"eval_samples_per_second": 315.33, |
|
"eval_steps_per_second": 2.497, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 23.25, |
|
"learning_rate": 6e-05, |
|
"learning_rate_embeddings": 6e-05, |
|
"loss": 3.829, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 27.13, |
|
"learning_rate": 7.000000000000001e-05, |
|
"learning_rate_embeddings": 7.000000000000001e-05, |
|
"loss": 3.6784, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 31.01, |
|
"learning_rate": 8e-05, |
|
"learning_rate_embeddings": 8e-05, |
|
"loss": 3.5455, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 34.88, |
|
"learning_rate": 8.999999999999999e-05, |
|
"learning_rate_embeddings": 8.999999999999999e-05, |
|
"loss": 3.4304, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 38.76, |
|
"learning_rate": 0.0001, |
|
"learning_rate_embeddings": 0.0001, |
|
"loss": 3.3348, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 38.76, |
|
"eval_loss": 3.238213300704956, |
|
"eval_runtime": 9.9786, |
|
"eval_samples_per_second": 733.872, |
|
"eval_steps_per_second": 5.812, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 42.63, |
|
"learning_rate": 0.00011, |
|
"learning_rate_embeddings": 0.00011, |
|
"loss": 3.2411, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 46.51, |
|
"learning_rate": 0.00012, |
|
"learning_rate_embeddings": 0.00012, |
|
"loss": 3.1542, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 50.39, |
|
"learning_rate": 0.00013000000000000002, |
|
"learning_rate_embeddings": 0.00013000000000000002, |
|
"loss": 3.0781, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 54.26, |
|
"learning_rate": 0.00014000000000000001, |
|
"learning_rate_embeddings": 0.00014000000000000001, |
|
"loss": 3.0012, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 58.14, |
|
"learning_rate": 0.00015, |
|
"learning_rate_embeddings": 0.00015, |
|
"loss": 2.9349, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 58.14, |
|
"eval_loss": 2.92873477935791, |
|
"eval_runtime": 9.9086, |
|
"eval_samples_per_second": 739.053, |
|
"eval_steps_per_second": 5.853, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 62.02, |
|
"learning_rate": 0.00016, |
|
"learning_rate_embeddings": 0.00016, |
|
"loss": 2.8707, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 65.89, |
|
"learning_rate": 0.00017, |
|
"learning_rate_embeddings": 0.00017, |
|
"loss": 2.8075, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 69.76, |
|
"learning_rate": 0.00017999999999999998, |
|
"learning_rate_embeddings": 0.00017999999999999998, |
|
"loss": 2.7504, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 73.64, |
|
"learning_rate": 0.00019, |
|
"learning_rate_embeddings": 0.00019, |
|
"loss": 2.695, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 77.52, |
|
"learning_rate": 0.0002, |
|
"learning_rate_embeddings": 0.0002, |
|
"loss": 2.6372, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 77.52, |
|
"eval_loss": 2.7580177783966064, |
|
"eval_runtime": 9.9239, |
|
"eval_samples_per_second": 737.915, |
|
"eval_steps_per_second": 5.844, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 81.39, |
|
"learning_rate": 0.00021, |
|
"learning_rate_embeddings": 0.00021, |
|
"loss": 2.5911, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 85.27, |
|
"learning_rate": 0.00022, |
|
"learning_rate_embeddings": 0.00022, |
|
"loss": 2.5392, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 89.15, |
|
"learning_rate": 0.00023, |
|
"learning_rate_embeddings": 0.00023, |
|
"loss": 2.4921, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 93.02, |
|
"learning_rate": 0.00024, |
|
"learning_rate_embeddings": 0.00024, |
|
"loss": 2.4477, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 96.9, |
|
"learning_rate": 0.00025, |
|
"learning_rate_embeddings": 0.00025, |
|
"loss": 2.3984, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 96.9, |
|
"eval_loss": 2.669841766357422, |
|
"eval_runtime": 9.9424, |
|
"eval_samples_per_second": 736.544, |
|
"eval_steps_per_second": 5.834, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 100.77, |
|
"learning_rate": 0.00026000000000000003, |
|
"learning_rate_embeddings": 0.00026000000000000003, |
|
"loss": 2.3548, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 104.65, |
|
"learning_rate": 0.00027, |
|
"learning_rate_embeddings": 0.00027, |
|
"loss": 2.3141, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 108.53, |
|
"learning_rate": 0.00028000000000000003, |
|
"learning_rate_embeddings": 0.00028000000000000003, |
|
"loss": 2.2728, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 112.4, |
|
"learning_rate": 0.00029, |
|
"learning_rate_embeddings": 0.00029, |
|
"loss": 2.233, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 116.28, |
|
"learning_rate": 0.0003, |
|
"learning_rate_embeddings": 0.0003, |
|
"loss": 2.1912, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 116.28, |
|
"eval_loss": 2.6404244899749756, |
|
"eval_runtime": 9.8742, |
|
"eval_samples_per_second": 741.631, |
|
"eval_steps_per_second": 5.874, |
|
"step": 15000 |
|
} |
|
], |
|
"max_steps": 250000, |
|
"num_train_epochs": 1938, |
|
"total_flos": 2.51810266742784e+17, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|