kanishka's picture
End of training
34e82f8 verified
{
"best_metric": 2.6840312480926514,
"best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy-earlystop-bpe_seed-42_1e-3/checkpoint-45100",
"epoch": 19.991464360935595,
"eval_steps": 500,
"global_step": 45100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.4434098215275468,
"grad_norm": 0.4883245825767517,
"learning_rate": 3.125e-05,
"loss": 5.5896,
"step": 1000
},
{
"epoch": 0.8868196430550936,
"grad_norm": 0.6184232831001282,
"learning_rate": 6.25e-05,
"loss": 4.1044,
"step": 2000
},
{
"epoch": 1.0,
"eval_accuracy": 0.3604034665942844,
"eval_loss": 3.820427417755127,
"eval_runtime": 74.0095,
"eval_samples_per_second": 819.894,
"eval_steps_per_second": 12.823,
"step": 2256
},
{
"epoch": 1.3298969072164948,
"grad_norm": 0.5272237658500671,
"learning_rate": 9.375e-05,
"loss": 3.6981,
"step": 3000
},
{
"epoch": 1.7733067287440418,
"grad_norm": 0.580745279788971,
"learning_rate": 0.000125,
"loss": 3.4457,
"step": 4000
},
{
"epoch": 2.0,
"eval_accuracy": 0.4093216099887549,
"eval_loss": 3.304572105407715,
"eval_runtime": 74.3697,
"eval_samples_per_second": 815.923,
"eval_steps_per_second": 12.761,
"step": 4512
},
{
"epoch": 2.2163839929054427,
"grad_norm": 0.5752166509628296,
"learning_rate": 0.00015625,
"loss": 3.2482,
"step": 5000
},
{
"epoch": 2.6597938144329896,
"grad_norm": 0.45855629444122314,
"learning_rate": 0.0001875,
"loss": 3.13,
"step": 6000
},
{
"epoch": 3.0,
"eval_accuracy": 0.42987378339602156,
"eval_loss": 3.0944786071777344,
"eval_runtime": 73.3184,
"eval_samples_per_second": 827.624,
"eval_steps_per_second": 12.944,
"step": 6768
},
{
"epoch": 3.102871078594391,
"grad_norm": 0.4158306419849396,
"learning_rate": 0.00021875,
"loss": 3.0338,
"step": 7000
},
{
"epoch": 3.5462809001219378,
"grad_norm": 0.3917515277862549,
"learning_rate": 0.00025,
"loss": 2.9667,
"step": 8000
},
{
"epoch": 3.9896907216494846,
"grad_norm": 0.3958011865615845,
"learning_rate": 0.00028125000000000003,
"loss": 2.9219,
"step": 9000
},
{
"epoch": 4.0,
"eval_accuracy": 0.4403801362337948,
"eval_loss": 2.988952398300171,
"eval_runtime": 72.5536,
"eval_samples_per_second": 836.347,
"eval_steps_per_second": 13.08,
"step": 9024
},
{
"epoch": 4.4327679858108855,
"grad_norm": 0.3376877009868622,
"learning_rate": 0.0003125,
"loss": 2.8585,
"step": 10000
},
{
"epoch": 4.876177807338433,
"grad_norm": 0.32727962732315063,
"learning_rate": 0.00034375,
"loss": 2.8444,
"step": 11000
},
{
"epoch": 5.0,
"eval_accuracy": 0.44664624452285856,
"eval_loss": 2.928157091140747,
"eval_runtime": 72.5793,
"eval_samples_per_second": 836.051,
"eval_steps_per_second": 13.075,
"step": 11280
},
{
"epoch": 5.319255071499834,
"grad_norm": 0.32956644892692566,
"learning_rate": 0.000375,
"loss": 2.7978,
"step": 12000
},
{
"epoch": 5.762664893027381,
"grad_norm": 0.3080673813819885,
"learning_rate": 0.00040625000000000004,
"loss": 2.7883,
"step": 13000
},
{
"epoch": 6.0,
"eval_accuracy": 0.4507549730505254,
"eval_loss": 2.8910350799560547,
"eval_runtime": 72.5685,
"eval_samples_per_second": 836.175,
"eval_steps_per_second": 13.077,
"step": 13536
},
{
"epoch": 6.205742157188782,
"grad_norm": 0.2959093153476715,
"learning_rate": 0.0004375,
"loss": 2.7566,
"step": 14000
},
{
"epoch": 6.649151978716328,
"grad_norm": 0.29388415813446045,
"learning_rate": 0.00046871875,
"loss": 2.7434,
"step": 15000
},
{
"epoch": 7.0,
"eval_accuracy": 0.4544950043300115,
"eval_loss": 2.8579459190368652,
"eval_runtime": 72.4898,
"eval_samples_per_second": 837.083,
"eval_steps_per_second": 13.091,
"step": 15792
},
{
"epoch": 7.09222924287773,
"grad_norm": 0.27015742659568787,
"learning_rate": 0.00049996875,
"loss": 2.7294,
"step": 16000
},
{
"epoch": 7.535639064405276,
"grad_norm": 0.2585032880306244,
"learning_rate": 0.00053121875,
"loss": 2.7057,
"step": 17000
},
{
"epoch": 7.979048885932824,
"grad_norm": 0.26894038915634155,
"learning_rate": 0.0005624687499999999,
"loss": 2.7158,
"step": 18000
},
{
"epoch": 8.0,
"eval_accuracy": 0.4559880181472721,
"eval_loss": 2.842834949493408,
"eval_runtime": 72.6498,
"eval_samples_per_second": 835.24,
"eval_steps_per_second": 13.063,
"step": 18048
},
{
"epoch": 8.422126150094225,
"grad_norm": 0.24038437008857727,
"learning_rate": 0.0005936875,
"loss": 2.6733,
"step": 19000
},
{
"epoch": 8.865535971621771,
"grad_norm": 0.22421102225780487,
"learning_rate": 0.0006249375000000001,
"loss": 2.6905,
"step": 20000
},
{
"epoch": 9.0,
"eval_accuracy": 0.4572794602349839,
"eval_loss": 2.8298442363739014,
"eval_runtime": 72.6455,
"eval_samples_per_second": 835.29,
"eval_steps_per_second": 13.063,
"step": 20304
},
{
"epoch": 9.308613235783172,
"grad_norm": 0.22955693304538727,
"learning_rate": 0.0006561562500000001,
"loss": 2.6582,
"step": 21000
},
{
"epoch": 9.75202305731072,
"grad_norm": 0.20538607239723206,
"learning_rate": 0.00068740625,
"loss": 2.6697,
"step": 22000
},
{
"epoch": 10.0,
"eval_accuracy": 0.45919856010960747,
"eval_loss": 2.816859006881714,
"eval_runtime": 72.615,
"eval_samples_per_second": 835.64,
"eval_steps_per_second": 13.069,
"step": 22560
},
{
"epoch": 10.19510032147212,
"grad_norm": 0.2177572250366211,
"learning_rate": 0.00071865625,
"loss": 2.6506,
"step": 23000
},
{
"epoch": 10.638510142999667,
"grad_norm": 0.2029583603143692,
"learning_rate": 0.000749875,
"loss": 2.6509,
"step": 24000
},
{
"epoch": 11.0,
"eval_accuracy": 0.46011432522910284,
"eval_loss": 2.807971477508545,
"eval_runtime": 72.7484,
"eval_samples_per_second": 834.107,
"eval_steps_per_second": 13.045,
"step": 24816
},
{
"epoch": 11.081587407161068,
"grad_norm": 0.2110850214958191,
"learning_rate": 0.000781125,
"loss": 2.6497,
"step": 25000
},
{
"epoch": 11.524997228688616,
"grad_norm": 0.19248805940151215,
"learning_rate": 0.000812375,
"loss": 2.6322,
"step": 26000
},
{
"epoch": 11.968407050216163,
"grad_norm": 0.18789444863796234,
"learning_rate": 0.00084359375,
"loss": 2.6494,
"step": 27000
},
{
"epoch": 12.0,
"eval_accuracy": 0.4606821383794124,
"eval_loss": 2.8019886016845703,
"eval_runtime": 72.5563,
"eval_samples_per_second": 836.316,
"eval_steps_per_second": 13.079,
"step": 27072
},
{
"epoch": 12.411484314377564,
"grad_norm": 0.1861707717180252,
"learning_rate": 0.0008748437500000001,
"loss": 2.6148,
"step": 28000
},
{
"epoch": 12.854894135905111,
"grad_norm": 0.18803346157073975,
"learning_rate": 0.0009060625,
"loss": 2.6384,
"step": 29000
},
{
"epoch": 13.0,
"eval_accuracy": 0.46163558106169295,
"eval_loss": 2.7958271503448486,
"eval_runtime": 72.8339,
"eval_samples_per_second": 833.128,
"eval_steps_per_second": 13.03,
"step": 29328
},
{
"epoch": 13.297971400066512,
"grad_norm": 0.1761549860239029,
"learning_rate": 0.0009373125,
"loss": 2.6142,
"step": 30000
},
{
"epoch": 13.741381221594057,
"grad_norm": 0.1844184547662735,
"learning_rate": 0.00096853125,
"loss": 2.6297,
"step": 31000
},
{
"epoch": 14.0,
"eval_accuracy": 0.46196660074708856,
"eval_loss": 2.7939445972442627,
"eval_runtime": 72.6804,
"eval_samples_per_second": 834.888,
"eval_steps_per_second": 13.057,
"step": 31584
},
{
"epoch": 14.18445848575546,
"grad_norm": 0.1886565387248993,
"learning_rate": 0.00099978125,
"loss": 2.6147,
"step": 32000
},
{
"epoch": 14.627868307283006,
"grad_norm": 0.17688792943954468,
"learning_rate": 0.0009241984732824427,
"loss": 2.612,
"step": 33000
},
{
"epoch": 15.0,
"eval_accuracy": 0.4653180942779221,
"eval_loss": 2.764906167984009,
"eval_runtime": 72.6592,
"eval_samples_per_second": 835.131,
"eval_steps_per_second": 13.061,
"step": 33840
},
{
"epoch": 15.070945571444408,
"grad_norm": 0.19411760568618774,
"learning_rate": 0.00084793893129771,
"loss": 2.5952,
"step": 34000
},
{
"epoch": 15.514355392971954,
"grad_norm": 0.17588546872138977,
"learning_rate": 0.0007716793893129771,
"loss": 2.5635,
"step": 35000
},
{
"epoch": 15.957765214499501,
"grad_norm": 0.17366230487823486,
"learning_rate": 0.0006953435114503817,
"loss": 2.5667,
"step": 36000
},
{
"epoch": 16.0,
"eval_accuracy": 0.4685543578011297,
"eval_loss": 2.7425177097320557,
"eval_runtime": 72.8257,
"eval_samples_per_second": 833.222,
"eval_steps_per_second": 13.031,
"step": 36096
},
{
"epoch": 16.400842478660902,
"grad_norm": 0.18736566603183746,
"learning_rate": 0.0006190076335877863,
"loss": 2.5093,
"step": 37000
},
{
"epoch": 16.84425230018845,
"grad_norm": 0.18060249090194702,
"learning_rate": 0.0005426717557251909,
"loss": 2.5177,
"step": 38000
},
{
"epoch": 17.0,
"eval_accuracy": 0.4714397611384699,
"eval_loss": 2.7205777168273926,
"eval_runtime": 72.8204,
"eval_samples_per_second": 833.283,
"eval_steps_per_second": 13.032,
"step": 38352
},
{
"epoch": 17.28732956434985,
"grad_norm": 0.1875220090150833,
"learning_rate": 0.000466412213740458,
"loss": 2.4733,
"step": 39000
},
{
"epoch": 17.730739385877396,
"grad_norm": 0.18848279118537903,
"learning_rate": 0.00039007633587786263,
"loss": 2.4607,
"step": 40000
},
{
"epoch": 18.0,
"eval_accuracy": 0.47464791190042266,
"eval_loss": 2.699930429458618,
"eval_runtime": 72.4963,
"eval_samples_per_second": 837.008,
"eval_steps_per_second": 13.09,
"step": 40608
},
{
"epoch": 18.1738166500388,
"grad_norm": 0.19309544563293457,
"learning_rate": 0.0003138167938931298,
"loss": 2.43,
"step": 41000
},
{
"epoch": 18.617226471566344,
"grad_norm": 0.1929185390472412,
"learning_rate": 0.00023748091603053434,
"loss": 2.397,
"step": 42000
},
{
"epoch": 19.0,
"eval_accuracy": 0.4773406620393708,
"eval_loss": 2.6864736080169678,
"eval_runtime": 72.792,
"eval_samples_per_second": 833.608,
"eval_steps_per_second": 13.037,
"step": 42864
},
{
"epoch": 19.060303735727746,
"grad_norm": 0.20593929290771484,
"learning_rate": 0.00016114503816793893,
"loss": 2.3837,
"step": 43000
},
{
"epoch": 19.503713557255292,
"grad_norm": 0.2001897543668747,
"learning_rate": 8.480916030534351e-05,
"loss": 2.3276,
"step": 44000
},
{
"epoch": 19.94712337878284,
"grad_norm": 0.19933941960334778,
"learning_rate": 8.549618320610688e-06,
"loss": 2.3241,
"step": 45000
},
{
"epoch": 19.991464360935595,
"eval_accuracy": 0.47868057440510814,
"eval_loss": 2.6840312480926514,
"eval_runtime": 72.9711,
"eval_samples_per_second": 831.562,
"eval_steps_per_second": 13.005,
"step": 45100
},
{
"epoch": 19.991464360935595,
"step": 45100,
"total_flos": 1.507910045663232e+18,
"train_loss": 2.8050402250099604,
"train_runtime": 30336.2728,
"train_samples_per_second": 380.629,
"train_steps_per_second": 1.487
}
],
"logging_steps": 1000,
"max_steps": 45100,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.507910045663232e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}