|
{ |
|
"best_metric": 2.6840312480926514, |
|
"best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy-earlystop-bpe_seed-42_1e-3/checkpoint-45100", |
|
"epoch": 19.991464360935595, |
|
"eval_steps": 500, |
|
"global_step": 45100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.4434098215275468, |
|
"grad_norm": 0.4883245825767517, |
|
"learning_rate": 3.125e-05, |
|
"loss": 5.5896, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.8868196430550936, |
|
"grad_norm": 0.6184232831001282, |
|
"learning_rate": 6.25e-05, |
|
"loss": 4.1044, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.3604034665942844, |
|
"eval_loss": 3.820427417755127, |
|
"eval_runtime": 74.0095, |
|
"eval_samples_per_second": 819.894, |
|
"eval_steps_per_second": 12.823, |
|
"step": 2256 |
|
}, |
|
{ |
|
"epoch": 1.3298969072164948, |
|
"grad_norm": 0.5272237658500671, |
|
"learning_rate": 9.375e-05, |
|
"loss": 3.6981, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.7733067287440418, |
|
"grad_norm": 0.580745279788971, |
|
"learning_rate": 0.000125, |
|
"loss": 3.4457, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.4093216099887549, |
|
"eval_loss": 3.304572105407715, |
|
"eval_runtime": 74.3697, |
|
"eval_samples_per_second": 815.923, |
|
"eval_steps_per_second": 12.761, |
|
"step": 4512 |
|
}, |
|
{ |
|
"epoch": 2.2163839929054427, |
|
"grad_norm": 0.5752166509628296, |
|
"learning_rate": 0.00015625, |
|
"loss": 3.2482, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.6597938144329896, |
|
"grad_norm": 0.45855629444122314, |
|
"learning_rate": 0.0001875, |
|
"loss": 3.13, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.42987378339602156, |
|
"eval_loss": 3.0944786071777344, |
|
"eval_runtime": 73.3184, |
|
"eval_samples_per_second": 827.624, |
|
"eval_steps_per_second": 12.944, |
|
"step": 6768 |
|
}, |
|
{ |
|
"epoch": 3.102871078594391, |
|
"grad_norm": 0.4158306419849396, |
|
"learning_rate": 0.00021875, |
|
"loss": 3.0338, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.5462809001219378, |
|
"grad_norm": 0.3917515277862549, |
|
"learning_rate": 0.00025, |
|
"loss": 2.9667, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.9896907216494846, |
|
"grad_norm": 0.3958011865615845, |
|
"learning_rate": 0.00028125000000000003, |
|
"loss": 2.9219, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.4403801362337948, |
|
"eval_loss": 2.988952398300171, |
|
"eval_runtime": 72.5536, |
|
"eval_samples_per_second": 836.347, |
|
"eval_steps_per_second": 13.08, |
|
"step": 9024 |
|
}, |
|
{ |
|
"epoch": 4.4327679858108855, |
|
"grad_norm": 0.3376877009868622, |
|
"learning_rate": 0.0003125, |
|
"loss": 2.8585, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 4.876177807338433, |
|
"grad_norm": 0.32727962732315063, |
|
"learning_rate": 0.00034375, |
|
"loss": 2.8444, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.44664624452285856, |
|
"eval_loss": 2.928157091140747, |
|
"eval_runtime": 72.5793, |
|
"eval_samples_per_second": 836.051, |
|
"eval_steps_per_second": 13.075, |
|
"step": 11280 |
|
}, |
|
{ |
|
"epoch": 5.319255071499834, |
|
"grad_norm": 0.32956644892692566, |
|
"learning_rate": 0.000375, |
|
"loss": 2.7978, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 5.762664893027381, |
|
"grad_norm": 0.3080673813819885, |
|
"learning_rate": 0.00040625000000000004, |
|
"loss": 2.7883, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.4507549730505254, |
|
"eval_loss": 2.8910350799560547, |
|
"eval_runtime": 72.5685, |
|
"eval_samples_per_second": 836.175, |
|
"eval_steps_per_second": 13.077, |
|
"step": 13536 |
|
}, |
|
{ |
|
"epoch": 6.205742157188782, |
|
"grad_norm": 0.2959093153476715, |
|
"learning_rate": 0.0004375, |
|
"loss": 2.7566, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 6.649151978716328, |
|
"grad_norm": 0.29388415813446045, |
|
"learning_rate": 0.00046871875, |
|
"loss": 2.7434, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.4544950043300115, |
|
"eval_loss": 2.8579459190368652, |
|
"eval_runtime": 72.4898, |
|
"eval_samples_per_second": 837.083, |
|
"eval_steps_per_second": 13.091, |
|
"step": 15792 |
|
}, |
|
{ |
|
"epoch": 7.09222924287773, |
|
"grad_norm": 0.27015742659568787, |
|
"learning_rate": 0.00049996875, |
|
"loss": 2.7294, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 7.535639064405276, |
|
"grad_norm": 0.2585032880306244, |
|
"learning_rate": 0.00053121875, |
|
"loss": 2.7057, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 7.979048885932824, |
|
"grad_norm": 0.26894038915634155, |
|
"learning_rate": 0.0005624687499999999, |
|
"loss": 2.7158, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.4559880181472721, |
|
"eval_loss": 2.842834949493408, |
|
"eval_runtime": 72.6498, |
|
"eval_samples_per_second": 835.24, |
|
"eval_steps_per_second": 13.063, |
|
"step": 18048 |
|
}, |
|
{ |
|
"epoch": 8.422126150094225, |
|
"grad_norm": 0.24038437008857727, |
|
"learning_rate": 0.0005936875, |
|
"loss": 2.6733, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 8.865535971621771, |
|
"grad_norm": 0.22421102225780487, |
|
"learning_rate": 0.0006249375000000001, |
|
"loss": 2.6905, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.4572794602349839, |
|
"eval_loss": 2.8298442363739014, |
|
"eval_runtime": 72.6455, |
|
"eval_samples_per_second": 835.29, |
|
"eval_steps_per_second": 13.063, |
|
"step": 20304 |
|
}, |
|
{ |
|
"epoch": 9.308613235783172, |
|
"grad_norm": 0.22955693304538727, |
|
"learning_rate": 0.0006561562500000001, |
|
"loss": 2.6582, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 9.75202305731072, |
|
"grad_norm": 0.20538607239723206, |
|
"learning_rate": 0.00068740625, |
|
"loss": 2.6697, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.45919856010960747, |
|
"eval_loss": 2.816859006881714, |
|
"eval_runtime": 72.615, |
|
"eval_samples_per_second": 835.64, |
|
"eval_steps_per_second": 13.069, |
|
"step": 22560 |
|
}, |
|
{ |
|
"epoch": 10.19510032147212, |
|
"grad_norm": 0.2177572250366211, |
|
"learning_rate": 0.00071865625, |
|
"loss": 2.6506, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 10.638510142999667, |
|
"grad_norm": 0.2029583603143692, |
|
"learning_rate": 0.000749875, |
|
"loss": 2.6509, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.46011432522910284, |
|
"eval_loss": 2.807971477508545, |
|
"eval_runtime": 72.7484, |
|
"eval_samples_per_second": 834.107, |
|
"eval_steps_per_second": 13.045, |
|
"step": 24816 |
|
}, |
|
{ |
|
"epoch": 11.081587407161068, |
|
"grad_norm": 0.2110850214958191, |
|
"learning_rate": 0.000781125, |
|
"loss": 2.6497, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 11.524997228688616, |
|
"grad_norm": 0.19248805940151215, |
|
"learning_rate": 0.000812375, |
|
"loss": 2.6322, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 11.968407050216163, |
|
"grad_norm": 0.18789444863796234, |
|
"learning_rate": 0.00084359375, |
|
"loss": 2.6494, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.4606821383794124, |
|
"eval_loss": 2.8019886016845703, |
|
"eval_runtime": 72.5563, |
|
"eval_samples_per_second": 836.316, |
|
"eval_steps_per_second": 13.079, |
|
"step": 27072 |
|
}, |
|
{ |
|
"epoch": 12.411484314377564, |
|
"grad_norm": 0.1861707717180252, |
|
"learning_rate": 0.0008748437500000001, |
|
"loss": 2.6148, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 12.854894135905111, |
|
"grad_norm": 0.18803346157073975, |
|
"learning_rate": 0.0009060625, |
|
"loss": 2.6384, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.46163558106169295, |
|
"eval_loss": 2.7958271503448486, |
|
"eval_runtime": 72.8339, |
|
"eval_samples_per_second": 833.128, |
|
"eval_steps_per_second": 13.03, |
|
"step": 29328 |
|
}, |
|
{ |
|
"epoch": 13.297971400066512, |
|
"grad_norm": 0.1761549860239029, |
|
"learning_rate": 0.0009373125, |
|
"loss": 2.6142, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 13.741381221594057, |
|
"grad_norm": 0.1844184547662735, |
|
"learning_rate": 0.00096853125, |
|
"loss": 2.6297, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.46196660074708856, |
|
"eval_loss": 2.7939445972442627, |
|
"eval_runtime": 72.6804, |
|
"eval_samples_per_second": 834.888, |
|
"eval_steps_per_second": 13.057, |
|
"step": 31584 |
|
}, |
|
{ |
|
"epoch": 14.18445848575546, |
|
"grad_norm": 0.1886565387248993, |
|
"learning_rate": 0.00099978125, |
|
"loss": 2.6147, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 14.627868307283006, |
|
"grad_norm": 0.17688792943954468, |
|
"learning_rate": 0.0009241984732824427, |
|
"loss": 2.612, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.4653180942779221, |
|
"eval_loss": 2.764906167984009, |
|
"eval_runtime": 72.6592, |
|
"eval_samples_per_second": 835.131, |
|
"eval_steps_per_second": 13.061, |
|
"step": 33840 |
|
}, |
|
{ |
|
"epoch": 15.070945571444408, |
|
"grad_norm": 0.19411760568618774, |
|
"learning_rate": 0.00084793893129771, |
|
"loss": 2.5952, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 15.514355392971954, |
|
"grad_norm": 0.17588546872138977, |
|
"learning_rate": 0.0007716793893129771, |
|
"loss": 2.5635, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 15.957765214499501, |
|
"grad_norm": 0.17366230487823486, |
|
"learning_rate": 0.0006953435114503817, |
|
"loss": 2.5667, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.4685543578011297, |
|
"eval_loss": 2.7425177097320557, |
|
"eval_runtime": 72.8257, |
|
"eval_samples_per_second": 833.222, |
|
"eval_steps_per_second": 13.031, |
|
"step": 36096 |
|
}, |
|
{ |
|
"epoch": 16.400842478660902, |
|
"grad_norm": 0.18736566603183746, |
|
"learning_rate": 0.0006190076335877863, |
|
"loss": 2.5093, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 16.84425230018845, |
|
"grad_norm": 0.18060249090194702, |
|
"learning_rate": 0.0005426717557251909, |
|
"loss": 2.5177, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.4714397611384699, |
|
"eval_loss": 2.7205777168273926, |
|
"eval_runtime": 72.8204, |
|
"eval_samples_per_second": 833.283, |
|
"eval_steps_per_second": 13.032, |
|
"step": 38352 |
|
}, |
|
{ |
|
"epoch": 17.28732956434985, |
|
"grad_norm": 0.1875220090150833, |
|
"learning_rate": 0.000466412213740458, |
|
"loss": 2.4733, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 17.730739385877396, |
|
"grad_norm": 0.18848279118537903, |
|
"learning_rate": 0.00039007633587786263, |
|
"loss": 2.4607, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.47464791190042266, |
|
"eval_loss": 2.699930429458618, |
|
"eval_runtime": 72.4963, |
|
"eval_samples_per_second": 837.008, |
|
"eval_steps_per_second": 13.09, |
|
"step": 40608 |
|
}, |
|
{ |
|
"epoch": 18.1738166500388, |
|
"grad_norm": 0.19309544563293457, |
|
"learning_rate": 0.0003138167938931298, |
|
"loss": 2.43, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 18.617226471566344, |
|
"grad_norm": 0.1929185390472412, |
|
"learning_rate": 0.00023748091603053434, |
|
"loss": 2.397, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.4773406620393708, |
|
"eval_loss": 2.6864736080169678, |
|
"eval_runtime": 72.792, |
|
"eval_samples_per_second": 833.608, |
|
"eval_steps_per_second": 13.037, |
|
"step": 42864 |
|
}, |
|
{ |
|
"epoch": 19.060303735727746, |
|
"grad_norm": 0.20593929290771484, |
|
"learning_rate": 0.00016114503816793893, |
|
"loss": 2.3837, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 19.503713557255292, |
|
"grad_norm": 0.2001897543668747, |
|
"learning_rate": 8.480916030534351e-05, |
|
"loss": 2.3276, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 19.94712337878284, |
|
"grad_norm": 0.19933941960334778, |
|
"learning_rate": 8.549618320610688e-06, |
|
"loss": 2.3241, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 19.991464360935595, |
|
"eval_accuracy": 0.47868057440510814, |
|
"eval_loss": 2.6840312480926514, |
|
"eval_runtime": 72.9711, |
|
"eval_samples_per_second": 831.562, |
|
"eval_steps_per_second": 13.005, |
|
"step": 45100 |
|
}, |
|
{ |
|
"epoch": 19.991464360935595, |
|
"step": 45100, |
|
"total_flos": 1.507910045663232e+18, |
|
"train_loss": 2.8050402250099604, |
|
"train_runtime": 30336.2728, |
|
"train_samples_per_second": 380.629, |
|
"train_steps_per_second": 1.487 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 45100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.507910045663232e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|