|
{ |
|
"best_metric": 1.0097001791000366, |
|
"best_model_checkpoint": "/kaggle/output/checkpoint-42000", |
|
"epoch": 1.711212516297262, |
|
"eval_steps": 1000, |
|
"global_step": 42000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 2.7777777777777777e-11, |
|
"loss": 1.1383, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 2.7750000000000004e-08, |
|
"loss": 1.1424, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_accuracy": 0.32375249500998005, |
|
"eval_loss": 1.1077626943588257, |
|
"eval_runtime": 54.8633, |
|
"eval_samples_per_second": 91.318, |
|
"eval_steps_per_second": 11.428, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 5.5527777777777784e-08, |
|
"loss": 1.1244, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_accuracy": 0.33652694610778444, |
|
"eval_loss": 1.1080161333084106, |
|
"eval_runtime": 54.7384, |
|
"eval_samples_per_second": 91.526, |
|
"eval_steps_per_second": 11.454, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 8.327777777777778e-08, |
|
"loss": 1.1228, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_accuracy": 0.34331337325349304, |
|
"eval_loss": 1.1084064245224, |
|
"eval_runtime": 54.7948, |
|
"eval_samples_per_second": 91.432, |
|
"eval_steps_per_second": 11.443, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 1.1105555555555557e-07, |
|
"loss": 1.1216, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_accuracy": 0.3385229540918164, |
|
"eval_loss": 1.1014840602874756, |
|
"eval_runtime": 54.8508, |
|
"eval_samples_per_second": 91.339, |
|
"eval_steps_per_second": 11.431, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 1.3880555555555558e-07, |
|
"loss": 1.1181, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_accuracy": 0.33073852295409184, |
|
"eval_loss": 1.1008135080337524, |
|
"eval_runtime": 54.8304, |
|
"eval_samples_per_second": 91.373, |
|
"eval_steps_per_second": 11.435, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 1.6658333333333335e-07, |
|
"loss": 1.1132, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_accuracy": 0.3520958083832335, |
|
"eval_loss": 1.0993762016296387, |
|
"eval_runtime": 54.8804, |
|
"eval_samples_per_second": 91.289, |
|
"eval_steps_per_second": 11.425, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 1.9433333333333334e-07, |
|
"loss": 1.1113, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"eval_accuracy": 0.3530938123752495, |
|
"eval_loss": 1.0965770483016968, |
|
"eval_runtime": 54.8881, |
|
"eval_samples_per_second": 91.277, |
|
"eval_steps_per_second": 11.423, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 2.2211111111111114e-07, |
|
"loss": 1.1111, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"eval_accuracy": 0.35708582834331337, |
|
"eval_loss": 1.094658613204956, |
|
"eval_runtime": 54.8233, |
|
"eval_samples_per_second": 91.384, |
|
"eval_steps_per_second": 11.437, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 2.4986111111111113e-07, |
|
"loss": 1.109, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"eval_accuracy": 0.34191616766467064, |
|
"eval_loss": 1.106990933418274, |
|
"eval_runtime": 54.9095, |
|
"eval_samples_per_second": 91.241, |
|
"eval_steps_per_second": 11.419, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 2.776388888888889e-07, |
|
"loss": 1.1036, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"eval_accuracy": 0.37584830339321357, |
|
"eval_loss": 1.0930211544036865, |
|
"eval_runtime": 54.9067, |
|
"eval_samples_per_second": 91.246, |
|
"eval_steps_per_second": 11.419, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 3.0541666666666667e-07, |
|
"loss": 1.1045, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_accuracy": 0.3652694610778443, |
|
"eval_loss": 1.092846393585205, |
|
"eval_runtime": 54.8964, |
|
"eval_samples_per_second": 91.263, |
|
"eval_steps_per_second": 11.422, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 3.3319444444444444e-07, |
|
"loss": 1.1024, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"eval_accuracy": 0.39261477045908183, |
|
"eval_loss": 1.089038372039795, |
|
"eval_runtime": 54.9763, |
|
"eval_samples_per_second": 91.13, |
|
"eval_steps_per_second": 11.405, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"learning_rate": 3.6094444444444446e-07, |
|
"loss": 1.1007, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_accuracy": 0.34311377245508984, |
|
"eval_loss": 1.0933948755264282, |
|
"eval_runtime": 54.9285, |
|
"eval_samples_per_second": 91.209, |
|
"eval_steps_per_second": 11.415, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 3.8872222222222223e-07, |
|
"loss": 1.0985, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"eval_accuracy": 0.36367265469061877, |
|
"eval_loss": 1.09434974193573, |
|
"eval_runtime": 54.8032, |
|
"eval_samples_per_second": 91.418, |
|
"eval_steps_per_second": 11.441, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 4.1650000000000006e-07, |
|
"loss": 1.0988, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"eval_accuracy": 0.39481037924151696, |
|
"eval_loss": 1.0886671543121338, |
|
"eval_runtime": 54.9221, |
|
"eval_samples_per_second": 91.22, |
|
"eval_steps_per_second": 11.416, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 4.4425e-07, |
|
"loss": 1.0965, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_accuracy": 0.3916167664670659, |
|
"eval_loss": 1.0834949016571045, |
|
"eval_runtime": 54.5628, |
|
"eval_samples_per_second": 91.821, |
|
"eval_steps_per_second": 11.491, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 4.7202777777777785e-07, |
|
"loss": 1.0926, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"eval_accuracy": 0.4239520958083832, |
|
"eval_loss": 1.079688310623169, |
|
"eval_runtime": 54.6989, |
|
"eval_samples_per_second": 91.592, |
|
"eval_steps_per_second": 11.463, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 4.998055555555556e-07, |
|
"loss": 1.0956, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"eval_accuracy": 0.4219560878243513, |
|
"eval_loss": 1.080493688583374, |
|
"eval_runtime": 54.6863, |
|
"eval_samples_per_second": 91.613, |
|
"eval_steps_per_second": 11.465, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"learning_rate": 5.275555555555556e-07, |
|
"loss": 1.0878, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"eval_accuracy": 0.4343313373253493, |
|
"eval_loss": 1.0664235353469849, |
|
"eval_runtime": 54.7843, |
|
"eval_samples_per_second": 91.45, |
|
"eval_steps_per_second": 11.445, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 5.553333333333334e-07, |
|
"loss": 1.0793, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"eval_accuracy": 0.4365269461077844, |
|
"eval_loss": 1.06390380859375, |
|
"eval_runtime": 54.7978, |
|
"eval_samples_per_second": 91.427, |
|
"eval_steps_per_second": 11.442, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 5.830833333333334e-07, |
|
"loss": 1.0746, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"eval_accuracy": 0.4311377245508982, |
|
"eval_loss": 1.0611063241958618, |
|
"eval_runtime": 54.6084, |
|
"eval_samples_per_second": 91.744, |
|
"eval_steps_per_second": 11.482, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 6.108611111111111e-07, |
|
"loss": 1.0757, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_accuracy": 0.43253493013972055, |
|
"eval_loss": 1.0579031705856323, |
|
"eval_runtime": 54.7147, |
|
"eval_samples_per_second": 91.566, |
|
"eval_steps_per_second": 11.459, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 6.386111111111112e-07, |
|
"loss": 1.0712, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"eval_accuracy": 0.43353293413173655, |
|
"eval_loss": 1.0545520782470703, |
|
"eval_runtime": 54.8205, |
|
"eval_samples_per_second": 91.389, |
|
"eval_steps_per_second": 11.437, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 6.663888888888889e-07, |
|
"loss": 1.0703, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"eval_accuracy": 0.43872255489021955, |
|
"eval_loss": 1.0489881038665771, |
|
"eval_runtime": 54.6899, |
|
"eval_samples_per_second": 91.607, |
|
"eval_steps_per_second": 11.465, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"learning_rate": 6.941666666666667e-07, |
|
"loss": 1.0673, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"eval_accuracy": 0.4467065868263473, |
|
"eval_loss": 1.0496938228607178, |
|
"eval_runtime": 54.8292, |
|
"eval_samples_per_second": 91.375, |
|
"eval_steps_per_second": 11.436, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"learning_rate": 7.219166666666666e-07, |
|
"loss": 1.0672, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"eval_accuracy": 0.4297405189620758, |
|
"eval_loss": 1.0511012077331543, |
|
"eval_runtime": 54.8046, |
|
"eval_samples_per_second": 91.416, |
|
"eval_steps_per_second": 11.441, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"learning_rate": 7.496944444444444e-07, |
|
"loss": 1.0658, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"eval_accuracy": 0.43912175648702595, |
|
"eval_loss": 1.0465787649154663, |
|
"eval_runtime": 54.7507, |
|
"eval_samples_per_second": 91.506, |
|
"eval_steps_per_second": 11.452, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"learning_rate": 7.774444444444445e-07, |
|
"loss": 1.0638, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"eval_accuracy": 0.43333333333333335, |
|
"eval_loss": 1.0430774688720703, |
|
"eval_runtime": 54.6439, |
|
"eval_samples_per_second": 91.685, |
|
"eval_steps_per_second": 11.474, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"learning_rate": 8.052222222222223e-07, |
|
"loss": 1.0602, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"eval_accuracy": 0.4479041916167665, |
|
"eval_loss": 1.0387687683105469, |
|
"eval_runtime": 54.8009, |
|
"eval_samples_per_second": 91.422, |
|
"eval_steps_per_second": 11.441, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"learning_rate": 8.329722222222223e-07, |
|
"loss": 1.0567, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"eval_accuracy": 0.4485029940119761, |
|
"eval_loss": 1.0339411497116089, |
|
"eval_runtime": 54.6621, |
|
"eval_samples_per_second": 91.654, |
|
"eval_steps_per_second": 11.47, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"learning_rate": 8.607500000000001e-07, |
|
"loss": 1.0611, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"eval_accuracy": 0.4465069860279441, |
|
"eval_loss": 1.0385600328445435, |
|
"eval_runtime": 54.7106, |
|
"eval_samples_per_second": 91.573, |
|
"eval_steps_per_second": 11.46, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"learning_rate": 8.885e-07, |
|
"loss": 1.0555, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"eval_accuracy": 0.4377245508982036, |
|
"eval_loss": 1.0331332683563232, |
|
"eval_runtime": 54.7554, |
|
"eval_samples_per_second": 91.498, |
|
"eval_steps_per_second": 11.451, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"learning_rate": 9.162777777777779e-07, |
|
"loss": 1.0512, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"eval_accuracy": 0.44610778443113774, |
|
"eval_loss": 1.028577446937561, |
|
"eval_runtime": 54.7331, |
|
"eval_samples_per_second": 91.535, |
|
"eval_steps_per_second": 11.456, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"learning_rate": 9.440277777777779e-07, |
|
"loss": 1.048, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"eval_accuracy": 0.4532934131736527, |
|
"eval_loss": 1.0286486148834229, |
|
"eval_runtime": 54.7585, |
|
"eval_samples_per_second": 91.493, |
|
"eval_steps_per_second": 11.45, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"learning_rate": 9.718055555555557e-07, |
|
"loss": 1.0524, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"eval_accuracy": 0.449500998003992, |
|
"eval_loss": 1.0262507200241089, |
|
"eval_runtime": 54.1381, |
|
"eval_samples_per_second": 92.541, |
|
"eval_steps_per_second": 11.582, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"learning_rate": 9.995555555555557e-07, |
|
"loss": 1.0472, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"eval_accuracy": 0.4552894211576846, |
|
"eval_loss": 1.022876501083374, |
|
"eval_runtime": 54.2077, |
|
"eval_samples_per_second": 92.422, |
|
"eval_steps_per_second": 11.567, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"learning_rate": 1.0273333333333335e-06, |
|
"loss": 1.0454, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"eval_accuracy": 0.4477045908183633, |
|
"eval_loss": 1.0219467878341675, |
|
"eval_runtime": 54.4001, |
|
"eval_samples_per_second": 92.095, |
|
"eval_steps_per_second": 11.526, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"learning_rate": 1.0550833333333334e-06, |
|
"loss": 1.0473, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"eval_accuracy": 0.46107784431137727, |
|
"eval_loss": 1.0189749002456665, |
|
"eval_runtime": 54.291, |
|
"eval_samples_per_second": 92.28, |
|
"eval_steps_per_second": 11.549, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"learning_rate": 1.0828611111111111e-06, |
|
"loss": 1.0465, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"eval_accuracy": 0.4500998003992016, |
|
"eval_loss": 1.0226292610168457, |
|
"eval_runtime": 54.6634, |
|
"eval_samples_per_second": 91.652, |
|
"eval_steps_per_second": 11.47, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"learning_rate": 1.1106111111111112e-06, |
|
"loss": 1.0408, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"eval_accuracy": 0.45229540918163674, |
|
"eval_loss": 1.0191301107406616, |
|
"eval_runtime": 54.6371, |
|
"eval_samples_per_second": 91.696, |
|
"eval_steps_per_second": 11.476, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"learning_rate": 1.138388888888889e-06, |
|
"loss": 1.0433, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"eval_accuracy": 0.4620758483033932, |
|
"eval_loss": 1.0231131315231323, |
|
"eval_runtime": 54.2747, |
|
"eval_samples_per_second": 92.308, |
|
"eval_steps_per_second": 11.552, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"learning_rate": 1.166138888888889e-06, |
|
"loss": 1.0392, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"eval_accuracy": 0.46726546906187627, |
|
"eval_loss": 1.0097001791000366, |
|
"eval_runtime": 54.4967, |
|
"eval_samples_per_second": 91.932, |
|
"eval_steps_per_second": 11.505, |
|
"step": 42000 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 10000000, |
|
"num_train_epochs": 408, |
|
"save_steps": 1000, |
|
"total_flos": 8.77946541810647e+16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|