{
  "best_metric": 1.0064666271209717,
  "best_model_checkpoint": "/kaggle/output/checkpoint-44000",
  "epoch": 1.7926988265971318,
  "eval_steps": 1000,
  "global_step": 44000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "learning_rate": 2.7777777777777777e-11,
      "loss": 1.1506,
      "step": 1
    },
    {
      "epoch": 0.04,
      "learning_rate": 2.7750000000000004e-08,
      "loss": 1.1326,
      "step": 1000
    },
    {
      "epoch": 0.04,
      "eval_accuracy": 0.3331337325349301,
      "eval_loss": 1.1057127714157104,
      "eval_runtime": 61.3538,
      "eval_samples_per_second": 81.657,
      "eval_steps_per_second": 10.219,
      "step": 1000
    },
    {
      "epoch": 0.08,
      "learning_rate": 5.5527777777777784e-08,
      "loss": 1.1238,
      "step": 2000
    },
    {
      "epoch": 0.08,
      "eval_accuracy": 0.33812375249501,
      "eval_loss": 1.1044437885284424,
      "eval_runtime": 61.256,
      "eval_samples_per_second": 81.788,
      "eval_steps_per_second": 10.236,
      "step": 2000
    },
    {
      "epoch": 0.12,
      "learning_rate": 8.327777777777778e-08,
      "loss": 1.1251,
      "step": 3000
    },
    {
      "epoch": 0.12,
      "eval_accuracy": 0.3407185628742515,
      "eval_loss": 1.1043975353240967,
      "eval_runtime": 61.2433,
      "eval_samples_per_second": 81.805,
      "eval_steps_per_second": 10.238,
      "step": 3000
    },
    {
      "epoch": 0.16,
      "learning_rate": 1.1105555555555557e-07,
      "loss": 1.1205,
      "step": 4000
    },
    {
      "epoch": 0.16,
      "eval_accuracy": 0.3387225548902196,
      "eval_loss": 1.100860357284546,
      "eval_runtime": 61.4457,
      "eval_samples_per_second": 81.535,
      "eval_steps_per_second": 10.204,
      "step": 4000
    },
    {
      "epoch": 0.2,
      "learning_rate": 1.3880555555555558e-07,
      "loss": 1.1192,
      "step": 5000
    },
    {
      "epoch": 0.2,
      "eval_accuracy": 0.34151696606786425,
      "eval_loss": 1.1005476713180542,
      "eval_runtime": 61.2568,
      "eval_samples_per_second": 81.787,
      "eval_steps_per_second": 10.236,
      "step": 5000
    },
    {
      "epoch": 0.24,
      "learning_rate": 1.6658333333333335e-07,
      "loss": 1.1145,
      "step": 6000
    },
    {
      "epoch": 0.24,
      "eval_accuracy": 0.3508982035928144,
      "eval_loss": 1.0979759693145752,
      "eval_runtime": 61.2221,
      "eval_samples_per_second": 81.833,
      "eval_steps_per_second": 10.241,
      "step": 6000
    },
    {
      "epoch": 0.29,
      "learning_rate": 1.9433333333333334e-07,
      "loss": 1.1164,
      "step": 7000
    },
    {
      "epoch": 0.29,
      "eval_accuracy": 0.35508982035928144,
      "eval_loss": 1.0955135822296143,
      "eval_runtime": 61.255,
      "eval_samples_per_second": 81.789,
      "eval_steps_per_second": 10.236,
      "step": 7000
    },
    {
      "epoch": 0.33,
      "learning_rate": 2.2211111111111114e-07,
      "loss": 1.1091,
      "step": 8000
    },
    {
      "epoch": 0.33,
      "eval_accuracy": 0.36147704590818364,
      "eval_loss": 1.0946425199508667,
      "eval_runtime": 61.3011,
      "eval_samples_per_second": 81.728,
      "eval_steps_per_second": 10.228,
      "step": 8000
    },
    {
      "epoch": 0.37,
      "learning_rate": 2.4986111111111113e-07,
      "loss": 1.107,
      "step": 9000
    },
    {
      "epoch": 0.37,
      "eval_accuracy": 0.3471057884231537,
      "eval_loss": 1.10226309299469,
      "eval_runtime": 61.3193,
      "eval_samples_per_second": 81.703,
      "eval_steps_per_second": 10.225,
      "step": 9000
    },
    {
      "epoch": 0.41,
      "learning_rate": 2.776388888888889e-07,
      "loss": 1.1055,
      "step": 10000
    },
    {
      "epoch": 0.41,
      "eval_accuracy": 0.3774451097804391,
      "eval_loss": 1.0922847986221313,
      "eval_runtime": 61.1866,
      "eval_samples_per_second": 81.881,
      "eval_steps_per_second": 10.247,
      "step": 10000
    },
    {
      "epoch": 0.45,
      "learning_rate": 3.0541666666666667e-07,
      "loss": 1.1028,
      "step": 11000
    },
    {
      "epoch": 0.45,
      "eval_accuracy": 0.37425149700598803,
      "eval_loss": 1.0914534330368042,
      "eval_runtime": 61.341,
      "eval_samples_per_second": 81.675,
      "eval_steps_per_second": 10.222,
      "step": 11000
    },
    {
      "epoch": 0.49,
      "learning_rate": 3.3319444444444444e-07,
      "loss": 1.1024,
      "step": 12000
    },
    {
      "epoch": 0.49,
      "eval_accuracy": 0.3872255489021956,
      "eval_loss": 1.0888582468032837,
      "eval_runtime": 61.4245,
      "eval_samples_per_second": 81.564,
      "eval_steps_per_second": 10.208,
      "step": 12000
    },
    {
      "epoch": 0.53,
      "learning_rate": 3.6094444444444446e-07,
      "loss": 1.1015,
      "step": 13000
    },
    {
      "epoch": 0.53,
      "eval_accuracy": 0.3397205588822355,
      "eval_loss": 1.096097707748413,
      "eval_runtime": 61.3257,
      "eval_samples_per_second": 81.695,
      "eval_steps_per_second": 10.224,
      "step": 13000
    },
    {
      "epoch": 0.57,
      "learning_rate": 3.8872222222222223e-07,
      "loss": 1.1,
      "step": 14000
    },
    {
      "epoch": 0.57,
      "eval_accuracy": 0.36387225548902197,
      "eval_loss": 1.0940359830856323,
      "eval_runtime": 61.2945,
      "eval_samples_per_second": 81.737,
      "eval_steps_per_second": 10.229,
      "step": 14000
    },
    {
      "epoch": 0.61,
      "learning_rate": 4.1650000000000006e-07,
      "loss": 1.0997,
      "step": 15000
    },
    {
      "epoch": 0.61,
      "eval_accuracy": 0.38602794411177643,
      "eval_loss": 1.0880299806594849,
      "eval_runtime": 61.2546,
      "eval_samples_per_second": 81.79,
      "eval_steps_per_second": 10.236,
      "step": 15000
    },
    {
      "epoch": 0.65,
      "learning_rate": 4.4425e-07,
      "loss": 1.0987,
      "step": 16000
    },
    {
      "epoch": 0.65,
      "eval_accuracy": 0.38962075848303396,
      "eval_loss": 1.0837221145629883,
      "eval_runtime": 61.1947,
      "eval_samples_per_second": 81.87,
      "eval_steps_per_second": 10.246,
      "step": 16000
    },
    {
      "epoch": 0.69,
      "learning_rate": 4.7202777777777785e-07,
      "loss": 1.0943,
      "step": 17000
    },
    {
      "epoch": 0.69,
      "eval_accuracy": 0.4033932135728543,
      "eval_loss": 1.081850290298462,
      "eval_runtime": 61.2293,
      "eval_samples_per_second": 81.824,
      "eval_steps_per_second": 10.24,
      "step": 17000
    },
    {
      "epoch": 0.73,
      "learning_rate": 4.997777777777779e-07,
      "loss": 1.0966,
      "step": 18000
    },
    {
      "epoch": 0.73,
      "eval_accuracy": 0.4031936127744511,
      "eval_loss": 1.0839308500289917,
      "eval_runtime": 61.2192,
      "eval_samples_per_second": 81.837,
      "eval_steps_per_second": 10.242,
      "step": 18000
    },
    {
      "epoch": 0.77,
      "learning_rate": 5.275555555555556e-07,
      "loss": 1.0935,
      "step": 19000
    },
    {
      "epoch": 0.77,
      "eval_accuracy": 0.40099800399201596,
      "eval_loss": 1.0814112424850464,
      "eval_runtime": 61.1921,
      "eval_samples_per_second": 81.873,
      "eval_steps_per_second": 10.246,
      "step": 19000
    },
    {
      "epoch": 0.81,
      "learning_rate": 5.553055555555556e-07,
      "loss": 1.0939,
      "step": 20000
    },
    {
      "epoch": 0.81,
      "eval_accuracy": 0.39241516966067863,
      "eval_loss": 1.0840386152267456,
      "eval_runtime": 61.2683,
      "eval_samples_per_second": 81.772,
      "eval_steps_per_second": 10.234,
      "step": 20000
    },
    {
      "epoch": 0.86,
      "learning_rate": 5.830833333333334e-07,
      "loss": 1.0917,
      "step": 21000
    },
    {
      "epoch": 0.86,
      "eval_accuracy": 0.38303393213572856,
      "eval_loss": 1.083555817604065,
      "eval_runtime": 61.2974,
      "eval_samples_per_second": 81.733,
      "eval_steps_per_second": 10.229,
      "step": 21000
    },
    {
      "epoch": 0.9,
      "learning_rate": 6.108333333333333e-07,
      "loss": 1.0926,
      "step": 22000
    },
    {
      "epoch": 0.9,
      "eval_accuracy": 0.39481037924151696,
      "eval_loss": 1.080513834953308,
      "eval_runtime": 61.4078,
      "eval_samples_per_second": 81.586,
      "eval_steps_per_second": 10.21,
      "step": 22000
    },
    {
      "epoch": 0.94,
      "learning_rate": 6.386111111111112e-07,
      "loss": 1.09,
      "step": 23000
    },
    {
      "epoch": 0.94,
      "eval_accuracy": 0.4115768463073852,
      "eval_loss": 1.076084017753601,
      "eval_runtime": 61.6647,
      "eval_samples_per_second": 81.246,
      "eval_steps_per_second": 10.168,
      "step": 23000
    },
    {
      "epoch": 0.98,
      "learning_rate": 6.663888888888889e-07,
      "loss": 1.0879,
      "step": 24000
    },
    {
      "epoch": 0.98,
      "eval_accuracy": 0.42734530938123755,
      "eval_loss": 1.0709619522094727,
      "eval_runtime": 61.3291,
      "eval_samples_per_second": 81.69,
      "eval_steps_per_second": 10.224,
      "step": 24000
    },
    {
      "epoch": 1.02,
      "learning_rate": 6.94138888888889e-07,
      "loss": 1.0846,
      "step": 25000
    },
    {
      "epoch": 1.02,
      "eval_accuracy": 0.417564870259481,
      "eval_loss": 1.0698238611221313,
      "eval_runtime": 61.3091,
      "eval_samples_per_second": 81.717,
      "eval_steps_per_second": 10.227,
      "step": 25000
    },
    {
      "epoch": 1.06,
      "learning_rate": 7.219166666666666e-07,
      "loss": 1.0821,
      "step": 26000
    },
    {
      "epoch": 1.06,
      "eval_accuracy": 0.43313373253493015,
      "eval_loss": 1.065280795097351,
      "eval_runtime": 61.4486,
      "eval_samples_per_second": 81.532,
      "eval_steps_per_second": 10.204,
      "step": 26000
    },
    {
      "epoch": 1.1,
      "learning_rate": 7.496666666666667e-07,
      "loss": 1.078,
      "step": 27000
    },
    {
      "epoch": 1.1,
      "eval_accuracy": 0.4367265469061876,
      "eval_loss": 1.059377908706665,
      "eval_runtime": 61.6257,
      "eval_samples_per_second": 81.297,
      "eval_steps_per_second": 10.174,
      "step": 27000
    },
    {
      "epoch": 1.14,
      "learning_rate": 7.774444444444445e-07,
      "loss": 1.0707,
      "step": 28000
    },
    {
      "epoch": 1.14,
      "eval_accuracy": 0.437125748502994,
      "eval_loss": 1.0536556243896484,
      "eval_runtime": 61.8255,
      "eval_samples_per_second": 81.035,
      "eval_steps_per_second": 10.141,
      "step": 28000
    },
    {
      "epoch": 1.18,
      "learning_rate": 8.051944444444445e-07,
      "loss": 1.0658,
      "step": 29000
    },
    {
      "epoch": 1.18,
      "eval_accuracy": 0.4425149700598802,
      "eval_loss": 1.0465552806854248,
      "eval_runtime": 61.6813,
      "eval_samples_per_second": 81.224,
      "eval_steps_per_second": 10.165,
      "step": 29000
    },
    {
      "epoch": 1.22,
      "learning_rate": 8.329722222222223e-07,
      "loss": 1.0596,
      "step": 30000
    },
    {
      "epoch": 1.22,
      "eval_accuracy": 0.44011976047904194,
      "eval_loss": 1.042420506477356,
      "eval_runtime": 61.7796,
      "eval_samples_per_second": 81.095,
      "eval_steps_per_second": 10.149,
      "step": 30000
    },
    {
      "epoch": 1.26,
      "learning_rate": 8.607222222222223e-07,
      "loss": 1.0665,
      "step": 31000
    },
    {
      "epoch": 1.26,
      "eval_accuracy": 0.44011976047904194,
      "eval_loss": 1.0469056367874146,
      "eval_runtime": 62.7365,
      "eval_samples_per_second": 79.858,
      "eval_steps_per_second": 9.994,
      "step": 31000
    },
    {
      "epoch": 1.3,
      "learning_rate": 8.885e-07,
      "loss": 1.0586,
      "step": 32000
    },
    {
      "epoch": 1.3,
      "eval_accuracy": 0.4363273453093812,
      "eval_loss": 1.041648507118225,
      "eval_runtime": 61.7699,
      "eval_samples_per_second": 81.107,
      "eval_steps_per_second": 10.151,
      "step": 32000
    },
    {
      "epoch": 1.34,
      "learning_rate": 9.162500000000001e-07,
      "loss": 1.0579,
      "step": 33000
    },
    {
      "epoch": 1.34,
      "eval_accuracy": 0.44550898203592815,
      "eval_loss": 1.0353455543518066,
      "eval_runtime": 61.7392,
      "eval_samples_per_second": 81.148,
      "eval_steps_per_second": 10.156,
      "step": 33000
    },
    {
      "epoch": 1.39,
      "learning_rate": 9.440277777777779e-07,
      "loss": 1.0499,
      "step": 34000
    },
    {
      "epoch": 1.39,
      "eval_accuracy": 0.45109780439121755,
      "eval_loss": 1.0313138961791992,
      "eval_runtime": 61.661,
      "eval_samples_per_second": 81.251,
      "eval_steps_per_second": 10.169,
      "step": 34000
    },
    {
      "epoch": 1.43,
      "learning_rate": 9.718055555555557e-07,
      "loss": 1.0551,
      "step": 35000
    },
    {
      "epoch": 1.43,
      "eval_accuracy": 0.4481037924151697,
      "eval_loss": 1.0282059907913208,
      "eval_runtime": 61.7111,
      "eval_samples_per_second": 81.185,
      "eval_steps_per_second": 10.16,
      "step": 35000
    },
    {
      "epoch": 1.47,
      "learning_rate": 9.995555555555557e-07,
      "loss": 1.0509,
      "step": 36000
    },
    {
      "epoch": 1.47,
      "eval_accuracy": 0.4568862275449102,
      "eval_loss": 1.0246349573135376,
      "eval_runtime": 61.4536,
      "eval_samples_per_second": 81.525,
      "eval_steps_per_second": 10.203,
      "step": 36000
    },
    {
      "epoch": 1.51,
      "learning_rate": 1.0273333333333335e-06,
      "loss": 1.0486,
      "step": 37000
    },
    {
      "epoch": 1.51,
      "eval_accuracy": 0.45868263473053894,
      "eval_loss": 1.0200064182281494,
      "eval_runtime": 61.3644,
      "eval_samples_per_second": 81.643,
      "eval_steps_per_second": 10.218,
      "step": 37000
    },
    {
      "epoch": 1.55,
      "learning_rate": 1.0550833333333334e-06,
      "loss": 1.0468,
      "step": 38000
    },
    {
      "epoch": 1.55,
      "eval_accuracy": 0.4564870259481038,
      "eval_loss": 1.0205750465393066,
      "eval_runtime": 61.6145,
      "eval_samples_per_second": 81.312,
      "eval_steps_per_second": 10.176,
      "step": 38000
    },
    {
      "epoch": 1.59,
      "learning_rate": 1.0828611111111111e-06,
      "loss": 1.0464,
      "step": 39000
    },
    {
      "epoch": 1.59,
      "eval_accuracy": 0.4554890219560878,
      "eval_loss": 1.021471619606018,
      "eval_runtime": 61.6216,
      "eval_samples_per_second": 81.303,
      "eval_steps_per_second": 10.175,
      "step": 39000
    },
    {
      "epoch": 1.63,
      "learning_rate": 1.1106111111111112e-06,
      "loss": 1.0374,
      "step": 40000
    },
    {
      "epoch": 1.63,
      "eval_accuracy": 0.45728542914171655,
      "eval_loss": 1.0143259763717651,
      "eval_runtime": 61.6469,
      "eval_samples_per_second": 81.269,
      "eval_steps_per_second": 10.171,
      "step": 40000
    },
    {
      "epoch": 1.67,
      "learning_rate": 1.138388888888889e-06,
      "loss": 1.0427,
      "step": 41000
    },
    {
      "epoch": 1.67,
      "eval_accuracy": 0.45848303393213574,
      "eval_loss": 1.0229008197784424,
      "eval_runtime": 61.4018,
      "eval_samples_per_second": 81.594,
      "eval_steps_per_second": 10.211,
      "step": 41000
    },
    {
      "epoch": 1.71,
      "learning_rate": 1.166138888888889e-06,
      "loss": 1.0359,
      "step": 42000
    },
    {
      "epoch": 1.71,
      "eval_accuracy": 0.4674650698602794,
      "eval_loss": 1.0088660717010498,
      "eval_runtime": 61.4607,
      "eval_samples_per_second": 81.515,
      "eval_steps_per_second": 10.202,
      "step": 42000
    },
    {
      "epoch": 1.75,
      "learning_rate": 1.1939166666666668e-06,
      "loss": 1.0382,
      "step": 43000
    },
    {
      "epoch": 1.75,
      "eval_accuracy": 0.4489021956087824,
      "eval_loss": 1.0299837589263916,
      "eval_runtime": 61.6434,
      "eval_samples_per_second": 81.274,
      "eval_steps_per_second": 10.171,
      "step": 43000
    },
    {
      "epoch": 1.79,
      "learning_rate": 1.2216944444444446e-06,
      "loss": 1.0381,
      "step": 44000
    },
    {
      "epoch": 1.79,
      "eval_accuracy": 0.4786427145708583,
      "eval_loss": 1.0064666271209717,
      "eval_runtime": 61.6779,
      "eval_samples_per_second": 81.228,
      "eval_steps_per_second": 10.166,
      "step": 44000
    }
  ],
  "logging_steps": 1000,
  "max_steps": 10000000,
  "num_train_epochs": 408,
  "save_steps": 1000,
  "total_flos": 9.19753644418007e+16,
  "trial_name": null,
  "trial_params": null
}