|
{ |
|
"best_metric": 1.9192386865615845, |
|
"best_model_checkpoint": "experiments/checkpoint-500", |
|
"epoch": 100.0, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 1.8e-06, |
|
"loss": 1.7688, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"learning_rate": 4.8e-06, |
|
"loss": 1.7684, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"learning_rate": 7.799999999999998e-06, |
|
"loss": 1.7564, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"learning_rate": 1.0799999999999998e-05, |
|
"loss": 1.7424, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"learning_rate": 1.3799999999999998e-05, |
|
"loss": 1.727, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"learning_rate": 1.68e-05, |
|
"loss": 1.7135, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"learning_rate": 1.98e-05, |
|
"loss": 1.701, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"learning_rate": 2.28e-05, |
|
"loss": 1.6797, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"learning_rate": 2.5799999999999997e-05, |
|
"loss": 1.6547, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"learning_rate": 2.88e-05, |
|
"loss": 1.6245, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 1.5684312582015991, |
|
"eval_runtime": 1.4166, |
|
"eval_samples_per_second": 7.059, |
|
"eval_steps_per_second": 1.412, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"learning_rate": 3.1799999999999994e-05, |
|
"loss": 1.5841, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"learning_rate": 3.48e-05, |
|
"loss": 1.5316, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"learning_rate": 3.78e-05, |
|
"loss": 1.4644, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"learning_rate": 4.08e-05, |
|
"loss": 1.3728, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"learning_rate": 4.3799999999999994e-05, |
|
"loss": 1.2692, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"learning_rate": 4.56e-05, |
|
"loss": 1.1998, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"learning_rate": 4.8599999999999995e-05, |
|
"loss": 1.1159, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"learning_rate": 5.1599999999999994e-05, |
|
"loss": 1.0442, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"learning_rate": 5.459999999999999e-05, |
|
"loss": 0.9944, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"learning_rate": 5.76e-05, |
|
"loss": 0.9518, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 0.9818406105041504, |
|
"eval_runtime": 1.3835, |
|
"eval_samples_per_second": 7.228, |
|
"eval_steps_per_second": 1.446, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"learning_rate": 6.0599999999999996e-05, |
|
"loss": 0.908, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"learning_rate": 6.359999999999999e-05, |
|
"loss": 0.8678, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"learning_rate": 6.659999999999999e-05, |
|
"loss": 0.8303, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"learning_rate": 6.96e-05, |
|
"loss": 0.7928, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"learning_rate": 7.259999999999999e-05, |
|
"loss": 0.7594, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"learning_rate": 7.56e-05, |
|
"loss": 0.73, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"learning_rate": 7.86e-05, |
|
"loss": 0.7034, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"learning_rate": 8.16e-05, |
|
"loss": 0.6777, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"learning_rate": 8.459999999999998e-05, |
|
"loss": 0.6493, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"learning_rate": 8.759999999999999e-05, |
|
"loss": 0.6249, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_loss": 0.8599645495414734, |
|
"eval_runtime": 1.4099, |
|
"eval_samples_per_second": 7.093, |
|
"eval_steps_per_second": 1.419, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"learning_rate": 9.059999999999999e-05, |
|
"loss": 0.6007, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"learning_rate": 9.36e-05, |
|
"loss": 0.5716, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"learning_rate": 9.659999999999999e-05, |
|
"loss": 0.5465, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"learning_rate": 9.96e-05, |
|
"loss": 0.5191, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"learning_rate": 0.0001026, |
|
"loss": 0.4947, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"learning_rate": 0.00010559999999999998, |
|
"loss": 0.4681, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"learning_rate": 0.00010859999999999998, |
|
"loss": 0.4417, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"learning_rate": 0.00011159999999999999, |
|
"loss": 0.4116, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"learning_rate": 0.0001146, |
|
"loss": 0.3804, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"learning_rate": 0.0001176, |
|
"loss": 0.3544, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_loss": 1.1081293821334839, |
|
"eval_runtime": 1.3764, |
|
"eval_samples_per_second": 7.265, |
|
"eval_steps_per_second": 1.453, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"learning_rate": 0.00012059999999999999, |
|
"loss": 0.3248, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"learning_rate": 0.0001236, |
|
"loss": 0.2931, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"learning_rate": 0.0001266, |
|
"loss": 0.2677, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"learning_rate": 0.00012959999999999998, |
|
"loss": 0.2386, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"learning_rate": 0.0001326, |
|
"loss": 0.2142, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"learning_rate": 0.0001356, |
|
"loss": 0.1932, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"learning_rate": 0.0001386, |
|
"loss": 0.1709, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"learning_rate": 0.00014159999999999997, |
|
"loss": 0.1571, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"learning_rate": 0.0001446, |
|
"loss": 0.1417, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"learning_rate": 0.00014759999999999998, |
|
"loss": 0.1184, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_loss": 1.5212451219558716, |
|
"eval_runtime": 1.3827, |
|
"eval_samples_per_second": 7.232, |
|
"eval_steps_per_second": 1.446, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 51.0, |
|
"learning_rate": 0.00015059999999999997, |
|
"loss": 0.1096, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"learning_rate": 0.0001536, |
|
"loss": 0.1037, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 53.0, |
|
"learning_rate": 0.00015659999999999998, |
|
"loss": 0.095, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 54.0, |
|
"learning_rate": 0.0001596, |
|
"loss": 0.0865, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 55.0, |
|
"learning_rate": 0.0001626, |
|
"loss": 0.0808, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"learning_rate": 0.0001656, |
|
"loss": 0.0794, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 57.0, |
|
"learning_rate": 0.0001686, |
|
"loss": 0.075, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 58.0, |
|
"learning_rate": 0.00017159999999999997, |
|
"loss": 0.0726, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 59.0, |
|
"learning_rate": 0.00017459999999999996, |
|
"loss": 0.0696, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"learning_rate": 0.00017759999999999998, |
|
"loss": 0.0665, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_loss": 1.7048699855804443, |
|
"eval_runtime": 1.3753, |
|
"eval_samples_per_second": 7.271, |
|
"eval_steps_per_second": 1.454, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 61.0, |
|
"learning_rate": 0.00018059999999999997, |
|
"loss": 0.065, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 62.0, |
|
"learning_rate": 0.0001836, |
|
"loss": 0.0623, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 63.0, |
|
"learning_rate": 0.00018659999999999998, |
|
"loss": 0.0574, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"learning_rate": 0.00018959999999999997, |
|
"loss": 0.0577, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 65.0, |
|
"learning_rate": 0.0001926, |
|
"loss": 0.0597, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 66.0, |
|
"learning_rate": 0.00019559999999999998, |
|
"loss": 0.0546, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 67.0, |
|
"learning_rate": 0.0001986, |
|
"loss": 0.0603, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"learning_rate": 0.0002016, |
|
"loss": 0.0555, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 69.0, |
|
"learning_rate": 0.00020459999999999999, |
|
"loss": 0.0551, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 70.0, |
|
"learning_rate": 0.00020759999999999998, |
|
"loss": 0.0529, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 70.0, |
|
"eval_loss": 1.901442527770996, |
|
"eval_runtime": 1.3785, |
|
"eval_samples_per_second": 7.254, |
|
"eval_steps_per_second": 1.451, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 71.0, |
|
"learning_rate": 0.00021059999999999997, |
|
"loss": 0.051, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"learning_rate": 0.00021359999999999996, |
|
"loss": 0.05, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 73.0, |
|
"learning_rate": 0.00021659999999999998, |
|
"loss": 0.0493, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 74.0, |
|
"learning_rate": 0.00021959999999999997, |
|
"loss": 0.0465, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 75.0, |
|
"learning_rate": 0.0002226, |
|
"loss": 0.0504, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"learning_rate": 0.00022559999999999998, |
|
"loss": 0.0491, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 77.0, |
|
"learning_rate": 0.00022859999999999997, |
|
"loss": 0.0485, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 78.0, |
|
"learning_rate": 0.0002316, |
|
"loss": 0.0451, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 79.0, |
|
"learning_rate": 0.00023459999999999998, |
|
"loss": 0.0478, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"learning_rate": 0.0002376, |
|
"loss": 0.0435, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"eval_loss": 1.9840269088745117, |
|
"eval_runtime": 1.3787, |
|
"eval_samples_per_second": 7.253, |
|
"eval_steps_per_second": 1.451, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 81.0, |
|
"learning_rate": 0.0002406, |
|
"loss": 0.0429, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 82.0, |
|
"learning_rate": 0.00024359999999999999, |
|
"loss": 0.0464, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 83.0, |
|
"learning_rate": 0.0002466, |
|
"loss": 0.0458, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"learning_rate": 0.00024959999999999994, |
|
"loss": 0.0441, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 85.0, |
|
"learning_rate": 0.00025259999999999996, |
|
"loss": 0.0421, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 86.0, |
|
"learning_rate": 0.0002556, |
|
"loss": 0.0433, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 87.0, |
|
"learning_rate": 0.0002586, |
|
"loss": 0.0444, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"learning_rate": 0.00026159999999999996, |
|
"loss": 0.0472, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 89.0, |
|
"learning_rate": 0.0002646, |
|
"loss": 0.0442, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 90.0, |
|
"learning_rate": 0.0002676, |
|
"loss": 0.0431, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 90.0, |
|
"eval_loss": 1.839582085609436, |
|
"eval_runtime": 1.3848, |
|
"eval_samples_per_second": 7.221, |
|
"eval_steps_per_second": 1.444, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 91.0, |
|
"learning_rate": 0.00027059999999999996, |
|
"loss": 0.0431, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 92.0, |
|
"learning_rate": 0.0002736, |
|
"loss": 0.044, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 93.0, |
|
"learning_rate": 0.0002766, |
|
"loss": 0.0429, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 94.0, |
|
"learning_rate": 0.00027959999999999997, |
|
"loss": 0.042, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 95.0, |
|
"learning_rate": 0.0002826, |
|
"loss": 0.0415, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"learning_rate": 0.00028559999999999995, |
|
"loss": 0.0433, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 97.0, |
|
"learning_rate": 0.00028859999999999997, |
|
"loss": 0.0405, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 98.0, |
|
"learning_rate": 0.0002916, |
|
"loss": 0.0416, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 99.0, |
|
"learning_rate": 0.00029459999999999995, |
|
"loss": 0.0378, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"learning_rate": 0.00029759999999999997, |
|
"loss": 0.0379, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"eval_loss": 1.9192386865615845, |
|
"eval_runtime": 1.3756, |
|
"eval_samples_per_second": 7.269, |
|
"eval_steps_per_second": 1.454, |
|
"step": 500 |
|
} |
|
], |
|
"max_steps": 10000, |
|
"num_train_epochs": 2000, |
|
"total_flos": 8.0956959522816e+16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|