{ "best_metric": 1.9192386865615845, "best_model_checkpoint": "experiments/checkpoint-500", "epoch": 100.0, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "learning_rate": 1.8e-06, "loss": 1.7688, "step": 5 }, { "epoch": 2.0, "learning_rate": 4.8e-06, "loss": 1.7684, "step": 10 }, { "epoch": 3.0, "learning_rate": 7.799999999999998e-06, "loss": 1.7564, "step": 15 }, { "epoch": 4.0, "learning_rate": 1.0799999999999998e-05, "loss": 1.7424, "step": 20 }, { "epoch": 5.0, "learning_rate": 1.3799999999999998e-05, "loss": 1.727, "step": 25 }, { "epoch": 6.0, "learning_rate": 1.68e-05, "loss": 1.7135, "step": 30 }, { "epoch": 7.0, "learning_rate": 1.98e-05, "loss": 1.701, "step": 35 }, { "epoch": 8.0, "learning_rate": 2.28e-05, "loss": 1.6797, "step": 40 }, { "epoch": 9.0, "learning_rate": 2.5799999999999997e-05, "loss": 1.6547, "step": 45 }, { "epoch": 10.0, "learning_rate": 2.88e-05, "loss": 1.6245, "step": 50 }, { "epoch": 10.0, "eval_loss": 1.5684312582015991, "eval_runtime": 1.4166, "eval_samples_per_second": 7.059, "eval_steps_per_second": 1.412, "step": 50 }, { "epoch": 11.0, "learning_rate": 3.1799999999999994e-05, "loss": 1.5841, "step": 55 }, { "epoch": 12.0, "learning_rate": 3.48e-05, "loss": 1.5316, "step": 60 }, { "epoch": 13.0, "learning_rate": 3.78e-05, "loss": 1.4644, "step": 65 }, { "epoch": 14.0, "learning_rate": 4.08e-05, "loss": 1.3728, "step": 70 }, { "epoch": 15.0, "learning_rate": 4.3799999999999994e-05, "loss": 1.2692, "step": 75 }, { "epoch": 16.0, "learning_rate": 4.56e-05, "loss": 1.1998, "step": 80 }, { "epoch": 17.0, "learning_rate": 4.8599999999999995e-05, "loss": 1.1159, "step": 85 }, { "epoch": 18.0, "learning_rate": 5.1599999999999994e-05, "loss": 1.0442, "step": 90 }, { "epoch": 19.0, "learning_rate": 5.459999999999999e-05, "loss": 0.9944, "step": 95 }, { "epoch": 20.0, "learning_rate": 5.76e-05, "loss": 0.9518, "step": 100 }, { "epoch": 20.0, "eval_loss": 0.9818406105041504, "eval_runtime": 1.3835, "eval_samples_per_second": 7.228, "eval_steps_per_second": 1.446, "step": 100 }, { "epoch": 21.0, "learning_rate": 6.0599999999999996e-05, "loss": 0.908, "step": 105 }, { "epoch": 22.0, "learning_rate": 6.359999999999999e-05, "loss": 0.8678, "step": 110 }, { "epoch": 23.0, "learning_rate": 6.659999999999999e-05, "loss": 0.8303, "step": 115 }, { "epoch": 24.0, "learning_rate": 6.96e-05, "loss": 0.7928, "step": 120 }, { "epoch": 25.0, "learning_rate": 7.259999999999999e-05, "loss": 0.7594, "step": 125 }, { "epoch": 26.0, "learning_rate": 7.56e-05, "loss": 0.73, "step": 130 }, { "epoch": 27.0, "learning_rate": 7.86e-05, "loss": 0.7034, "step": 135 }, { "epoch": 28.0, "learning_rate": 8.16e-05, "loss": 0.6777, "step": 140 }, { "epoch": 29.0, "learning_rate": 8.459999999999998e-05, "loss": 0.6493, "step": 145 }, { "epoch": 30.0, "learning_rate": 8.759999999999999e-05, "loss": 0.6249, "step": 150 }, { "epoch": 30.0, "eval_loss": 0.8599645495414734, "eval_runtime": 1.4099, "eval_samples_per_second": 7.093, "eval_steps_per_second": 1.419, "step": 150 }, { "epoch": 31.0, "learning_rate": 9.059999999999999e-05, "loss": 0.6007, "step": 155 }, { "epoch": 32.0, "learning_rate": 9.36e-05, "loss": 0.5716, "step": 160 }, { "epoch": 33.0, "learning_rate": 9.659999999999999e-05, "loss": 0.5465, "step": 165 }, { "epoch": 34.0, "learning_rate": 9.96e-05, "loss": 0.5191, "step": 170 }, { "epoch": 35.0, "learning_rate": 0.0001026, "loss": 0.4947, "step": 175 }, { "epoch": 36.0, "learning_rate": 0.00010559999999999998, "loss": 0.4681, "step": 180 }, { "epoch": 37.0, "learning_rate": 0.00010859999999999998, "loss": 0.4417, "step": 185 }, { "epoch": 38.0, "learning_rate": 0.00011159999999999999, "loss": 0.4116, "step": 190 }, { "epoch": 39.0, "learning_rate": 0.0001146, "loss": 0.3804, "step": 195 }, { "epoch": 40.0, "learning_rate": 0.0001176, "loss": 0.3544, "step": 200 }, { "epoch": 40.0, "eval_loss": 1.1081293821334839, "eval_runtime": 1.3764, "eval_samples_per_second": 7.265, "eval_steps_per_second": 1.453, "step": 200 }, { "epoch": 41.0, "learning_rate": 0.00012059999999999999, "loss": 0.3248, "step": 205 }, { "epoch": 42.0, "learning_rate": 0.0001236, "loss": 0.2931, "step": 210 }, { "epoch": 43.0, "learning_rate": 0.0001266, "loss": 0.2677, "step": 215 }, { "epoch": 44.0, "learning_rate": 0.00012959999999999998, "loss": 0.2386, "step": 220 }, { "epoch": 45.0, "learning_rate": 0.0001326, "loss": 0.2142, "step": 225 }, { "epoch": 46.0, "learning_rate": 0.0001356, "loss": 0.1932, "step": 230 }, { "epoch": 47.0, "learning_rate": 0.0001386, "loss": 0.1709, "step": 235 }, { "epoch": 48.0, "learning_rate": 0.00014159999999999997, "loss": 0.1571, "step": 240 }, { "epoch": 49.0, "learning_rate": 0.0001446, "loss": 0.1417, "step": 245 }, { "epoch": 50.0, "learning_rate": 0.00014759999999999998, "loss": 0.1184, "step": 250 }, { "epoch": 50.0, "eval_loss": 1.5212451219558716, "eval_runtime": 1.3827, "eval_samples_per_second": 7.232, "eval_steps_per_second": 1.446, "step": 250 }, { "epoch": 51.0, "learning_rate": 0.00015059999999999997, "loss": 0.1096, "step": 255 }, { "epoch": 52.0, "learning_rate": 0.0001536, "loss": 0.1037, "step": 260 }, { "epoch": 53.0, "learning_rate": 0.00015659999999999998, "loss": 0.095, "step": 265 }, { "epoch": 54.0, "learning_rate": 0.0001596, "loss": 0.0865, "step": 270 }, { "epoch": 55.0, "learning_rate": 0.0001626, "loss": 0.0808, "step": 275 }, { "epoch": 56.0, "learning_rate": 0.0001656, "loss": 0.0794, "step": 280 }, { "epoch": 57.0, "learning_rate": 0.0001686, "loss": 0.075, "step": 285 }, { "epoch": 58.0, "learning_rate": 0.00017159999999999997, "loss": 0.0726, "step": 290 }, { "epoch": 59.0, "learning_rate": 0.00017459999999999996, "loss": 0.0696, "step": 295 }, { "epoch": 60.0, "learning_rate": 0.00017759999999999998, "loss": 0.0665, "step": 300 }, { "epoch": 60.0, "eval_loss": 1.7048699855804443, "eval_runtime": 1.3753, "eval_samples_per_second": 7.271, "eval_steps_per_second": 1.454, "step": 300 }, { "epoch": 61.0, "learning_rate": 0.00018059999999999997, "loss": 0.065, "step": 305 }, { "epoch": 62.0, "learning_rate": 0.0001836, "loss": 0.0623, "step": 310 }, { "epoch": 63.0, "learning_rate": 0.00018659999999999998, "loss": 0.0574, "step": 315 }, { "epoch": 64.0, "learning_rate": 0.00018959999999999997, "loss": 0.0577, "step": 320 }, { "epoch": 65.0, "learning_rate": 0.0001926, "loss": 0.0597, "step": 325 }, { "epoch": 66.0, "learning_rate": 0.00019559999999999998, "loss": 0.0546, "step": 330 }, { "epoch": 67.0, "learning_rate": 0.0001986, "loss": 0.0603, "step": 335 }, { "epoch": 68.0, "learning_rate": 0.0002016, "loss": 0.0555, "step": 340 }, { "epoch": 69.0, "learning_rate": 0.00020459999999999999, "loss": 0.0551, "step": 345 }, { "epoch": 70.0, "learning_rate": 0.00020759999999999998, "loss": 0.0529, "step": 350 }, { "epoch": 70.0, "eval_loss": 1.901442527770996, "eval_runtime": 1.3785, "eval_samples_per_second": 7.254, "eval_steps_per_second": 1.451, "step": 350 }, { "epoch": 71.0, "learning_rate": 0.00021059999999999997, "loss": 0.051, "step": 355 }, { "epoch": 72.0, "learning_rate": 0.00021359999999999996, "loss": 0.05, "step": 360 }, { "epoch": 73.0, "learning_rate": 0.00021659999999999998, "loss": 0.0493, "step": 365 }, { "epoch": 74.0, "learning_rate": 0.00021959999999999997, "loss": 0.0465, "step": 370 }, { "epoch": 75.0, "learning_rate": 0.0002226, "loss": 0.0504, "step": 375 }, { "epoch": 76.0, "learning_rate": 0.00022559999999999998, "loss": 0.0491, "step": 380 }, { "epoch": 77.0, "learning_rate": 0.00022859999999999997, "loss": 0.0485, "step": 385 }, { "epoch": 78.0, "learning_rate": 0.0002316, "loss": 0.0451, "step": 390 }, { "epoch": 79.0, "learning_rate": 0.00023459999999999998, "loss": 0.0478, "step": 395 }, { "epoch": 80.0, "learning_rate": 0.0002376, "loss": 0.0435, "step": 400 }, { "epoch": 80.0, "eval_loss": 1.9840269088745117, "eval_runtime": 1.3787, "eval_samples_per_second": 7.253, "eval_steps_per_second": 1.451, "step": 400 }, { "epoch": 81.0, "learning_rate": 0.0002406, "loss": 0.0429, "step": 405 }, { "epoch": 82.0, "learning_rate": 0.00024359999999999999, "loss": 0.0464, "step": 410 }, { "epoch": 83.0, "learning_rate": 0.0002466, "loss": 0.0458, "step": 415 }, { "epoch": 84.0, "learning_rate": 0.00024959999999999994, "loss": 0.0441, "step": 420 }, { "epoch": 85.0, "learning_rate": 0.00025259999999999996, "loss": 0.0421, "step": 425 }, { "epoch": 86.0, "learning_rate": 0.0002556, "loss": 0.0433, "step": 430 }, { "epoch": 87.0, "learning_rate": 0.0002586, "loss": 0.0444, "step": 435 }, { "epoch": 88.0, "learning_rate": 0.00026159999999999996, "loss": 0.0472, "step": 440 }, { "epoch": 89.0, "learning_rate": 0.0002646, "loss": 0.0442, "step": 445 }, { "epoch": 90.0, "learning_rate": 0.0002676, "loss": 0.0431, "step": 450 }, { "epoch": 90.0, "eval_loss": 1.839582085609436, "eval_runtime": 1.3848, "eval_samples_per_second": 7.221, "eval_steps_per_second": 1.444, "step": 450 }, { "epoch": 91.0, "learning_rate": 0.00027059999999999996, "loss": 0.0431, "step": 455 }, { "epoch": 92.0, "learning_rate": 0.0002736, "loss": 0.044, "step": 460 }, { "epoch": 93.0, "learning_rate": 0.0002766, "loss": 0.0429, "step": 465 }, { "epoch": 94.0, "learning_rate": 0.00027959999999999997, "loss": 0.042, "step": 470 }, { "epoch": 95.0, "learning_rate": 0.0002826, "loss": 0.0415, "step": 475 }, { "epoch": 96.0, "learning_rate": 0.00028559999999999995, "loss": 0.0433, "step": 480 }, { "epoch": 97.0, "learning_rate": 0.00028859999999999997, "loss": 0.0405, "step": 485 }, { "epoch": 98.0, "learning_rate": 0.0002916, "loss": 0.0416, "step": 490 }, { "epoch": 99.0, "learning_rate": 0.00029459999999999995, "loss": 0.0378, "step": 495 }, { "epoch": 100.0, "learning_rate": 0.00029759999999999997, "loss": 0.0379, "step": 500 }, { "epoch": 100.0, "eval_loss": 1.9192386865615845, "eval_runtime": 1.3756, "eval_samples_per_second": 7.269, "eval_steps_per_second": 1.454, "step": 500 } ], "max_steps": 10000, "num_train_epochs": 2000, "total_flos": 8.0956959522816e+16, "trial_name": null, "trial_params": null }