{ "best_metric": null, "best_model_checkpoint": null, "epoch": 300.0, "global_step": 16500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 9.09, "learning_rate": 1.9393939393939395e-05, "loss": 0.0325, "step": 500 }, { "epoch": 18.18, "learning_rate": 1.8787878787878792e-05, "loss": 0.0205, "step": 1000 }, { "epoch": 18.18, "eval_loss": 0.024596277624368668, "eval_runtime": 0.6803, "eval_samples_per_second": 385.138, "eval_steps_per_second": 24.99, "step": 1000 }, { "epoch": 27.27, "learning_rate": 1.8181818181818182e-05, "loss": 0.0176, "step": 1500 }, { "epoch": 36.36, "learning_rate": 1.7575757575757576e-05, "loss": 0.0148, "step": 2000 }, { "epoch": 36.36, "eval_loss": 0.009050876833498478, "eval_runtime": 0.6792, "eval_samples_per_second": 385.757, "eval_steps_per_second": 25.03, "step": 2000 }, { "epoch": 45.45, "learning_rate": 1.6969696969696972e-05, "loss": 0.0139, "step": 2500 }, { "epoch": 54.55, "learning_rate": 1.6363636363636366e-05, "loss": 0.0131, "step": 3000 }, { "epoch": 54.55, "eval_loss": 0.08356618881225586, "eval_runtime": 0.6799, "eval_samples_per_second": 385.336, "eval_steps_per_second": 25.003, "step": 3000 }, { "epoch": 63.64, "learning_rate": 1.575757575757576e-05, "loss": 0.0123, "step": 3500 }, { "epoch": 72.73, "learning_rate": 1.5151515151515153e-05, "loss": 0.012, "step": 4000 }, { "epoch": 72.73, "eval_loss": 0.0886448472738266, "eval_runtime": 0.6927, "eval_samples_per_second": 378.228, "eval_steps_per_second": 24.542, "step": 4000 }, { "epoch": 81.82, "learning_rate": 1.4545454545454546e-05, "loss": 0.0114, "step": 4500 }, { "epoch": 90.91, "learning_rate": 1.3939393939393942e-05, "loss": 0.011, "step": 5000 }, { "epoch": 90.91, "eval_loss": 0.0763353630900383, "eval_runtime": 0.678, "eval_samples_per_second": 386.405, "eval_steps_per_second": 25.072, "step": 5000 }, { "epoch": 100.0, "learning_rate": 1.3333333333333333e-05, "loss": 0.0105, "step": 5500 }, { "epoch": 109.09, "learning_rate": 1.2727272727272728e-05, "loss": 0.0111, "step": 6000 }, { "epoch": 109.09, "eval_loss": 0.10684655606746674, "eval_runtime": 0.6792, "eval_samples_per_second": 385.762, "eval_steps_per_second": 25.03, "step": 6000 }, { "epoch": 118.18, "learning_rate": 1.2121212121212122e-05, "loss": 0.0104, "step": 6500 }, { "epoch": 127.27, "learning_rate": 1.1515151515151517e-05, "loss": 0.0104, "step": 7000 }, { "epoch": 127.27, "eval_loss": 0.0989227443933487, "eval_runtime": 0.6773, "eval_samples_per_second": 386.802, "eval_steps_per_second": 25.098, "step": 7000 }, { "epoch": 136.36, "learning_rate": 1.0909090909090909e-05, "loss": 0.0097, "step": 7500 }, { "epoch": 145.45, "learning_rate": 1.0303030303030304e-05, "loss": 0.0106, "step": 8000 }, { "epoch": 145.45, "eval_loss": 0.09557081758975983, "eval_runtime": 0.7034, "eval_samples_per_second": 372.496, "eval_steps_per_second": 24.17, "step": 8000 }, { "epoch": 154.55, "learning_rate": 9.696969696969698e-06, "loss": 0.0099, "step": 8500 }, { "epoch": 163.64, "learning_rate": 9.090909090909091e-06, "loss": 0.0097, "step": 9000 }, { "epoch": 163.64, "eval_loss": 0.07463082671165466, "eval_runtime": 0.6685, "eval_samples_per_second": 391.9, "eval_steps_per_second": 25.429, "step": 9000 }, { "epoch": 172.73, "learning_rate": 8.484848484848486e-06, "loss": 0.0111, "step": 9500 }, { "epoch": 181.82, "learning_rate": 7.87878787878788e-06, "loss": 0.0097, "step": 10000 }, { "epoch": 181.82, "eval_loss": 0.0682496652007103, "eval_runtime": 0.6795, "eval_samples_per_second": 385.591, "eval_steps_per_second": 25.019, "step": 10000 }, { "epoch": 190.91, "learning_rate": 7.272727272727273e-06, "loss": 0.0098, "step": 10500 }, { "epoch": 200.0, "learning_rate": 6.666666666666667e-06, "loss": 0.0095, "step": 11000 }, { "epoch": 200.0, "eval_loss": 0.04372559115290642, "eval_runtime": 0.7607, "eval_samples_per_second": 344.421, "eval_steps_per_second": 22.348, "step": 11000 }, { "epoch": 209.09, "learning_rate": 6.060606060606061e-06, "loss": 0.0096, "step": 11500 }, { "epoch": 218.18, "learning_rate": 5.4545454545454545e-06, "loss": 0.0095, "step": 12000 }, { "epoch": 218.18, "eval_loss": 0.0709039568901062, "eval_runtime": 0.7061, "eval_samples_per_second": 371.075, "eval_steps_per_second": 24.077, "step": 12000 }, { "epoch": 227.27, "learning_rate": 4.848484848484849e-06, "loss": 0.0092, "step": 12500 }, { "epoch": 236.36, "learning_rate": 4.242424242424243e-06, "loss": 0.0093, "step": 13000 }, { "epoch": 236.36, "eval_loss": 0.06207394599914551, "eval_runtime": 0.7614, "eval_samples_per_second": 344.108, "eval_steps_per_second": 22.328, "step": 13000 }, { "epoch": 245.45, "learning_rate": 3.6363636363636366e-06, "loss": 0.0092, "step": 13500 }, { "epoch": 254.55, "learning_rate": 3.0303030303030305e-06, "loss": 0.0095, "step": 14000 }, { "epoch": 254.55, "eval_loss": 0.0763927549123764, "eval_runtime": 0.7074, "eval_samples_per_second": 370.363, "eval_steps_per_second": 24.031, "step": 14000 }, { "epoch": 263.64, "learning_rate": 2.4242424242424244e-06, "loss": 0.0094, "step": 14500 }, { "epoch": 272.73, "learning_rate": 1.8181818181818183e-06, "loss": 0.0095, "step": 15000 }, { "epoch": 272.73, "eval_loss": 0.0721823051571846, "eval_runtime": 0.6582, "eval_samples_per_second": 398.054, "eval_steps_per_second": 25.828, "step": 15000 }, { "epoch": 281.82, "learning_rate": 1.2121212121212122e-06, "loss": 0.0091, "step": 15500 }, { "epoch": 290.91, "learning_rate": 6.060606060606061e-07, "loss": 0.0092, "step": 16000 }, { "epoch": 290.91, "eval_loss": 0.05487630143761635, "eval_runtime": 0.715, "eval_samples_per_second": 366.453, "eval_steps_per_second": 23.777, "step": 16000 }, { "epoch": 300.0, "learning_rate": 0.0, "loss": 0.0094, "step": 16500 } ], "max_steps": 16500, "num_train_epochs": 300, "total_flos": 9.41486238830592e+16, "trial_name": null, "trial_params": null }