{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.171700033932813, "eval_steps": 100, "global_step": 2400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09048750141386722, "eval_accuracy": 0.16714730898859703, "eval_loss": 4.686699867248535, "eval_runtime": 125.2427, "eval_samples_per_second": 7.218, "eval_steps_per_second": 7.218, "step": 100 }, { "epoch": 0.18097500282773443, "eval_accuracy": 0.25826880194607743, "eval_loss": 3.9017727375030518, "eval_runtime": 125.5606, "eval_samples_per_second": 7.2, "eval_steps_per_second": 7.2, "step": 200 }, { "epoch": 0.2714625042416016, "eval_accuracy": 0.29766981254599767, "eval_loss": 3.5929646492004395, "eval_runtime": 125.5573, "eval_samples_per_second": 7.2, "eval_steps_per_second": 7.2, "step": 300 }, { "epoch": 0.36195000565546886, "eval_accuracy": 0.3236774609630093, "eval_loss": 3.4225211143493652, "eval_runtime": 125.7379, "eval_samples_per_second": 7.19, "eval_steps_per_second": 7.19, "step": 400 }, { "epoch": 0.45243750706933605, "grad_norm": 4.40625, "learning_rate": 4.7737556561085976e-05, "loss": 4.0632, "step": 500 }, { "epoch": 0.45243750706933605, "eval_accuracy": 0.3405748527595257, "eval_loss": 3.307744026184082, "eval_runtime": 125.5713, "eval_samples_per_second": 7.199, "eval_steps_per_second": 7.199, "step": 500 }, { "epoch": 0.5429250084832032, "eval_accuracy": 0.354706238577747, "eval_loss": 3.21340012550354, "eval_runtime": 125.6117, "eval_samples_per_second": 7.197, "eval_steps_per_second": 7.197, "step": 600 }, { "epoch": 0.6334125098970704, "eval_accuracy": 0.3676221206998826, "eval_loss": 3.127941608428955, "eval_runtime": 125.646, "eval_samples_per_second": 7.195, "eval_steps_per_second": 7.195, "step": 700 }, { "epoch": 0.7239000113109377, "eval_accuracy": 0.377770033996102, "eval_loss": 3.0699830055236816, "eval_runtime": 125.238, "eval_samples_per_second": 7.218, "eval_steps_per_second": 7.218, "step": 800 }, { "epoch": 0.8143875127248049, "eval_accuracy": 0.3878323087639568, "eval_loss": 2.992367744445801, "eval_runtime": 126.0865, "eval_samples_per_second": 7.17, "eval_steps_per_second": 7.17, "step": 900 }, { "epoch": 0.9048750141386721, "grad_norm": 5.15625, "learning_rate": 4.547511312217195e-05, "loss": 3.0582, "step": 1000 }, { "epoch": 0.9048750141386721, "eval_accuracy": 0.3950470582191688, "eval_loss": 2.9669389724731445, "eval_runtime": 125.9784, "eval_samples_per_second": 7.176, "eval_steps_per_second": 7.176, "step": 1000 }, { "epoch": 0.9953625155525393, "eval_accuracy": 0.4000391717207191, "eval_loss": 2.936887264251709, "eval_runtime": 125.9788, "eval_samples_per_second": 7.176, "eval_steps_per_second": 7.176, "step": 1100 }, { "epoch": 1.0858500169664065, "eval_accuracy": 0.40566369384035433, "eval_loss": 2.889920234680176, "eval_runtime": 126.1226, "eval_samples_per_second": 7.168, "eval_steps_per_second": 7.168, "step": 1200 }, { "epoch": 1.1763375183802738, "eval_accuracy": 0.41152411235050507, "eval_loss": 2.855320930480957, "eval_runtime": 125.9479, "eval_samples_per_second": 7.178, "eval_steps_per_second": 7.178, "step": 1300 }, { "epoch": 1.2668250197941409, "eval_accuracy": 0.4195339922877197, "eval_loss": 2.8255977630615234, "eval_runtime": 125.4794, "eval_samples_per_second": 7.204, "eval_steps_per_second": 7.204, "step": 1400 }, { "epoch": 1.3573125212080082, "grad_norm": 4.5625, "learning_rate": 4.321266968325792e-05, "loss": 2.7942, "step": 1500 }, { "epoch": 1.3573125212080082, "eval_accuracy": 0.4255361411350747, "eval_loss": 2.780564308166504, "eval_runtime": 125.3637, "eval_samples_per_second": 7.211, "eval_steps_per_second": 7.211, "step": 1500 }, { "epoch": 1.4478000226218755, "eval_accuracy": 0.4327748821383073, "eval_loss": 2.7515323162078857, "eval_runtime": 125.451, "eval_samples_per_second": 7.206, "eval_steps_per_second": 7.206, "step": 1600 }, { "epoch": 1.5382875240357425, "eval_accuracy": 0.4364924338825894, "eval_loss": 2.731539726257324, "eval_runtime": 125.5016, "eval_samples_per_second": 7.203, "eval_steps_per_second": 7.203, "step": 1700 }, { "epoch": 1.6287750254496096, "eval_accuracy": 0.44307129828565556, "eval_loss": 2.693981170654297, "eval_runtime": 125.3005, "eval_samples_per_second": 7.215, "eval_steps_per_second": 7.215, "step": 1800 }, { "epoch": 1.719262526863477, "eval_accuracy": 0.4504961080016023, "eval_loss": 2.660767078399658, "eval_runtime": 125.3064, "eval_samples_per_second": 7.214, "eval_steps_per_second": 7.214, "step": 1900 }, { "epoch": 1.8097500282773442, "grad_norm": 4.375, "learning_rate": 4.095022624434389e-05, "loss": 2.6245, "step": 2000 }, { "epoch": 1.8097500282773442, "eval_accuracy": 0.4550176169447281, "eval_loss": 2.637939929962158, "eval_runtime": 125.4251, "eval_samples_per_second": 7.207, "eval_steps_per_second": 7.207, "step": 2000 }, { "epoch": 1.9002375296912115, "eval_accuracy": 0.4604336970231151, "eval_loss": 2.624295949935913, "eval_runtime": 125.5212, "eval_samples_per_second": 7.202, "eval_steps_per_second": 7.202, "step": 2100 }, { "epoch": 1.9907250311050786, "eval_accuracy": 0.4636457964607748, "eval_loss": 2.602674722671509, "eval_runtime": 125.5023, "eval_samples_per_second": 7.203, "eval_steps_per_second": 7.203, "step": 2200 }, { "epoch": 2.0812125325189457, "eval_accuracy": 0.4712341921958477, "eval_loss": 2.5688230991363525, "eval_runtime": 125.5191, "eval_samples_per_second": 7.202, "eval_steps_per_second": 7.202, "step": 2300 }, { "epoch": 2.171700033932813, "eval_accuracy": 0.47027530058798783, "eval_loss": 2.567828416824341, "eval_runtime": 125.4716, "eval_samples_per_second": 7.205, "eval_steps_per_second": 7.205, "step": 2400 } ], "logging_steps": 500, "max_steps": 11050, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 200, "total_flos": 1.7805314132803584e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }