{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.925925925925926, "eval_steps": 1, "global_step": 10, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 282.9060974121094, "learning_rate": 2.5e-05, "loss": 8.6695, "step": 1 }, { "epoch": 1.0, "eval_accuracy": 0.6212121212121212, "eval_loss": 7.576000690460205, "eval_runtime": 8.5741, "eval_samples_per_second": 15.395, "eval_steps_per_second": 0.816, "step": 1 }, { "epoch": 1.1851851851851851, "grad_norm": 62.49760437011719, "learning_rate": 5e-05, "loss": 1.941, "step": 2 }, { "epoch": 1.1851851851851851, "eval_accuracy": 0.5454545454545454, "eval_loss": 1.9975844621658325, "eval_runtime": 8.5606, "eval_samples_per_second": 15.419, "eval_steps_per_second": 0.818, "step": 2 }, { "epoch": 2.0, "grad_norm": 122.50921630859375, "learning_rate": 4.375e-05, "loss": 1.3554, "step": 3 }, { "epoch": 2.0, "eval_accuracy": 0.3787878787878788, "eval_loss": 15.970643997192383, "eval_runtime": 8.5528, "eval_samples_per_second": 15.434, "eval_steps_per_second": 0.818, "step": 3 }, { "epoch": 2.3703703703703702, "grad_norm": 142.13168334960938, "learning_rate": 3.7500000000000003e-05, "loss": 4.8003, "step": 4 }, { "epoch": 2.3703703703703702, "eval_accuracy": 0.3787878787878788, "eval_loss": 8.323533058166504, "eval_runtime": 8.5446, "eval_samples_per_second": 15.448, "eval_steps_per_second": 0.819, "step": 4 }, { "epoch": 3.0, "grad_norm": 234.6129608154297, "learning_rate": 3.125e-05, "loss": 4.1677, "step": 5 }, { "epoch": 3.0, "eval_accuracy": 0.6818181818181818, "eval_loss": 1.3907140493392944, "eval_runtime": 8.5437, "eval_samples_per_second": 15.45, "eval_steps_per_second": 0.819, "step": 5 }, { "epoch": 3.5555555555555554, "grad_norm": 105.30667877197266, "learning_rate": 2.5e-05, "loss": 0.8659, "step": 6 }, { "epoch": 3.5555555555555554, "eval_accuracy": 0.6515151515151515, "eval_loss": 1.0335112810134888, "eval_runtime": 8.5605, "eval_samples_per_second": 15.42, "eval_steps_per_second": 0.818, "step": 6 }, { "epoch": 4.0, "grad_norm": 61.28445053100586, "learning_rate": 1.8750000000000002e-05, "loss": 0.441, "step": 7 }, { "epoch": 4.0, "eval_accuracy": 0.4090909090909091, "eval_loss": 2.9479410648345947, "eval_runtime": 8.5489, "eval_samples_per_second": 15.441, "eval_steps_per_second": 0.819, "step": 7 }, { "epoch": 4.7407407407407405, "grad_norm": 232.58999633789062, "learning_rate": 1.25e-05, "loss": 1.5472, "step": 8 }, { "epoch": 4.7407407407407405, "eval_accuracy": 0.4090909090909091, "eval_loss": 2.773439884185791, "eval_runtime": 8.5456, "eval_samples_per_second": 15.447, "eval_steps_per_second": 0.819, "step": 8 }, { "epoch": 5.0, "grad_norm": 73.13507843017578, "learning_rate": 6.25e-06, "loss": 0.4933, "step": 9 }, { "epoch": 5.0, "eval_accuracy": 0.5075757575757576, "eval_loss": 1.6608752012252808, "eval_runtime": 8.544, "eval_samples_per_second": 15.45, "eval_steps_per_second": 0.819, "step": 9 }, { "epoch": 5.925925925925926, "grad_norm": 179.47828674316406, "learning_rate": 0.0, "loss": 1.0334, "step": 10 }, { "epoch": 5.925925925925926, "eval_accuracy": 0.5681818181818182, "eval_loss": 1.06743323802948, "eval_runtime": 8.5485, "eval_samples_per_second": 15.441, "eval_steps_per_second": 0.819, "step": 10 }, { "epoch": 5.925925925925926, "step": 10, "total_flos": 1.4575787280384e+16, "train_loss": 2.531478080153465, "train_runtime": 897.3522, "train_samples_per_second": 5.862, "train_steps_per_second": 0.011 } ], "logging_steps": 1, "max_steps": 10, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 1.4575787280384e+16, "train_batch_size": 20, "trial_name": null, "trial_params": null }