{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 132, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 9989.583355258865, "learning_rate": 1.4285714285714286e-06, "loss": 29.4449, "step": 1 }, { "epoch": 0.11, "grad_norm": 707.0577616119402, "learning_rate": 7.1428571428571436e-06, "loss": 22.2193, "step": 5 }, { "epoch": 0.23, "grad_norm": 375.6314604650611, "learning_rate": 1.4285714285714287e-05, "loss": 15.1108, "step": 10 }, { "epoch": 0.34, "grad_norm": 289.1773570401075, "learning_rate": 1.999645611123453e-05, "loss": 12.8322, "step": 15 }, { "epoch": 0.45, "grad_norm": 100.15622758708129, "learning_rate": 1.9872683547213446e-05, "loss": 11.4674, "step": 20 }, { "epoch": 0.57, "grad_norm": 304.81520026489596, "learning_rate": 1.9574220383620054e-05, "loss": 10.0101, "step": 25 }, { "epoch": 0.68, "grad_norm": 371.0345686284941, "learning_rate": 1.9106347728549134e-05, "loss": 6.2458, "step": 30 }, { "epoch": 0.8, "grad_norm": 81.2827670282697, "learning_rate": 1.8477344278896708e-05, "loss": 3.0864, "step": 35 }, { "epoch": 0.91, "grad_norm": 47.894252021447926, "learning_rate": 1.7698339834299064e-05, "loss": 2.8755, "step": 40 }, { "epoch": 1.02, "grad_norm": 53.028055369579555, "learning_rate": 1.6783118362696162e-05, "loss": 2.6268, "step": 45 }, { "epoch": 1.14, "grad_norm": 50.2509028556341, "learning_rate": 1.5747874102144073e-05, "loss": 2.3685, "step": 50 }, { "epoch": 1.25, "grad_norm": 20.877545879705412, "learning_rate": 1.461092501449326e-05, "loss": 2.2533, "step": 55 }, { "epoch": 1.36, "grad_norm": 36.31764261618316, "learning_rate": 1.3392388661180303e-05, "loss": 2.1889, "step": 60 }, { "epoch": 1.48, "grad_norm": 29.627487525974026, "learning_rate": 1.2113826236296245e-05, "loss": 2.0759, "step": 65 }, { "epoch": 1.59, "grad_norm": 19.76492979476836, "learning_rate": 1.0797861055530832e-05, "loss": 2.0685, "step": 70 }, { "epoch": 1.7, "grad_norm": 8.65646635592006, "learning_rate": 9.467778251578217e-06, "loss": 2.0652, "step": 75 }, { "epoch": 1.82, "grad_norm": 21.898667968620007, "learning_rate": 8.147112759128859e-06, "loss": 2.0181, "step": 80 }, { "epoch": 1.93, "grad_norm": 7.552721457027234, "learning_rate": 6.859232879780515e-06, "loss": 2.0121, "step": 85 }, { "epoch": 2.05, "grad_norm": 26.791885422498936, "learning_rate": 5.626926795411447e-06, "loss": 1.9533, "step": 90 }, { "epoch": 2.16, "grad_norm": 13.308383024928718, "learning_rate": 4.4719993463880695e-06, "loss": 1.8816, "step": 95 }, { "epoch": 2.27, "grad_norm": 10.35688603485077, "learning_rate": 3.414886209349615e-06, "loss": 1.8907, "step": 100 }, { "epoch": 2.39, "grad_norm": 14.11012256128135, "learning_rate": 2.4742923014386154e-06, "loss": 1.8785, "step": 105 }, { "epoch": 2.5, "grad_norm": 8.840068155810476, "learning_rate": 1.6668608091748495e-06, "loss": 1.896, "step": 110 }, { "epoch": 2.61, "grad_norm": 10.535980554072063, "learning_rate": 1.0068786982878087e-06, "loss": 1.8747, "step": 115 }, { "epoch": 2.73, "grad_norm": 7.2944918703673345, "learning_rate": 5.060239153161872e-07, "loss": 1.8646, "step": 120 }, { "epoch": 2.84, "grad_norm": 6.536602991990556, "learning_rate": 1.731587540747903e-07, "loss": 1.8607, "step": 125 }, { "epoch": 2.95, "grad_norm": 8.045080822985964, "learning_rate": 1.4173043232380557e-08, "loss": 1.8633, "step": 130 }, { "epoch": 3.0, "step": 132, "total_flos": 9088687669248.0, "train_loss": 4.646637526425448, "train_runtime": 252.9729, "train_samples_per_second": 16.614, "train_steps_per_second": 0.522 } ], "logging_steps": 5, "max_steps": 132, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 9088687669248.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }