{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0625, "eval_steps": 2500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 4.094185829162598, "learning_rate": 4.000000000000001e-06, "loss": 1.8542, "step": 100 }, { "epoch": 0.01, "grad_norm": 8.345755577087402, "learning_rate": 8.000000000000001e-06, "loss": 1.4774, "step": 200 }, { "epoch": 0.01, "grad_norm": 3.6847422122955322, "learning_rate": 1.2e-05, "loss": 1.3027, "step": 300 }, { "epoch": 0.01, "grad_norm": 15.149823188781738, "learning_rate": 1.6000000000000003e-05, "loss": 1.2168, "step": 400 }, { "epoch": 0.01, "grad_norm": 9.95534896850586, "learning_rate": 2e-05, "loss": 1.1544, "step": 500 }, { "epoch": 0.01, "grad_norm": 3.96409273147583, "learning_rate": 1.98974358974359e-05, "loss": 1.134, "step": 600 }, { "epoch": 0.02, "grad_norm": 2.587010383605957, "learning_rate": 1.9794871794871798e-05, "loss": 1.1294, "step": 700 }, { "epoch": 0.02, "grad_norm": 5.926353454589844, "learning_rate": 1.9692307692307696e-05, "loss": 1.0886, "step": 800 }, { "epoch": 0.02, "grad_norm": 4.175276756286621, "learning_rate": 1.958974358974359e-05, "loss": 1.1227, "step": 900 }, { "epoch": 0.03, "grad_norm": 2.2265052795410156, "learning_rate": 1.9487179487179488e-05, "loss": 1.0694, "step": 1000 }, { "epoch": 0.03, "grad_norm": 6.808347702026367, "learning_rate": 1.9384615384615386e-05, "loss": 1.1084, "step": 1100 }, { "epoch": 0.03, "grad_norm": 2.2117719650268555, "learning_rate": 1.9282051282051284e-05, "loss": 1.0758, "step": 1200 }, { "epoch": 0.03, "grad_norm": 2.893665075302124, "learning_rate": 1.9179487179487182e-05, "loss": 1.0732, "step": 1300 }, { "epoch": 0.04, "grad_norm": 4.583731174468994, "learning_rate": 1.907692307692308e-05, "loss": 1.0345, "step": 1400 }, { "epoch": 0.04, "grad_norm": 2.2239737510681152, "learning_rate": 1.8974358974358975e-05, "loss": 1.0151, "step": 1500 }, { "epoch": 0.04, "grad_norm": 6.440332412719727, "learning_rate": 1.8871794871794873e-05, "loss": 1.0249, "step": 1600 }, { "epoch": 0.04, "grad_norm": 3.9038124084472656, "learning_rate": 1.876923076923077e-05, "loss": 1.0481, "step": 1700 }, { "epoch": 0.04, "grad_norm": 4.901433944702148, "learning_rate": 1.866666666666667e-05, "loss": 1.0383, "step": 1800 }, { "epoch": 0.05, "grad_norm": 2.6100122928619385, "learning_rate": 1.8564102564102567e-05, "loss": 0.9715, "step": 1900 }, { "epoch": 0.05, "grad_norm": 4.283998012542725, "learning_rate": 1.8461538461538465e-05, "loss": 0.9946, "step": 2000 }, { "epoch": 0.05, "grad_norm": 5.045602798461914, "learning_rate": 1.835897435897436e-05, "loss": 1.0233, "step": 2100 }, { "epoch": 0.06, "grad_norm": 3.054832935333252, "learning_rate": 1.8256410256410257e-05, "loss": 1.0177, "step": 2200 }, { "epoch": 0.06, "grad_norm": 4.251312732696533, "learning_rate": 1.8153846153846155e-05, "loss": 0.9562, "step": 2300 }, { "epoch": 0.06, "grad_norm": 2.6943576335906982, "learning_rate": 1.8051282051282053e-05, "loss": 1.0076, "step": 2400 }, { "epoch": 0.06, "grad_norm": 3.307131290435791, "learning_rate": 1.794871794871795e-05, "loss": 0.9687, "step": 2500 }, { "epoch": 0.06, "eval_loss": 0.9706119894981384, "eval_runtime": 104.0832, "eval_samples_per_second": 9.608, "eval_steps_per_second": 9.608, "step": 2500 } ], "logging_steps": 100, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2500, "total_flos": 4.025531498496e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }