{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990205680705191, "eval_steps": 500, "global_step": 510, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.039177277179236046, "grad_norm": 0.5212510228157043, "learning_rate": 0.00098, "loss": 1.6695, "step": 20 }, { "epoch": 0.07835455435847209, "grad_norm": 0.46687546372413635, "learning_rate": 0.00094, "loss": 1.4897, "step": 40 }, { "epoch": 0.11753183153770813, "grad_norm": 0.3956817388534546, "learning_rate": 0.0009000000000000001, "loss": 1.1361, "step": 60 }, { "epoch": 0.15670910871694418, "grad_norm": 0.6187575459480286, "learning_rate": 0.00086, "loss": 1.1006, "step": 80 }, { "epoch": 0.1958863858961802, "grad_norm": 0.5952175259590149, "learning_rate": 0.00082, "loss": 1.1829, "step": 100 }, { "epoch": 0.23506366307541626, "grad_norm": 0.5533424019813538, "learning_rate": 0.0007800000000000001, "loss": 1.1187, "step": 120 }, { "epoch": 0.2742409402546523, "grad_norm": 0.49875837564468384, "learning_rate": 0.00074, "loss": 1.1533, "step": 140 }, { "epoch": 0.31341821743388837, "grad_norm": 0.3221249580383301, "learning_rate": 0.0007, "loss": 1.0684, "step": 160 }, { "epoch": 0.3525954946131244, "grad_norm": 0.4444270730018616, "learning_rate": 0.00066, "loss": 0.9958, "step": 180 }, { "epoch": 0.3917727717923604, "grad_norm": 0.3522554337978363, "learning_rate": 0.00062, "loss": 1.0248, "step": 200 }, { "epoch": 0.4309500489715965, "grad_norm": 0.5420783162117004, "learning_rate": 0.00058, "loss": 0.8882, "step": 220 }, { "epoch": 0.4701273261508325, "grad_norm": 0.390200674533844, "learning_rate": 0.00054, "loss": 1.057, "step": 240 }, { "epoch": 0.5093046033300686, "grad_norm": 0.29798322916030884, "learning_rate": 0.0005, "loss": 0.9993, "step": 260 }, { "epoch": 0.5484818805093046, "grad_norm": 0.32027778029441833, "learning_rate": 0.00046, "loss": 31543.7594, "step": 280 }, { "epoch": 0.5876591576885406, "grad_norm": 0.3356141746044159, "learning_rate": 0.00042, "loss": 1.0387, "step": 300 }, { "epoch": 0.6268364348677767, "grad_norm": 0.4466319680213928, "learning_rate": 0.00038, "loss": 1.0035, "step": 320 }, { "epoch": 0.6660137120470128, "grad_norm": 0.34509071707725525, "learning_rate": 0.00034, "loss": 1.079, "step": 340 }, { "epoch": 0.7051909892262488, "grad_norm": 0.3388811945915222, "learning_rate": 0.0003, "loss": 0.8813, "step": 360 }, { "epoch": 0.7443682664054848, "grad_norm": 0.42653968930244446, "learning_rate": 0.00026000000000000003, "loss": 0.9738, "step": 380 }, { "epoch": 0.7835455435847208, "grad_norm": 0.41174277663230896, "learning_rate": 0.00022, "loss": 1.0102, "step": 400 }, { "epoch": 0.8227228207639569, "grad_norm": 0.2753573954105377, "learning_rate": 0.00017999999999999998, "loss": 0.9923, "step": 420 }, { "epoch": 0.861900097943193, "grad_norm": 0.4338860809803009, "learning_rate": 0.00014000000000000001, "loss": 0.9225, "step": 440 }, { "epoch": 0.901077375122429, "grad_norm": 0.3668128252029419, "learning_rate": 0.0001, "loss": 1.0012, "step": 460 }, { "epoch": 0.940254652301665, "grad_norm": 0.20731355249881744, "learning_rate": 6e-05, "loss": 0.9247, "step": 480 }, { "epoch": 0.9794319294809011, "grad_norm": 0.2774331867694855, "learning_rate": 2e-05, "loss": 1.0422, "step": 500 }, { "epoch": 0.9990205680705191, "step": 510, "total_flos": 468651156894720.0, "train_loss": 1238.0412948421404, "train_runtime": 10118.6641, "train_samples_per_second": 0.101, "train_steps_per_second": 0.05 } ], "logging_steps": 20, "max_steps": 510, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 468651156894720.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }