{ "best_metric": 0.3561370372772217, "best_model_checkpoint": "results/checkpoint-8500", "epoch": 0.8119209093514185, "eval_steps": 500, "global_step": 8500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04776005349125991, "grad_norm": 6.31744909286499, "learning_rate": 4.8805998662718505e-05, "loss": 0.4525, "step": 500 }, { "epoch": 0.04776005349125991, "eval_loss": 0.38868293166160583, "eval_runtime": 105.2805, "eval_samples_per_second": 248.393, "eval_steps_per_second": 3.885, "step": 500 }, { "epoch": 0.09552010698251982, "grad_norm": 8.537174224853516, "learning_rate": 4.761199732543701e-05, "loss": 0.4516, "step": 1000 }, { "epoch": 0.09552010698251982, "eval_loss": 0.3857233226299286, "eval_runtime": 110.5179, "eval_samples_per_second": 236.622, "eval_steps_per_second": 3.701, "step": 1000 }, { "epoch": 0.14328016047377973, "grad_norm": 3.55123233795166, "learning_rate": 4.641799598815551e-05, "loss": 0.4412, "step": 1500 }, { "epoch": 0.14328016047377973, "eval_loss": 0.3843745291233063, "eval_runtime": 108.5384, "eval_samples_per_second": 240.938, "eval_steps_per_second": 3.768, "step": 1500 }, { "epoch": 0.19104021396503965, "grad_norm": 4.456860065460205, "learning_rate": 4.522399465087401e-05, "loss": 0.4411, "step": 2000 }, { "epoch": 0.19104021396503965, "eval_loss": 0.3724469542503357, "eval_runtime": 104.9479, "eval_samples_per_second": 249.181, "eval_steps_per_second": 3.897, "step": 2000 }, { "epoch": 0.23880026745629956, "grad_norm": 8.174205780029297, "learning_rate": 4.402999331359251e-05, "loss": 0.4269, "step": 2500 }, { "epoch": 0.23880026745629956, "eval_loss": 0.3822018504142761, "eval_runtime": 109.3926, "eval_samples_per_second": 239.056, "eval_steps_per_second": 3.739, "step": 2500 }, { "epoch": 0.28656032094755945, "grad_norm": 8.617950439453125, "learning_rate": 4.2835991976311015e-05, "loss": 0.4417, "step": 3000 }, { "epoch": 0.28656032094755945, "eval_loss": 0.37791815400123596, "eval_runtime": 110.9874, "eval_samples_per_second": 235.621, "eval_steps_per_second": 3.685, "step": 3000 }, { "epoch": 0.33432037443881935, "grad_norm": 4.857789993286133, "learning_rate": 4.164199063902952e-05, "loss": 0.4324, "step": 3500 }, { "epoch": 0.33432037443881935, "eval_loss": 0.37730905413627625, "eval_runtime": 110.4008, "eval_samples_per_second": 236.873, "eval_steps_per_second": 3.705, "step": 3500 }, { "epoch": 0.3820804279300793, "grad_norm": 4.581517219543457, "learning_rate": 4.044798930174802e-05, "loss": 0.4184, "step": 4000 }, { "epoch": 0.3820804279300793, "eval_loss": 0.3715578615665436, "eval_runtime": 111.0466, "eval_samples_per_second": 235.496, "eval_steps_per_second": 3.683, "step": 4000 }, { "epoch": 0.4298404814213392, "grad_norm": 7.005139350891113, "learning_rate": 3.925398796446652e-05, "loss": 0.4135, "step": 4500 }, { "epoch": 0.4298404814213392, "eval_loss": 0.3852500319480896, "eval_runtime": 110.8005, "eval_samples_per_second": 236.019, "eval_steps_per_second": 3.691, "step": 4500 }, { "epoch": 0.47760053491259913, "grad_norm": 7.208944797515869, "learning_rate": 3.8059986627185024e-05, "loss": 0.4172, "step": 5000 }, { "epoch": 0.47760053491259913, "eval_loss": 0.3664040267467499, "eval_runtime": 110.6088, "eval_samples_per_second": 236.428, "eval_steps_per_second": 3.698, "step": 5000 }, { "epoch": 0.525360588403859, "grad_norm": 4.011592864990234, "learning_rate": 3.6865985289903526e-05, "loss": 0.4164, "step": 5500 }, { "epoch": 0.525360588403859, "eval_loss": 0.36664679646492004, "eval_runtime": 108.0591, "eval_samples_per_second": 242.006, "eval_steps_per_second": 3.785, "step": 5500 }, { "epoch": 0.5731206418951189, "grad_norm": 3.7559821605682373, "learning_rate": 3.567198395262203e-05, "loss": 0.4124, "step": 6000 }, { "epoch": 0.5731206418951189, "eval_loss": 0.36337631940841675, "eval_runtime": 105.397, "eval_samples_per_second": 248.119, "eval_steps_per_second": 3.881, "step": 6000 }, { "epoch": 0.6208806953863788, "grad_norm": 4.904799938201904, "learning_rate": 3.447798261534053e-05, "loss": 0.4133, "step": 6500 }, { "epoch": 0.6208806953863788, "eval_loss": 0.3614007234573364, "eval_runtime": 110.9753, "eval_samples_per_second": 235.647, "eval_steps_per_second": 3.686, "step": 6500 }, { "epoch": 0.6686407488776387, "grad_norm": 8.097270011901855, "learning_rate": 3.328398127805903e-05, "loss": 0.4132, "step": 7000 }, { "epoch": 0.6686407488776387, "eval_loss": 0.360387921333313, "eval_runtime": 110.5513, "eval_samples_per_second": 236.551, "eval_steps_per_second": 3.7, "step": 7000 }, { "epoch": 0.7164008023688987, "grad_norm": 5.6349968910217285, "learning_rate": 3.2089979940777535e-05, "loss": 0.3994, "step": 7500 }, { "epoch": 0.7164008023688987, "eval_loss": 0.3582770824432373, "eval_runtime": 109.3558, "eval_samples_per_second": 239.137, "eval_steps_per_second": 3.74, "step": 7500 }, { "epoch": 0.7641608558601586, "grad_norm": 5.345180988311768, "learning_rate": 3.089597860349604e-05, "loss": 0.4032, "step": 8000 }, { "epoch": 0.7641608558601586, "eval_loss": 0.3689400851726532, "eval_runtime": 110.4919, "eval_samples_per_second": 236.678, "eval_steps_per_second": 3.702, "step": 8000 }, { "epoch": 0.8119209093514185, "grad_norm": 3.314952850341797, "learning_rate": 2.9701977266214542e-05, "loss": 0.4101, "step": 8500 }, { "epoch": 0.8119209093514185, "eval_loss": 0.3561370372772217, "eval_runtime": 112.2203, "eval_samples_per_second": 233.033, "eval_steps_per_second": 3.645, "step": 8500 } ], "logging_steps": 500, "max_steps": 20938, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.43163257192448e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }