{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9918458117123796, "eval_steps": 500, "global_step": 504, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05930318754633061, "grad_norm": 1.3802219854574984, "learning_rate": 5e-06, "loss": 0.7739, "step": 10 }, { "epoch": 0.11860637509266123, "grad_norm": 2.733409062302713, "learning_rate": 5e-06, "loss": 0.7072, "step": 20 }, { "epoch": 0.17790956263899185, "grad_norm": 1.3622548058844581, "learning_rate": 5e-06, "loss": 0.6883, "step": 30 }, { "epoch": 0.23721275018532245, "grad_norm": 2.1397210299903042, "learning_rate": 5e-06, "loss": 0.6653, "step": 40 }, { "epoch": 0.2965159377316531, "grad_norm": 1.1868267718380074, "learning_rate": 5e-06, "loss": 0.6578, "step": 50 }, { "epoch": 0.3558191252779837, "grad_norm": 0.768661599042398, "learning_rate": 5e-06, "loss": 0.6434, "step": 60 }, { "epoch": 0.41512231282431433, "grad_norm": 0.929517541921154, "learning_rate": 5e-06, "loss": 0.6412, "step": 70 }, { "epoch": 0.4744255003706449, "grad_norm": 1.0083217759359295, "learning_rate": 5e-06, "loss": 0.6305, "step": 80 }, { "epoch": 0.5337286879169756, "grad_norm": 0.7019920027719391, "learning_rate": 5e-06, "loss": 0.625, "step": 90 }, { "epoch": 0.5930318754633062, "grad_norm": 0.8179786383641104, "learning_rate": 5e-06, "loss": 0.6186, "step": 100 }, { "epoch": 0.6523350630096367, "grad_norm": 1.505043114613733, "learning_rate": 5e-06, "loss": 0.6109, "step": 110 }, { "epoch": 0.7116382505559674, "grad_norm": 0.652990528799698, "learning_rate": 5e-06, "loss": 0.6174, "step": 120 }, { "epoch": 0.770941438102298, "grad_norm": 0.8738270604177748, "learning_rate": 5e-06, "loss": 0.6099, "step": 130 }, { "epoch": 0.8302446256486287, "grad_norm": 0.7482311809054135, "learning_rate": 5e-06, "loss": 0.6152, "step": 140 }, { "epoch": 0.8895478131949592, "grad_norm": 0.5898841945150782, "learning_rate": 5e-06, "loss": 0.604, "step": 150 }, { "epoch": 0.9488510007412898, "grad_norm": 0.5509255300448996, "learning_rate": 5e-06, "loss": 0.6088, "step": 160 }, { "epoch": 0.9962935507783544, "eval_loss": 0.6123025417327881, "eval_runtime": 177.7519, "eval_samples_per_second": 25.558, "eval_steps_per_second": 0.399, "step": 168 }, { "epoch": 1.0096367679762788, "grad_norm": 0.5962998315100678, "learning_rate": 5e-06, "loss": 0.6323, "step": 170 }, { "epoch": 1.0689399555226093, "grad_norm": 0.6401517658215825, "learning_rate": 5e-06, "loss": 0.5587, "step": 180 }, { "epoch": 1.12824314306894, "grad_norm": 0.5789193360471795, "learning_rate": 5e-06, "loss": 0.5532, "step": 190 }, { "epoch": 1.1875463306152705, "grad_norm": 0.5007418392739746, "learning_rate": 5e-06, "loss": 0.5449, "step": 200 }, { "epoch": 1.2468495181616013, "grad_norm": 0.742099379670997, "learning_rate": 5e-06, "loss": 0.5506, "step": 210 }, { "epoch": 1.3061527057079318, "grad_norm": 0.6357145205835705, "learning_rate": 5e-06, "loss": 0.5518, "step": 220 }, { "epoch": 1.3654558932542624, "grad_norm": 0.4706954425299361, "learning_rate": 5e-06, "loss": 0.543, "step": 230 }, { "epoch": 1.424759080800593, "grad_norm": 0.6430769844169726, "learning_rate": 5e-06, "loss": 0.5563, "step": 240 }, { "epoch": 1.4840622683469236, "grad_norm": 0.714837022060758, "learning_rate": 5e-06, "loss": 0.5517, "step": 250 }, { "epoch": 1.5433654558932544, "grad_norm": 0.5623788095234329, "learning_rate": 5e-06, "loss": 0.5512, "step": 260 }, { "epoch": 1.602668643439585, "grad_norm": 0.5481074950687667, 
"learning_rate": 5e-06, "loss": 0.55, "step": 270 }, { "epoch": 1.6619718309859155, "grad_norm": 0.5539378521915824, "learning_rate": 5e-06, "loss": 0.5472, "step": 280 }, { "epoch": 1.721275018532246, "grad_norm": 0.8039356878348214, "learning_rate": 5e-06, "loss": 0.5451, "step": 290 }, { "epoch": 1.7805782060785766, "grad_norm": 0.7791331104006716, "learning_rate": 5e-06, "loss": 0.5477, "step": 300 }, { "epoch": 1.8398813936249074, "grad_norm": 0.6989192286392514, "learning_rate": 5e-06, "loss": 0.5558, "step": 310 }, { "epoch": 1.899184581171238, "grad_norm": 0.5069580060902782, "learning_rate": 5e-06, "loss": 0.5533, "step": 320 }, { "epoch": 1.9584877687175686, "grad_norm": 0.5741926432541444, "learning_rate": 5e-06, "loss": 0.5487, "step": 330 }, { "epoch": 1.9940696812453669, "eval_loss": 0.6003267765045166, "eval_runtime": 177.8259, "eval_samples_per_second": 25.547, "eval_steps_per_second": 0.399, "step": 336 }, { "epoch": 2.0192735359525575, "grad_norm": 0.9927432888051317, "learning_rate": 5e-06, "loss": 0.564, "step": 340 }, { "epoch": 2.078576723498888, "grad_norm": 0.7990001146652178, "learning_rate": 5e-06, "loss": 0.4911, "step": 350 }, { "epoch": 2.1378799110452187, "grad_norm": 0.556438381270437, "learning_rate": 5e-06, "loss": 0.4883, "step": 360 }, { "epoch": 2.1971830985915495, "grad_norm": 0.6135178834215755, "learning_rate": 5e-06, "loss": 0.488, "step": 370 }, { "epoch": 2.25648628613788, "grad_norm": 0.5081260960389352, "learning_rate": 5e-06, "loss": 0.493, "step": 380 }, { "epoch": 2.3157894736842106, "grad_norm": 0.68213842322186, "learning_rate": 5e-06, "loss": 0.4814, "step": 390 }, { "epoch": 2.375092661230541, "grad_norm": 0.5560842165539166, "learning_rate": 5e-06, "loss": 0.4947, "step": 400 }, { "epoch": 2.4343958487768718, "grad_norm": 0.561149355084152, "learning_rate": 5e-06, "loss": 0.4916, "step": 410 }, { "epoch": 2.4936990363232026, "grad_norm": 0.534652055102195, "learning_rate": 5e-06, "loss": 0.4956, "step": 420 }, { "epoch": 2.553002223869533, "grad_norm": 0.5766622241099675, "learning_rate": 5e-06, "loss": 0.494, "step": 430 }, { "epoch": 2.6123054114158637, "grad_norm": 0.5664640432180077, "learning_rate": 5e-06, "loss": 0.4973, "step": 440 }, { "epoch": 2.6716085989621945, "grad_norm": 0.5141442287518144, "learning_rate": 5e-06, "loss": 0.4998, "step": 450 }, { "epoch": 2.730911786508525, "grad_norm": 0.622575399798233, "learning_rate": 5e-06, "loss": 0.4891, "step": 460 }, { "epoch": 2.790214974054855, "grad_norm": 0.6328620684014972, "learning_rate": 5e-06, "loss": 0.5005, "step": 470 }, { "epoch": 2.849518161601186, "grad_norm": 0.5410072349219146, "learning_rate": 5e-06, "loss": 0.4894, "step": 480 }, { "epoch": 2.9088213491475168, "grad_norm": 0.6725625016553842, "learning_rate": 5e-06, "loss": 0.4939, "step": 490 }, { "epoch": 2.968124536693847, "grad_norm": 0.6468049088847707, "learning_rate": 5e-06, "loss": 0.497, "step": 500 }, { "epoch": 2.9918458117123796, "eval_loss": 0.6024672985076904, "eval_runtime": 178.6347, "eval_samples_per_second": 25.432, "eval_steps_per_second": 0.397, "step": 504 }, { "epoch": 2.9918458117123796, "step": 504, "total_flos": 844009392046080.0, "train_loss": 0.5635922295706612, "train_runtime": 29809.8263, "train_samples_per_second": 8.685, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 504, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": 
false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 844009392046080.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }