{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9993815708101422, "eval_steps": 500, "global_step": 404, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024737167594310452, "grad_norm": 3.8928613137780697, "learning_rate": 5e-06, "loss": 0.8929, "step": 10 }, { "epoch": 0.049474335188620905, "grad_norm": 4.590030022211483, "learning_rate": 5e-06, "loss": 0.7739, "step": 20 }, { "epoch": 0.07421150278293136, "grad_norm": 2.0445731869289467, "learning_rate": 5e-06, "loss": 0.7286, "step": 30 }, { "epoch": 0.09894867037724181, "grad_norm": 1.4578350201244652, "learning_rate": 5e-06, "loss": 0.6964, "step": 40 }, { "epoch": 0.12368583797155226, "grad_norm": 1.2041164043429626, "learning_rate": 5e-06, "loss": 0.6808, "step": 50 }, { "epoch": 0.14842300556586271, "grad_norm": 0.9753852138770908, "learning_rate": 5e-06, "loss": 0.6577, "step": 60 }, { "epoch": 0.17316017316017315, "grad_norm": 0.8981690468785772, "learning_rate": 5e-06, "loss": 0.6476, "step": 70 }, { "epoch": 0.19789734075448362, "grad_norm": 0.7458796589430092, "learning_rate": 5e-06, "loss": 0.6395, "step": 80 }, { "epoch": 0.22263450834879406, "grad_norm": 0.7140456505991135, "learning_rate": 5e-06, "loss": 0.6376, "step": 90 }, { "epoch": 0.24737167594310452, "grad_norm": 0.9200299620458595, "learning_rate": 5e-06, "loss": 0.6307, "step": 100 }, { "epoch": 0.272108843537415, "grad_norm": 0.5761268046084219, "learning_rate": 5e-06, "loss": 0.6175, "step": 110 }, { "epoch": 0.29684601113172543, "grad_norm": 0.5836896660196662, "learning_rate": 5e-06, "loss": 0.6177, "step": 120 }, { "epoch": 0.32158317872603587, "grad_norm": 1.038827197557366, "learning_rate": 5e-06, "loss": 0.6132, "step": 130 }, { "epoch": 0.3463203463203463, "grad_norm": 0.6673252980412175, "learning_rate": 5e-06, "loss": 0.6091, "step": 140 }, { "epoch": 0.37105751391465674, "grad_norm": 0.6307785927320235, "learning_rate": 5e-06, "loss": 0.6046, "step": 150 }, { "epoch": 0.39579468150896724, "grad_norm": 0.5244651264271686, "learning_rate": 5e-06, "loss": 0.601, "step": 160 }, { "epoch": 0.4205318491032777, "grad_norm": 0.5705407579445089, "learning_rate": 5e-06, "loss": 0.6082, "step": 170 }, { "epoch": 0.4452690166975881, "grad_norm": 0.5554154594013059, "learning_rate": 5e-06, "loss": 0.6003, "step": 180 }, { "epoch": 0.47000618429189855, "grad_norm": 0.6019873008818303, "learning_rate": 5e-06, "loss": 0.5994, "step": 190 }, { "epoch": 0.49474335188620905, "grad_norm": 0.7034894074017951, "learning_rate": 5e-06, "loss": 0.5986, "step": 200 }, { "epoch": 0.5194805194805194, "grad_norm": 0.5638344674323469, "learning_rate": 5e-06, "loss": 0.6023, "step": 210 }, { "epoch": 0.54421768707483, "grad_norm": 0.5443793743216905, "learning_rate": 5e-06, "loss": 0.5927, "step": 220 }, { "epoch": 0.5689548546691404, "grad_norm": 0.5802674598015297, "learning_rate": 5e-06, "loss": 0.5925, "step": 230 }, { "epoch": 0.5936920222634509, "grad_norm": 0.592738891502665, "learning_rate": 5e-06, "loss": 0.5928, "step": 240 }, { "epoch": 0.6184291898577613, "grad_norm": 0.5388550762260421, "learning_rate": 5e-06, "loss": 0.5858, "step": 250 }, { "epoch": 0.6431663574520717, "grad_norm": 0.5593031272628818, "learning_rate": 5e-06, "loss": 0.5879, "step": 260 }, { "epoch": 0.6679035250463822, "grad_norm": 0.6608335560611281, "learning_rate": 5e-06, "loss": 0.5844, "step": 270 }, { "epoch": 0.6926406926406926, "grad_norm": 0.6327217733233739, "learning_rate": 5e-06, "loss": 0.5755, "step": 280 }, { "epoch": 0.717377860235003, "grad_norm": 0.5769636309953428, "learning_rate": 5e-06, "loss": 0.5843, "step": 290 }, { "epoch": 0.7421150278293135, "grad_norm": 0.5532053151787545, "learning_rate": 5e-06, "loss": 0.5864, "step": 300 }, { "epoch": 0.766852195423624, "grad_norm": 0.6400898941077486, "learning_rate": 5e-06, "loss": 0.5822, "step": 310 }, { "epoch": 0.7915893630179345, "grad_norm": 0.602629447160874, "learning_rate": 5e-06, "loss": 0.5727, "step": 320 }, { "epoch": 0.8163265306122449, "grad_norm": 0.5999318227987905, "learning_rate": 5e-06, "loss": 0.5794, "step": 330 }, { "epoch": 0.8410636982065554, "grad_norm": 0.5332757259893975, "learning_rate": 5e-06, "loss": 0.5793, "step": 340 }, { "epoch": 0.8658008658008658, "grad_norm": 0.5492421058512896, "learning_rate": 5e-06, "loss": 0.5744, "step": 350 }, { "epoch": 0.8905380333951762, "grad_norm": 0.6007108771595042, "learning_rate": 5e-06, "loss": 0.5707, "step": 360 }, { "epoch": 0.9152752009894867, "grad_norm": 0.5866994201925174, "learning_rate": 5e-06, "loss": 0.5765, "step": 370 }, { "epoch": 0.9400123685837971, "grad_norm": 0.4956333122054928, "learning_rate": 5e-06, "loss": 0.5714, "step": 380 }, { "epoch": 0.9647495361781077, "grad_norm": 0.5508918734344029, "learning_rate": 5e-06, "loss": 0.5682, "step": 390 }, { "epoch": 0.9894867037724181, "grad_norm": 0.599971023810852, "learning_rate": 5e-06, "loss": 0.5719, "step": 400 }, { "epoch": 0.9993815708101422, "eval_loss": 0.5685587525367737, "eval_runtime": 289.4782, "eval_samples_per_second": 37.626, "eval_steps_per_second": 0.591, "step": 404 } ], "logging_steps": 10, "max_steps": 1212, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 676924426813440.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }