{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1, "grad_norm": 2.4489102363586426, "learning_rate": 2e-05, "loss": 1.8047, "step": 10 }, { "epoch": 0.2, "grad_norm": 1.8422876596450806, "learning_rate": 1.9396926207859085e-05, "loss": 1.3832, "step": 20 }, { "epoch": 0.3, "grad_norm": 3.1532328128814697, "learning_rate": 1.766044443118978e-05, "loss": 1.1404, "step": 30 }, { "epoch": 0.4, "grad_norm": 1.9095581769943237, "learning_rate": 1.5000000000000002e-05, "loss": 1.068, "step": 40 }, { "epoch": 0.5, "grad_norm": 2.4410550594329834, "learning_rate": 1.1736481776669307e-05, "loss": 1.014, "step": 50 }, { "epoch": 0.6, "grad_norm": 2.7899439334869385, "learning_rate": 8.263518223330698e-06, "loss": 1.1168, "step": 60 }, { "epoch": 0.7, "grad_norm": 2.6681783199310303, "learning_rate": 5.000000000000003e-06, "loss": 0.9987, "step": 70 }, { "epoch": 0.8, "grad_norm": 2.2320525646209717, "learning_rate": 2.339555568810221e-06, "loss": 1.198, "step": 80 }, { "epoch": 0.9, "grad_norm": 1.1953657865524292, "learning_rate": 6.030737921409169e-07, "loss": 1.0336, "step": 90 }, { "epoch": 1.0, "grad_norm": 2.67486572265625, "learning_rate": 0.0, "loss": 1.0806, "step": 100 }, { "epoch": 2.2, "grad_norm": 1.0614951848983765, "learning_rate": 1.913545457642601e-05, "loss": 0.9586, "step": 110 }, { "epoch": 2.4, "grad_norm": 1.1396006345748901, "learning_rate": 1.8829475928589272e-05, "loss": 0.9822, "step": 120 }, { "epoch": 2.6, "grad_norm": 1.8257782459259033, "learning_rate": 1.848048096156426e-05, "loss": 0.9798, "step": 130 }, { "epoch": 2.8, "grad_norm": 3.6675124168395996, "learning_rate": 1.8090169943749477e-05, "loss": 0.9642, "step": 140 }, { "epoch": 3.0, "grad_norm": 2.7332773208618164, "learning_rate": 1.766044443118978e-05, "loss": 1.0019, "step": 150 }, { "epoch": 3.2, "grad_norm": 1.2114962339401245, "learning_rate": 1.7193398003386514e-05, "loss": 0.8493, "step": 160 }, { "epoch": 3.4, "grad_norm": 5.460783004760742, "learning_rate": 1.6691306063588583e-05, "loss": 0.8319, "step": 170 }, { "epoch": 3.6, "grad_norm": 3.551159620285034, "learning_rate": 1.6156614753256583e-05, "loss": 0.7986, "step": 180 }, { "epoch": 3.8, "grad_norm": 4.315650463104248, "learning_rate": 1.5591929034707468e-05, "loss": 0.7908, "step": 190 }, { "epoch": 4.0, "grad_norm": 5.731149673461914, "learning_rate": 1.5000000000000002e-05, "loss": 0.8207, "step": 200 }, { "epoch": 4.2, "grad_norm": 2.3099749088287354, "learning_rate": 1.4383711467890776e-05, "loss": 0.8045, "step": 210 }, { "epoch": 4.4, "grad_norm": 5.100424766540527, "learning_rate": 1.3746065934159123e-05, "loss": 0.6337, "step": 220 }, { "epoch": 4.6, "grad_norm": 4.819253444671631, "learning_rate": 1.3090169943749475e-05, "loss": 0.5244, "step": 230 }, { "epoch": 4.8, "grad_norm": 5.7660675048828125, "learning_rate": 1.2419218955996677e-05, "loss": 0.5807, "step": 240 }, { "epoch": 5.0, "grad_norm": 3.9891021251678467, "learning_rate": 1.1736481776669307e-05, "loss": 0.6353, "step": 250 }, { "epoch": 5.2, "grad_norm": 3.6858391761779785, "learning_rate": 1.1045284632676535e-05, "loss": 0.4776, "step": 260 }, { "epoch": 5.4, "grad_norm": 9.160813331604004, "learning_rate": 1.0348994967025012e-05, "loss": 0.323, "step": 270 }, { "epoch": 5.6, "grad_norm": 7.0403313636779785, "learning_rate": 9.651005032974994e-06, "loss": 0.4799, "step": 280 }, { "epoch": 5.8, "grad_norm": 7.419447898864746, "learning_rate": 8.954715367323468e-06, "loss": 0.5816, "step": 290 }, { "epoch": 6.0, "grad_norm": 3.421180009841919, "learning_rate": 8.263518223330698e-06, "loss": 0.4278, "step": 300 }, { "epoch": 6.2, "grad_norm": 4.215754985809326, "learning_rate": 7.580781044003324e-06, "loss": 0.3721, "step": 310 }, { "epoch": 6.4, "grad_norm": 6.3047308921813965, "learning_rate": 6.909830056250527e-06, "loss": 0.2973, "step": 320 }, { "epoch": 6.6, "grad_norm": 14.362757682800293, "learning_rate": 6.25393406584088e-06, "loss": 0.2589, "step": 330 }, { "epoch": 6.8, "grad_norm": 7.351701736450195, "learning_rate": 5.616288532109225e-06, "loss": 0.2718, "step": 340 }, { "epoch": 7.0, "grad_norm": 8.06631851196289, "learning_rate": 5.000000000000003e-06, "loss": 0.3159, "step": 350 }, { "epoch": 7.2, "grad_norm": 5.070101261138916, "learning_rate": 4.408070965292534e-06, "loss": 0.2075, "step": 360 }, { "epoch": 7.4, "grad_norm": 7.613597869873047, "learning_rate": 3.8433852467434175e-06, "loss": 0.1164, "step": 370 }, { "epoch": 7.6, "grad_norm": 6.611764907836914, "learning_rate": 3.308693936411421e-06, "loss": 0.2217, "step": 380 }, { "epoch": 7.8, "grad_norm": 12.279210090637207, "learning_rate": 2.8066019966134907e-06, "loss": 0.1475, "step": 390 }, { "epoch": 8.0, "grad_norm": 14.093499183654785, "learning_rate": 2.339555568810221e-06, "loss": 0.2752, "step": 400 }, { "epoch": 8.2, "grad_norm": 2.3949663639068604, "learning_rate": 1.9098300562505266e-06, "loss": 0.1019, "step": 410 }, { "epoch": 8.4, "grad_norm": 3.969956159591675, "learning_rate": 1.5195190384357405e-06, "loss": 0.1754, "step": 420 }, { "epoch": 8.6, "grad_norm": 8.477439880371094, "learning_rate": 1.1705240714107301e-06, "loss": 0.1174, "step": 430 }, { "epoch": 8.8, "grad_norm": 2.671515464782715, "learning_rate": 8.645454235739903e-07, "loss": 0.1461, "step": 440 }, { "epoch": 9.0, "grad_norm": 0.29233822226524353, "learning_rate": 6.030737921409169e-07, "loss": 0.1105, "step": 450 }, { "epoch": 9.2, "grad_norm": 3.0615086555480957, "learning_rate": 3.8738304061681107e-07, "loss": 0.1009, "step": 460 }, { "epoch": 9.4, "grad_norm": 1.1887454986572266, "learning_rate": 2.1852399266194312e-07, "loss": 0.1134, "step": 470 }, { "epoch": 9.6, "grad_norm": 13.041033744812012, "learning_rate": 9.731931258429638e-08, "loss": 0.1092, "step": 480 }, { "epoch": 9.8, "grad_norm": 5.9538798332214355, "learning_rate": 2.4359497401758026e-08, "loss": 0.0799, "step": 490 }, { "epoch": 10.0, "grad_norm": 3.9380784034729004, "learning_rate": 0.0, "loss": 0.1335, "step": 500 }, { "epoch": 10.0, "step": 500, "total_flos": 1.1283387467931648e+16, "train_loss": 0.3623817192316055, "train_runtime": 5832.4394, "train_samples_per_second": 0.171, "train_steps_per_second": 0.086 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1283387467931648e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }