{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07, "grad_norm": 0.625, "learning_rate": 7.4074074074074075e-06, "loss": 0.4546, "step": 20 }, { "epoch": 0.13, "grad_norm": 0.41796875, "learning_rate": 9.994529604032901e-06, "loss": 0.2781, "step": 40 }, { "epoch": 0.2, "grad_norm": 0.38671875, "learning_rate": 9.964784918620284e-06, "loss": 0.2492, "step": 60 }, { "epoch": 0.27, "grad_norm": 0.3984375, "learning_rate": 9.909333658705933e-06, "loss": 0.2379, "step": 80 }, { "epoch": 0.33, "grad_norm": 0.400390625, "learning_rate": 9.82846293883108e-06, "loss": 0.2301, "step": 100 }, { "epoch": 0.4, "grad_norm": 0.37109375, "learning_rate": 9.722591489961829e-06, "loss": 0.2255, "step": 120 }, { "epoch": 0.47, "grad_norm": 0.388671875, "learning_rate": 9.59226749139145e-06, "loss": 0.2223, "step": 140 }, { "epoch": 0.53, "grad_norm": 0.76171875, "learning_rate": 9.438165732387471e-06, "loss": 0.2171, "step": 160 }, { "epoch": 0.6, "grad_norm": 0.39453125, "learning_rate": 9.261084118279846e-06, "loss": 0.2154, "step": 180 }, { "epoch": 0.67, "grad_norm": 0.41015625, "learning_rate": 9.06193953908105e-06, "loss": 0.2126, "step": 200 }, { "epoch": 0.73, "grad_norm": 0.37109375, "learning_rate": 8.84176312202936e-06, "loss": 0.2143, "step": 220 }, { "epoch": 0.8, "grad_norm": 0.37890625, "learning_rate": 8.601694892636701e-06, "loss": 0.2082, "step": 240 }, { "epoch": 0.87, "grad_norm": 0.369140625, "learning_rate": 8.34297787188496e-06, "loss": 0.2072, "step": 260 }, { "epoch": 0.93, "grad_norm": 0.404296875, "learning_rate": 8.066951640134183e-06, "loss": 0.2069, "step": 280 }, { "epoch": 1.0, "grad_norm": 0.404296875, "learning_rate": 7.77504540106735e-06, "loss": 0.2067, "step": 300 }, { "epoch": 1.07, "grad_norm": 0.431640625, "learning_rate": 7.468770581585147e-06, "loss": 0.1909, "step": 320 }, { "epoch": 1.13, "grad_norm": 0.39453125, "learning_rate": 7.149713005966784e-06, "loss": 0.1909, "step": 340 }, { "epoch": 1.2, "grad_norm": 0.400390625, "learning_rate": 6.819524684817439e-06, "loss": 0.1877, "step": 360 }, { "epoch": 1.27, "grad_norm": 0.396484375, "learning_rate": 6.479915261317299e-06, "loss": 0.1901, "step": 380 }, { "epoch": 1.33, "grad_norm": 0.404296875, "learning_rate": 6.132643159061707e-06, "loss": 0.1886, "step": 400 }, { "epoch": 1.4, "grad_norm": 0.376953125, "learning_rate": 5.779506477326933e-06, "loss": 0.1887, "step": 420 }, { "epoch": 1.47, "grad_norm": 0.404296875, "learning_rate": 5.4223336809039205e-06, "loss": 0.1866, "step": 440 }, { "epoch": 1.53, "grad_norm": 0.419921875, "learning_rate": 5.062974132706017e-06, "loss": 0.1887, "step": 460 }, { "epoch": 1.6, "grad_norm": 0.390625, "learning_rate": 4.703288518170774e-06, "loss": 0.1849, "step": 480 }, { "epoch": 1.67, "grad_norm": 0.390625, "learning_rate": 4.345139211036192e-06, "loss": 0.187, "step": 500 }, { "epoch": 1.73, "grad_norm": 0.421875, "learning_rate": 3.9903806303753036e-06, "loss": 0.1839, "step": 520 }, { "epoch": 1.8, "grad_norm": 0.40234375, "learning_rate": 3.6408496388182857e-06, "loss": 0.187, "step": 540 }, { "epoch": 1.87, "grad_norm": 0.390625, "learning_rate": 3.2983560316780104e-06, "loss": 0.1849, "step": 560 }, { "epoch": 1.93, "grad_norm": 0.412109375, "learning_rate": 2.9646731662242554e-06, "loss": 0.184, "step": 580 }, { "epoch": 2.0, "grad_norm": 0.4140625, "learning_rate": 2.6415287796261707e-06, "loss": 0.1844, "step": 600 }, { "epoch": 2.07, "grad_norm": 0.412109375, "learning_rate": 2.330596043105683e-06, "loss": 0.1795, "step": 620 }, { "epoch": 2.13, "grad_norm": 0.4453125, "learning_rate": 2.03348489862149e-06, "loss": 0.1787, "step": 640 }, { "epoch": 2.2, "grad_norm": 0.39453125, "learning_rate": 1.7517337229403946e-06, "loss": 0.1781, "step": 660 }, { "epoch": 2.27, "grad_norm": 0.39453125, "learning_rate": 1.4868013622576138e-06, "loss": 0.1759, "step": 680 }, { "epoch": 2.33, "grad_norm": 0.404296875, "learning_rate": 1.240059578609054e-06, "loss": 0.178, "step": 700 }, { "epoch": 2.4, "grad_norm": 0.416015625, "learning_rate": 1.012785947186397e-06, "loss": 0.1785, "step": 720 }, { "epoch": 2.47, "grad_norm": 0.40234375, "learning_rate": 8.061572413311253e-07, "loss": 0.1778, "step": 740 }, { "epoch": 2.53, "grad_norm": 0.404296875, "learning_rate": 6.212433394585865e-07, "loss": 0.1771, "step": 760 }, { "epoch": 2.6, "grad_norm": 0.41796875, "learning_rate": 4.590016854606727e-07, "loss": 0.1756, "step": 780 }, { "epoch": 2.67, "grad_norm": 0.392578125, "learning_rate": 3.2027233126997405e-07, "loss": 0.1776, "step": 800 }, { "epoch": 2.73, "grad_norm": 0.400390625, "learning_rate": 2.057735872539157e-07, "loss": 0.1764, "step": 820 }, { "epoch": 2.8, "grad_norm": 0.412109375, "learning_rate": 1.1609830296019142e-07, "loss": 0.1815, "step": 840 }, { "epoch": 2.87, "grad_norm": 0.490234375, "learning_rate": 5.1710797470987393e-08, "loss": 0.1763, "step": 860 }, { "epoch": 2.93, "grad_norm": 0.400390625, "learning_rate": 1.2944455259944476e-08, "loss": 0.1788, "step": 880 }, { "epoch": 3.0, "grad_norm": 0.41015625, "learning_rate": 0.0, "loss": 0.1766, "step": 900 }, { "epoch": 3.0, "step": 900, "total_flos": 4.549145968838181e+18, "train_loss": 0.20135066880120173, "train_runtime": 71504.7155, "train_samples_per_second": 2.411, "train_steps_per_second": 0.013 } ], "logging_steps": 20, "max_steps": 900, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000, "total_flos": 4.549145968838181e+18, "train_batch_size": 24, "trial_name": null, "trial_params": null }