{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 1505, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013289036544850499, "grad_norm": 0.10525072365999222, "learning_rate": 4.9978216198586135e-05, "loss": 0.6155, "step": 20 }, { "epoch": 0.026578073089700997, "grad_norm": 0.08554615080356598, "learning_rate": 4.991290275706486e-05, "loss": 0.5694, "step": 40 }, { "epoch": 0.03986710963455149, "grad_norm": 0.08361516892910004, "learning_rate": 4.980417349743936e-05, "loss": 0.557, "step": 60 }, { "epoch": 0.053156146179401995, "grad_norm": 0.08680060505867004, "learning_rate": 4.9652217902637596e-05, "loss": 0.548, "step": 80 }, { "epoch": 0.0664451827242525, "grad_norm": 0.08960291743278503, "learning_rate": 4.945730078629964e-05, "loss": 0.5427, "step": 100 }, { "epoch": 0.07973421926910298, "grad_norm": 0.09262242168188095, "learning_rate": 4.921976183128585e-05, "loss": 0.5384, "step": 120 }, { "epoch": 0.09302325581395349, "grad_norm": 0.08780515193939209, "learning_rate": 4.894001499771015e-05, "loss": 0.5362, "step": 140 }, { "epoch": 0.10631229235880399, "grad_norm": 0.09249912202358246, "learning_rate": 4.861854780153004e-05, "loss": 0.5324, "step": 160 }, { "epoch": 0.11960132890365449, "grad_norm": 0.09562400728464127, "learning_rate": 4.825592046495054e-05, "loss": 0.5311, "step": 180 }, { "epoch": 0.132890365448505, "grad_norm": 0.09372778236865997, "learning_rate": 4.785276494012263e-05, "loss": 0.5278, "step": 200 }, { "epoch": 0.132890365448505, "eval_accuracy": 0.19452303794312395, "eval_loss": 0.5592088103294373, "eval_runtime": 19.5284, "eval_samples_per_second": 93.914, "eval_steps_per_second": 0.41, "step": 200 }, { "epoch": 0.1461794019933555, "grad_norm": 0.08762918412685394, "learning_rate": 4.740978380783765e-05, "loss": 0.5253, "step": 220 }, { "epoch": 0.15946843853820597, "grad_norm": 0.08518578112125397, "learning_rate": 4.6927749053136866e-05, "loss": 0.5192, "step": 240 }, { "epoch": 0.17275747508305647, "grad_norm": 0.09664598107337952, "learning_rate": 4.640750071996995e-05, "loss": 0.5217, "step": 260 }, { "epoch": 0.18604651162790697, "grad_norm": 0.08245342969894409, "learning_rate": 4.584994544724695e-05, "loss": 0.5172, "step": 280 }, { "epoch": 0.19933554817275748, "grad_norm": 0.08551981300115585, "learning_rate": 4.5256054888834934e-05, "loss": 0.5152, "step": 300 }, { "epoch": 0.21262458471760798, "grad_norm": 0.09647104889154434, "learning_rate": 4.4626864020252774e-05, "loss": 0.5139, "step": 320 }, { "epoch": 0.22591362126245848, "grad_norm": 0.09810427576303482, "learning_rate": 4.3963469335015085e-05, "loss": 0.5129, "step": 340 }, { "epoch": 0.23920265780730898, "grad_norm": 0.08342389762401581, "learning_rate": 4.326702693376844e-05, "loss": 0.5119, "step": 360 }, { "epoch": 0.25249169435215946, "grad_norm": 0.08738644421100616, "learning_rate": 4.2538750509550054e-05, "loss": 0.511, "step": 380 }, { "epoch": 0.26578073089701, "grad_norm": 0.08475251495838165, "learning_rate": 4.177990923267986e-05, "loss": 0.5117, "step": 400 }, { "epoch": 0.26578073089701, "eval_accuracy": 0.1953402564276045, "eval_loss": 0.5438870787620544, "eval_runtime": 15.5302, "eval_samples_per_second": 118.093, "eval_steps_per_second": 0.515, "step": 400 }, { "epoch": 0.27906976744186046, "grad_norm": 0.07873477786779404, "learning_rate": 4.099182553897229e-05, "loss": 0.5084, "step": 420 }, { "epoch": 0.292358803986711, "grad_norm": 0.09158772230148315, "learning_rate": 4.017587282512181e-05, "loss": 0.5065, "step": 440 }, { "epoch": 0.30564784053156147, "grad_norm": 0.07729614526033401, "learning_rate": 3.933347305527898e-05, "loss": 0.5047, "step": 460 }, { "epoch": 0.31893687707641194, "grad_norm": 0.08530613034963608, "learning_rate": 3.846609428298757e-05, "loss": 0.5049, "step": 480 }, { "epoch": 0.33222591362126247, "grad_norm": 0.07760792225599289, "learning_rate": 3.7575248092801686e-05, "loss": 0.5035, "step": 500 }, { "epoch": 0.34551495016611294, "grad_norm": 0.08521712571382523, "learning_rate": 3.66624869660411e-05, "loss": 0.5042, "step": 520 }, { "epoch": 0.3588039867109635, "grad_norm": 0.08439727872610092, "learning_rate": 3.572940157527572e-05, "loss": 0.5021, "step": 540 }, { "epoch": 0.37209302325581395, "grad_norm": 0.09042590111494064, "learning_rate": 3.47776180122539e-05, "loss": 0.5019, "step": 560 }, { "epoch": 0.3853820598006645, "grad_norm": 0.08219762146472931, "learning_rate": 3.3808794954105716e-05, "loss": 0.501, "step": 580 }, { "epoch": 0.39867109634551495, "grad_norm": 0.08426713198423386, "learning_rate": 3.282462077275947e-05, "loss": 0.5013, "step": 600 }, { "epoch": 0.39867109634551495, "eval_accuracy": 0.19588631180347973, "eval_loss": 0.5341373682022095, "eval_runtime": 16.1072, "eval_samples_per_second": 113.862, "eval_steps_per_second": 0.497, "step": 600 }, { "epoch": 0.4119601328903654, "grad_norm": 0.08020314574241638, "learning_rate": 3.1826810592609036e-05, "loss": 0.4968, "step": 620 }, { "epoch": 0.42524916943521596, "grad_norm": 0.07975760847330093, "learning_rate": 3.081710330155942e-05, "loss": 0.4997, "step": 640 }, { "epoch": 0.43853820598006643, "grad_norm": 0.08056964725255966, "learning_rate": 2.979725852065981e-05, "loss": 0.4968, "step": 660 }, { "epoch": 0.45182724252491696, "grad_norm": 0.08022565394639969, "learning_rate": 2.876905353760459e-05, "loss": 0.4976, "step": 680 }, { "epoch": 0.46511627906976744, "grad_norm": 0.08131925761699677, "learning_rate": 2.7734280209446865e-05, "loss": 0.4973, "step": 700 }, { "epoch": 0.47840531561461797, "grad_norm": 0.07562076300382614, "learning_rate": 2.6694741839921732e-05, "loss": 0.4956, "step": 720 }, { "epoch": 0.49169435215946844, "grad_norm": 0.07877329736948013, "learning_rate": 2.5652250036821523e-05, "loss": 0.4966, "step": 740 }, { "epoch": 0.5049833887043189, "grad_norm": 0.08014395087957382, "learning_rate": 2.4608621554899362e-05, "loss": 0.4934, "step": 760 }, { "epoch": 0.5182724252491694, "grad_norm": 0.07770328223705292, "learning_rate": 2.356567512980326e-05, "loss": 0.4934, "step": 780 }, { "epoch": 0.53156146179402, "grad_norm": 0.07732851803302765, "learning_rate": 2.252522830855798e-05, "loss": 0.4951, "step": 800 }, { "epoch": 0.53156146179402, "eval_accuracy": 0.19623978277118043, "eval_loss": 0.5274041295051575, "eval_runtime": 16.4552, "eval_samples_per_second": 111.454, "eval_steps_per_second": 0.486, "step": 800 }, { "epoch": 0.5448504983388704, "grad_norm": 0.07608461380004883, "learning_rate": 2.1489094282118395e-05, "loss": 0.4896, "step": 820 }, { "epoch": 0.5581395348837209, "grad_norm": 0.07657533138990402, "learning_rate": 2.0459078725514092e-05, "loss": 0.4918, "step": 840 }, { "epoch": 0.5714285714285714, "grad_norm": 0.07983728498220444, "learning_rate": 1.9436976651092144e-05, "loss": 0.4927, "step": 860 }, { "epoch": 0.584717607973422, "grad_norm": 0.07355430722236633, "learning_rate": 1.8424569280341653e-05, "loss": 0.493, "step": 880 }, { "epoch": 0.5980066445182725, "grad_norm": 0.08014149218797684, "learning_rate": 1.7423620939751788e-05, "loss": 0.4922, "step": 900 }, { "epoch": 0.6112956810631229, "grad_norm": 0.07500924915075302, "learning_rate": 1.6435875986112685e-05, "loss": 0.491, "step": 920 }, { "epoch": 0.6245847176079734, "grad_norm": 0.07356715947389603, "learning_rate": 1.546305576661776e-05, "loss": 0.4909, "step": 940 }, { "epoch": 0.6378737541528239, "grad_norm": 0.07140863686800003, "learning_rate": 1.4506855619064846e-05, "loss": 0.489, "step": 960 }, { "epoch": 0.6511627906976745, "grad_norm": 0.07692987471818924, "learning_rate": 1.3568941917384036e-05, "loss": 0.4902, "step": 980 }, { "epoch": 0.6644518272425249, "grad_norm": 0.07356040179729462, "learning_rate": 1.2650949167640997e-05, "loss": 0.4894, "step": 1000 }, { "epoch": 0.6644518272425249, "eval_accuracy": 0.19652373156663552, "eval_loss": 0.5229406952857971, "eval_runtime": 15.6791, "eval_samples_per_second": 116.971, "eval_steps_per_second": 0.51, "step": 1000 }, { "epoch": 0.6777408637873754, "grad_norm": 0.0691773071885109, "learning_rate": 1.1754477159576499e-05, "loss": 0.4869, "step": 1020 }, { "epoch": 0.6910299003322259, "grad_norm": 0.07505939155817032, "learning_rate": 1.088108817864629e-05, "loss": 0.4865, "step": 1040 }, { "epoch": 0.7043189368770764, "grad_norm": 0.06973451375961304, "learning_rate": 1.003230428341979e-05, "loss": 0.4888, "step": 1060 }, { "epoch": 0.717607973421927, "grad_norm": 0.07225219160318375, "learning_rate": 9.209604653082326e-06, "loss": 0.4858, "step": 1080 }, { "epoch": 0.7308970099667774, "grad_norm": 0.07558443397283554, "learning_rate": 8.414423009663563e-06, "loss": 0.4891, "step": 1100 }, { "epoch": 0.7441860465116279, "grad_norm": 0.0698658898472786, "learning_rate": 7.648145119484152e-06, "loss": 0.4871, "step": 1120 }, { "epoch": 0.7574750830564784, "grad_norm": 0.06963298469781876, "learning_rate": 6.912106378175098e-06, "loss": 0.4884, "step": 1140 }, { "epoch": 0.770764119601329, "grad_norm": 0.0692787617444992, "learning_rate": 6.207589483478266e-06, "loss": 0.4877, "step": 1160 }, { "epoch": 0.7840531561461794, "grad_norm": 0.07016126066446304, "learning_rate": 5.53582219988382e-06, "loss": 0.4856, "step": 1180 }, { "epoch": 0.7973421926910299, "grad_norm": 0.06945677101612091, "learning_rate": 4.897975218999926e-06, "loss": 0.4868, "step": 1200 }, { "epoch": 0.7973421926910299, "eval_accuracy": 0.19665158843513314, "eval_loss": 0.5205041170120239, "eval_runtime": 14.8321, "eval_samples_per_second": 123.651, "eval_steps_per_second": 0.539, "step": 1200 }, { "epoch": 0.8106312292358804, "grad_norm": 0.07045505195856094, "learning_rate": 4.295160119383712e-06, "loss": 0.4859, "step": 1220 }, { "epoch": 0.8239202657807309, "grad_norm": 0.06839559227228165, "learning_rate": 3.728427429388709e-06, "loss": 0.4863, "step": 1240 }, { "epoch": 0.8372093023255814, "grad_norm": 0.06684821844100952, "learning_rate": 3.198764796404807e-06, "loss": 0.4856, "step": 1260 }, { "epoch": 0.8504983388704319, "grad_norm": 0.06731660664081573, "learning_rate": 2.707095265681081e-06, "loss": 0.4854, "step": 1280 }, { "epoch": 0.8637873754152824, "grad_norm": 0.06780705600976944, "learning_rate": 2.254275671731007e-06, "loss": 0.4868, "step": 1300 }, { "epoch": 0.8770764119601329, "grad_norm": 0.06815515458583832, "learning_rate": 1.8410951451234533e-06, "loss": 0.4854, "step": 1320 }, { "epoch": 0.8903654485049833, "grad_norm": 0.0670180469751358, "learning_rate": 1.4682737372615967e-06, "loss": 0.485, "step": 1340 }, { "epoch": 0.9036544850498339, "grad_norm": 0.06649608910083771, "learning_rate": 1.1364611655463736e-06, "loss": 0.4867, "step": 1360 }, { "epoch": 0.9169435215946844, "grad_norm": 0.0674930214881897, "learning_rate": 8.462356811112987e-07, "loss": 0.4865, "step": 1380 }, { "epoch": 0.9302325581395349, "grad_norm": 0.06808231770992279, "learning_rate": 5.981030611018234e-07, "loss": 0.4864, "step": 1400 }, { "epoch": 0.9302325581395349, "eval_accuracy": 0.19667556159797644, "eval_loss": 0.519675612449646, "eval_runtime": 14.9507, "eval_samples_per_second": 122.67, "eval_steps_per_second": 0.535, "step": 1400 }, { "epoch": 0.9435215946843853, "grad_norm": 0.06696037203073502, "learning_rate": 3.9249572725543196e-07, "loss": 0.4852, "step": 1420 }, { "epoch": 0.9568106312292359, "grad_norm": 0.06675516068935394, "learning_rate": 2.297719923185032e-07, "loss": 0.4875, "step": 1440 }, { "epoch": 0.9700996677740864, "grad_norm": 0.06678403913974762, "learning_rate": 1.1021543561322012e-07, "loss": 0.4852, "step": 1460 }, { "epoch": 0.9833887043189369, "grad_norm": 0.0660882443189621, "learning_rate": 3.403440884269526e-08, "loss": 0.4848, "step": 1480 }, { "epoch": 0.9966777408637874, "grad_norm": 0.06698651611804962, "learning_rate": 1.3616729956228425e-09, "loss": 0.4847, "step": 1500 }, { "epoch": 1.0, "step": 1505, "total_flos": 2.786803439690685e+19, "train_loss": 0.0, "train_runtime": 4.5361, "train_samples_per_second": 339673.082, "train_steps_per_second": 331.781 } ], "logging_steps": 20, "max_steps": 1505, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.786803439690685e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }