{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.2130013831258646, "eval_steps": 50, "global_step": 2700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04098150709492342, "grad_norm": 0.5333279371261597, "learning_rate": 3.663003663003663e-06, "loss": 2.2117, "step": 50 }, { "epoch": 0.04098150709492342, "eval_loss": 2.7143771648406982, "eval_runtime": 282.6413, "eval_samples_per_second": 0.768, "eval_steps_per_second": 0.099, "step": 50 }, { "epoch": 0.08196301418984683, "grad_norm": 0.42295441031455994, "learning_rate": 7.326007326007326e-06, "loss": 2.1609, "step": 100 }, { "epoch": 0.08196301418984683, "eval_loss": 2.6825826168060303, "eval_runtime": 283.0578, "eval_samples_per_second": 0.767, "eval_steps_per_second": 0.099, "step": 100 }, { "epoch": 0.12294452128477025, "grad_norm": 0.277256041765213, "learning_rate": 1.098901098901099e-05, "loss": 2.0212, "step": 150 }, { "epoch": 0.12294452128477025, "eval_loss": 2.602968215942383, "eval_runtime": 282.5654, "eval_samples_per_second": 0.768, "eval_steps_per_second": 0.099, "step": 150 }, { "epoch": 0.16392602837969367, "grad_norm": 0.2273157685995102, "learning_rate": 1.4652014652014653e-05, "loss": 1.8938, "step": 200 }, { "epoch": 0.16392602837969367, "eval_loss": 2.506340980529785, "eval_runtime": 283.8897, "eval_samples_per_second": 0.764, "eval_steps_per_second": 0.099, "step": 200 }, { "epoch": 0.20490753547461707, "grad_norm": 0.21983110904693604, "learning_rate": 1.8315018315018315e-05, "loss": 1.7918, "step": 250 }, { "epoch": 0.20490753547461707, "eval_loss": 2.4188530445098877, "eval_runtime": 283.2047, "eval_samples_per_second": 0.766, "eval_steps_per_second": 0.099, "step": 250 }, { "epoch": 0.2458890425695405, "grad_norm": 0.20148395001888275, "learning_rate": 1.999406558079547e-05, "loss": 1.7115, "step": 300 }, { "epoch": 0.2458890425695405, "eval_loss": 2.3436834812164307, "eval_runtime": 282.2728, "eval_samples_per_second": 0.769, "eval_steps_per_second": 0.099, "step": 300 }, { "epoch": 0.2868705496644639, "grad_norm": 0.20601870119571686, "learning_rate": 1.9951769064396967e-05, "loss": 1.6595, "step": 350 }, { "epoch": 0.2868705496644639, "eval_loss": 2.3192338943481445, "eval_runtime": 282.1213, "eval_samples_per_second": 0.769, "eval_steps_per_second": 0.099, "step": 350 }, { "epoch": 0.32785205675938733, "grad_norm": 0.19198212027549744, "learning_rate": 1.986897612915546e-05, "loss": 1.6081, "step": 400 }, { "epoch": 0.32785205675938733, "eval_loss": 2.3058478832244873, "eval_runtime": 282.7227, "eval_samples_per_second": 0.768, "eval_steps_per_second": 0.099, "step": 400 }, { "epoch": 0.36883356385431076, "grad_norm": 0.19531460106372833, "learning_rate": 1.9746023681741606e-05, "loss": 1.6127, "step": 450 }, { "epoch": 0.36883356385431076, "eval_loss": 2.2954719066619873, "eval_runtime": 283.3445, "eval_samples_per_second": 0.766, "eval_steps_per_second": 0.099, "step": 450 }, { "epoch": 0.40981507094923414, "grad_norm": 0.2188873440027237, "learning_rate": 1.9583412048657773e-05, "loss": 1.5999, "step": 500 }, { "epoch": 0.40981507094923414, "eval_loss": 2.2856028079986572, "eval_runtime": 282.6955, "eval_samples_per_second": 0.768, "eval_steps_per_second": 0.099, "step": 500 }, { "epoch": 0.45079657804415757, "grad_norm": 0.2119743824005127, "learning_rate": 1.9381802940275198e-05, "loss": 1.6074, "step": 550 }, { "epoch": 0.45079657804415757, "eval_loss": 2.2773895263671875, "eval_runtime": 283.317, "eval_samples_per_second": 0.766, "eval_steps_per_second": 0.099, "step": 550 }, { "epoch": 0.491778085139081, "grad_norm": 0.21725280582904816, "learning_rate": 1.914201675815694e-05, "loss": 1.5989, "step": 600 }, { "epoch": 0.491778085139081, "eval_loss": 2.2710676193237305, "eval_runtime": 283.5744, "eval_samples_per_second": 0.765, "eval_steps_per_second": 0.099, "step": 600 }, { "epoch": 0.5327595922340044, "grad_norm": 0.22087362408638, "learning_rate": 1.8865029256623765e-05, "loss": 1.5708, "step": 650 }, { "epoch": 0.5327595922340044, "eval_loss": 2.2653110027313232, "eval_runtime": 282.6403, "eval_samples_per_second": 0.768, "eval_steps_per_second": 0.099, "step": 650 }, { "epoch": 0.5737410993289278, "grad_norm": 0.24597273766994476, "learning_rate": 1.855196757214796e-05, "loss": 1.5981, "step": 700 }, { "epoch": 0.5737410993289278, "eval_loss": 2.2589895725250244, "eval_runtime": 282.6053, "eval_samples_per_second": 0.768, "eval_steps_per_second": 0.099, "step": 700 }, { "epoch": 0.6147226064238512, "grad_norm": 0.20500172674655914, "learning_rate": 1.8204105636732604e-05, "loss": 1.5859, "step": 750 }, { "epoch": 0.6147226064238512, "eval_loss": 2.2537271976470947, "eval_runtime": 282.8109, "eval_samples_per_second": 0.767, "eval_steps_per_second": 0.099, "step": 750 }, { "epoch": 0.6557041135187747, "grad_norm": 0.24009792506694794, "learning_rate": 1.782285899394034e-05, "loss": 1.5765, "step": 800 }, { "epoch": 0.6557041135187747, "eval_loss": 2.250108242034912, "eval_runtime": 282.295, "eval_samples_per_second": 0.769, "eval_steps_per_second": 0.099, "step": 800 }, { "epoch": 0.6966856206136981, "grad_norm": 0.23266170918941498, "learning_rate": 1.74097790386668e-05, "loss": 1.5676, "step": 850 }, { "epoch": 0.6966856206136981, "eval_loss": 2.2453207969665527, "eval_runtime": 281.9055, "eval_samples_per_second": 0.77, "eval_steps_per_second": 0.099, "step": 850 }, { "epoch": 0.7376671277086215, "grad_norm": 0.2351984679698944, "learning_rate": 1.6966546704098455e-05, "loss": 1.5688, "step": 900 }, { "epoch": 0.7376671277086215, "eval_loss": 2.2418150901794434, "eval_runtime": 281.9821, "eval_samples_per_second": 0.77, "eval_steps_per_second": 0.099, "step": 900 }, { "epoch": 0.7786486348035448, "grad_norm": 0.237622931599617, "learning_rate": 1.6494965621544403e-05, "loss": 1.5643, "step": 950 }, { "epoch": 0.7786486348035448, "eval_loss": 2.2379164695739746, "eval_runtime": 282.2084, "eval_samples_per_second": 0.769, "eval_steps_per_second": 0.099, "step": 950 }, { "epoch": 0.8196301418984683, "grad_norm": 0.24924997985363007, "learning_rate": 1.5996954780976568e-05, "loss": 1.5346, "step": 1000 }, { "epoch": 0.8196301418984683, "eval_loss": 2.2346854209899902, "eval_runtime": 283.4225, "eval_samples_per_second": 0.766, "eval_steps_per_second": 0.099, "step": 1000 }, { "epoch": 0.8606116489933917, "grad_norm": 0.24776747822761536, "learning_rate": 1.547454072214457e-05, "loss": 1.5507, "step": 1050 }, { "epoch": 0.8606116489933917, "eval_loss": 2.2320806980133057, "eval_runtime": 282.9227, "eval_samples_per_second": 0.767, "eval_steps_per_second": 0.099, "step": 1050 }, { "epoch": 0.9015931560883151, "grad_norm": 0.2700960040092468, "learning_rate": 1.4929849288041656e-05, "loss": 1.5582, "step": 1100 }, { "epoch": 0.9015931560883151, "eval_loss": 2.2285408973693848, "eval_runtime": 283.0683, "eval_samples_per_second": 0.767, "eval_steps_per_second": 0.099, "step": 1100 }, { "epoch": 0.9425746631832386, "grad_norm": 0.26280835270881653, "learning_rate": 1.4365096974279093e-05, "loss": 1.5275, "step": 1150 }, { "epoch": 0.9425746631832386, "eval_loss": 2.2259719371795654, "eval_runtime": 282.6889, "eval_samples_per_second": 0.768, "eval_steps_per_second": 0.099, "step": 1150 }, { "epoch": 0.983556170278162, "grad_norm": 0.2555929124355316, "learning_rate": 1.3782581909570757e-05, "loss": 1.523, "step": 1200 }, { "epoch": 0.983556170278162, "eval_loss": 2.2228543758392334, "eval_runtime": 284.1011, "eval_samples_per_second": 0.764, "eval_steps_per_second": 0.099, "step": 1200 }, { "epoch": 1.0245376773730854, "grad_norm": 0.27351436018943787, "learning_rate": 1.3184674504030679e-05, "loss": 1.5354, "step": 1250 }, { "epoch": 1.0245376773730854, "eval_loss": 2.2217090129852295, "eval_runtime": 284.3948, "eval_samples_per_second": 0.763, "eval_steps_per_second": 0.098, "step": 1250 }, { "epoch": 1.0655191844680088, "grad_norm": 0.2628322243690491, "learning_rate": 1.2573807803338216e-05, "loss": 1.5386, "step": 1300 }, { "epoch": 1.0655191844680088, "eval_loss": 2.2189579010009766, "eval_runtime": 282.9448, "eval_samples_per_second": 0.767, "eval_steps_per_second": 0.099, "step": 1300 }, { "epoch": 1.1065006915629323, "grad_norm": 0.27614346146583557, "learning_rate": 1.1952467588022282e-05, "loss": 1.5338, "step": 1350 }, { "epoch": 1.1065006915629323, "eval_loss": 2.21720814704895, "eval_runtime": 282.859, "eval_samples_per_second": 0.767, "eval_steps_per_second": 0.099, "step": 1350 }, { "epoch": 1.1474821986578556, "grad_norm": 0.27579498291015625, "learning_rate": 1.1323182258153314e-05, "loss": 1.5292, "step": 1400 }, { "epoch": 1.1474821986578556, "eval_loss": 2.2148287296295166, "eval_runtime": 283.7073, "eval_samples_per_second": 0.765, "eval_steps_per_second": 0.099, "step": 1400 }, { "epoch": 1.1884637057527792, "grad_norm": 0.2812260687351227, "learning_rate": 1.0688512544604915e-05, "loss": 1.5376, "step": 1450 }, { "epoch": 1.1884637057527792, "eval_loss": 2.2129642963409424, "eval_runtime": 281.4833, "eval_samples_per_second": 0.771, "eval_steps_per_second": 0.099, "step": 1450 }, { "epoch": 1.2294452128477025, "grad_norm": 0.28713124990463257, "learning_rate": 1.005104108875275e-05, "loss": 1.5273, "step": 1500 }, { "epoch": 1.2294452128477025, "eval_loss": 2.2116787433624268, "eval_runtime": 283.0969, "eval_samples_per_second": 0.767, "eval_steps_per_second": 0.099, "step": 1500 }, { "epoch": 1.270426719942626, "grad_norm": 0.27820634841918945, "learning_rate": 9.41336193301377e-06, "loss": 1.526, "step": 1550 }, { "epoch": 1.270426719942626, "eval_loss": 2.2110674381256104, "eval_runtime": 281.8783, "eval_samples_per_second": 0.77, "eval_steps_per_second": 0.099, "step": 1550 }, { "epoch": 1.3114082270375493, "grad_norm": 0.26735347509384155, "learning_rate": 8.778069964991484e-06, "loss": 1.537, "step": 1600 }, { "epoch": 1.3114082270375493, "eval_loss": 2.2088816165924072, "eval_runtime": 281.9837, "eval_samples_per_second": 0.77, "eval_steps_per_second": 0.099, "step": 1600 }, { "epoch": 1.3523897341324727, "grad_norm": 0.2734984755516052, "learning_rate": 8.147750358182e-06, "loss": 1.5431, "step": 1650 }, { "epoch": 1.3523897341324727, "eval_loss": 2.207979917526245, "eval_runtime": 282.6878, "eval_samples_per_second": 0.768, "eval_steps_per_second": 0.099, "step": 1650 }, { "epoch": 1.3933712412273962, "grad_norm": 0.3110925257205963, "learning_rate": 7.524968052209331e-06, "loss": 1.5401, "step": 1700 }, { "epoch": 1.3933712412273962, "eval_loss": 2.2074739933013916, "eval_runtime": 282.1594, "eval_samples_per_second": 0.769, "eval_steps_per_second": 0.099, "step": 1700 }, { "epoch": 1.4343527483223195, "grad_norm": 0.3202177882194519, "learning_rate": 6.912257315397784e-06, "loss": 1.5331, "step": 1750 }, { "epoch": 1.4343527483223195, "eval_loss": 2.205904245376587, "eval_runtime": 280.7829, "eval_samples_per_second": 0.773, "eval_steps_per_second": 0.1, "step": 1750 }, { "epoch": 1.475334255417243, "grad_norm": 0.28285861015319824, "learning_rate": 6.312111432154074e-06, "loss": 1.5395, "step": 1800 }, { "epoch": 1.475334255417243, "eval_loss": 2.2046620845794678, "eval_runtime": 282.5413, "eval_samples_per_second": 0.768, "eval_steps_per_second": 0.099, "step": 1800 }, { "epoch": 1.5163157625121664, "grad_norm": 0.2836764454841614, "learning_rate": 5.726972557124022e-06, "loss": 1.542, "step": 1850 }, { "epoch": 1.5163157625121664, "eval_loss": 2.203850507736206, "eval_runtime": 283.3938, "eval_samples_per_second": 0.766, "eval_steps_per_second": 0.099, "step": 1850 }, { "epoch": 1.5572972696070897, "grad_norm": 0.2900823652744293, "learning_rate": 5.159221777409953e-06, "loss": 1.502, "step": 1900 }, { "epoch": 1.5572972696070897, "eval_loss": 2.2032361030578613, "eval_runtime": 281.099, "eval_samples_per_second": 0.772, "eval_steps_per_second": 0.1, "step": 1900 }, { "epoch": 1.5982787767020132, "grad_norm": 0.282176673412323, "learning_rate": 4.611169423288323e-06, "loss": 1.5267, "step": 1950 }, { "epoch": 1.5982787767020132, "eval_loss": 2.202617883682251, "eval_runtime": 283.2145, "eval_samples_per_second": 0.766, "eval_steps_per_second": 0.099, "step": 1950 }, { "epoch": 1.6392602837969368, "grad_norm": 0.29563507437705994, "learning_rate": 4.085045666855846e-06, "loss": 1.51, "step": 2000 }, { "epoch": 1.6392602837969368, "eval_loss": 2.2023086547851562, "eval_runtime": 282.4408, "eval_samples_per_second": 0.768, "eval_steps_per_second": 0.099, "step": 2000 }, { "epoch": 1.68024179089186, "grad_norm": 0.2923285663127899, "learning_rate": 3.5829914468607874e-06, "loss": 1.5319, "step": 2050 }, { "epoch": 1.68024179089186, "eval_loss": 2.2018585205078125, "eval_runtime": 281.9707, "eval_samples_per_second": 0.77, "eval_steps_per_second": 0.099, "step": 2050 }, { "epoch": 1.7212232979867834, "grad_norm": 0.2978394031524658, "learning_rate": 3.1070497566486825e-06, "loss": 1.5267, "step": 2100 }, { "epoch": 1.7212232979867834, "eval_loss": 2.2010927200317383, "eval_runtime": 282.5046, "eval_samples_per_second": 0.768, "eval_steps_per_second": 0.099, "step": 2100 }, { "epoch": 1.7622048050817067, "grad_norm": 0.2987017035484314, "learning_rate": 2.6591573306741704e-06, "loss": 1.5201, "step": 2150 }, { "epoch": 1.7622048050817067, "eval_loss": 2.200831174850464, "eval_runtime": 283.0038, "eval_samples_per_second": 0.767, "eval_steps_per_second": 0.099, "step": 2150 }, { "epoch": 1.8031863121766303, "grad_norm": 0.28967171907424927, "learning_rate": 2.241136763408801e-06, "loss": 1.5204, "step": 2200 }, { "epoch": 1.8031863121766303, "eval_loss": 2.2002053260803223, "eval_runtime": 281.6629, "eval_samples_per_second": 0.77, "eval_steps_per_second": 0.099, "step": 2200 }, { "epoch": 1.8441678192715538, "grad_norm": 0.321614146232605, "learning_rate": 1.8546890927150273e-06, "loss": 1.5094, "step": 2250 }, { "epoch": 1.8441678192715538, "eval_loss": 2.2001166343688965, "eval_runtime": 282.843, "eval_samples_per_second": 0.767, "eval_steps_per_second": 0.099, "step": 2250 }, { "epoch": 1.8851493263664771, "grad_norm": 0.2887614965438843, "learning_rate": 1.501386877866694e-06, "loss": 1.5268, "step": 2300 }, { "epoch": 1.8851493263664771, "eval_loss": 2.199751615524292, "eval_runtime": 282.0209, "eval_samples_per_second": 0.769, "eval_steps_per_second": 0.099, "step": 2300 }, { "epoch": 1.9261308334614005, "grad_norm": 0.30499428510665894, "learning_rate": 1.1826678003833402e-06, "loss": 1.513, "step": 2350 }, { "epoch": 1.9261308334614005, "eval_loss": 2.1995420455932617, "eval_runtime": 283.2079, "eval_samples_per_second": 0.766, "eval_steps_per_second": 0.099, "step": 2350 }, { "epoch": 1.967112340556324, "grad_norm": 0.29722264409065247, "learning_rate": 8.998288137183209e-07, "loss": 1.5263, "step": 2400 }, { "epoch": 1.967112340556324, "eval_loss": 2.199302911758423, "eval_runtime": 281.4307, "eval_samples_per_second": 0.771, "eval_steps_per_second": 0.099, "step": 2400 }, { "epoch": 2.0080938476512475, "grad_norm": 0.30117130279541016, "learning_rate": 6.540208656071601e-07, "loss": 1.5291, "step": 2450 }, { "epoch": 2.0080938476512475, "eval_loss": 2.199272394180298, "eval_runtime": 281.4783, "eval_samples_per_second": 0.771, "eval_steps_per_second": 0.099, "step": 2450 }, { "epoch": 2.049075354746171, "grad_norm": 0.29823198914527893, "learning_rate": 4.4624421455236156e-07, "loss": 1.5187, "step": 2500 }, { "epoch": 2.049075354746171, "eval_loss": 2.1992440223693848, "eval_runtime": 280.9197, "eval_samples_per_second": 0.772, "eval_steps_per_second": 0.1, "step": 2500 }, { "epoch": 2.090056861841094, "grad_norm": 0.3053443729877472, "learning_rate": 2.7734435950315663e-07, "loss": 1.5324, "step": 2550 }, { "epoch": 2.090056861841094, "eval_loss": 2.1991806030273438, "eval_runtime": 281.8919, "eval_samples_per_second": 0.77, "eval_steps_per_second": 0.099, "step": 2550 }, { "epoch": 2.1310383689360175, "grad_norm": 0.31491023302078247, "learning_rate": 1.4800859929338218e-07, "loss": 1.5314, "step": 2600 }, { "epoch": 2.1310383689360175, "eval_loss": 2.199115037918091, "eval_runtime": 282.019, "eval_samples_per_second": 0.769, "eval_steps_per_second": 0.099, "step": 2600 }, { "epoch": 2.172019876030941, "grad_norm": 0.300465852022171, "learning_rate": 5.876323583810184e-08, "loss": 1.5357, "step": 2650 }, { "epoch": 2.172019876030941, "eval_loss": 2.19909930229187, "eval_runtime": 281.0522, "eval_samples_per_second": 0.772, "eval_steps_per_second": 0.1, "step": 2650 }, { "epoch": 2.2130013831258646, "grad_norm": 0.3176944851875305, "learning_rate": 9.971432469871866e-09, "loss": 1.507, "step": 2700 }, { "epoch": 2.2130013831258646, "eval_loss": 2.1991000175476074, "eval_runtime": 281.6411, "eval_samples_per_second": 0.77, "eval_steps_per_second": 0.099, "step": 2700 } ], "logging_steps": 50, "max_steps": 2735, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "total_flos": 1.100581152473088e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }