{ "best_metric": null, "best_model_checkpoint": null, "epoch": 22.0, "global_step": 79002, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5e-09, "loss": 10.5192, "step": 1 }, { "epoch": 0.14, "learning_rate": 2.5e-06, "loss": 9.4826, "step": 500 }, { "epoch": 0.28, "learning_rate": 5e-06, "loss": 7.9029, "step": 1000 }, { "epoch": 0.42, "learning_rate": 7.5e-06, "loss": 6.8971, "step": 1500 }, { "epoch": 0.56, "learning_rate": 1e-05, "loss": 6.6133, "step": 2000 }, { "epoch": 0.7, "learning_rate": 1.25e-05, "loss": 6.4578, "step": 2500 }, { "epoch": 0.84, "learning_rate": 1.5e-05, "loss": 6.3502, "step": 3000 }, { "epoch": 0.97, "learning_rate": 1.75e-05, "loss": 6.2558, "step": 3500 }, { "epoch": 1.11, "learning_rate": 2e-05, "loss": 6.1784, "step": 4000 }, { "epoch": 1.25, "learning_rate": 2.25e-05, "loss": 6.1171, "step": 4500 }, { "epoch": 1.39, "learning_rate": 2.5e-05, "loss": 6.0613, "step": 5000 }, { "epoch": 1.53, "learning_rate": 2.7500000000000004e-05, "loss": 6.0146, "step": 5500 }, { "epoch": 1.67, "learning_rate": 3e-05, "loss": 5.9732, "step": 6000 }, { "epoch": 1.81, "learning_rate": 3.2500000000000004e-05, "loss": 5.9385, "step": 6500 }, { "epoch": 1.95, "learning_rate": 3.5e-05, "loss": 5.9006, "step": 7000 }, { "epoch": 2.09, "learning_rate": 3.7500000000000003e-05, "loss": 5.8725, "step": 7500 }, { "epoch": 2.23, "learning_rate": 4e-05, "loss": 5.8389, "step": 8000 }, { "epoch": 2.37, "learning_rate": 4.2495e-05, "loss": 5.8149, "step": 8500 }, { "epoch": 2.51, "learning_rate": 4.4995000000000005e-05, "loss": 5.7907, "step": 9000 }, { "epoch": 2.65, "learning_rate": 4.7495e-05, "loss": 5.771, "step": 9500 }, { "epoch": 2.78, "learning_rate": 4.9995000000000005e-05, "loss": 5.7541, "step": 10000 }, { "epoch": 2.92, "learning_rate": 4.998254346606843e-05, "loss": 5.7367, "step": 10500 }, { "epoch": 3.06, "learning_rate": 4.9965016825574876e-05, "loss": 5.7192, "step": 11000 }, { "epoch": 3.2, "learning_rate": 4.994749018508132e-05, "loss": 5.7026, "step": 11500 }, { "epoch": 3.34, "learning_rate": 4.9929963544587775e-05, "loss": 5.6922, "step": 12000 }, { "epoch": 3.48, "learning_rate": 4.991247195737521e-05, "loss": 5.6796, "step": 12500 }, { "epoch": 3.62, "learning_rate": 4.9894945316881665e-05, "loss": 5.67, "step": 13000 }, { "epoch": 3.76, "learning_rate": 4.987741867638811e-05, "loss": 5.6599, "step": 13500 }, { "epoch": 3.9, "learning_rate": 4.9859892035894564e-05, "loss": 5.6487, "step": 14000 }, { "epoch": 4.04, "learning_rate": 4.9842400448681994e-05, "loss": 5.6437, "step": 14500 }, { "epoch": 4.18, "learning_rate": 4.982487380818845e-05, "loss": 5.6326, "step": 15000 }, { "epoch": 4.32, "learning_rate": 4.98073471676949e-05, "loss": 5.6252, "step": 15500 }, { "epoch": 4.46, "learning_rate": 4.978982052720135e-05, "loss": 5.6177, "step": 16000 }, { "epoch": 4.59, "learning_rate": 4.9772328939988783e-05, "loss": 5.6131, "step": 16500 }, { "epoch": 4.73, "learning_rate": 4.975483735277622e-05, "loss": 5.6043, "step": 17000 }, { "epoch": 4.87, "learning_rate": 4.9737310712282666e-05, "loss": 5.5996, "step": 17500 }, { "epoch": 5.01, "learning_rate": 4.971978407178912e-05, "loss": 5.5909, "step": 18000 }, { "epoch": 5.15, "learning_rate": 4.970225743129557e-05, "loss": 5.5874, "step": 18500 }, { "epoch": 5.29, "learning_rate": 4.968473079080202e-05, "loss": 5.5822, "step": 19000 }, { "epoch": 5.43, "learning_rate": 4.966723920358946e-05, "loss": 5.5773, "step": 19500 }, { "epoch": 5.57, "learning_rate": 4.964971256309591e-05, "loss": 5.5722, "step": 20000 }, { "epoch": 5.71, "learning_rate": 4.963218592260236e-05, "loss": 5.5707, "step": 20500 }, { "epoch": 5.85, "learning_rate": 4.961465928210881e-05, "loss": 5.5626, "step": 21000 }, { "epoch": 5.99, "learning_rate": 4.9597132641615254e-05, "loss": 5.5599, "step": 21500 }, { "epoch": 6.13, "learning_rate": 4.957964105440269e-05, "loss": 5.5536, "step": 22000 }, { "epoch": 6.27, "learning_rate": 4.9562114413909144e-05, "loss": 5.5496, "step": 22500 }, { "epoch": 6.4, "learning_rate": 4.95445877734156e-05, "loss": 5.5485, "step": 23000 }, { "epoch": 6.54, "learning_rate": 4.952706113292204e-05, "loss": 5.5436, "step": 23500 }, { "epoch": 6.68, "learning_rate": 4.950956954570948e-05, "loss": 5.5428, "step": 24000 }, { "epoch": 6.82, "learning_rate": 4.9492042905215926e-05, "loss": 5.5355, "step": 24500 }, { "epoch": 6.96, "learning_rate": 4.947451626472238e-05, "loss": 5.5348, "step": 25000 }, { "epoch": 7.1, "learning_rate": 4.9456989624228826e-05, "loss": 5.5275, "step": 25500 }, { "epoch": 7.24, "learning_rate": 4.943949803701627e-05, "loss": 5.5286, "step": 26000 }, { "epoch": 7.38, "learning_rate": 4.9421971396522716e-05, "loss": 5.5239, "step": 26500 }, { "epoch": 7.52, "learning_rate": 4.940444475602917e-05, "loss": 5.5223, "step": 27000 }, { "epoch": 7.66, "learning_rate": 4.9386953168816605e-05, "loss": 5.52, "step": 27500 }, { "epoch": 7.8, "learning_rate": 4.936942652832305e-05, "loss": 5.5177, "step": 28000 }, { "epoch": 7.94, "learning_rate": 4.9351899887829505e-05, "loss": 5.5147, "step": 28500 }, { "epoch": 8.08, "learning_rate": 4.933437324733595e-05, "loss": 5.5059, "step": 29000 }, { "epoch": 8.21, "learning_rate": 4.9316846606842404e-05, "loss": 5.5046, "step": 29500 }, { "epoch": 8.35, "learning_rate": 4.929931996634885e-05, "loss": 5.5033, "step": 30000 }, { "epoch": 8.49, "learning_rate": 4.92817933258553e-05, "loss": 5.5019, "step": 30500 }, { "epoch": 8.63, "learning_rate": 4.9264266685361756e-05, "loss": 5.4969, "step": 31000 }, { "epoch": 8.77, "learning_rate": 4.92467400448682e-05, "loss": 5.4955, "step": 31500 }, { "epoch": 8.91, "learning_rate": 4.922924845765564e-05, "loss": 5.4978, "step": 32000 }, { "epoch": 9.05, "learning_rate": 4.9211721817162086e-05, "loss": 5.4951, "step": 32500 }, { "epoch": 9.19, "learning_rate": 4.919419517666854e-05, "loss": 5.4846, "step": 33000 }, { "epoch": 9.33, "learning_rate": 4.9176668536174985e-05, "loss": 5.4871, "step": 33500 }, { "epoch": 9.47, "learning_rate": 4.915917694896243e-05, "loss": 5.4838, "step": 34000 }, { "epoch": 9.61, "learning_rate": 4.9141650308468875e-05, "loss": 5.4853, "step": 34500 }, { "epoch": 9.75, "learning_rate": 4.912412366797533e-05, "loss": 5.4807, "step": 35000 }, { "epoch": 9.89, "learning_rate": 4.9106597027481774e-05, "loss": 5.4789, "step": 35500 }, { "epoch": 10.03, "learning_rate": 4.908910544026921e-05, "loss": 5.4754, "step": 36000 }, { "epoch": 10.16, "learning_rate": 4.9071578799775664e-05, "loss": 5.4706, "step": 36500 }, { "epoch": 10.3, "learning_rate": 4.905405215928211e-05, "loss": 5.332, "step": 37000 }, { "epoch": 10.44, "learning_rate": 4.903652551878856e-05, "loss": 5.1202, "step": 37500 }, { "epoch": 10.58, "learning_rate": 4.901899887829501e-05, "loss": 4.9543, "step": 38000 }, { "epoch": 10.72, "learning_rate": 4.900147223780146e-05, "loss": 4.7848, "step": 38500 }, { "epoch": 10.86, "learning_rate": 4.898394559730791e-05, "loss": 4.6248, "step": 39000 }, { "epoch": 11.0, "learning_rate": 4.8966418956814355e-05, "loss": 4.4711, "step": 39500 }, { "epoch": 11.14, "learning_rate": 4.894889231632081e-05, "loss": 4.3324, "step": 40000 }, { "epoch": 11.28, "learning_rate": 4.8931400729108245e-05, "loss": 4.1831, "step": 40500 }, { "epoch": 11.42, "learning_rate": 4.89138740886147e-05, "loss": 3.845, "step": 41000 }, { "epoch": 11.56, "learning_rate": 4.889634744812115e-05, "loss": 3.3852, "step": 41500 }, { "epoch": 11.7, "learning_rate": 4.88788208076276e-05, "loss": 2.749, "step": 42000 }, { "epoch": 11.84, "learning_rate": 4.886136427369602e-05, "loss": 2.4339, "step": 42500 }, { "epoch": 11.97, "learning_rate": 4.884383763320247e-05, "loss": 2.2505, "step": 43000 }, { "epoch": 12.11, "learning_rate": 4.882631099270892e-05, "loss": 2.1255, "step": 43500 }, { "epoch": 12.25, "learning_rate": 4.880878435221537e-05, "loss": 2.037, "step": 44000 }, { "epoch": 12.39, "learning_rate": 4.879125771172182e-05, "loss": 1.9662, "step": 44500 }, { "epoch": 12.53, "learning_rate": 4.877376612450926e-05, "loss": 1.909, "step": 45000 }, { "epoch": 12.67, "learning_rate": 4.8756239484015706e-05, "loss": 1.8613, "step": 45500 }, { "epoch": 12.81, "learning_rate": 4.873871284352215e-05, "loss": 1.8191, "step": 46000 }, { "epoch": 12.95, "learning_rate": 4.8721186203028606e-05, "loss": 1.7808, "step": 46500 }, { "epoch": 13.09, "learning_rate": 4.870369461581604e-05, "loss": 1.745, "step": 47000 }, { "epoch": 13.23, "learning_rate": 4.8686167975322495e-05, "loss": 1.7123, "step": 47500 }, { "epoch": 13.37, "learning_rate": 4.866864133482894e-05, "loss": 1.6834, "step": 48000 }, { "epoch": 13.51, "learning_rate": 4.8651114694335395e-05, "loss": 1.6541, "step": 48500 }, { "epoch": 13.65, "learning_rate": 4.8633623107122825e-05, "loss": 1.6308, "step": 49000 }, { "epoch": 13.78, "learning_rate": 4.861609646662928e-05, "loss": 1.6093, "step": 49500 }, { "epoch": 13.92, "learning_rate": 4.859856982613573e-05, "loss": 1.5854, "step": 50000 }, { "epoch": 14.06, "learning_rate": 4.858104318564218e-05, "loss": 1.5645, "step": 50500 }, { "epoch": 14.2, "learning_rate": 4.856351654514863e-05, "loss": 1.5449, "step": 51000 }, { "epoch": 14.34, "learning_rate": 4.854602495793607e-05, "loss": 1.5306, "step": 51500 }, { "epoch": 14.48, "learning_rate": 4.852849831744252e-05, "loss": 1.5134, "step": 52000 }, { "epoch": 14.62, "learning_rate": 4.8510971676948966e-05, "loss": 1.4979, "step": 52500 }, { "epoch": 14.76, "learning_rate": 4.849344503645541e-05, "loss": 1.4835, "step": 53000 }, { "epoch": 14.9, "learning_rate": 4.847595344924285e-05, "loss": 1.468, "step": 53500 }, { "epoch": 15.04, "learning_rate": 4.84584268087493e-05, "loss": 1.4542, "step": 54000 }, { "epoch": 15.18, "learning_rate": 4.844090016825575e-05, "loss": 1.4416, "step": 54500 }, { "epoch": 15.32, "learning_rate": 4.842340858104319e-05, "loss": 1.4277, "step": 55000 }, { "epoch": 15.46, "learning_rate": 4.840588194054964e-05, "loss": 1.4185, "step": 55500 }, { "epoch": 15.59, "learning_rate": 4.8388355300056085e-05, "loss": 1.405, "step": 56000 }, { "epoch": 15.73, "learning_rate": 4.837082865956254e-05, "loss": 1.3956, "step": 56500 }, { "epoch": 15.87, "learning_rate": 4.8353302019068984e-05, "loss": 1.384, "step": 57000 }, { "epoch": 16.01, "learning_rate": 4.833581043185643e-05, "loss": 1.3745, "step": 57500 }, { "epoch": 16.15, "learning_rate": 4.8318283791362874e-05, "loss": 1.3639, "step": 58000 }, { "epoch": 16.29, "learning_rate": 4.830075715086933e-05, "loss": 1.3543, "step": 58500 }, { "epoch": 16.43, "learning_rate": 4.828323051037577e-05, "loss": 1.347, "step": 59000 }, { "epoch": 16.57, "learning_rate": 4.8265703869882226e-05, "loss": 1.3377, "step": 59500 }, { "epoch": 16.71, "learning_rate": 4.8248212282669656e-05, "loss": 1.3291, "step": 60000 }, { "epoch": 16.85, "learning_rate": 4.823068564217611e-05, "loss": 1.3214, "step": 60500 }, { "epoch": 16.99, "learning_rate": 4.821315900168256e-05, "loss": 1.3146, "step": 61000 }, { "epoch": 17.13, "learning_rate": 4.819563236118901e-05, "loss": 1.3041, "step": 61500 }, { "epoch": 17.27, "learning_rate": 4.817810572069546e-05, "loss": 1.2975, "step": 62000 }, { "epoch": 17.4, "learning_rate": 4.8160579080201915e-05, "loss": 1.2908, "step": 62500 }, { "epoch": 17.54, "learning_rate": 4.814305243970836e-05, "loss": 1.2835, "step": 63000 }, { "epoch": 17.68, "learning_rate": 4.812556085249579e-05, "loss": 1.2766, "step": 63500 }, { "epoch": 17.82, "learning_rate": 4.8108034212002244e-05, "loss": 1.2715, "step": 64000 }, { "epoch": 17.96, "learning_rate": 4.80905075715087e-05, "loss": 1.2655, "step": 64500 }, { "epoch": 18.1, "learning_rate": 4.807298093101514e-05, "loss": 1.2565, "step": 65000 }, { "epoch": 18.24, "learning_rate": 4.805548934380259e-05, "loss": 1.2506, "step": 65500 }, { "epoch": 18.38, "learning_rate": 4.803796270330903e-05, "loss": 1.2443, "step": 66000 }, { "epoch": 18.52, "learning_rate": 4.802043606281548e-05, "loss": 1.2386, "step": 66500 }, { "epoch": 18.66, "learning_rate": 4.800290942232193e-05, "loss": 1.2338, "step": 67000 }, { "epoch": 18.8, "learning_rate": 4.798541783510937e-05, "loss": 1.2287, "step": 67500 }, { "epoch": 18.94, "learning_rate": 4.7967926247896806e-05, "loss": 1.2233, "step": 68000 }, { "epoch": 19.08, "learning_rate": 4.795039960740326e-05, "loss": 1.2185, "step": 68500 }, { "epoch": 19.21, "learning_rate": 4.7932872966909705e-05, "loss": 1.2133, "step": 69000 }, { "epoch": 19.35, "learning_rate": 4.791534632641616e-05, "loss": 1.2088, "step": 69500 }, { "epoch": 19.49, "learning_rate": 4.7897819685922604e-05, "loss": 1.2036, "step": 70000 }, { "epoch": 19.63, "learning_rate": 4.788029304542905e-05, "loss": 1.1979, "step": 70500 }, { "epoch": 19.77, "learning_rate": 4.7862766404935504e-05, "loss": 1.1919, "step": 71000 }, { "epoch": 19.91, "learning_rate": 4.784527481772294e-05, "loss": 1.1893, "step": 71500 }, { "epoch": 20.05, "learning_rate": 4.7827748177229394e-05, "loss": 1.1863, "step": 72000 }, { "epoch": 20.19, "learning_rate": 4.781022153673584e-05, "loss": 1.1801, "step": 72500 }, { "epoch": 20.33, "learning_rate": 4.779269489624229e-05, "loss": 1.1777, "step": 73000 }, { "epoch": 20.47, "learning_rate": 4.777516825574874e-05, "loss": 1.1714, "step": 73500 }, { "epoch": 20.61, "learning_rate": 4.7757676668536176e-05, "loss": 1.1684, "step": 74000 }, { "epoch": 20.75, "learning_rate": 4.774015002804263e-05, "loss": 1.165, "step": 74500 }, { "epoch": 20.89, "learning_rate": 4.7722623387549075e-05, "loss": 1.1602, "step": 75000 }, { "epoch": 21.02, "learning_rate": 4.770509674705553e-05, "loss": 1.1567, "step": 75500 }, { "epoch": 21.16, "learning_rate": 4.7687570106561975e-05, "loss": 1.1524, "step": 76000 }, { "epoch": 21.3, "learning_rate": 4.767007851934942e-05, "loss": 1.1502, "step": 76500 }, { "epoch": 21.44, "learning_rate": 4.7652551878855864e-05, "loss": 1.1442, "step": 77000 }, { "epoch": 21.58, "learning_rate": 4.763502523836231e-05, "loss": 1.1422, "step": 77500 }, { "epoch": 21.72, "learning_rate": 4.7617498597868764e-05, "loss": 1.1384, "step": 78000 }, { "epoch": 21.86, "learning_rate": 4.759997195737521e-05, "loss": 1.1361, "step": 78500 }, { "epoch": 22.0, "learning_rate": 4.758244531688166e-05, "loss": 1.1318, "step": 79000 } ], "max_steps": 1436400, "num_train_epochs": 400, "total_flos": 2.129238171230562e+19, "trial_name": null, "trial_params": null }