{ "best_metric": 1.0308473110198975, "best_model_checkpoint": "./models/adapters_mlm_wiki/mt/checkpoint-46500", "epoch": 36.41346906812843, "eval_steps": 500, "global_step": 46500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.39, "learning_rate": 4.9500000000000004e-05, "loss": 2.8769, "step": 500 }, { "epoch": 0.39, "eval_accuracy": 0.5941419312917663, "eval_loss": 2.3415136337280273, "eval_runtime": 28.5011, "eval_samples_per_second": 82.909, "eval_steps_per_second": 5.193, "step": 500 }, { "epoch": 0.78, "learning_rate": 4.9e-05, "loss": 2.3422, "step": 1000 }, { "epoch": 0.78, "eval_accuracy": 0.6323611764442354, "eval_loss": 2.056666851043701, "eval_runtime": 28.4052, "eval_samples_per_second": 83.189, "eval_steps_per_second": 5.21, "step": 1000 }, { "epoch": 1.17, "learning_rate": 4.85e-05, "loss": 2.1192, "step": 1500 }, { "epoch": 1.17, "eval_accuracy": 0.6535452883255145, "eval_loss": 1.896019458770752, "eval_runtime": 28.5579, "eval_samples_per_second": 82.744, "eval_steps_per_second": 5.182, "step": 1500 }, { "epoch": 1.57, "learning_rate": 4.8e-05, "loss": 1.9894, "step": 2000 }, { "epoch": 1.57, "eval_accuracy": 0.6694745466891896, "eval_loss": 1.786872148513794, "eval_runtime": 28.1815, "eval_samples_per_second": 83.849, "eval_steps_per_second": 5.252, "step": 2000 }, { "epoch": 1.96, "learning_rate": 4.75e-05, "loss": 1.8961, "step": 2500 }, { "epoch": 1.96, "eval_accuracy": 0.6795843328924064, "eval_loss": 1.7180945873260498, "eval_runtime": 28.0277, "eval_samples_per_second": 84.309, "eval_steps_per_second": 5.28, "step": 2500 }, { "epoch": 2.35, "learning_rate": 4.7e-05, "loss": 1.8258, "step": 3000 }, { "epoch": 2.35, "eval_accuracy": 0.6893365553356471, "eval_loss": 1.6541422605514526, "eval_runtime": 28.1485, "eval_samples_per_second": 83.948, "eval_steps_per_second": 5.258, "step": 3000 }, { "epoch": 2.74, "learning_rate": 4.6500000000000005e-05, "loss": 1.7648, "step": 3500 }, { "epoch": 2.74, "eval_accuracy": 0.699619942626494, "eval_loss": 1.5881444215774536, "eval_runtime": 28.0252, "eval_samples_per_second": 84.317, "eval_steps_per_second": 5.281, "step": 3500 }, { "epoch": 3.13, "learning_rate": 4.600000000000001e-05, "loss": 1.7159, "step": 4000 }, { "epoch": 3.13, "eval_accuracy": 0.7064951255671863, "eval_loss": 1.5544116497039795, "eval_runtime": 28.0414, "eval_samples_per_second": 84.268, "eval_steps_per_second": 5.278, "step": 4000 }, { "epoch": 3.52, "learning_rate": 4.55e-05, "loss": 1.6763, "step": 4500 }, { "epoch": 3.52, "eval_accuracy": 0.7100668501769564, "eval_loss": 1.5229085683822632, "eval_runtime": 28.0579, "eval_samples_per_second": 84.219, "eval_steps_per_second": 5.275, "step": 4500 }, { "epoch": 3.92, "learning_rate": 4.5e-05, "loss": 1.6338, "step": 5000 }, { "epoch": 3.92, "eval_accuracy": 0.7165721663953293, "eval_loss": 1.4794138669967651, "eval_runtime": 28.0413, "eval_samples_per_second": 84.269, "eval_steps_per_second": 5.278, "step": 5000 }, { "epoch": 4.31, "learning_rate": 4.4500000000000004e-05, "loss": 1.6, "step": 5500 }, { "epoch": 4.31, "eval_accuracy": 0.7222084994105429, "eval_loss": 1.445155382156372, "eval_runtime": 28.0278, "eval_samples_per_second": 84.309, "eval_steps_per_second": 5.28, "step": 5500 }, { "epoch": 4.7, "learning_rate": 4.4000000000000006e-05, "loss": 1.5832, "step": 6000 }, { "epoch": 4.7, "eval_accuracy": 0.7249569552951998, "eval_loss": 1.430212140083313, "eval_runtime": 28.0661, "eval_samples_per_second": 84.194, "eval_steps_per_second": 5.273, "step": 6000 }, { "epoch": 5.09, "learning_rate": 4.35e-05, "loss": 1.5532, "step": 6500 }, { "epoch": 5.09, "eval_accuracy": 0.7307575842539112, "eval_loss": 1.4013447761535645, "eval_runtime": 28.0058, "eval_samples_per_second": 84.375, "eval_steps_per_second": 5.285, "step": 6500 }, { "epoch": 5.48, "learning_rate": 4.3e-05, "loss": 1.5247, "step": 7000 }, { "epoch": 5.48, "eval_accuracy": 0.7324599757836674, "eval_loss": 1.3955962657928467, "eval_runtime": 28.0763, "eval_samples_per_second": 84.163, "eval_steps_per_second": 5.271, "step": 7000 }, { "epoch": 5.87, "learning_rate": 4.25e-05, "loss": 1.5103, "step": 7500 }, { "epoch": 5.87, "eval_accuracy": 0.7366787666935556, "eval_loss": 1.3597683906555176, "eval_runtime": 28.6566, "eval_samples_per_second": 82.459, "eval_steps_per_second": 5.165, "step": 7500 }, { "epoch": 6.26, "learning_rate": 4.2e-05, "loss": 1.4866, "step": 8000 }, { "epoch": 6.26, "eval_accuracy": 0.7400985509526404, "eval_loss": 1.3330906629562378, "eval_runtime": 28.5641, "eval_samples_per_second": 82.726, "eval_steps_per_second": 5.181, "step": 8000 }, { "epoch": 6.66, "learning_rate": 4.15e-05, "loss": 1.468, "step": 8500 }, { "epoch": 6.66, "eval_accuracy": 0.7428036442904253, "eval_loss": 1.3266334533691406, "eval_runtime": 28.221, "eval_samples_per_second": 83.732, "eval_steps_per_second": 5.244, "step": 8500 }, { "epoch": 7.05, "learning_rate": 4.1e-05, "loss": 1.4402, "step": 9000 }, { "epoch": 7.05, "eval_accuracy": 0.7456972113785166, "eval_loss": 1.3118966817855835, "eval_runtime": 28.4656, "eval_samples_per_second": 83.013, "eval_steps_per_second": 5.199, "step": 9000 }, { "epoch": 7.44, "learning_rate": 4.05e-05, "loss": 1.4255, "step": 9500 }, { "epoch": 7.44, "eval_accuracy": 0.7481133713927157, "eval_loss": 1.2967016696929932, "eval_runtime": 28.4058, "eval_samples_per_second": 83.187, "eval_steps_per_second": 5.21, "step": 9500 }, { "epoch": 7.83, "learning_rate": 4e-05, "loss": 1.4236, "step": 10000 }, { "epoch": 7.83, "eval_accuracy": 0.7516161978944533, "eval_loss": 1.277876615524292, "eval_runtime": 28.6033, "eval_samples_per_second": 82.613, "eval_steps_per_second": 5.174, "step": 10000 }, { "epoch": 8.22, "learning_rate": 3.9500000000000005e-05, "loss": 1.41, "step": 10500 }, { "epoch": 8.22, "eval_accuracy": 0.7543682399627304, "eval_loss": 1.2598024606704712, "eval_runtime": 28.471, "eval_samples_per_second": 82.997, "eval_steps_per_second": 5.198, "step": 10500 }, { "epoch": 8.61, "learning_rate": 3.9000000000000006e-05, "loss": 1.3994, "step": 11000 }, { "epoch": 8.61, "eval_accuracy": 0.7538934495456234, "eval_loss": 1.267708420753479, "eval_runtime": 28.0315, "eval_samples_per_second": 84.298, "eval_steps_per_second": 5.28, "step": 11000 }, { "epoch": 9.01, "learning_rate": 3.85e-05, "loss": 1.3809, "step": 11500 }, { "epoch": 9.01, "eval_accuracy": 0.7578680402421712, "eval_loss": 1.233376383781433, "eval_runtime": 28.1278, "eval_samples_per_second": 84.009, "eval_steps_per_second": 5.262, "step": 11500 }, { "epoch": 9.4, "learning_rate": 3.8e-05, "loss": 1.3689, "step": 12000 }, { "epoch": 9.4, "eval_accuracy": 0.7580960599828525, "eval_loss": 1.246765375137329, "eval_runtime": 28.0513, "eval_samples_per_second": 84.239, "eval_steps_per_second": 5.276, "step": 12000 }, { "epoch": 9.79, "learning_rate": 3.7500000000000003e-05, "loss": 1.3637, "step": 12500 }, { "epoch": 9.79, "eval_accuracy": 0.7588055474386927, "eval_loss": 1.2349480390548706, "eval_runtime": 28.062, "eval_samples_per_second": 84.206, "eval_steps_per_second": 5.274, "step": 12500 }, { "epoch": 10.18, "learning_rate": 3.7e-05, "loss": 1.3587, "step": 13000 }, { "epoch": 10.18, "eval_accuracy": 0.7624618002305978, "eval_loss": 1.2156881093978882, "eval_runtime": 28.0585, "eval_samples_per_second": 84.217, "eval_steps_per_second": 5.275, "step": 13000 }, { "epoch": 10.57, "learning_rate": 3.65e-05, "loss": 1.3397, "step": 13500 }, { "epoch": 10.57, "eval_accuracy": 0.7629879247402415, "eval_loss": 1.2054917812347412, "eval_runtime": 28.0637, "eval_samples_per_second": 84.201, "eval_steps_per_second": 5.274, "step": 13500 }, { "epoch": 10.96, "learning_rate": 3.6e-05, "loss": 1.3347, "step": 14000 }, { "epoch": 10.96, "eval_accuracy": 0.7653931854572328, "eval_loss": 1.1967546939849854, "eval_runtime": 28.1433, "eval_samples_per_second": 83.963, "eval_steps_per_second": 5.259, "step": 14000 }, { "epoch": 11.35, "learning_rate": 3.55e-05, "loss": 1.315, "step": 14500 }, { "epoch": 11.35, "eval_accuracy": 0.7652196884959334, "eval_loss": 1.1955384016036987, "eval_runtime": 28.0408, "eval_samples_per_second": 84.27, "eval_steps_per_second": 5.278, "step": 14500 }, { "epoch": 11.75, "learning_rate": 3.5e-05, "loss": 1.3246, "step": 15000 }, { "epoch": 11.75, "eval_accuracy": 0.7673783292570736, "eval_loss": 1.1885535717010498, "eval_runtime": 27.9933, "eval_samples_per_second": 84.413, "eval_steps_per_second": 5.287, "step": 15000 }, { "epoch": 12.14, "learning_rate": 3.45e-05, "loss": 1.3078, "step": 15500 }, { "epoch": 12.14, "eval_accuracy": 0.7660036105535808, "eval_loss": 1.1941598653793335, "eval_runtime": 28.0336, "eval_samples_per_second": 84.292, "eval_steps_per_second": 5.279, "step": 15500 }, { "epoch": 12.53, "learning_rate": 3.4000000000000007e-05, "loss": 1.2925, "step": 16000 }, { "epoch": 12.53, "eval_accuracy": 0.7677877852070641, "eval_loss": 1.184984803199768, "eval_runtime": 28.0377, "eval_samples_per_second": 84.279, "eval_steps_per_second": 5.279, "step": 16000 }, { "epoch": 12.92, "learning_rate": 3.35e-05, "loss": 1.3004, "step": 16500 }, { "epoch": 12.92, "eval_accuracy": 0.7692169811849188, "eval_loss": 1.1747151613235474, "eval_runtime": 28.5429, "eval_samples_per_second": 82.788, "eval_steps_per_second": 5.185, "step": 16500 }, { "epoch": 13.31, "learning_rate": 3.3e-05, "loss": 1.2911, "step": 17000 }, { "epoch": 13.31, "eval_accuracy": 0.7718838693754838, "eval_loss": 1.1590917110443115, "eval_runtime": 28.4837, "eval_samples_per_second": 82.96, "eval_steps_per_second": 5.196, "step": 17000 }, { "epoch": 13.7, "learning_rate": 3.2500000000000004e-05, "loss": 1.2786, "step": 17500 }, { "epoch": 13.7, "eval_accuracy": 0.7733724392801664, "eval_loss": 1.1601552963256836, "eval_runtime": 28.5301, "eval_samples_per_second": 82.825, "eval_steps_per_second": 5.188, "step": 17500 }, { "epoch": 14.1, "learning_rate": 3.2000000000000005e-05, "loss": 1.2771, "step": 18000 }, { "epoch": 14.1, "eval_accuracy": 0.7717249012568334, "eval_loss": 1.1596946716308594, "eval_runtime": 28.4816, "eval_samples_per_second": 82.966, "eval_steps_per_second": 5.196, "step": 18000 }, { "epoch": 14.49, "learning_rate": 3.15e-05, "loss": 1.2774, "step": 18500 }, { "epoch": 14.49, "eval_accuracy": 0.7723565394698514, "eval_loss": 1.1547273397445679, "eval_runtime": 28.7302, "eval_samples_per_second": 82.248, "eval_steps_per_second": 5.151, "step": 18500 }, { "epoch": 14.88, "learning_rate": 3.1e-05, "loss": 1.2652, "step": 19000 }, { "epoch": 14.88, "eval_accuracy": 0.7751180890584815, "eval_loss": 1.1402720212936401, "eval_runtime": 28.5346, "eval_samples_per_second": 82.812, "eval_steps_per_second": 5.187, "step": 19000 }, { "epoch": 15.27, "learning_rate": 3.05e-05, "loss": 1.262, "step": 19500 }, { "epoch": 15.27, "eval_accuracy": 0.7753582885297109, "eval_loss": 1.1397219896316528, "eval_runtime": 28.0724, "eval_samples_per_second": 84.175, "eval_steps_per_second": 5.272, "step": 19500 }, { "epoch": 15.66, "learning_rate": 3e-05, "loss": 1.2595, "step": 20000 }, { "epoch": 15.66, "eval_accuracy": 0.7777633992370622, "eval_loss": 1.1324734687805176, "eval_runtime": 28.005, "eval_samples_per_second": 84.378, "eval_steps_per_second": 5.285, "step": 20000 }, { "epoch": 16.05, "learning_rate": 2.95e-05, "loss": 1.2544, "step": 20500 }, { "epoch": 16.05, "eval_accuracy": 0.7759057248886968, "eval_loss": 1.1385127305984497, "eval_runtime": 28.0652, "eval_samples_per_second": 84.197, "eval_steps_per_second": 5.273, "step": 20500 }, { "epoch": 16.44, "learning_rate": 2.9e-05, "loss": 1.2424, "step": 21000 }, { "epoch": 16.44, "eval_accuracy": 0.7774187044235099, "eval_loss": 1.1291025876998901, "eval_runtime": 28.0205, "eval_samples_per_second": 84.331, "eval_steps_per_second": 5.282, "step": 21000 }, { "epoch": 16.84, "learning_rate": 2.8499999999999998e-05, "loss": 1.2361, "step": 21500 }, { "epoch": 16.84, "eval_accuracy": 0.7781849717450904, "eval_loss": 1.133793830871582, "eval_runtime": 28.0578, "eval_samples_per_second": 84.219, "eval_steps_per_second": 5.275, "step": 21500 }, { "epoch": 17.23, "learning_rate": 2.8000000000000003e-05, "loss": 1.2325, "step": 22000 }, { "epoch": 17.23, "eval_accuracy": 0.7817587017636397, "eval_loss": 1.1080787181854248, "eval_runtime": 28.073, "eval_samples_per_second": 84.173, "eval_steps_per_second": 5.272, "step": 22000 }, { "epoch": 17.62, "learning_rate": 2.7500000000000004e-05, "loss": 1.236, "step": 22500 }, { "epoch": 17.62, "eval_accuracy": 0.7789308966943304, "eval_loss": 1.1161266565322876, "eval_runtime": 28.1033, "eval_samples_per_second": 84.083, "eval_steps_per_second": 5.266, "step": 22500 }, { "epoch": 18.01, "learning_rate": 2.7000000000000002e-05, "loss": 1.2284, "step": 23000 }, { "epoch": 18.01, "eval_accuracy": 0.7808533500481296, "eval_loss": 1.1149948835372925, "eval_runtime": 28.0618, "eval_samples_per_second": 84.207, "eval_steps_per_second": 5.274, "step": 23000 }, { "epoch": 18.4, "learning_rate": 2.6500000000000004e-05, "loss": 1.2267, "step": 23500 }, { "epoch": 18.4, "eval_accuracy": 0.7830957515333128, "eval_loss": 1.1000680923461914, "eval_runtime": 28.068, "eval_samples_per_second": 84.189, "eval_steps_per_second": 5.273, "step": 23500 }, { "epoch": 18.79, "learning_rate": 2.6000000000000002e-05, "loss": 1.2151, "step": 24000 }, { "epoch": 18.79, "eval_accuracy": 0.7828555991422734, "eval_loss": 1.1053537130355835, "eval_runtime": 28.1255, "eval_samples_per_second": 84.016, "eval_steps_per_second": 5.262, "step": 24000 }, { "epoch": 19.19, "learning_rate": 2.5500000000000003e-05, "loss": 1.2197, "step": 24500 }, { "epoch": 19.19, "eval_accuracy": 0.7813697277709529, "eval_loss": 1.1096093654632568, "eval_runtime": 28.0667, "eval_samples_per_second": 84.192, "eval_steps_per_second": 5.273, "step": 24500 }, { "epoch": 19.58, "learning_rate": 2.5e-05, "loss": 1.2226, "step": 25000 }, { "epoch": 19.58, "eval_accuracy": 0.7815349331543324, "eval_loss": 1.1098133325576782, "eval_runtime": 28.7846, "eval_samples_per_second": 82.093, "eval_steps_per_second": 5.142, "step": 25000 }, { "epoch": 19.97, "learning_rate": 2.45e-05, "loss": 1.2101, "step": 25500 }, { "epoch": 19.97, "eval_accuracy": 0.7840432352180319, "eval_loss": 1.0962213277816772, "eval_runtime": 28.4856, "eval_samples_per_second": 82.954, "eval_steps_per_second": 5.196, "step": 25500 }, { "epoch": 20.36, "learning_rate": 2.4e-05, "loss": 1.2102, "step": 26000 }, { "epoch": 20.36, "eval_accuracy": 0.7846640600019074, "eval_loss": 1.0919607877731323, "eval_runtime": 28.86, "eval_samples_per_second": 81.878, "eval_steps_per_second": 5.128, "step": 26000 }, { "epoch": 20.75, "learning_rate": 2.35e-05, "loss": 1.2003, "step": 26500 }, { "epoch": 20.75, "eval_accuracy": 0.7862929811034005, "eval_loss": 1.0827641487121582, "eval_runtime": 28.6668, "eval_samples_per_second": 82.43, "eval_steps_per_second": 5.163, "step": 26500 }, { "epoch": 21.14, "learning_rate": 2.3000000000000003e-05, "loss": 1.1912, "step": 27000 }, { "epoch": 21.14, "eval_accuracy": 0.7853952245447887, "eval_loss": 1.0885553359985352, "eval_runtime": 28.7209, "eval_samples_per_second": 82.275, "eval_steps_per_second": 5.153, "step": 27000 }, { "epoch": 21.53, "learning_rate": 2.25e-05, "loss": 1.1987, "step": 27500 }, { "epoch": 21.53, "eval_accuracy": 0.7859570475981822, "eval_loss": 1.0860474109649658, "eval_runtime": 28.6338, "eval_samples_per_second": 82.525, "eval_steps_per_second": 5.169, "step": 27500 }, { "epoch": 21.93, "learning_rate": 2.2000000000000003e-05, "loss": 1.2072, "step": 28000 }, { "epoch": 21.93, "eval_accuracy": 0.7859265315328015, "eval_loss": 1.081165075302124, "eval_runtime": 28.0483, "eval_samples_per_second": 84.248, "eval_steps_per_second": 5.277, "step": 28000 }, { "epoch": 22.32, "learning_rate": 2.15e-05, "loss": 1.1894, "step": 28500 }, { "epoch": 22.32, "eval_accuracy": 0.7858167357568326, "eval_loss": 1.081618309020996, "eval_runtime": 28.0759, "eval_samples_per_second": 84.165, "eval_steps_per_second": 5.271, "step": 28500 }, { "epoch": 22.71, "learning_rate": 2.1e-05, "loss": 1.2031, "step": 29000 }, { "epoch": 22.71, "eval_accuracy": 0.7873750982373414, "eval_loss": 1.0770971775054932, "eval_runtime": 28.0208, "eval_samples_per_second": 84.33, "eval_steps_per_second": 5.282, "step": 29000 }, { "epoch": 23.1, "learning_rate": 2.05e-05, "loss": 1.1819, "step": 29500 }, { "epoch": 23.1, "eval_accuracy": 0.7881185786710934, "eval_loss": 1.0673798322677612, "eval_runtime": 28.1231, "eval_samples_per_second": 84.023, "eval_steps_per_second": 5.263, "step": 29500 }, { "epoch": 23.49, "learning_rate": 2e-05, "loss": 1.185, "step": 30000 }, { "epoch": 23.49, "eval_accuracy": 0.787878617967322, "eval_loss": 1.0761115550994873, "eval_runtime": 28.1342, "eval_samples_per_second": 83.99, "eval_steps_per_second": 5.261, "step": 30000 }, { "epoch": 23.88, "learning_rate": 1.9500000000000003e-05, "loss": 1.1873, "step": 30500 }, { "epoch": 23.88, "eval_accuracy": 0.7891809994107909, "eval_loss": 1.0697174072265625, "eval_runtime": 28.162, "eval_samples_per_second": 83.907, "eval_steps_per_second": 5.255, "step": 30500 }, { "epoch": 24.28, "learning_rate": 1.9e-05, "loss": 1.1793, "step": 31000 }, { "epoch": 24.28, "eval_accuracy": 0.7883963056255248, "eval_loss": 1.0705946683883667, "eval_runtime": 28.052, "eval_samples_per_second": 84.236, "eval_steps_per_second": 5.276, "step": 31000 }, { "epoch": 24.67, "learning_rate": 1.85e-05, "loss": 1.1793, "step": 31500 }, { "epoch": 24.67, "eval_accuracy": 0.7898969660676408, "eval_loss": 1.0621615648269653, "eval_runtime": 28.1123, "eval_samples_per_second": 84.056, "eval_steps_per_second": 5.265, "step": 31500 }, { "epoch": 25.06, "learning_rate": 1.8e-05, "loss": 1.1748, "step": 32000 }, { "epoch": 25.06, "eval_accuracy": 0.7894267444469374, "eval_loss": 1.0629585981369019, "eval_runtime": 28.034, "eval_samples_per_second": 84.29, "eval_steps_per_second": 5.279, "step": 32000 }, { "epoch": 25.45, "learning_rate": 1.75e-05, "loss": 1.1701, "step": 32500 }, { "epoch": 25.45, "eval_accuracy": 0.7889187362824543, "eval_loss": 1.0643483400344849, "eval_runtime": 28.053, "eval_samples_per_second": 84.233, "eval_steps_per_second": 5.276, "step": 32500 }, { "epoch": 25.84, "learning_rate": 1.7000000000000003e-05, "loss": 1.1678, "step": 33000 }, { "epoch": 25.84, "eval_accuracy": 0.7906348441195924, "eval_loss": 1.0566548109054565, "eval_runtime": 28.02, "eval_samples_per_second": 84.333, "eval_steps_per_second": 5.282, "step": 33000 }, { "epoch": 26.23, "learning_rate": 1.65e-05, "loss": 1.177, "step": 33500 }, { "epoch": 26.23, "eval_accuracy": 0.7885541594478366, "eval_loss": 1.0659528970718384, "eval_runtime": 28.9284, "eval_samples_per_second": 81.684, "eval_steps_per_second": 5.116, "step": 33500 }, { "epoch": 26.62, "learning_rate": 1.6000000000000003e-05, "loss": 1.1749, "step": 34000 }, { "epoch": 26.62, "eval_accuracy": 0.7910553900598024, "eval_loss": 1.065231204032898, "eval_runtime": 28.6183, "eval_samples_per_second": 82.57, "eval_steps_per_second": 5.172, "step": 34000 }, { "epoch": 27.02, "learning_rate": 1.55e-05, "loss": 1.1623, "step": 34500 }, { "epoch": 27.02, "eval_accuracy": 0.7923780881255506, "eval_loss": 1.0435874462127686, "eval_runtime": 28.7279, "eval_samples_per_second": 82.254, "eval_steps_per_second": 5.152, "step": 34500 }, { "epoch": 27.41, "learning_rate": 1.5e-05, "loss": 1.1647, "step": 35000 }, { "epoch": 27.41, "eval_accuracy": 0.7872681693016056, "eval_loss": 1.076871395111084, "eval_runtime": 28.6193, "eval_samples_per_second": 82.567, "eval_steps_per_second": 5.171, "step": 35000 }, { "epoch": 27.8, "learning_rate": 1.45e-05, "loss": 1.1692, "step": 35500 }, { "epoch": 27.8, "eval_accuracy": 0.7918323947544599, "eval_loss": 1.0473923683166504, "eval_runtime": 28.1261, "eval_samples_per_second": 84.015, "eval_steps_per_second": 5.262, "step": 35500 }, { "epoch": 28.19, "learning_rate": 1.4000000000000001e-05, "loss": 1.1572, "step": 36000 }, { "epoch": 28.19, "eval_accuracy": 0.7922499002579246, "eval_loss": 1.04538094997406, "eval_runtime": 28.3784, "eval_samples_per_second": 83.268, "eval_steps_per_second": 5.215, "step": 36000 }, { "epoch": 28.58, "learning_rate": 1.3500000000000001e-05, "loss": 1.1612, "step": 36500 }, { "epoch": 28.58, "eval_accuracy": 0.7915871161923587, "eval_loss": 1.0553712844848633, "eval_runtime": 28.5792, "eval_samples_per_second": 82.682, "eval_steps_per_second": 5.179, "step": 36500 }, { "epoch": 28.97, "learning_rate": 1.3000000000000001e-05, "loss": 1.1626, "step": 37000 }, { "epoch": 28.97, "eval_accuracy": 0.7918068040808763, "eval_loss": 1.049229621887207, "eval_runtime": 28.1968, "eval_samples_per_second": 83.804, "eval_steps_per_second": 5.249, "step": 37000 }, { "epoch": 29.37, "learning_rate": 1.25e-05, "loss": 1.1613, "step": 37500 }, { "epoch": 29.37, "eval_accuracy": 0.7909290923508192, "eval_loss": 1.0585567951202393, "eval_runtime": 28.1829, "eval_samples_per_second": 83.845, "eval_steps_per_second": 5.251, "step": 37500 }, { "epoch": 29.76, "learning_rate": 1.2e-05, "loss": 1.146, "step": 38000 }, { "epoch": 29.76, "eval_accuracy": 0.7918408217683095, "eval_loss": 1.0470130443572998, "eval_runtime": 28.08, "eval_samples_per_second": 84.153, "eval_steps_per_second": 5.271, "step": 38000 }, { "epoch": 30.15, "learning_rate": 1.1500000000000002e-05, "loss": 1.1558, "step": 38500 }, { "epoch": 30.15, "eval_accuracy": 0.792069691940612, "eval_loss": 1.0529680252075195, "eval_runtime": 28.2018, "eval_samples_per_second": 83.789, "eval_steps_per_second": 5.248, "step": 38500 }, { "epoch": 30.54, "learning_rate": 1.1000000000000001e-05, "loss": 1.1553, "step": 39000 }, { "epoch": 30.54, "eval_accuracy": 0.7910355486862441, "eval_loss": 1.0474281311035156, "eval_runtime": 28.1832, "eval_samples_per_second": 83.844, "eval_steps_per_second": 5.251, "step": 39000 }, { "epoch": 30.93, "learning_rate": 1.05e-05, "loss": 1.1543, "step": 39500 }, { "epoch": 30.93, "eval_accuracy": 0.791970271873566, "eval_loss": 1.0445611476898193, "eval_runtime": 28.122, "eval_samples_per_second": 84.027, "eval_steps_per_second": 5.263, "step": 39500 }, { "epoch": 31.32, "learning_rate": 1e-05, "loss": 1.1523, "step": 40000 }, { "epoch": 31.32, "eval_accuracy": 0.791586566920167, "eval_loss": 1.0520665645599365, "eval_runtime": 28.1029, "eval_samples_per_second": 84.084, "eval_steps_per_second": 5.266, "step": 40000 }, { "epoch": 31.71, "learning_rate": 9.5e-06, "loss": 1.1529, "step": 40500 }, { "epoch": 31.71, "eval_accuracy": 0.7923261955625831, "eval_loss": 1.0489442348480225, "eval_runtime": 28.0946, "eval_samples_per_second": 84.109, "eval_steps_per_second": 5.268, "step": 40500 }, { "epoch": 32.11, "learning_rate": 9e-06, "loss": 1.1528, "step": 41000 }, { "epoch": 32.11, "eval_accuracy": 0.7929594205173631, "eval_loss": 1.0407068729400635, "eval_runtime": 28.0625, "eval_samples_per_second": 84.205, "eval_steps_per_second": 5.274, "step": 41000 }, { "epoch": 32.5, "learning_rate": 8.500000000000002e-06, "loss": 1.1532, "step": 41500 }, { "epoch": 32.5, "eval_accuracy": 0.7942555258001673, "eval_loss": 1.0386089086532593, "eval_runtime": 28.1339, "eval_samples_per_second": 83.991, "eval_steps_per_second": 5.261, "step": 41500 }, { "epoch": 32.89, "learning_rate": 8.000000000000001e-06, "loss": 1.1415, "step": 42000 }, { "epoch": 32.89, "eval_accuracy": 0.7913391756430352, "eval_loss": 1.0489261150360107, "eval_runtime": 28.1217, "eval_samples_per_second": 84.028, "eval_steps_per_second": 5.263, "step": 42000 }, { "epoch": 33.28, "learning_rate": 7.5e-06, "loss": 1.1509, "step": 42500 }, { "epoch": 33.28, "eval_accuracy": 0.7940224033160062, "eval_loss": 1.0355397462844849, "eval_runtime": 28.5372, "eval_samples_per_second": 82.804, "eval_steps_per_second": 5.186, "step": 42500 }, { "epoch": 33.67, "learning_rate": 7.000000000000001e-06, "loss": 1.1484, "step": 43000 }, { "epoch": 33.67, "eval_accuracy": 0.7930596246956346, "eval_loss": 1.0375429391860962, "eval_runtime": 28.6819, "eval_samples_per_second": 82.386, "eval_steps_per_second": 5.16, "step": 43000 }, { "epoch": 34.06, "learning_rate": 6.5000000000000004e-06, "loss": 1.1434, "step": 43500 }, { "epoch": 34.06, "eval_accuracy": 0.7927762712812054, "eval_loss": 1.0431498289108276, "eval_runtime": 28.0708, "eval_samples_per_second": 84.18, "eval_steps_per_second": 5.272, "step": 43500 }, { "epoch": 34.46, "learning_rate": 6e-06, "loss": 1.1464, "step": 44000 }, { "epoch": 34.46, "eval_accuracy": 0.794945162432753, "eval_loss": 1.034825086593628, "eval_runtime": 28.0218, "eval_samples_per_second": 84.327, "eval_steps_per_second": 5.282, "step": 44000 }, { "epoch": 34.85, "learning_rate": 5.500000000000001e-06, "loss": 1.1394, "step": 44500 }, { "epoch": 34.85, "eval_accuracy": 0.7927448337825697, "eval_loss": 1.051361322402954, "eval_runtime": 28.0137, "eval_samples_per_second": 84.352, "eval_steps_per_second": 5.283, "step": 44500 }, { "epoch": 35.24, "learning_rate": 5e-06, "loss": 1.1418, "step": 45000 }, { "epoch": 35.24, "eval_accuracy": 0.7933213503342134, "eval_loss": 1.0428730249404907, "eval_runtime": 28.0418, "eval_samples_per_second": 84.267, "eval_steps_per_second": 5.278, "step": 45000 }, { "epoch": 35.63, "learning_rate": 4.5e-06, "loss": 1.1453, "step": 45500 }, { "epoch": 35.63, "eval_accuracy": 0.7942262297649546, "eval_loss": 1.0422593355178833, "eval_runtime": 27.9895, "eval_samples_per_second": 84.424, "eval_steps_per_second": 5.288, "step": 45500 }, { "epoch": 36.02, "learning_rate": 4.000000000000001e-06, "loss": 1.1411, "step": 46000 }, { "epoch": 36.02, "eval_accuracy": 0.794925731466523, "eval_loss": 1.035831093788147, "eval_runtime": 28.0055, "eval_samples_per_second": 84.376, "eval_steps_per_second": 5.285, "step": 46000 }, { "epoch": 36.41, "learning_rate": 3.5000000000000004e-06, "loss": 1.1434, "step": 46500 }, { "epoch": 36.41, "eval_accuracy": 0.7954393439490088, "eval_loss": 1.0308473110198975, "eval_runtime": 28.0644, "eval_samples_per_second": 84.199, "eval_steps_per_second": 5.274, "step": 46500 } ], "logging_steps": 500, "max_steps": 50000, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 500, "total_flos": 1.980436047640658e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }