{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998971722365039, "global_step": 4862, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 0, "loss": 10.4794, "step": 1 }, { "epoch": 0.0, "learning_rate": 5.855516318488718e-05, "loss": 10.195, "step": 10 }, { "epoch": 0.0, "learning_rate": 9.852528029402888e-05, "loss": 9.0314, "step": 20 }, { "epoch": 0.01, "learning_rate": 0.00011711032636977435, "loss": 8.3018, "step": 30 }, { "epoch": 0.01, "learning_rate": 0.00012935198326770444, "loss": 8.0744, "step": 40 }, { "epoch": 0.01, "learning_rate": 0.00013849539740317057, "loss": 7.8706, "step": 50 }, { "epoch": 0.01, "eval_accuracy": 0.11661608233036805, "eval_loss": 7.881457328796387, "eval_runtime": 10.388, "eval_samples_per_second": 3.369, "eval_steps_per_second": 0.193, "step": 50 }, { "epoch": 0.01, "learning_rate": 0.00014579627258973804, "loss": 7.6597, "step": 60 }, { "epoch": 0.01, "learning_rate": 0.0001518740958549795, "loss": 7.4752, "step": 70 }, { "epoch": 0.02, "learning_rate": 0.00015708044347891603, "loss": 7.3497, "step": 80 }, { "epoch": 0.02, "learning_rate": 0.0001616341750666857, "loss": 7.256, "step": 90 }, { "epoch": 0.02, "learning_rate": 0.0001656808271083118, "loss": 7.1325, "step": 100 }, { "epoch": 0.02, "eval_accuracy": 0.1523669980812838, "eval_loss": 7.2515740394592285, "eval_runtime": 10.3168, "eval_samples_per_second": 3.393, "eval_steps_per_second": 0.194, "step": 100 }, { "epoch": 0.02, "learning_rate": 0.00016932210037684612, "loss": 7.052, "step": 110 }, { "epoch": 0.02, "learning_rate": 0.0001723141171704594, "loss": 6.9402, "step": 120 }, { "epoch": 0.03, "learning_rate": 0.00017537326012423199, "loss": 6.8498, "step": 130 }, { "epoch": 0.03, "learning_rate": 0.0001781950123852968, "loss": 6.7319, "step": 140 }, { "epoch": 0.03, "learning_rate": 0.00018056004207494319, "loss": 6.6609, "step": 150 }, { "epoch": 0.03, "eval_accuracy": 0.17777080062794348, "eval_loss": 6.819276809692383, "eval_runtime": 10.3083, "eval_samples_per_second": 3.395, "eval_steps_per_second": 0.194, "step": 150 }, { "epoch": 0.03, "learning_rate": 0.00018301924610008189, "loss": 6.5659, "step": 160 }, { "epoch": 0.03, "learning_rate": 0.00018532269677939782, "loss": 6.4576, "step": 170 }, { "epoch": 0.04, "learning_rate": 0.00018748895370481112, "loss": 6.4062, "step": 180 }, { "epoch": 0.04, "learning_rate": 0.00018953344483335556, "loss": 6.2343, "step": 190 }, { "epoch": 0.04, "learning_rate": 0.00019146913367833817, "loss": 6.0795, "step": 200 }, { "epoch": 0.04, "eval_accuracy": 0.2014233385661957, "eval_loss": 6.384451389312744, "eval_runtime": 10.3147, "eval_samples_per_second": 3.393, "eval_steps_per_second": 0.194, "step": 200 }, { "epoch": 0.04, "learning_rate": 0.00019330701776944063, "loss": 6.0014, "step": 210 }, { "epoch": 0.05, "learning_rate": 0.00019505650713185044, "loss": 5.9148, "step": 220 }, { "epoch": 0.05, "learning_rate": 0.00019672571585424665, "loss": 5.7346, "step": 230 }, { "epoch": 0.05, "learning_rate": 0.00019832168964685297, "loss": 5.6229, "step": 240 }, { "epoch": 0.05, "learning_rate": 0.0001998505855457085, "loss": 5.5911, "step": 250 }, { "epoch": 0.05, "eval_accuracy": 0.2269108669108669, "eval_loss": 5.930122375488281, "eval_runtime": 10.2989, "eval_samples_per_second": 3.398, "eval_steps_per_second": 0.194, "step": 250 }, { "epoch": 0.05, "learning_rate": 0.0001996535296665223, "loss": 5.4576, "step": 260 }, { "epoch": 0.06, "learning_rate": 0.0001992204417496752, "loss": 5.4233, "step": 270 }, { "epoch": 0.06, "learning_rate": 0.00019878735383282807, "loss": 5.3452, "step": 280 }, { "epoch": 0.06, "learning_rate": 0.00019835426591598097, "loss": 5.2178, "step": 290 }, { "epoch": 0.06, "learning_rate": 0.00019792117799913384, "loss": 5.0416, "step": 300 }, { "epoch": 0.06, "eval_accuracy": 0.25283097854526426, "eval_loss": 5.53858757019043, "eval_runtime": 10.2964, "eval_samples_per_second": 3.399, "eval_steps_per_second": 0.194, "step": 300 }, { "epoch": 0.06, "learning_rate": 0.0001974880900822867, "loss": 4.9763, "step": 310 }, { "epoch": 0.07, "learning_rate": 0.00019705500216543958, "loss": 4.9485, "step": 320 }, { "epoch": 0.07, "learning_rate": 0.00019662191424859245, "loss": 4.7805, "step": 330 }, { "epoch": 0.07, "learning_rate": 0.00019618882633174535, "loss": 4.7348, "step": 340 }, { "epoch": 0.07, "learning_rate": 0.00019575573841489822, "loss": 4.6488, "step": 350 }, { "epoch": 0.07, "eval_accuracy": 0.280118611547183, "eval_loss": 5.200571060180664, "eval_runtime": 10.3088, "eval_samples_per_second": 3.395, "eval_steps_per_second": 0.194, "step": 350 }, { "epoch": 0.07, "learning_rate": 0.00019532265049805112, "loss": 4.5881, "step": 360 }, { "epoch": 0.08, "learning_rate": 0.000194889562581204, "loss": 4.5292, "step": 370 }, { "epoch": 0.08, "learning_rate": 0.0001944564746643569, "loss": 4.4588, "step": 380 }, { "epoch": 0.08, "learning_rate": 0.00019402338674750976, "loss": 4.3398, "step": 390 }, { "epoch": 0.08, "learning_rate": 0.00019359029883066263, "loss": 4.2704, "step": 400 }, { "epoch": 0.08, "eval_accuracy": 0.2968986568986569, "eval_loss": 4.920831680297852, "eval_runtime": 10.3195, "eval_samples_per_second": 3.392, "eval_steps_per_second": 0.194, "step": 400 }, { "epoch": 0.08, "learning_rate": 0.0001931572109138155, "loss": 4.3331, "step": 410 }, { "epoch": 0.09, "learning_rate": 0.0001927241229969684, "loss": 4.2611, "step": 420 }, { "epoch": 0.09, "learning_rate": 0.00019229103508012127, "loss": 4.1317, "step": 430 }, { "epoch": 0.09, "learning_rate": 0.00019185794716327414, "loss": 4.1948, "step": 440 }, { "epoch": 0.09, "learning_rate": 0.00019142485924642704, "loss": 4.1254, "step": 450 }, { "epoch": 0.09, "eval_accuracy": 0.306959706959707, "eval_loss": 4.744773864746094, "eval_runtime": 10.2994, "eval_samples_per_second": 3.398, "eval_steps_per_second": 0.194, "step": 450 }, { "epoch": 0.09, "learning_rate": 0.0001909917713295799, "loss": 4.1134, "step": 460 }, { "epoch": 0.1, "learning_rate": 0.0001905586834127328, "loss": 4.0577, "step": 470 }, { "epoch": 0.1, "learning_rate": 0.00019012559549588568, "loss": 4.0536, "step": 480 }, { "epoch": 0.1, "learning_rate": 0.00018969250757903855, "loss": 4.0161, "step": 490 }, { "epoch": 0.1, "learning_rate": 0.00018925941966219142, "loss": 3.9946, "step": 500 }, { "epoch": 0.1, "eval_accuracy": 0.3148438862724577, "eval_loss": 4.626054286956787, "eval_runtime": 10.2947, "eval_samples_per_second": 3.4, "eval_steps_per_second": 0.194, "step": 500 }, { "epoch": 0.1, "learning_rate": 0.00018882633174534431, "loss": 4.0105, "step": 510 }, { "epoch": 0.11, "learning_rate": 0.00018839324382849718, "loss": 3.9675, "step": 520 }, { "epoch": 0.11, "learning_rate": 0.00018796015591165008, "loss": 3.9762, "step": 530 }, { "epoch": 0.11, "learning_rate": 0.00018752706799480295, "loss": 3.8776, "step": 540 }, { "epoch": 0.11, "learning_rate": 0.00018709398007795585, "loss": 3.9233, "step": 550 }, { "epoch": 0.11, "eval_accuracy": 0.3190301761730333, "eval_loss": 4.543696880340576, "eval_runtime": 10.3044, "eval_samples_per_second": 3.397, "eval_steps_per_second": 0.194, "step": 550 }, { "epoch": 0.12, "learning_rate": 0.00018666089216110872, "loss": 3.9052, "step": 560 }, { "epoch": 0.12, "learning_rate": 0.0001862278042442616, "loss": 3.8801, "step": 570 }, { "epoch": 0.12, "learning_rate": 0.00018579471632741446, "loss": 3.8455, "step": 580 }, { "epoch": 0.12, "learning_rate": 0.00018536162841056733, "loss": 3.8847, "step": 590 }, { "epoch": 0.12, "learning_rate": 0.00018492854049372023, "loss": 3.8564, "step": 600 }, { "epoch": 0.12, "eval_accuracy": 0.3223233908948195, "eval_loss": 4.484829425811768, "eval_runtime": 10.3163, "eval_samples_per_second": 3.393, "eval_steps_per_second": 0.194, "step": 600 }, { "epoch": 0.13, "learning_rate": 0.0001844954525768731, "loss": 3.8747, "step": 610 }, { "epoch": 0.13, "learning_rate": 0.000184062364660026, "loss": 3.8457, "step": 620 }, { "epoch": 0.13, "learning_rate": 0.00018362927674317887, "loss": 3.8098, "step": 630 }, { "epoch": 0.13, "learning_rate": 0.00018319618882633177, "loss": 3.8504, "step": 640 }, { "epoch": 0.13, "learning_rate": 0.00018276310090948464, "loss": 3.8044, "step": 650 }, { "epoch": 0.13, "eval_accuracy": 0.3253096110238967, "eval_loss": 4.4368672370910645, "eval_runtime": 10.3128, "eval_samples_per_second": 3.394, "eval_steps_per_second": 0.194, "step": 650 }, { "epoch": 0.14, "learning_rate": 0.00018233001299263754, "loss": 3.8319, "step": 660 }, { "epoch": 0.14, "learning_rate": 0.00018189692507579038, "loss": 3.7511, "step": 670 }, { "epoch": 0.14, "learning_rate": 0.00018146383715894328, "loss": 3.7444, "step": 680 }, { "epoch": 0.14, "learning_rate": 0.00018103074924209615, "loss": 3.7481, "step": 690 }, { "epoch": 0.14, "learning_rate": 0.00018059766132524902, "loss": 3.7818, "step": 700 }, { "epoch": 0.14, "eval_accuracy": 0.32684458398744115, "eval_loss": 4.409779071807861, "eval_runtime": 10.3206, "eval_samples_per_second": 3.391, "eval_steps_per_second": 0.194, "step": 700 }, { "epoch": 0.15, "learning_rate": 0.00018016457340840192, "loss": 3.7294, "step": 710 }, { "epoch": 0.15, "learning_rate": 0.00017973148549155479, "loss": 3.6994, "step": 720 }, { "epoch": 0.15, "learning_rate": 0.00017929839757470768, "loss": 3.8228, "step": 730 }, { "epoch": 0.15, "learning_rate": 0.00017886530965786055, "loss": 3.7021, "step": 740 }, { "epoch": 0.15, "learning_rate": 0.00017843222174101345, "loss": 3.7394, "step": 750 }, { "epoch": 0.15, "eval_accuracy": 0.3294819466248038, "eval_loss": 4.378809928894043, "eval_runtime": 10.3201, "eval_samples_per_second": 3.391, "eval_steps_per_second": 0.194, "step": 750 }, { "epoch": 0.16, "learning_rate": 0.00017799913382416632, "loss": 3.723, "step": 760 }, { "epoch": 0.16, "learning_rate": 0.0001775660459073192, "loss": 3.7536, "step": 770 }, { "epoch": 0.16, "learning_rate": 0.00017713295799047206, "loss": 3.6932, "step": 780 }, { "epoch": 0.16, "learning_rate": 0.00017669987007362496, "loss": 3.675, "step": 790 }, { "epoch": 0.16, "learning_rate": 0.00017626678215677783, "loss": 3.6735, "step": 800 }, { "epoch": 0.16, "eval_accuracy": 0.3317425431711146, "eval_loss": 4.348150253295898, "eval_runtime": 10.3035, "eval_samples_per_second": 3.397, "eval_steps_per_second": 0.194, "step": 800 }, { "epoch": 0.17, "learning_rate": 0.00017583369423993073, "loss": 3.6496, "step": 810 }, { "epoch": 0.17, "learning_rate": 0.0001754006063230836, "loss": 3.6922, "step": 820 }, { "epoch": 0.17, "learning_rate": 0.00017496751840623647, "loss": 3.6704, "step": 830 }, { "epoch": 0.17, "learning_rate": 0.00017453443048938937, "loss": 3.5991, "step": 840 }, { "epoch": 0.17, "learning_rate": 0.00017410134257254224, "loss": 3.6736, "step": 850 }, { "epoch": 0.17, "eval_accuracy": 0.3330821559392988, "eval_loss": 4.325968265533447, "eval_runtime": 10.3042, "eval_samples_per_second": 3.397, "eval_steps_per_second": 0.194, "step": 850 }, { "epoch": 0.18, "learning_rate": 0.0001736682546556951, "loss": 3.6847, "step": 860 }, { "epoch": 0.18, "learning_rate": 0.00017323516673884798, "loss": 3.6526, "step": 870 }, { "epoch": 0.18, "learning_rate": 0.00017280207882200088, "loss": 3.6376, "step": 880 }, { "epoch": 0.18, "learning_rate": 0.00017236899090515375, "loss": 3.6258, "step": 890 }, { "epoch": 0.19, "learning_rate": 0.00017193590298830665, "loss": 3.5848, "step": 900 }, { "epoch": 0.19, "eval_accuracy": 0.3351753008895866, "eval_loss": 4.298619747161865, "eval_runtime": 10.2928, "eval_samples_per_second": 3.4, "eval_steps_per_second": 0.194, "step": 900 }, { "epoch": 0.19, "learning_rate": 0.00017150281507145952, "loss": 3.6342, "step": 910 }, { "epoch": 0.19, "learning_rate": 0.00017106972715461241, "loss": 3.5997, "step": 920 }, { "epoch": 0.19, "learning_rate": 0.00017063663923776528, "loss": 3.6412, "step": 930 }, { "epoch": 0.19, "learning_rate": 0.00017020355132091815, "loss": 3.6093, "step": 940 }, { "epoch": 0.2, "learning_rate": 0.00016977046340407103, "loss": 3.6733, "step": 950 }, { "epoch": 0.2, "eval_accuracy": 0.33535670678527824, "eval_loss": 4.287267208099365, "eval_runtime": 10.2981, "eval_samples_per_second": 3.399, "eval_steps_per_second": 0.194, "step": 950 }, { "epoch": 0.2, "learning_rate": 0.0001693373754872239, "loss": 3.6504, "step": 960 }, { "epoch": 0.2, "learning_rate": 0.0001689042875703768, "loss": 3.6471, "step": 970 }, { "epoch": 0.2, "learning_rate": 0.00016847119965352966, "loss": 3.652, "step": 980 }, { "epoch": 0.2, "learning_rate": 0.00016803811173668256, "loss": 3.585, "step": 990 }, { "epoch": 0.21, "learning_rate": 0.00016760502381983543, "loss": 3.5897, "step": 1000 }, { "epoch": 0.21, "eval_accuracy": 0.33716378859236, "eval_loss": 4.26666784286499, "eval_runtime": 10.2978, "eval_samples_per_second": 3.399, "eval_steps_per_second": 0.194, "step": 1000 }, { "epoch": 0.21, "learning_rate": 0.00016717193590298833, "loss": 3.6133, "step": 1010 }, { "epoch": 0.21, "learning_rate": 0.0001667388479861412, "loss": 3.6513, "step": 1020 }, { "epoch": 0.21, "learning_rate": 0.00016630576006929407, "loss": 3.5812, "step": 1030 }, { "epoch": 0.21, "learning_rate": 0.00016587267215244694, "loss": 3.5337, "step": 1040 }, { "epoch": 0.22, "learning_rate": 0.00016543958423559984, "loss": 3.5743, "step": 1050 }, { "epoch": 0.22, "eval_accuracy": 0.3382243153671725, "eval_loss": 4.2522406578063965, "eval_runtime": 10.3129, "eval_samples_per_second": 3.394, "eval_steps_per_second": 0.194, "step": 1050 }, { "epoch": 0.22, "learning_rate": 0.0001650064963187527, "loss": 3.5491, "step": 1060 }, { "epoch": 0.22, "learning_rate": 0.0001645734084019056, "loss": 3.6083, "step": 1070 }, { "epoch": 0.22, "learning_rate": 0.00016414032048505848, "loss": 3.5326, "step": 1080 }, { "epoch": 0.22, "learning_rate": 0.00016370723256821135, "loss": 3.4979, "step": 1090 }, { "epoch": 0.23, "learning_rate": 0.00016327414465136425, "loss": 3.5075, "step": 1100 }, { "epoch": 0.23, "eval_accuracy": 0.33963369963369966, "eval_loss": 4.232938289642334, "eval_runtime": 10.3143, "eval_samples_per_second": 3.393, "eval_steps_per_second": 0.194, "step": 1100 }, { "epoch": 0.23, "learning_rate": 0.00016284105673451712, "loss": 3.5368, "step": 1110 }, { "epoch": 0.23, "learning_rate": 0.00016240796881767, "loss": 3.5292, "step": 1120 }, { "epoch": 0.23, "learning_rate": 0.00016197488090082286, "loss": 3.5705, "step": 1130 }, { "epoch": 0.23, "learning_rate": 0.00016154179298397576, "loss": 3.568, "step": 1140 }, { "epoch": 0.24, "learning_rate": 0.00016110870506712863, "loss": 3.5513, "step": 1150 }, { "epoch": 0.24, "eval_accuracy": 0.3409802895517181, "eval_loss": 4.225428581237793, "eval_runtime": 10.3106, "eval_samples_per_second": 3.395, "eval_steps_per_second": 0.194, "step": 1150 }, { "epoch": 0.24, "learning_rate": 0.00016067561715028152, "loss": 3.5785, "step": 1160 }, { "epoch": 0.24, "learning_rate": 0.0001602425292334344, "loss": 3.4841, "step": 1170 }, { "epoch": 0.24, "learning_rate": 0.0001598094413165873, "loss": 3.4861, "step": 1180 }, { "epoch": 0.24, "learning_rate": 0.00015937635339974016, "loss": 3.5064, "step": 1190 }, { "epoch": 0.25, "learning_rate": 0.00015894326548289303, "loss": 3.5034, "step": 1200 }, { "epoch": 0.25, "eval_accuracy": 0.3416082330368045, "eval_loss": 4.212094783782959, "eval_runtime": 10.3174, "eval_samples_per_second": 3.392, "eval_steps_per_second": 0.194, "step": 1200 }, { "epoch": 0.25, "learning_rate": 0.0001585101775660459, "loss": 3.5208, "step": 1210 }, { "epoch": 0.25, "learning_rate": 0.00015807708964919877, "loss": 3.544, "step": 1220 }, { "epoch": 0.25, "learning_rate": 0.00015764400173235167, "loss": 3.5995, "step": 1230 }, { "epoch": 0.26, "learning_rate": 0.00015721091381550454, "loss": 3.5096, "step": 1240 }, { "epoch": 0.26, "learning_rate": 0.00015677782589865744, "loss": 3.4953, "step": 1250 }, { "epoch": 0.26, "eval_accuracy": 0.34162218733647304, "eval_loss": 4.200605869293213, "eval_runtime": 10.3079, "eval_samples_per_second": 3.395, "eval_steps_per_second": 0.194, "step": 1250 }, { "epoch": 0.26, "learning_rate": 0.0001563447379818103, "loss": 3.4903, "step": 1260 }, { "epoch": 0.26, "learning_rate": 0.0001559116500649632, "loss": 3.5847, "step": 1270 }, { "epoch": 0.26, "learning_rate": 0.00015547856214811608, "loss": 3.5003, "step": 1280 }, { "epoch": 0.27, "learning_rate": 0.00015504547423126895, "loss": 3.465, "step": 1290 }, { "epoch": 0.27, "learning_rate": 0.00015461238631442182, "loss": 3.5375, "step": 1300 }, { "epoch": 0.27, "eval_accuracy": 0.3420408163265306, "eval_loss": 4.193361282348633, "eval_runtime": 10.3166, "eval_samples_per_second": 3.393, "eval_steps_per_second": 0.194, "step": 1300 }, { "epoch": 0.27, "learning_rate": 0.00015417929839757472, "loss": 3.479, "step": 1310 }, { "epoch": 0.27, "learning_rate": 0.0001537462104807276, "loss": 3.5274, "step": 1320 }, { "epoch": 0.27, "learning_rate": 0.00015331312256388049, "loss": 3.4944, "step": 1330 }, { "epoch": 0.28, "learning_rate": 0.00015288003464703336, "loss": 3.52, "step": 1340 }, { "epoch": 0.28, "learning_rate": 0.00015244694673018623, "loss": 3.5105, "step": 1350 }, { "epoch": 0.28, "eval_accuracy": 0.3424524681667539, "eval_loss": 4.182863235473633, "eval_runtime": 10.2862, "eval_samples_per_second": 3.403, "eval_steps_per_second": 0.194, "step": 1350 }, { "epoch": 0.28, "learning_rate": 0.00015201385881333913, "loss": 3.5494, "step": 1360 }, { "epoch": 0.28, "learning_rate": 0.000151580770896492, "loss": 3.4345, "step": 1370 }, { "epoch": 0.28, "learning_rate": 0.00015114768297964487, "loss": 3.4243, "step": 1380 }, { "epoch": 0.29, "learning_rate": 0.00015071459506279774, "loss": 3.5167, "step": 1390 }, { "epoch": 0.29, "learning_rate": 0.00015028150714595063, "loss": 3.4764, "step": 1400 }, { "epoch": 0.29, "eval_accuracy": 0.3427803942089656, "eval_loss": 4.173154830932617, "eval_runtime": 10.3105, "eval_samples_per_second": 3.395, "eval_steps_per_second": 0.194, "step": 1400 }, { "epoch": 0.29, "learning_rate": 0.0001498484192291035, "loss": 3.4447, "step": 1410 }, { "epoch": 0.29, "learning_rate": 0.0001494153313122564, "loss": 3.5027, "step": 1420 }, { "epoch": 0.29, "learning_rate": 0.00014898224339540927, "loss": 3.4461, "step": 1430 }, { "epoch": 0.3, "learning_rate": 0.00014854915547856217, "loss": 3.4238, "step": 1440 }, { "epoch": 0.3, "learning_rate": 0.00014811606756171504, "loss": 3.4558, "step": 1450 }, { "epoch": 0.3, "eval_accuracy": 0.34314320600034887, "eval_loss": 4.165927886962891, "eval_runtime": 10.3072, "eval_samples_per_second": 3.396, "eval_steps_per_second": 0.194, "step": 1450 }, { "epoch": 0.3, "learning_rate": 0.0001476829796448679, "loss": 3.502, "step": 1460 }, { "epoch": 0.3, "learning_rate": 0.00014724989172802078, "loss": 3.5107, "step": 1470 }, { "epoch": 0.3, "learning_rate": 0.00014681680381117365, "loss": 3.5175, "step": 1480 }, { "epoch": 0.31, "learning_rate": 0.00014638371589432655, "loss": 3.4825, "step": 1490 }, { "epoch": 0.31, "learning_rate": 0.00014595062797747942, "loss": 3.4698, "step": 1500 }, { "epoch": 0.31, "eval_accuracy": 0.34391766963195536, "eval_loss": 4.1588826179504395, "eval_runtime": 10.3017, "eval_samples_per_second": 3.397, "eval_steps_per_second": 0.194, "step": 1500 }, { "epoch": 0.31, "learning_rate": 0.00014551754006063232, "loss": 3.4971, "step": 1510 }, { "epoch": 0.31, "learning_rate": 0.0001450844521437852, "loss": 3.4639, "step": 1520 }, { "epoch": 0.31, "learning_rate": 0.0001446513642269381, "loss": 3.485, "step": 1530 }, { "epoch": 0.32, "learning_rate": 0.00014421827631009096, "loss": 3.5071, "step": 1540 }, { "epoch": 0.32, "learning_rate": 0.00014378518839324383, "loss": 3.3979, "step": 1550 }, { "epoch": 0.32, "eval_accuracy": 0.3454038025466597, "eval_loss": 4.149387359619141, "eval_runtime": 10.2957, "eval_samples_per_second": 3.399, "eval_steps_per_second": 0.194, "step": 1550 }, { "epoch": 0.32, "learning_rate": 0.0001433521004763967, "loss": 3.4136, "step": 1560 }, { "epoch": 0.32, "learning_rate": 0.0001429190125595496, "loss": 3.4406, "step": 1570 }, { "epoch": 0.32, "learning_rate": 0.00014248592464270247, "loss": 3.4546, "step": 1580 }, { "epoch": 0.33, "learning_rate": 0.00014205283672585536, "loss": 3.4742, "step": 1590 }, { "epoch": 0.33, "learning_rate": 0.00014161974880900824, "loss": 3.4233, "step": 1600 }, { "epoch": 0.33, "eval_accuracy": 0.34525728240013953, "eval_loss": 4.142571449279785, "eval_runtime": 10.3075, "eval_samples_per_second": 3.396, "eval_steps_per_second": 0.194, "step": 1600 }, { "epoch": 0.33, "learning_rate": 0.0001411866608921611, "loss": 3.4348, "step": 1610 }, { "epoch": 0.33, "learning_rate": 0.000140753572975314, "loss": 3.4015, "step": 1620 }, { "epoch": 0.34, "learning_rate": 0.00014032048505846687, "loss": 3.4459, "step": 1630 }, { "epoch": 0.34, "learning_rate": 0.00013988739714161974, "loss": 3.4866, "step": 1640 }, { "epoch": 0.34, "learning_rate": 0.00013945430922477262, "loss": 3.4162, "step": 1650 }, { "epoch": 0.34, "eval_accuracy": 0.34582243153671727, "eval_loss": 4.141883373260498, "eval_runtime": 10.3186, "eval_samples_per_second": 3.392, "eval_steps_per_second": 0.194, "step": 1650 }, { "epoch": 0.34, "learning_rate": 0.0001390212213079255, "loss": 3.4257, "step": 1660 }, { "epoch": 0.34, "learning_rate": 0.00013858813339107838, "loss": 3.386, "step": 1670 }, { "epoch": 0.35, "learning_rate": 0.00013815504547423128, "loss": 3.4398, "step": 1680 }, { "epoch": 0.35, "learning_rate": 0.00013772195755738415, "loss": 3.4502, "step": 1690 }, { "epoch": 0.35, "learning_rate": 0.00013728886964053705, "loss": 3.4454, "step": 1700 }, { "epoch": 0.35, "eval_accuracy": 0.3463945578231293, "eval_loss": 4.1352057456970215, "eval_runtime": 10.3113, "eval_samples_per_second": 3.394, "eval_steps_per_second": 0.194, "step": 1700 }, { "epoch": 0.35, "learning_rate": 0.00013685578172368992, "loss": 3.4493, "step": 1710 }, { "epoch": 0.35, "learning_rate": 0.00013642269380684282, "loss": 3.4085, "step": 1720 }, { "epoch": 0.36, "learning_rate": 0.00013598960588999566, "loss": 3.4486, "step": 1730 }, { "epoch": 0.36, "learning_rate": 0.00013555651797314853, "loss": 3.4503, "step": 1740 }, { "epoch": 0.36, "learning_rate": 0.00013512343005630143, "loss": 3.4127, "step": 1750 }, { "epoch": 0.36, "eval_accuracy": 0.34649921507064363, "eval_loss": 4.134985446929932, "eval_runtime": 10.2957, "eval_samples_per_second": 3.399, "eval_steps_per_second": 0.194, "step": 1750 }, { "epoch": 0.36, "learning_rate": 0.0001346903421394543, "loss": 3.4014, "step": 1760 }, { "epoch": 0.36, "learning_rate": 0.0001342572542226072, "loss": 3.4282, "step": 1770 }, { "epoch": 0.37, "learning_rate": 0.00013382416630576007, "loss": 3.4109, "step": 1780 }, { "epoch": 0.37, "learning_rate": 0.00013339107838891297, "loss": 3.4397, "step": 1790 }, { "epoch": 0.37, "learning_rate": 0.00013295799047206584, "loss": 3.4347, "step": 1800 }, { "epoch": 0.37, "eval_accuracy": 0.3448456305599163, "eval_loss": 4.133869171142578, "eval_runtime": 10.3119, "eval_samples_per_second": 3.394, "eval_steps_per_second": 0.194, "step": 1800 }, { "epoch": 0.37, "learning_rate": 0.00013252490255521873, "loss": 3.413, "step": 1810 }, { "epoch": 0.37, "learning_rate": 0.0001320918146383716, "loss": 3.3894, "step": 1820 }, { "epoch": 0.38, "learning_rate": 0.00013165872672152448, "loss": 3.3624, "step": 1830 }, { "epoch": 0.38, "learning_rate": 0.00013122563880467735, "loss": 3.397, "step": 1840 }, { "epoch": 0.38, "learning_rate": 0.00013079255088783024, "loss": 3.4176, "step": 1850 }, { "epoch": 0.38, "eval_accuracy": 0.3472178615035758, "eval_loss": 4.122106552124023, "eval_runtime": 10.3021, "eval_samples_per_second": 3.397, "eval_steps_per_second": 0.194, "step": 1850 }, { "epoch": 0.38, "learning_rate": 0.00013035946297098311, "loss": 3.4136, "step": 1860 }, { "epoch": 0.38, "learning_rate": 0.00012992637505413598, "loss": 3.4338, "step": 1870 }, { "epoch": 0.39, "learning_rate": 0.00012949328713728888, "loss": 3.3931, "step": 1880 }, { "epoch": 0.39, "learning_rate": 0.00012906019922044175, "loss": 3.4306, "step": 1890 }, { "epoch": 0.39, "learning_rate": 0.00012862711130359465, "loss": 3.4145, "step": 1900 }, { "epoch": 0.39, "eval_accuracy": 0.3480830280830281, "eval_loss": 4.11691427230835, "eval_runtime": 10.3007, "eval_samples_per_second": 3.398, "eval_steps_per_second": 0.194, "step": 1900 }, { "epoch": 0.39, "learning_rate": 0.00012819402338674752, "loss": 3.4272, "step": 1910 }, { "epoch": 0.39, "learning_rate": 0.0001277609354699004, "loss": 3.4037, "step": 1920 }, { "epoch": 0.4, "learning_rate": 0.00012732784755305326, "loss": 3.4358, "step": 1930 }, { "epoch": 0.4, "learning_rate": 0.00012689475963620616, "loss": 3.3797, "step": 1940 }, { "epoch": 0.4, "learning_rate": 0.00012646167171935903, "loss": 3.407, "step": 1950 }, { "epoch": 0.4, "eval_accuracy": 0.34714111285539856, "eval_loss": 4.114504814147949, "eval_runtime": 10.2981, "eval_samples_per_second": 3.399, "eval_steps_per_second": 0.194, "step": 1950 }, { "epoch": 0.4, "learning_rate": 0.00012602858380251193, "loss": 3.3906, "step": 1960 }, { "epoch": 0.41, "learning_rate": 0.0001255954958856648, "loss": 3.4079, "step": 1970 }, { "epoch": 0.41, "learning_rate": 0.0001251624079688177, "loss": 3.3355, "step": 1980 }, { "epoch": 0.41, "learning_rate": 0.00012472932005197057, "loss": 3.3968, "step": 1990 }, { "epoch": 0.41, "learning_rate": 0.00012429623213512344, "loss": 3.3351, "step": 2000 }, { "epoch": 0.41, "eval_accuracy": 0.34764346764346765, "eval_loss": 4.10912561416626, "eval_runtime": 10.3006, "eval_samples_per_second": 3.398, "eval_steps_per_second": 0.194, "step": 2000 }, { "epoch": 0.41, "learning_rate": 0.0001238631442182763, "loss": 3.3782, "step": 2010 }, { "epoch": 0.42, "learning_rate": 0.00012347336509311392, "loss": 3.4039, "step": 2020 }, { "epoch": 0.42, "learning_rate": 0.0001230402771762668, "loss": 3.3794, "step": 2030 }, { "epoch": 0.42, "learning_rate": 0.0001226071892594197, "loss": 3.4188, "step": 2040 }, { "epoch": 0.42, "learning_rate": 0.00012217410134257256, "loss": 3.4255, "step": 2050 }, { "epoch": 0.42, "eval_accuracy": 0.34798534798534797, "eval_loss": 4.109765529632568, "eval_runtime": 10.3054, "eval_samples_per_second": 3.396, "eval_steps_per_second": 0.194, "step": 2050 }, { "epoch": 0.42, "learning_rate": 0.00012174101342572544, "loss": 3.3917, "step": 2060 }, { "epoch": 0.43, "learning_rate": 0.00012130792550887831, "loss": 3.4352, "step": 2070 }, { "epoch": 0.43, "learning_rate": 0.0001208748375920312, "loss": 3.384, "step": 2080 }, { "epoch": 0.43, "learning_rate": 0.00012044174967518407, "loss": 3.389, "step": 2090 }, { "epoch": 0.43, "learning_rate": 0.00012000866175833694, "loss": 3.396, "step": 2100 }, { "epoch": 0.43, "eval_accuracy": 0.34839699982557126, "eval_loss": 4.104307651519775, "eval_runtime": 10.3168, "eval_samples_per_second": 3.393, "eval_steps_per_second": 0.194, "step": 2100 }, { "epoch": 0.43, "learning_rate": 0.00011957557384148984, "loss": 3.3757, "step": 2110 }, { "epoch": 0.44, "learning_rate": 0.0001191424859246427, "loss": 3.397, "step": 2120 }, { "epoch": 0.44, "learning_rate": 0.00011870939800779559, "loss": 3.3469, "step": 2130 }, { "epoch": 0.44, "learning_rate": 0.00011827631009094846, "loss": 3.3529, "step": 2140 }, { "epoch": 0.44, "learning_rate": 0.00011784322217410136, "loss": 3.3477, "step": 2150 }, { "epoch": 0.44, "eval_accuracy": 0.34870399441828015, "eval_loss": 4.099356651306152, "eval_runtime": 10.3017, "eval_samples_per_second": 3.397, "eval_steps_per_second": 0.194, "step": 2150 }, { "epoch": 0.44, "learning_rate": 0.00011741013425725423, "loss": 3.3823, "step": 2160 }, { "epoch": 0.45, "learning_rate": 0.00011697704634040711, "loss": 3.362, "step": 2170 }, { "epoch": 0.45, "learning_rate": 0.00011654395842355998, "loss": 3.3482, "step": 2180 }, { "epoch": 0.45, "learning_rate": 0.00011611087050671288, "loss": 3.3491, "step": 2190 }, { "epoch": 0.45, "learning_rate": 0.00011567778258986575, "loss": 3.3604, "step": 2200 }, { "epoch": 0.45, "eval_accuracy": 0.3493040293040293, "eval_loss": 4.094115734100342, "eval_runtime": 10.3156, "eval_samples_per_second": 3.393, "eval_steps_per_second": 0.194, "step": 2200 }, { "epoch": 0.45, "learning_rate": 0.00011524469467301864, "loss": 3.3527, "step": 2210 }, { "epoch": 0.46, "learning_rate": 0.0001148116067561715, "loss": 3.3814, "step": 2220 }, { "epoch": 0.46, "learning_rate": 0.00011437851883932438, "loss": 3.36, "step": 2230 }, { "epoch": 0.46, "learning_rate": 0.00011394543092247727, "loss": 3.3149, "step": 2240 }, { "epoch": 0.46, "learning_rate": 0.00011351234300563015, "loss": 3.3521, "step": 2250 }, { "epoch": 0.46, "eval_accuracy": 0.34838304552590266, "eval_loss": 4.095041751861572, "eval_runtime": 10.3028, "eval_samples_per_second": 3.397, "eval_steps_per_second": 0.194, "step": 2250 }, { "epoch": 0.46, "learning_rate": 0.00011307925508878303, "loss": 3.3846, "step": 2260 }, { "epoch": 0.47, "learning_rate": 0.0001126461671719359, "loss": 3.3441, "step": 2270 }, { "epoch": 0.47, "learning_rate": 0.0001122130792550888, "loss": 3.3527, "step": 2280 }, { "epoch": 0.47, "learning_rate": 0.00011177999133824167, "loss": 3.361, "step": 2290 }, { "epoch": 0.47, "learning_rate": 0.00011134690342139455, "loss": 3.3027, "step": 2300 }, { "epoch": 0.47, "eval_accuracy": 0.34924821210535495, "eval_loss": 4.090921878814697, "eval_runtime": 10.2945, "eval_samples_per_second": 3.4, "eval_steps_per_second": 0.194, "step": 2300 }, { "epoch": 0.48, "learning_rate": 0.00011091381550454742, "loss": 3.3337, "step": 2310 }, { "epoch": 0.48, "learning_rate": 0.00011048072758770032, "loss": 3.3423, "step": 2320 }, { "epoch": 0.48, "learning_rate": 0.00011004763967085319, "loss": 3.3161, "step": 2330 }, { "epoch": 0.48, "learning_rate": 0.00010961455175400607, "loss": 3.3455, "step": 2340 }, { "epoch": 0.48, "learning_rate": 0.00010918146383715895, "loss": 3.3831, "step": 2350 }, { "epoch": 0.48, "eval_accuracy": 0.3492412349555207, "eval_loss": 4.084415435791016, "eval_runtime": 10.3139, "eval_samples_per_second": 3.393, "eval_steps_per_second": 0.194, "step": 2350 }, { "epoch": 0.49, "learning_rate": 0.00010874837592031182, "loss": 3.3531, "step": 2360 }, { "epoch": 0.49, "learning_rate": 0.00010831528800346471, "loss": 3.3944, "step": 2370 }, { "epoch": 0.49, "learning_rate": 0.00010788220008661758, "loss": 3.3476, "step": 2380 }, { "epoch": 0.49, "learning_rate": 0.00010744911216977047, "loss": 3.3656, "step": 2390 }, { "epoch": 0.49, "learning_rate": 0.00010701602425292334, "loss": 3.3175, "step": 2400 }, { "epoch": 0.49, "eval_accuracy": 0.34965986394557824, "eval_loss": 4.080374717712402, "eval_runtime": 10.3156, "eval_samples_per_second": 3.393, "eval_steps_per_second": 0.194, "step": 2400 }, { "epoch": 0.5, "learning_rate": 0.00010658293633607624, "loss": 3.3493, "step": 2410 }, { "epoch": 0.5, "learning_rate": 0.00010614984841922911, "loss": 3.37, "step": 2420 }, { "epoch": 0.5, "learning_rate": 0.00010571676050238199, "loss": 3.3488, "step": 2430 }, { "epoch": 0.5, "learning_rate": 0.00010528367258553486, "loss": 3.3135, "step": 2440 }, { "epoch": 0.5, "learning_rate": 0.00010485058466868776, "loss": 3.3065, "step": 2450 }, { "epoch": 0.5, "eval_accuracy": 0.35118785975928835, "eval_loss": 4.075130939483643, "eval_runtime": 10.3192, "eval_samples_per_second": 3.392, "eval_steps_per_second": 0.194, "step": 2450 }, { "epoch": 0.51, "learning_rate": 0.00010441749675184063, "loss": 3.3313, "step": 2460 }, { "epoch": 0.51, "learning_rate": 0.00010398440883499351, "loss": 3.3206, "step": 2470 }, { "epoch": 0.51, "learning_rate": 0.00010355132091814638, "loss": 3.3532, "step": 2480 }, { "epoch": 0.51, "learning_rate": 0.00010311823300129926, "loss": 3.3202, "step": 2490 }, { "epoch": 0.51, "learning_rate": 0.00010268514508445215, "loss": 3.3148, "step": 2500 }, { "epoch": 0.51, "eval_accuracy": 0.3493319379033665, "eval_loss": 4.077237606048584, "eval_runtime": 10.3211, "eval_samples_per_second": 3.391, "eval_steps_per_second": 0.194, "step": 2500 }, { "epoch": 0.52, "learning_rate": 0.00010225205716760502, "loss": 3.3322, "step": 2510 }, { "epoch": 0.52, "learning_rate": 0.00010181896925075791, "loss": 3.359, "step": 2520 }, { "epoch": 0.52, "learning_rate": 0.00010138588133391078, "loss": 3.3148, "step": 2530 }, { "epoch": 0.52, "learning_rate": 0.00010095279341706368, "loss": 3.3384, "step": 2540 }, { "epoch": 0.52, "learning_rate": 0.00010051970550021655, "loss": 3.3136, "step": 2550 }, { "epoch": 0.52, "eval_accuracy": 0.35003663003663005, "eval_loss": 4.075963973999023, "eval_runtime": 10.3206, "eval_samples_per_second": 3.391, "eval_steps_per_second": 0.194, "step": 2550 }, { "epoch": 0.53, "learning_rate": 0.00010008661758336943, "loss": 3.3407, "step": 2560 }, { "epoch": 0.53, "learning_rate": 9.96535296665223e-05, "loss": 3.346, "step": 2570 }, { "epoch": 0.53, "learning_rate": 9.922044174967519e-05, "loss": 3.3429, "step": 2580 }, { "epoch": 0.53, "learning_rate": 9.878735383282807e-05, "loss": 3.3453, "step": 2590 }, { "epoch": 0.53, "learning_rate": 9.835426591598095e-05, "loss": 3.3299, "step": 2600 }, { "epoch": 0.53, "eval_accuracy": 0.3503715332286761, "eval_loss": 4.069892406463623, "eval_runtime": 10.3115, "eval_samples_per_second": 3.394, "eval_steps_per_second": 0.194, "step": 2600 }, { "epoch": 0.54, "learning_rate": 9.792117799913382e-05, "loss": 3.2616, "step": 2610 }, { "epoch": 0.54, "learning_rate": 9.748809008228671e-05, "loss": 3.2649, "step": 2620 }, { "epoch": 0.54, "learning_rate": 9.705500216543959e-05, "loss": 3.2806, "step": 2630 }, { "epoch": 0.54, "learning_rate": 9.662191424859248e-05, "loss": 3.3751, "step": 2640 }, { "epoch": 0.54, "learning_rate": 9.618882633174535e-05, "loss": 3.3089, "step": 2650 }, { "epoch": 0.54, "eval_accuracy": 0.3509227280655852, "eval_loss": 4.068863868713379, "eval_runtime": 10.3195, "eval_samples_per_second": 3.392, "eval_steps_per_second": 0.194, "step": 2650 }, { "epoch": 0.55, "learning_rate": 9.575573841489823e-05, "loss": 3.3234, "step": 2660 }, { "epoch": 0.55, "learning_rate": 9.532265049805112e-05, "loss": 3.353, "step": 2670 }, { "epoch": 0.55, "learning_rate": 9.488956258120399e-05, "loss": 3.3036, "step": 2680 }, { "epoch": 0.55, "learning_rate": 9.445647466435687e-05, "loss": 3.3596, "step": 2690 }, { "epoch": 0.56, "learning_rate": 9.402338674750974e-05, "loss": 3.3414, "step": 2700 }, { "epoch": 0.56, "eval_accuracy": 0.3499529042386185, "eval_loss": 4.066127300262451, "eval_runtime": 10.3126, "eval_samples_per_second": 3.394, "eval_steps_per_second": 0.194, "step": 2700 }, { "epoch": 0.56, "learning_rate": 9.359029883066262e-05, "loss": 3.362, "step": 2710 }, { "epoch": 0.56, "learning_rate": 9.315721091381551e-05, "loss": 3.3545, "step": 2720 }, { "epoch": 0.56, "learning_rate": 9.272412299696839e-05, "loss": 3.3639, "step": 2730 }, { "epoch": 0.56, "learning_rate": 9.229103508012126e-05, "loss": 3.2914, "step": 2740 }, { "epoch": 0.57, "learning_rate": 9.185794716327415e-05, "loss": 3.3434, "step": 2750 }, { "epoch": 0.57, "eval_accuracy": 0.35164137449851735, "eval_loss": 4.059126377105713, "eval_runtime": 10.3025, "eval_samples_per_second": 3.397, "eval_steps_per_second": 0.194, "step": 2750 }, { "epoch": 0.57, "learning_rate": 9.142485924642703e-05, "loss": 3.2933, "step": 2760 }, { "epoch": 0.57, "learning_rate": 9.099177132957992e-05, "loss": 3.2918, "step": 2770 }, { "epoch": 0.57, "learning_rate": 9.055868341273279e-05, "loss": 3.3093, "step": 2780 }, { "epoch": 0.57, "learning_rate": 9.012559549588567e-05, "loss": 3.307, "step": 2790 }, { "epoch": 0.58, "learning_rate": 8.969250757903855e-05, "loss": 3.2961, "step": 2800 }, { "epoch": 0.58, "eval_accuracy": 0.35194139194139196, "eval_loss": 4.059094429016113, "eval_runtime": 10.3044, "eval_samples_per_second": 3.397, "eval_steps_per_second": 0.194, "step": 2800 }, { "epoch": 0.58, "learning_rate": 8.925941966219143e-05, "loss": 3.317, "step": 2810 }, { "epoch": 0.58, "learning_rate": 8.882633174534431e-05, "loss": 3.2903, "step": 2820 }, { "epoch": 0.58, "learning_rate": 8.839324382849718e-05, "loss": 3.3317, "step": 2830 }, { "epoch": 0.58, "learning_rate": 8.796015591165006e-05, "loss": 3.2929, "step": 2840 }, { "epoch": 0.59, "learning_rate": 8.752706799480295e-05, "loss": 3.3354, "step": 2850 }, { "epoch": 0.59, "eval_accuracy": 0.3513064713064713, "eval_loss": 4.057965278625488, "eval_runtime": 10.3079, "eval_samples_per_second": 3.395, "eval_steps_per_second": 0.194, "step": 2850 }, { "epoch": 0.59, "learning_rate": 8.709398007795583e-05, "loss": 3.3293, "step": 2860 }, { "epoch": 0.59, "learning_rate": 8.66608921611087e-05, "loss": 3.2948, "step": 2870 }, { "epoch": 0.59, "learning_rate": 8.622780424426159e-05, "loss": 3.2694, "step": 2880 }, { "epoch": 0.59, "learning_rate": 8.579471632741447e-05, "loss": 3.3123, "step": 2890 }, { "epoch": 0.6, "learning_rate": 8.536162841056736e-05, "loss": 3.3188, "step": 2900 }, { "epoch": 0.6, "eval_accuracy": 0.3508041165184022, "eval_loss": 4.052666664123535, "eval_runtime": 10.314, "eval_samples_per_second": 3.393, "eval_steps_per_second": 0.194, "step": 2900 }, { "epoch": 0.6, "learning_rate": 8.492854049372023e-05, "loss": 3.3388, "step": 2910 }, { "epoch": 0.6, "learning_rate": 8.449545257687311e-05, "loss": 3.2808, "step": 2920 }, { "epoch": 0.6, "learning_rate": 8.4062364660026e-05, "loss": 3.3221, "step": 2930 }, { "epoch": 0.6, "learning_rate": 8.362927674317888e-05, "loss": 3.3259, "step": 2940 }, { "epoch": 0.61, "learning_rate": 8.319618882633175e-05, "loss": 3.3485, "step": 2950 }, { "epoch": 0.61, "eval_accuracy": 0.3516623059480202, "eval_loss": 4.051156044006348, "eval_runtime": 10.3078, "eval_samples_per_second": 3.395, "eval_steps_per_second": 0.194, "step": 2950 }, { "epoch": 0.61, "learning_rate": 8.276310090948462e-05, "loss": 3.3199, "step": 2960 }, { "epoch": 0.61, "learning_rate": 8.23300129926375e-05, "loss": 3.3162, "step": 2970 }, { "epoch": 0.61, "learning_rate": 8.189692507579039e-05, "loss": 3.305, "step": 2980 }, { "epoch": 0.61, "learning_rate": 8.146383715894327e-05, "loss": 3.3509, "step": 2990 }, { "epoch": 0.62, "learning_rate": 8.103074924209616e-05, "loss": 3.2905, "step": 3000 }, { "epoch": 0.62, "eval_accuracy": 0.3514878772021629, "eval_loss": 4.0494561195373535, "eval_runtime": 10.3082, "eval_samples_per_second": 3.395, "eval_steps_per_second": 0.194, "step": 3000 }, { "epoch": 0.62, "learning_rate": 8.059766132524903e-05, "loss": 3.3209, "step": 3010 }, { "epoch": 0.62, "learning_rate": 8.016457340840191e-05, "loss": 3.2824, "step": 3020 }, { "epoch": 0.62, "learning_rate": 7.97314854915548e-05, "loss": 3.3032, "step": 3030 }, { "epoch": 0.63, "learning_rate": 7.929839757470768e-05, "loss": 3.3071, "step": 3040 }, { "epoch": 0.63, "learning_rate": 7.886530965786055e-05, "loss": 3.3421, "step": 3050 }, { "epoch": 0.63, "eval_accuracy": 0.3516832373975231, "eval_loss": 4.0482659339904785, "eval_runtime": 10.3201, "eval_samples_per_second": 3.391, "eval_steps_per_second": 0.194, "step": 3050 }, { "epoch": 0.63, "learning_rate": 7.843222174101343e-05, "loss": 3.3453, "step": 3060 }, { "epoch": 0.63, "learning_rate": 7.799913382416632e-05, "loss": 3.3165, "step": 3070 }, { "epoch": 0.63, "learning_rate": 7.756604590731919e-05, "loss": 3.3095, "step": 3080 }, { "epoch": 0.64, "learning_rate": 7.713295799047207e-05, "loss": 3.2814, "step": 3090 }, { "epoch": 0.64, "learning_rate": 7.669987007362494e-05, "loss": 3.3219, "step": 3100 }, { "epoch": 0.64, "eval_accuracy": 0.3521018663875807, "eval_loss": 4.049413681030273, "eval_runtime": 10.33, "eval_samples_per_second": 3.388, "eval_steps_per_second": 0.194, "step": 3100 }, { "epoch": 0.64, "learning_rate": 7.626678215677783e-05, "loss": 3.3019, "step": 3110 }, { "epoch": 0.64, "learning_rate": 7.583369423993071e-05, "loss": 3.3186, "step": 3120 }, { "epoch": 0.64, "learning_rate": 7.54006063230836e-05, "loss": 3.2792, "step": 3130 }, { "epoch": 0.65, "learning_rate": 7.496751840623647e-05, "loss": 3.2745, "step": 3140 }, { "epoch": 0.65, "learning_rate": 7.453443048938935e-05, "loss": 3.2681, "step": 3150 }, { "epoch": 0.65, "eval_accuracy": 0.35144601430315714, "eval_loss": 4.050104141235352, "eval_runtime": 10.3142, "eval_samples_per_second": 3.393, "eval_steps_per_second": 0.194, "step": 3150 }, { "epoch": 0.65, "learning_rate": 7.410134257254223e-05, "loss": 3.3041, "step": 3160 }, { "epoch": 0.65, "learning_rate": 7.366825465569512e-05, "loss": 3.3226, "step": 3170 }, { "epoch": 0.65, "learning_rate": 7.323516673884799e-05, "loss": 3.2969, "step": 3180 }, { "epoch": 0.66, "learning_rate": 7.280207882200087e-05, "loss": 3.2724, "step": 3190 }, { "epoch": 0.66, "learning_rate": 7.236899090515376e-05, "loss": 3.3439, "step": 3200 }, { "epoch": 0.66, "eval_accuracy": 0.35251351822780397, "eval_loss": 4.041363716125488, "eval_runtime": 10.3183, "eval_samples_per_second": 3.392, "eval_steps_per_second": 0.194, "step": 3200 }, { "epoch": 0.66, "learning_rate": 7.193590298830663e-05, "loss": 3.3006, "step": 3210 }, { "epoch": 0.66, "learning_rate": 7.150281507145951e-05, "loss": 3.3029, "step": 3220 }, { "epoch": 0.66, "learning_rate": 7.106972715461238e-05, "loss": 3.3077, "step": 3230 }, { "epoch": 0.67, "learning_rate": 7.063663923776527e-05, "loss": 3.2692, "step": 3240 }, { "epoch": 0.67, "learning_rate": 7.020355132091815e-05, "loss": 3.283, "step": 3250 }, { "epoch": 0.67, "eval_accuracy": 0.3523460666317809, "eval_loss": 4.042195796966553, "eval_runtime": 10.3212, "eval_samples_per_second": 3.391, "eval_steps_per_second": 0.194, "step": 3250 }, { "epoch": 0.67, "learning_rate": 6.977046340407103e-05, "loss": 3.2465, "step": 3260 }, { "epoch": 0.67, "learning_rate": 6.93373754872239e-05, "loss": 3.3109, "step": 3270 }, { "epoch": 0.67, "learning_rate": 6.890428757037679e-05, "loss": 3.27, "step": 3280 }, { "epoch": 0.68, "learning_rate": 6.847119965352967e-05, "loss": 3.269, "step": 3290 }, { "epoch": 0.68, "learning_rate": 6.803811173668256e-05, "loss": 3.2974, "step": 3300 }, { "epoch": 0.68, "eval_accuracy": 0.35278562707134137, "eval_loss": 4.040924072265625, "eval_runtime": 10.3185, "eval_samples_per_second": 3.392, "eval_steps_per_second": 0.194, "step": 3300 }, { "epoch": 0.68, "learning_rate": 6.760502381983543e-05, "loss": 3.2892, "step": 3310 }, { "epoch": 0.68, "learning_rate": 6.717193590298831e-05, "loss": 3.3283, "step": 3320 }, { "epoch": 0.68, "learning_rate": 6.67388479861412e-05, "loss": 3.337, "step": 3330 }, { "epoch": 0.69, "learning_rate": 6.630576006929407e-05, "loss": 3.2781, "step": 3340 }, { "epoch": 0.69, "learning_rate": 6.587267215244695e-05, "loss": 3.2924, "step": 3350 }, { "epoch": 0.69, "eval_accuracy": 0.35347636490493634, "eval_loss": 4.039037227630615, "eval_runtime": 10.3117, "eval_samples_per_second": 3.394, "eval_steps_per_second": 0.194, "step": 3350 }, { "epoch": 0.69, "learning_rate": 6.543958423559982e-05, "loss": 3.2447, "step": 3360 }, { "epoch": 0.69, "learning_rate": 6.50064963187527e-05, "loss": 3.2743, "step": 3370 }, { "epoch": 0.7, "learning_rate": 6.457340840190559e-05, "loss": 3.2559, "step": 3380 }, { "epoch": 0.7, "learning_rate": 6.414032048505847e-05, "loss": 3.3076, "step": 3390 }, { "epoch": 0.7, "learning_rate": 6.370723256821134e-05, "loss": 3.2148, "step": 3400 }, { "epoch": 0.7, "eval_accuracy": 0.3530507587650445, "eval_loss": 4.033644199371338, "eval_runtime": 10.313, "eval_samples_per_second": 3.394, "eval_steps_per_second": 0.194, "step": 3400 }, { "epoch": 0.7, "learning_rate": 6.327414465136423e-05, "loss": 3.2583, "step": 3410 }, { "epoch": 0.7, "learning_rate": 6.284105673451711e-05, "loss": 3.2855, "step": 3420 }, { "epoch": 0.71, "learning_rate": 6.240796881767e-05, "loss": 3.2763, "step": 3430 }, { "epoch": 0.71, "learning_rate": 6.201818969250758e-05, "loss": 3.2905, "step": 3440 }, { "epoch": 0.71, "learning_rate": 6.158510177566046e-05, "loss": 3.305, "step": 3450 }, { "epoch": 0.71, "eval_accuracy": 0.35271585557299845, "eval_loss": 4.036759853363037, "eval_runtime": 10.3081, "eval_samples_per_second": 3.395, "eval_steps_per_second": 0.194, "step": 3450 }, { "epoch": 0.71, "learning_rate": 6.115201385881333e-05, "loss": 3.3214, "step": 3460 }, { "epoch": 0.71, "learning_rate": 6.071892594196622e-05, "loss": 3.2726, "step": 3470 }, { "epoch": 0.72, "learning_rate": 6.02858380251191e-05, "loss": 3.3296, "step": 3480 }, { "epoch": 0.72, "learning_rate": 5.985275010827198e-05, "loss": 3.3195, "step": 3490 }, { "epoch": 0.72, "learning_rate": 5.9419662191424864e-05, "loss": 3.3217, "step": 3500 }, { "epoch": 0.72, "eval_accuracy": 0.3529112157683586, "eval_loss": 4.034041404724121, "eval_runtime": 10.3239, "eval_samples_per_second": 3.39, "eval_steps_per_second": 0.194, "step": 3500 }, { "epoch": 0.72, "learning_rate": 5.898657427457774e-05, "loss": 3.283, "step": 3510 }, { "epoch": 0.72, "learning_rate": 5.8553486357730626e-05, "loss": 3.305, "step": 3520 }, { "epoch": 0.73, "learning_rate": 5.81203984408835e-05, "loss": 3.2853, "step": 3530 }, { "epoch": 0.73, "learning_rate": 5.768731052403639e-05, "loss": 3.2876, "step": 3540 }, { "epoch": 0.73, "learning_rate": 5.7254222607189265e-05, "loss": 3.2529, "step": 3550 }, { "epoch": 0.73, "eval_accuracy": 0.35336473050758765, "eval_loss": 4.0340142250061035, "eval_runtime": 10.3209, "eval_samples_per_second": 3.391, "eval_steps_per_second": 0.194, "step": 3550 }, { "epoch": 0.73, "learning_rate": 5.6821134690342135e-05, "loss": 3.2383, "step": 3560 }, { "epoch": 0.73, "learning_rate": 5.638804677349502e-05, "loss": 3.2793, "step": 3570 }, { "epoch": 0.74, "learning_rate": 5.59549588566479e-05, "loss": 3.2438, "step": 3580 }, { "epoch": 0.74, "learning_rate": 5.552187093980078e-05, "loss": 3.3002, "step": 3590 }, { "epoch": 0.74, "learning_rate": 5.508878302295366e-05, "loss": 3.2654, "step": 3600 }, { "epoch": 0.74, "eval_accuracy": 0.3537205651491366, "eval_loss": 4.02842378616333, "eval_runtime": 10.3228, "eval_samples_per_second": 3.391, "eval_steps_per_second": 0.194, "step": 3600 }, { "epoch": 0.74, "learning_rate": 5.465569510610654e-05, "loss": 3.2249, "step": 3610 }, { "epoch": 0.74, "learning_rate": 5.422260718925942e-05, "loss": 3.3054, "step": 3620 }, { "epoch": 0.75, "learning_rate": 5.3789519272412304e-05, "loss": 3.3084, "step": 3630 }, { "epoch": 0.75, "learning_rate": 5.335643135556518e-05, "loss": 3.2655, "step": 3640 }, { "epoch": 0.75, "learning_rate": 5.2923343438718065e-05, "loss": 3.2954, "step": 3650 }, { "epoch": 0.75, "eval_accuracy": 0.35309959881388453, "eval_loss": 4.030717372894287, "eval_runtime": 10.3217, "eval_samples_per_second": 3.391, "eval_steps_per_second": 0.194, "step": 3650 }, { "epoch": 0.75, "learning_rate": 5.249025552187094e-05, "loss": 3.2667, "step": 3660 }, { "epoch": 0.75, "learning_rate": 5.2057167605023827e-05, "loss": 3.2486, "step": 3670 }, { "epoch": 0.76, "learning_rate": 5.162407968817671e-05, "loss": 3.2722, "step": 3680 }, { "epoch": 0.76, "learning_rate": 5.1190991771329574e-05, "loss": 3.2465, "step": 3690 }, { "epoch": 0.76, "learning_rate": 5.075790385448246e-05, "loss": 3.2247, "step": 3700 }, { "epoch": 0.76, "eval_accuracy": 0.3532461189604047, "eval_loss": 4.029253005981445, "eval_runtime": 10.324, "eval_samples_per_second": 3.39, "eval_steps_per_second": 0.194, "step": 3700 }, { "epoch": 0.76, "learning_rate": 5.0324815937635336e-05, "loss": 3.2807, "step": 3710 }, { "epoch": 0.77, "learning_rate": 4.989172802078822e-05, "loss": 3.3076, "step": 3720 }, { "epoch": 0.77, "learning_rate": 4.94586401039411e-05, "loss": 3.2756, "step": 3730 }, { "epoch": 0.77, "learning_rate": 4.902555218709398e-05, "loss": 3.2565, "step": 3740 }, { "epoch": 0.77, "learning_rate": 4.8592464270246866e-05, "loss": 3.2489, "step": 3750 }, { "epoch": 0.77, "eval_accuracy": 0.3537694051979766, "eval_loss": 4.025887489318848, "eval_runtime": 10.3294, "eval_samples_per_second": 3.388, "eval_steps_per_second": 0.194, "step": 3750 }, { "epoch": 0.77, "learning_rate": 4.815937635339974e-05, "loss": 3.2754, "step": 3760 }, { "epoch": 0.78, "learning_rate": 4.772628843655263e-05, "loss": 3.2691, "step": 3770 }, { "epoch": 0.78, "learning_rate": 4.7293200519705504e-05, "loss": 3.2651, "step": 3780 }, { "epoch": 0.78, "learning_rate": 4.686011260285838e-05, "loss": 3.2549, "step": 3790 }, { "epoch": 0.78, "learning_rate": 4.642702468601126e-05, "loss": 3.2887, "step": 3800 }, { "epoch": 0.78, "eval_accuracy": 0.35346938775510206, "eval_loss": 4.026332855224609, "eval_runtime": 10.328, "eval_samples_per_second": 3.389, "eval_steps_per_second": 0.194, "step": 3800 }, { "epoch": 0.78, "learning_rate": 4.599393676916414e-05, "loss": 3.2463, "step": 3810 }, { "epoch": 0.79, "learning_rate": 4.556084885231702e-05, "loss": 3.2388, "step": 3820 }, { "epoch": 0.79, "learning_rate": 4.5127760935469905e-05, "loss": 3.2527, "step": 3830 }, { "epoch": 0.79, "learning_rate": 4.469467301862278e-05, "loss": 3.2754, "step": 3840 }, { "epoch": 0.79, "learning_rate": 4.426158510177566e-05, "loss": 3.2275, "step": 3850 }, { "epoch": 0.79, "eval_accuracy": 0.3544531658817373, "eval_loss": 4.022998809814453, "eval_runtime": 10.3111, "eval_samples_per_second": 3.394, "eval_steps_per_second": 0.194, "step": 3850 }, { "epoch": 0.79, "learning_rate": 4.3828497184928543e-05, "loss": 3.2682, "step": 3860 }, { "epoch": 0.8, "learning_rate": 4.339540926808142e-05, "loss": 3.3002, "step": 3870 }, { "epoch": 0.8, "learning_rate": 4.2962321351234305e-05, "loss": 3.3368, "step": 3880 }, { "epoch": 0.8, "learning_rate": 4.252923343438718e-05, "loss": 3.2938, "step": 3890 }, { "epoch": 0.8, "learning_rate": 4.2096145517540066e-05, "loss": 3.2666, "step": 3900 }, { "epoch": 0.8, "eval_accuracy": 0.35355311355311353, "eval_loss": 4.022049903869629, "eval_runtime": 10.3226, "eval_samples_per_second": 3.391, "eval_steps_per_second": 0.194, "step": 3900 }, { "epoch": 0.8, "learning_rate": 4.1663057600692944e-05, "loss": 3.2478, "step": 3910 }, { "epoch": 0.81, "learning_rate": 4.122996968384582e-05, "loss": 3.2521, "step": 3920 }, { "epoch": 0.81, "learning_rate": 4.07968817669987e-05, "loss": 3.2966, "step": 3930 }, { "epoch": 0.81, "learning_rate": 4.036379385015158e-05, "loss": 3.312, "step": 3940 }, { "epoch": 0.81, "learning_rate": 3.993070593330446e-05, "loss": 3.2608, "step": 3950 }, { "epoch": 0.81, "eval_accuracy": 0.35442525728240015, "eval_loss": 4.020759582519531, "eval_runtime": 10.3237, "eval_samples_per_second": 3.39, "eval_steps_per_second": 0.194, "step": 3950 }, { "epoch": 0.81, "learning_rate": 3.9497618016457344e-05, "loss": 3.2372, "step": 3960 }, { "epoch": 0.82, "learning_rate": 3.906453009961023e-05, "loss": 3.2765, "step": 3970 }, { "epoch": 0.82, "learning_rate": 3.86314421827631e-05, "loss": 3.2758, "step": 3980 }, { "epoch": 0.82, "learning_rate": 3.819835426591598e-05, "loss": 3.2194, "step": 3990 }, { "epoch": 0.82, "learning_rate": 3.776526634906886e-05, "loss": 3.2701, "step": 4000 }, { "epoch": 0.82, "eval_accuracy": 0.35390894819466245, "eval_loss": 4.0217413902282715, "eval_runtime": 10.3215, "eval_samples_per_second": 3.391, "eval_steps_per_second": 0.194, "step": 4000 }, { "epoch": 0.82, "learning_rate": 3.7332178432221744e-05, "loss": 3.284, "step": 4010 }, { "epoch": 0.83, "learning_rate": 3.689909051537462e-05, "loss": 3.2948, "step": 4020 }, { "epoch": 0.83, "learning_rate": 3.6466002598527506e-05, "loss": 3.25, "step": 4030 }, { "epoch": 0.83, "learning_rate": 3.603291468168038e-05, "loss": 3.285, "step": 4040 }, { "epoch": 0.83, "learning_rate": 3.559982676483326e-05, "loss": 3.276, "step": 4050 }, { "epoch": 0.83, "eval_accuracy": 0.35459270887842315, "eval_loss": 4.0179877281188965, "eval_runtime": 10.3103, "eval_samples_per_second": 3.395, "eval_steps_per_second": 0.194, "step": 4050 }, { "epoch": 0.83, "learning_rate": 3.521004763967086e-05, "loss": 3.2278, "step": 4060 }, { "epoch": 0.84, "learning_rate": 3.4776959722823735e-05, "loss": 3.2552, "step": 4070 }, { "epoch": 0.84, "learning_rate": 3.434387180597661e-05, "loss": 3.2346, "step": 4080 }, { "epoch": 0.84, "learning_rate": 3.3910783889129496e-05, "loss": 3.2365, "step": 4090 }, { "epoch": 0.84, "learning_rate": 3.3477695972282374e-05, "loss": 3.3152, "step": 4100 }, { "epoch": 0.84, "eval_accuracy": 0.3540763997906855, "eval_loss": 4.020029067993164, "eval_runtime": 10.3176, "eval_samples_per_second": 3.392, "eval_steps_per_second": 0.194, "step": 4100 }, { "epoch": 0.85, "learning_rate": 3.304460805543526e-05, "loss": 3.3036, "step": 4110 }, { "epoch": 0.85, "learning_rate": 3.2611520138588135e-05, "loss": 3.2688, "step": 4120 }, { "epoch": 0.85, "learning_rate": 3.217843222174102e-05, "loss": 3.2507, "step": 4130 }, { "epoch": 0.85, "learning_rate": 3.174534430489389e-05, "loss": 3.2347, "step": 4140 }, { "epoch": 0.85, "learning_rate": 3.1312256388046774e-05, "loss": 3.2647, "step": 4150 }, { "epoch": 0.85, "eval_accuracy": 0.35356706785278214, "eval_loss": 4.020448684692383, "eval_runtime": 10.3118, "eval_samples_per_second": 3.394, "eval_steps_per_second": 0.194, "step": 4150 }, { "epoch": 0.86, "learning_rate": 3.087916847119965e-05, "loss": 3.2171, "step": 4160 }, { "epoch": 0.86, "learning_rate": 3.0446080554352535e-05, "loss": 3.2578, "step": 4170 }, { "epoch": 0.86, "learning_rate": 3.0012992637505416e-05, "loss": 3.234, "step": 4180 }, { "epoch": 0.86, "learning_rate": 2.9579904720658297e-05, "loss": 3.2092, "step": 4190 }, { "epoch": 0.86, "learning_rate": 2.9146816803811177e-05, "loss": 3.23, "step": 4200 }, { "epoch": 0.86, "eval_accuracy": 0.3542926914355486, "eval_loss": 4.017209053039551, "eval_runtime": 10.3231, "eval_samples_per_second": 3.39, "eval_steps_per_second": 0.194, "step": 4200 }, { "epoch": 0.87, "learning_rate": 2.871372888696405e-05, "loss": 3.312, "step": 4210 }, { "epoch": 0.87, "learning_rate": 2.8280640970116936e-05, "loss": 3.2474, "step": 4220 }, { "epoch": 0.87, "learning_rate": 2.7847553053269816e-05, "loss": 3.1625, "step": 4230 }, { "epoch": 0.87, "learning_rate": 2.7414465136422697e-05, "loss": 3.2238, "step": 4240 }, { "epoch": 0.87, "learning_rate": 2.6981377219575578e-05, "loss": 3.2393, "step": 4250 }, { "epoch": 0.87, "eval_accuracy": 0.3541740798883656, "eval_loss": 4.018671989440918, "eval_runtime": 10.3185, "eval_samples_per_second": 3.392, "eval_steps_per_second": 0.194, "step": 4250 }, { "epoch": 0.88, "learning_rate": 2.654828930272846e-05, "loss": 3.3045, "step": 4260 }, { "epoch": 0.88, "learning_rate": 2.6115201385881332e-05, "loss": 3.235, "step": 4270 }, { "epoch": 0.88, "learning_rate": 2.5682113469034213e-05, "loss": 3.2841, "step": 4280 }, { "epoch": 0.88, "learning_rate": 2.5249025552187094e-05, "loss": 3.2213, "step": 4290 }, { "epoch": 0.88, "learning_rate": 2.4815937635339975e-05, "loss": 3.2716, "step": 4300 }, { "epoch": 0.88, "eval_accuracy": 0.35403453689167974, "eval_loss": 4.0169243812561035, "eval_runtime": 10.3295, "eval_samples_per_second": 3.388, "eval_steps_per_second": 0.194, "step": 4300 }, { "epoch": 0.89, "learning_rate": 2.4382849718492855e-05, "loss": 3.2722, "step": 4310 }, { "epoch": 0.89, "learning_rate": 2.3949761801645733e-05, "loss": 3.2522, "step": 4320 }, { "epoch": 0.89, "learning_rate": 2.3516673884798617e-05, "loss": 3.2488, "step": 4330 }, { "epoch": 0.89, "learning_rate": 2.3083585967951497e-05, "loss": 3.242, "step": 4340 }, { "epoch": 0.89, "learning_rate": 2.2650498051104375e-05, "loss": 3.2383, "step": 4350 }, { "epoch": 0.89, "eval_accuracy": 0.35424385138670855, "eval_loss": 4.014565467834473, "eval_runtime": 10.3078, "eval_samples_per_second": 3.395, "eval_steps_per_second": 0.194, "step": 4350 }, { "epoch": 0.9, "learning_rate": 2.2217410134257256e-05, "loss": 3.2143, "step": 4360 }, { "epoch": 0.9, "learning_rate": 2.1784322217410136e-05, "loss": 3.2828, "step": 4370 }, { "epoch": 0.9, "learning_rate": 2.1351234300563017e-05, "loss": 3.2291, "step": 4380 }, { "epoch": 0.9, "learning_rate": 2.0918146383715894e-05, "loss": 3.243, "step": 4390 }, { "epoch": 0.9, "learning_rate": 2.0485058466868775e-05, "loss": 3.3085, "step": 4400 }, { "epoch": 0.9, "eval_accuracy": 0.35441130298273155, "eval_loss": 4.013179779052734, "eval_runtime": 10.2995, "eval_samples_per_second": 3.398, "eval_steps_per_second": 0.194, "step": 4400 }, { "epoch": 0.91, "learning_rate": 2.0051970550021656e-05, "loss": 3.2461, "step": 4410 }, { "epoch": 0.91, "learning_rate": 1.9618882633174533e-05, "loss": 3.2922, "step": 4420 }, { "epoch": 0.91, "learning_rate": 1.9185794716327414e-05, "loss": 3.228, "step": 4430 }, { "epoch": 0.91, "learning_rate": 1.8752706799480295e-05, "loss": 3.2603, "step": 4440 }, { "epoch": 0.92, "learning_rate": 1.8319618882633175e-05, "loss": 3.249, "step": 4450 }, { "epoch": 0.92, "eval_accuracy": 0.354850863422292, "eval_loss": 4.012799263000488, "eval_runtime": 10.3085, "eval_samples_per_second": 3.395, "eval_steps_per_second": 0.194, "step": 4450 }, { "epoch": 0.92, "learning_rate": 1.7886530965786056e-05, "loss": 3.2295, "step": 4460 }, { "epoch": 0.92, "learning_rate": 1.7453443048938937e-05, "loss": 3.2927, "step": 4470 }, { "epoch": 0.92, "learning_rate": 1.7020355132091814e-05, "loss": 3.2776, "step": 4480 }, { "epoch": 0.92, "learning_rate": 1.6587267215244695e-05, "loss": 3.2344, "step": 4490 }, { "epoch": 0.93, "learning_rate": 1.6154179298397576e-05, "loss": 3.2313, "step": 4500 }, { "epoch": 0.93, "eval_accuracy": 0.35491365777080064, "eval_loss": 4.013089656829834, "eval_runtime": 10.3145, "eval_samples_per_second": 3.393, "eval_steps_per_second": 0.194, "step": 4500 }, { "epoch": 0.93, "learning_rate": 1.5721091381550456e-05, "loss": 3.2486, "step": 4510 }, { "epoch": 0.93, "learning_rate": 1.5288003464703334e-05, "loss": 3.2489, "step": 4520 }, { "epoch": 0.93, "learning_rate": 1.4854915547856216e-05, "loss": 3.246, "step": 4530 }, { "epoch": 0.93, "learning_rate": 1.4421827631009097e-05, "loss": 3.1958, "step": 4540 }, { "epoch": 0.94, "learning_rate": 1.3988739714161974e-05, "loss": 3.2908, "step": 4550 }, { "epoch": 0.94, "eval_accuracy": 0.3545229373800802, "eval_loss": 4.011632442474365, "eval_runtime": 10.3115, "eval_samples_per_second": 3.394, "eval_steps_per_second": 0.194, "step": 4550 }, { "epoch": 0.94, "learning_rate": 1.3555651797314855e-05, "loss": 3.2766, "step": 4560 }, { "epoch": 0.94, "learning_rate": 1.3122563880467736e-05, "loss": 3.2029, "step": 4570 }, { "epoch": 0.94, "learning_rate": 1.2689475963620615e-05, "loss": 3.2499, "step": 4580 }, { "epoch": 0.94, "learning_rate": 1.2299696838458208e-05, "loss": 3.2521, "step": 4590 }, { "epoch": 0.95, "learning_rate": 1.1866608921611087e-05, "loss": 3.2715, "step": 4600 }, { "epoch": 0.95, "eval_accuracy": 0.3545089830804117, "eval_loss": 4.0114569664001465, "eval_runtime": 10.3298, "eval_samples_per_second": 3.388, "eval_steps_per_second": 0.194, "step": 4600 }, { "epoch": 0.95, "learning_rate": 1.1433521004763968e-05, "loss": 3.2073, "step": 4610 }, { "epoch": 0.95, "learning_rate": 1.1000433087916847e-05, "loss": 3.2471, "step": 4620 }, { "epoch": 0.95, "learning_rate": 1.0567345171069728e-05, "loss": 3.247, "step": 4630 }, { "epoch": 0.95, "learning_rate": 1.0134257254222609e-05, "loss": 3.27, "step": 4640 }, { "epoch": 0.96, "learning_rate": 9.701169337375488e-06, "loss": 3.21, "step": 4650 }, { "epoch": 0.96, "eval_accuracy": 0.354613640327926, "eval_loss": 4.011072158813477, "eval_runtime": 10.3358, "eval_samples_per_second": 3.386, "eval_steps_per_second": 0.194, "step": 4650 }, { "epoch": 0.96, "learning_rate": 9.268081420528368e-06, "loss": 3.2566, "step": 4660 }, { "epoch": 0.96, "learning_rate": 8.834993503681247e-06, "loss": 3.2382, "step": 4670 }, { "epoch": 0.96, "learning_rate": 8.401905586834128e-06, "loss": 3.2606, "step": 4680 }, { "epoch": 0.96, "learning_rate": 7.968817669987009e-06, "loss": 3.2536, "step": 4690 }, { "epoch": 0.97, "learning_rate": 7.535729753139888e-06, "loss": 3.2846, "step": 4700 }, { "epoch": 0.97, "eval_accuracy": 0.3547252747252747, "eval_loss": 4.011029243469238, "eval_runtime": 10.3241, "eval_samples_per_second": 3.39, "eval_steps_per_second": 0.194, "step": 4700 }, { "epoch": 0.97, "learning_rate": 7.102641836292767e-06, "loss": 3.2191, "step": 4710 }, { "epoch": 0.97, "learning_rate": 6.669553919445648e-06, "loss": 3.2743, "step": 4720 }, { "epoch": 0.97, "learning_rate": 6.2364660025985275e-06, "loss": 3.2955, "step": 4730 }, { "epoch": 0.97, "learning_rate": 5.803378085751407e-06, "loss": 3.2061, "step": 4740 }, { "epoch": 0.98, "learning_rate": 5.370290168904288e-06, "loss": 3.2913, "step": 4750 }, { "epoch": 0.98, "eval_accuracy": 0.3547392290249433, "eval_loss": 4.011110782623291, "eval_runtime": 10.3213, "eval_samples_per_second": 3.391, "eval_steps_per_second": 0.194, "step": 4750 }, { "epoch": 0.98, "learning_rate": 4.937202252057168e-06, "loss": 3.1993, "step": 4760 }, { "epoch": 0.98, "learning_rate": 4.504114335210048e-06, "loss": 3.1921, "step": 4770 }, { "epoch": 0.98, "learning_rate": 4.071026418362928e-06, "loss": 3.231, "step": 4780 }, { "epoch": 0.99, "learning_rate": 3.6379385015158076e-06, "loss": 3.268, "step": 4790 }, { "epoch": 0.99, "learning_rate": 3.204850584668688e-06, "loss": 3.2902, "step": 4800 }, { "epoch": 0.99, "eval_accuracy": 0.3546694575266004, "eval_loss": 4.011045455932617, "eval_runtime": 10.3382, "eval_samples_per_second": 3.385, "eval_steps_per_second": 0.193, "step": 4800 }, { "epoch": 0.99, "learning_rate": 2.771762667821568e-06, "loss": 3.2694, "step": 4810 }, { "epoch": 0.99, "learning_rate": 2.338674750974448e-06, "loss": 3.226, "step": 4820 }, { "epoch": 0.99, "learning_rate": 1.9055868341273278e-06, "loss": 3.2843, "step": 4830 }, { "epoch": 1.0, "learning_rate": 1.472498917280208e-06, "loss": 3.1971, "step": 4840 }, { "epoch": 1.0, "learning_rate": 1.039411000433088e-06, "loss": 3.2593, "step": 4850 }, { "epoch": 1.0, "eval_accuracy": 0.3547252747252747, "eval_loss": 4.01063871383667, "eval_runtime": 10.3267, "eval_samples_per_second": 3.389, "eval_steps_per_second": 0.194, "step": 4850 }, { "epoch": 1.0, "learning_rate": 6.06323083585968e-07, "loss": 3.2748, "step": 4860 }, { "epoch": 1.0, "step": 4862, "total_flos": 9.97087842226311e+19, "train_loss": 3.63210295110708, "train_runtime": 159180.7121, "train_samples_per_second": 1.955, "train_steps_per_second": 0.031 } ], "max_steps": 4862, "num_train_epochs": 1, "total_flos": 9.97087842226311e+19, "trial_name": null, "trial_params": null }