{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 10000, "global_step": 268932, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.990703969776747e-05, "loss": 8.1971, "step": 500 }, { "epoch": 0.0, "learning_rate": 4.9814079395534934e-05, "loss": 7.3697, "step": 1000 }, { "epoch": 0.01, "learning_rate": 4.97211190933024e-05, "loss": 6.9478, "step": 1500 }, { "epoch": 0.01, "learning_rate": 4.962815879106986e-05, "loss": 6.5908, "step": 2000 }, { "epoch": 0.01, "learning_rate": 4.953519848883733e-05, "loss": 6.3102, "step": 2500 }, { "epoch": 0.01, "learning_rate": 4.94422381866048e-05, "loss": 6.0716, "step": 3000 }, { "epoch": 0.01, "learning_rate": 4.9349277884372256e-05, "loss": 5.859, "step": 3500 }, { "epoch": 0.01, "learning_rate": 4.925631758213973e-05, "loss": 5.6554, "step": 4000 }, { "epoch": 0.02, "learning_rate": 4.916335727990719e-05, "loss": 5.4972, "step": 4500 }, { "epoch": 0.02, "learning_rate": 4.907039697767465e-05, "loss": 5.3606, "step": 5000 }, { "epoch": 0.02, "learning_rate": 4.8977436675442126e-05, "loss": 5.2518, "step": 5500 }, { "epoch": 0.02, "learning_rate": 4.8884476373209585e-05, "loss": 5.1562, "step": 6000 }, { "epoch": 0.02, "learning_rate": 4.879151607097705e-05, "loss": 5.0784, "step": 6500 }, { "epoch": 0.03, "learning_rate": 4.8698555768744516e-05, "loss": 5.0087, "step": 7000 }, { "epoch": 0.03, "learning_rate": 4.860559546651198e-05, "loss": 4.9465, "step": 7500 }, { "epoch": 0.03, "learning_rate": 4.851263516427945e-05, "loss": 4.8951, "step": 8000 }, { "epoch": 0.03, "learning_rate": 4.841967486204691e-05, "loss": 4.8451, "step": 8500 }, { "epoch": 0.03, "learning_rate": 4.832671455981438e-05, "loss": 4.8033, "step": 9000 }, { "epoch": 0.04, "learning_rate": 4.8233754257581845e-05, "loss": 4.772, "step": 9500 }, { "epoch": 0.04, "learning_rate": 4.814079395534931e-05, "loss": 4.729, "step": 10000 }, { "epoch": 0.04, "eval_accuracy": 0.2836232206800241, "eval_loss": 4.607733249664307, "eval_runtime": 5046.5458, "eval_samples_per_second": 89.595, "eval_steps_per_second": 1.4, "step": 10000 }, { "epoch": 0.04, "learning_rate": 4.8047833653116776e-05, "loss": 4.698, "step": 10500 }, { "epoch": 0.04, "learning_rate": 4.7954873350884235e-05, "loss": 4.6666, "step": 11000 }, { "epoch": 0.04, "learning_rate": 4.786191304865171e-05, "loss": 4.6425, "step": 11500 }, { "epoch": 0.04, "learning_rate": 4.7768952746419174e-05, "loss": 4.6117, "step": 12000 }, { "epoch": 0.05, "learning_rate": 4.767599244418663e-05, "loss": 4.5848, "step": 12500 }, { "epoch": 0.05, "learning_rate": 4.7583032141954105e-05, "loss": 4.5665, "step": 13000 }, { "epoch": 0.05, "learning_rate": 4.7490071839721564e-05, "loss": 4.5407, "step": 13500 }, { "epoch": 0.05, "learning_rate": 4.739711153748903e-05, "loss": 4.5278, "step": 14000 }, { "epoch": 0.05, "learning_rate": 4.73041512352565e-05, "loss": 4.501, "step": 14500 }, { "epoch": 0.06, "learning_rate": 4.721119093302396e-05, "loss": 4.4807, "step": 15000 }, { "epoch": 0.06, "learning_rate": 4.711823063079143e-05, "loss": 4.468, "step": 15500 }, { "epoch": 0.06, "learning_rate": 4.702527032855889e-05, "loss": 4.4464, "step": 16000 }, { "epoch": 0.06, "learning_rate": 4.693231002632636e-05, "loss": 4.4371, "step": 16500 }, { "epoch": 0.06, "learning_rate": 4.6839349724093824e-05, "loss": 4.4197, "step": 17000 }, { "epoch": 0.07, "learning_rate": 4.674638942186129e-05, "loss": 4.4046, "step": 17500 }, { "epoch": 0.07, "learning_rate": 4.6653429119628756e-05, "loss": 4.3892, "step": 18000 }, { "epoch": 0.07, "learning_rate": 4.656046881739622e-05, "loss": 4.3779, "step": 18500 }, { "epoch": 0.07, "learning_rate": 4.646750851516369e-05, "loss": 4.3627, "step": 19000 }, { "epoch": 0.07, "learning_rate": 4.637454821293115e-05, "loss": 4.3501, "step": 19500 }, { "epoch": 0.07, "learning_rate": 4.628158791069861e-05, "loss": 4.3383, "step": 20000 }, { "epoch": 0.07, "eval_accuracy": 0.31616356516397504, "eval_loss": 4.231759548187256, "eval_runtime": 5043.1658, "eval_samples_per_second": 89.655, "eval_steps_per_second": 1.401, "step": 20000 }, { "epoch": 0.08, "learning_rate": 4.6188627608466085e-05, "loss": 4.3304, "step": 20500 }, { "epoch": 0.08, "learning_rate": 4.609566730623355e-05, "loss": 4.3221, "step": 21000 }, { "epoch": 0.08, "learning_rate": 4.600270700400101e-05, "loss": 4.3029, "step": 21500 }, { "epoch": 0.08, "learning_rate": 4.590974670176848e-05, "loss": 4.2954, "step": 22000 }, { "epoch": 0.08, "learning_rate": 4.581678639953594e-05, "loss": 4.2818, "step": 22500 }, { "epoch": 0.09, "learning_rate": 4.5723826097303407e-05, "loss": 4.2812, "step": 23000 }, { "epoch": 0.09, "learning_rate": 4.563086579507088e-05, "loss": 4.2749, "step": 23500 }, { "epoch": 0.09, "learning_rate": 4.553790549283834e-05, "loss": 4.2555, "step": 24000 }, { "epoch": 0.09, "learning_rate": 4.5444945190605804e-05, "loss": 4.2467, "step": 24500 }, { "epoch": 0.09, "learning_rate": 4.535198488837327e-05, "loss": 4.242, "step": 25000 }, { "epoch": 0.09, "learning_rate": 4.5259024586140735e-05, "loss": 4.2395, "step": 25500 }, { "epoch": 0.1, "learning_rate": 4.51660642839082e-05, "loss": 4.2208, "step": 26000 }, { "epoch": 0.1, "learning_rate": 4.507310398167567e-05, "loss": 4.2181, "step": 26500 }, { "epoch": 0.1, "learning_rate": 4.498014367944313e-05, "loss": 4.2108, "step": 27000 }, { "epoch": 0.1, "learning_rate": 4.48871833772106e-05, "loss": 4.2053, "step": 27500 }, { "epoch": 0.1, "learning_rate": 4.4794223074978064e-05, "loss": 4.1924, "step": 28000 }, { "epoch": 0.11, "learning_rate": 4.470126277274553e-05, "loss": 4.187, "step": 28500 }, { "epoch": 0.11, "learning_rate": 4.460830247051299e-05, "loss": 4.1839, "step": 29000 }, { "epoch": 0.11, "learning_rate": 4.451534216828046e-05, "loss": 4.1739, "step": 29500 }, { "epoch": 0.11, "learning_rate": 4.442238186604793e-05, "loss": 4.1706, "step": 30000 }, { "epoch": 0.11, "eval_accuracy": 0.33159489024981786, "eval_loss": 4.065094947814941, "eval_runtime": 5049.5785, "eval_samples_per_second": 89.541, "eval_steps_per_second": 1.399, "step": 30000 }, { "epoch": 0.11, "learning_rate": 4.432942156381539e-05, "loss": 4.1606, "step": 30500 }, { "epoch": 0.12, "learning_rate": 4.423646126158286e-05, "loss": 4.1613, "step": 31000 }, { "epoch": 0.12, "learning_rate": 4.414350095935032e-05, "loss": 4.1533, "step": 31500 }, { "epoch": 0.12, "learning_rate": 4.405054065711779e-05, "loss": 4.1449, "step": 32000 }, { "epoch": 0.12, "learning_rate": 4.3957580354885256e-05, "loss": 4.1376, "step": 32500 }, { "epoch": 0.12, "learning_rate": 4.3864620052652715e-05, "loss": 4.1307, "step": 33000 }, { "epoch": 0.12, "learning_rate": 4.377165975042019e-05, "loss": 4.1225, "step": 33500 }, { "epoch": 0.13, "learning_rate": 4.3678699448187646e-05, "loss": 4.1147, "step": 34000 }, { "epoch": 0.13, "learning_rate": 4.358573914595511e-05, "loss": 4.1113, "step": 34500 }, { "epoch": 0.13, "learning_rate": 4.3492778843722584e-05, "loss": 4.1075, "step": 35000 }, { "epoch": 0.13, "learning_rate": 4.3399818541490043e-05, "loss": 4.1023, "step": 35500 }, { "epoch": 0.13, "learning_rate": 4.330685823925751e-05, "loss": 4.0986, "step": 36000 }, { "epoch": 0.14, "learning_rate": 4.3213897937024975e-05, "loss": 4.0933, "step": 36500 }, { "epoch": 0.14, "learning_rate": 4.312093763479244e-05, "loss": 4.0899, "step": 37000 }, { "epoch": 0.14, "learning_rate": 4.3027977332559906e-05, "loss": 4.0842, "step": 37500 }, { "epoch": 0.14, "learning_rate": 4.293501703032737e-05, "loss": 4.0783, "step": 38000 }, { "epoch": 0.14, "learning_rate": 4.284205672809484e-05, "loss": 4.0721, "step": 38500 }, { "epoch": 0.15, "learning_rate": 4.2749096425862304e-05, "loss": 4.0692, "step": 39000 }, { "epoch": 0.15, "learning_rate": 4.265613612362977e-05, "loss": 4.0613, "step": 39500 }, { "epoch": 0.15, "learning_rate": 4.2563175821397235e-05, "loss": 4.0594, "step": 40000 }, { "epoch": 0.15, "eval_accuracy": 0.34162108157696924, "eval_loss": 3.9599251747131348, "eval_runtime": 5055.4019, "eval_samples_per_second": 89.438, "eval_steps_per_second": 1.398, "step": 40000 }, { "epoch": 0.15, "learning_rate": 4.2470215519164694e-05, "loss": 4.0552, "step": 40500 }, { "epoch": 0.15, "learning_rate": 4.237725521693217e-05, "loss": 4.0538, "step": 41000 }, { "epoch": 0.15, "learning_rate": 4.228429491469963e-05, "loss": 4.049, "step": 41500 }, { "epoch": 0.16, "learning_rate": 4.219133461246709e-05, "loss": 4.04, "step": 42000 }, { "epoch": 0.16, "learning_rate": 4.2098374310234564e-05, "loss": 4.0417, "step": 42500 }, { "epoch": 0.16, "learning_rate": 4.200541400800202e-05, "loss": 4.0398, "step": 43000 }, { "epoch": 0.16, "learning_rate": 4.191245370576949e-05, "loss": 4.0348, "step": 43500 }, { "epoch": 0.16, "learning_rate": 4.181949340353696e-05, "loss": 4.0271, "step": 44000 }, { "epoch": 0.17, "learning_rate": 4.172653310130442e-05, "loss": 4.0187, "step": 44500 }, { "epoch": 0.17, "learning_rate": 4.1633572799071886e-05, "loss": 4.0195, "step": 45000 }, { "epoch": 0.17, "learning_rate": 4.154061249683935e-05, "loss": 4.0134, "step": 45500 }, { "epoch": 0.17, "learning_rate": 4.144765219460682e-05, "loss": 4.012, "step": 46000 }, { "epoch": 0.17, "learning_rate": 4.135469189237428e-05, "loss": 4.0103, "step": 46500 }, { "epoch": 0.17, "learning_rate": 4.126173159014175e-05, "loss": 4.0061, "step": 47000 }, { "epoch": 0.18, "learning_rate": 4.1168771287909215e-05, "loss": 3.9976, "step": 47500 }, { "epoch": 0.18, "learning_rate": 4.107581098567668e-05, "loss": 4.0012, "step": 48000 }, { "epoch": 0.18, "learning_rate": 4.0982850683444146e-05, "loss": 4.0002, "step": 48500 }, { "epoch": 0.18, "learning_rate": 4.088989038121161e-05, "loss": 3.9899, "step": 49000 }, { "epoch": 0.18, "learning_rate": 4.079693007897907e-05, "loss": 3.9881, "step": 49500 }, { "epoch": 0.19, "learning_rate": 4.070396977674654e-05, "loss": 3.9842, "step": 50000 }, { "epoch": 0.19, "eval_accuracy": 0.3486961893069126, "eval_loss": 3.8825323581695557, "eval_runtime": 5041.9196, "eval_samples_per_second": 89.677, "eval_steps_per_second": 1.401, "step": 50000 }, { "epoch": 0.19, "learning_rate": 4.0611009474514e-05, "loss": 3.9765, "step": 50500 }, { "epoch": 0.19, "learning_rate": 4.051804917228147e-05, "loss": 3.974, "step": 51000 }, { "epoch": 0.19, "learning_rate": 4.042508887004894e-05, "loss": 3.9809, "step": 51500 }, { "epoch": 0.19, "learning_rate": 4.03321285678164e-05, "loss": 3.9751, "step": 52000 }, { "epoch": 0.2, "learning_rate": 4.0239168265583865e-05, "loss": 3.9714, "step": 52500 }, { "epoch": 0.2, "learning_rate": 4.014620796335133e-05, "loss": 3.967, "step": 53000 }, { "epoch": 0.2, "learning_rate": 4.00532476611188e-05, "loss": 3.9642, "step": 53500 }, { "epoch": 0.2, "learning_rate": 3.996028735888626e-05, "loss": 3.9586, "step": 54000 }, { "epoch": 0.2, "learning_rate": 3.986732705665373e-05, "loss": 3.9591, "step": 54500 }, { "epoch": 0.2, "learning_rate": 3.9774366754421194e-05, "loss": 3.9532, "step": 55000 }, { "epoch": 0.21, "learning_rate": 3.968140645218866e-05, "loss": 3.9432, "step": 55500 }, { "epoch": 0.21, "learning_rate": 3.9588446149956126e-05, "loss": 3.9482, "step": 56000 }, { "epoch": 0.21, "learning_rate": 3.949548584772359e-05, "loss": 3.9414, "step": 56500 }, { "epoch": 0.21, "learning_rate": 3.940252554549105e-05, "loss": 3.9397, "step": 57000 }, { "epoch": 0.21, "learning_rate": 3.930956524325852e-05, "loss": 3.9329, "step": 57500 }, { "epoch": 0.22, "learning_rate": 3.921660494102599e-05, "loss": 3.9361, "step": 58000 }, { "epoch": 0.22, "learning_rate": 3.912364463879345e-05, "loss": 3.9322, "step": 58500 }, { "epoch": 0.22, "learning_rate": 3.903068433656092e-05, "loss": 3.929, "step": 59000 }, { "epoch": 0.22, "learning_rate": 3.893772403432838e-05, "loss": 3.9308, "step": 59500 }, { "epoch": 0.22, "learning_rate": 3.8844763732095845e-05, "loss": 3.9298, "step": 60000 }, { "epoch": 0.22, "eval_accuracy": 0.3545327865706106, "eval_loss": 3.824397087097168, "eval_runtime": 5028.3225, "eval_samples_per_second": 89.92, "eval_steps_per_second": 1.405, "step": 60000 }, { "epoch": 0.22, "learning_rate": 3.875180342986332e-05, "loss": 3.9259, "step": 60500 }, { "epoch": 0.23, "learning_rate": 3.8658843127630776e-05, "loss": 3.9188, "step": 61000 }, { "epoch": 0.23, "learning_rate": 3.856588282539824e-05, "loss": 3.9109, "step": 61500 }, { "epoch": 0.23, "learning_rate": 3.847292252316571e-05, "loss": 3.9193, "step": 62000 }, { "epoch": 0.23, "learning_rate": 3.8379962220933174e-05, "loss": 3.9078, "step": 62500 }, { "epoch": 0.23, "learning_rate": 3.828700191870064e-05, "loss": 3.913, "step": 63000 }, { "epoch": 0.24, "learning_rate": 3.8194041616468105e-05, "loss": 3.9015, "step": 63500 }, { "epoch": 0.24, "learning_rate": 3.810108131423557e-05, "loss": 3.9075, "step": 64000 }, { "epoch": 0.24, "learning_rate": 3.8008121012003037e-05, "loss": 3.9038, "step": 64500 }, { "epoch": 0.24, "learning_rate": 3.79151607097705e-05, "loss": 3.8996, "step": 65000 }, { "epoch": 0.24, "learning_rate": 3.782220040753797e-05, "loss": 3.8987, "step": 65500 }, { "epoch": 0.25, "learning_rate": 3.772924010530543e-05, "loss": 3.8999, "step": 66000 }, { "epoch": 0.25, "learning_rate": 3.76362798030729e-05, "loss": 3.8889, "step": 66500 }, { "epoch": 0.25, "learning_rate": 3.7543319500840365e-05, "loss": 3.8927, "step": 67000 }, { "epoch": 0.25, "learning_rate": 3.7450359198607824e-05, "loss": 3.8974, "step": 67500 }, { "epoch": 0.25, "learning_rate": 3.73573988963753e-05, "loss": 3.8895, "step": 68000 }, { "epoch": 0.25, "learning_rate": 3.7264438594142756e-05, "loss": 3.8908, "step": 68500 }, { "epoch": 0.26, "learning_rate": 3.717147829191022e-05, "loss": 3.8816, "step": 69000 }, { "epoch": 0.26, "learning_rate": 3.7078517989677694e-05, "loss": 3.8718, "step": 69500 }, { "epoch": 0.26, "learning_rate": 3.698555768744515e-05, "loss": 3.8777, "step": 70000 }, { "epoch": 0.26, "eval_accuracy": 0.35923910731713965, "eval_loss": 3.7790777683258057, "eval_runtime": 5041.8181, "eval_samples_per_second": 89.679, "eval_steps_per_second": 1.401, "step": 70000 }, { "epoch": 0.26, "learning_rate": 3.689259738521262e-05, "loss": 3.8821, "step": 70500 }, { "epoch": 0.26, "learning_rate": 3.6799637082980085e-05, "loss": 3.8743, "step": 71000 }, { "epoch": 0.27, "learning_rate": 3.670667678074755e-05, "loss": 3.8735, "step": 71500 }, { "epoch": 0.27, "learning_rate": 3.6613716478515016e-05, "loss": 3.8722, "step": 72000 }, { "epoch": 0.27, "learning_rate": 3.652075617628248e-05, "loss": 3.8644, "step": 72500 }, { "epoch": 0.27, "learning_rate": 3.642779587404995e-05, "loss": 3.867, "step": 73000 }, { "epoch": 0.27, "learning_rate": 3.633483557181741e-05, "loss": 3.8666, "step": 73500 }, { "epoch": 0.28, "learning_rate": 3.624187526958488e-05, "loss": 3.8615, "step": 74000 }, { "epoch": 0.28, "learning_rate": 3.6148914967352345e-05, "loss": 3.8662, "step": 74500 }, { "epoch": 0.28, "learning_rate": 3.6055954665119804e-05, "loss": 3.8645, "step": 75000 }, { "epoch": 0.28, "learning_rate": 3.5962994362887276e-05, "loss": 3.8559, "step": 75500 }, { "epoch": 0.28, "learning_rate": 3.587003406065474e-05, "loss": 3.8536, "step": 76000 }, { "epoch": 0.28, "learning_rate": 3.57770737584222e-05, "loss": 3.8527, "step": 76500 }, { "epoch": 0.29, "learning_rate": 3.5684113456189673e-05, "loss": 3.848, "step": 77000 }, { "epoch": 0.29, "learning_rate": 3.559115315395713e-05, "loss": 3.8495, "step": 77500 }, { "epoch": 0.29, "learning_rate": 3.54981928517246e-05, "loss": 3.8523, "step": 78000 }, { "epoch": 0.29, "learning_rate": 3.540523254949207e-05, "loss": 3.8505, "step": 78500 }, { "epoch": 0.29, "learning_rate": 3.531227224725953e-05, "loss": 3.8439, "step": 79000 }, { "epoch": 0.3, "learning_rate": 3.5219311945026995e-05, "loss": 3.8387, "step": 79500 }, { "epoch": 0.3, "learning_rate": 3.512635164279446e-05, "loss": 3.8455, "step": 80000 }, { "epoch": 0.3, "eval_accuracy": 0.36290677735789373, "eval_loss": 3.7436022758483887, "eval_runtime": 5030.6589, "eval_samples_per_second": 89.878, "eval_steps_per_second": 1.404, "step": 80000 }, { "epoch": 0.3, "learning_rate": 3.503339134056193e-05, "loss": 3.8383, "step": 80500 }, { "epoch": 0.3, "learning_rate": 3.494043103832939e-05, "loss": 3.8445, "step": 81000 }, { "epoch": 0.3, "learning_rate": 3.484747073609686e-05, "loss": 3.8439, "step": 81500 }, { "epoch": 0.3, "learning_rate": 3.4754510433864324e-05, "loss": 3.8357, "step": 82000 }, { "epoch": 0.31, "learning_rate": 3.466155013163179e-05, "loss": 3.8362, "step": 82500 }, { "epoch": 0.31, "learning_rate": 3.4568589829399256e-05, "loss": 3.8311, "step": 83000 }, { "epoch": 0.31, "learning_rate": 3.447562952716672e-05, "loss": 3.8263, "step": 83500 }, { "epoch": 0.31, "learning_rate": 3.438266922493418e-05, "loss": 3.8328, "step": 84000 }, { "epoch": 0.31, "learning_rate": 3.428970892270165e-05, "loss": 3.8303, "step": 84500 }, { "epoch": 0.32, "learning_rate": 3.419674862046912e-05, "loss": 3.8202, "step": 85000 }, { "epoch": 0.32, "learning_rate": 3.410378831823658e-05, "loss": 3.8243, "step": 85500 }, { "epoch": 0.32, "learning_rate": 3.401082801600405e-05, "loss": 3.8284, "step": 86000 }, { "epoch": 0.32, "learning_rate": 3.391786771377151e-05, "loss": 3.8215, "step": 86500 }, { "epoch": 0.32, "learning_rate": 3.3824907411538975e-05, "loss": 3.8179, "step": 87000 }, { "epoch": 0.33, "learning_rate": 3.373194710930645e-05, "loss": 3.821, "step": 87500 }, { "epoch": 0.33, "learning_rate": 3.3638986807073906e-05, "loss": 3.816, "step": 88000 }, { "epoch": 0.33, "learning_rate": 3.354602650484138e-05, "loss": 3.8129, "step": 88500 }, { "epoch": 0.33, "learning_rate": 3.345306620260884e-05, "loss": 3.8203, "step": 89000 }, { "epoch": 0.33, "learning_rate": 3.3360105900376304e-05, "loss": 3.818, "step": 89500 }, { "epoch": 0.33, "learning_rate": 3.3267145598143776e-05, "loss": 3.8104, "step": 90000 }, { "epoch": 0.33, "eval_accuracy": 0.36599874258539994, "eval_loss": 3.7120308876037598, "eval_runtime": 5023.4047, "eval_samples_per_second": 90.008, "eval_steps_per_second": 1.406, "step": 90000 }, { "epoch": 0.34, "learning_rate": 3.3174185295911235e-05, "loss": 3.8046, "step": 90500 }, { "epoch": 0.34, "learning_rate": 3.30812249936787e-05, "loss": 3.8116, "step": 91000 }, { "epoch": 0.34, "learning_rate": 3.298826469144617e-05, "loss": 3.8063, "step": 91500 }, { "epoch": 0.34, "learning_rate": 3.289530438921363e-05, "loss": 3.8029, "step": 92000 }, { "epoch": 0.34, "learning_rate": 3.28023440869811e-05, "loss": 3.8023, "step": 92500 }, { "epoch": 0.35, "learning_rate": 3.2709383784748564e-05, "loss": 3.799, "step": 93000 }, { "epoch": 0.35, "learning_rate": 3.261642348251603e-05, "loss": 3.7974, "step": 93500 }, { "epoch": 0.35, "learning_rate": 3.2523463180283495e-05, "loss": 3.7981, "step": 94000 }, { "epoch": 0.35, "learning_rate": 3.243050287805096e-05, "loss": 3.7979, "step": 94500 }, { "epoch": 0.35, "learning_rate": 3.233754257581843e-05, "loss": 3.7971, "step": 95000 }, { "epoch": 0.36, "learning_rate": 3.2244582273585886e-05, "loss": 3.7956, "step": 95500 }, { "epoch": 0.36, "learning_rate": 3.215162197135336e-05, "loss": 3.7966, "step": 96000 }, { "epoch": 0.36, "learning_rate": 3.2058661669120824e-05, "loss": 3.7903, "step": 96500 }, { "epoch": 0.36, "learning_rate": 3.196570136688828e-05, "loss": 3.7944, "step": 97000 }, { "epoch": 0.36, "learning_rate": 3.1872741064655756e-05, "loss": 3.7942, "step": 97500 }, { "epoch": 0.36, "learning_rate": 3.1779780762423215e-05, "loss": 3.7871, "step": 98000 }, { "epoch": 0.37, "learning_rate": 3.168682046019068e-05, "loss": 3.7912, "step": 98500 }, { "epoch": 0.37, "learning_rate": 3.1593860157958146e-05, "loss": 3.7909, "step": 99000 }, { "epoch": 0.37, "learning_rate": 3.150089985572561e-05, "loss": 3.7798, "step": 99500 }, { "epoch": 0.37, "learning_rate": 3.140793955349308e-05, "loss": 3.7908, "step": 100000 }, { "epoch": 0.37, "eval_accuracy": 0.36866443948556843, "eval_loss": 3.686194896697998, "eval_runtime": 5036.4153, "eval_samples_per_second": 89.775, "eval_steps_per_second": 1.403, "step": 100000 }, { "epoch": 0.37, "learning_rate": 3.131497925126054e-05, "loss": 3.7867, "step": 100500 }, { "epoch": 0.38, "learning_rate": 3.122201894902801e-05, "loss": 3.7815, "step": 101000 }, { "epoch": 0.38, "learning_rate": 3.1129058646795475e-05, "loss": 3.781, "step": 101500 }, { "epoch": 0.38, "learning_rate": 3.103609834456294e-05, "loss": 3.7821, "step": 102000 }, { "epoch": 0.38, "learning_rate": 3.0943138042330406e-05, "loss": 3.7731, "step": 102500 }, { "epoch": 0.38, "learning_rate": 3.0850177740097865e-05, "loss": 3.7803, "step": 103000 }, { "epoch": 0.38, "learning_rate": 3.075721743786534e-05, "loss": 3.7747, "step": 103500 }, { "epoch": 0.39, "learning_rate": 3.0664257135632804e-05, "loss": 3.776, "step": 104000 }, { "epoch": 0.39, "learning_rate": 3.057129683340026e-05, "loss": 3.7718, "step": 104500 }, { "epoch": 0.39, "learning_rate": 3.0478336531167735e-05, "loss": 3.7762, "step": 105000 }, { "epoch": 0.39, "learning_rate": 3.0385376228935197e-05, "loss": 3.7731, "step": 105500 }, { "epoch": 0.39, "learning_rate": 3.029241592670266e-05, "loss": 3.7678, "step": 106000 }, { "epoch": 0.4, "learning_rate": 3.019945562447013e-05, "loss": 3.7706, "step": 106500 }, { "epoch": 0.4, "learning_rate": 3.0106495322237595e-05, "loss": 3.7676, "step": 107000 }, { "epoch": 0.4, "learning_rate": 3.0013535020005057e-05, "loss": 3.7703, "step": 107500 }, { "epoch": 0.4, "learning_rate": 2.9920574717772526e-05, "loss": 3.7657, "step": 108000 }, { "epoch": 0.4, "learning_rate": 2.982761441553999e-05, "loss": 3.7717, "step": 108500 }, { "epoch": 0.41, "learning_rate": 2.9734654113307454e-05, "loss": 3.7669, "step": 109000 }, { "epoch": 0.41, "learning_rate": 2.9641693811074923e-05, "loss": 3.7598, "step": 109500 }, { "epoch": 0.41, "learning_rate": 2.9548733508842386e-05, "loss": 3.7613, "step": 110000 }, { "epoch": 0.41, "eval_accuracy": 0.37119206097292273, "eval_loss": 3.6628386974334717, "eval_runtime": 5026.6773, "eval_samples_per_second": 89.949, "eval_steps_per_second": 1.406, "step": 110000 }, { "epoch": 0.41, "learning_rate": 2.9455773206609848e-05, "loss": 3.7575, "step": 110500 }, { "epoch": 0.41, "learning_rate": 2.9362812904377317e-05, "loss": 3.7598, "step": 111000 }, { "epoch": 0.41, "learning_rate": 2.9269852602144783e-05, "loss": 3.7473, "step": 111500 }, { "epoch": 0.42, "learning_rate": 2.9176892299912245e-05, "loss": 3.7578, "step": 112000 }, { "epoch": 0.42, "learning_rate": 2.9083931997679715e-05, "loss": 3.7512, "step": 112500 }, { "epoch": 0.42, "learning_rate": 2.8990971695447177e-05, "loss": 3.7517, "step": 113000 }, { "epoch": 0.42, "learning_rate": 2.8898011393214643e-05, "loss": 3.7585, "step": 113500 }, { "epoch": 0.42, "learning_rate": 2.8805051090982112e-05, "loss": 3.7595, "step": 114000 }, { "epoch": 0.43, "learning_rate": 2.8712090788749574e-05, "loss": 3.7493, "step": 114500 }, { "epoch": 0.43, "learning_rate": 2.8619130486517037e-05, "loss": 3.7554, "step": 115000 }, { "epoch": 0.43, "learning_rate": 2.8526170184284506e-05, "loss": 3.7515, "step": 115500 }, { "epoch": 0.43, "learning_rate": 2.843320988205197e-05, "loss": 3.7483, "step": 116000 }, { "epoch": 0.43, "learning_rate": 2.8340249579819434e-05, "loss": 3.7577, "step": 116500 }, { "epoch": 0.44, "learning_rate": 2.8247289277586903e-05, "loss": 3.753, "step": 117000 }, { "epoch": 0.44, "learning_rate": 2.8154328975354365e-05, "loss": 3.7472, "step": 117500 }, { "epoch": 0.44, "learning_rate": 2.806136867312183e-05, "loss": 3.742, "step": 118000 }, { "epoch": 0.44, "learning_rate": 2.79684083708893e-05, "loss": 3.7445, "step": 118500 }, { "epoch": 0.44, "learning_rate": 2.7875448068656763e-05, "loss": 3.7493, "step": 119000 }, { "epoch": 0.44, "learning_rate": 2.7782487766424225e-05, "loss": 3.7427, "step": 119500 }, { "epoch": 0.45, "learning_rate": 2.7689527464191694e-05, "loss": 3.7492, "step": 120000 }, { "epoch": 0.45, "eval_accuracy": 0.3731269306324982, "eval_loss": 3.643425464630127, "eval_runtime": 5034.359, "eval_samples_per_second": 89.812, "eval_steps_per_second": 1.403, "step": 120000 }, { "epoch": 0.45, "learning_rate": 2.759656716195916e-05, "loss": 3.7424, "step": 120500 }, { "epoch": 0.45, "learning_rate": 2.7503606859726622e-05, "loss": 3.7326, "step": 121000 }, { "epoch": 0.45, "learning_rate": 2.741064655749409e-05, "loss": 3.7368, "step": 121500 }, { "epoch": 0.45, "learning_rate": 2.7317686255261554e-05, "loss": 3.7387, "step": 122000 }, { "epoch": 0.46, "learning_rate": 2.722472595302902e-05, "loss": 3.7392, "step": 122500 }, { "epoch": 0.46, "learning_rate": 2.713176565079649e-05, "loss": 3.7371, "step": 123000 }, { "epoch": 0.46, "learning_rate": 2.703880534856395e-05, "loss": 3.7302, "step": 123500 }, { "epoch": 0.46, "learning_rate": 2.6945845046331413e-05, "loss": 3.7336, "step": 124000 }, { "epoch": 0.46, "learning_rate": 2.6852884744098882e-05, "loss": 3.7401, "step": 124500 }, { "epoch": 0.46, "learning_rate": 2.6759924441866345e-05, "loss": 3.7264, "step": 125000 }, { "epoch": 0.47, "learning_rate": 2.666696413963381e-05, "loss": 3.7328, "step": 125500 }, { "epoch": 0.47, "learning_rate": 2.657400383740128e-05, "loss": 3.7239, "step": 126000 }, { "epoch": 0.47, "learning_rate": 2.6481043535168742e-05, "loss": 3.727, "step": 126500 }, { "epoch": 0.47, "learning_rate": 2.6388083232936204e-05, "loss": 3.7448, "step": 127000 }, { "epoch": 0.47, "learning_rate": 2.6295122930703673e-05, "loss": 3.7296, "step": 127500 }, { "epoch": 0.48, "learning_rate": 2.620216262847114e-05, "loss": 3.7226, "step": 128000 }, { "epoch": 0.48, "learning_rate": 2.61092023262386e-05, "loss": 3.7289, "step": 128500 }, { "epoch": 0.48, "learning_rate": 2.601624202400607e-05, "loss": 3.7117, "step": 129000 }, { "epoch": 0.48, "learning_rate": 2.5923281721773533e-05, "loss": 3.7261, "step": 129500 }, { "epoch": 0.48, "learning_rate": 2.5830321419541e-05, "loss": 3.7228, "step": 130000 }, { "epoch": 0.48, "eval_accuracy": 0.3750882777539584, "eval_loss": 3.6245925426483154, "eval_runtime": 5034.4807, "eval_samples_per_second": 89.81, "eval_steps_per_second": 1.403, "step": 130000 }, { "epoch": 0.49, "learning_rate": 2.5737361117308468e-05, "loss": 3.7316, "step": 130500 }, { "epoch": 0.49, "learning_rate": 2.564440081507593e-05, "loss": 3.7187, "step": 131000 }, { "epoch": 0.49, "learning_rate": 2.5551440512843393e-05, "loss": 3.7147, "step": 131500 }, { "epoch": 0.49, "learning_rate": 2.5458480210610862e-05, "loss": 3.7185, "step": 132000 }, { "epoch": 0.49, "learning_rate": 2.5365519908378328e-05, "loss": 3.7186, "step": 132500 }, { "epoch": 0.49, "learning_rate": 2.527255960614579e-05, "loss": 3.7204, "step": 133000 }, { "epoch": 0.5, "learning_rate": 2.517959930391326e-05, "loss": 3.7154, "step": 133500 }, { "epoch": 0.5, "learning_rate": 2.508663900168072e-05, "loss": 3.7149, "step": 134000 }, { "epoch": 0.5, "learning_rate": 2.499367869944819e-05, "loss": 3.7138, "step": 134500 }, { "epoch": 0.5, "learning_rate": 2.4900718397215656e-05, "loss": 3.7194, "step": 135000 }, { "epoch": 0.5, "learning_rate": 2.480775809498312e-05, "loss": 3.7161, "step": 135500 }, { "epoch": 0.51, "learning_rate": 2.4714797792750584e-05, "loss": 3.7113, "step": 136000 }, { "epoch": 0.51, "learning_rate": 2.462183749051805e-05, "loss": 3.7124, "step": 136500 }, { "epoch": 0.51, "learning_rate": 2.4528877188285516e-05, "loss": 3.7113, "step": 137000 }, { "epoch": 0.51, "learning_rate": 2.443591688605298e-05, "loss": 3.7079, "step": 137500 }, { "epoch": 0.51, "learning_rate": 2.4342956583820447e-05, "loss": 3.7111, "step": 138000 }, { "epoch": 0.52, "learning_rate": 2.424999628158791e-05, "loss": 3.7081, "step": 138500 }, { "epoch": 0.52, "learning_rate": 2.415703597935538e-05, "loss": 3.7061, "step": 139000 }, { "epoch": 0.52, "learning_rate": 2.4064075677122845e-05, "loss": 3.7066, "step": 139500 }, { "epoch": 0.52, "learning_rate": 2.3971115374890307e-05, "loss": 3.7127, "step": 140000 }, { "epoch": 0.52, "eval_accuracy": 0.3766884162574383, "eval_loss": 3.609013319015503, "eval_runtime": 5024.1177, "eval_samples_per_second": 89.995, "eval_steps_per_second": 1.406, "step": 140000 }, { "epoch": 0.52, "learning_rate": 2.3878155072657773e-05, "loss": 3.7094, "step": 140500 }, { "epoch": 0.52, "learning_rate": 2.378519477042524e-05, "loss": 3.6994, "step": 141000 }, { "epoch": 0.53, "learning_rate": 2.3692234468192704e-05, "loss": 3.7066, "step": 141500 }, { "epoch": 0.53, "learning_rate": 2.359927416596017e-05, "loss": 3.705, "step": 142000 }, { "epoch": 0.53, "learning_rate": 2.3506313863727636e-05, "loss": 3.7026, "step": 142500 }, { "epoch": 0.53, "learning_rate": 2.3413353561495098e-05, "loss": 3.7035, "step": 143000 }, { "epoch": 0.53, "learning_rate": 2.3320393259262567e-05, "loss": 3.7052, "step": 143500 }, { "epoch": 0.54, "learning_rate": 2.3227432957030033e-05, "loss": 3.7018, "step": 144000 }, { "epoch": 0.54, "learning_rate": 2.3134472654797495e-05, "loss": 3.7042, "step": 144500 }, { "epoch": 0.54, "learning_rate": 2.304151235256496e-05, "loss": 3.7033, "step": 145000 }, { "epoch": 0.54, "learning_rate": 2.2948552050332427e-05, "loss": 3.7057, "step": 145500 }, { "epoch": 0.54, "learning_rate": 2.2855591748099893e-05, "loss": 3.7018, "step": 146000 }, { "epoch": 0.54, "learning_rate": 2.276263144586736e-05, "loss": 3.7007, "step": 146500 }, { "epoch": 0.55, "learning_rate": 2.2669671143634824e-05, "loss": 3.6892, "step": 147000 }, { "epoch": 0.55, "learning_rate": 2.2576710841402286e-05, "loss": 3.7029, "step": 147500 }, { "epoch": 0.55, "learning_rate": 2.2483750539169756e-05, "loss": 3.6992, "step": 148000 }, { "epoch": 0.55, "learning_rate": 2.239079023693722e-05, "loss": 3.6961, "step": 148500 }, { "epoch": 0.55, "learning_rate": 2.2297829934704684e-05, "loss": 3.6916, "step": 149000 }, { "epoch": 0.56, "learning_rate": 2.220486963247215e-05, "loss": 3.6983, "step": 149500 }, { "epoch": 0.56, "learning_rate": 2.2111909330239615e-05, "loss": 3.694, "step": 150000 }, { "epoch": 0.56, "eval_accuracy": 0.3782522490519747, "eval_loss": 3.5962026119232178, "eval_runtime": 5027.3439, "eval_samples_per_second": 89.937, "eval_steps_per_second": 1.405, "step": 150000 }, { "epoch": 0.56, "learning_rate": 2.201894902800708e-05, "loss": 3.6916, "step": 150500 }, { "epoch": 0.56, "learning_rate": 2.1925988725774547e-05, "loss": 3.69, "step": 151000 }, { "epoch": 0.56, "learning_rate": 2.1833028423542012e-05, "loss": 3.6858, "step": 151500 }, { "epoch": 0.57, "learning_rate": 2.1740068121309475e-05, "loss": 3.7009, "step": 152000 }, { "epoch": 0.57, "learning_rate": 2.164710781907694e-05, "loss": 3.6886, "step": 152500 }, { "epoch": 0.57, "learning_rate": 2.155414751684441e-05, "loss": 3.6869, "step": 153000 }, { "epoch": 0.57, "learning_rate": 2.1461187214611872e-05, "loss": 3.6898, "step": 153500 }, { "epoch": 0.57, "learning_rate": 2.1368226912379338e-05, "loss": 3.695, "step": 154000 }, { "epoch": 0.57, "learning_rate": 2.1275266610146804e-05, "loss": 3.6882, "step": 154500 }, { "epoch": 0.58, "learning_rate": 2.118230630791427e-05, "loss": 3.6909, "step": 155000 }, { "epoch": 0.58, "learning_rate": 2.1089346005681735e-05, "loss": 3.6862, "step": 155500 }, { "epoch": 0.58, "learning_rate": 2.09963857034492e-05, "loss": 3.6878, "step": 156000 }, { "epoch": 0.58, "learning_rate": 2.0903425401216663e-05, "loss": 3.6882, "step": 156500 }, { "epoch": 0.58, "learning_rate": 2.081046509898413e-05, "loss": 3.6841, "step": 157000 }, { "epoch": 0.59, "learning_rate": 2.0717504796751598e-05, "loss": 3.6828, "step": 157500 }, { "epoch": 0.59, "learning_rate": 2.062454449451906e-05, "loss": 3.6819, "step": 158000 }, { "epoch": 0.59, "learning_rate": 2.0531584192286526e-05, "loss": 3.6854, "step": 158500 }, { "epoch": 0.59, "learning_rate": 2.0438623890053992e-05, "loss": 3.6759, "step": 159000 }, { "epoch": 0.59, "learning_rate": 2.0345663587821458e-05, "loss": 3.6811, "step": 159500 }, { "epoch": 0.59, "learning_rate": 2.0252703285588923e-05, "loss": 3.6871, "step": 160000 }, { "epoch": 0.59, "eval_accuracy": 0.37971800692359575, "eval_loss": 3.583056688308716, "eval_runtime": 5027.862, "eval_samples_per_second": 89.928, "eval_steps_per_second": 1.405, "step": 160000 }, { "epoch": 0.6, "learning_rate": 2.015974298335639e-05, "loss": 3.6786, "step": 160500 }, { "epoch": 0.6, "learning_rate": 2.0066782681123855e-05, "loss": 3.6793, "step": 161000 }, { "epoch": 0.6, "learning_rate": 1.9973822378891317e-05, "loss": 3.6791, "step": 161500 }, { "epoch": 0.6, "learning_rate": 1.9880862076658786e-05, "loss": 3.6806, "step": 162000 }, { "epoch": 0.6, "learning_rate": 1.9787901774426252e-05, "loss": 3.6757, "step": 162500 }, { "epoch": 0.61, "learning_rate": 1.9694941472193715e-05, "loss": 3.6772, "step": 163000 }, { "epoch": 0.61, "learning_rate": 1.960198116996118e-05, "loss": 3.6803, "step": 163500 }, { "epoch": 0.61, "learning_rate": 1.9509020867728646e-05, "loss": 3.674, "step": 164000 }, { "epoch": 0.61, "learning_rate": 1.9416060565496112e-05, "loss": 3.6761, "step": 164500 }, { "epoch": 0.61, "learning_rate": 1.9323100263263578e-05, "loss": 3.6718, "step": 165000 }, { "epoch": 0.62, "learning_rate": 1.9230139961031043e-05, "loss": 3.675, "step": 165500 }, { "epoch": 0.62, "learning_rate": 1.9137179658798506e-05, "loss": 3.6734, "step": 166000 }, { "epoch": 0.62, "learning_rate": 1.9044219356565975e-05, "loss": 3.6716, "step": 166500 }, { "epoch": 0.62, "learning_rate": 1.895125905433344e-05, "loss": 3.6724, "step": 167000 }, { "epoch": 0.62, "learning_rate": 1.8858298752100903e-05, "loss": 3.6742, "step": 167500 }, { "epoch": 0.62, "learning_rate": 1.876533844986837e-05, "loss": 3.6758, "step": 168000 }, { "epoch": 0.63, "learning_rate": 1.8672378147635834e-05, "loss": 3.6704, "step": 168500 }, { "epoch": 0.63, "learning_rate": 1.85794178454033e-05, "loss": 3.6657, "step": 169000 }, { "epoch": 0.63, "learning_rate": 1.8486457543170766e-05, "loss": 3.6662, "step": 169500 }, { "epoch": 0.63, "learning_rate": 1.839349724093823e-05, "loss": 3.6784, "step": 170000 }, { "epoch": 0.63, "eval_accuracy": 0.3810095955450411, "eval_loss": 3.5707802772521973, "eval_runtime": 5019.3993, "eval_samples_per_second": 90.08, "eval_steps_per_second": 1.408, "step": 170000 }, { "epoch": 0.63, "learning_rate": 1.8300536938705694e-05, "loss": 3.672, "step": 170500 }, { "epoch": 0.64, "learning_rate": 1.8207576636473163e-05, "loss": 3.6694, "step": 171000 }, { "epoch": 0.64, "learning_rate": 1.811461633424063e-05, "loss": 3.6686, "step": 171500 }, { "epoch": 0.64, "learning_rate": 1.802165603200809e-05, "loss": 3.6712, "step": 172000 }, { "epoch": 0.64, "learning_rate": 1.7928695729775557e-05, "loss": 3.6743, "step": 172500 }, { "epoch": 0.64, "learning_rate": 1.7835735427543023e-05, "loss": 3.6632, "step": 173000 }, { "epoch": 0.65, "learning_rate": 1.774277512531049e-05, "loss": 3.6609, "step": 173500 }, { "epoch": 0.65, "learning_rate": 1.7649814823077954e-05, "loss": 3.668, "step": 174000 }, { "epoch": 0.65, "learning_rate": 1.755685452084542e-05, "loss": 3.6703, "step": 174500 }, { "epoch": 0.65, "learning_rate": 1.7463894218612882e-05, "loss": 3.6671, "step": 175000 }, { "epoch": 0.65, "learning_rate": 1.7370933916380348e-05, "loss": 3.66, "step": 175500 }, { "epoch": 0.65, "learning_rate": 1.7277973614147817e-05, "loss": 3.6648, "step": 176000 }, { "epoch": 0.66, "learning_rate": 1.718501331191528e-05, "loss": 3.6681, "step": 176500 }, { "epoch": 0.66, "learning_rate": 1.7092053009682745e-05, "loss": 3.6685, "step": 177000 }, { "epoch": 0.66, "learning_rate": 1.699909270745021e-05, "loss": 3.6669, "step": 177500 }, { "epoch": 0.66, "learning_rate": 1.6906132405217677e-05, "loss": 3.6691, "step": 178000 }, { "epoch": 0.66, "learning_rate": 1.6813172102985143e-05, "loss": 3.6575, "step": 178500 }, { "epoch": 0.67, "learning_rate": 1.6720211800752608e-05, "loss": 3.6688, "step": 179000 }, { "epoch": 0.67, "learning_rate": 1.662725149852007e-05, "loss": 3.6558, "step": 179500 }, { "epoch": 0.67, "learning_rate": 1.6534291196287536e-05, "loss": 3.6606, "step": 180000 }, { "epoch": 0.67, "eval_accuracy": 0.38233432261147465, "eval_loss": 3.559264898300171, "eval_runtime": 5018.0904, "eval_samples_per_second": 90.103, "eval_steps_per_second": 1.408, "step": 180000 }, { "epoch": 0.67, "learning_rate": 1.6441330894055006e-05, "loss": 3.6577, "step": 180500 }, { "epoch": 0.67, "learning_rate": 1.6348370591822468e-05, "loss": 3.6628, "step": 181000 }, { "epoch": 0.67, "learning_rate": 1.6255410289589934e-05, "loss": 3.6661, "step": 181500 }, { "epoch": 0.68, "learning_rate": 1.61624499873574e-05, "loss": 3.6582, "step": 182000 }, { "epoch": 0.68, "learning_rate": 1.6069489685124865e-05, "loss": 3.6554, "step": 182500 }, { "epoch": 0.68, "learning_rate": 1.597652938289233e-05, "loss": 3.6577, "step": 183000 }, { "epoch": 0.68, "learning_rate": 1.5883569080659797e-05, "loss": 3.6586, "step": 183500 }, { "epoch": 0.68, "learning_rate": 1.579060877842726e-05, "loss": 3.6565, "step": 184000 }, { "epoch": 0.69, "learning_rate": 1.5697648476194725e-05, "loss": 3.6623, "step": 184500 }, { "epoch": 0.69, "learning_rate": 1.5604688173962194e-05, "loss": 3.6617, "step": 185000 }, { "epoch": 0.69, "learning_rate": 1.5511727871729656e-05, "loss": 3.6555, "step": 185500 }, { "epoch": 0.69, "learning_rate": 1.5418767569497122e-05, "loss": 3.6524, "step": 186000 }, { "epoch": 0.69, "learning_rate": 1.5325807267264588e-05, "loss": 3.6602, "step": 186500 }, { "epoch": 0.7, "learning_rate": 1.5232846965032052e-05, "loss": 3.6563, "step": 187000 }, { "epoch": 0.7, "learning_rate": 1.5139886662799518e-05, "loss": 3.6551, "step": 187500 }, { "epoch": 0.7, "learning_rate": 1.5046926360566985e-05, "loss": 3.6567, "step": 188000 }, { "epoch": 0.7, "learning_rate": 1.4953966058334447e-05, "loss": 3.6475, "step": 188500 }, { "epoch": 0.7, "learning_rate": 1.4861005756101915e-05, "loss": 3.654, "step": 189000 }, { "epoch": 0.7, "learning_rate": 1.476804545386938e-05, "loss": 3.6559, "step": 189500 }, { "epoch": 0.71, "learning_rate": 1.4675085151636846e-05, "loss": 3.646, "step": 190000 }, { "epoch": 0.71, "eval_accuracy": 0.3834675999220702, "eval_loss": 3.549065351486206, "eval_runtime": 5019.1074, "eval_samples_per_second": 90.085, "eval_steps_per_second": 1.408, "step": 190000 }, { "epoch": 0.71, "learning_rate": 1.458212484940431e-05, "loss": 3.6544, "step": 190500 }, { "epoch": 0.71, "learning_rate": 1.4489164547171776e-05, "loss": 3.6484, "step": 191000 }, { "epoch": 0.71, "learning_rate": 1.4396204244939244e-05, "loss": 3.6467, "step": 191500 }, { "epoch": 0.71, "learning_rate": 1.4303243942706706e-05, "loss": 3.6558, "step": 192000 }, { "epoch": 0.72, "learning_rate": 1.4210283640474173e-05, "loss": 3.6461, "step": 192500 }, { "epoch": 0.72, "learning_rate": 1.4117323338241639e-05, "loss": 3.6532, "step": 193000 }, { "epoch": 0.72, "learning_rate": 1.4024363036009103e-05, "loss": 3.6551, "step": 193500 }, { "epoch": 0.72, "learning_rate": 1.3931402733776569e-05, "loss": 3.6476, "step": 194000 }, { "epoch": 0.72, "learning_rate": 1.3838442431544035e-05, "loss": 3.6466, "step": 194500 }, { "epoch": 0.73, "learning_rate": 1.3745482129311499e-05, "loss": 3.6518, "step": 195000 }, { "epoch": 0.73, "learning_rate": 1.3652521827078964e-05, "loss": 3.6413, "step": 195500 }, { "epoch": 0.73, "learning_rate": 1.3559561524846432e-05, "loss": 3.6432, "step": 196000 }, { "epoch": 0.73, "learning_rate": 1.3466601222613894e-05, "loss": 3.6464, "step": 196500 }, { "epoch": 0.73, "learning_rate": 1.3373640920381362e-05, "loss": 3.644, "step": 197000 }, { "epoch": 0.73, "learning_rate": 1.3280680618148827e-05, "loss": 3.6496, "step": 197500 }, { "epoch": 0.74, "learning_rate": 1.3187720315916292e-05, "loss": 3.6407, "step": 198000 }, { "epoch": 0.74, "learning_rate": 1.3094760013683757e-05, "loss": 3.6412, "step": 198500 }, { "epoch": 0.74, "learning_rate": 1.3001799711451223e-05, "loss": 3.646, "step": 199000 }, { "epoch": 0.74, "learning_rate": 1.2908839409218687e-05, "loss": 3.6432, "step": 199500 }, { "epoch": 0.74, "learning_rate": 1.2815879106986153e-05, "loss": 3.6453, "step": 200000 }, { "epoch": 0.74, "eval_accuracy": 0.38430268095273507, "eval_loss": 3.5410099029541016, "eval_runtime": 5035.174, "eval_samples_per_second": 89.797, "eval_steps_per_second": 1.403, "step": 200000 }, { "epoch": 0.75, "learning_rate": 1.272291880475362e-05, "loss": 3.6374, "step": 200500 }, { "epoch": 0.75, "learning_rate": 1.2629958502521083e-05, "loss": 3.6464, "step": 201000 }, { "epoch": 0.75, "learning_rate": 1.253699820028855e-05, "loss": 3.6488, "step": 201500 }, { "epoch": 0.75, "learning_rate": 1.2444037898056014e-05, "loss": 3.6405, "step": 202000 }, { "epoch": 0.75, "learning_rate": 1.235107759582348e-05, "loss": 3.6354, "step": 202500 }, { "epoch": 0.75, "learning_rate": 1.2258117293590946e-05, "loss": 3.6386, "step": 203000 }, { "epoch": 0.76, "learning_rate": 1.216515699135841e-05, "loss": 3.6349, "step": 203500 }, { "epoch": 0.76, "learning_rate": 1.2072196689125877e-05, "loss": 3.6377, "step": 204000 }, { "epoch": 0.76, "learning_rate": 1.1979236386893341e-05, "loss": 3.6374, "step": 204500 }, { "epoch": 0.76, "learning_rate": 1.1886276084660807e-05, "loss": 3.6403, "step": 205000 }, { "epoch": 0.76, "learning_rate": 1.1793315782428273e-05, "loss": 3.6397, "step": 205500 }, { "epoch": 0.77, "learning_rate": 1.1700355480195738e-05, "loss": 3.6388, "step": 206000 }, { "epoch": 0.77, "learning_rate": 1.1607395177963202e-05, "loss": 3.6413, "step": 206500 }, { "epoch": 0.77, "learning_rate": 1.1514434875730668e-05, "loss": 3.6353, "step": 207000 }, { "epoch": 0.77, "learning_rate": 1.1421474573498134e-05, "loss": 3.6395, "step": 207500 }, { "epoch": 0.77, "learning_rate": 1.1328514271265598e-05, "loss": 3.6378, "step": 208000 }, { "epoch": 0.78, "learning_rate": 1.1235553969033065e-05, "loss": 3.6394, "step": 208500 }, { "epoch": 0.78, "learning_rate": 1.114259366680053e-05, "loss": 3.6402, "step": 209000 }, { "epoch": 0.78, "learning_rate": 1.1049633364567995e-05, "loss": 3.6421, "step": 209500 }, { "epoch": 0.78, "learning_rate": 1.0956673062335461e-05, "loss": 3.6393, "step": 210000 }, { "epoch": 0.78, "eval_accuracy": 0.38505967217174975, "eval_loss": 3.5341877937316895, "eval_runtime": 5017.9224, "eval_samples_per_second": 90.106, "eval_steps_per_second": 1.408, "step": 210000 }, { "epoch": 0.78, "learning_rate": 1.0863712760102927e-05, "loss": 3.6379, "step": 210500 }, { "epoch": 0.78, "learning_rate": 1.0770752457870393e-05, "loss": 3.6319, "step": 211000 }, { "epoch": 0.79, "learning_rate": 1.0677792155637857e-05, "loss": 3.6318, "step": 211500 }, { "epoch": 0.79, "learning_rate": 1.0584831853405322e-05, "loss": 3.6345, "step": 212000 }, { "epoch": 0.79, "learning_rate": 1.0491871551172788e-05, "loss": 3.6346, "step": 212500 }, { "epoch": 0.79, "learning_rate": 1.0398911248940254e-05, "loss": 3.6304, "step": 213000 }, { "epoch": 0.79, "learning_rate": 1.0305950946707718e-05, "loss": 3.6273, "step": 213500 }, { "epoch": 0.8, "learning_rate": 1.0212990644475184e-05, "loss": 3.6346, "step": 214000 }, { "epoch": 0.8, "learning_rate": 1.012003034224265e-05, "loss": 3.6329, "step": 214500 }, { "epoch": 0.8, "learning_rate": 1.0027070040010113e-05, "loss": 3.6349, "step": 215000 }, { "epoch": 0.8, "learning_rate": 9.934109737777581e-06, "loss": 3.6369, "step": 215500 }, { "epoch": 0.8, "learning_rate": 9.841149435545045e-06, "loss": 3.6272, "step": 216000 }, { "epoch": 0.81, "learning_rate": 9.74818913331251e-06, "loss": 3.6283, "step": 216500 }, { "epoch": 0.81, "learning_rate": 9.655228831079976e-06, "loss": 3.6353, "step": 217000 }, { "epoch": 0.81, "learning_rate": 9.562268528847442e-06, "loss": 3.6251, "step": 217500 }, { "epoch": 0.81, "learning_rate": 9.469308226614906e-06, "loss": 3.6345, "step": 218000 }, { "epoch": 0.81, "learning_rate": 9.376347924382372e-06, "loss": 3.6256, "step": 218500 }, { "epoch": 0.81, "learning_rate": 9.283387622149838e-06, "loss": 3.6259, "step": 219000 }, { "epoch": 0.82, "learning_rate": 9.190427319917302e-06, "loss": 3.6288, "step": 219500 }, { "epoch": 0.82, "learning_rate": 9.09746701768477e-06, "loss": 3.6207, "step": 220000 }, { "epoch": 0.82, "eval_accuracy": 0.3857340442835604, "eval_loss": 3.5280263423919678, "eval_runtime": 5028.331, "eval_samples_per_second": 89.919, "eval_steps_per_second": 1.405, "step": 220000 }, { "epoch": 0.82, "learning_rate": 9.004506715452233e-06, "loss": 3.6311, "step": 220500 }, { "epoch": 0.82, "learning_rate": 8.911546413219699e-06, "loss": 3.6254, "step": 221000 }, { "epoch": 0.82, "learning_rate": 8.818586110987165e-06, "loss": 3.6307, "step": 221500 }, { "epoch": 0.83, "learning_rate": 8.72562580875463e-06, "loss": 3.629, "step": 222000 }, { "epoch": 0.83, "learning_rate": 8.632665506522095e-06, "loss": 3.6264, "step": 222500 }, { "epoch": 0.83, "learning_rate": 8.53970520428956e-06, "loss": 3.6215, "step": 223000 }, { "epoch": 0.83, "learning_rate": 8.446744902057026e-06, "loss": 3.6181, "step": 223500 }, { "epoch": 0.83, "learning_rate": 8.35378459982449e-06, "loss": 3.6253, "step": 224000 }, { "epoch": 0.83, "learning_rate": 8.260824297591958e-06, "loss": 3.6204, "step": 224500 }, { "epoch": 0.84, "learning_rate": 8.167863995359422e-06, "loss": 3.6264, "step": 225000 }, { "epoch": 0.84, "learning_rate": 8.074903693126887e-06, "loss": 3.6258, "step": 225500 }, { "epoch": 0.84, "learning_rate": 7.981943390894353e-06, "loss": 3.6279, "step": 226000 }, { "epoch": 0.84, "learning_rate": 7.888983088661817e-06, "loss": 3.6294, "step": 226500 }, { "epoch": 0.84, "learning_rate": 7.796022786429285e-06, "loss": 3.621, "step": 227000 }, { "epoch": 0.85, "learning_rate": 7.703062484196749e-06, "loss": 3.6315, "step": 227500 }, { "epoch": 0.85, "learning_rate": 7.610102181964214e-06, "loss": 3.6199, "step": 228000 }, { "epoch": 0.85, "learning_rate": 7.51714187973168e-06, "loss": 3.6182, "step": 228500 }, { "epoch": 0.85, "learning_rate": 7.424181577499145e-06, "loss": 3.6196, "step": 229000 }, { "epoch": 0.85, "learning_rate": 7.33122127526661e-06, "loss": 3.6216, "step": 229500 }, { "epoch": 0.86, "learning_rate": 7.2382609730340766e-06, "loss": 3.6288, "step": 230000 }, { "epoch": 0.86, "eval_accuracy": 0.386489052989915, "eval_loss": 3.5217700004577637, "eval_runtime": 5019.3728, "eval_samples_per_second": 90.08, "eval_steps_per_second": 1.408, "step": 230000 }, { "epoch": 0.86, "learning_rate": 7.1453006708015415e-06, "loss": 3.6201, "step": 230500 }, { "epoch": 0.86, "learning_rate": 7.052340368569006e-06, "loss": 3.6216, "step": 231000 }, { "epoch": 0.86, "learning_rate": 6.959380066336472e-06, "loss": 3.6214, "step": 231500 }, { "epoch": 0.86, "learning_rate": 6.866419764103937e-06, "loss": 3.6208, "step": 232000 }, { "epoch": 0.86, "learning_rate": 6.773459461871402e-06, "loss": 3.6116, "step": 232500 }, { "epoch": 0.87, "learning_rate": 6.6804991596388685e-06, "loss": 3.6187, "step": 233000 }, { "epoch": 0.87, "learning_rate": 6.587538857406333e-06, "loss": 3.6179, "step": 233500 }, { "epoch": 0.87, "learning_rate": 6.494578555173798e-06, "loss": 3.6112, "step": 234000 }, { "epoch": 0.87, "learning_rate": 6.401618252941265e-06, "loss": 3.6226, "step": 234500 }, { "epoch": 0.87, "learning_rate": 6.30865795070873e-06, "loss": 3.6148, "step": 235000 }, { "epoch": 0.88, "learning_rate": 6.2156976484761956e-06, "loss": 3.6107, "step": 235500 }, { "epoch": 0.88, "learning_rate": 6.1227373462436605e-06, "loss": 3.6129, "step": 236000 }, { "epoch": 0.88, "learning_rate": 6.029777044011125e-06, "loss": 3.6178, "step": 236500 }, { "epoch": 0.88, "learning_rate": 5.936816741778591e-06, "loss": 3.624, "step": 237000 }, { "epoch": 0.88, "learning_rate": 5.843856439546057e-06, "loss": 3.6232, "step": 237500 }, { "epoch": 0.88, "learning_rate": 5.750896137313522e-06, "loss": 3.6185, "step": 238000 }, { "epoch": 0.89, "learning_rate": 5.6579358350809875e-06, "loss": 3.6195, "step": 238500 }, { "epoch": 0.89, "learning_rate": 5.564975532848453e-06, "loss": 3.6236, "step": 239000 }, { "epoch": 0.89, "learning_rate": 5.472015230615918e-06, "loss": 3.6091, "step": 239500 }, { "epoch": 0.89, "learning_rate": 5.379054928383383e-06, "loss": 3.6176, "step": 240000 }, { "epoch": 0.89, "eval_accuracy": 0.3872213352261681, "eval_loss": 3.5150856971740723, "eval_runtime": 5028.7528, "eval_samples_per_second": 89.912, "eval_steps_per_second": 1.405, "step": 240000 }, { "epoch": 0.89, "learning_rate": 5.286094626150849e-06, "loss": 3.6167, "step": 240500 }, { "epoch": 0.9, "learning_rate": 5.193134323918314e-06, "loss": 3.62, "step": 241000 }, { "epoch": 0.9, "learning_rate": 5.1001740216857795e-06, "loss": 3.612, "step": 241500 }, { "epoch": 0.9, "learning_rate": 5.007213719453245e-06, "loss": 3.614, "step": 242000 }, { "epoch": 0.9, "learning_rate": 4.91425341722071e-06, "loss": 3.6181, "step": 242500 }, { "epoch": 0.9, "learning_rate": 4.821293114988176e-06, "loss": 3.6146, "step": 243000 }, { "epoch": 0.91, "learning_rate": 4.728332812755641e-06, "loss": 3.6145, "step": 243500 }, { "epoch": 0.91, "learning_rate": 4.6353725105231065e-06, "loss": 3.6156, "step": 244000 }, { "epoch": 0.91, "learning_rate": 4.542412208290571e-06, "loss": 3.6149, "step": 244500 }, { "epoch": 0.91, "learning_rate": 4.449451906058037e-06, "loss": 3.613, "step": 245000 }, { "epoch": 0.91, "learning_rate": 4.356491603825503e-06, "loss": 3.6158, "step": 245500 }, { "epoch": 0.91, "learning_rate": 4.263531301592968e-06, "loss": 3.6231, "step": 246000 }, { "epoch": 0.92, "learning_rate": 4.1705709993604336e-06, "loss": 3.6094, "step": 246500 }, { "epoch": 0.92, "learning_rate": 4.077610697127899e-06, "loss": 3.6135, "step": 247000 }, { "epoch": 0.92, "learning_rate": 3.984650394895364e-06, "loss": 3.6115, "step": 247500 }, { "epoch": 0.92, "learning_rate": 3.891690092662829e-06, "loss": 3.6093, "step": 248000 }, { "epoch": 0.92, "learning_rate": 3.798729790430295e-06, "loss": 3.6147, "step": 248500 }, { "epoch": 0.93, "learning_rate": 3.7057694881977598e-06, "loss": 3.6174, "step": 249000 }, { "epoch": 0.93, "learning_rate": 3.6128091859652255e-06, "loss": 3.6175, "step": 249500 }, { "epoch": 0.93, "learning_rate": 3.5198488837326913e-06, "loss": 3.6099, "step": 250000 }, { "epoch": 0.93, "eval_accuracy": 0.3877557812917544, "eval_loss": 3.510763168334961, "eval_runtime": 5030.6308, "eval_samples_per_second": 89.878, "eval_steps_per_second": 1.404, "step": 250000 }, { "epoch": 0.93, "learning_rate": 3.4268885815001566e-06, "loss": 3.6052, "step": 250500 }, { "epoch": 0.93, "learning_rate": 3.3339282792676215e-06, "loss": 3.6146, "step": 251000 }, { "epoch": 0.94, "learning_rate": 3.2409679770350872e-06, "loss": 3.6118, "step": 251500 }, { "epoch": 0.94, "learning_rate": 3.1480076748025526e-06, "loss": 3.6099, "step": 252000 }, { "epoch": 0.94, "learning_rate": 3.055047372570018e-06, "loss": 3.6089, "step": 252500 }, { "epoch": 0.94, "learning_rate": 2.9620870703374832e-06, "loss": 3.6042, "step": 253000 }, { "epoch": 0.94, "learning_rate": 2.8691267681049485e-06, "loss": 3.6121, "step": 253500 }, { "epoch": 0.94, "learning_rate": 2.7761664658724143e-06, "loss": 3.6041, "step": 254000 }, { "epoch": 0.95, "learning_rate": 2.683206163639879e-06, "loss": 3.6135, "step": 254500 }, { "epoch": 0.95, "learning_rate": 2.5902458614073445e-06, "loss": 3.6043, "step": 255000 }, { "epoch": 0.95, "learning_rate": 2.4972855591748103e-06, "loss": 3.6029, "step": 255500 }, { "epoch": 0.95, "learning_rate": 2.4043252569422756e-06, "loss": 3.6112, "step": 256000 }, { "epoch": 0.95, "learning_rate": 2.311364954709741e-06, "loss": 3.6021, "step": 256500 }, { "epoch": 0.96, "learning_rate": 2.2184046524772062e-06, "loss": 3.6078, "step": 257000 }, { "epoch": 0.96, "learning_rate": 2.1254443502446716e-06, "loss": 3.6037, "step": 257500 }, { "epoch": 0.96, "learning_rate": 2.032484048012137e-06, "loss": 3.6054, "step": 258000 }, { "epoch": 0.96, "learning_rate": 1.9395237457796022e-06, "loss": 3.6063, "step": 258500 }, { "epoch": 0.96, "learning_rate": 1.8465634435470675e-06, "loss": 3.6049, "step": 259000 }, { "epoch": 0.96, "learning_rate": 1.753603141314533e-06, "loss": 3.6056, "step": 259500 }, { "epoch": 0.97, "learning_rate": 1.6606428390819984e-06, "loss": 3.6093, "step": 260000 }, { "epoch": 0.97, "eval_accuracy": 0.3880700733260521, "eval_loss": 3.507866144180298, "eval_runtime": 5028.8015, "eval_samples_per_second": 89.911, "eval_steps_per_second": 1.405, "step": 260000 }, { "epoch": 0.97, "learning_rate": 1.567682536849464e-06, "loss": 3.6062, "step": 260500 }, { "epoch": 0.97, "learning_rate": 1.4747222346169293e-06, "loss": 3.6027, "step": 261000 }, { "epoch": 0.97, "learning_rate": 1.3817619323843946e-06, "loss": 3.6005, "step": 261500 }, { "epoch": 0.97, "learning_rate": 1.28880163015186e-06, "loss": 3.6119, "step": 262000 }, { "epoch": 0.98, "learning_rate": 1.1958413279193255e-06, "loss": 3.6036, "step": 262500 }, { "epoch": 0.98, "learning_rate": 1.1028810256867908e-06, "loss": 3.6089, "step": 263000 }, { "epoch": 0.98, "learning_rate": 1.009920723454256e-06, "loss": 3.6052, "step": 263500 }, { "epoch": 0.98, "learning_rate": 9.169604212217216e-07, "loss": 3.6063, "step": 264000 }, { "epoch": 0.98, "learning_rate": 8.240001189891869e-07, "loss": 3.6115, "step": 264500 }, { "epoch": 0.99, "learning_rate": 7.310398167566523e-07, "loss": 3.6035, "step": 265000 }, { "epoch": 0.99, "learning_rate": 6.380795145241177e-07, "loss": 3.5975, "step": 265500 }, { "epoch": 0.99, "learning_rate": 5.45119212291583e-07, "loss": 3.6047, "step": 266000 }, { "epoch": 0.99, "learning_rate": 4.521589100590484e-07, "loss": 3.6039, "step": 266500 }, { "epoch": 0.99, "learning_rate": 3.5919860782651375e-07, "loss": 3.6111, "step": 267000 }, { "epoch": 0.99, "learning_rate": 2.6623830559397913e-07, "loss": 3.6062, "step": 267500 }, { "epoch": 1.0, "learning_rate": 1.7327800336144456e-07, "loss": 3.6008, "step": 268000 }, { "epoch": 1.0, "learning_rate": 8.031770112890991e-08, "loss": 3.6071, "step": 268500 }, { "epoch": 1.0, "step": 268932, "total_flos": 4.497266479988736e+18, "train_loss": 3.8604874901614594, "train_runtime": 258151.997, "train_samples_per_second": 33.336, "train_steps_per_second": 1.042 } ], "logging_steps": 500, "max_steps": 268932, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "total_flos": 4.497266479988736e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }