{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 10000, "global_step": 263566, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.990514709788061e-05, "loss": 7.0943, "step": 500 }, { "epoch": 0.0, "learning_rate": 4.981029419576122e-05, "loss": 6.2317, "step": 1000 }, { "epoch": 0.01, "learning_rate": 4.971544129364182e-05, "loss": 5.9001, "step": 1500 }, { "epoch": 0.01, "learning_rate": 4.9620588391522424e-05, "loss": 5.6651, "step": 2000 }, { "epoch": 0.01, "learning_rate": 4.9525735489403035e-05, "loss": 5.4754, "step": 2500 }, { "epoch": 0.01, "learning_rate": 4.9430882587283645e-05, "loss": 5.3245, "step": 3000 }, { "epoch": 0.01, "learning_rate": 4.933602968516425e-05, "loss": 5.1815, "step": 3500 }, { "epoch": 0.02, "learning_rate": 4.924117678304486e-05, "loss": 5.0484, "step": 4000 }, { "epoch": 0.02, "learning_rate": 4.914632388092546e-05, "loss": 4.9349, "step": 4500 }, { "epoch": 0.02, "learning_rate": 4.905147097880607e-05, "loss": 4.8371, "step": 5000 }, { "epoch": 0.02, "learning_rate": 4.895661807668668e-05, "loss": 4.7574, "step": 5500 }, { "epoch": 0.02, "learning_rate": 4.886176517456728e-05, "loss": 4.6903, "step": 6000 }, { "epoch": 0.02, "learning_rate": 4.876691227244789e-05, "loss": 4.626, "step": 6500 }, { "epoch": 0.03, "learning_rate": 4.8672059370328496e-05, "loss": 4.5767, "step": 7000 }, { "epoch": 0.03, "learning_rate": 4.85772064682091e-05, "loss": 4.5388, "step": 7500 }, { "epoch": 0.03, "learning_rate": 4.848235356608971e-05, "loss": 4.4959, "step": 8000 }, { "epoch": 0.03, "learning_rate": 4.838750066397032e-05, "loss": 4.4589, "step": 8500 }, { "epoch": 0.03, "learning_rate": 4.8292647761850924e-05, "loss": 4.4324, "step": 9000 }, { "epoch": 0.04, "learning_rate": 4.8197794859731535e-05, "loss": 4.3988, "step": 9500 }, { "epoch": 0.04, "learning_rate": 4.810294195761214e-05, "loss": 4.375, "step": 10000 }, { "epoch": 0.04, "eval_accuracy": 0.31109238678227563, "eval_loss": 4.28147029876709, "eval_runtime": 5060.877, "eval_samples_per_second": 87.687, "eval_steps_per_second": 1.37, "step": 10000 }, { "epoch": 0.04, "learning_rate": 4.800808905549274e-05, "loss": 4.3423, "step": 10500 }, { "epoch": 0.04, "learning_rate": 4.791323615337335e-05, "loss": 4.3204, "step": 11000 }, { "epoch": 0.04, "learning_rate": 4.7818383251253956e-05, "loss": 4.2991, "step": 11500 }, { "epoch": 0.05, "learning_rate": 4.772353034913457e-05, "loss": 4.2818, "step": 12000 }, { "epoch": 0.05, "learning_rate": 4.762867744701517e-05, "loss": 4.2627, "step": 12500 }, { "epoch": 0.05, "learning_rate": 4.7533824544895774e-05, "loss": 4.2463, "step": 13000 }, { "epoch": 0.05, "learning_rate": 4.7438971642776385e-05, "loss": 4.2265, "step": 13500 }, { "epoch": 0.05, "learning_rate": 4.734411874065699e-05, "loss": 4.2159, "step": 14000 }, { "epoch": 0.06, "learning_rate": 4.72492658385376e-05, "loss": 4.198, "step": 14500 }, { "epoch": 0.06, "learning_rate": 4.715441293641821e-05, "loss": 4.1886, "step": 15000 }, { "epoch": 0.06, "learning_rate": 4.7059560034298813e-05, "loss": 4.1675, "step": 15500 }, { "epoch": 0.06, "learning_rate": 4.696470713217942e-05, "loss": 4.1553, "step": 16000 }, { "epoch": 0.06, "learning_rate": 4.686985423006002e-05, "loss": 4.1466, "step": 16500 }, { "epoch": 0.06, "learning_rate": 4.677500132794063e-05, "loss": 4.1307, "step": 17000 }, { "epoch": 0.07, "learning_rate": 4.668014842582124e-05, "loss": 4.1282, "step": 17500 }, { "epoch": 0.07, "learning_rate": 4.6585295523701846e-05, "loss": 4.1108, "step": 18000 }, { "epoch": 0.07, "learning_rate": 4.6490442621582456e-05, "loss": 4.1135, "step": 18500 }, { "epoch": 0.07, "learning_rate": 4.639558971946305e-05, "loss": 4.0876, "step": 19000 }, { "epoch": 0.07, "learning_rate": 4.6300736817343664e-05, "loss": 4.0871, "step": 19500 }, { "epoch": 0.08, "learning_rate": 4.6205883915224274e-05, "loss": 4.0754, "step": 20000 }, { "epoch": 0.08, "eval_accuracy": 0.3340521142462603, "eval_loss": 3.9983978271484375, "eval_runtime": 5045.5203, "eval_samples_per_second": 87.954, "eval_steps_per_second": 1.374, "step": 20000 }, { "epoch": 0.08, "learning_rate": 4.611103101310488e-05, "loss": 4.0747, "step": 20500 }, { "epoch": 0.08, "learning_rate": 4.601617811098549e-05, "loss": 4.0582, "step": 21000 }, { "epoch": 0.08, "learning_rate": 4.592132520886609e-05, "loss": 4.0545, "step": 21500 }, { "epoch": 0.08, "learning_rate": 4.5826472306746696e-05, "loss": 4.0345, "step": 22000 }, { "epoch": 0.09, "learning_rate": 4.5731619404627307e-05, "loss": 4.0376, "step": 22500 }, { "epoch": 0.09, "learning_rate": 4.563676650250791e-05, "loss": 4.0315, "step": 23000 }, { "epoch": 0.09, "learning_rate": 4.554191360038852e-05, "loss": 4.0217, "step": 23500 }, { "epoch": 0.09, "learning_rate": 4.544706069826913e-05, "loss": 4.0135, "step": 24000 }, { "epoch": 0.09, "learning_rate": 4.535220779614973e-05, "loss": 4.0034, "step": 24500 }, { "epoch": 0.09, "learning_rate": 4.525735489403034e-05, "loss": 4.004, "step": 25000 }, { "epoch": 0.1, "learning_rate": 4.516250199191095e-05, "loss": 3.9939, "step": 25500 }, { "epoch": 0.1, "learning_rate": 4.506764908979155e-05, "loss": 3.9871, "step": 26000 }, { "epoch": 0.1, "learning_rate": 4.4972796187672164e-05, "loss": 3.9787, "step": 26500 }, { "epoch": 0.1, "learning_rate": 4.487794328555277e-05, "loss": 3.9752, "step": 27000 }, { "epoch": 0.1, "learning_rate": 4.478309038343337e-05, "loss": 3.9605, "step": 27500 }, { "epoch": 0.11, "learning_rate": 4.468823748131398e-05, "loss": 3.9542, "step": 28000 }, { "epoch": 0.11, "learning_rate": 4.4593384579194585e-05, "loss": 3.9518, "step": 28500 }, { "epoch": 0.11, "learning_rate": 4.4498531677075196e-05, "loss": 3.9479, "step": 29000 }, { "epoch": 0.11, "learning_rate": 4.44036787749558e-05, "loss": 3.9445, "step": 29500 }, { "epoch": 0.11, "learning_rate": 4.430882587283641e-05, "loss": 3.9409, "step": 30000 }, { "epoch": 0.11, "eval_accuracy": 0.3456931371386037, "eval_loss": 3.86149525642395, "eval_runtime": 5041.4629, "eval_samples_per_second": 88.025, "eval_steps_per_second": 1.375, "step": 30000 }, { "epoch": 0.12, "learning_rate": 4.4213972970717014e-05, "loss": 3.9345, "step": 30500 }, { "epoch": 0.12, "learning_rate": 4.411912006859762e-05, "loss": 3.9328, "step": 31000 }, { "epoch": 0.12, "learning_rate": 4.402426716647823e-05, "loss": 3.9248, "step": 31500 }, { "epoch": 0.12, "learning_rate": 4.392941426435884e-05, "loss": 3.9126, "step": 32000 }, { "epoch": 0.12, "learning_rate": 4.383456136223944e-05, "loss": 3.9077, "step": 32500 }, { "epoch": 0.13, "learning_rate": 4.3739708460120046e-05, "loss": 3.9102, "step": 33000 }, { "epoch": 0.13, "learning_rate": 4.364485555800065e-05, "loss": 3.907, "step": 33500 }, { "epoch": 0.13, "learning_rate": 4.355000265588126e-05, "loss": 3.9017, "step": 34000 }, { "epoch": 0.13, "learning_rate": 4.345514975376187e-05, "loss": 3.8976, "step": 34500 }, { "epoch": 0.13, "learning_rate": 4.3360296851642475e-05, "loss": 3.8917, "step": 35000 }, { "epoch": 0.13, "learning_rate": 4.3265443949523085e-05, "loss": 3.8917, "step": 35500 }, { "epoch": 0.14, "learning_rate": 4.317059104740369e-05, "loss": 3.8851, "step": 36000 }, { "epoch": 0.14, "learning_rate": 4.307573814528429e-05, "loss": 3.8899, "step": 36500 }, { "epoch": 0.14, "learning_rate": 4.29808852431649e-05, "loss": 3.8667, "step": 37000 }, { "epoch": 0.14, "learning_rate": 4.288603234104551e-05, "loss": 3.8702, "step": 37500 }, { "epoch": 0.14, "learning_rate": 4.279117943892612e-05, "loss": 3.8705, "step": 38000 }, { "epoch": 0.15, "learning_rate": 4.269632653680672e-05, "loss": 3.8647, "step": 38500 }, { "epoch": 0.15, "learning_rate": 4.2601473634687325e-05, "loss": 3.8612, "step": 39000 }, { "epoch": 0.15, "learning_rate": 4.2506620732567936e-05, "loss": 3.8524, "step": 39500 }, { "epoch": 0.15, "learning_rate": 4.241176783044854e-05, "loss": 3.8554, "step": 40000 }, { "epoch": 0.15, "eval_accuracy": 0.3530753183612612, "eval_loss": 3.7798092365264893, "eval_runtime": 5037.6086, "eval_samples_per_second": 88.092, "eval_steps_per_second": 1.376, "step": 40000 }, { "epoch": 0.15, "learning_rate": 4.231691492832915e-05, "loss": 3.8487, "step": 40500 }, { "epoch": 0.16, "learning_rate": 4.222206202620976e-05, "loss": 3.851, "step": 41000 }, { "epoch": 0.16, "learning_rate": 4.2127209124090364e-05, "loss": 3.8431, "step": 41500 }, { "epoch": 0.16, "learning_rate": 4.203235622197097e-05, "loss": 3.8401, "step": 42000 }, { "epoch": 0.16, "learning_rate": 4.193750331985158e-05, "loss": 3.8296, "step": 42500 }, { "epoch": 0.16, "learning_rate": 4.184265041773218e-05, "loss": 3.8338, "step": 43000 }, { "epoch": 0.17, "learning_rate": 4.174779751561279e-05, "loss": 3.8295, "step": 43500 }, { "epoch": 0.17, "learning_rate": 4.1652944613493396e-05, "loss": 3.8285, "step": 44000 }, { "epoch": 0.17, "learning_rate": 4.1558091711374e-05, "loss": 3.8217, "step": 44500 }, { "epoch": 0.17, "learning_rate": 4.146323880925461e-05, "loss": 3.8262, "step": 45000 }, { "epoch": 0.17, "learning_rate": 4.1368385907135214e-05, "loss": 3.8241, "step": 45500 }, { "epoch": 0.17, "learning_rate": 4.1273533005015825e-05, "loss": 3.8145, "step": 46000 }, { "epoch": 0.18, "learning_rate": 4.117868010289643e-05, "loss": 3.8207, "step": 46500 }, { "epoch": 0.18, "learning_rate": 4.108382720077704e-05, "loss": 3.8128, "step": 47000 }, { "epoch": 0.18, "learning_rate": 4.098897429865764e-05, "loss": 3.8053, "step": 47500 }, { "epoch": 0.18, "learning_rate": 4.089412139653825e-05, "loss": 3.8023, "step": 48000 }, { "epoch": 0.18, "learning_rate": 4.079926849441886e-05, "loss": 3.8084, "step": 48500 }, { "epoch": 0.19, "learning_rate": 4.070441559229947e-05, "loss": 3.7967, "step": 49000 }, { "epoch": 0.19, "learning_rate": 4.060956269018007e-05, "loss": 3.7947, "step": 49500 }, { "epoch": 0.19, "learning_rate": 4.0514709788060675e-05, "loss": 3.7973, "step": 50000 }, { "epoch": 0.19, "eval_accuracy": 0.35837528569747157, "eval_loss": 3.7209770679473877, "eval_runtime": 5052.5501, "eval_samples_per_second": 87.831, "eval_steps_per_second": 1.372, "step": 50000 }, { "epoch": 0.19, "learning_rate": 4.041985688594128e-05, "loss": 3.7861, "step": 50500 }, { "epoch": 0.19, "learning_rate": 4.032500398382189e-05, "loss": 3.7878, "step": 51000 }, { "epoch": 0.2, "learning_rate": 4.02301510817025e-05, "loss": 3.7839, "step": 51500 }, { "epoch": 0.2, "learning_rate": 4.0135298179583104e-05, "loss": 3.7898, "step": 52000 }, { "epoch": 0.2, "learning_rate": 4.0040445277463714e-05, "loss": 3.7808, "step": 52500 }, { "epoch": 0.2, "learning_rate": 3.994559237534432e-05, "loss": 3.7857, "step": 53000 }, { "epoch": 0.2, "learning_rate": 3.985073947322492e-05, "loss": 3.7754, "step": 53500 }, { "epoch": 0.2, "learning_rate": 3.975588657110553e-05, "loss": 3.769, "step": 54000 }, { "epoch": 0.21, "learning_rate": 3.9661033668986136e-05, "loss": 3.7723, "step": 54500 }, { "epoch": 0.21, "learning_rate": 3.9566180766866747e-05, "loss": 3.7719, "step": 55000 }, { "epoch": 0.21, "learning_rate": 3.947132786474736e-05, "loss": 3.7684, "step": 55500 }, { "epoch": 0.21, "learning_rate": 3.9376474962627954e-05, "loss": 3.7672, "step": 56000 }, { "epoch": 0.21, "learning_rate": 3.9281622060508565e-05, "loss": 3.7595, "step": 56500 }, { "epoch": 0.22, "learning_rate": 3.918676915838917e-05, "loss": 3.764, "step": 57000 }, { "epoch": 0.22, "learning_rate": 3.909191625626978e-05, "loss": 3.7584, "step": 57500 }, { "epoch": 0.22, "learning_rate": 3.899706335415039e-05, "loss": 3.7532, "step": 58000 }, { "epoch": 0.22, "learning_rate": 3.890221045203099e-05, "loss": 3.7476, "step": 58500 }, { "epoch": 0.22, "learning_rate": 3.88073575499116e-05, "loss": 3.7502, "step": 59000 }, { "epoch": 0.23, "learning_rate": 3.871250464779221e-05, "loss": 3.7584, "step": 59500 }, { "epoch": 0.23, "learning_rate": 3.861765174567281e-05, "loss": 3.7421, "step": 60000 }, { "epoch": 0.23, "eval_accuracy": 0.3629990378932714, "eval_loss": 3.6750495433807373, "eval_runtime": 5050.4176, "eval_samples_per_second": 87.869, "eval_steps_per_second": 1.373, "step": 60000 }, { "epoch": 0.23, "learning_rate": 3.852279884355342e-05, "loss": 3.7503, "step": 60500 }, { "epoch": 0.23, "learning_rate": 3.8427945941434025e-05, "loss": 3.7485, "step": 61000 }, { "epoch": 0.23, "learning_rate": 3.8333093039314636e-05, "loss": 3.745, "step": 61500 }, { "epoch": 0.24, "learning_rate": 3.823824013719524e-05, "loss": 3.739, "step": 62000 }, { "epoch": 0.24, "learning_rate": 3.8143387235075843e-05, "loss": 3.7411, "step": 62500 }, { "epoch": 0.24, "learning_rate": 3.8048534332956454e-05, "loss": 3.739, "step": 63000 }, { "epoch": 0.24, "learning_rate": 3.795368143083706e-05, "loss": 3.7388, "step": 63500 }, { "epoch": 0.24, "learning_rate": 3.785882852871767e-05, "loss": 3.7339, "step": 64000 }, { "epoch": 0.24, "learning_rate": 3.776397562659827e-05, "loss": 3.7328, "step": 64500 }, { "epoch": 0.25, "learning_rate": 3.7669122724478876e-05, "loss": 3.7209, "step": 65000 }, { "epoch": 0.25, "learning_rate": 3.7574269822359486e-05, "loss": 3.727, "step": 65500 }, { "epoch": 0.25, "learning_rate": 3.74794169202401e-05, "loss": 3.7251, "step": 66000 }, { "epoch": 0.25, "learning_rate": 3.73845640181207e-05, "loss": 3.7199, "step": 66500 }, { "epoch": 0.25, "learning_rate": 3.728971111600131e-05, "loss": 3.7277, "step": 67000 }, { "epoch": 0.26, "learning_rate": 3.719485821388191e-05, "loss": 3.7232, "step": 67500 }, { "epoch": 0.26, "learning_rate": 3.710000531176252e-05, "loss": 3.7154, "step": 68000 }, { "epoch": 0.26, "learning_rate": 3.700515240964313e-05, "loss": 3.7159, "step": 68500 }, { "epoch": 0.26, "learning_rate": 3.691029950752373e-05, "loss": 3.711, "step": 69000 }, { "epoch": 0.26, "learning_rate": 3.681544660540434e-05, "loss": 3.7094, "step": 69500 }, { "epoch": 0.27, "learning_rate": 3.672059370328495e-05, "loss": 3.7097, "step": 70000 }, { "epoch": 0.27, "eval_accuracy": 0.36635205507688484, "eval_loss": 3.6377646923065186, "eval_runtime": 5044.3649, "eval_samples_per_second": 87.974, "eval_steps_per_second": 1.375, "step": 70000 }, { "epoch": 0.27, "learning_rate": 3.662574080116555e-05, "loss": 3.7109, "step": 70500 }, { "epoch": 0.27, "learning_rate": 3.653088789904616e-05, "loss": 3.7069, "step": 71000 }, { "epoch": 0.27, "learning_rate": 3.6436034996926765e-05, "loss": 3.7025, "step": 71500 }, { "epoch": 0.27, "learning_rate": 3.6341182094807376e-05, "loss": 3.7032, "step": 72000 }, { "epoch": 0.28, "learning_rate": 3.6246329192687986e-05, "loss": 3.7005, "step": 72500 }, { "epoch": 0.28, "learning_rate": 3.615147629056859e-05, "loss": 3.704, "step": 73000 }, { "epoch": 0.28, "learning_rate": 3.6056623388449194e-05, "loss": 3.6903, "step": 73500 }, { "epoch": 0.28, "learning_rate": 3.59617704863298e-05, "loss": 3.6989, "step": 74000 }, { "epoch": 0.28, "learning_rate": 3.586691758421041e-05, "loss": 3.6947, "step": 74500 }, { "epoch": 0.28, "learning_rate": 3.577206468209102e-05, "loss": 3.6969, "step": 75000 }, { "epoch": 0.29, "learning_rate": 3.567721177997162e-05, "loss": 3.696, "step": 75500 }, { "epoch": 0.29, "learning_rate": 3.5582358877852226e-05, "loss": 3.6905, "step": 76000 }, { "epoch": 0.29, "learning_rate": 3.5487505975732836e-05, "loss": 3.6851, "step": 76500 }, { "epoch": 0.29, "learning_rate": 3.539265307361344e-05, "loss": 3.6857, "step": 77000 }, { "epoch": 0.29, "learning_rate": 3.529780017149405e-05, "loss": 3.6889, "step": 77500 }, { "epoch": 0.3, "learning_rate": 3.5202947269374654e-05, "loss": 3.6911, "step": 78000 }, { "epoch": 0.3, "learning_rate": 3.5108094367255265e-05, "loss": 3.687, "step": 78500 }, { "epoch": 0.3, "learning_rate": 3.501324146513587e-05, "loss": 3.6808, "step": 79000 }, { "epoch": 0.3, "learning_rate": 3.491838856301647e-05, "loss": 3.6841, "step": 79500 }, { "epoch": 0.3, "learning_rate": 3.482353566089708e-05, "loss": 3.6741, "step": 80000 }, { "epoch": 0.3, "eval_accuracy": 0.36940481219098525, "eval_loss": 3.606105089187622, "eval_runtime": 5043.6804, "eval_samples_per_second": 87.986, "eval_steps_per_second": 1.375, "step": 80000 }, { "epoch": 0.31, "learning_rate": 3.4728682758777694e-05, "loss": 3.6789, "step": 80500 }, { "epoch": 0.31, "learning_rate": 3.46338298566583e-05, "loss": 3.6809, "step": 81000 }, { "epoch": 0.31, "learning_rate": 3.45389769545389e-05, "loss": 3.6854, "step": 81500 }, { "epoch": 0.31, "learning_rate": 3.4444124052419505e-05, "loss": 3.6692, "step": 82000 }, { "epoch": 0.31, "learning_rate": 3.4349271150300115e-05, "loss": 3.6771, "step": 82500 }, { "epoch": 0.31, "learning_rate": 3.4254418248180726e-05, "loss": 3.6773, "step": 83000 }, { "epoch": 0.32, "learning_rate": 3.415956534606133e-05, "loss": 3.6701, "step": 83500 }, { "epoch": 0.32, "learning_rate": 3.406471244394194e-05, "loss": 3.666, "step": 84000 }, { "epoch": 0.32, "learning_rate": 3.3969859541822544e-05, "loss": 3.672, "step": 84500 }, { "epoch": 0.32, "learning_rate": 3.387500663970315e-05, "loss": 3.6738, "step": 85000 }, { "epoch": 0.32, "learning_rate": 3.378015373758376e-05, "loss": 3.6705, "step": 85500 }, { "epoch": 0.33, "learning_rate": 3.368530083546436e-05, "loss": 3.6649, "step": 86000 }, { "epoch": 0.33, "learning_rate": 3.359044793334497e-05, "loss": 3.6736, "step": 86500 }, { "epoch": 0.33, "learning_rate": 3.3495595031225576e-05, "loss": 3.6689, "step": 87000 }, { "epoch": 0.33, "learning_rate": 3.340074212910618e-05, "loss": 3.6665, "step": 87500 }, { "epoch": 0.33, "learning_rate": 3.330588922698679e-05, "loss": 3.6641, "step": 88000 }, { "epoch": 0.34, "learning_rate": 3.3211036324867394e-05, "loss": 3.6536, "step": 88500 }, { "epoch": 0.34, "learning_rate": 3.3116183422748005e-05, "loss": 3.6658, "step": 89000 }, { "epoch": 0.34, "learning_rate": 3.3021330520628615e-05, "loss": 3.6544, "step": 89500 }, { "epoch": 0.34, "learning_rate": 3.292647761850922e-05, "loss": 3.6599, "step": 90000 }, { "epoch": 0.34, "eval_accuracy": 0.37180447854264453, "eval_loss": 3.5803401470184326, "eval_runtime": 5042.0712, "eval_samples_per_second": 88.014, "eval_steps_per_second": 1.375, "step": 90000 }, { "epoch": 0.34, "learning_rate": 3.283162471638982e-05, "loss": 3.6605, "step": 90500 }, { "epoch": 0.35, "learning_rate": 3.2736771814270426e-05, "loss": 3.6603, "step": 91000 }, { "epoch": 0.35, "learning_rate": 3.264191891215104e-05, "loss": 3.6576, "step": 91500 }, { "epoch": 0.35, "learning_rate": 3.254706601003165e-05, "loss": 3.6511, "step": 92000 }, { "epoch": 0.35, "learning_rate": 3.245221310791225e-05, "loss": 3.6518, "step": 92500 }, { "epoch": 0.35, "learning_rate": 3.2357360205792855e-05, "loss": 3.6522, "step": 93000 }, { "epoch": 0.35, "learning_rate": 3.2262507303673465e-05, "loss": 3.646, "step": 93500 }, { "epoch": 0.36, "learning_rate": 3.216765440155407e-05, "loss": 3.6494, "step": 94000 }, { "epoch": 0.36, "learning_rate": 3.207280149943468e-05, "loss": 3.6388, "step": 94500 }, { "epoch": 0.36, "learning_rate": 3.1977948597315284e-05, "loss": 3.6456, "step": 95000 }, { "epoch": 0.36, "learning_rate": 3.1883095695195894e-05, "loss": 3.6398, "step": 95500 }, { "epoch": 0.36, "learning_rate": 3.17882427930765e-05, "loss": 3.6476, "step": 96000 }, { "epoch": 0.37, "learning_rate": 3.16933898909571e-05, "loss": 3.6364, "step": 96500 }, { "epoch": 0.37, "learning_rate": 3.159853698883771e-05, "loss": 3.6456, "step": 97000 }, { "epoch": 0.37, "learning_rate": 3.150368408671832e-05, "loss": 3.645, "step": 97500 }, { "epoch": 0.37, "learning_rate": 3.1408831184598926e-05, "loss": 3.6357, "step": 98000 }, { "epoch": 0.37, "learning_rate": 3.131397828247954e-05, "loss": 3.6415, "step": 98500 }, { "epoch": 0.38, "learning_rate": 3.1219125380360134e-05, "loss": 3.6314, "step": 99000 }, { "epoch": 0.38, "learning_rate": 3.1124272478240744e-05, "loss": 3.64, "step": 99500 }, { "epoch": 0.38, "learning_rate": 3.1029419576121355e-05, "loss": 3.6356, "step": 100000 }, { "epoch": 0.38, "eval_accuracy": 0.37408231347678594, "eval_loss": 3.558403253555298, "eval_runtime": 5044.122, "eval_samples_per_second": 87.978, "eval_steps_per_second": 1.375, "step": 100000 }, { "epoch": 0.38, "learning_rate": 3.093456667400196e-05, "loss": 3.637, "step": 100500 }, { "epoch": 0.38, "learning_rate": 3.083971377188257e-05, "loss": 3.6353, "step": 101000 }, { "epoch": 0.39, "learning_rate": 3.074486086976317e-05, "loss": 3.6331, "step": 101500 }, { "epoch": 0.39, "learning_rate": 3.065000796764378e-05, "loss": 3.6288, "step": 102000 }, { "epoch": 0.39, "learning_rate": 3.055515506552439e-05, "loss": 3.6273, "step": 102500 }, { "epoch": 0.39, "learning_rate": 3.046030216340499e-05, "loss": 3.6351, "step": 103000 }, { "epoch": 0.39, "learning_rate": 3.03654492612856e-05, "loss": 3.6285, "step": 103500 }, { "epoch": 0.39, "learning_rate": 3.027059635916621e-05, "loss": 3.6256, "step": 104000 }, { "epoch": 0.4, "learning_rate": 3.0175743457046812e-05, "loss": 3.6248, "step": 104500 }, { "epoch": 0.4, "learning_rate": 3.0080890554927423e-05, "loss": 3.6182, "step": 105000 }, { "epoch": 0.4, "learning_rate": 2.9986037652808023e-05, "loss": 3.6242, "step": 105500 }, { "epoch": 0.4, "learning_rate": 2.9891184750688634e-05, "loss": 3.625, "step": 106000 }, { "epoch": 0.4, "learning_rate": 2.979633184856924e-05, "loss": 3.6191, "step": 106500 }, { "epoch": 0.41, "learning_rate": 2.9701478946449845e-05, "loss": 3.6267, "step": 107000 }, { "epoch": 0.41, "learning_rate": 2.9606626044330455e-05, "loss": 3.6227, "step": 107500 }, { "epoch": 0.41, "learning_rate": 2.9511773142211062e-05, "loss": 3.6217, "step": 108000 }, { "epoch": 0.41, "learning_rate": 2.9416920240091666e-05, "loss": 3.6168, "step": 108500 }, { "epoch": 0.41, "learning_rate": 2.9322067337972277e-05, "loss": 3.6204, "step": 109000 }, { "epoch": 0.42, "learning_rate": 2.922721443585288e-05, "loss": 3.6217, "step": 109500 }, { "epoch": 0.42, "learning_rate": 2.9132361533733487e-05, "loss": 3.6131, "step": 110000 }, { "epoch": 0.42, "eval_accuracy": 0.37584134336520747, "eval_loss": 3.542346715927124, "eval_runtime": 5051.4787, "eval_samples_per_second": 87.85, "eval_steps_per_second": 1.373, "step": 110000 }, { "epoch": 0.42, "learning_rate": 2.9037508631614098e-05, "loss": 3.6165, "step": 110500 }, { "epoch": 0.42, "learning_rate": 2.8942655729494698e-05, "loss": 3.6129, "step": 111000 }, { "epoch": 0.42, "learning_rate": 2.884780282737531e-05, "loss": 3.6177, "step": 111500 }, { "epoch": 0.42, "learning_rate": 2.8752949925255913e-05, "loss": 3.6187, "step": 112000 }, { "epoch": 0.43, "learning_rate": 2.865809702313652e-05, "loss": 3.6112, "step": 112500 }, { "epoch": 0.43, "learning_rate": 2.856324412101713e-05, "loss": 3.6103, "step": 113000 }, { "epoch": 0.43, "learning_rate": 2.8468391218897734e-05, "loss": 3.6103, "step": 113500 }, { "epoch": 0.43, "learning_rate": 2.837353831677834e-05, "loss": 3.615, "step": 114000 }, { "epoch": 0.43, "learning_rate": 2.827868541465895e-05, "loss": 3.6151, "step": 114500 }, { "epoch": 0.44, "learning_rate": 2.8183832512539555e-05, "loss": 3.6039, "step": 115000 }, { "epoch": 0.44, "learning_rate": 2.8088979610420162e-05, "loss": 3.6133, "step": 115500 }, { "epoch": 0.44, "learning_rate": 2.7994126708300766e-05, "loss": 3.6063, "step": 116000 }, { "epoch": 0.44, "learning_rate": 2.7899273806181377e-05, "loss": 3.6029, "step": 116500 }, { "epoch": 0.44, "learning_rate": 2.7804420904061984e-05, "loss": 3.6082, "step": 117000 }, { "epoch": 0.45, "learning_rate": 2.7709568001942588e-05, "loss": 3.6099, "step": 117500 }, { "epoch": 0.45, "learning_rate": 2.7614715099823195e-05, "loss": 3.6035, "step": 118000 }, { "epoch": 0.45, "learning_rate": 2.75198621977038e-05, "loss": 3.5997, "step": 118500 }, { "epoch": 0.45, "learning_rate": 2.742500929558441e-05, "loss": 3.6032, "step": 119000 }, { "epoch": 0.45, "learning_rate": 2.7330156393465016e-05, "loss": 3.5998, "step": 119500 }, { "epoch": 0.46, "learning_rate": 2.723530349134562e-05, "loss": 3.5991, "step": 120000 }, { "epoch": 0.46, "eval_accuracy": 0.3775510164297428, "eval_loss": 3.525380849838257, "eval_runtime": 5041.3032, "eval_samples_per_second": 88.027, "eval_steps_per_second": 1.375, "step": 120000 }, { "epoch": 0.46, "learning_rate": 2.714045058922623e-05, "loss": 3.6027, "step": 120500 }, { "epoch": 0.46, "learning_rate": 2.7045597687106838e-05, "loss": 3.5947, "step": 121000 }, { "epoch": 0.46, "learning_rate": 2.695074478498744e-05, "loss": 3.6033, "step": 121500 }, { "epoch": 0.46, "learning_rate": 2.6855891882868052e-05, "loss": 3.5933, "step": 122000 }, { "epoch": 0.46, "learning_rate": 2.6761038980748652e-05, "loss": 3.594, "step": 122500 }, { "epoch": 0.47, "learning_rate": 2.6666186078629263e-05, "loss": 3.599, "step": 123000 }, { "epoch": 0.47, "learning_rate": 2.6571333176509873e-05, "loss": 3.6013, "step": 123500 }, { "epoch": 0.47, "learning_rate": 2.6476480274390474e-05, "loss": 3.5982, "step": 124000 }, { "epoch": 0.47, "learning_rate": 2.6381627372271084e-05, "loss": 3.5937, "step": 124500 }, { "epoch": 0.47, "learning_rate": 2.628677447015169e-05, "loss": 3.5945, "step": 125000 }, { "epoch": 0.48, "learning_rate": 2.6191921568032295e-05, "loss": 3.5926, "step": 125500 }, { "epoch": 0.48, "learning_rate": 2.6097068665912906e-05, "loss": 3.5927, "step": 126000 }, { "epoch": 0.48, "learning_rate": 2.600221576379351e-05, "loss": 3.5892, "step": 126500 }, { "epoch": 0.48, "learning_rate": 2.5907362861674116e-05, "loss": 3.5938, "step": 127000 }, { "epoch": 0.48, "learning_rate": 2.5812509959554727e-05, "loss": 3.5867, "step": 127500 }, { "epoch": 0.49, "learning_rate": 2.571765705743533e-05, "loss": 3.5879, "step": 128000 }, { "epoch": 0.49, "learning_rate": 2.5622804155315938e-05, "loss": 3.5909, "step": 128500 }, { "epoch": 0.49, "learning_rate": 2.552795125319654e-05, "loss": 3.5861, "step": 129000 }, { "epoch": 0.49, "learning_rate": 2.543309835107715e-05, "loss": 3.5913, "step": 129500 }, { "epoch": 0.49, "learning_rate": 2.533824544895776e-05, "loss": 3.591, "step": 130000 }, { "epoch": 0.49, "eval_accuracy": 0.37901210353247916, "eval_loss": 3.510841131210327, "eval_runtime": 5053.0574, "eval_samples_per_second": 87.823, "eval_steps_per_second": 1.372, "step": 130000 }, { "epoch": 0.5, "learning_rate": 2.5243392546838363e-05, "loss": 3.5849, "step": 130500 }, { "epoch": 0.5, "learning_rate": 2.514853964471897e-05, "loss": 3.5868, "step": 131000 }, { "epoch": 0.5, "learning_rate": 2.505368674259958e-05, "loss": 3.5838, "step": 131500 }, { "epoch": 0.5, "learning_rate": 2.4958833840480184e-05, "loss": 3.5848, "step": 132000 }, { "epoch": 0.5, "learning_rate": 2.486398093836079e-05, "loss": 3.5818, "step": 132500 }, { "epoch": 0.5, "learning_rate": 2.47691280362414e-05, "loss": 3.5842, "step": 133000 }, { "epoch": 0.51, "learning_rate": 2.4674275134122006e-05, "loss": 3.584, "step": 133500 }, { "epoch": 0.51, "learning_rate": 2.457942223200261e-05, "loss": 3.5817, "step": 134000 }, { "epoch": 0.51, "learning_rate": 2.448456932988322e-05, "loss": 3.5781, "step": 134500 }, { "epoch": 0.51, "learning_rate": 2.4389716427763827e-05, "loss": 3.5761, "step": 135000 }, { "epoch": 0.51, "learning_rate": 2.429486352564443e-05, "loss": 3.5755, "step": 135500 }, { "epoch": 0.52, "learning_rate": 2.4200010623525038e-05, "loss": 3.5822, "step": 136000 }, { "epoch": 0.52, "learning_rate": 2.4105157721405645e-05, "loss": 3.5749, "step": 136500 }, { "epoch": 0.52, "learning_rate": 2.4010304819286252e-05, "loss": 3.5812, "step": 137000 }, { "epoch": 0.52, "learning_rate": 2.391545191716686e-05, "loss": 3.5786, "step": 137500 }, { "epoch": 0.52, "learning_rate": 2.3820599015047467e-05, "loss": 3.5826, "step": 138000 }, { "epoch": 0.53, "learning_rate": 2.372574611292807e-05, "loss": 3.5731, "step": 138500 }, { "epoch": 0.53, "learning_rate": 2.363089321080868e-05, "loss": 3.5784, "step": 139000 }, { "epoch": 0.53, "learning_rate": 2.3536040308689285e-05, "loss": 3.5738, "step": 139500 }, { "epoch": 0.53, "learning_rate": 2.3441187406569892e-05, "loss": 3.574, "step": 140000 }, { "epoch": 0.53, "eval_accuracy": 0.3804677520669924, "eval_loss": 3.4966471195220947, "eval_runtime": 5044.6959, "eval_samples_per_second": 87.968, "eval_steps_per_second": 1.375, "step": 140000 }, { "epoch": 0.53, "learning_rate": 2.33463345044505e-05, "loss": 3.5722, "step": 140500 }, { "epoch": 0.53, "learning_rate": 2.3251481602331106e-05, "loss": 3.5778, "step": 141000 }, { "epoch": 0.54, "learning_rate": 2.3156628700211713e-05, "loss": 3.5722, "step": 141500 }, { "epoch": 0.54, "learning_rate": 2.306177579809232e-05, "loss": 3.5658, "step": 142000 }, { "epoch": 0.54, "learning_rate": 2.2966922895972924e-05, "loss": 3.5671, "step": 142500 }, { "epoch": 0.54, "learning_rate": 2.2872069993853535e-05, "loss": 3.5696, "step": 143000 }, { "epoch": 0.54, "learning_rate": 2.2777217091734142e-05, "loss": 3.5691, "step": 143500 }, { "epoch": 0.55, "learning_rate": 2.2682364189614745e-05, "loss": 3.5737, "step": 144000 }, { "epoch": 0.55, "learning_rate": 2.2587511287495353e-05, "loss": 3.5693, "step": 144500 }, { "epoch": 0.55, "learning_rate": 2.249265838537596e-05, "loss": 3.5728, "step": 145000 }, { "epoch": 0.55, "learning_rate": 2.2397805483256567e-05, "loss": 3.569, "step": 145500 }, { "epoch": 0.55, "learning_rate": 2.2302952581137174e-05, "loss": 3.5559, "step": 146000 }, { "epoch": 0.56, "learning_rate": 2.220809967901778e-05, "loss": 3.5673, "step": 146500 }, { "epoch": 0.56, "learning_rate": 2.2113246776898385e-05, "loss": 3.567, "step": 147000 }, { "epoch": 0.56, "learning_rate": 2.2018393874778995e-05, "loss": 3.5692, "step": 147500 }, { "epoch": 0.56, "learning_rate": 2.19235409726596e-05, "loss": 3.5651, "step": 148000 }, { "epoch": 0.56, "learning_rate": 2.1828688070540206e-05, "loss": 3.559, "step": 148500 }, { "epoch": 0.57, "learning_rate": 2.1733835168420813e-05, "loss": 3.5657, "step": 149000 }, { "epoch": 0.57, "learning_rate": 2.163898226630142e-05, "loss": 3.5632, "step": 149500 }, { "epoch": 0.57, "learning_rate": 2.1544129364182028e-05, "loss": 3.5606, "step": 150000 }, { "epoch": 0.57, "eval_accuracy": 0.38146521499584235, "eval_loss": 3.486565113067627, "eval_runtime": 5047.1623, "eval_samples_per_second": 87.925, "eval_steps_per_second": 1.374, "step": 150000 }, { "epoch": 0.57, "learning_rate": 2.1449276462062635e-05, "loss": 3.5635, "step": 150500 }, { "epoch": 0.57, "learning_rate": 2.135442355994324e-05, "loss": 3.569, "step": 151000 }, { "epoch": 0.57, "learning_rate": 2.125957065782385e-05, "loss": 3.551, "step": 151500 }, { "epoch": 0.58, "learning_rate": 2.1164717755704456e-05, "loss": 3.5543, "step": 152000 }, { "epoch": 0.58, "learning_rate": 2.106986485358506e-05, "loss": 3.5556, "step": 152500 }, { "epoch": 0.58, "learning_rate": 2.0975011951465667e-05, "loss": 3.5598, "step": 153000 }, { "epoch": 0.58, "learning_rate": 2.0880159049346274e-05, "loss": 3.5592, "step": 153500 }, { "epoch": 0.58, "learning_rate": 2.078530614722688e-05, "loss": 3.5562, "step": 154000 }, { "epoch": 0.59, "learning_rate": 2.069045324510749e-05, "loss": 3.5561, "step": 154500 }, { "epoch": 0.59, "learning_rate": 2.0595600342988096e-05, "loss": 3.5573, "step": 155000 }, { "epoch": 0.59, "learning_rate": 2.05007474408687e-05, "loss": 3.5585, "step": 155500 }, { "epoch": 0.59, "learning_rate": 2.040589453874931e-05, "loss": 3.5576, "step": 156000 }, { "epoch": 0.59, "learning_rate": 2.0311041636629917e-05, "loss": 3.5529, "step": 156500 }, { "epoch": 0.6, "learning_rate": 2.021618873451052e-05, "loss": 3.5575, "step": 157000 }, { "epoch": 0.6, "learning_rate": 2.0121335832391128e-05, "loss": 3.5569, "step": 157500 }, { "epoch": 0.6, "learning_rate": 2.0026482930271735e-05, "loss": 3.5537, "step": 158000 }, { "epoch": 0.6, "learning_rate": 1.9931630028152342e-05, "loss": 3.5553, "step": 158500 }, { "epoch": 0.6, "learning_rate": 1.983677712603295e-05, "loss": 3.5524, "step": 159000 }, { "epoch": 0.61, "learning_rate": 1.9741924223913557e-05, "loss": 3.5562, "step": 159500 }, { "epoch": 0.61, "learning_rate": 1.9647071321794164e-05, "loss": 3.5516, "step": 160000 }, { "epoch": 0.61, "eval_accuracy": 0.3828251390025017, "eval_loss": 3.4739012718200684, "eval_runtime": 5050.1987, "eval_samples_per_second": 87.872, "eval_steps_per_second": 1.373, "step": 160000 }, { "epoch": 0.61, "learning_rate": 1.955221841967477e-05, "loss": 3.5508, "step": 160500 }, { "epoch": 0.61, "learning_rate": 1.9457365517555375e-05, "loss": 3.5424, "step": 161000 }, { "epoch": 0.61, "learning_rate": 1.936251261543598e-05, "loss": 3.5526, "step": 161500 }, { "epoch": 0.61, "learning_rate": 1.9267659713316592e-05, "loss": 3.5469, "step": 162000 }, { "epoch": 0.62, "learning_rate": 1.9172806811197196e-05, "loss": 3.5401, "step": 162500 }, { "epoch": 0.62, "learning_rate": 1.9077953909077803e-05, "loss": 3.5525, "step": 163000 }, { "epoch": 0.62, "learning_rate": 1.898310100695841e-05, "loss": 3.5494, "step": 163500 }, { "epoch": 0.62, "learning_rate": 1.8888248104839014e-05, "loss": 3.5527, "step": 164000 }, { "epoch": 0.62, "learning_rate": 1.8793395202719624e-05, "loss": 3.5477, "step": 164500 }, { "epoch": 0.63, "learning_rate": 1.869854230060023e-05, "loss": 3.548, "step": 165000 }, { "epoch": 0.63, "learning_rate": 1.8603689398480835e-05, "loss": 3.5466, "step": 165500 }, { "epoch": 0.63, "learning_rate": 1.8508836496361442e-05, "loss": 3.5491, "step": 166000 }, { "epoch": 0.63, "learning_rate": 1.8413983594242053e-05, "loss": 3.5431, "step": 166500 }, { "epoch": 0.63, "learning_rate": 1.8319130692122657e-05, "loss": 3.5462, "step": 167000 }, { "epoch": 0.64, "learning_rate": 1.8224277790003264e-05, "loss": 3.5453, "step": 167500 }, { "epoch": 0.64, "learning_rate": 1.812942488788387e-05, "loss": 3.5408, "step": 168000 }, { "epoch": 0.64, "learning_rate": 1.8034571985764478e-05, "loss": 3.5465, "step": 168500 }, { "epoch": 0.64, "learning_rate": 1.7939719083645085e-05, "loss": 3.5437, "step": 169000 }, { "epoch": 0.64, "learning_rate": 1.784486618152569e-05, "loss": 3.533, "step": 169500 }, { "epoch": 0.64, "learning_rate": 1.7750013279406296e-05, "loss": 3.5423, "step": 170000 }, { "epoch": 0.64, "eval_accuracy": 0.38377248075624093, "eval_loss": 3.4649875164031982, "eval_runtime": 5024.5121, "eval_samples_per_second": 88.322, "eval_steps_per_second": 1.38, "step": 170000 }, { "epoch": 0.65, "learning_rate": 1.7655160377286907e-05, "loss": 3.5446, "step": 170500 }, { "epoch": 0.65, "learning_rate": 1.756030747516751e-05, "loss": 3.5374, "step": 171000 }, { "epoch": 0.65, "learning_rate": 1.7465454573048118e-05, "loss": 3.5426, "step": 171500 }, { "epoch": 0.65, "learning_rate": 1.7370601670928725e-05, "loss": 3.5391, "step": 172000 }, { "epoch": 0.65, "learning_rate": 1.727574876880933e-05, "loss": 3.544, "step": 172500 }, { "epoch": 0.66, "learning_rate": 1.718089586668994e-05, "loss": 3.5396, "step": 173000 }, { "epoch": 0.66, "learning_rate": 1.7086042964570546e-05, "loss": 3.5385, "step": 173500 }, { "epoch": 0.66, "learning_rate": 1.699119006245115e-05, "loss": 3.534, "step": 174000 }, { "epoch": 0.66, "learning_rate": 1.6896337160331757e-05, "loss": 3.5374, "step": 174500 }, { "epoch": 0.66, "learning_rate": 1.6801484258212368e-05, "loss": 3.5351, "step": 175000 }, { "epoch": 0.67, "learning_rate": 1.670663135609297e-05, "loss": 3.5391, "step": 175500 }, { "epoch": 0.67, "learning_rate": 1.661177845397358e-05, "loss": 3.5351, "step": 176000 }, { "epoch": 0.67, "learning_rate": 1.6516925551854186e-05, "loss": 3.5315, "step": 176500 }, { "epoch": 0.67, "learning_rate": 1.6422072649734793e-05, "loss": 3.5324, "step": 177000 }, { "epoch": 0.67, "learning_rate": 1.63272197476154e-05, "loss": 3.5379, "step": 177500 }, { "epoch": 0.68, "learning_rate": 1.6232366845496007e-05, "loss": 3.534, "step": 178000 }, { "epoch": 0.68, "learning_rate": 1.613751394337661e-05, "loss": 3.5366, "step": 178500 }, { "epoch": 0.68, "learning_rate": 1.604266104125722e-05, "loss": 3.5364, "step": 179000 }, { "epoch": 0.68, "learning_rate": 1.5947808139137825e-05, "loss": 3.5398, "step": 179500 }, { "epoch": 0.68, "learning_rate": 1.5852955237018432e-05, "loss": 3.5298, "step": 180000 }, { "epoch": 0.68, "eval_accuracy": 0.3846720031995081, "eval_loss": 3.455994129180908, "eval_runtime": 5033.5621, "eval_samples_per_second": 88.163, "eval_steps_per_second": 1.378, "step": 180000 }, { "epoch": 0.68, "learning_rate": 1.575810233489904e-05, "loss": 3.5337, "step": 180500 }, { "epoch": 0.69, "learning_rate": 1.5663249432779646e-05, "loss": 3.5324, "step": 181000 }, { "epoch": 0.69, "learning_rate": 1.5568396530660254e-05, "loss": 3.5307, "step": 181500 }, { "epoch": 0.69, "learning_rate": 1.547354362854086e-05, "loss": 3.5273, "step": 182000 }, { "epoch": 0.69, "learning_rate": 1.5378690726421464e-05, "loss": 3.5301, "step": 182500 }, { "epoch": 0.69, "learning_rate": 1.528383782430207e-05, "loss": 3.5321, "step": 183000 }, { "epoch": 0.7, "learning_rate": 1.518898492218268e-05, "loss": 3.5283, "step": 183500 }, { "epoch": 0.7, "learning_rate": 1.5094132020063287e-05, "loss": 3.534, "step": 184000 }, { "epoch": 0.7, "learning_rate": 1.4999279117943893e-05, "loss": 3.5346, "step": 184500 }, { "epoch": 0.7, "learning_rate": 1.4904426215824498e-05, "loss": 3.5234, "step": 185000 }, { "epoch": 0.7, "learning_rate": 1.4809573313705107e-05, "loss": 3.5243, "step": 185500 }, { "epoch": 0.71, "learning_rate": 1.4714720411585714e-05, "loss": 3.5292, "step": 186000 }, { "epoch": 0.71, "learning_rate": 1.461986750946632e-05, "loss": 3.5241, "step": 186500 }, { "epoch": 0.71, "learning_rate": 1.4525014607346927e-05, "loss": 3.5258, "step": 187000 }, { "epoch": 0.71, "learning_rate": 1.4430161705227536e-05, "loss": 3.5241, "step": 187500 }, { "epoch": 0.71, "learning_rate": 1.4335308803108141e-05, "loss": 3.5281, "step": 188000 }, { "epoch": 0.72, "learning_rate": 1.4240455900988747e-05, "loss": 3.5271, "step": 188500 }, { "epoch": 0.72, "learning_rate": 1.4145602998869354e-05, "loss": 3.5222, "step": 189000 }, { "epoch": 0.72, "learning_rate": 1.405075009674996e-05, "loss": 3.5197, "step": 189500 }, { "epoch": 0.72, "learning_rate": 1.3955897194630568e-05, "loss": 3.5287, "step": 190000 }, { "epoch": 0.72, "eval_accuracy": 0.38565153361158844, "eval_loss": 3.447903871536255, "eval_runtime": 5039.8049, "eval_samples_per_second": 88.054, "eval_steps_per_second": 1.376, "step": 190000 }, { "epoch": 0.72, "learning_rate": 1.3861044292511175e-05, "loss": 3.5309, "step": 190500 }, { "epoch": 0.72, "learning_rate": 1.376619139039178e-05, "loss": 3.5236, "step": 191000 }, { "epoch": 0.73, "learning_rate": 1.3671338488272386e-05, "loss": 3.5268, "step": 191500 }, { "epoch": 0.73, "learning_rate": 1.3576485586152995e-05, "loss": 3.5261, "step": 192000 }, { "epoch": 0.73, "learning_rate": 1.3481632684033602e-05, "loss": 3.5242, "step": 192500 }, { "epoch": 0.73, "learning_rate": 1.3386779781914207e-05, "loss": 3.5268, "step": 193000 }, { "epoch": 0.73, "learning_rate": 1.3291926879794815e-05, "loss": 3.5302, "step": 193500 }, { "epoch": 0.74, "learning_rate": 1.3197073977675423e-05, "loss": 3.525, "step": 194000 }, { "epoch": 0.74, "learning_rate": 1.3102221075556029e-05, "loss": 3.5278, "step": 194500 }, { "epoch": 0.74, "learning_rate": 1.3007368173436634e-05, "loss": 3.5221, "step": 195000 }, { "epoch": 0.74, "learning_rate": 1.2912515271317241e-05, "loss": 3.5231, "step": 195500 }, { "epoch": 0.74, "learning_rate": 1.281766236919785e-05, "loss": 3.5237, "step": 196000 }, { "epoch": 0.75, "learning_rate": 1.2722809467078456e-05, "loss": 3.5236, "step": 196500 }, { "epoch": 0.75, "learning_rate": 1.2627956564959063e-05, "loss": 3.5201, "step": 197000 }, { "epoch": 0.75, "learning_rate": 1.2533103662839668e-05, "loss": 3.5221, "step": 197500 }, { "epoch": 0.75, "learning_rate": 1.2438250760720275e-05, "loss": 3.5216, "step": 198000 }, { "epoch": 0.75, "learning_rate": 1.2343397858600881e-05, "loss": 3.5202, "step": 198500 }, { "epoch": 0.76, "learning_rate": 1.224854495648149e-05, "loss": 3.5207, "step": 199000 }, { "epoch": 0.76, "learning_rate": 1.2153692054362095e-05, "loss": 3.5172, "step": 199500 }, { "epoch": 0.76, "learning_rate": 1.2058839152242702e-05, "loss": 3.5187, "step": 200000 }, { "epoch": 0.76, "eval_accuracy": 0.3863054900513532, "eval_loss": 3.440758466720581, "eval_runtime": 5028.552, "eval_samples_per_second": 88.251, "eval_steps_per_second": 1.379, "step": 200000 }, { "epoch": 0.76, "learning_rate": 1.196398625012331e-05, "loss": 3.5153, "step": 200500 }, { "epoch": 0.76, "learning_rate": 1.1869133348003917e-05, "loss": 3.5164, "step": 201000 }, { "epoch": 0.76, "learning_rate": 1.1774280445884522e-05, "loss": 3.5225, "step": 201500 }, { "epoch": 0.77, "learning_rate": 1.1679427543765129e-05, "loss": 3.5142, "step": 202000 }, { "epoch": 0.77, "learning_rate": 1.1584574641645736e-05, "loss": 3.519, "step": 202500 }, { "epoch": 0.77, "learning_rate": 1.1489721739526343e-05, "loss": 3.522, "step": 203000 }, { "epoch": 0.77, "learning_rate": 1.1394868837406949e-05, "loss": 3.5142, "step": 203500 }, { "epoch": 0.77, "learning_rate": 1.1300015935287558e-05, "loss": 3.5101, "step": 204000 }, { "epoch": 0.78, "learning_rate": 1.1205163033168163e-05, "loss": 3.5151, "step": 204500 }, { "epoch": 0.78, "learning_rate": 1.111031013104877e-05, "loss": 3.5112, "step": 205000 }, { "epoch": 0.78, "learning_rate": 1.1015457228929377e-05, "loss": 3.5161, "step": 205500 }, { "epoch": 0.78, "learning_rate": 1.0920604326809984e-05, "loss": 3.5135, "step": 206000 }, { "epoch": 0.78, "learning_rate": 1.082575142469059e-05, "loss": 3.5143, "step": 206500 }, { "epoch": 0.79, "learning_rate": 1.0730898522571197e-05, "loss": 3.5134, "step": 207000 }, { "epoch": 0.79, "learning_rate": 1.0636045620451804e-05, "loss": 3.5162, "step": 207500 }, { "epoch": 0.79, "learning_rate": 1.054119271833241e-05, "loss": 3.5202, "step": 208000 }, { "epoch": 0.79, "learning_rate": 1.0446339816213017e-05, "loss": 3.507, "step": 208500 }, { "epoch": 0.79, "learning_rate": 1.0351486914093624e-05, "loss": 3.5108, "step": 209000 }, { "epoch": 0.79, "learning_rate": 1.0256634011974231e-05, "loss": 3.5144, "step": 209500 }, { "epoch": 0.8, "learning_rate": 1.0161781109854836e-05, "loss": 3.5157, "step": 210000 }, { "epoch": 0.8, "eval_accuracy": 0.3870160062789933, "eval_loss": 3.4338622093200684, "eval_runtime": 5031.2535, "eval_samples_per_second": 88.203, "eval_steps_per_second": 1.378, "step": 210000 }, { "epoch": 0.8, "learning_rate": 1.0066928207735445e-05, "loss": 3.5167, "step": 210500 }, { "epoch": 0.8, "learning_rate": 9.97207530561605e-06, "loss": 3.5107, "step": 211000 }, { "epoch": 0.8, "learning_rate": 9.877222403496658e-06, "loss": 3.5114, "step": 211500 }, { "epoch": 0.8, "learning_rate": 9.782369501377265e-06, "loss": 3.5121, "step": 212000 }, { "epoch": 0.81, "learning_rate": 9.687516599257872e-06, "loss": 3.5084, "step": 212500 }, { "epoch": 0.81, "learning_rate": 9.592663697138478e-06, "loss": 3.5178, "step": 213000 }, { "epoch": 0.81, "learning_rate": 9.497810795019085e-06, "loss": 3.5076, "step": 213500 }, { "epoch": 0.81, "learning_rate": 9.402957892899692e-06, "loss": 3.5102, "step": 214000 }, { "epoch": 0.81, "learning_rate": 9.308104990780299e-06, "loss": 3.5069, "step": 214500 }, { "epoch": 0.82, "learning_rate": 9.213252088660904e-06, "loss": 3.5108, "step": 215000 }, { "epoch": 0.82, "learning_rate": 9.118399186541513e-06, "loss": 3.5087, "step": 215500 }, { "epoch": 0.82, "learning_rate": 9.023546284422119e-06, "loss": 3.5041, "step": 216000 }, { "epoch": 0.82, "learning_rate": 8.928693382302724e-06, "loss": 3.5125, "step": 216500 }, { "epoch": 0.82, "learning_rate": 8.833840480183333e-06, "loss": 3.5084, "step": 217000 }, { "epoch": 0.83, "learning_rate": 8.738987578063938e-06, "loss": 3.5034, "step": 217500 }, { "epoch": 0.83, "learning_rate": 8.644134675944546e-06, "loss": 3.5077, "step": 218000 }, { "epoch": 0.83, "learning_rate": 8.549281773825151e-06, "loss": 3.505, "step": 218500 }, { "epoch": 0.83, "learning_rate": 8.45442887170576e-06, "loss": 3.5072, "step": 219000 }, { "epoch": 0.83, "learning_rate": 8.359575969586365e-06, "loss": 3.5, "step": 219500 }, { "epoch": 0.83, "learning_rate": 8.264723067466972e-06, "loss": 3.5042, "step": 220000 }, { "epoch": 0.83, "eval_accuracy": 0.3876448779010485, "eval_loss": 3.4285898208618164, "eval_runtime": 5039.4169, "eval_samples_per_second": 88.06, "eval_steps_per_second": 1.376, "step": 220000 }, { "epoch": 0.84, "learning_rate": 8.16987016534758e-06, "loss": 3.5075, "step": 220500 }, { "epoch": 0.84, "learning_rate": 8.075017263228187e-06, "loss": 3.5056, "step": 221000 }, { "epoch": 0.84, "learning_rate": 7.980164361108792e-06, "loss": 3.5089, "step": 221500 }, { "epoch": 0.84, "learning_rate": 7.8853114589894e-06, "loss": 3.5114, "step": 222000 }, { "epoch": 0.84, "learning_rate": 7.790458556870006e-06, "loss": 3.5076, "step": 222500 }, { "epoch": 0.85, "learning_rate": 7.695605654750614e-06, "loss": 3.5053, "step": 223000 }, { "epoch": 0.85, "learning_rate": 7.60075275263122e-06, "loss": 3.5058, "step": 223500 }, { "epoch": 0.85, "learning_rate": 7.505899850511827e-06, "loss": 3.5078, "step": 224000 }, { "epoch": 0.85, "learning_rate": 7.411046948392433e-06, "loss": 3.5084, "step": 224500 }, { "epoch": 0.85, "learning_rate": 7.31619404627304e-06, "loss": 3.5038, "step": 225000 }, { "epoch": 0.86, "learning_rate": 7.221341144153647e-06, "loss": 3.5015, "step": 225500 }, { "epoch": 0.86, "learning_rate": 7.126488242034253e-06, "loss": 3.5034, "step": 226000 }, { "epoch": 0.86, "learning_rate": 7.03163533991486e-06, "loss": 3.5098, "step": 226500 }, { "epoch": 0.86, "learning_rate": 6.936782437795466e-06, "loss": 3.5001, "step": 227000 }, { "epoch": 0.86, "learning_rate": 6.841929535676074e-06, "loss": 3.5055, "step": 227500 }, { "epoch": 0.87, "learning_rate": 6.74707663355668e-06, "loss": 3.5018, "step": 228000 }, { "epoch": 0.87, "learning_rate": 6.652223731437288e-06, "loss": 3.503, "step": 228500 }, { "epoch": 0.87, "learning_rate": 6.557370829317894e-06, "loss": 3.503, "step": 229000 }, { "epoch": 0.87, "learning_rate": 6.462517927198501e-06, "loss": 3.498, "step": 229500 }, { "epoch": 0.87, "learning_rate": 6.3676650250791075e-06, "loss": 3.5033, "step": 230000 }, { "epoch": 0.87, "eval_accuracy": 0.38827686860475785, "eval_loss": 3.422902822494507, "eval_runtime": 5026.5934, "eval_samples_per_second": 88.285, "eval_steps_per_second": 1.379, "step": 230000 }, { "epoch": 0.87, "learning_rate": 6.272812122959715e-06, "loss": 3.5024, "step": 230500 }, { "epoch": 0.88, "learning_rate": 6.177959220840321e-06, "loss": 3.4986, "step": 231000 }, { "epoch": 0.88, "learning_rate": 6.083106318720928e-06, "loss": 3.5025, "step": 231500 }, { "epoch": 0.88, "learning_rate": 5.988253416601534e-06, "loss": 3.5065, "step": 232000 }, { "epoch": 0.88, "learning_rate": 5.8934005144821415e-06, "loss": 3.5047, "step": 232500 }, { "epoch": 0.88, "learning_rate": 5.798547612362748e-06, "loss": 3.502, "step": 233000 }, { "epoch": 0.89, "learning_rate": 5.703694710243355e-06, "loss": 3.5061, "step": 233500 }, { "epoch": 0.89, "learning_rate": 5.608841808123961e-06, "loss": 3.5036, "step": 234000 }, { "epoch": 0.89, "learning_rate": 5.513988906004568e-06, "loss": 3.4966, "step": 234500 }, { "epoch": 0.89, "learning_rate": 5.4191360038851754e-06, "loss": 3.5048, "step": 235000 }, { "epoch": 0.89, "learning_rate": 5.324283101765782e-06, "loss": 3.5024, "step": 235500 }, { "epoch": 0.9, "learning_rate": 5.229430199646389e-06, "loss": 3.495, "step": 236000 }, { "epoch": 0.9, "learning_rate": 5.134577297526995e-06, "loss": 3.5035, "step": 236500 }, { "epoch": 0.9, "learning_rate": 5.039724395407602e-06, "loss": 3.5026, "step": 237000 }, { "epoch": 0.9, "learning_rate": 4.9448714932882094e-06, "loss": 3.4997, "step": 237500 }, { "epoch": 0.9, "learning_rate": 4.850018591168816e-06, "loss": 3.5035, "step": 238000 }, { "epoch": 0.9, "learning_rate": 4.755165689049423e-06, "loss": 3.4975, "step": 238500 }, { "epoch": 0.91, "learning_rate": 4.660312786930029e-06, "loss": 3.5003, "step": 239000 }, { "epoch": 0.91, "learning_rate": 4.5654598848106354e-06, "loss": 3.4994, "step": 239500 }, { "epoch": 0.91, "learning_rate": 4.4706069826912426e-06, "loss": 3.501, "step": 240000 }, { "epoch": 0.91, "eval_accuracy": 0.38875705078485445, "eval_loss": 3.4187748432159424, "eval_runtime": 5045.601, "eval_samples_per_second": 87.952, "eval_steps_per_second": 1.374, "step": 240000 }, { "epoch": 0.91, "learning_rate": 4.375754080571849e-06, "loss": 3.4917, "step": 240500 }, { "epoch": 0.91, "learning_rate": 4.280901178452456e-06, "loss": 3.4986, "step": 241000 }, { "epoch": 0.92, "learning_rate": 4.186048276333063e-06, "loss": 3.496, "step": 241500 }, { "epoch": 0.92, "learning_rate": 4.091195374213669e-06, "loss": 3.4978, "step": 242000 }, { "epoch": 0.92, "learning_rate": 3.9963424720942765e-06, "loss": 3.491, "step": 242500 }, { "epoch": 0.92, "learning_rate": 3.901489569974883e-06, "loss": 3.4953, "step": 243000 }, { "epoch": 0.92, "learning_rate": 3.80663666785549e-06, "loss": 3.4962, "step": 243500 }, { "epoch": 0.93, "learning_rate": 3.7117837657360967e-06, "loss": 3.4982, "step": 244000 }, { "epoch": 0.93, "learning_rate": 3.6169308636167034e-06, "loss": 3.4973, "step": 244500 }, { "epoch": 0.93, "learning_rate": 3.52207796149731e-06, "loss": 3.4922, "step": 245000 }, { "epoch": 0.93, "learning_rate": 3.4272250593779172e-06, "loss": 3.4921, "step": 245500 }, { "epoch": 0.93, "learning_rate": 3.332372157258524e-06, "loss": 3.5039, "step": 246000 }, { "epoch": 0.94, "learning_rate": 3.2375192551391307e-06, "loss": 3.5025, "step": 246500 }, { "epoch": 0.94, "learning_rate": 3.1426663530197374e-06, "loss": 3.4964, "step": 247000 }, { "epoch": 0.94, "learning_rate": 3.0478134509003437e-06, "loss": 3.4936, "step": 247500 }, { "epoch": 0.94, "learning_rate": 2.952960548780951e-06, "loss": 3.4959, "step": 248000 }, { "epoch": 0.94, "learning_rate": 2.8581076466615575e-06, "loss": 3.4963, "step": 248500 }, { "epoch": 0.94, "learning_rate": 2.7632547445421642e-06, "loss": 3.5014, "step": 249000 }, { "epoch": 0.95, "learning_rate": 2.668401842422771e-06, "loss": 3.4948, "step": 249500 }, { "epoch": 0.95, "learning_rate": 2.5735489403033776e-06, "loss": 3.4946, "step": 250000 }, { "epoch": 0.95, "eval_accuracy": 0.38918558969561506, "eval_loss": 3.4148616790771484, "eval_runtime": 5028.0262, "eval_samples_per_second": 88.26, "eval_steps_per_second": 1.379, "step": 250000 }, { "epoch": 0.95, "learning_rate": 2.4786960381839844e-06, "loss": 3.4959, "step": 250500 }, { "epoch": 0.95, "learning_rate": 2.383843136064591e-06, "loss": 3.4955, "step": 251000 }, { "epoch": 0.95, "learning_rate": 2.2889902339451978e-06, "loss": 3.4869, "step": 251500 }, { "epoch": 0.96, "learning_rate": 2.1941373318258045e-06, "loss": 3.4975, "step": 252000 }, { "epoch": 0.96, "learning_rate": 2.099284429706411e-06, "loss": 3.4871, "step": 252500 }, { "epoch": 0.96, "learning_rate": 2.0044315275870183e-06, "loss": 3.4889, "step": 253000 }, { "epoch": 0.96, "learning_rate": 1.909578625467625e-06, "loss": 3.4928, "step": 253500 }, { "epoch": 0.96, "learning_rate": 1.8147257233482318e-06, "loss": 3.4921, "step": 254000 }, { "epoch": 0.97, "learning_rate": 1.7198728212288385e-06, "loss": 3.494, "step": 254500 }, { "epoch": 0.97, "learning_rate": 1.625019919109445e-06, "loss": 3.4918, "step": 255000 }, { "epoch": 0.97, "learning_rate": 1.5301670169900519e-06, "loss": 3.4919, "step": 255500 }, { "epoch": 0.97, "learning_rate": 1.4353141148706586e-06, "loss": 3.4907, "step": 256000 }, { "epoch": 0.97, "learning_rate": 1.3404612127512653e-06, "loss": 3.4937, "step": 256500 }, { "epoch": 0.98, "learning_rate": 1.2456083106318722e-06, "loss": 3.4893, "step": 257000 }, { "epoch": 0.98, "learning_rate": 1.150755408512479e-06, "loss": 3.4944, "step": 257500 }, { "epoch": 0.98, "learning_rate": 1.0559025063930857e-06, "loss": 3.4914, "step": 258000 }, { "epoch": 0.98, "learning_rate": 9.610496042736924e-07, "loss": 3.4947, "step": 258500 }, { "epoch": 0.98, "learning_rate": 8.661967021542991e-07, "loss": 3.4916, "step": 259000 }, { "epoch": 0.98, "learning_rate": 7.713438000349059e-07, "loss": 3.4884, "step": 259500 }, { "epoch": 0.99, "learning_rate": 6.764908979155126e-07, "loss": 3.4971, "step": 260000 }, { "epoch": 0.99, "eval_accuracy": 0.3894276158938788, "eval_loss": 3.412609100341797, "eval_runtime": 5032.8837, "eval_samples_per_second": 88.175, "eval_steps_per_second": 1.378, "step": 260000 }, { "epoch": 0.99, "learning_rate": 5.816379957961194e-07, "loss": 3.4929, "step": 260500 }, { "epoch": 0.99, "learning_rate": 4.867850936767261e-07, "loss": 3.4937, "step": 261000 }, { "epoch": 0.99, "learning_rate": 3.919321915573329e-07, "loss": 3.4859, "step": 261500 }, { "epoch": 0.99, "learning_rate": 2.9707928943793967e-07, "loss": 3.4796, "step": 262000 }, { "epoch": 1.0, "learning_rate": 2.022263873185464e-07, "loss": 3.4871, "step": 262500 }, { "epoch": 1.0, "learning_rate": 1.0737348519915316e-07, "loss": 3.4949, "step": 263000 }, { "epoch": 1.0, "learning_rate": 1.252058307975991e-08, "loss": 3.4884, "step": 263500 }, { "epoch": 1.0, "step": 263566, "total_flos": 4.407529912270848e+18, "train_loss": 3.6950655784551065, "train_runtime": 269467.2599, "train_samples_per_second": 31.299, "train_steps_per_second": 0.978 } ], "logging_steps": 500, "max_steps": 263566, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "total_flos": 4.407529912270848e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }