gpt2-ts_cx-en_00000-00009_50k / trainer_state.json
jonasknobloch's picture
Initial commit
89e6e1d
raw
history blame
71.9 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 10000,
"global_step": 263566,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 4.990514709788061e-05,
"loss": 7.0943,
"step": 500
},
{
"epoch": 0.0,
"learning_rate": 4.981029419576122e-05,
"loss": 6.2317,
"step": 1000
},
{
"epoch": 0.01,
"learning_rate": 4.971544129364182e-05,
"loss": 5.9001,
"step": 1500
},
{
"epoch": 0.01,
"learning_rate": 4.9620588391522424e-05,
"loss": 5.6651,
"step": 2000
},
{
"epoch": 0.01,
"learning_rate": 4.9525735489403035e-05,
"loss": 5.4754,
"step": 2500
},
{
"epoch": 0.01,
"learning_rate": 4.9430882587283645e-05,
"loss": 5.3245,
"step": 3000
},
{
"epoch": 0.01,
"learning_rate": 4.933602968516425e-05,
"loss": 5.1815,
"step": 3500
},
{
"epoch": 0.02,
"learning_rate": 4.924117678304486e-05,
"loss": 5.0484,
"step": 4000
},
{
"epoch": 0.02,
"learning_rate": 4.914632388092546e-05,
"loss": 4.9349,
"step": 4500
},
{
"epoch": 0.02,
"learning_rate": 4.905147097880607e-05,
"loss": 4.8371,
"step": 5000
},
{
"epoch": 0.02,
"learning_rate": 4.895661807668668e-05,
"loss": 4.7574,
"step": 5500
},
{
"epoch": 0.02,
"learning_rate": 4.886176517456728e-05,
"loss": 4.6903,
"step": 6000
},
{
"epoch": 0.02,
"learning_rate": 4.876691227244789e-05,
"loss": 4.626,
"step": 6500
},
{
"epoch": 0.03,
"learning_rate": 4.8672059370328496e-05,
"loss": 4.5767,
"step": 7000
},
{
"epoch": 0.03,
"learning_rate": 4.85772064682091e-05,
"loss": 4.5388,
"step": 7500
},
{
"epoch": 0.03,
"learning_rate": 4.848235356608971e-05,
"loss": 4.4959,
"step": 8000
},
{
"epoch": 0.03,
"learning_rate": 4.838750066397032e-05,
"loss": 4.4589,
"step": 8500
},
{
"epoch": 0.03,
"learning_rate": 4.8292647761850924e-05,
"loss": 4.4324,
"step": 9000
},
{
"epoch": 0.04,
"learning_rate": 4.8197794859731535e-05,
"loss": 4.3988,
"step": 9500
},
{
"epoch": 0.04,
"learning_rate": 4.810294195761214e-05,
"loss": 4.375,
"step": 10000
},
{
"epoch": 0.04,
"eval_accuracy": 0.31109238678227563,
"eval_loss": 4.28147029876709,
"eval_runtime": 5060.877,
"eval_samples_per_second": 87.687,
"eval_steps_per_second": 1.37,
"step": 10000
},
{
"epoch": 0.04,
"learning_rate": 4.800808905549274e-05,
"loss": 4.3423,
"step": 10500
},
{
"epoch": 0.04,
"learning_rate": 4.791323615337335e-05,
"loss": 4.3204,
"step": 11000
},
{
"epoch": 0.04,
"learning_rate": 4.7818383251253956e-05,
"loss": 4.2991,
"step": 11500
},
{
"epoch": 0.05,
"learning_rate": 4.772353034913457e-05,
"loss": 4.2818,
"step": 12000
},
{
"epoch": 0.05,
"learning_rate": 4.762867744701517e-05,
"loss": 4.2627,
"step": 12500
},
{
"epoch": 0.05,
"learning_rate": 4.7533824544895774e-05,
"loss": 4.2463,
"step": 13000
},
{
"epoch": 0.05,
"learning_rate": 4.7438971642776385e-05,
"loss": 4.2265,
"step": 13500
},
{
"epoch": 0.05,
"learning_rate": 4.734411874065699e-05,
"loss": 4.2159,
"step": 14000
},
{
"epoch": 0.06,
"learning_rate": 4.72492658385376e-05,
"loss": 4.198,
"step": 14500
},
{
"epoch": 0.06,
"learning_rate": 4.715441293641821e-05,
"loss": 4.1886,
"step": 15000
},
{
"epoch": 0.06,
"learning_rate": 4.7059560034298813e-05,
"loss": 4.1675,
"step": 15500
},
{
"epoch": 0.06,
"learning_rate": 4.696470713217942e-05,
"loss": 4.1553,
"step": 16000
},
{
"epoch": 0.06,
"learning_rate": 4.686985423006002e-05,
"loss": 4.1466,
"step": 16500
},
{
"epoch": 0.06,
"learning_rate": 4.677500132794063e-05,
"loss": 4.1307,
"step": 17000
},
{
"epoch": 0.07,
"learning_rate": 4.668014842582124e-05,
"loss": 4.1282,
"step": 17500
},
{
"epoch": 0.07,
"learning_rate": 4.6585295523701846e-05,
"loss": 4.1108,
"step": 18000
},
{
"epoch": 0.07,
"learning_rate": 4.6490442621582456e-05,
"loss": 4.1135,
"step": 18500
},
{
"epoch": 0.07,
"learning_rate": 4.639558971946305e-05,
"loss": 4.0876,
"step": 19000
},
{
"epoch": 0.07,
"learning_rate": 4.6300736817343664e-05,
"loss": 4.0871,
"step": 19500
},
{
"epoch": 0.08,
"learning_rate": 4.6205883915224274e-05,
"loss": 4.0754,
"step": 20000
},
{
"epoch": 0.08,
"eval_accuracy": 0.3340521142462603,
"eval_loss": 3.9983978271484375,
"eval_runtime": 5045.5203,
"eval_samples_per_second": 87.954,
"eval_steps_per_second": 1.374,
"step": 20000
},
{
"epoch": 0.08,
"learning_rate": 4.611103101310488e-05,
"loss": 4.0747,
"step": 20500
},
{
"epoch": 0.08,
"learning_rate": 4.601617811098549e-05,
"loss": 4.0582,
"step": 21000
},
{
"epoch": 0.08,
"learning_rate": 4.592132520886609e-05,
"loss": 4.0545,
"step": 21500
},
{
"epoch": 0.08,
"learning_rate": 4.5826472306746696e-05,
"loss": 4.0345,
"step": 22000
},
{
"epoch": 0.09,
"learning_rate": 4.5731619404627307e-05,
"loss": 4.0376,
"step": 22500
},
{
"epoch": 0.09,
"learning_rate": 4.563676650250791e-05,
"loss": 4.0315,
"step": 23000
},
{
"epoch": 0.09,
"learning_rate": 4.554191360038852e-05,
"loss": 4.0217,
"step": 23500
},
{
"epoch": 0.09,
"learning_rate": 4.544706069826913e-05,
"loss": 4.0135,
"step": 24000
},
{
"epoch": 0.09,
"learning_rate": 4.535220779614973e-05,
"loss": 4.0034,
"step": 24500
},
{
"epoch": 0.09,
"learning_rate": 4.525735489403034e-05,
"loss": 4.004,
"step": 25000
},
{
"epoch": 0.1,
"learning_rate": 4.516250199191095e-05,
"loss": 3.9939,
"step": 25500
},
{
"epoch": 0.1,
"learning_rate": 4.506764908979155e-05,
"loss": 3.9871,
"step": 26000
},
{
"epoch": 0.1,
"learning_rate": 4.4972796187672164e-05,
"loss": 3.9787,
"step": 26500
},
{
"epoch": 0.1,
"learning_rate": 4.487794328555277e-05,
"loss": 3.9752,
"step": 27000
},
{
"epoch": 0.1,
"learning_rate": 4.478309038343337e-05,
"loss": 3.9605,
"step": 27500
},
{
"epoch": 0.11,
"learning_rate": 4.468823748131398e-05,
"loss": 3.9542,
"step": 28000
},
{
"epoch": 0.11,
"learning_rate": 4.4593384579194585e-05,
"loss": 3.9518,
"step": 28500
},
{
"epoch": 0.11,
"learning_rate": 4.4498531677075196e-05,
"loss": 3.9479,
"step": 29000
},
{
"epoch": 0.11,
"learning_rate": 4.44036787749558e-05,
"loss": 3.9445,
"step": 29500
},
{
"epoch": 0.11,
"learning_rate": 4.430882587283641e-05,
"loss": 3.9409,
"step": 30000
},
{
"epoch": 0.11,
"eval_accuracy": 0.3456931371386037,
"eval_loss": 3.86149525642395,
"eval_runtime": 5041.4629,
"eval_samples_per_second": 88.025,
"eval_steps_per_second": 1.375,
"step": 30000
},
{
"epoch": 0.12,
"learning_rate": 4.4213972970717014e-05,
"loss": 3.9345,
"step": 30500
},
{
"epoch": 0.12,
"learning_rate": 4.411912006859762e-05,
"loss": 3.9328,
"step": 31000
},
{
"epoch": 0.12,
"learning_rate": 4.402426716647823e-05,
"loss": 3.9248,
"step": 31500
},
{
"epoch": 0.12,
"learning_rate": 4.392941426435884e-05,
"loss": 3.9126,
"step": 32000
},
{
"epoch": 0.12,
"learning_rate": 4.383456136223944e-05,
"loss": 3.9077,
"step": 32500
},
{
"epoch": 0.13,
"learning_rate": 4.3739708460120046e-05,
"loss": 3.9102,
"step": 33000
},
{
"epoch": 0.13,
"learning_rate": 4.364485555800065e-05,
"loss": 3.907,
"step": 33500
},
{
"epoch": 0.13,
"learning_rate": 4.355000265588126e-05,
"loss": 3.9017,
"step": 34000
},
{
"epoch": 0.13,
"learning_rate": 4.345514975376187e-05,
"loss": 3.8976,
"step": 34500
},
{
"epoch": 0.13,
"learning_rate": 4.3360296851642475e-05,
"loss": 3.8917,
"step": 35000
},
{
"epoch": 0.13,
"learning_rate": 4.3265443949523085e-05,
"loss": 3.8917,
"step": 35500
},
{
"epoch": 0.14,
"learning_rate": 4.317059104740369e-05,
"loss": 3.8851,
"step": 36000
},
{
"epoch": 0.14,
"learning_rate": 4.307573814528429e-05,
"loss": 3.8899,
"step": 36500
},
{
"epoch": 0.14,
"learning_rate": 4.29808852431649e-05,
"loss": 3.8667,
"step": 37000
},
{
"epoch": 0.14,
"learning_rate": 4.288603234104551e-05,
"loss": 3.8702,
"step": 37500
},
{
"epoch": 0.14,
"learning_rate": 4.279117943892612e-05,
"loss": 3.8705,
"step": 38000
},
{
"epoch": 0.15,
"learning_rate": 4.269632653680672e-05,
"loss": 3.8647,
"step": 38500
},
{
"epoch": 0.15,
"learning_rate": 4.2601473634687325e-05,
"loss": 3.8612,
"step": 39000
},
{
"epoch": 0.15,
"learning_rate": 4.2506620732567936e-05,
"loss": 3.8524,
"step": 39500
},
{
"epoch": 0.15,
"learning_rate": 4.241176783044854e-05,
"loss": 3.8554,
"step": 40000
},
{
"epoch": 0.15,
"eval_accuracy": 0.3530753183612612,
"eval_loss": 3.7798092365264893,
"eval_runtime": 5037.6086,
"eval_samples_per_second": 88.092,
"eval_steps_per_second": 1.376,
"step": 40000
},
{
"epoch": 0.15,
"learning_rate": 4.231691492832915e-05,
"loss": 3.8487,
"step": 40500
},
{
"epoch": 0.16,
"learning_rate": 4.222206202620976e-05,
"loss": 3.851,
"step": 41000
},
{
"epoch": 0.16,
"learning_rate": 4.2127209124090364e-05,
"loss": 3.8431,
"step": 41500
},
{
"epoch": 0.16,
"learning_rate": 4.203235622197097e-05,
"loss": 3.8401,
"step": 42000
},
{
"epoch": 0.16,
"learning_rate": 4.193750331985158e-05,
"loss": 3.8296,
"step": 42500
},
{
"epoch": 0.16,
"learning_rate": 4.184265041773218e-05,
"loss": 3.8338,
"step": 43000
},
{
"epoch": 0.17,
"learning_rate": 4.174779751561279e-05,
"loss": 3.8295,
"step": 43500
},
{
"epoch": 0.17,
"learning_rate": 4.1652944613493396e-05,
"loss": 3.8285,
"step": 44000
},
{
"epoch": 0.17,
"learning_rate": 4.1558091711374e-05,
"loss": 3.8217,
"step": 44500
},
{
"epoch": 0.17,
"learning_rate": 4.146323880925461e-05,
"loss": 3.8262,
"step": 45000
},
{
"epoch": 0.17,
"learning_rate": 4.1368385907135214e-05,
"loss": 3.8241,
"step": 45500
},
{
"epoch": 0.17,
"learning_rate": 4.1273533005015825e-05,
"loss": 3.8145,
"step": 46000
},
{
"epoch": 0.18,
"learning_rate": 4.117868010289643e-05,
"loss": 3.8207,
"step": 46500
},
{
"epoch": 0.18,
"learning_rate": 4.108382720077704e-05,
"loss": 3.8128,
"step": 47000
},
{
"epoch": 0.18,
"learning_rate": 4.098897429865764e-05,
"loss": 3.8053,
"step": 47500
},
{
"epoch": 0.18,
"learning_rate": 4.089412139653825e-05,
"loss": 3.8023,
"step": 48000
},
{
"epoch": 0.18,
"learning_rate": 4.079926849441886e-05,
"loss": 3.8084,
"step": 48500
},
{
"epoch": 0.19,
"learning_rate": 4.070441559229947e-05,
"loss": 3.7967,
"step": 49000
},
{
"epoch": 0.19,
"learning_rate": 4.060956269018007e-05,
"loss": 3.7947,
"step": 49500
},
{
"epoch": 0.19,
"learning_rate": 4.0514709788060675e-05,
"loss": 3.7973,
"step": 50000
},
{
"epoch": 0.19,
"eval_accuracy": 0.35837528569747157,
"eval_loss": 3.7209770679473877,
"eval_runtime": 5052.5501,
"eval_samples_per_second": 87.831,
"eval_steps_per_second": 1.372,
"step": 50000
},
{
"epoch": 0.19,
"learning_rate": 4.041985688594128e-05,
"loss": 3.7861,
"step": 50500
},
{
"epoch": 0.19,
"learning_rate": 4.032500398382189e-05,
"loss": 3.7878,
"step": 51000
},
{
"epoch": 0.2,
"learning_rate": 4.02301510817025e-05,
"loss": 3.7839,
"step": 51500
},
{
"epoch": 0.2,
"learning_rate": 4.0135298179583104e-05,
"loss": 3.7898,
"step": 52000
},
{
"epoch": 0.2,
"learning_rate": 4.0040445277463714e-05,
"loss": 3.7808,
"step": 52500
},
{
"epoch": 0.2,
"learning_rate": 3.994559237534432e-05,
"loss": 3.7857,
"step": 53000
},
{
"epoch": 0.2,
"learning_rate": 3.985073947322492e-05,
"loss": 3.7754,
"step": 53500
},
{
"epoch": 0.2,
"learning_rate": 3.975588657110553e-05,
"loss": 3.769,
"step": 54000
},
{
"epoch": 0.21,
"learning_rate": 3.9661033668986136e-05,
"loss": 3.7723,
"step": 54500
},
{
"epoch": 0.21,
"learning_rate": 3.9566180766866747e-05,
"loss": 3.7719,
"step": 55000
},
{
"epoch": 0.21,
"learning_rate": 3.947132786474736e-05,
"loss": 3.7684,
"step": 55500
},
{
"epoch": 0.21,
"learning_rate": 3.9376474962627954e-05,
"loss": 3.7672,
"step": 56000
},
{
"epoch": 0.21,
"learning_rate": 3.9281622060508565e-05,
"loss": 3.7595,
"step": 56500
},
{
"epoch": 0.22,
"learning_rate": 3.918676915838917e-05,
"loss": 3.764,
"step": 57000
},
{
"epoch": 0.22,
"learning_rate": 3.909191625626978e-05,
"loss": 3.7584,
"step": 57500
},
{
"epoch": 0.22,
"learning_rate": 3.899706335415039e-05,
"loss": 3.7532,
"step": 58000
},
{
"epoch": 0.22,
"learning_rate": 3.890221045203099e-05,
"loss": 3.7476,
"step": 58500
},
{
"epoch": 0.22,
"learning_rate": 3.88073575499116e-05,
"loss": 3.7502,
"step": 59000
},
{
"epoch": 0.23,
"learning_rate": 3.871250464779221e-05,
"loss": 3.7584,
"step": 59500
},
{
"epoch": 0.23,
"learning_rate": 3.861765174567281e-05,
"loss": 3.7421,
"step": 60000
},
{
"epoch": 0.23,
"eval_accuracy": 0.3629990378932714,
"eval_loss": 3.6750495433807373,
"eval_runtime": 5050.4176,
"eval_samples_per_second": 87.869,
"eval_steps_per_second": 1.373,
"step": 60000
},
{
"epoch": 0.23,
"learning_rate": 3.852279884355342e-05,
"loss": 3.7503,
"step": 60500
},
{
"epoch": 0.23,
"learning_rate": 3.8427945941434025e-05,
"loss": 3.7485,
"step": 61000
},
{
"epoch": 0.23,
"learning_rate": 3.8333093039314636e-05,
"loss": 3.745,
"step": 61500
},
{
"epoch": 0.24,
"learning_rate": 3.823824013719524e-05,
"loss": 3.739,
"step": 62000
},
{
"epoch": 0.24,
"learning_rate": 3.8143387235075843e-05,
"loss": 3.7411,
"step": 62500
},
{
"epoch": 0.24,
"learning_rate": 3.8048534332956454e-05,
"loss": 3.739,
"step": 63000
},
{
"epoch": 0.24,
"learning_rate": 3.795368143083706e-05,
"loss": 3.7388,
"step": 63500
},
{
"epoch": 0.24,
"learning_rate": 3.785882852871767e-05,
"loss": 3.7339,
"step": 64000
},
{
"epoch": 0.24,
"learning_rate": 3.776397562659827e-05,
"loss": 3.7328,
"step": 64500
},
{
"epoch": 0.25,
"learning_rate": 3.7669122724478876e-05,
"loss": 3.7209,
"step": 65000
},
{
"epoch": 0.25,
"learning_rate": 3.7574269822359486e-05,
"loss": 3.727,
"step": 65500
},
{
"epoch": 0.25,
"learning_rate": 3.74794169202401e-05,
"loss": 3.7251,
"step": 66000
},
{
"epoch": 0.25,
"learning_rate": 3.73845640181207e-05,
"loss": 3.7199,
"step": 66500
},
{
"epoch": 0.25,
"learning_rate": 3.728971111600131e-05,
"loss": 3.7277,
"step": 67000
},
{
"epoch": 0.26,
"learning_rate": 3.719485821388191e-05,
"loss": 3.7232,
"step": 67500
},
{
"epoch": 0.26,
"learning_rate": 3.710000531176252e-05,
"loss": 3.7154,
"step": 68000
},
{
"epoch": 0.26,
"learning_rate": 3.700515240964313e-05,
"loss": 3.7159,
"step": 68500
},
{
"epoch": 0.26,
"learning_rate": 3.691029950752373e-05,
"loss": 3.711,
"step": 69000
},
{
"epoch": 0.26,
"learning_rate": 3.681544660540434e-05,
"loss": 3.7094,
"step": 69500
},
{
"epoch": 0.27,
"learning_rate": 3.672059370328495e-05,
"loss": 3.7097,
"step": 70000
},
{
"epoch": 0.27,
"eval_accuracy": 0.36635205507688484,
"eval_loss": 3.6377646923065186,
"eval_runtime": 5044.3649,
"eval_samples_per_second": 87.974,
"eval_steps_per_second": 1.375,
"step": 70000
},
{
"epoch": 0.27,
"learning_rate": 3.662574080116555e-05,
"loss": 3.7109,
"step": 70500
},
{
"epoch": 0.27,
"learning_rate": 3.653088789904616e-05,
"loss": 3.7069,
"step": 71000
},
{
"epoch": 0.27,
"learning_rate": 3.6436034996926765e-05,
"loss": 3.7025,
"step": 71500
},
{
"epoch": 0.27,
"learning_rate": 3.6341182094807376e-05,
"loss": 3.7032,
"step": 72000
},
{
"epoch": 0.28,
"learning_rate": 3.6246329192687986e-05,
"loss": 3.7005,
"step": 72500
},
{
"epoch": 0.28,
"learning_rate": 3.615147629056859e-05,
"loss": 3.704,
"step": 73000
},
{
"epoch": 0.28,
"learning_rate": 3.6056623388449194e-05,
"loss": 3.6903,
"step": 73500
},
{
"epoch": 0.28,
"learning_rate": 3.59617704863298e-05,
"loss": 3.6989,
"step": 74000
},
{
"epoch": 0.28,
"learning_rate": 3.586691758421041e-05,
"loss": 3.6947,
"step": 74500
},
{
"epoch": 0.28,
"learning_rate": 3.577206468209102e-05,
"loss": 3.6969,
"step": 75000
},
{
"epoch": 0.29,
"learning_rate": 3.567721177997162e-05,
"loss": 3.696,
"step": 75500
},
{
"epoch": 0.29,
"learning_rate": 3.5582358877852226e-05,
"loss": 3.6905,
"step": 76000
},
{
"epoch": 0.29,
"learning_rate": 3.5487505975732836e-05,
"loss": 3.6851,
"step": 76500
},
{
"epoch": 0.29,
"learning_rate": 3.539265307361344e-05,
"loss": 3.6857,
"step": 77000
},
{
"epoch": 0.29,
"learning_rate": 3.529780017149405e-05,
"loss": 3.6889,
"step": 77500
},
{
"epoch": 0.3,
"learning_rate": 3.5202947269374654e-05,
"loss": 3.6911,
"step": 78000
},
{
"epoch": 0.3,
"learning_rate": 3.5108094367255265e-05,
"loss": 3.687,
"step": 78500
},
{
"epoch": 0.3,
"learning_rate": 3.501324146513587e-05,
"loss": 3.6808,
"step": 79000
},
{
"epoch": 0.3,
"learning_rate": 3.491838856301647e-05,
"loss": 3.6841,
"step": 79500
},
{
"epoch": 0.3,
"learning_rate": 3.482353566089708e-05,
"loss": 3.6741,
"step": 80000
},
{
"epoch": 0.3,
"eval_accuracy": 0.36940481219098525,
"eval_loss": 3.606105089187622,
"eval_runtime": 5043.6804,
"eval_samples_per_second": 87.986,
"eval_steps_per_second": 1.375,
"step": 80000
},
{
"epoch": 0.31,
"learning_rate": 3.4728682758777694e-05,
"loss": 3.6789,
"step": 80500
},
{
"epoch": 0.31,
"learning_rate": 3.46338298566583e-05,
"loss": 3.6809,
"step": 81000
},
{
"epoch": 0.31,
"learning_rate": 3.45389769545389e-05,
"loss": 3.6854,
"step": 81500
},
{
"epoch": 0.31,
"learning_rate": 3.4444124052419505e-05,
"loss": 3.6692,
"step": 82000
},
{
"epoch": 0.31,
"learning_rate": 3.4349271150300115e-05,
"loss": 3.6771,
"step": 82500
},
{
"epoch": 0.31,
"learning_rate": 3.4254418248180726e-05,
"loss": 3.6773,
"step": 83000
},
{
"epoch": 0.32,
"learning_rate": 3.415956534606133e-05,
"loss": 3.6701,
"step": 83500
},
{
"epoch": 0.32,
"learning_rate": 3.406471244394194e-05,
"loss": 3.666,
"step": 84000
},
{
"epoch": 0.32,
"learning_rate": 3.3969859541822544e-05,
"loss": 3.672,
"step": 84500
},
{
"epoch": 0.32,
"learning_rate": 3.387500663970315e-05,
"loss": 3.6738,
"step": 85000
},
{
"epoch": 0.32,
"learning_rate": 3.378015373758376e-05,
"loss": 3.6705,
"step": 85500
},
{
"epoch": 0.33,
"learning_rate": 3.368530083546436e-05,
"loss": 3.6649,
"step": 86000
},
{
"epoch": 0.33,
"learning_rate": 3.359044793334497e-05,
"loss": 3.6736,
"step": 86500
},
{
"epoch": 0.33,
"learning_rate": 3.3495595031225576e-05,
"loss": 3.6689,
"step": 87000
},
{
"epoch": 0.33,
"learning_rate": 3.340074212910618e-05,
"loss": 3.6665,
"step": 87500
},
{
"epoch": 0.33,
"learning_rate": 3.330588922698679e-05,
"loss": 3.6641,
"step": 88000
},
{
"epoch": 0.34,
"learning_rate": 3.3211036324867394e-05,
"loss": 3.6536,
"step": 88500
},
{
"epoch": 0.34,
"learning_rate": 3.3116183422748005e-05,
"loss": 3.6658,
"step": 89000
},
{
"epoch": 0.34,
"learning_rate": 3.3021330520628615e-05,
"loss": 3.6544,
"step": 89500
},
{
"epoch": 0.34,
"learning_rate": 3.292647761850922e-05,
"loss": 3.6599,
"step": 90000
},
{
"epoch": 0.34,
"eval_accuracy": 0.37180447854264453,
"eval_loss": 3.5803401470184326,
"eval_runtime": 5042.0712,
"eval_samples_per_second": 88.014,
"eval_steps_per_second": 1.375,
"step": 90000
},
{
"epoch": 0.34,
"learning_rate": 3.283162471638982e-05,
"loss": 3.6605,
"step": 90500
},
{
"epoch": 0.35,
"learning_rate": 3.2736771814270426e-05,
"loss": 3.6603,
"step": 91000
},
{
"epoch": 0.35,
"learning_rate": 3.264191891215104e-05,
"loss": 3.6576,
"step": 91500
},
{
"epoch": 0.35,
"learning_rate": 3.254706601003165e-05,
"loss": 3.6511,
"step": 92000
},
{
"epoch": 0.35,
"learning_rate": 3.245221310791225e-05,
"loss": 3.6518,
"step": 92500
},
{
"epoch": 0.35,
"learning_rate": 3.2357360205792855e-05,
"loss": 3.6522,
"step": 93000
},
{
"epoch": 0.35,
"learning_rate": 3.2262507303673465e-05,
"loss": 3.646,
"step": 93500
},
{
"epoch": 0.36,
"learning_rate": 3.216765440155407e-05,
"loss": 3.6494,
"step": 94000
},
{
"epoch": 0.36,
"learning_rate": 3.207280149943468e-05,
"loss": 3.6388,
"step": 94500
},
{
"epoch": 0.36,
"learning_rate": 3.1977948597315284e-05,
"loss": 3.6456,
"step": 95000
},
{
"epoch": 0.36,
"learning_rate": 3.1883095695195894e-05,
"loss": 3.6398,
"step": 95500
},
{
"epoch": 0.36,
"learning_rate": 3.17882427930765e-05,
"loss": 3.6476,
"step": 96000
},
{
"epoch": 0.37,
"learning_rate": 3.16933898909571e-05,
"loss": 3.6364,
"step": 96500
},
{
"epoch": 0.37,
"learning_rate": 3.159853698883771e-05,
"loss": 3.6456,
"step": 97000
},
{
"epoch": 0.37,
"learning_rate": 3.150368408671832e-05,
"loss": 3.645,
"step": 97500
},
{
"epoch": 0.37,
"learning_rate": 3.1408831184598926e-05,
"loss": 3.6357,
"step": 98000
},
{
"epoch": 0.37,
"learning_rate": 3.131397828247954e-05,
"loss": 3.6415,
"step": 98500
},
{
"epoch": 0.38,
"learning_rate": 3.1219125380360134e-05,
"loss": 3.6314,
"step": 99000
},
{
"epoch": 0.38,
"learning_rate": 3.1124272478240744e-05,
"loss": 3.64,
"step": 99500
},
{
"epoch": 0.38,
"learning_rate": 3.1029419576121355e-05,
"loss": 3.6356,
"step": 100000
},
{
"epoch": 0.38,
"eval_accuracy": 0.37408231347678594,
"eval_loss": 3.558403253555298,
"eval_runtime": 5044.122,
"eval_samples_per_second": 87.978,
"eval_steps_per_second": 1.375,
"step": 100000
},
{
"epoch": 0.38,
"learning_rate": 3.093456667400196e-05,
"loss": 3.637,
"step": 100500
},
{
"epoch": 0.38,
"learning_rate": 3.083971377188257e-05,
"loss": 3.6353,
"step": 101000
},
{
"epoch": 0.39,
"learning_rate": 3.074486086976317e-05,
"loss": 3.6331,
"step": 101500
},
{
"epoch": 0.39,
"learning_rate": 3.065000796764378e-05,
"loss": 3.6288,
"step": 102000
},
{
"epoch": 0.39,
"learning_rate": 3.055515506552439e-05,
"loss": 3.6273,
"step": 102500
},
{
"epoch": 0.39,
"learning_rate": 3.046030216340499e-05,
"loss": 3.6351,
"step": 103000
},
{
"epoch": 0.39,
"learning_rate": 3.03654492612856e-05,
"loss": 3.6285,
"step": 103500
},
{
"epoch": 0.39,
"learning_rate": 3.027059635916621e-05,
"loss": 3.6256,
"step": 104000
},
{
"epoch": 0.4,
"learning_rate": 3.0175743457046812e-05,
"loss": 3.6248,
"step": 104500
},
{
"epoch": 0.4,
"learning_rate": 3.0080890554927423e-05,
"loss": 3.6182,
"step": 105000
},
{
"epoch": 0.4,
"learning_rate": 2.9986037652808023e-05,
"loss": 3.6242,
"step": 105500
},
{
"epoch": 0.4,
"learning_rate": 2.9891184750688634e-05,
"loss": 3.625,
"step": 106000
},
{
"epoch": 0.4,
"learning_rate": 2.979633184856924e-05,
"loss": 3.6191,
"step": 106500
},
{
"epoch": 0.41,
"learning_rate": 2.9701478946449845e-05,
"loss": 3.6267,
"step": 107000
},
{
"epoch": 0.41,
"learning_rate": 2.9606626044330455e-05,
"loss": 3.6227,
"step": 107500
},
{
"epoch": 0.41,
"learning_rate": 2.9511773142211062e-05,
"loss": 3.6217,
"step": 108000
},
{
"epoch": 0.41,
"learning_rate": 2.9416920240091666e-05,
"loss": 3.6168,
"step": 108500
},
{
"epoch": 0.41,
"learning_rate": 2.9322067337972277e-05,
"loss": 3.6204,
"step": 109000
},
{
"epoch": 0.42,
"learning_rate": 2.922721443585288e-05,
"loss": 3.6217,
"step": 109500
},
{
"epoch": 0.42,
"learning_rate": 2.9132361533733487e-05,
"loss": 3.6131,
"step": 110000
},
{
"epoch": 0.42,
"eval_accuracy": 0.37584134336520747,
"eval_loss": 3.542346715927124,
"eval_runtime": 5051.4787,
"eval_samples_per_second": 87.85,
"eval_steps_per_second": 1.373,
"step": 110000
},
{
"epoch": 0.42,
"learning_rate": 2.9037508631614098e-05,
"loss": 3.6165,
"step": 110500
},
{
"epoch": 0.42,
"learning_rate": 2.8942655729494698e-05,
"loss": 3.6129,
"step": 111000
},
{
"epoch": 0.42,
"learning_rate": 2.884780282737531e-05,
"loss": 3.6177,
"step": 111500
},
{
"epoch": 0.42,
"learning_rate": 2.8752949925255913e-05,
"loss": 3.6187,
"step": 112000
},
{
"epoch": 0.43,
"learning_rate": 2.865809702313652e-05,
"loss": 3.6112,
"step": 112500
},
{
"epoch": 0.43,
"learning_rate": 2.856324412101713e-05,
"loss": 3.6103,
"step": 113000
},
{
"epoch": 0.43,
"learning_rate": 2.8468391218897734e-05,
"loss": 3.6103,
"step": 113500
},
{
"epoch": 0.43,
"learning_rate": 2.837353831677834e-05,
"loss": 3.615,
"step": 114000
},
{
"epoch": 0.43,
"learning_rate": 2.827868541465895e-05,
"loss": 3.6151,
"step": 114500
},
{
"epoch": 0.44,
"learning_rate": 2.8183832512539555e-05,
"loss": 3.6039,
"step": 115000
},
{
"epoch": 0.44,
"learning_rate": 2.8088979610420162e-05,
"loss": 3.6133,
"step": 115500
},
{
"epoch": 0.44,
"learning_rate": 2.7994126708300766e-05,
"loss": 3.6063,
"step": 116000
},
{
"epoch": 0.44,
"learning_rate": 2.7899273806181377e-05,
"loss": 3.6029,
"step": 116500
},
{
"epoch": 0.44,
"learning_rate": 2.7804420904061984e-05,
"loss": 3.6082,
"step": 117000
},
{
"epoch": 0.45,
"learning_rate": 2.7709568001942588e-05,
"loss": 3.6099,
"step": 117500
},
{
"epoch": 0.45,
"learning_rate": 2.7614715099823195e-05,
"loss": 3.6035,
"step": 118000
},
{
"epoch": 0.45,
"learning_rate": 2.75198621977038e-05,
"loss": 3.5997,
"step": 118500
},
{
"epoch": 0.45,
"learning_rate": 2.742500929558441e-05,
"loss": 3.6032,
"step": 119000
},
{
"epoch": 0.45,
"learning_rate": 2.7330156393465016e-05,
"loss": 3.5998,
"step": 119500
},
{
"epoch": 0.46,
"learning_rate": 2.723530349134562e-05,
"loss": 3.5991,
"step": 120000
},
{
"epoch": 0.46,
"eval_accuracy": 0.3775510164297428,
"eval_loss": 3.525380849838257,
"eval_runtime": 5041.3032,
"eval_samples_per_second": 88.027,
"eval_steps_per_second": 1.375,
"step": 120000
},
{
"epoch": 0.46,
"learning_rate": 2.714045058922623e-05,
"loss": 3.6027,
"step": 120500
},
{
"epoch": 0.46,
"learning_rate": 2.7045597687106838e-05,
"loss": 3.5947,
"step": 121000
},
{
"epoch": 0.46,
"learning_rate": 2.695074478498744e-05,
"loss": 3.6033,
"step": 121500
},
{
"epoch": 0.46,
"learning_rate": 2.6855891882868052e-05,
"loss": 3.5933,
"step": 122000
},
{
"epoch": 0.46,
"learning_rate": 2.6761038980748652e-05,
"loss": 3.594,
"step": 122500
},
{
"epoch": 0.47,
"learning_rate": 2.6666186078629263e-05,
"loss": 3.599,
"step": 123000
},
{
"epoch": 0.47,
"learning_rate": 2.6571333176509873e-05,
"loss": 3.6013,
"step": 123500
},
{
"epoch": 0.47,
"learning_rate": 2.6476480274390474e-05,
"loss": 3.5982,
"step": 124000
},
{
"epoch": 0.47,
"learning_rate": 2.6381627372271084e-05,
"loss": 3.5937,
"step": 124500
},
{
"epoch": 0.47,
"learning_rate": 2.628677447015169e-05,
"loss": 3.5945,
"step": 125000
},
{
"epoch": 0.48,
"learning_rate": 2.6191921568032295e-05,
"loss": 3.5926,
"step": 125500
},
{
"epoch": 0.48,
"learning_rate": 2.6097068665912906e-05,
"loss": 3.5927,
"step": 126000
},
{
"epoch": 0.48,
"learning_rate": 2.600221576379351e-05,
"loss": 3.5892,
"step": 126500
},
{
"epoch": 0.48,
"learning_rate": 2.5907362861674116e-05,
"loss": 3.5938,
"step": 127000
},
{
"epoch": 0.48,
"learning_rate": 2.5812509959554727e-05,
"loss": 3.5867,
"step": 127500
},
{
"epoch": 0.49,
"learning_rate": 2.571765705743533e-05,
"loss": 3.5879,
"step": 128000
},
{
"epoch": 0.49,
"learning_rate": 2.5622804155315938e-05,
"loss": 3.5909,
"step": 128500
},
{
"epoch": 0.49,
"learning_rate": 2.552795125319654e-05,
"loss": 3.5861,
"step": 129000
},
{
"epoch": 0.49,
"learning_rate": 2.543309835107715e-05,
"loss": 3.5913,
"step": 129500
},
{
"epoch": 0.49,
"learning_rate": 2.533824544895776e-05,
"loss": 3.591,
"step": 130000
},
{
"epoch": 0.49,
"eval_accuracy": 0.37901210353247916,
"eval_loss": 3.510841131210327,
"eval_runtime": 5053.0574,
"eval_samples_per_second": 87.823,
"eval_steps_per_second": 1.372,
"step": 130000
},
{
"epoch": 0.5,
"learning_rate": 2.5243392546838363e-05,
"loss": 3.5849,
"step": 130500
},
{
"epoch": 0.5,
"learning_rate": 2.514853964471897e-05,
"loss": 3.5868,
"step": 131000
},
{
"epoch": 0.5,
"learning_rate": 2.505368674259958e-05,
"loss": 3.5838,
"step": 131500
},
{
"epoch": 0.5,
"learning_rate": 2.4958833840480184e-05,
"loss": 3.5848,
"step": 132000
},
{
"epoch": 0.5,
"learning_rate": 2.486398093836079e-05,
"loss": 3.5818,
"step": 132500
},
{
"epoch": 0.5,
"learning_rate": 2.47691280362414e-05,
"loss": 3.5842,
"step": 133000
},
{
"epoch": 0.51,
"learning_rate": 2.4674275134122006e-05,
"loss": 3.584,
"step": 133500
},
{
"epoch": 0.51,
"learning_rate": 2.457942223200261e-05,
"loss": 3.5817,
"step": 134000
},
{
"epoch": 0.51,
"learning_rate": 2.448456932988322e-05,
"loss": 3.5781,
"step": 134500
},
{
"epoch": 0.51,
"learning_rate": 2.4389716427763827e-05,
"loss": 3.5761,
"step": 135000
},
{
"epoch": 0.51,
"learning_rate": 2.429486352564443e-05,
"loss": 3.5755,
"step": 135500
},
{
"epoch": 0.52,
"learning_rate": 2.4200010623525038e-05,
"loss": 3.5822,
"step": 136000
},
{
"epoch": 0.52,
"learning_rate": 2.4105157721405645e-05,
"loss": 3.5749,
"step": 136500
},
{
"epoch": 0.52,
"learning_rate": 2.4010304819286252e-05,
"loss": 3.5812,
"step": 137000
},
{
"epoch": 0.52,
"learning_rate": 2.391545191716686e-05,
"loss": 3.5786,
"step": 137500
},
{
"epoch": 0.52,
"learning_rate": 2.3820599015047467e-05,
"loss": 3.5826,
"step": 138000
},
{
"epoch": 0.53,
"learning_rate": 2.372574611292807e-05,
"loss": 3.5731,
"step": 138500
},
{
"epoch": 0.53,
"learning_rate": 2.363089321080868e-05,
"loss": 3.5784,
"step": 139000
},
{
"epoch": 0.53,
"learning_rate": 2.3536040308689285e-05,
"loss": 3.5738,
"step": 139500
},
{
"epoch": 0.53,
"learning_rate": 2.3441187406569892e-05,
"loss": 3.574,
"step": 140000
},
{
"epoch": 0.53,
"eval_accuracy": 0.3804677520669924,
"eval_loss": 3.4966471195220947,
"eval_runtime": 5044.6959,
"eval_samples_per_second": 87.968,
"eval_steps_per_second": 1.375,
"step": 140000
},
{
"epoch": 0.53,
"learning_rate": 2.33463345044505e-05,
"loss": 3.5722,
"step": 140500
},
{
"epoch": 0.53,
"learning_rate": 2.3251481602331106e-05,
"loss": 3.5778,
"step": 141000
},
{
"epoch": 0.54,
"learning_rate": 2.3156628700211713e-05,
"loss": 3.5722,
"step": 141500
},
{
"epoch": 0.54,
"learning_rate": 2.306177579809232e-05,
"loss": 3.5658,
"step": 142000
},
{
"epoch": 0.54,
"learning_rate": 2.2966922895972924e-05,
"loss": 3.5671,
"step": 142500
},
{
"epoch": 0.54,
"learning_rate": 2.2872069993853535e-05,
"loss": 3.5696,
"step": 143000
},
{
"epoch": 0.54,
"learning_rate": 2.2777217091734142e-05,
"loss": 3.5691,
"step": 143500
},
{
"epoch": 0.55,
"learning_rate": 2.2682364189614745e-05,
"loss": 3.5737,
"step": 144000
},
{
"epoch": 0.55,
"learning_rate": 2.2587511287495353e-05,
"loss": 3.5693,
"step": 144500
},
{
"epoch": 0.55,
"learning_rate": 2.249265838537596e-05,
"loss": 3.5728,
"step": 145000
},
{
"epoch": 0.55,
"learning_rate": 2.2397805483256567e-05,
"loss": 3.569,
"step": 145500
},
{
"epoch": 0.55,
"learning_rate": 2.2302952581137174e-05,
"loss": 3.5559,
"step": 146000
},
{
"epoch": 0.56,
"learning_rate": 2.220809967901778e-05,
"loss": 3.5673,
"step": 146500
},
{
"epoch": 0.56,
"learning_rate": 2.2113246776898385e-05,
"loss": 3.567,
"step": 147000
},
{
"epoch": 0.56,
"learning_rate": 2.2018393874778995e-05,
"loss": 3.5692,
"step": 147500
},
{
"epoch": 0.56,
"learning_rate": 2.19235409726596e-05,
"loss": 3.5651,
"step": 148000
},
{
"epoch": 0.56,
"learning_rate": 2.1828688070540206e-05,
"loss": 3.559,
"step": 148500
},
{
"epoch": 0.57,
"learning_rate": 2.1733835168420813e-05,
"loss": 3.5657,
"step": 149000
},
{
"epoch": 0.57,
"learning_rate": 2.163898226630142e-05,
"loss": 3.5632,
"step": 149500
},
{
"epoch": 0.57,
"learning_rate": 2.1544129364182028e-05,
"loss": 3.5606,
"step": 150000
},
{
"epoch": 0.57,
"eval_accuracy": 0.38146521499584235,
"eval_loss": 3.486565113067627,
"eval_runtime": 5047.1623,
"eval_samples_per_second": 87.925,
"eval_steps_per_second": 1.374,
"step": 150000
},
{
"epoch": 0.57,
"learning_rate": 2.1449276462062635e-05,
"loss": 3.5635,
"step": 150500
},
{
"epoch": 0.57,
"learning_rate": 2.135442355994324e-05,
"loss": 3.569,
"step": 151000
},
{
"epoch": 0.57,
"learning_rate": 2.125957065782385e-05,
"loss": 3.551,
"step": 151500
},
{
"epoch": 0.58,
"learning_rate": 2.1164717755704456e-05,
"loss": 3.5543,
"step": 152000
},
{
"epoch": 0.58,
"learning_rate": 2.106986485358506e-05,
"loss": 3.5556,
"step": 152500
},
{
"epoch": 0.58,
"learning_rate": 2.0975011951465667e-05,
"loss": 3.5598,
"step": 153000
},
{
"epoch": 0.58,
"learning_rate": 2.0880159049346274e-05,
"loss": 3.5592,
"step": 153500
},
{
"epoch": 0.58,
"learning_rate": 2.078530614722688e-05,
"loss": 3.5562,
"step": 154000
},
{
"epoch": 0.59,
"learning_rate": 2.069045324510749e-05,
"loss": 3.5561,
"step": 154500
},
{
"epoch": 0.59,
"learning_rate": 2.0595600342988096e-05,
"loss": 3.5573,
"step": 155000
},
{
"epoch": 0.59,
"learning_rate": 2.05007474408687e-05,
"loss": 3.5585,
"step": 155500
},
{
"epoch": 0.59,
"learning_rate": 2.040589453874931e-05,
"loss": 3.5576,
"step": 156000
},
{
"epoch": 0.59,
"learning_rate": 2.0311041636629917e-05,
"loss": 3.5529,
"step": 156500
},
{
"epoch": 0.6,
"learning_rate": 2.021618873451052e-05,
"loss": 3.5575,
"step": 157000
},
{
"epoch": 0.6,
"learning_rate": 2.0121335832391128e-05,
"loss": 3.5569,
"step": 157500
},
{
"epoch": 0.6,
"learning_rate": 2.0026482930271735e-05,
"loss": 3.5537,
"step": 158000
},
{
"epoch": 0.6,
"learning_rate": 1.9931630028152342e-05,
"loss": 3.5553,
"step": 158500
},
{
"epoch": 0.6,
"learning_rate": 1.983677712603295e-05,
"loss": 3.5524,
"step": 159000
},
{
"epoch": 0.61,
"learning_rate": 1.9741924223913557e-05,
"loss": 3.5562,
"step": 159500
},
{
"epoch": 0.61,
"learning_rate": 1.9647071321794164e-05,
"loss": 3.5516,
"step": 160000
},
{
"epoch": 0.61,
"eval_accuracy": 0.3828251390025017,
"eval_loss": 3.4739012718200684,
"eval_runtime": 5050.1987,
"eval_samples_per_second": 87.872,
"eval_steps_per_second": 1.373,
"step": 160000
},
{
"epoch": 0.61,
"learning_rate": 1.955221841967477e-05,
"loss": 3.5508,
"step": 160500
},
{
"epoch": 0.61,
"learning_rate": 1.9457365517555375e-05,
"loss": 3.5424,
"step": 161000
},
{
"epoch": 0.61,
"learning_rate": 1.936251261543598e-05,
"loss": 3.5526,
"step": 161500
},
{
"epoch": 0.61,
"learning_rate": 1.9267659713316592e-05,
"loss": 3.5469,
"step": 162000
},
{
"epoch": 0.62,
"learning_rate": 1.9172806811197196e-05,
"loss": 3.5401,
"step": 162500
},
{
"epoch": 0.62,
"learning_rate": 1.9077953909077803e-05,
"loss": 3.5525,
"step": 163000
},
{
"epoch": 0.62,
"learning_rate": 1.898310100695841e-05,
"loss": 3.5494,
"step": 163500
},
{
"epoch": 0.62,
"learning_rate": 1.8888248104839014e-05,
"loss": 3.5527,
"step": 164000
},
{
"epoch": 0.62,
"learning_rate": 1.8793395202719624e-05,
"loss": 3.5477,
"step": 164500
},
{
"epoch": 0.63,
"learning_rate": 1.869854230060023e-05,
"loss": 3.548,
"step": 165000
},
{
"epoch": 0.63,
"learning_rate": 1.8603689398480835e-05,
"loss": 3.5466,
"step": 165500
},
{
"epoch": 0.63,
"learning_rate": 1.8508836496361442e-05,
"loss": 3.5491,
"step": 166000
},
{
"epoch": 0.63,
"learning_rate": 1.8413983594242053e-05,
"loss": 3.5431,
"step": 166500
},
{
"epoch": 0.63,
"learning_rate": 1.8319130692122657e-05,
"loss": 3.5462,
"step": 167000
},
{
"epoch": 0.64,
"learning_rate": 1.8224277790003264e-05,
"loss": 3.5453,
"step": 167500
},
{
"epoch": 0.64,
"learning_rate": 1.812942488788387e-05,
"loss": 3.5408,
"step": 168000
},
{
"epoch": 0.64,
"learning_rate": 1.8034571985764478e-05,
"loss": 3.5465,
"step": 168500
},
{
"epoch": 0.64,
"learning_rate": 1.7939719083645085e-05,
"loss": 3.5437,
"step": 169000
},
{
"epoch": 0.64,
"learning_rate": 1.784486618152569e-05,
"loss": 3.533,
"step": 169500
},
{
"epoch": 0.64,
"learning_rate": 1.7750013279406296e-05,
"loss": 3.5423,
"step": 170000
},
{
"epoch": 0.64,
"eval_accuracy": 0.38377248075624093,
"eval_loss": 3.4649875164031982,
"eval_runtime": 5024.5121,
"eval_samples_per_second": 88.322,
"eval_steps_per_second": 1.38,
"step": 170000
},
{
"epoch": 0.65,
"learning_rate": 1.7655160377286907e-05,
"loss": 3.5446,
"step": 170500
},
{
"epoch": 0.65,
"learning_rate": 1.756030747516751e-05,
"loss": 3.5374,
"step": 171000
},
{
"epoch": 0.65,
"learning_rate": 1.7465454573048118e-05,
"loss": 3.5426,
"step": 171500
},
{
"epoch": 0.65,
"learning_rate": 1.7370601670928725e-05,
"loss": 3.5391,
"step": 172000
},
{
"epoch": 0.65,
"learning_rate": 1.727574876880933e-05,
"loss": 3.544,
"step": 172500
},
{
"epoch": 0.66,
"learning_rate": 1.718089586668994e-05,
"loss": 3.5396,
"step": 173000
},
{
"epoch": 0.66,
"learning_rate": 1.7086042964570546e-05,
"loss": 3.5385,
"step": 173500
},
{
"epoch": 0.66,
"learning_rate": 1.699119006245115e-05,
"loss": 3.534,
"step": 174000
},
{
"epoch": 0.66,
"learning_rate": 1.6896337160331757e-05,
"loss": 3.5374,
"step": 174500
},
{
"epoch": 0.66,
"learning_rate": 1.6801484258212368e-05,
"loss": 3.5351,
"step": 175000
},
{
"epoch": 0.67,
"learning_rate": 1.670663135609297e-05,
"loss": 3.5391,
"step": 175500
},
{
"epoch": 0.67,
"learning_rate": 1.661177845397358e-05,
"loss": 3.5351,
"step": 176000
},
{
"epoch": 0.67,
"learning_rate": 1.6516925551854186e-05,
"loss": 3.5315,
"step": 176500
},
{
"epoch": 0.67,
"learning_rate": 1.6422072649734793e-05,
"loss": 3.5324,
"step": 177000
},
{
"epoch": 0.67,
"learning_rate": 1.63272197476154e-05,
"loss": 3.5379,
"step": 177500
},
{
"epoch": 0.68,
"learning_rate": 1.6232366845496007e-05,
"loss": 3.534,
"step": 178000
},
{
"epoch": 0.68,
"learning_rate": 1.613751394337661e-05,
"loss": 3.5366,
"step": 178500
},
{
"epoch": 0.68,
"learning_rate": 1.604266104125722e-05,
"loss": 3.5364,
"step": 179000
},
{
"epoch": 0.68,
"learning_rate": 1.5947808139137825e-05,
"loss": 3.5398,
"step": 179500
},
{
"epoch": 0.68,
"learning_rate": 1.5852955237018432e-05,
"loss": 3.5298,
"step": 180000
},
{
"epoch": 0.68,
"eval_accuracy": 0.3846720031995081,
"eval_loss": 3.455994129180908,
"eval_runtime": 5033.5621,
"eval_samples_per_second": 88.163,
"eval_steps_per_second": 1.378,
"step": 180000
},
{
"epoch": 0.68,
"learning_rate": 1.575810233489904e-05,
"loss": 3.5337,
"step": 180500
},
{
"epoch": 0.69,
"learning_rate": 1.5663249432779646e-05,
"loss": 3.5324,
"step": 181000
},
{
"epoch": 0.69,
"learning_rate": 1.5568396530660254e-05,
"loss": 3.5307,
"step": 181500
},
{
"epoch": 0.69,
"learning_rate": 1.547354362854086e-05,
"loss": 3.5273,
"step": 182000
},
{
"epoch": 0.69,
"learning_rate": 1.5378690726421464e-05,
"loss": 3.5301,
"step": 182500
},
{
"epoch": 0.69,
"learning_rate": 1.528383782430207e-05,
"loss": 3.5321,
"step": 183000
},
{
"epoch": 0.7,
"learning_rate": 1.518898492218268e-05,
"loss": 3.5283,
"step": 183500
},
{
"epoch": 0.7,
"learning_rate": 1.5094132020063287e-05,
"loss": 3.534,
"step": 184000
},
{
"epoch": 0.7,
"learning_rate": 1.4999279117943893e-05,
"loss": 3.5346,
"step": 184500
},
{
"epoch": 0.7,
"learning_rate": 1.4904426215824498e-05,
"loss": 3.5234,
"step": 185000
},
{
"epoch": 0.7,
"learning_rate": 1.4809573313705107e-05,
"loss": 3.5243,
"step": 185500
},
{
"epoch": 0.71,
"learning_rate": 1.4714720411585714e-05,
"loss": 3.5292,
"step": 186000
},
{
"epoch": 0.71,
"learning_rate": 1.461986750946632e-05,
"loss": 3.5241,
"step": 186500
},
{
"epoch": 0.71,
"learning_rate": 1.4525014607346927e-05,
"loss": 3.5258,
"step": 187000
},
{
"epoch": 0.71,
"learning_rate": 1.4430161705227536e-05,
"loss": 3.5241,
"step": 187500
},
{
"epoch": 0.71,
"learning_rate": 1.4335308803108141e-05,
"loss": 3.5281,
"step": 188000
},
{
"epoch": 0.72,
"learning_rate": 1.4240455900988747e-05,
"loss": 3.5271,
"step": 188500
},
{
"epoch": 0.72,
"learning_rate": 1.4145602998869354e-05,
"loss": 3.5222,
"step": 189000
},
{
"epoch": 0.72,
"learning_rate": 1.405075009674996e-05,
"loss": 3.5197,
"step": 189500
},
{
"epoch": 0.72,
"learning_rate": 1.3955897194630568e-05,
"loss": 3.5287,
"step": 190000
},
{
"epoch": 0.72,
"eval_accuracy": 0.38565153361158844,
"eval_loss": 3.447903871536255,
"eval_runtime": 5039.8049,
"eval_samples_per_second": 88.054,
"eval_steps_per_second": 1.376,
"step": 190000
},
{
"epoch": 0.72,
"learning_rate": 1.3861044292511175e-05,
"loss": 3.5309,
"step": 190500
},
{
"epoch": 0.72,
"learning_rate": 1.376619139039178e-05,
"loss": 3.5236,
"step": 191000
},
{
"epoch": 0.73,
"learning_rate": 1.3671338488272386e-05,
"loss": 3.5268,
"step": 191500
},
{
"epoch": 0.73,
"learning_rate": 1.3576485586152995e-05,
"loss": 3.5261,
"step": 192000
},
{
"epoch": 0.73,
"learning_rate": 1.3481632684033602e-05,
"loss": 3.5242,
"step": 192500
},
{
"epoch": 0.73,
"learning_rate": 1.3386779781914207e-05,
"loss": 3.5268,
"step": 193000
},
{
"epoch": 0.73,
"learning_rate": 1.3291926879794815e-05,
"loss": 3.5302,
"step": 193500
},
{
"epoch": 0.74,
"learning_rate": 1.3197073977675423e-05,
"loss": 3.525,
"step": 194000
},
{
"epoch": 0.74,
"learning_rate": 1.3102221075556029e-05,
"loss": 3.5278,
"step": 194500
},
{
"epoch": 0.74,
"learning_rate": 1.3007368173436634e-05,
"loss": 3.5221,
"step": 195000
},
{
"epoch": 0.74,
"learning_rate": 1.2912515271317241e-05,
"loss": 3.5231,
"step": 195500
},
{
"epoch": 0.74,
"learning_rate": 1.281766236919785e-05,
"loss": 3.5237,
"step": 196000
},
{
"epoch": 0.75,
"learning_rate": 1.2722809467078456e-05,
"loss": 3.5236,
"step": 196500
},
{
"epoch": 0.75,
"learning_rate": 1.2627956564959063e-05,
"loss": 3.5201,
"step": 197000
},
{
"epoch": 0.75,
"learning_rate": 1.2533103662839668e-05,
"loss": 3.5221,
"step": 197500
},
{
"epoch": 0.75,
"learning_rate": 1.2438250760720275e-05,
"loss": 3.5216,
"step": 198000
},
{
"epoch": 0.75,
"learning_rate": 1.2343397858600881e-05,
"loss": 3.5202,
"step": 198500
},
{
"epoch": 0.76,
"learning_rate": 1.224854495648149e-05,
"loss": 3.5207,
"step": 199000
},
{
"epoch": 0.76,
"learning_rate": 1.2153692054362095e-05,
"loss": 3.5172,
"step": 199500
},
{
"epoch": 0.76,
"learning_rate": 1.2058839152242702e-05,
"loss": 3.5187,
"step": 200000
},
{
"epoch": 0.76,
"eval_accuracy": 0.3863054900513532,
"eval_loss": 3.440758466720581,
"eval_runtime": 5028.552,
"eval_samples_per_second": 88.251,
"eval_steps_per_second": 1.379,
"step": 200000
},
{
"epoch": 0.76,
"learning_rate": 1.196398625012331e-05,
"loss": 3.5153,
"step": 200500
},
{
"epoch": 0.76,
"learning_rate": 1.1869133348003917e-05,
"loss": 3.5164,
"step": 201000
},
{
"epoch": 0.76,
"learning_rate": 1.1774280445884522e-05,
"loss": 3.5225,
"step": 201500
},
{
"epoch": 0.77,
"learning_rate": 1.1679427543765129e-05,
"loss": 3.5142,
"step": 202000
},
{
"epoch": 0.77,
"learning_rate": 1.1584574641645736e-05,
"loss": 3.519,
"step": 202500
},
{
"epoch": 0.77,
"learning_rate": 1.1489721739526343e-05,
"loss": 3.522,
"step": 203000
},
{
"epoch": 0.77,
"learning_rate": 1.1394868837406949e-05,
"loss": 3.5142,
"step": 203500
},
{
"epoch": 0.77,
"learning_rate": 1.1300015935287558e-05,
"loss": 3.5101,
"step": 204000
},
{
"epoch": 0.78,
"learning_rate": 1.1205163033168163e-05,
"loss": 3.5151,
"step": 204500
},
{
"epoch": 0.78,
"learning_rate": 1.111031013104877e-05,
"loss": 3.5112,
"step": 205000
},
{
"epoch": 0.78,
"learning_rate": 1.1015457228929377e-05,
"loss": 3.5161,
"step": 205500
},
{
"epoch": 0.78,
"learning_rate": 1.0920604326809984e-05,
"loss": 3.5135,
"step": 206000
},
{
"epoch": 0.78,
"learning_rate": 1.082575142469059e-05,
"loss": 3.5143,
"step": 206500
},
{
"epoch": 0.79,
"learning_rate": 1.0730898522571197e-05,
"loss": 3.5134,
"step": 207000
},
{
"epoch": 0.79,
"learning_rate": 1.0636045620451804e-05,
"loss": 3.5162,
"step": 207500
},
{
"epoch": 0.79,
"learning_rate": 1.054119271833241e-05,
"loss": 3.5202,
"step": 208000
},
{
"epoch": 0.79,
"learning_rate": 1.0446339816213017e-05,
"loss": 3.507,
"step": 208500
},
{
"epoch": 0.79,
"learning_rate": 1.0351486914093624e-05,
"loss": 3.5108,
"step": 209000
},
{
"epoch": 0.79,
"learning_rate": 1.0256634011974231e-05,
"loss": 3.5144,
"step": 209500
},
{
"epoch": 0.8,
"learning_rate": 1.0161781109854836e-05,
"loss": 3.5157,
"step": 210000
},
{
"epoch": 0.8,
"eval_accuracy": 0.3870160062789933,
"eval_loss": 3.4338622093200684,
"eval_runtime": 5031.2535,
"eval_samples_per_second": 88.203,
"eval_steps_per_second": 1.378,
"step": 210000
},
{
"epoch": 0.8,
"learning_rate": 1.0066928207735445e-05,
"loss": 3.5167,
"step": 210500
},
{
"epoch": 0.8,
"learning_rate": 9.97207530561605e-06,
"loss": 3.5107,
"step": 211000
},
{
"epoch": 0.8,
"learning_rate": 9.877222403496658e-06,
"loss": 3.5114,
"step": 211500
},
{
"epoch": 0.8,
"learning_rate": 9.782369501377265e-06,
"loss": 3.5121,
"step": 212000
},
{
"epoch": 0.81,
"learning_rate": 9.687516599257872e-06,
"loss": 3.5084,
"step": 212500
},
{
"epoch": 0.81,
"learning_rate": 9.592663697138478e-06,
"loss": 3.5178,
"step": 213000
},
{
"epoch": 0.81,
"learning_rate": 9.497810795019085e-06,
"loss": 3.5076,
"step": 213500
},
{
"epoch": 0.81,
"learning_rate": 9.402957892899692e-06,
"loss": 3.5102,
"step": 214000
},
{
"epoch": 0.81,
"learning_rate": 9.308104990780299e-06,
"loss": 3.5069,
"step": 214500
},
{
"epoch": 0.82,
"learning_rate": 9.213252088660904e-06,
"loss": 3.5108,
"step": 215000
},
{
"epoch": 0.82,
"learning_rate": 9.118399186541513e-06,
"loss": 3.5087,
"step": 215500
},
{
"epoch": 0.82,
"learning_rate": 9.023546284422119e-06,
"loss": 3.5041,
"step": 216000
},
{
"epoch": 0.82,
"learning_rate": 8.928693382302724e-06,
"loss": 3.5125,
"step": 216500
},
{
"epoch": 0.82,
"learning_rate": 8.833840480183333e-06,
"loss": 3.5084,
"step": 217000
},
{
"epoch": 0.83,
"learning_rate": 8.738987578063938e-06,
"loss": 3.5034,
"step": 217500
},
{
"epoch": 0.83,
"learning_rate": 8.644134675944546e-06,
"loss": 3.5077,
"step": 218000
},
{
"epoch": 0.83,
"learning_rate": 8.549281773825151e-06,
"loss": 3.505,
"step": 218500
},
{
"epoch": 0.83,
"learning_rate": 8.45442887170576e-06,
"loss": 3.5072,
"step": 219000
},
{
"epoch": 0.83,
"learning_rate": 8.359575969586365e-06,
"loss": 3.5,
"step": 219500
},
{
"epoch": 0.83,
"learning_rate": 8.264723067466972e-06,
"loss": 3.5042,
"step": 220000
},
{
"epoch": 0.83,
"eval_accuracy": 0.3876448779010485,
"eval_loss": 3.4285898208618164,
"eval_runtime": 5039.4169,
"eval_samples_per_second": 88.06,
"eval_steps_per_second": 1.376,
"step": 220000
},
{
"epoch": 0.84,
"learning_rate": 8.16987016534758e-06,
"loss": 3.5075,
"step": 220500
},
{
"epoch": 0.84,
"learning_rate": 8.075017263228187e-06,
"loss": 3.5056,
"step": 221000
},
{
"epoch": 0.84,
"learning_rate": 7.980164361108792e-06,
"loss": 3.5089,
"step": 221500
},
{
"epoch": 0.84,
"learning_rate": 7.8853114589894e-06,
"loss": 3.5114,
"step": 222000
},
{
"epoch": 0.84,
"learning_rate": 7.790458556870006e-06,
"loss": 3.5076,
"step": 222500
},
{
"epoch": 0.85,
"learning_rate": 7.695605654750614e-06,
"loss": 3.5053,
"step": 223000
},
{
"epoch": 0.85,
"learning_rate": 7.60075275263122e-06,
"loss": 3.5058,
"step": 223500
},
{
"epoch": 0.85,
"learning_rate": 7.505899850511827e-06,
"loss": 3.5078,
"step": 224000
},
{
"epoch": 0.85,
"learning_rate": 7.411046948392433e-06,
"loss": 3.5084,
"step": 224500
},
{
"epoch": 0.85,
"learning_rate": 7.31619404627304e-06,
"loss": 3.5038,
"step": 225000
},
{
"epoch": 0.86,
"learning_rate": 7.221341144153647e-06,
"loss": 3.5015,
"step": 225500
},
{
"epoch": 0.86,
"learning_rate": 7.126488242034253e-06,
"loss": 3.5034,
"step": 226000
},
{
"epoch": 0.86,
"learning_rate": 7.03163533991486e-06,
"loss": 3.5098,
"step": 226500
},
{
"epoch": 0.86,
"learning_rate": 6.936782437795466e-06,
"loss": 3.5001,
"step": 227000
},
{
"epoch": 0.86,
"learning_rate": 6.841929535676074e-06,
"loss": 3.5055,
"step": 227500
},
{
"epoch": 0.87,
"learning_rate": 6.74707663355668e-06,
"loss": 3.5018,
"step": 228000
},
{
"epoch": 0.87,
"learning_rate": 6.652223731437288e-06,
"loss": 3.503,
"step": 228500
},
{
"epoch": 0.87,
"learning_rate": 6.557370829317894e-06,
"loss": 3.503,
"step": 229000
},
{
"epoch": 0.87,
"learning_rate": 6.462517927198501e-06,
"loss": 3.498,
"step": 229500
},
{
"epoch": 0.87,
"learning_rate": 6.3676650250791075e-06,
"loss": 3.5033,
"step": 230000
},
{
"epoch": 0.87,
"eval_accuracy": 0.38827686860475785,
"eval_loss": 3.422902822494507,
"eval_runtime": 5026.5934,
"eval_samples_per_second": 88.285,
"eval_steps_per_second": 1.379,
"step": 230000
},
{
"epoch": 0.87,
"learning_rate": 6.272812122959715e-06,
"loss": 3.5024,
"step": 230500
},
{
"epoch": 0.88,
"learning_rate": 6.177959220840321e-06,
"loss": 3.4986,
"step": 231000
},
{
"epoch": 0.88,
"learning_rate": 6.083106318720928e-06,
"loss": 3.5025,
"step": 231500
},
{
"epoch": 0.88,
"learning_rate": 5.988253416601534e-06,
"loss": 3.5065,
"step": 232000
},
{
"epoch": 0.88,
"learning_rate": 5.8934005144821415e-06,
"loss": 3.5047,
"step": 232500
},
{
"epoch": 0.88,
"learning_rate": 5.798547612362748e-06,
"loss": 3.502,
"step": 233000
},
{
"epoch": 0.89,
"learning_rate": 5.703694710243355e-06,
"loss": 3.5061,
"step": 233500
},
{
"epoch": 0.89,
"learning_rate": 5.608841808123961e-06,
"loss": 3.5036,
"step": 234000
},
{
"epoch": 0.89,
"learning_rate": 5.513988906004568e-06,
"loss": 3.4966,
"step": 234500
},
{
"epoch": 0.89,
"learning_rate": 5.4191360038851754e-06,
"loss": 3.5048,
"step": 235000
},
{
"epoch": 0.89,
"learning_rate": 5.324283101765782e-06,
"loss": 3.5024,
"step": 235500
},
{
"epoch": 0.9,
"learning_rate": 5.229430199646389e-06,
"loss": 3.495,
"step": 236000
},
{
"epoch": 0.9,
"learning_rate": 5.134577297526995e-06,
"loss": 3.5035,
"step": 236500
},
{
"epoch": 0.9,
"learning_rate": 5.039724395407602e-06,
"loss": 3.5026,
"step": 237000
},
{
"epoch": 0.9,
"learning_rate": 4.9448714932882094e-06,
"loss": 3.4997,
"step": 237500
},
{
"epoch": 0.9,
"learning_rate": 4.850018591168816e-06,
"loss": 3.5035,
"step": 238000
},
{
"epoch": 0.9,
"learning_rate": 4.755165689049423e-06,
"loss": 3.4975,
"step": 238500
},
{
"epoch": 0.91,
"learning_rate": 4.660312786930029e-06,
"loss": 3.5003,
"step": 239000
},
{
"epoch": 0.91,
"learning_rate": 4.5654598848106354e-06,
"loss": 3.4994,
"step": 239500
},
{
"epoch": 0.91,
"learning_rate": 4.4706069826912426e-06,
"loss": 3.501,
"step": 240000
},
{
"epoch": 0.91,
"eval_accuracy": 0.38875705078485445,
"eval_loss": 3.4187748432159424,
"eval_runtime": 5045.601,
"eval_samples_per_second": 87.952,
"eval_steps_per_second": 1.374,
"step": 240000
},
{
"epoch": 0.91,
"learning_rate": 4.375754080571849e-06,
"loss": 3.4917,
"step": 240500
},
{
"epoch": 0.91,
"learning_rate": 4.280901178452456e-06,
"loss": 3.4986,
"step": 241000
},
{
"epoch": 0.92,
"learning_rate": 4.186048276333063e-06,
"loss": 3.496,
"step": 241500
},
{
"epoch": 0.92,
"learning_rate": 4.091195374213669e-06,
"loss": 3.4978,
"step": 242000
},
{
"epoch": 0.92,
"learning_rate": 3.9963424720942765e-06,
"loss": 3.491,
"step": 242500
},
{
"epoch": 0.92,
"learning_rate": 3.901489569974883e-06,
"loss": 3.4953,
"step": 243000
},
{
"epoch": 0.92,
"learning_rate": 3.80663666785549e-06,
"loss": 3.4962,
"step": 243500
},
{
"epoch": 0.93,
"learning_rate": 3.7117837657360967e-06,
"loss": 3.4982,
"step": 244000
},
{
"epoch": 0.93,
"learning_rate": 3.6169308636167034e-06,
"loss": 3.4973,
"step": 244500
},
{
"epoch": 0.93,
"learning_rate": 3.52207796149731e-06,
"loss": 3.4922,
"step": 245000
},
{
"epoch": 0.93,
"learning_rate": 3.4272250593779172e-06,
"loss": 3.4921,
"step": 245500
},
{
"epoch": 0.93,
"learning_rate": 3.332372157258524e-06,
"loss": 3.5039,
"step": 246000
},
{
"epoch": 0.94,
"learning_rate": 3.2375192551391307e-06,
"loss": 3.5025,
"step": 246500
},
{
"epoch": 0.94,
"learning_rate": 3.1426663530197374e-06,
"loss": 3.4964,
"step": 247000
},
{
"epoch": 0.94,
"learning_rate": 3.0478134509003437e-06,
"loss": 3.4936,
"step": 247500
},
{
"epoch": 0.94,
"learning_rate": 2.952960548780951e-06,
"loss": 3.4959,
"step": 248000
},
{
"epoch": 0.94,
"learning_rate": 2.8581076466615575e-06,
"loss": 3.4963,
"step": 248500
},
{
"epoch": 0.94,
"learning_rate": 2.7632547445421642e-06,
"loss": 3.5014,
"step": 249000
},
{
"epoch": 0.95,
"learning_rate": 2.668401842422771e-06,
"loss": 3.4948,
"step": 249500
},
{
"epoch": 0.95,
"learning_rate": 2.5735489403033776e-06,
"loss": 3.4946,
"step": 250000
},
{
"epoch": 0.95,
"eval_accuracy": 0.38918558969561506,
"eval_loss": 3.4148616790771484,
"eval_runtime": 5028.0262,
"eval_samples_per_second": 88.26,
"eval_steps_per_second": 1.379,
"step": 250000
},
{
"epoch": 0.95,
"learning_rate": 2.4786960381839844e-06,
"loss": 3.4959,
"step": 250500
},
{
"epoch": 0.95,
"learning_rate": 2.383843136064591e-06,
"loss": 3.4955,
"step": 251000
},
{
"epoch": 0.95,
"learning_rate": 2.2889902339451978e-06,
"loss": 3.4869,
"step": 251500
},
{
"epoch": 0.96,
"learning_rate": 2.1941373318258045e-06,
"loss": 3.4975,
"step": 252000
},
{
"epoch": 0.96,
"learning_rate": 2.099284429706411e-06,
"loss": 3.4871,
"step": 252500
},
{
"epoch": 0.96,
"learning_rate": 2.0044315275870183e-06,
"loss": 3.4889,
"step": 253000
},
{
"epoch": 0.96,
"learning_rate": 1.909578625467625e-06,
"loss": 3.4928,
"step": 253500
},
{
"epoch": 0.96,
"learning_rate": 1.8147257233482318e-06,
"loss": 3.4921,
"step": 254000
},
{
"epoch": 0.97,
"learning_rate": 1.7198728212288385e-06,
"loss": 3.494,
"step": 254500
},
{
"epoch": 0.97,
"learning_rate": 1.625019919109445e-06,
"loss": 3.4918,
"step": 255000
},
{
"epoch": 0.97,
"learning_rate": 1.5301670169900519e-06,
"loss": 3.4919,
"step": 255500
},
{
"epoch": 0.97,
"learning_rate": 1.4353141148706586e-06,
"loss": 3.4907,
"step": 256000
},
{
"epoch": 0.97,
"learning_rate": 1.3404612127512653e-06,
"loss": 3.4937,
"step": 256500
},
{
"epoch": 0.98,
"learning_rate": 1.2456083106318722e-06,
"loss": 3.4893,
"step": 257000
},
{
"epoch": 0.98,
"learning_rate": 1.150755408512479e-06,
"loss": 3.4944,
"step": 257500
},
{
"epoch": 0.98,
"learning_rate": 1.0559025063930857e-06,
"loss": 3.4914,
"step": 258000
},
{
"epoch": 0.98,
"learning_rate": 9.610496042736924e-07,
"loss": 3.4947,
"step": 258500
},
{
"epoch": 0.98,
"learning_rate": 8.661967021542991e-07,
"loss": 3.4916,
"step": 259000
},
{
"epoch": 0.98,
"learning_rate": 7.713438000349059e-07,
"loss": 3.4884,
"step": 259500
},
{
"epoch": 0.99,
"learning_rate": 6.764908979155126e-07,
"loss": 3.4971,
"step": 260000
},
{
"epoch": 0.99,
"eval_accuracy": 0.3894276158938788,
"eval_loss": 3.412609100341797,
"eval_runtime": 5032.8837,
"eval_samples_per_second": 88.175,
"eval_steps_per_second": 1.378,
"step": 260000
},
{
"epoch": 0.99,
"learning_rate": 5.816379957961194e-07,
"loss": 3.4929,
"step": 260500
},
{
"epoch": 0.99,
"learning_rate": 4.867850936767261e-07,
"loss": 3.4937,
"step": 261000
},
{
"epoch": 0.99,
"learning_rate": 3.919321915573329e-07,
"loss": 3.4859,
"step": 261500
},
{
"epoch": 0.99,
"learning_rate": 2.9707928943793967e-07,
"loss": 3.4796,
"step": 262000
},
{
"epoch": 1.0,
"learning_rate": 2.022263873185464e-07,
"loss": 3.4871,
"step": 262500
},
{
"epoch": 1.0,
"learning_rate": 1.0737348519915316e-07,
"loss": 3.4949,
"step": 263000
},
{
"epoch": 1.0,
"learning_rate": 1.252058307975991e-08,
"loss": 3.4884,
"step": 263500
},
{
"epoch": 1.0,
"step": 263566,
"total_flos": 4.407529912270848e+18,
"train_loss": 3.6950655784551065,
"train_runtime": 269467.2599,
"train_samples_per_second": 31.299,
"train_steps_per_second": 0.978
}
],
"logging_steps": 500,
"max_steps": 263566,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10000,
"total_flos": 4.407529912270848e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}